diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..8c7eb987095169042c5ff90878020eb29b81e2ab
Binary files /dev/null and b/.DS_Store differ
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..233e4be19e6dd4f2a209c5fa78867feb7f4005b7
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,140 @@
+cmake_minimum_required(VERSION 3.12)
+
+project(diffvg VERSION 0.0.1 DESCRIPTION "Differentiable Vector Graphics")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(WIN32)
+    find_package(Python 3.6 COMPONENTS Development REQUIRED)
+else()
+    find_package(Python 3.7 COMPONENTS Development REQUIRED)
+endif()
+add_subdirectory(pybind11)
+
+option(DIFFVG_CUDA "Build diffvg with GPU code path?" ON)
+
+if(DIFFVG_CUDA)
+    message(STATUS "Build with CUDA support")
+    find_package(CUDA 10 REQUIRED)
+    set(CMAKE_CUDA_STANDARD 11)
+    if(NOT WIN32)
+        # Hack: for some reason the line above doesn't work on some Linux systems.
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
+        #set(CUDA_NVCC_FLAGS_DEBUG "-g -G")
+    endif()
+else()
+    message(STATUS "Build without CUDA support")
+    find_package(Thrust REQUIRED)
+endif()
+
+# include_directories(${CMAKE_SOURCE_DIR}/pybind11/include)
+include_directories(${PYTHON_INCLUDE_PATH})
+find_package(PythonLibs REQUIRED)
+include_directories(${PYTHON_INCLUDE_PATH})
+include_directories(${PYTHON_INCLUDE_DIRS})
+include_directories(pybind11/include)
+if(DIFFVG_CUDA)
+    link_directories(${CUDA_LIBRARIES})
+else()
+    include_directories(${THRUST_INCLUDE_DIR})
+endif()
+
+if(NOT MSVC)
+  # These compile definitions are not meaningful for MSVC
+  add_compile_options(-Wall -g -O3 -fvisibility=hidden -Wno-unknown-pragmas)
+else()
+  add_compile_options(/Wall /Zi)
+  add_link_options(/DEBUG)
+endif()
+
+if(NOT DIFFVG_CUDA)
+    add_compile_options("-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP")
+endif()
+
+set(SRCS atomic.h
+         color.h
+         cdf.h
+         cuda_utils.h
+         diffvg.h
+         edge_query.h
+         filter.h
+         matrix.h
+         parallel.h
+         pcg.h
+         ptr.h
+         sample_boundary.h
+         scene.h
+         shape.h
+         solve.h
+         vector.h
+         within_distance.h
+         winding_number.h
+         atomic.cpp
+         color.cpp
+         diffvg.cpp
+         parallel.cpp
+         scene.cpp
+         shape.cpp)
+
+if(DIFFVG_CUDA)
+    add_compile_definitions(COMPILE_WITH_CUDA)
+    set_source_files_properties(
+        diffvg.cpp
+        scene.cpp
+        PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+
+    cuda_add_library(diffvg MODULE ${SRCS})
+else()
+    add_library(diffvg MODULE ${SRCS})
+endif()
+
+if(APPLE)
+    # The "-undefined dynamic_lookup" is a hack for systems with
+    # multiple Python installed. If we link a particular Python version
+    # here, and we import it with a different Python version later.
+    # likely a segmentation fault.
+    # The solution for Linux Mac OS machines, as mentioned in 
+    # https://github.com/pybind/pybind11/blob/master/tools/pybind11Tools.cmake
+    # is to not link against Python library at all and resolve the symbols
+    # at compile time.
+    set(DYNAMIC_LOOKUP "-undefined dynamic_lookup")
+endif()
+
+target_link_libraries(diffvg ${DYNAMIC_LOOKUP})
+
+if(WIN32)
+    # See: https://pybind11.readthedocs.io/en/master/compiling.html#advanced-interface-library-target
+    target_link_libraries(diffvg pybind11::module)
+    set_target_properties(diffvg PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
+                                            SUFFIX "${PYTHON_MODULE_EXTENSION}")
+endif()
+
+set_target_properties(diffvg PROPERTIES SKIP_BUILD_RPATH FALSE)
+set_target_properties(diffvg PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE)
+if(UNIX AND NOT APPLE)
+    set_target_properties(diffvg PROPERTIES INSTALL_RPATH "$ORIGIN")
+elseif(APPLE)
+    set_target_properties(diffvg PROPERTIES INSTALL_RPATH "@loader_path")
+endif()
+
+set_property(TARGET diffvg PROPERTY CXX_STANDARD 11)
+set_target_properties(diffvg PROPERTIES PREFIX "")
+# Still enable assertion in release mode
+string( REPLACE "/DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+string( REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+string( REPLACE "/DNDEBUG" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+string( REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+string( REPLACE "/DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+string( REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+string( REPLACE "/DNDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+string( REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+
+if(NOT WIN32)
+    find_package(TensorFlow)
+    if(TensorFlow_FOUND)
+        add_subdirectory(pydiffvg_tensorflow/custom_ops)
+    else()
+        message(INFO " Building without TensorFlow support (not found)")
+    endif()
+endif()
diff --git a/LIVE/LICENSE b/LIVE/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..0ad25db4bd1d86c452db3f9602ccdbe172438f52
--- /dev/null
+++ b/LIVE/LICENSE
@@ -0,0 +1,661 @@
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.  Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals.  This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU Affero General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Remote Network Interaction; Use with the GNU General Public License.
+
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software.  This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
diff --git a/LIVE/README.md b/LIVE/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..041ee859808934aa5f50b2bcca412893da126f11
--- /dev/null
+++ b/LIVE/README.md
@@ -0,0 +1,44 @@
+# LIVE-pytorch
+Towards Layer-wise Image Vectorization
+
+### Updated for rebuttal (Jan/28/2022)： 
+#### User study
+We create a [user study](https://wj.qq.com/s2/9665341/19ed) as suggested. A more complex user study will be added in the revised version.
+
+The results are collected here: [user study details](user_study_state.csv)
+
+#### Code installation
+
+we added  detailed [conda env file](env.yml) and collected detail [system information](system_info.txt) to help the installation.
+
+A more detailed docker and Google Colab demo will be provided.
+
+
+<div align="center">
+  <img src="example.png" width="650px" height="300px">
+</div>
+LIVE is able to explicitly presents a Layer-wise representation for simple images. 
+
+## Installation
+```bash
+pip3 install torch torchvision
+pip install svgwrite
+pip install svgpathtools
+pip install cssutils
+pip install numba
+pip install torch-tools
+pip install visdom
+pip install scikit-fmm
+pip install opencv-python==4.5.4.60 
+pip install easydict
+pip install scikit-fmm
+
+```
+Next, please refer DiffVG to install [pydiffvg](https://github.com/BachiLi/diffvg)
+
+
+## Run
+```bash
+python main.py --config config/all.yaml --experiment experiment_8x1 --signature demo1 --target data/demo1.png
+```
+Please modify the config files to change configurations.
diff --git a/LIVE/colab.py b/LIVE/colab.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd0adc9ab4a11dee6d31e37d2b4c3b7f4fef9d23
--- /dev/null
+++ b/LIVE/colab.py
@@ -0,0 +1,687 @@
+"""
+Here are some use cases:
+python main.py --config config/all.yaml --experiment experiment_8x1 --signature demo1 --target data/demo1.png
+"""
+import pydiffvg
+import torch
+import cv2
+import matplotlib.pyplot as plt
+import random
+import argparse
+import math
+import errno
+from tqdm import tqdm
+from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.nn.functional import adaptive_avg_pool2d
+import warnings
+warnings.filterwarnings("ignore")
+
+import PIL
+import PIL.Image
+import os
+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import shutil
+import copy
+# import skfmm
+from xing_loss import xing_loss
+
+import yaml
+from easydict import EasyDict as edict
+
+
+pydiffvg.set_print_timing(False)
+gamma = 1.0
+
+##########
+# helper #
+##########
+
+from utils import \
+    get_experiment_id, \
+    get_path_schedule, \
+    edict_2_dict, \
+    check_and_create_dir
+
+def get_bezier_circle(radius=1, segments=4, bias=None):
+    points = []
+    if bias is None:
+        bias = (random.random(), random.random())
+    avg_degree = 360 / (segments*3)
+    for i in range(0, segments*3):
+        point = (np.cos(np.deg2rad(i * avg_degree)),
+                    np.sin(np.deg2rad(i * avg_degree)))
+        points.append(point)
+    points = torch.tensor(points)
+    points = (points)*radius + torch.tensor(bias).unsqueeze(dim=0)
+    points = points.type(torch.FloatTensor)
+    return points
+
+def get_sdf(phi, method='skfmm', **kwargs):
+    if method == 'skfmm':
+        import skfmm
+        phi = (phi-0.5)*2
+        if (phi.max() <= 0) or (phi.min() >= 0):
+            return np.zeros(phi.shape).astype(np.float32)
+        sd = skfmm.distance(phi, dx=1)
+
+        flip_negative = kwargs.get('flip_negative', True)
+        if flip_negative:
+            sd = np.abs(sd)
+
+        truncate = kwargs.get('truncate', 10)
+        sd = np.clip(sd, -truncate, truncate)
+        # print(f"max sd value is: {sd.max()}")
+
+        zero2max = kwargs.get('zero2max', True)
+        if zero2max and flip_negative:
+            sd = sd.max() - sd
+        elif zero2max:
+            raise ValueError
+
+        normalize = kwargs.get('normalize', 'sum')
+        if normalize == 'sum':
+            sd /= sd.sum()
+        elif normalize == 'to1':
+            sd /= sd.max()
+        return sd
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument("--config", type=str)
+    parser.add_argument("--experiment", type=str)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--target", type=str, help="target image path")
+    parser.add_argument('--log_dir', metavar='DIR', default="log/debug")
+    parser.add_argument('--initial', type=str, default="random", choices=['random', 'circle'])
+    parser.add_argument('--signature', nargs='+', type=str)
+    parser.add_argument('--seginit', nargs='+', type=str)
+    parser.add_argument("--num_segments", type=int, default=4)
+    # parser.add_argument("--num_paths", type=str, default="1,1,1")
+    # parser.add_argument("--num_iter", type=int, default=500)
+    # parser.add_argument('--free', action='store_true')
+    # Please ensure that image resolution is divisible by pool_size; otherwise the performance would drop a lot.
+    # parser.add_argument('--pool_size', type=int, default=40, help="the pooled image size for next path initialization")
+    # parser.add_argument('--save_loss', action='store_true')
+    # parser.add_argument('--save_init', action='store_true')
+    # parser.add_argument('--save_image', action='store_true')
+    # parser.add_argument('--save_video', action='store_true')
+    # parser.add_argument('--print_weight', action='store_true')
+    # parser.add_argument('--circle_init_radius',  type=float)
+    cfg = edict()
+    args = parser.parse_args()
+    cfg.debug = args.debug
+    cfg.config = args.config
+    cfg.experiment = args.experiment
+    cfg.seed = args.seed
+    cfg.target = args.target
+    cfg.log_dir = args.log_dir
+    cfg.initial = args.initial
+    cfg.signature = args.signature
+    # set cfg num_segments in command
+    cfg.num_segments = args.num_segments
+    if args.seginit is not None:
+        cfg.seginit = edict()
+        cfg.seginit.type = args.seginit[0]
+        if cfg.seginit.type == 'circle':
+            cfg.seginit.radius = float(args.seginit[1])
+    return cfg
+
+def ycrcb_conversion(im, format='[bs x 3 x 2D]', reverse=False):
+    mat = torch.FloatTensor([
+        [ 65.481/255, 128.553/255,  24.966/255], # ranged_from [0, 219/255]
+        [-37.797/255, -74.203/255, 112.000/255], # ranged_from [-112/255, 112/255]
+        [112.000/255, -93.786/255, -18.214/255], # ranged_from [-112/255, 112/255]
+    ]).to(im.device)
+
+    if reverse:
+        mat = mat.inverse()
+
+    if format == '[bs x 3 x 2D]':
+        im = im.permute(0, 2, 3, 1)
+        im = torch.matmul(im, mat.T)
+        im = im.permute(0, 3, 1, 2).contiguous()
+        return im
+    elif format == '[2D x 3]':
+        im = torch.matmul(im, mat.T)
+        return im
+    else:
+        raise ValueError
+
+class random_coord_init():
+    def __init__(self, canvas_size):
+        self.canvas_size = canvas_size
+    def __call__(self):
+        h, w = self.canvas_size
+        return [npr.uniform(0, 1)*w, npr.uniform(0, 1)*h]
+
+class naive_coord_init():
+    def __init__(self, pred, gt, format='[bs x c x 2D]', replace_sampling=True):
+        if isinstance(pred, torch.Tensor):
+            pred = pred.detach().cpu().numpy()
+        if isinstance(gt, torch.Tensor):
+            gt = gt.detach().cpu().numpy()
+
+        if format == '[bs x c x 2D]':
+            self.map = ((pred[0] - gt[0])**2).sum(0)
+        elif format == ['[2D x c]']:
+            self.map = ((pred - gt)**2).sum(-1)
+        else:
+            raise ValueError
+        self.replace_sampling = replace_sampling
+
+    def __call__(self):
+        coord = np.where(self.map == self.map.max())
+        coord_h, coord_w = coord[0][0], coord[1][0]
+        if self.replace_sampling:
+            self.map[coord_h, coord_w] = -1
+        return [coord_w, coord_h]
+
+
+class sparse_coord_init():
+    def __init__(self, pred, gt, format='[bs x c x 2D]', quantile_interval=200, nodiff_thres=0.1):
+        if isinstance(pred, torch.Tensor):
+            pred = pred.detach().cpu().numpy()
+        if isinstance(gt, torch.Tensor):
+            gt = gt.detach().cpu().numpy()
+        if format == '[bs x c x 2D]':
+            self.map = ((pred[0] - gt[0])**2).sum(0)
+            self.reference_gt = copy.deepcopy(
+                np.transpose(gt[0], (1, 2, 0)))
+        elif format == ['[2D x c]']:
+            self.map = (np.abs(pred - gt)).sum(-1)
+            self.reference_gt = copy.deepcopy(gt[0])
+        else:
+            raise ValueError
+        # OptionA: Zero too small errors to avoid the error too small deadloop
+        self.map[self.map < nodiff_thres] = 0
+        quantile_interval = np.linspace(0., 1., quantile_interval)
+        quantized_interval = np.quantile(self.map, quantile_interval)
+        # remove redundant
+        quantized_interval = np.unique(quantized_interval)
+        quantized_interval = sorted(quantized_interval[1:-1])
+        self.map = np.digitize(self.map, quantized_interval, right=False)
+        self.map = np.clip(self.map, 0, 255).astype(np.uint8)
+        self.idcnt = {}
+        for idi in sorted(np.unique(self.map)):
+            self.idcnt[idi] = (self.map==idi).sum()
+        self.idcnt.pop(min(self.idcnt.keys()))
+        # remove smallest one to remove the correct region
+    def __call__(self):
+        if len(self.idcnt) == 0:
+            h, w = self.map.shape
+            return [npr.uniform(0, 1)*w, npr.uniform(0, 1)*h]
+        target_id = max(self.idcnt, key=self.idcnt.get)
+        _, component, cstats, ccenter = cv2.connectedComponentsWithStats(
+            (self.map==target_id).astype(np.uint8), connectivity=4)
+        # remove cid = 0, it is the invalid area
+        csize = [ci[-1] for ci in cstats[1:]]
+        target_cid = csize.index(max(csize))+1
+        center = ccenter[target_cid][::-1]
+        coord = np.stack(np.where(component == target_cid)).T
+        dist = np.linalg.norm(coord-center, axis=1)
+        target_coord_id = np.argmin(dist)
+        coord_h, coord_w = coord[target_coord_id]
+        # replace_sampling
+        self.idcnt[target_id] -= max(csize)
+        if self.idcnt[target_id] == 0:
+            self.idcnt.pop(target_id)
+        self.map[component == target_cid] = 0
+        return [coord_w, coord_h]
+
+
+def init_shapes(num_paths,
+                num_segments,
+                canvas_size,
+                seginit_cfg,
+                shape_cnt,
+                pos_init_method=None,
+                trainable_stroke=False,
+                **kwargs):
+    shapes = []
+    shape_groups = []
+    h, w = canvas_size
+
+    # change path init location
+    if pos_init_method is None:
+        pos_init_method = random_coord_init(canvas_size=canvas_size)
+
+    for i in range(num_paths):
+        num_control_points = [2] * num_segments
+
+        if seginit_cfg.type=="random":
+            points = []
+            p0 = pos_init_method()
+            color_ref = copy.deepcopy(p0)
+            points.append(p0)
+            for j in range(num_segments):
+                radius = seginit_cfg.radius
+                p1 = (p0[0] + radius * npr.uniform(-0.5, 0.5),
+                      p0[1] + radius * npr.uniform(-0.5, 0.5))
+                p2 = (p1[0] + radius * npr.uniform(-0.5, 0.5),
+                      p1[1] + radius * npr.uniform(-0.5, 0.5))
+                p3 = (p2[0] + radius * npr.uniform(-0.5, 0.5),
+                      p2[1] + radius * npr.uniform(-0.5, 0.5))
+                points.append(p1)
+                points.append(p2)
+                if j < num_segments - 1:
+                    points.append(p3)
+                    p0 = p3
+            points = torch.FloatTensor(points)
+
+        # circle points initialization
+        elif seginit_cfg.type=="circle":
+            radius = seginit_cfg.radius
+            if radius is None:
+                radius = npr.uniform(0.5, 1)
+            center = pos_init_method()
+            color_ref = copy.deepcopy(center)
+            points = get_bezier_circle(
+                radius=radius, segments=num_segments,
+                bias=center)
+
+        path = pydiffvg.Path(num_control_points = torch.LongTensor(num_control_points),
+                             points = points,
+                             stroke_width = torch.tensor(0.0),
+                             is_closed = True)
+        shapes.append(path)
+        # !!!!!!problem is here. the shape group shape_ids is wrong
+
+        if 'gt' in kwargs:
+            wref, href = color_ref
+            wref = max(0, min(int(wref), w-1))
+            href = max(0, min(int(href), h-1))
+            fill_color_init = list(gt[0, :, href, wref]) + [1.]
+            fill_color_init = torch.FloatTensor(fill_color_init)
+            stroke_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+        else:
+            fill_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+            stroke_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+
+        path_group = pydiffvg.ShapeGroup(
+            shape_ids = torch.LongTensor([shape_cnt+i]),
+            fill_color = fill_color_init,
+            stroke_color = stroke_color_init,
+        )
+        shape_groups.append(path_group)
+
+    point_var = []
+    color_var = []
+
+    for path in shapes:
+        path.points.requires_grad = True
+        point_var.append(path.points)
+    for group in shape_groups:
+        group.fill_color.requires_grad = True
+        color_var.append(group.fill_color)
+
+    if trainable_stroke:
+        stroke_width_var = []
+        stroke_color_var = []
+        for path in shapes:
+            path.stroke_width.requires_grad = True
+            stroke_width_var.append(path.stroke_width)
+        for group in shape_groups:
+            group.stroke_color.requires_grad = True
+            stroke_color_var.append(group.stroke_color)
+        return shapes, shape_groups, point_var, color_var, stroke_width_var, stroke_color_var
+    else:
+        return shapes, shape_groups, point_var, color_var
+
+class linear_decay_lrlambda_f(object):
+    def __init__(self, decay_every, decay_ratio):
+        self.decay_every = decay_every
+        self.decay_ratio = decay_ratio
+
+    def __call__(self, n):
+        decay_time = n//self.decay_every
+        decay_step = n %self.decay_every
+        lr_s = self.decay_ratio**decay_time
+        lr_e = self.decay_ratio**(decay_time+1)
+        r = decay_step/self.decay_every
+        lr = lr_s * (1-r) + lr_e * r
+        return lr
+
+
+if __name__ == "__main__":
+
+    ###############
+    # make config #
+    ###############
+
+    cfg_arg = parse_args()
+    with open(cfg_arg.config, 'r') as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    cfg_default = edict(cfg['default'])
+    cfg = edict(cfg[cfg_arg.experiment])
+    cfg.update(cfg_default)
+    cfg.update(cfg_arg)
+    cfg.exid = get_experiment_id(cfg.debug)
+
+    cfg.experiment_dir = \
+        osp.join(cfg.log_dir, '{}_{}'.format(cfg.exid, '_'.join(cfg.signature)))
+    configfile = osp.join(cfg.experiment_dir, 'config.yaml')
+    check_and_create_dir(configfile)
+    with open(osp.join(configfile), 'w') as f:
+        yaml.dump(edict_2_dict(cfg), f)
+
+    # Use GPU if available
+    pydiffvg.set_use_gpu(torch.cuda.is_available())
+    device = pydiffvg.get_device()
+
+    gt = np.array(PIL.Image.open(cfg.target))
+    print(f"Input image shape is: {gt.shape}")
+    if len(gt.shape) == 2:
+        print("Converting the gray-scale image to RGB.")
+        gt = gt.unsqueeze(dim=-1).repeat(1,1,3)
+    if gt.shape[2] == 4:
+        print("Input image includes alpha channel, simply dropout alpha channel.")
+        gt = gt[:, :, :3]
+    gt = (gt/255).astype(np.float32)
+    gt = torch.FloatTensor(gt).permute(2, 0, 1)[None].to(device)
+    if cfg.use_ycrcb:
+        gt = ycrcb_conversion(gt)
+    h, w = gt.shape[2:]
+
+    path_schedule = get_path_schedule(**cfg.path_schedule)
+
+    if cfg.seed is not None:
+        random.seed(cfg.seed)
+        npr.seed(cfg.seed)
+        torch.manual_seed(cfg.seed)
+    render = pydiffvg.RenderFunction.apply
+
+    shapes_record, shape_groups_record = [], []
+
+    region_loss = None
+    loss_matrix = []
+
+    para_point, para_color = {}, {}
+    if cfg.trainable.stroke:
+        para_stroke_width, para_stroke_color = {}, {}
+
+    pathn_record = []
+    # Background
+    if cfg.trainable.bg:
+        # meancolor = gt.mean([2, 3])[0]
+        para_bg = torch.tensor([1., 1., 1.], requires_grad=True, device=device)
+    else:
+        if cfg.use_ycrcb:
+            para_bg = torch.tensor([219/255, 0, 0], requires_grad=False, device=device)
+        else:
+            para_bg = torch.tensor([1., 1., 1.], requires_grad=False, device=device)
+
+    ##################
+    # start_training #
+    ##################
+
+    loss_weight = None
+    loss_weight_keep = 0
+    if cfg.coord_init.type == 'naive':
+        pos_init_method = naive_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'sparse':
+        pos_init_method = sparse_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'random':
+        pos_init_method = random_coord_init([h, w])
+    else:
+        raise ValueError
+
+    lrlambda_f = linear_decay_lrlambda_f(cfg.num_iter, 0.4)
+    optim_schedular_dict = {}
+
+    for path_idx, pathn in enumerate(path_schedule):
+        loss_list = []
+        print("=> Adding [{}] paths, [{}] ...".format(pathn, cfg.seginit.type))
+        pathn_record.append(pathn)
+        pathn_record_str = '-'.join([str(i) for i in pathn_record])
+
+        # initialize new shapes related stuffs.
+        if cfg.trainable.stroke:
+            shapes, shape_groups, point_var, color_var, stroke_width_var, stroke_color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=True,
+                gt=gt, )
+            para_stroke_width[path_idx] = stroke_width_var
+            para_stroke_color[path_idx] = stroke_color_var
+        else:
+            shapes, shape_groups, point_var, color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=False,
+                gt=gt, )
+
+        shapes_record += shapes
+        shape_groups_record += shape_groups
+
+        if cfg.save.init:
+            filename = os.path.join(
+                cfg.experiment_dir, "svg-init",
+                "{}-init.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(
+                filename, w, h,
+                shapes_record, shape_groups_record)
+
+        para = {}
+        if (cfg.trainable.bg) and (path_idx == 0):
+            para['bg'] = [para_bg]
+        para['point'] = point_var
+        para['color'] = color_var
+        if cfg.trainable.stroke:
+            para['stroke_width'] = stroke_width_var
+            para['stroke_color'] = stroke_color_var
+
+        pg = [{'params' : para[ki], 'lr' : cfg.lr_base[ki]} for ki in sorted(para.keys())]
+        optim = torch.optim.Adam(pg)
+
+        if cfg.trainable.record:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=-1)
+        else:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=cfg.num_iter)
+        optim_schedular_dict[path_idx] = (optim, scheduler)
+
+        # Inner loop training
+        t_range = tqdm(range(cfg.num_iter))
+        for t in t_range:
+
+            for _, (optim, _) in optim_schedular_dict.items():
+                optim.zero_grad()
+
+            # Forward pass: render the image.
+            scene_args = pydiffvg.RenderFunction.serialize_scene(
+                w, h, shapes_record, shape_groups_record)
+            img = render(w, h, 2, 2, t, None, *scene_args)
+
+            # Compose img with white background
+            img = img[:, :, 3:4] * img[:, :, :3] + \
+                para_bg * (1 - img[:, :, 3:4])
+
+            if cfg.save.video:
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png",
+                    "{}-iter{}.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                if cfg.use_ycrcb:
+                    imshow = ycrcb_conversion(
+                        img, format='[2D x 3]', reverse=True).detach().cpu()
+                else:
+                    imshow = img.detach().cpu()
+                pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+            x = img.unsqueeze(0).permute(0, 3, 1, 2) # HWC -> NCHW
+
+            if cfg.use_ycrcb:
+                color_reweight = torch.FloatTensor([255/219, 255/224, 255/255]).to(device)
+                loss = ((x-gt)*(color_reweight.view(1, -1, 1, 1)))**2
+            else:
+                loss = ((x-gt)**2)
+
+            if cfg.loss.use_l1_loss:
+                loss = abs(x-gt)
+
+            if cfg.loss.use_distance_weighted_loss:
+                if cfg.use_ycrcb:
+                    raise ValueError
+                shapes_forsdf = copy.deepcopy(shapes)
+                shape_groups_forsdf = copy.deepcopy(shape_groups)
+                for si in shapes_forsdf:
+                    si.stroke_width = torch.FloatTensor([0]).to(device)
+                for sg_idx, sgi in enumerate(shape_groups_forsdf):
+                    sgi.fill_color = torch.FloatTensor([1, 1, 1, 1]).to(device)
+                    sgi.shape_ids = torch.LongTensor([sg_idx]).to(device)
+
+                sargs_forsdf = pydiffvg.RenderFunction.serialize_scene(
+                    w, h, shapes_forsdf, shape_groups_forsdf)
+                with torch.no_grad():
+                    im_forsdf = render(w, h, 2, 2, 0, None, *sargs_forsdf)
+                # use alpha channel is a trick to get 0-1 image
+                im_forsdf = (im_forsdf[:, :, 3]).detach().cpu().numpy()
+                loss_weight = get_sdf(im_forsdf, normalize='to1')
+                loss_weight += loss_weight_keep
+                loss_weight = np.clip(loss_weight, 0, 1)
+                loss_weight = torch.FloatTensor(loss_weight).to(device)
+
+            if cfg.save.loss:
+                save_loss = loss.squeeze(dim=0).mean(dim=0,keepdim=False).cpu().detach().numpy()
+                save_weight = loss_weight.cpu().detach().numpy()
+                save_weighted_loss = save_loss*save_weight
+                # normalize to [0,1]
+                save_loss = (save_loss - np.min(save_loss))/np.ptp(save_loss)
+                save_weight = (save_weight - np.min(save_weight))/np.ptp(save_weight)
+                save_weighted_loss = (save_weighted_loss - np.min(save_weighted_loss))/np.ptp(save_weighted_loss)
+
+                # save
+                plt.imshow(save_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-mseloss.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weight, cmap='Greys')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-sdfweight.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weighted_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-weightedloss.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+
+
+
+
+            if loss_weight is None:
+                loss = loss.sum(1).mean()
+            else:
+                loss = (loss.sum(1)*loss_weight).mean()
+
+            # if (cfg.loss.bis_loss_weight is not None)  and (cfg.loss.bis_loss_weight > 0):
+            #     loss_bis = bezier_intersection_loss(point_var[0]) * cfg.loss.bis_loss_weight
+            #     loss = loss + loss_bis
+            if (cfg.loss.xing_loss_weight is not None) \
+                    and (cfg.loss.xing_loss_weight > 0):
+                loss_xing = xing_loss(point_var) * cfg.loss.xing_loss_weight
+                loss = loss + loss_xing
+
+
+            loss_list.append(loss.item())
+            t_range.set_postfix({'loss': loss.item()})
+            loss.backward()
+
+            # step
+            for _, (optim, scheduler) in optim_schedular_dict.items():
+                optim.step()
+                scheduler.step()
+
+            for group in shape_groups_record:
+                group.fill_color.data.clamp_(0.0, 1.0)
+
+        if cfg.loss.use_distance_weighted_loss:
+            loss_weight_keep = loss_weight.detach().cpu().numpy() * 1
+
+        if not cfg.trainable.record:
+            for _, pi in pg.items():
+                for ppi in pi:
+                    pi.require_grad = False
+            optim_schedular_dict = {}
+
+        if cfg.save.image:
+            filename = os.path.join(
+                cfg.experiment_dir, "demo-png", "{}.png".format(pathn_record_str))
+            check_and_create_dir(filename)
+            if cfg.use_ycrcb:
+                imshow = ycrcb_conversion(
+                    img, format='[2D x 3]', reverse=True).detach().cpu()
+            else:
+                imshow = img.detach().cpu()
+            pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+        if cfg.save.output:
+            filename = os.path.join(
+                cfg.experiment_dir, "output-svg", "{}.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(filename, w, h, shapes_record, shape_groups_record)
+
+        loss_matrix.append(loss_list)
+
+        # calculate the pixel loss
+        # pixel_loss = ((x-gt)**2).sum(dim=1, keepdim=True).sqrt_() # [N,1,H, W]
+        # region_loss = adaptive_avg_pool2d(pixel_loss, cfg.region_loss_pool_size)
+        # loss_weight = torch.softmax(region_loss.reshape(1, 1, -1), dim=-1)\
+        #     .reshape_as(region_loss)
+
+        pos_init_method = naive_coord_init(x, gt)
+
+        if cfg.coord_init.type == 'naive':
+            pos_init_method = naive_coord_init(x, gt)
+        elif cfg.coord_init.type == 'sparse':
+            pos_init_method = sparse_coord_init(x, gt)
+        elif cfg.coord_init.type == 'random':
+            pos_init_method = random_coord_init([h, w])
+        else:
+            raise ValueError
+
+        if cfg.save.video:
+            print("saving iteration video...")
+            img_array = []
+            for ii in range(0, cfg.num_iter):
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png", 
+                    "{}-iter{}.png".format(pathn_record_str, ii))
+                img = cv2.imread(filename)
+                # cv2.putText(
+                #     img, "Path:{} \nIteration:{}".format(pathn_record_str, ii), 
+                #     (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
+                img_array.append(img)
+
+            videoname = os.path.join(
+                cfg.experiment_dir, "video-mp4",
+                "{}.mp4".format(pathn_record_str))
+            check_and_create_dir(videoname)
+            out = cv2.VideoWriter(
+                videoname, 
+                cv2.VideoWriter_fourcc(*'mp4v'),
+                # cv2.VideoWriter_fourcc(*'FFV1'),
+                20.0, (w, h))
+            for iii in range(len(img_array)):
+                out.write(img_array[iii])
+            out.release()
+            # shutil.rmtree(os.path.join(cfg.experiment_dir, "video-png"))
+
+    print("The last loss is: {}".format(loss.item()))
diff --git a/LIVE/env.yml b/LIVE/env.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4c47783a63fc5f2920b628bdf85dba6e2cfbd50c
--- /dev/null
+++ b/LIVE/env.yml
@@ -0,0 +1,164 @@
+name: live
+channels:
+  - pytorch
+  - anaconda
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - blas=1.0=mkl
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2021.5.30=ha878542_0
+  - certifi=2021.5.30=py37h06a4308_0
+  - cloudpickle=1.6.0=py_0
+  - cmake=3.18.2=ha30ef3c_0
+  - cudatoolkit=10.2.89=hfd86e86_1
+  - cycler=0.10.0=py37_0
+  - cytoolz=0.11.0=py37h7b6447c_0
+  - dask-core=2021.6.2=pyhd3eb1b0_0
+  - decorator=5.0.9=pyhd3eb1b0_0
+  - expat=2.2.10=he6710b0_2
+  - ffmpeg=4.3=hf484d3e_0
+  - freetype=2.10.4=h5ab3b9f_0
+  - gmp=6.2.1=h2531618_2
+  - gnutls=3.6.15=he1e5248_0
+  - imageio=2.9.0=pyhd3eb1b0_0
+  - intel-openmp=2021.2.0=h06a4308_610
+  - jpeg=9b=h024ee3a_2
+  - kiwisolver=1.3.1=py37h2531618_0
+  - krb5=1.18.2=h173b8e3_0
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.35.1=h7274673_9
+  - libcurl=7.71.1=h20c2e04_1
+  - libedit=3.1.20191231=h14c3975_1
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgfortran-ng=7.5.0=ha8ba4b0_17
+  - libgfortran4=7.5.0=ha8ba4b0_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libiconv=1.15=h63c8f33_5
+  - libidn2=2.3.1=h27cfd23_0
+  - libpng=1.6.37=hbc83047_0
+  - libssh2=1.9.0=h1ba5d50_1
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - libtasn1=4.16.0=h27cfd23_0
+  - libtiff=4.2.0=h85742a9_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuv=1.40.0=h7b6447c_0
+  - libwebp-base=1.2.0=h27cfd23_0
+  - locket=0.2.1=py37h06a4308_1
+  - lz4-c=1.9.3=h2531618_0
+  - matplotlib-base=3.3.4=py37h62a2d02_0
+  - mkl=2021.2.0=h06a4308_296
+  - mkl-service=2.3.0=py37h27cfd23_1
+  - mkl_fft=1.3.0=py37h42c9631_2
+  - mkl_random=1.2.1=py37ha9443f7_2
+  - ncurses=6.2=he6710b0_1
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=2.2=py37_1
+  - ninja=1.10.2=hff7bd54_1
+  - numpy=1.20.2=py37h2d18471_0
+  - numpy-base=1.20.2=py37hfae3a4d_0
+  - olefile=0.46=py37_0
+  - openh264=2.1.0=hd408876_0
+  - openssl=1.1.1k=h27cfd23_0
+  - partd=1.2.0=pyhd3eb1b0_0
+  - pillow=8.2.0=py37he98fc37_0
+  - pip=21.1.3=py37h06a4308_0
+  - pyparsing=2.4.7=pyhd3eb1b0_0
+  - python=3.7.10=h12debd9_4
+  - python-dateutil=2.8.1=pyhd3eb1b0_0
+  - pytorch=1.9.0=py3.7_cuda10.2_cudnn7.6.5_0
+  - pywavelets=1.1.1=py37h7b6447c_2
+  - pyyaml=5.4.1=py37h27cfd23_1
+  - readline=8.1=h27cfd23_0
+  - rhash=1.4.0=h1ba5d50_0
+  - scikit-image=0.18.1=py37ha9443f7_0
+  - scipy=1.6.2=py37had2a1c9_1
+  - setuptools=52.0.0=py37h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_0
+  - sqlite=3.36.0=hc218d9a_0
+  - tifffile=2020.10.1=py37hdd07704_2
+  - tk=8.6.10=hbc83047_0
+  - toolz=0.11.1=pyhd3eb1b0_0
+  - torchvision=0.10.0=py37_cu102
+  - tornado=6.1=py37h27cfd23_0
+  - typing_extensions=3.10.0.0=pyh06a4308_0
+  - wheel=0.36.2=pyhd3eb1b0_0
+  - xz=5.2.5=h7b6447c_0
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.11=h7b6447c_3
+  - zstd=1.4.5=h9ceee32_0
+  - pip:
+    - absl-py==0.13.0
+    - aiohttp==3.7.4.post0
+    - async-timeout==3.0.1
+    - attrs==21.2.0
+    - cachetools==4.2.2
+    - cffi==1.14.5
+    - chardet==4.0.0
+    - coloredlogs==15.0.1
+    - cssutils==2.3.0
+    - diffvg==0.0.1
+    - easydict==1.9
+    - einops==0.3.0
+    - fsspec==2021.6.1
+    - future==0.18.2
+    - google-auth==1.32.1
+    - google-auth-oauthlib==0.4.4
+    - greenlet==1.1.0
+    - grpcio==1.38.1
+    - humanfriendly==9.2
+    - idna==2.10
+    - imageio-ffmpeg==0.4.4
+    - importlib-metadata==4.6.0
+    - jinja2==3.0.1
+    - jsonpatch==1.32
+    - jsonpointer==2.1
+    - kornia==0.1.4
+    - llvmlite==0.36.0
+    - markdown==3.3.4
+    - markupsafe==2.0.1
+    - multidict==5.1.0
+    - numba==0.53.1
+    - oauthlib==3.1.1
+    - opencv-python==4.5.3.56
+    - packaging==20.9
+    - pandas==1.3.0
+    - protobuf==3.17.3
+    - pyaml==20.4.0
+    - pyasn1==0.4.8
+    - pyasn1-modules==0.2.8
+    - pybind11==2.6.2
+    - pycparser==2.20
+    - pydeprecate==0.3.0
+    - pypng==0.0.20
+    - pytorch-lightning==1.3.8
+    - pytorch-ranger==0.1.1
+    - pytz==2021.1
+    - pyzmq==22.1.0
+    - requests==2.25.1
+    - requests-oauthlib==1.3.0
+    - rsa==4.7.2
+    - scikit-fmm==2021.10.29
+    - seaborn==0.11.1
+    - sqlalchemy==1.4.20
+    - svgpathtools==1.4.1
+    - svgwrite==1.4.1
+    - tensorboard==2.4.1
+    - tensorboard-plugin-wit==1.8.0
+    - torch-optimizer==0.0.1a15
+    - torch-tools==0.1.5
+    - torchfile==0.1.0
+    - torchmetrics==0.4.0
+    - tqdm==4.61.1
+    - urllib3==1.26.6
+    - visdom==0.1.8.9
+    - websocket-client==1.1.0
+    - werkzeug==2.0.1
+    - yarl==1.6.3
+    - zipp==3.4.1
+prefix: /home/UserName/.conda/envs/live
+
diff --git a/LIVE/example.png b/LIVE/example.png
new file mode 100644
index 0000000000000000000000000000000000000000..9288df7e77f73ea6e0fc21cf0ef7647ff2193236
Binary files /dev/null and b/LIVE/example.png differ
diff --git a/LIVE/system_info.txt b/LIVE/system_info.txt
new file mode 100644
index 0000000000000000000000000000000000000000..18bb1d608f0c17366741293849b456bac68e57af
--- /dev/null
+++ b/LIVE/system_info.txt
@@ -0,0 +1 @@
+{'sys.platform': 'linux', 'Python': '3.7.10 (default, Jun  4 2021, 14:48:32) [GCC 7.5.0]', 'CUDA available': True, 'GPU 0': 'Tesla V100-SXM2-32GB', 'GCC': 'gcc (GCC) 8.1.0', 'PyTorch': '1.9.0', 'PyTorch compiling details': 'PyTorch built with:\n  - GCC 7.3\n  - C++ Version: 201402\n  - Intel(R) oneAPI Math Kernel Library Version 2021.2-Product Build 20210312 for Intel(R) 64 architecture applications\n  - Intel(R) MKL-DNN v2.1.2 (Git Hash 98be7e8afa711dc9b66c8ff3504129cb82013cdb)\n  - OpenMP 201511 (a.k.a. OpenMP 4.5)\n  - NNPACK is enabled\n  - CPU capability usage: AVX2\n  - CUDA Runtime 10.2\n  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37\n  - CuDNN 7.6.5\n  - Magma 2.5.2\n  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=10.2, CUDNN_VERSION=7.6.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n', 'TorchVision': '0.10.0'}
\ No newline at end of file
diff --git a/LIVE/user_study_state.csv b/LIVE/user_study_state.csv
new file mode 100644
index 0000000000000000000000000000000000000000..c400fb4087c5ce85e71dab8149ffdc0fc87efc6e
--- /dev/null
+++ b/LIVE/user_study_state.csv
@@ -0,0 +1,148 @@
+﻿Page 1,,
+,,
+"1. Please carefully select the method that best rebuilds the original image ""progressively""", showing a human-like interpretation.,
+Option,Percentage%,Count
+DiffVG,20.00%,4
+Painting,25.00%,5
+LIVE,55.00%,11
+Total,,20
+,,
+2. Same question,,
+Option,Percentage%,Count
+DiffVG,25.00%,5
+Painting,15.00%,3
+LIVE,60.00%,12
+Total,,20
+,,
+3. Same question,,
+Option,Percentage%,Count
+DiffVG,10.00%,2
+Painting,10.00%,2
+LIVE,80.00%,16
+Total,,20
+,,
+4. Same question,,
+Option,Percentage%,Count
+DiffVG,40.00%,8
+Painting,0.00%,0
+LIVE,60.00%,12
+Total,,20
+,,
+5. Same question,,
+Option,Percentage%,Count
+DiffVG,20.00%,4
+Painting,5.00%,1
+LIVE,75.00%,15
+Total,,20
+,,
+6. Same Question,,
+Option,Percentage%,Count
+DiffVG,20.00%,4
+Painting,15.00%,3
+LIVE,65.00%,13
+Total,,20
+,,
+7. Same question,,
+Option,Percentage%,Count
+DiffVG,5.00%,1
+Painting,10.00%,2
+LIVE,85.00%,17
+Total,,20
+,,
+8. Same question,,
+Option,Percentage%,Count
+DiffVG,25.00%,5
+Painting,10.00%,2
+LIVE,65.00%,13
+Total,,20
+,,
+9. Same question,,
+Option,Percentage%,Count
+DiffVG,15.00%,3
+Painting,5.00%,1
+LIVE,80.00%,16
+Total,,20
+,,
+10. Same question,,
+Option,Percentage%,Count
+DiffVG,25.00%,5
+Painting,5.00%,1
+LIVE,70.00%,14
+Total,,20
+,,
+11. Same question,,
+Option,Percentage%,Count
+DiffVG,10.00%,2
+Painting,15.00%,3
+LIVE,75.00%,15
+Total,,20
+,,
+12. Same question,,
+Option,Percentage%,Count
+DiffVG,15.00%,3
+Painting,10.00%,2
+LIVE,75.00%,15
+Total,,20
+,,
+13. Same question,,
+Option,Percentage%,Count
+DiffVG,25.00%,5
+Painting,15.00%,3
+LIVE,60.00%,12
+Total,,20
+,,
+14. Same question,,
+Option,Percentage%,Count
+DiffVG,5.00%,1
+Painting,15.00%,3
+LIVE,80.00%,16
+Total,,20
+,,
+15. Same question,,
+Option,Percentage%,Count
+DiffVG,40.00%,8
+Painting,5.00%,1
+LIVE,55.00%,11
+Total,,20
+,,
+16. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
+,,
+17. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
+,,
+18. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
+,,
+19. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
+,,
+20. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
+,,
+21. Same question,,
+Option,Percentage%,Count
+DiffVG,0.00%,0
+Painting,15.00%,3
+LIVE,85.00%,17
+Total,,20
\ No newline at end of file
diff --git a/README.md b/README.md
index 9965e544ab7fae842de8f38a6f1490bb10a0a77b..e53462aba16e23c6ac5a3a178a02636f8bf8e76b 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
 ---
 title: LIVE
-emoji: 📈
-colorFrom: purple
-colorTo: pink
+emoji: 📊
+colorFrom: pink
+colorTo: indigo
 sdk: gradio
-sdk_version: 3.0.13
+sdk_version: 2.9.1
 app_file: app.py
 pinned: false
-license: mit
+license: gpl-3.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
+
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b871b92efc87bfec551a82ef42a7963f168b2b1b
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,2 @@
+__author__ = "Xu Ma"
+__email__ = "ma.xu1@northeastern.edu"
diff --git a/aabb.h b/aabb.h
new file mode 100644
index 0000000000000000000000000000000000000000..c35968e113188e1503e61c1eff3ec346161cf025
--- /dev/null
+++ b/aabb.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "diffvg.h"
+#include "cuda_utils.h"
+#include "vector.h"
+#include "matrix.h"
+
+struct AABB {
+    DEVICE
+    inline AABB(const Vector2f &p_min = Vector2f{infinity<float>(), infinity<float>()},
+                const Vector2f &p_max = Vector2f{-infinity<float>(), -infinity<float>()})
+        : p_min(p_min), p_max(p_max) {}
+    Vector2f p_min, p_max;
+};
+
+DEVICE
+inline
+AABB merge(const AABB &box, const Vector2f &p) {
+    return AABB{Vector2f{min(p.x, box.p_min.x), min(p.y, box.p_min.y)},
+                Vector2f{max(p.x, box.p_max.x), max(p.y, box.p_max.y)}};
+}
+
+DEVICE
+inline
+AABB merge(const AABB &box0, const AABB &box1) {
+    return AABB{Vector2f{min(box0.p_min.x, box1.p_min.x), min(box0.p_min.y, box1.p_min.y)},
+                Vector2f{max(box0.p_max.x, box1.p_max.x), max(box0.p_max.y, box1.p_max.y)}};
+}
+
+DEVICE
+inline
+bool inside(const AABB &box, const Vector2f &p) {
+    return p.x >= box.p_min.x && p.x <= box.p_max.x &&
+           p.y >= box.p_min.y && p.y <= box.p_max.y;
+}
+
+DEVICE
+inline
+bool inside(const AABB &box, const Vector2f &p, float radius) {
+    return p.x >= box.p_min.x - radius && p.x <= box.p_max.x + radius &&
+           p.y >= box.p_min.y - radius && p.y <= box.p_max.y + radius;
+}
+
+DEVICE
+inline
+AABB enlarge(const AABB &box, float width) {
+    return AABB{Vector2f{box.p_min.x - width, box.p_min.y - width},
+                Vector2f{box.p_max.x + width, box.p_max.y + width}};
+}
+
+DEVICE
+inline
+AABB transform(const Matrix3x3f &xform, const AABB &box) {
+    auto ret = AABB();
+    ret = merge(ret, xform_pt(xform, Vector2f{box.p_min.x, box.p_min.y}));
+    ret = merge(ret, xform_pt(xform, Vector2f{box.p_min.x, box.p_max.y}));
+    ret = merge(ret, xform_pt(xform, Vector2f{box.p_max.x, box.p_min.y}));
+    ret = merge(ret, xform_pt(xform, Vector2f{box.p_max.x, box.p_max.y}));
+    return ret;
+}
+
+DEVICE
+inline
+bool within_distance(const AABB &box, const Vector2f &pt, float r) {
+    return pt.x >= box.p_min.x - r && pt.x <= box.p_max.x + r &&
+           pt.y >= box.p_min.y - r && pt.y <= box.p_max.y + r;
+}
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d869df383fe6caf91cb40fe535d96cb1caa5ea4
--- /dev/null
+++ b/app.py
@@ -0,0 +1,375 @@
+import os
+os.system('python setup.py install --user')
+import argparse
+import csv
+import numpy as np
+import sys
+sys.path.append("/home/user/.local/lib/python3.8/site-packages/diffvg-0.0.1-py3.8-linux-x86_64.egg")
+print(sys.path)
+from pathlib import Path
+
+import gradio as gr
+
+import torch
+import yaml
+from PIL import Image
+from subprocess import call
+import torch
+import cv2
+import matplotlib.pyplot as plt
+import random
+import argparse
+import math
+import errno
+from tqdm import tqdm
+import yaml
+from easydict import EasyDict as edict
+
+
+def run_cmd(command):
+    try:
+        print(command)
+        call(command, shell=True)
+    except KeyboardInterrupt:
+        print("Process interrupted")
+        sys.exit(1)
+# run_cmd("gcc --version")
+# run_cmd("pwd")
+# run_cmd("ls")
+# run_cmd("git submodule update --init --recursive")
+# run_cmd("python setup.py install --user")
+# run_cmd("pip3 list")
+# import pydiffvg
+#
+# print("Sccuessfuly import diffvg ")
+# run_cmd("pwd")
+# run_cmd("ls")
+# run_cmd("git submodule update --init --recursive")
+# run_cmd("python setup.py install --user")
+
+# run_cmd("python main.py --config config/base.yaml --experiment experiment_5x1 --signature smile --target figures/smile.png --log_dir log/")
+from main import main_func
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument("--config", default="config/base.yaml", type=str)
+    parser.add_argument("--experiment", type=str)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--target", type=str, help="target image path")
+    parser.add_argument('--log_dir', metavar='DIR', default="log/")
+    parser.add_argument('--initial', type=str, default="random", choices=['random', 'circle'])
+    parser.add_argument('--signature', default="demo", nargs='+', type=str)
+    parser.add_argument('--seginit', nargs='+', type=str)
+    parser.add_argument("--num_segments", type=int, default=4)
+    # parser.add_argument("--num_paths", type=str, default="1,1,1")
+    # parser.add_argument("--num_iter", type=int, default=500)
+    # parser.add_argument('--free', action='store_true')
+    # Please ensure that image resolution is divisible by pool_size; otherwise the performance would drop a lot.
+    # parser.add_argument('--pool_size', type=int, default=40, help="the pooled image size for next path initialization")
+    # parser.add_argument('--save_loss', action='store_true')
+    # parser.add_argument('--save_init', action='store_true')
+    # parser.add_argument('--save_image', action='store_true')
+    # parser.add_argument('--save_video', action='store_true')
+    # parser.add_argument('--print_weight', action='store_true')
+    # parser.add_argument('--circle_init_radius',  type=float)
+    cfg = edict()
+    args = parser.parse_args()
+    cfg.debug = args.debug
+    cfg.config = args.config
+    cfg.experiment = args.experiment
+    cfg.seed = args.seed
+    cfg.target = args.target
+    cfg.log_dir = args.log_dir
+    cfg.initial = args.initial
+    cfg.signature = args.signature
+    # set cfg num_segments in command
+    cfg.num_segments = args.num_segments
+    if args.seginit is not None:
+        cfg.seginit = edict()
+        cfg.seginit.type = args.seginit[0]
+        if cfg.seginit.type == 'circle':
+            cfg.seginit.radius = float(args.seginit[1])
+    return cfg
+
+
+def app_experiment_change(experiment_id):
+    if experiment_id == "add [1] total 1 path for demonstration":
+        return "experiment_1x1"
+    if experiment_id == "add [1, 1, 1, 1, 1] total 5 paths one by one":
+        return "experiment_5x1"
+    elif experiment_id == "add [1, 1, 1, 1, 1, 1, 1, 1] total 8 paths one by one":
+        return "experiment_8x1"
+    elif experiment_id == "add [1,2,4,8,16,32, ...] total 128 paths":
+        return "experiment_exp2_128"
+    elif experiment_id == "add [1,2,4,8,16,32, ...] total 256 paths":
+        return "experiment_exp2_256"
+
+
+cfg_arg = parse_args()
+temp_image = np.random.rand(224,224,3)
+temp_text = "start"
+temp_input = np.random.rand(224,224,3)
+def run_live(img, experiment_id, num_iter, cfg_arg=cfg_arg):
+    experiment = app_experiment_change(experiment_id)
+    cfg_arg.target = img
+    cfg_arg.experiment = experiment
+    img, text = main_func(img, experiment_id, num_iter, cfg_arg=cfg_arg)
+    return img, text
+
+
+
+
+
+
+
+
+
+# ROOT_PATH = sys.path[0]  # 根目录
+# # 模型路径
+# model_path = "ultralytics/yolov5"
+# # 模型名称临时变量
+# model_name_tmp = ""
+# # 设备临时变量
+# device_tmp = ""
+# # 文件后缀
+# suffix_list = [".csv", ".yaml"]
+# def parse_args(known=False):
+#     parser = argparse.ArgumentParser(description="Gradio LIVE")
+#     parser.add_argument(
+#         "--model_name", "-mn", default="yolov5s", type=str, help="model name"
+#     )
+#     parser.add_argument(
+#         "--model_cfg",
+#         "-mc",
+#         default="./model_config/model_name_p5_all.yaml",
+#         type=str,
+#         help="model config",
+#     )
+#     parser.add_argument(
+#         "--cls_name",
+#         "-cls",
+#         default="./cls_name/cls_name.yaml",
+#         type=str,
+#         help="cls name",
+#     )
+#     parser.add_argument(
+#         "--nms_conf",
+#         "-conf",
+#         default=0.5,
+#         type=float,
+#         help="model NMS confidence threshold",
+#     )
+#     parser.add_argument(
+#         "--nms_iou", "-iou", default=0.45, type=float, help="model NMS IoU threshold"
+#     )
+#
+#     parser.add_argument(
+#         "--label_dnt_show",
+#         "-lds",
+#         action="store_false",
+#         default=True,
+#         help="label show",
+#     )
+#     parser.add_argument(
+#         "--device",
+#         "-dev",
+#         default="cpu",
+#         type=str,
+#         help="cuda or cpu, hugging face only cpu",
+#     )
+#     parser.add_argument(
+#         "--inference_size", "-isz", default=640, type=int, help="model inference size"
+#     )
+#
+#     args = parser.parse_known_args()[0] if known else parser.parse_args()
+#     return args
+# #  模型加载
+# def model_loading(model_name, device):
+#
+#     # 加载本地模型
+#     model = torch.hub.load(model_path, model_name, force_reload=True, device=device)
+#
+#     return model
+# # 检测信息
+# def export_json(results, model, img_size):
+#
+#     return [
+#         [
+#             {
+#                 "id": int(i),
+#                 "class": int(result[i][5]),
+#                 "class_name": model.model.names[int(result[i][5])],
+#                 "normalized_box": {
+#                     "x0": round(result[i][:4].tolist()[0], 6),
+#                     "y0": round(result[i][:4].tolist()[1], 6),
+#                     "x1": round(result[i][:4].tolist()[2], 6),
+#                     "y1": round(result[i][:4].tolist()[3], 6),
+#                 },
+#                 "confidence": round(float(result[i][4]), 2),
+#                 "fps": round(1000 / float(results.t[1]), 2),
+#                 "width": img_size[0],
+#                 "height": img_size[1],
+#             }
+#             for i in range(len(result))
+#         ]
+#         for result in results.xyxyn
+#     ]
+# def yolo_det(img, experiment_id, device=None, model_name=None, inference_size=None, conf=None, iou=None, label_opt=None, model_cls=None):
+#
+#     global model, model_name_tmp, device_tmp
+#
+#     if model_name_tmp != model_name:
+#         # 模型判断，避免反复加载
+#         model_name_tmp = model_name
+#         model = model_loading(model_name_tmp, device)
+#     elif device_tmp != device:
+#         device_tmp = device
+#         model = model_loading(model_name_tmp, device)
+#
+#     # -----------模型调参-----------
+#     model.conf = conf  # NMS 置信度阈值
+#     model.iou = iou  # NMS IOU阈值
+#     model.max_det = 1000  # 最大检测框数
+#     model.classes = model_cls  # 模型类别
+#
+#     results = model(img, size=inference_size)  # 检测
+#     results.render(labels=label_opt)  # 渲染
+#
+#     det_img = Image.fromarray(results.imgs[0])  # 检测图片
+#
+#     det_json = export_json(results, model, img.size)[0]  # 检测信息
+#
+#     return det_img, det_json
+
+
+# def run_cmd(command):
+#     try:
+#         print(command)
+#         call(command, shell=True)
+#     except KeyboardInterrupt:
+#         print("Process interrupted")
+#         sys.exit(1)
+#
+# run_cmd("gcc --version")
+# run_cmd("pwd")
+# run_cmd("ls")
+# run_cmd("git submodule update --init --recursive")
+# run_cmd("python setup.py install --user")
+# run_cmd("ls")
+# run_cmd("python main.py --config config/base.yaml --experiment experiment_5x1 --signature smile --target figures/smile.png --log_dir log/")
+
+
+
+
+
+
+# # yaml文件解析
+# def yaml_parse(file_path):
+#     return yaml.safe_load(open(file_path, "r", encoding="utf-8").read())
+#
+#
+# # yaml csv 文件解析
+# def yaml_csv(file_path, file_tag):
+#     file_suffix = Path(file_path).suffix
+#     if file_suffix == suffix_list[0]:
+#         # 模型名称
+#         file_names = [i[0] for i in list(csv.reader(open(file_path)))]  # csv版
+#     elif file_suffix == suffix_list[1]:
+#         # 模型名称
+#         file_names = yaml_parse(file_path).get(file_tag)  # yaml版
+#     else:
+#         print(f"{file_path}格式不正确！程序退出！")
+#         sys.exit()
+#
+#     return file_names
+
+
+def main(args):
+    gr.close_all()
+    # -------------------Inputs-------------------
+    inputs_iteration = gr.inputs.Slider(
+        label="Optimization Iteration",
+        default=500, maximum=600, minimum=100, step=100)
+    inputs_img = gr.inputs.Image(type="pil", label="Input Image", shape=[160, 160])
+    experiment_id = gr.inputs.Radio(
+        choices=[
+            "add [1] total 1 path for demonstration",
+            "add [1, 1, 1, 1, 1] total 5 paths one by one",
+            "add [1, 1, 1, 1, 1, 1, 1, 1] total 8 paths one by one",
+            "add [1,2,4,8,16,32, ...] total 128 paths",
+            "add [1,2,4,8,16,32, ...] total 256 paths"], type="value", default="add [1, 1, 1, 1, 1] total 5 paths one by one", label="Path Adding Scheduler"
+    )
+
+    # inputs
+    inputs = [
+
+        inputs_img,  # input image
+        experiment_id, # path adding scheduler
+        inputs_iteration, # input iteration
+
+    ]
+    # outputs
+    outputs = gr.outputs.Image(type="numpy", label="Vectorized Image")
+    outputs02 = gr.outputs.File(label="Generated SVG output")
+
+    # title
+    title = "LIVE: Towards Layer-wise Image Vectorization"
+    # description
+    description = "<div align='center'>(CVPR 2022 Oral Presentation)</div>" \
+                  "<div align='center'>Without GPUs, LIVE will cost longer time.</div>" \
+                  "<div align='center'>For efficiency, we rescale input to 160x160 (smaller size and fewer iterations will decrease the reconstructions).</div> "
+
+    # examples
+    examples = [
+        [
+            "./examples/1.png",
+            "add [1] total 1 path for demonstration",
+            100,
+        ],
+        [
+            "./examples/2.png",
+            "add [1, 1, 1, 1, 1] total 5 paths one by one",
+            300,
+        ],
+        [
+            "./examples/3.jpg",
+            "add [1,2,4,8,16,32, ...] total 128 paths",
+            300,
+        ],
+        [
+            "./examples/4.png",
+            "add [1,2,4,8,16,32, ...] total 256 paths",
+            300,
+        ],
+        [
+            "./examples/5.png",
+            "add [1, 1, 1, 1, 1] total 5 paths one by one",
+            300,
+        ],
+    ]
+
+    # Interface
+    gr.Interface(
+        fn=run_live,
+        inputs=inputs,
+        outputs=[outputs, outputs02],
+        title=title,
+        description=description,
+        examples=examples,
+        theme="seafoam",
+        # live=True, # 实时变更输出
+        flagging_dir="log"  # 输出目录
+        # ).launch(inbrowser=True, auth=['admin', 'admin'])
+    ).launch(
+        inbrowser=True,  # 自动打开默认浏览器
+        show_tips=True,  # 自动显示gradio最新功能
+        enable_queue=True
+        # favicon_path="./icon/logo.ico",
+    )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/atomic.cpp b/atomic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c642b9b84357a10f2155d28324517f36d00b0cb
--- /dev/null
+++ b/atomic.cpp
@@ -0,0 +1,27 @@
+//A hacky solution to get around the Ellipse include
+
+#ifdef WIN32
+#include <windows.h>
+#include <cstdint>
+
+float win_atomic_add(float &target, float source) {
+	union { int i; float f; } old_val;
+	union { int i; float f; } new_val;
+	do {
+		old_val.f = target;
+		new_val.f = old_val.f + (float)source;
+	} while (InterlockedCompareExchange((LONG*)&target, (LONG)new_val.i, (LONG)old_val.i) != old_val.i);
+	return old_val.f;
+}
+
+double win_atomic_add(double &target, double source) {
+	union { int64_t i; double f; } old_val;
+	union { int64_t i; double f; } new_val;
+	do {
+		old_val.f = target;
+		new_val.f = old_val.f + (double)source;
+	} while (InterlockedCompareExchange64((LONG64*)&target, (LONG64)new_val.i, (LONG64)old_val.i) != old_val.i);
+	return old_val.f;
+}
+
+#endif
\ No newline at end of file
diff --git a/atomic.h b/atomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..c721722df23f17097c67b79b05b57eecd12c5912
--- /dev/null
+++ b/atomic.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include "diffvg.h"
+#include "vector.h"
+#include "matrix.h"
+
+// https://stackoverflow.com/questions/39274472/error-function-atomicadddouble-double-has-already-been-defined
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+static inline DEVICE double atomicAdd(double *address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    if (val == 0.0)
+        return __longlong_as_double(old);
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
+#ifndef WIN32
+    template <typename T0, typename T1>
+    DEVICE
+    inline T0 atomic_add_(T0 &target, T1 source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, (T0)source);
+    #else
+        T0 old_val;
+        T0 new_val;
+        do {
+            old_val = target;
+            new_val = old_val + source;
+        } while (!__atomic_compare_exchange(&target, &old_val, &new_val, true,
+            std::memory_order::memory_order_seq_cst,
+            std::memory_order::memory_order_seq_cst));
+        return old_val;
+    #endif
+    }
+
+    DEVICE
+    inline
+    float atomic_add(float &target, float source) {
+        return atomic_add_(target, source);
+    }
+    DEVICE
+    inline
+    double atomic_add(double &target, double source) {
+        return atomic_add_(target, source);
+    }
+#else
+	float win_atomic_add(float &target, float source);
+	double win_atomic_add(double &target, double source);
+    DEVICE
+    static float atomic_add(float &target, float source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, source);
+    #else
+		return win_atomic_add(target, source);
+    #endif
+    }
+    DEVICE
+    static double atomic_add(double &target, double source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, (double)source);
+    #else
+		return win_atomic_add(target, source);
+    #endif
+    }
+#endif
+
+template <typename T0, typename T1>
+DEVICE
+inline T0 atomic_add(T0 *target, T1 source) {
+    return atomic_add(*target, (T0)source);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline TVector2<T0> atomic_add(TVector2<T0> &target, const TVector2<T1> &source) {
+    atomic_add(target[0], source[0]);
+    atomic_add(target[1], source[1]);
+    return target;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector2<T1> &source) {
+    atomic_add(target[0], (T0)source[0]);
+    atomic_add(target[1], (T0)source[1]);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline TVector3<T0> atomic_add(TVector3<T0> &target, const TVector3<T1> &source) {
+    atomic_add(target[0], source[0]);
+    atomic_add(target[1], source[1]);
+    atomic_add(target[2], source[2]);
+    return target;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector3<T1> &source) {
+    atomic_add(target[0], (T0)source[0]);
+    atomic_add(target[1], (T0)source[1]);
+    atomic_add(target[2], (T0)source[2]);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline TVector4<T0> atomic_add(TVector4<T0> &target, const TVector4<T1> &source) {
+    atomic_add(target[0], source[0]);
+    atomic_add(target[1], source[1]);
+    atomic_add(target[2], source[2]);
+    atomic_add(target[3], source[3]);
+    return target;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector4<T1> &source) {
+    atomic_add(target[0], (T0)source[0]);
+    atomic_add(target[1], (T0)source[1]);
+    atomic_add(target[2], (T0)source[2]);
+    atomic_add(target[3], (T0)source[3]);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TMatrix3x3<T1> &source) {
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            atomic_add(target[3 * i + j], (T0)source(i, j));
+        }
+    }
+}
+
diff --git a/cdf.h b/cdf.h
new file mode 100644
index 0000000000000000000000000000000000000000..48a64f897f2c230e3e0b5595de401dd644b8b777
--- /dev/null
+++ b/cdf.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "diffvg.h"
+
+DEVICE int sample(const float *cdf, int num_entries, float u, float *updated_u = nullptr) {
+    // Binary search the cdf
+    auto lb = 0;
+    auto len = num_entries - 1 - lb;
+    while (len > 0) {
+        auto half_len = len / 2;
+        auto mid = lb + half_len;
+        assert(mid >= 0 && mid < num_entries);
+        if (u < cdf[mid]) {
+            len = half_len;
+        } else {
+            lb = mid + 1;
+            len = len - half_len - 1;
+        }
+    }
+    lb = clamp(lb, 0, num_entries - 1);
+    if (updated_u != nullptr) {
+    	if (lb > 0) {
+    		*updated_u = (u - cdf[lb - 1]) / (cdf[lb] - cdf[lb - 1]);
+    	} else {
+    		*updated_u = u / cdf[lb];
+    	}
+    }
+    return lb;
+}
diff --git a/cls_name/cls_name.csv b/cls_name/cls_name.csv
new file mode 100644
index 0000000000000000000000000000000000000000..612e83beac9dfb3045b412a503a0efa8524c46bd
--- /dev/null
+++ b/cls_name/cls_name.csv
@@ -0,0 +1,80 @@
+人
+自行车
+汽车
+摩托车
+飞机
+公交车
+火车
+卡车
+船
+红绿灯
+消防栓
+停止标志
+停车收费表
+长凳
+鸟
+猫
+狗
+马
+羊
+牛
+象
+熊
+斑马
+长颈鹿
+背包
+雨伞
+手提包
+领带
+手提箱
+飞盘
+滑雪板
+单板滑雪
+运动球
+风筝
+棒球棒
+棒球手套
+滑板
+冲浪板
+网球拍
+瓶子
+红酒杯
+杯子
+叉子
+刀
+勺
+碗
+香蕉
+苹果
+三明治
+橙子
+西兰花
+胡萝卜
+热狗
+比萨
+甜甜圈
+蛋糕
+椅子
+长椅
+盆栽
+床
+餐桌
+马桶
+电视
+笔记本电脑
+鼠标
+遥控器
+键盘
+手机
+微波炉
+烤箱
+烤面包机
+洗碗槽
+冰箱
+书
+时钟
+花瓶
+剪刀
+泰迪熊
+吹风机
+牙刷
\ No newline at end of file
diff --git a/cls_name/cls_name.yaml b/cls_name/cls_name.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e03abc7379e74582534b4ad085939bdc70d43057
--- /dev/null
+++ b/cls_name/cls_name.yaml
@@ -0,0 +1,7 @@
+model_cls_name: ['人', '自行车', '汽车', '摩托车', '飞机', '公交车', '火车', '卡车', '船', '红绿灯', '消防栓', '停止标志',
+                '停车收费表', '长凳', '鸟', '猫', '狗', '马', '羊', '牛', '象', '熊', '斑马', '长颈鹿', '背包', '雨伞', '手提包', '领带',
+                '手提箱', '飞盘', '滑雪板', '单板滑雪', '运动球', '风筝', '棒球棒', '棒球手套', '滑板', '冲浪板', '网球拍', '瓶子', '红酒杯',
+                '杯子', '叉子', '刀', '勺', '碗', '香蕉', '苹果', '三明治', '橙子', '西兰花', '胡萝卜', '热狗', '比萨', '甜甜圈', '蛋糕',
+                '椅子', '长椅', '盆栽', '床', '餐桌', '马桶', '电视', '笔记本电脑', '鼠标', '遥控器', '键盘', '手机', '微波炉', '烤箱',
+                '烤面包机', '洗碗槽', '冰箱', '书', '时钟', '花瓶', '剪刀', '泰迪熊', '吹风机', '牙刷'
+            ]
\ No newline at end of file
diff --git a/cmake/FindTensorFlow.cmake b/cmake/FindTensorFlow.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..b251b10538f69f3dce42370e840f167ea24fc4fc
--- /dev/null
+++ b/cmake/FindTensorFlow.cmake
@@ -0,0 +1,34 @@
+# https://github.com/PatWie/tensorflow-cmake/blob/master/cmake/modules/FindTensorFlow.cmake
+
+execute_process(
+    COMMAND python -c "exec(\"try:\\n  import tensorflow as tf; print(tf.__version__); print(tf.__cxx11_abi_flag__);print(tf.sysconfig.get_include()); print(tf.sysconfig.get_lib())\\nexcept ImportError:\\n  exit(1)\")"
+    OUTPUT_VARIABLE TF_INFORMATION_STRING
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE retcode)
+
+if("${retcode}" STREQUAL "0")
+    string(REPLACE "\n" ";" TF_INFORMATION_LIST ${TF_INFORMATION_STRING})
+    list(GET TF_INFORMATION_LIST 0 TF_DETECTED_VERSION)
+    list(GET TF_INFORMATION_LIST 1 TF_DETECTED_ABI)
+    list(GET TF_INFORMATION_LIST 2 TF_DETECTED_INCLUDE_DIR)
+    list(GET TF_INFORMATION_LIST 3 TF_DETECTED_LIBRARY_DIR)
+    if(WIN32)
+        find_library(TF_DETECTED_LIBRARY NAMES _pywrap_tensorflow_internal PATHS 
+            ${TF_DETECTED_LIBRARY_DIR}/python)        
+    else()
+        # For some reason my tensorflow doesn't have a .so file
+        list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES .so.1)
+        list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES .so.2)
+        find_library(TF_DETECTED_LIBRARY NAMES tensorflow_framework PATHS 
+            ${TF_DETECTED_LIBRARY_DIR})
+    endif()
+    set(TensorFlow_VERSION ${TF_DETECTED_VERSION})
+    set(TensorFlow_ABI ${TF_DETECTED_ABI})
+    set(TensorFlow_INCLUDE_DIR ${TF_DETECTED_INCLUDE_DIR})
+    set(TensorFlow_LIBRARY ${TF_DETECTED_LIBRARY})
+    if(TensorFlow_LIBRARY AND TensorFlow_INCLUDE_DIR)
+        set(TensorFlow_FOUND TRUE)
+    else()
+        set(TensorFlow_FOUND FALSE)
+    endif()
+endif()
diff --git a/cmake/FindThrust.cmake b/cmake/FindThrust.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..61eef297b996496f4222d6afb570fb5aa960781d
--- /dev/null
+++ b/cmake/FindThrust.cmake
@@ -0,0 +1,40 @@
+##=============================================================================
+##
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2012 Sandia Corporation.
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##
+##=============================================================================
+
+#
+# FindThrust
+#
+# This module finds the Thrust header files and extrats their version.  It
+# sets the following variables.
+#
+# THRUST_INCLUDE_DIR -  Include directory for thrust header files.  (All header
+#                       files will actually be in the thrust subdirectory.)
+# THRUST_VERSION -      Version of thrust in the form "major.minor.patch".
+#
+
+find_path(THRUST_INCLUDE_DIR
+	HINTS /usr/include/cuda
+	      /usr/local/include
+	      /usr/local/cuda/include
+	      ${CUDA_INCLUDE_DIRS}
+	      ./thrust
+	      ../thrust
+	NAMES thrust/version.h
+)
+
+if (THRUST_INCLUDE_DIR)
+  set(THRUST_FOUND TRUE)
+endif ()
\ No newline at end of file
diff --git a/color.cpp b/color.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a2e8abcee1dacefeaeb0268359737aec178bace
--- /dev/null
+++ b/color.cpp
@@ -0,0 +1,25 @@
+#include "color.h"
+
+void LinearGradient::copy_to(ptr<float> stop_offsets,
+                             ptr<float> stop_colors) const {
+    float *o = stop_offsets.get();
+    float *c = stop_colors.get();
+    for (int i = 0; i < num_stops; i++) {
+        o[i] = this->stop_offsets[i];
+    }
+    for (int i = 0; i < 4 * num_stops; i++) {
+        c[i] = this->stop_colors[i];
+    }
+}
+
+void RadialGradient::copy_to(ptr<float> stop_offsets,
+                             ptr<float> stop_colors) const {
+    float *o = stop_offsets.get();
+    float *c = stop_colors.get();
+    for (int i = 0; i < num_stops; i++) {
+        o[i] = this->stop_offsets[i];
+    }
+    for (int i = 0; i < 4 * num_stops; i++) {
+        c[i] = this->stop_colors[i];
+    }
+}
diff --git a/color.h b/color.h
new file mode 100644
index 0000000000000000000000000000000000000000..c787105636d42b4706110500982d0ce576eda47e
--- /dev/null
+++ b/color.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "diffvg.h"
+#include "vector.h"
+#include "ptr.h"
+
+enum class ColorType {
+    Constant,
+    LinearGradient,
+    RadialGradient
+};
+
+struct Constant {
+    Vector4f color;
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+
+struct LinearGradient {
+    LinearGradient(const Vector2f &begin,
+                   const Vector2f &end,
+                   int num_stops,
+                   ptr<float> stop_offsets,
+                   ptr<float> stop_colors)
+        : begin(begin), end(end), num_stops(num_stops),
+          stop_offsets(stop_offsets.get()), stop_colors(stop_colors.get()) {}
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+
+    void copy_to(ptr<float> stop_offset,
+                 ptr<float> stop_colors) const;
+
+    Vector2f begin, end;
+    int num_stops;
+    float *stop_offsets;
+    float *stop_colors; // rgba
+};
+
+struct RadialGradient {
+    RadialGradient(const Vector2f &center,
+                   const Vector2f &radius,
+                   int num_stops,
+                   ptr<float> stop_offsets,
+                   ptr<float> stop_colors)
+        : center(center), radius(radius), num_stops(num_stops),
+          stop_offsets(stop_offsets.get()), stop_colors(stop_colors.get()) {}
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+
+    void copy_to(ptr<float> stop_offset,
+                 ptr<float> stop_colors) const;
+
+    Vector2f center, radius;
+    int num_stops;
+    float *stop_offsets;
+    float *stop_colors; // rgba
+};
diff --git a/compute_distance.h b/compute_distance.h
new file mode 100644
index 0000000000000000000000000000000000000000..c125641a9d720bd16be1428e205bd6c07c726bc5
--- /dev/null
+++ b/compute_distance.h
@@ -0,0 +1,949 @@
+#pragma once
+
+#include "diffvg.h"
+#include "edge_query.h"
+#include "scene.h"
+#include "shape.h"
+#include "solve.h"
+#include "vector.h"
+
+#include <cassert>
+
+struct ClosestPointPathInfo {
+    int base_point_id;
+    int point_id;
+    float t_root;
+};
+
+DEVICE
+inline
+bool closest_point(const Circle &circle, const Vector2f &pt,
+                   Vector2f *result) {
+    *result = circle.center + circle.radius * normalize(pt - circle.center);
+    return false;
+}
+
+DEVICE
+inline
+bool closest_point(const Path &path, const BVHNode *bvh_nodes, const Vector2f &pt, float max_radius,
+                   ClosestPointPathInfo *path_info,
+                   Vector2f *result) {
+    auto min_dist = max_radius;
+    auto ret_pt = Vector2f{0, 0};
+    auto found = false;
+    auto num_segments = path.num_base_points;
+    constexpr auto max_bvh_size = 128;
+    int bvh_stack[max_bvh_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * num_segments - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto base_point_id = node.child0;
+            auto point_id = - node.child1 - 1;
+            assert(base_point_id < num_segments);
+            assert(point_id < path.num_points);
+            auto dist = 0.f;
+            auto closest_pt = Vector2f{0, 0};
+            auto t_root = 0.f;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                // project pt to line
+                auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+                if (t < 0) {
+                    dist = distance(p0, pt);
+                    closest_pt = p0;
+                    t_root = 0;
+                } else if (t > 1) {
+                    dist = distance(p1, pt);
+                    closest_pt = p1;
+                    t_root = 1;
+                } else {
+                    dist = distance(p0 + t * (p1 - p0), pt);
+                    closest_pt = p0 + t * (p1 - p0);
+                    t_root = t;
+                }
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                if (path.use_distance_approx) {
+                    closest_pt = quadratic_closest_pt_approx(p0, p1, p2, pt, &t_root);
+                    dist = distance(closest_pt, pt);
+                } else {
+                    auto eval = [&](float t) -> Vector2f {
+                        auto tt = 1 - t;
+                        return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+                    };
+                    auto pt0 = eval(0);
+                    auto pt1 = eval(1);
+                    auto dist0 = distance(pt0, pt);
+                    auto dist1 = distance(pt1, pt);
+                    {
+                        dist = dist0;
+                        closest_pt = pt0;
+                        t_root = 0;
+                    }
+                    if (dist1 < dist) {
+                        dist = dist1;
+                        closest_pt = pt1;
+                        t_root = 1;
+                    }
+                    // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+                    // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0 = q
+                    // Want to solve (q - pt) dot q' = 0
+                    // q' = (p0-2p1+p2)t + (-p0+p1)
+                    // Expanding (p0-2p1+p2)^2 t^3 +
+                    //           3(p0-2p1+p2)(-p0+p1) t^2 +
+                    //           (2(-p0+p1)^2+(p0-2p1+p2)(p0-pt))t +
+                    //           (-p0+p1)(p0-pt) = 0
+                    auto A = sum((p0-2*p1+p2)*(p0-2*p1+p2));
+                    auto B = sum(3*(p0-2*p1+p2)*(-p0+p1));
+                    auto C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt));
+                    auto D = sum((-p0+p1)*(p0-pt));
+                    float t[3];
+                    int num_sol = solve_cubic(A, B, C, D, t);
+                    for (int j = 0; j < num_sol; j++) {
+                        if (t[j] >= 0 && t[j] <= 1) {
+                            auto p = eval(t[j]);
+                            auto distp = distance(p, pt);
+                            if (distp < dist) {
+                                dist = distp;
+                                closest_pt = p;
+                                t_root = t[j];
+                            }
+                        }
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+                auto eval = [&](float t) -> Vector2f {
+                    auto tt = 1 - t;
+                    return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+                };
+                auto pt0 = eval(0);
+                auto pt1 = eval(1);
+                auto dist0 = distance(pt0, pt);
+                auto dist1 = distance(pt1, pt);
+                {
+                    dist = dist0;
+                    closest_pt = pt0;
+                    t_root = 0;
+                }
+                if (dist1 < dist) {
+                    dist = dist1;
+                    closest_pt = pt1;
+                    t_root = 1;
+                }
+                // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+                // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+                // Want to solve (q - pt) dot q' = 0
+                // q' = 3*(-p0+3p1-3p2+p3)t^2 + 2*(3p0-6p1+3p2)t + (-3p0+3p1)
+                // Expanding 
+                // 3*(-p0+3p1-3p2+p3)^2 t^5
+                // 5*(-p0+3p1-3p2+p3)(3p0-6p1+3p2) t^4
+                // 4*(-p0+3p1-3p2+p3)(-3p0+3p1) + 2*(3p0-6p1+3p2)^2 t^3
+                // 3*(3p0-6p1+3p2)(-3p0+3p1) + 3*(-p0+3p1-3p2+p3)(p0-pt) t^2
+                // (-3p0+3p1)^2+2(p0-pt)(3p0-6p1+3p2) t
+                // (p0-pt)(-3p0+3p1)
+                double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3));
+                double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+                double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+                double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+                double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+                double F = sum((p0-pt)*(-3*p0+3*p1));
+                // normalize the polynomial
+                B /= A;
+                C /= A;
+                D /= A;
+                E /= A;
+                F /= A;
+                // Isolator Polynomials:
+                // https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.133.2233&rep=rep1&type=pdf
+                //                                       x/5 + B/25
+                //                                    /-----------------------------------------------------
+                // 5x^4 + 4B x^3 + 3C x^2 + 2D x + E /   x^5 +    B x^4 +       C x^3 +      D x^2 +      E x + F
+                //                                       x^5 + 4B/5 x^4 +    3C/5 x^3 +   2D/5 x^2 +    E/5 x
+                //                                      ----------------------------------------------------
+                //                                              B/5 x^4 +    2C/5 x^3 +   3D/5 x^2 +   4E/5 x + F
+                //                                              B/5 x^4 + 4B^2/25 x^3 + 3BC/25 x^2 + 2BD/25 x + BE/25
+                //                                      ----------------------------------------------------
+                //                                     (2C/5 - 4B^2/25)x^3 + (3D/5-3BC/25)x^2 + (4E/5-2BD/25) + (F-BE/25)
+                auto p1A = ((2 / 5.f) * C - (4 / 25.f) * B * B);
+                auto p1B = ((3 / 5.f) * D - (3 / 25.f) * B * C);
+                auto p1C = ((4 / 5.f) * E - (2 / 25.f) * B * D);
+                auto p1D = F - B * E / 25.f;
+                // auto q1A = 1 / 5.f;
+                // auto q1B = B / 25.f;
+                // x/5 + B/25 = 0
+                // x = -B/5
+                auto q_root = -B/5.f;
+                double p_roots[3];
+                int num_sol = solve_cubic(p1A, p1B, p1C, p1D, p_roots);
+                float intervals[4];
+                if (q_root >= 0 && q_root <= 1) {
+                    intervals[0] = q_root;
+                }
+                for (int j = 0; j < num_sol; j++) {
+                    intervals[j + 1] = p_roots[j];
+                }
+                auto num_intervals = 1 + num_sol;
+                // sort intervals
+                for (int j = 1; j < num_intervals; j++) {
+                    for (int k = j; k > 0 && intervals[k - 1] > intervals[k]; k--) {
+                        auto tmp = intervals[k];
+                        intervals[k] = intervals[k - 1];
+                        intervals[k - 1] = tmp;
+                    }
+                }
+                auto eval_polynomial = [&] (double t) {
+                    return t*t*t*t*t+
+                           B*t*t*t*t+
+                           C*t*t*t+
+                           D*t*t+
+                           E*t+
+                           F;
+                };
+                auto eval_polynomial_deriv = [&] (double t) {
+                    return 5*t*t*t*t+
+                           4*B*t*t*t+
+                           3*C*t*t+
+                           2*D*t+
+                           E;
+                };
+                auto lower_bound = 0.f;
+                for (int j = 0; j < num_intervals + 1; j++) {
+                    if (j < num_intervals && intervals[j] < 0.f) {
+                        continue;
+                    }
+                    auto upper_bound = j < num_intervals ?
+                        min(intervals[j], 1.f) : 1.f;
+                    auto lb = lower_bound;
+                    auto ub = upper_bound;
+                    auto lb_eval = eval_polynomial(lb);
+                    auto ub_eval = eval_polynomial(ub);
+                    if (lb_eval * ub_eval > 0) {
+                        // Doesn't have root
+                        continue;
+                    }
+                    if (lb_eval > ub_eval) {
+                        swap_(lb, ub);
+                    }
+                    auto t = 0.5f * (lb + ub);
+                    auto num_iter = 20;
+                    for (int it = 0; it < num_iter; it++) {
+                        if (!(t >= lb && t <= ub)) {
+                            t = 0.5f * (lb + ub);
+                        }
+                        auto value = eval_polynomial(t);
+                        if (fabs(value) < 1e-5f || it == num_iter - 1) {
+                            break;
+                        }
+                        // The derivative may not be entirely accurate,
+                        // but the bisection is going to handle this
+                        if (value > 0.f) {
+                            ub = t;
+                        } else {
+                            lb = t;
+                        }
+                        auto derivative = eval_polynomial_deriv(t);
+                        t -= value / derivative;
+                    }
+                    auto p = eval(t);
+                    auto distp = distance(p, pt);
+                    if (distp < dist) {
+                        dist = distp;
+                        closest_pt = p;
+                        t_root = t;
+                    }
+                    if (upper_bound >= 1.f) {
+                        break;
+                    }
+                    lower_bound = upper_bound;
+                }
+            } else {
+                assert(false);
+            }
+            if (dist < min_dist) {
+                min_dist = dist;
+                ret_pt = closest_pt;
+                path_info->base_point_id = base_point_id;
+                path_info->point_id = point_id;
+                path_info->t_root = t_root;
+                found = true;
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (within_distance(b0, pt, min_dist)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (within_distance(b1, pt, min_dist)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_size);
+        }
+    }
+    if (found) {
+        assert(path_info->base_point_id < num_segments);
+    }
+    *result = ret_pt;
+    return found;
+}
+
+DEVICE
+inline
+bool closest_point(const Rect &rect, const Vector2f &pt,
+                   Vector2f *result) {
+    auto min_dist = 0.f;
+    auto closest_pt = Vector2f{0, 0};
+    auto update = [&](const Vector2f &p0, const Vector2f &p1, bool first) {
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        if (t < 0) {
+            auto d = distance(p0, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p0;
+            }
+        } else if (t > 1) {
+            auto d = distance(p1, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p1;
+            }
+        } else {
+            auto p = p0 + t * (p1 - p0);
+            auto d = distance(p, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p0;
+            }
+        }
+    };
+    auto left_top = rect.p_min;
+    auto right_top = Vector2f{rect.p_max.x, rect.p_min.y};
+    auto left_bottom = Vector2f{rect.p_min.x, rect.p_max.y};
+    auto right_bottom = rect.p_max;
+    update(left_top, left_bottom, true);
+    update(left_top, right_top, false);
+    update(right_top, right_bottom, false);
+    update(left_bottom, right_bottom, false);
+    *result = closest_pt;
+    return true;
+}
+
+DEVICE
+inline
+bool closest_point(const Shape &shape, const BVHNode *bvh_nodes, const Vector2f &pt, float max_radius,
+                   ClosestPointPathInfo *path_info,
+                   Vector2f *result) {
+    switch (shape.type) {
+        case ShapeType::Circle:
+            return closest_point(*(const Circle *)shape.ptr, pt, result);
+        case ShapeType::Ellipse:
+            // https://www.geometrictools.com/Documentation/DistancePointEllipseEllipsoid.pdf
+            assert(false);
+            return false;
+        case ShapeType::Path:
+            return closest_point(*(const Path *)shape.ptr, bvh_nodes, pt, max_radius, path_info, result);
+        case ShapeType::Rect:
+            return closest_point(*(const Rect *)shape.ptr, pt, result);
+    }
+    assert(false);
+    return false;
+}
+
+DEVICE
+inline
+bool compute_distance(const SceneData &scene,
+                      int shape_group_id,
+                      const Vector2f &pt,
+                      float max_radius,
+                      int *min_shape_id,
+                      Vector2f *closest_pt_,
+                      ClosestPointPathInfo *path_info,
+                      float *result) {
+    const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(shape_group.canvas_to_shape, pt);
+
+    constexpr auto max_bvh_stack_size = 64;
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * shape_group.num_shapes - 2;
+    const auto &bvh_nodes = scene.shape_groups_bvh_nodes[shape_group_id];
+
+    auto min_dist = max_radius;
+    auto found = false;
+
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto shape_id = node.child0;
+            const auto &shape = scene.shapes[shape_id];
+            ClosestPointPathInfo local_path_info{-1, -1};
+            auto local_closest_pt = Vector2f{0, 0};
+            if (closest_point(shape, scene.path_bvhs[shape_id], local_pt, max_radius, &local_path_info, &local_closest_pt)) {
+                auto closest_pt = xform_pt(shape_group.shape_to_canvas, local_closest_pt);
+                auto dist = distance(closest_pt, pt);
+                if (!found || dist < min_dist) {
+                    found = true;
+                    min_dist = dist;
+                    if (min_shape_id != nullptr) {
+                        *min_shape_id = shape_id;
+                    }
+                    if (closest_pt_ != nullptr) {
+                        *closest_pt_ = closest_pt;
+                    }
+                    if (path_info != nullptr) {
+                        *path_info = local_path_info;
+                    }
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (inside(b0, local_pt, max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (inside(b1, local_pt, max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+
+    *result = min_dist;
+    return found;
+}
+
+
+DEVICE
+inline
+void d_closest_point(const Circle &circle,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     Circle &d_circle,
+                     Vector2f &d_pt) {
+    // return circle.center + circle.radius * normalize(pt - circle.center);
+    auto d_center = d_closest_pt *
+        (1 + d_normalize(pt - circle.center, circle.radius * d_closest_pt));
+    atomic_add(&d_circle.center.x, d_center);
+    atomic_add(&d_circle.radius, dot(d_closest_pt, normalize(pt - circle.center)));
+}
+
+DEVICE
+inline
+void d_closest_point(const Path &path,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     const ClosestPointPathInfo &path_info,
+                     Path &d_path,
+                     Vector2f &d_pt) {
+    auto base_point_id = path_info.base_point_id;
+    auto point_id = path_info.point_id;
+    auto min_t_root = path_info.t_root;
+    
+    if (path.num_control_points[base_point_id] == 0) {
+        // Straight line
+        auto i0 = point_id;
+        auto i1 = (point_id + 1) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        if (t < 0) {
+            d_p0 += d_closest_pt;
+        } else if (t > 1) {
+            d_p1 += d_closest_pt;
+        } else {
+            auto d_p = d_closest_pt;
+            // p = p0 + t * (p1 - p0)
+            d_p0 += d_p * (1 - t);
+            d_p1 += d_p * t;
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+    } else if (path.num_control_points[base_point_id] == 1) {
+        // Quadratic Bezier curve
+        auto i0 = point_id;
+        auto i1 = point_id + 1;
+        auto i2 = (point_id + 2) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        // auto eval = [&](float t) -> Vector2f {
+        //     auto tt = 1 - t;
+        //     return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+        // };
+        // auto dist0 = distance(eval(0), pt);
+        // auto dist1 = distance(eval(1), pt);
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        auto d_p2 = Vector2f{0, 0};
+        auto t = min_t_root;
+        if (t == 0) {
+            d_p0 += d_closest_pt;
+        } else if (t == 1) {
+            d_p2 += d_closest_pt;
+        } else {
+            // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+            // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0 = q
+            // Want to solve (q - pt) dot q' = 0
+            // q' = (p0-2p1+p2)t + (-p0+p1)
+            // Expanding (p0-2p1+p2)^2 t^3 +
+            //           3(p0-2p1+p2)(-p0+p1) t^2 +
+            //           (2(-p0+p1)^2+(p0-2p1+p2)(p0-pt))t +
+            //           (-p0+p1)(p0-pt) = 0
+            auto A = sum((p0-2*p1+p2)*(p0-2*p1+p2));
+            auto B = sum(3*(p0-2*p1+p2)*(-p0+p1));
+            auto C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt));
+            // auto D = sum((-p0+p1)*(p0-pt));
+            auto d_p = d_closest_pt;
+            // p = eval(t)
+            auto tt = 1 - t;
+            // (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2
+            auto d_tt = 2 * tt * dot(d_p, p0) + 2 * t * dot(d_p, p1);
+            auto d_t = -d_tt + 2 * tt * dot(d_p, p1) + 2 * t * dot(d_p, p2);
+            auto d_p0 = d_p * tt * tt;
+            auto d_p1 = 2 * d_p * tt * t;
+            auto d_p2 = d_p * t * t;
+            // implicit function theorem: dt/dA = -1/(p'(t)) * dp/dA
+            auto poly_deriv_t = 3 * A * t * t + 2 * B * t + C;
+            if (fabs(poly_deriv_t) > 1e-6f) {
+                auto d_A = - (d_t / poly_deriv_t) * t * t * t;
+                auto d_B = - (d_t / poly_deriv_t) * t * t;
+                auto d_C = - (d_t / poly_deriv_t) * t;
+                auto d_D = - (d_t / poly_deriv_t);
+                // A = sum((p0-2*p1+p2)*(p0-2*p1+p2))
+                // B = sum(3*(p0-2*p1+p2)*(-p0+p1))
+                // C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt))
+                // D = sum((-p0+p1)*(p0-pt))
+                d_p0 += 2*d_A*(p0-2*p1+p2)+
+                        3*d_B*((-p0+p1)-(p0-2*p1+p2))+
+                        2*d_C*(-2*(-p0+p1))+
+                          d_C*((p0-pt)+(p0-2*p1+p2))+
+                        2*d_D*(-(p0-pt)+(-p0+p1));
+                d_p1 += (-2)*2*d_A*(p0-2*p1+p2)+
+                        3*d_B*(-2*(-p0+p1)+(p0-2*p1+p2))+
+                        2*d_C*(2*(-p0+p1))+
+                          d_C*((-2)*(p0-pt))+
+                        d_D*(p0-pt);
+                d_p2 += 2*d_A*(p0-2*p1+p2)+
+                        3*d_B*(-p0+p1)+
+                        d_C*(p0-pt);
+                d_pt += d_C*(-(p0-2*p1+p2))+
+                        d_D*(-(-p0+p1));
+            }
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+        atomic_add(d_path.points + 2 * i2, d_p2);
+    } else if (path.num_control_points[base_point_id] == 2) {
+        // Cubic Bezier curve
+        auto i0 = point_id;
+        auto i1 = point_id + 1;
+        auto i2 = point_id + 2;
+        auto i3 = (point_id + 3) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+        // auto eval = [&](float t) -> Vector2f {
+        //     auto tt = 1 - t;
+        //     return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+        // };
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        auto d_p2 = Vector2f{0, 0};
+        auto d_p3 = Vector2f{0, 0};
+        auto t = min_t_root;
+        if (t == 0) {
+            // closest_pt = p0
+            d_p0 += d_closest_pt;
+        } else if (t == 1) {
+            // closest_pt = p1
+            d_p3 += d_closest_pt;
+        } else {
+            // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+            // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+            // Want to solve (q - pt) dot q' = 0
+            // q' = 3*(-p0+3p1-3p2+p3)t^2 + 2*(3p0-6p1+3p2)t + (-3p0+3p1)
+            // Expanding 
+            // 3*(-p0+3p1-3p2+p3)^2 t^5
+            // 5*(-p0+3p1-3p2+p3)(3p0-6p1+3p2) t^4
+            // 4*(-p0+3p1-3p2+p3)(-3p0+3p1) + 2*(3p0-6p1+3p2)^2 t^3
+            // 3*(3p0-6p1+3p2)(-3p0+3p1) + 3*(-p0+3p1-3p2+p3)(p0-pt) t^2
+            // (-3p0+3p1)^2+2(p0-pt)(3p0-6p1+3p2) t
+            // (p0-pt)(-3p0+3p1)
+            double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3));
+            double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+            double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+            double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+            double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+            double F = sum((p0-pt)*(-3*p0+3*p1));
+            B /= A;
+            C /= A;
+            D /= A;
+            E /= A;
+            F /= A;
+            // auto eval_polynomial = [&] (double t) {
+            //     return t*t*t*t*t+
+            //            B*t*t*t*t+
+            //            C*t*t*t+
+            //            D*t*t+
+            //            E*t+
+            //            F;
+            // };
+            auto eval_polynomial_deriv = [&] (double t) {
+                return 5*t*t*t*t+
+                       4*B*t*t*t+
+                       3*C*t*t+
+                       2*D*t+
+                       E;
+            };
+
+            // auto p = eval(t);
+            auto d_p = d_closest_pt;
+            // (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3
+            auto tt = 1 - t;
+            auto d_tt = 3 * tt * tt * dot(d_p, p0) +
+                        6 * tt * t * dot(d_p, p1) +
+                        3 * t * t * dot(d_p, p2);
+            auto d_t = -d_tt +
+                       3 * tt * tt * dot(d_p, p1) +
+                       6 * tt * t * dot(d_p, p2) +
+                       3 * t * t * dot(d_p, p3);
+            d_p0 += d_p * (tt * tt * tt);
+            d_p1 += d_p * (3 * tt * tt * t);
+            d_p2 += d_p * (3 * tt * t * t);
+            d_p3 += d_p * (t * t * t);
+            // implicit function theorem: dt/dA = -1/(p'(t)) * dp/dA
+            auto poly_deriv_t = eval_polynomial_deriv(t);
+            if (fabs(poly_deriv_t) > 1e-10f) {
+                auto d_B = -(d_t / poly_deriv_t) * t * t * t * t;
+                auto d_C = -(d_t / poly_deriv_t) * t * t * t;
+                auto d_D = -(d_t / poly_deriv_t) * t * t;
+                auto d_E = -(d_t / poly_deriv_t) * t;
+                auto d_F = -(d_t / poly_deriv_t);
+                // B = B' / A
+                // C = C' / A
+                // D = D' / A
+                // E = E' / A
+                // F = F' / A
+                auto d_A = -d_B * B / A
+                           -d_C * C / A
+                           -d_D * D / A
+                           -d_E * E / A
+                           -d_F * F / A;
+                d_B /= A;
+                d_C /= A;
+                d_D /= A;
+                d_E /= A;
+                d_F /= A;
+                {
+                    double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3)) + 1e-3;
+                    double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+                    double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+                    double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+                    double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+                    double F = sum((p0-pt)*(-3*p0+3*p1));
+                    B /= A;
+                    C /= A;
+                    D /= A;
+                    E /= A;
+                    F /= A;
+                    auto eval_polynomial = [&] (double t) {
+                        return t*t*t*t*t+
+                               B*t*t*t*t+
+                               C*t*t*t+
+                               D*t*t+
+                               E*t+
+                               F;
+                    };
+                    auto eval_polynomial_deriv = [&] (double t) {
+                        return 5*t*t*t*t+
+                               4*B*t*t*t+
+                               3*C*t*t+
+                               2*D*t+
+                               E;
+                    };
+                    auto lb = t - 1e-2f;
+                    auto ub = t + 1e-2f;
+                    auto lb_eval = eval_polynomial(lb);
+                    auto ub_eval = eval_polynomial(ub);
+                    if (lb_eval > ub_eval) {
+                        swap_(lb, ub);
+                    }
+                    auto t_ = 0.5f * (lb + ub);
+                    auto num_iter = 20;
+                    for (int it = 0; it < num_iter; it++) {
+                        if (!(t_ >= lb && t_ <= ub)) {
+                            t_ = 0.5f * (lb + ub);
+                        }
+                        auto value = eval_polynomial(t_);
+                        if (fabs(value) < 1e-5f || it == num_iter - 1) {
+                            break;
+                        }
+                        // The derivative may not be entirely accurate,
+                        // but the bisection is going to handle this
+                        if (value > 0.f) {
+                            ub = t_;
+                        } else {
+                            lb = t_;
+                        }
+                        auto derivative = eval_polynomial_deriv(t);
+                        t_ -= value / derivative;
+                    }
+                }
+                // A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3))
+                d_p0 += d_A * 3 * (-1) * 2 * (-p0+3*p1-3*p2+p3);
+                d_p1 += d_A * 3 *   3  * 2 * (-p0+3*p1-3*p2+p3);
+                d_p2 += d_A * 3 * (-3) * 2 * (-p0+3*p1-3*p2+p3);
+                d_p3 += d_A * 3 *   1  * 2 * (-p0+3*p1-3*p2+p3);
+                // B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2))
+                d_p0 += d_B * 5 * ((-1) * (3*p0-6*p1+3*p2) + 3 * (-p0+3*p1-3*p2+p3));
+                d_p1 += d_B * 5 * (3 * (3*p0-6*p1+3*p2) + (-6) * (-p0+3*p1-3*p2+p3));
+                d_p2 += d_B * 5 * ((-3) * (3*p0-6*p1+3*p2) + 3 * (-p0+3*p1-3*p2+p3));
+                d_p3 += d_B * 5 * (3*p0-6*p1+3*p2);
+                // C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2))
+                d_p0 += d_C * 4 * ((-1) * (-3*p0+3*p1) + (-3) * (-p0+3*p1-3*p2+p3)) +
+                        d_C * 2 * (3 * 2 * (3*p0-6*p1+3*p2));
+                d_p1 += d_C * 4 * (3 * (-3*p0+3*p1) + 3 * (-p0+3*p1-3*p2+p3)) +
+                        d_C * 2 * ((-6) * 2 * (3*p0-6*p1+3*p2));
+                d_p2 += d_C * 4 * ((-3) * (-3*p0+3*p1)) +
+                        d_C * 2 * (3 * 2 * (3*p0-6*p1+3*p2));
+                d_p3 += d_C * 4 * (-3*p0+3*p1);
+                // D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)))
+                d_p0 += d_D * 3 * (3 * (-3*p0+3*p1) + (-3) * (3*p0-6*p1+3*p2)) +
+                        d_D * 3 * ((-1) * (p0-pt) + 1 * (-p0+3*p1-3*p2+p3));
+                d_p1 += d_D * 3 * ((-6) * (-3*p0+3*p1) + (3) * (3*p0-6*p1+3*p2)) +
+                        d_D * 3 * (3 * (p0-pt));
+                d_p2 += d_D * 3 * (3 * (-3*p0+3*p1)) +
+                        d_D * 3 * ((-3) * (p0-pt));
+                d_pt += d_D * 3 * ((-1) * (-p0+3*p1-3*p2+p3));
+                // E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2))
+                d_p0 += d_E * ((-3) * 2 * (-3*p0+3*p1)) +
+                        d_E * 2 * (1 * (3*p0-6*p1+3*p2) + 3 * (p0-pt));
+                d_p1 += d_E * (  3  * 2 * (-3*p0+3*p1)) +
+                        d_E * 2 * ((-6) * (p0-pt));
+                d_p2 += d_E * 2 * (  3  * (p0-pt));
+                d_pt += d_E * 2 * ((-1) * (3*p0-6*p1+3*p2));
+                // F = sum((p0-pt)*(-3*p0+3*p1))
+                d_p0 += d_F * (1 * (-3*p0+3*p1)) +
+                        d_F * ((-3) * (p0-pt));
+                d_p1 += d_F * (3 * (p0-pt));
+                d_pt += d_F * ((-1) * (-3*p0+3*p1));
+            }
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+        atomic_add(d_path.points + 2 * i2, d_p2);
+        atomic_add(d_path.points + 2 * i3, d_p3);
+    } else {
+        assert(false);
+    }
+}
+
+DEVICE
+inline
+void d_closest_point(const Rect &rect,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     Rect &d_rect,
+                     Vector2f &d_pt) {
+    auto dist = [&](const Vector2f &p0, const Vector2f &p1) -> float {
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        if (t < 0) {
+            return distance(p0, pt);
+        } else if (t > 1) {
+            return distance(p1, pt);
+        } else {
+            return distance(p0 + t * (p1 - p0), pt);
+        }
+        // return 0;
+    };
+    auto left_top = rect.p_min;
+    auto right_top = Vector2f{rect.p_max.x, rect.p_min.y};
+    auto left_bottom = Vector2f{rect.p_min.x, rect.p_max.y};
+    auto right_bottom = rect.p_max;
+    auto left_dist = dist(left_top, left_bottom);
+    auto top_dist = dist(left_top, right_top);
+    auto right_dist = dist(right_top, right_bottom);
+    auto bottom_dist = dist(left_bottom, right_bottom);
+    int min_id = 0;
+    auto min_dist = left_dist;
+    if (top_dist < min_dist) { min_dist = top_dist; min_id = 1; }
+    if (right_dist < min_dist) { min_dist = right_dist; min_id = 2; }
+    if (bottom_dist < min_dist) { min_dist = bottom_dist; min_id = 3; }
+
+    auto d_update = [&](const Vector2f &p0, const Vector2f &p1,
+                        const Vector2f &d_closest_pt,
+                        Vector2f &d_p0, Vector2f &d_p1) {
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        if (t < 0) {
+            d_p0 += d_closest_pt;
+        } else if (t > 1) {
+            d_p1 += d_closest_pt;
+        } else {
+            // p = p0 + t * (p1 - p0)
+            auto d_p = d_closest_pt;
+            d_p0 += d_p * (1 - t);
+            d_p1 += d_p * t;
+            auto d_t = sum(d_p * (p1 - p0));
+            // t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0)
+            auto d_numerator = d_t / dot(p1 - p0, p1 - p0);
+            auto d_denominator = d_t * (-t) / dot(p1 - p0, p1 - p0);
+            // numerator = dot(pt - p0, p1 - p0)
+            d_pt += (p1 - p0) * d_numerator;
+            d_p1 += (pt - p0) * d_numerator;
+            d_p0 += ((p0 - p1) + (p0 - pt)) * d_numerator;
+            // denominator = dot(p1 - p0, p1 - p0)
+            d_p1 += 2 * (p1 - p0) * d_denominator;
+            d_p0 += 2 * (p0 - p1) * d_denominator;
+        }
+    };
+    auto d_left_top = Vector2f{0, 0};
+    auto d_right_top = Vector2f{0, 0};
+    auto d_left_bottom = Vector2f{0, 0};
+    auto d_right_bottom = Vector2f{0, 0};
+    if (min_id == 0) {
+        d_update(left_top, left_bottom, d_closest_pt, d_left_top, d_left_bottom);
+    } else if (min_id == 1) {
+        d_update(left_top, right_top, d_closest_pt, d_left_top, d_right_top);
+    } else if (min_id == 2) {
+        d_update(right_top, right_bottom, d_closest_pt, d_right_top, d_right_bottom);
+    } else {
+        assert(min_id == 3);
+        d_update(left_bottom, right_bottom, d_closest_pt, d_left_bottom, d_right_bottom);
+    }
+    auto d_p_min = Vector2f{0, 0};
+    auto d_p_max = Vector2f{0, 0};
+    // left_top = rect.p_min
+    // right_top = Vector2f{rect.p_max.x, rect.p_min.y}
+    // left_bottom = Vector2f{rect.p_min.x, rect.p_max.y}
+    // right_bottom = rect.p_max
+    d_p_min += d_left_top;
+    d_p_max.x += d_right_top.x;
+    d_p_min.y += d_right_top.y;
+    d_p_min.x += d_left_bottom.x;
+    d_p_max.y += d_left_bottom.y;
+    d_p_max += d_right_bottom;
+    atomic_add(d_rect.p_min, d_p_min);
+    atomic_add(d_rect.p_max, d_p_max);
+}
+
+DEVICE
+inline
+void d_closest_point(const Shape &shape,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     const ClosestPointPathInfo &path_info,
+                     Shape &d_shape,
+                     Vector2f &d_pt) {
+    switch (shape.type) {
+        case ShapeType::Circle:
+            d_closest_point(*(const Circle *)shape.ptr,
+                            pt,
+                            d_closest_pt,
+                            *(Circle *)d_shape.ptr,
+                            d_pt);
+            break;
+        case ShapeType::Ellipse:
+            // https://www.geometrictools.com/Documentation/DistancePointEllipseEllipsoid.pdf
+            assert(false);
+            break;
+        case ShapeType::Path:
+            d_closest_point(*(const Path *)shape.ptr,
+                            pt,
+                            d_closest_pt,
+                            path_info,
+                            *(Path *)d_shape.ptr,
+                            d_pt);
+            break;
+        case ShapeType::Rect:
+            d_closest_point(*(const Rect *)shape.ptr,
+                            pt,
+                            d_closest_pt,
+                            *(Rect *)d_shape.ptr,
+                            d_pt);
+            break;
+    }
+}
+
+DEVICE
+inline
+void d_compute_distance(const Matrix3x3f &canvas_to_shape,
+                        const Matrix3x3f &shape_to_canvas,
+                        const Shape &shape,
+                        const Vector2f &pt,
+                        const Vector2f &closest_pt,
+                        const ClosestPointPathInfo &path_info,
+                        float d_dist,
+                        Matrix3x3f &d_shape_to_canvas,
+                        Shape &d_shape,
+                        float *d_translation) {
+    if (distance_squared(pt, closest_pt) < 1e-10f) {
+        // The derivative at distance=0 is undefined
+        return;
+    }
+    assert(isfinite(d_dist));
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(canvas_to_shape, pt);
+    auto local_closest_pt = xform_pt(canvas_to_shape, closest_pt);
+    // auto local_closest_pt = closest_point(shape, local_pt);
+    // auto closest_pt = xform_pt(shape_group.shape_to_canvas, local_closest_pt);
+    // auto dist = distance(closest_pt, pt);
+    auto d_pt = Vector2f{0, 0};
+    auto d_closest_pt = Vector2f{0, 0};
+    d_distance(closest_pt, pt, d_dist, d_closest_pt, d_pt);
+    assert(isfinite(d_pt));
+    assert(isfinite(d_closest_pt));
+    // auto closest_pt = xform_pt(shape_group.shape_to_canvas, local_closest_pt);
+    auto d_local_closest_pt = Vector2f{0, 0};
+    auto d_shape_to_canvas_ = Matrix3x3f();
+    d_xform_pt(shape_to_canvas, local_closest_pt, d_closest_pt,
+               d_shape_to_canvas_, d_local_closest_pt);
+    assert(isfinite(d_local_closest_pt));
+    auto d_local_pt = Vector2f{0, 0};
+    d_closest_point(shape, local_pt, d_local_closest_pt, path_info, d_shape, d_local_pt);
+    assert(isfinite(d_local_pt));
+    auto d_canvas_to_shape = Matrix3x3f();
+    d_xform_pt(canvas_to_shape,
+               pt,
+               d_local_pt,
+               d_canvas_to_shape,
+               d_pt);
+    // http://jack.valmadre.net/notes/2016/09/04/back-prop-differentials/#back-propagation-using-differentials
+    auto tc2s = transpose(canvas_to_shape);
+    d_shape_to_canvas_ += -tc2s * d_canvas_to_shape * tc2s;
+    atomic_add(&d_shape_to_canvas(0, 0), d_shape_to_canvas_);
+    if (d_translation != nullptr) {
+        atomic_add(d_translation, -d_pt);
+    }
+}
diff --git a/config/base.yaml b/config/base.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..35941a460665c58db80aad4db1aeaf99119a0b8f
--- /dev/null
+++ b/config/base.yaml
@@ -0,0 +1,91 @@
+
+default:
+  use_ycrcb: False
+  seginit:
+    type: circle
+    radius: 5
+  save:
+    init: false
+    image: false
+    output: true
+    video: false
+    loss: false
+  trainable: 
+    bg: False
+    record: True
+    stroke: False
+#  num_segments: 4
+  num_iter: 500
+  lr_base:
+    bg: 0.01
+    point: 1
+    color: 0.01
+    stroke_width: null
+    stroke_color: null
+  coord_init:
+    type: sparse
+  seed: 0
+  loss:
+    use_l1_loss: false
+    use_distance_weighted_loss: true
+    xing_loss_weight: 0.01
+    bis_loss_weight: null
+
+
+experiment_1x1:
+  path_schedule:
+    type: repeat
+    max_path: 1
+    schedule_each: 1
+
+experiment_4x1:
+  path_schedule:
+    type: repeat
+    max_path: 4
+    schedule_each: 1
+
+experiment_5x1:
+  path_schedule:
+    type: repeat
+    max_path: 5
+    schedule_each: 1
+
+experiment_8x1:
+  path_schedule:
+    type: repeat
+    max_path: 8
+    schedule_each: 1
+
+experiment_16x1:
+  path_schedule:
+    type: repeat
+    max_path: 16
+    schedule_each: 1
+
+experiment_32x1:
+  path_schedule:
+    type: repeat
+    max_path: 32
+    schedule_each: 1
+
+experiment_1357:
+  path_schedule:
+    type: list
+    schedule: [1, 3, 5, 7]
+
+
+experiment_exp2_256:
+  path_schedule:
+    type: exp
+    base: 2
+    max_path: 256
+    max_path_per_iter: 32
+
+
+experiment_exp2_128:
+  path_schedule:
+    type: exp
+    base: 2
+    max_path: 128
+    max_path_per_iter: 32
+
diff --git a/cuda_utils.h b/cuda_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e4609babc129a27397df72879bd6c8f55e71d1a
--- /dev/null
+++ b/cuda_utils.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#ifdef __CUDACC__
+    #include <cuda.h>
+    #include <cuda_runtime.h>
+#endif
+#include <cstdio>
+#include <cassert>
+#include <limits>
+
+#ifdef __CUDACC__
+#define checkCuda(x) do { if((x)!=cudaSuccess) { \
+    printf("CUDA Runtime Error: %s at %s:%d\n",\
+    cudaGetErrorString(x),__FILE__,__LINE__);\
+    exit(1);}} while(0)
+#endif
+
+template <typename T>
+DEVICE
+inline T infinity() {
+#ifdef __CUDA_ARCH__
+    const unsigned long long ieee754inf = 0x7ff0000000000000;
+    return __longlong_as_double(ieee754inf);
+#else
+    return std::numeric_limits<T>::infinity();
+#endif
+}
+
+template <>
+DEVICE
+inline double infinity() {
+#ifdef __CUDA_ARCH__
+    return __longlong_as_double(0x7ff0000000000000ULL);
+#else
+    return std::numeric_limits<double>::infinity();
+#endif
+}
+
+template <>
+DEVICE
+inline float infinity() {
+#ifdef __CUDA_ARCH__
+    return __int_as_float(0x7f800000);
+#else
+    return std::numeric_limits<float>::infinity();
+#endif
+}
+
+inline void cuda_synchronize() {
+#ifdef __CUDACC__
+    checkCuda(cudaDeviceSynchronize());
+#endif
+}
diff --git a/data/demo1.png b/data/demo1.png
new file mode 100644
index 0000000000000000000000000000000000000000..5705c2ff34aa0df1cffe65d5e5be7b41a607224c
Binary files /dev/null and b/data/demo1.png differ
diff --git a/data/demo2.jpg b/data/demo2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7ccdcf7d82f4dae849ceec62f68d10a6acddbcdd
Binary files /dev/null and b/data/demo2.jpg differ
diff --git a/data/demo3.png b/data/demo3.png
new file mode 100644
index 0000000000000000000000000000000000000000..6355c30cb8be9014029029f9b69453bae47c8b80
Binary files /dev/null and b/data/demo3.png differ
diff --git a/diffvg.cpp b/diffvg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7346d24b758b135bdd402fdb67ea412f48419eb3
--- /dev/null
+++ b/diffvg.cpp
@@ -0,0 +1,1792 @@
+#include "diffvg.h"
+#include "aabb.h"
+#include "shape.h"
+#include "sample_boundary.h"
+#include "atomic.h"
+#include "cdf.h"
+#include "compute_distance.h"
+#include "cuda_utils.h"
+#include "edge_query.h"
+#include "filter.h"
+#include "matrix.h"
+#include "parallel.h"
+#include "pcg.h"
+#include "ptr.h"
+#include "scene.h"
+#include "vector.h"
+#include "winding_number.h"
+#include "within_distance.h"
+#include <cassert>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+
+namespace py = pybind11;
+
+struct Command {
+    int shape_group_id;
+    int shape_id;
+    int point_id; // Only used by path
+};
+
+DEVICE
+bool is_inside(const SceneData &scene_data,
+               int shape_group_id,
+               const Vector2f &pt,
+               EdgeQuery *edge_query) {
+    const ShapeGroup &shape_group = scene_data.shape_groups[shape_group_id];
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(shape_group.canvas_to_shape, pt);
+    const auto &bvh_nodes = scene_data.shape_groups_bvh_nodes[shape_group_id];
+    const AABB &bbox = bvh_nodes[2 * shape_group.num_shapes - 2].box;
+    if (!inside(bbox, local_pt)) {
+        return false;
+    }
+    auto winding_number = 0;
+    // Traverse the shape group BVH
+    constexpr auto max_bvh_stack_size = 64;
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * shape_group.num_shapes - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto shape_id = node.child0;
+            auto w = compute_winding_number(
+                scene_data.shapes[shape_id], scene_data.path_bvhs[shape_id], local_pt);
+            winding_number += w;
+            if (edge_query != nullptr) {
+                if (edge_query->shape_group_id == shape_group_id &&
+                        edge_query->shape_id == shape_id) {
+                    if ((shape_group.use_even_odd_rule && abs(w) % 2 == 1) ||
+                        (!shape_group.use_even_odd_rule && w != 0)) {
+                        edge_query->hit = true;
+                    }
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (inside(b0, local_pt)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (inside(b1, local_pt)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+    if (shape_group.use_even_odd_rule) {
+        return abs(winding_number) % 2 == 1;
+    } else {
+        return winding_number != 0;
+    }
+}
+
+DEVICE void accumulate_boundary_gradient(const Shape &shape,
+                                         float contrib,
+                                         float t,
+                                         const Vector2f &normal,
+                                         const BoundaryData &boundary_data,
+                                         Shape &d_shape,
+                                         const Matrix3x3f &shape_to_canvas,
+                                         const Vector2f &local_boundary_pt,
+                                         Matrix3x3f &d_shape_to_canvas) {
+    assert(isfinite(contrib));
+    assert(isfinite(normal));
+    // According to Reynold transport theorem,
+    // the Jacobian of the boundary integral is dot(velocity, normal),
+    // where the velocity depends on the variable being differentiated with.
+    if (boundary_data.is_stroke) {
+        auto has_path_thickness = false;
+        if (shape.type == ShapeType::Path) {
+            const Path &path = *(const Path *)shape.ptr;
+            has_path_thickness = path.thickness != nullptr;
+        }
+        // differentiate stroke width: velocity is the same as normal
+        if (has_path_thickness) {
+            Path *d_p = (Path*)d_shape.ptr;
+            auto base_point_id = boundary_data.path.base_point_id;
+            auto point_id = boundary_data.path.point_id;
+            auto t = boundary_data.path.t;
+            const Path &path = *(const Path *)shape.ptr;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                // r = r0 + t * (r1 - r0)
+                atomic_add(&d_p->thickness[i0], (1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], (    t) * contrib);
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                // r = (1-t)^2r0 + 2(1-t)t r1 + t^2 r2
+                atomic_add(&d_p->thickness[i0], square(1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], (2*(1-t)*t) * contrib);
+                atomic_add(&d_p->thickness[i2], (t*t) * contrib);
+            } else if (path.num_control_points[base_point_id] == 2) {
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                // r = (1-t)^3r0 + 3*(1-t)^2tr1 + 3*(1-t)t^2r2 + t^3r3
+                atomic_add(&d_p->thickness[i0], cubic(1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], 3 * square(1 - t) * t * contrib);
+                atomic_add(&d_p->thickness[i2], 3 * (1 - t) * t * t * contrib);
+                atomic_add(&d_p->thickness[i3], t * t * t * contrib);
+            } else {
+                assert(false);
+            }
+        } else {
+            atomic_add(&d_shape.stroke_width, contrib);
+        }
+    }
+    switch (shape.type) {
+        case ShapeType::Circle: {
+            Circle *d_p = (Circle*)d_shape.ptr;
+            // velocity for the center is (1, 0) for x and (0, 1) for y
+            atomic_add(&d_p->center[0], normal * contrib);
+            // velocity for the radius is the same as the normal
+            atomic_add(&d_p->radius, contrib);
+            break;
+        } case ShapeType::Ellipse: {
+            Ellipse *d_p = (Ellipse*)d_shape.ptr;
+            // velocity for the center is (1, 0) for x and (0, 1) for y
+            atomic_add(&d_p->center[0], normal * contrib);
+            // velocity for the radius:
+            // x = center.x + r.x * cos(2pi * t)
+            // y = center.y + r.y * sin(2pi * t)
+            // for r.x: (cos(2pi * t), 0)
+            // for r.y: (0, sin(2pi * t))
+            atomic_add(&d_p->radius.x, cos(2 * float(M_PI) * t) * normal.x * contrib);
+            atomic_add(&d_p->radius.y, sin(2 * float(M_PI) * t) * normal.y * contrib);
+            break;
+        } case ShapeType::Path: {
+            Path *d_p = (Path*)d_shape.ptr;
+            auto base_point_id = boundary_data.path.base_point_id;
+            auto point_id = boundary_data.path.point_id;
+            auto t = boundary_data.path.t;
+            const Path &path = *(const Path *)shape.ptr;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                // pt = p0 + t * (p1 - p0)
+                // velocity for p0.x: (1 - t,     0)
+                //              p0.y: (    0, 1 - t)
+                //              p1.x: (    t,     0)
+                //              p1.y: (    0,     t)
+                atomic_add(&d_p->points[2 * i0 + 0], (1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], (1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], (    t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], (    t) * normal.y * contrib);
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                // pt = (1-t)^2p0 + 2(1-t)t p1 + t^2 p2
+                // velocity for p0.x: ((1-t)^2,       0)
+                //              p0.y: (      0, (1-t)^2)
+                //              p1.x: (2(1-t)t,       0)
+                //              p1.y: (      0, 2(1-t)t)
+                //              p1.x: (    t^2,       0)
+                //              p1.y: (      0,     t^2)
+                atomic_add(&d_p->points[2 * i0 + 0], square(1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], square(1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], (2*(1-t)*t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], (2*(1-t)*t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i2 + 0], (t*t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i2 + 1], (t*t) * normal.y * contrib);
+            } else if (path.num_control_points[base_point_id] == 2) {
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                // pt = (1-t)^3p0 + 3*(1-t)^2tp1 + 3*(1-t)t^2p2 + t^3p3
+                // velocity for p0.x: (   (1-t)^3,          0)
+                //              p0.y: (         0,    (1-t)^3)
+                //              p1.x: (3*(1-t)^2t,          0)
+                //              p1.y: (         0, 3*(1-t)^2t)
+                //              p2.x: (3*(1-t)t^2,          0)
+                //              p2.y: (         0, 3*(1-t)t^2)
+                //              p2.x: (       t^3,          0)
+                //              p2.y: (         0,        t^3)
+                atomic_add(&d_p->points[2 * i0 + 0], cubic(1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], cubic(1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], 3 * square(1 - t) * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], 3 * square(1 - t) * t * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i2 + 0], 3 * (1 - t) * t * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i2 + 1], 3 * (1 - t) * t * t * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i3 + 0], t * t * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i3 + 1], t * t * t * normal.y * contrib);
+            } else {
+                assert(false);
+            }
+            break;
+        } case ShapeType::Rect: {
+            Rect *d_p = (Rect*)d_shape.ptr;
+            // The velocity depends on the position of the boundary
+            if (normal == Vector2f{-1, 0}) {
+                // left
+                // velocity for p_min is (1, 0) for x and (0, 0) for y
+                atomic_add(&d_p->p_min.x, -contrib);
+            } else if (normal == Vector2f{1, 0}) {
+                // right
+                // velocity for p_max is (1, 0) for x and (0, 0) for y
+                atomic_add(&d_p->p_max.x, contrib);
+            } else if (normal == Vector2f{0, -1}) {
+                // top
+                // velocity for p_min is (0, 0) for x and (0, 1) for y
+                atomic_add(&d_p->p_min.y, -contrib);
+            } else if (normal == Vector2f{0, 1}) {
+                // bottom
+                // velocity for p_max is (0, 0) for x and (0, 1) for y
+                atomic_add(&d_p->p_max.y, contrib);
+            } else {
+                // incorrect normal assignment?
+                assert(false);
+            }
+            break;
+        } default: {
+            assert(false);
+            break;
+        }
+    }
+    // for shape_to_canvas we have the following relationship:
+    // boundary_pt = xform_pt(shape_to_canvas, local_pt)
+    // the velocity is the derivative of boundary_pt with respect to shape_to_canvas
+    // we can use reverse-mode AD to compute the dot product of the velocity and the Jacobian
+    // by passing the normal in d_xform_pt
+    auto d_shape_to_canvas_ = Matrix3x3f();
+    auto d_local_boundary_pt = Vector2f{0, 0};
+    d_xform_pt(shape_to_canvas,
+               local_boundary_pt,
+               normal * contrib,
+               d_shape_to_canvas_,
+               d_local_boundary_pt);
+    atomic_add(&d_shape_to_canvas(0, 0), d_shape_to_canvas_);
+}
+
+DEVICE
+Vector4f sample_color(const ColorType &color_type,
+                      void *color,
+                      const Vector2f &pt) {
+    switch (color_type) {
+        case ColorType::Constant: {
+            auto c = (const Constant*)color;
+            assert(isfinite(c->color));
+            return c->color;
+        } case ColorType::LinearGradient: {
+            auto c = (const LinearGradient*)color;
+            // Project pt to (c->begin, c->end)
+            auto beg = c->begin;
+            auto end = c->end;
+            auto t = dot(pt - beg, end - beg) / max(dot(end - beg, end - beg), 1e-3f);
+            // Find the correponding stop:
+            if (t < c->stop_offsets[0]) {
+                return Vector4f{c->stop_colors[0],
+                                c->stop_colors[1],
+                                c->stop_colors[2],
+                                c->stop_colors[3]};
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    assert(isfinite(tt));
+                    assert(isfinite(color_curr));
+                    assert(isfinite(color_next));
+                    return color_curr * (1 - tt) + color_next * tt;
+                }
+            }
+            return Vector4f{c->stop_colors[4 * (c->num_stops - 1) + 0],
+                            c->stop_colors[4 * (c->num_stops - 1) + 1],
+                            c->stop_colors[4 * (c->num_stops - 1) + 2],
+                            c->stop_colors[4 * (c->num_stops - 1) + 3]};
+        } case ColorType::RadialGradient: {
+            auto c = (const RadialGradient*)color;
+            // Distance from pt to center
+            auto offset = pt - c->center;
+            auto normalized_offset = offset / c->radius;
+            auto t = length(normalized_offset);
+            // Find the correponding stop:
+            if (t < c->stop_offsets[0]) {
+                return Vector4f{c->stop_colors[0],
+                                c->stop_colors[1],
+                                c->stop_colors[2],
+                                c->stop_colors[3]};
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    assert(isfinite(tt));
+                    assert(isfinite(color_curr));
+                    assert(isfinite(color_next));
+                    return color_curr * (1 - tt) + color_next * tt;
+                }
+            }
+            return Vector4f{c->stop_colors[4 * (c->num_stops - 1) + 0],
+                            c->stop_colors[4 * (c->num_stops - 1) + 1],
+                            c->stop_colors[4 * (c->num_stops - 1) + 2],
+                            c->stop_colors[4 * (c->num_stops - 1) + 3]};
+        } default: {
+            assert(false);
+        }
+    }
+    return Vector4f{};
+}
+
+DEVICE
+void d_sample_color(const ColorType &color_type,
+                    void *color_ptr,
+                    const Vector2f &pt,
+                    const Vector4f &d_color,
+                    void *d_color_ptr,
+                    float *d_translation) {
+    switch (color_type) {
+        case ColorType::Constant: {
+            auto d_c = (Constant*)d_color_ptr;
+            atomic_add(&d_c->color[0], d_color);
+            return;
+        } case ColorType::LinearGradient: {
+            auto c = (const LinearGradient*)color_ptr;
+            auto d_c = (LinearGradient*)d_color_ptr;
+            // Project pt to (c->begin, c->end)
+            auto beg = c->begin;
+            auto end = c->end;
+            auto t = dot(pt - beg, end - beg) / max(dot(end - beg, end - beg), 1e-3f);
+            // Find the correponding stop:
+            if (t < c->stop_offsets[0]) {
+                atomic_add(&d_c->stop_colors[0], d_color);
+                return;
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    // return color_curr * (1 - tt) + color_next * tt;
+                    auto d_color_curr = d_color * (1 - tt);
+                    auto d_color_next = d_color * tt;
+                    auto d_tt = sum(d_color * (color_next - color_curr));
+                    auto d_offset_next = -d_tt * tt / (offset_next - offset_curr);
+                    auto d_offset_curr = d_tt * ((tt - 1.f) / (offset_next - offset_curr));
+                    auto d_t = d_tt / (offset_next - offset_curr);
+                    assert(isfinite(d_tt));
+                    atomic_add(&d_c->stop_colors[4 * i], d_color_curr);
+                    atomic_add(&d_c->stop_colors[4 * (i + 1)], d_color_next);
+                    atomic_add(&d_c->stop_offsets[i], d_offset_curr);
+                    atomic_add(&d_c->stop_offsets[i + 1], d_offset_next);
+                    // auto t = dot(pt - beg, end - beg) / max(dot(end - beg, end - beg), 1e-6f);
+                    // l = max(dot(end - beg, end - beg), 1e-3f)
+                    // t = dot(pt - beg, end - beg) / l;
+                    auto l = max(dot(end - beg, end - beg), 1e-3f);
+                    auto d_beg = d_t * (-(pt - beg)-(end - beg)) / l;
+                    auto d_end = d_t * (pt - beg) / l;
+                    auto d_l = -d_t * t / l;
+                    if (dot(end - beg, end - beg) > 1e-3f) {
+                        d_beg += 2 * d_l * (beg - end);
+                        d_end += 2 * d_l * (end - beg);
+                    }
+                    atomic_add(&d_c->begin[0], d_beg);
+                    atomic_add(&d_c->end[0], d_end);
+                    if (d_translation != nullptr) {
+                        atomic_add(d_translation, (d_beg + d_end));
+                    }
+                    return;
+                }
+            }
+            atomic_add(&d_c->stop_colors[4 * (c->num_stops - 1)], d_color);
+            return;
+        } case ColorType::RadialGradient: {
+            auto c = (const RadialGradient*)color_ptr;
+            auto d_c = (RadialGradient*)d_color_ptr;
+            // Distance from pt to center
+            auto offset = pt - c->center;
+            auto normalized_offset = offset / c->radius;
+            auto t = length(normalized_offset);
+            // Find the correponding stop:
+            if (t < c->stop_offsets[0]) {
+                atomic_add(&d_c->stop_colors[0], d_color);
+                return;
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    assert(isfinite(tt));
+                    // return color_curr * (1 - tt) + color_next * tt;
+                    auto d_color_curr = d_color * (1 - tt);
+                    auto d_color_next = d_color * tt;
+                    auto d_tt = sum(d_color * (color_next - color_curr));
+                    auto d_offset_next = -d_tt * tt / (offset_next - offset_curr);
+                    auto d_offset_curr = d_tt * ((tt - 1.f) / (offset_next - offset_curr));
+                    auto d_t = d_tt / (offset_next - offset_curr);
+                    assert(isfinite(d_t));
+                    atomic_add(&d_c->stop_colors[4 * i], d_color_curr);
+                    atomic_add(&d_c->stop_colors[4 * (i + 1)], d_color_next);
+                    atomic_add(&d_c->stop_offsets[i], d_offset_curr);
+                    atomic_add(&d_c->stop_offsets[i + 1], d_offset_next);
+                    // offset = pt - c->center
+                    // normalized_offset = offset / c->radius
+                    // t = length(normalized_offset)
+                    auto d_normalized_offset = d_length(normalized_offset, d_t);
+                    auto d_offset = d_normalized_offset / c->radius;
+                    auto d_radius = -d_normalized_offset * offset / (c->radius * c->radius);
+                    auto d_center = -d_offset;
+                    atomic_add(&d_c->center[0], d_center);
+                    atomic_add(&d_c->radius[0], d_radius);
+                    if (d_translation != nullptr) {
+                        atomic_add(d_translation, d_center);
+                    }
+                }
+            }
+            atomic_add(&d_c->stop_colors[4 * (c->num_stops - 1)], d_color);
+            return;
+        } default: {
+            assert(false);
+        }
+    }
+}
+
+struct Fragment {
+    Vector3f color;
+    float alpha;
+    int group_id;
+    bool is_stroke;
+};
+
+struct PrefilterFragment {
+    Vector3f color;
+    float alpha;
+    int group_id;
+    bool is_stroke;
+    int shape_id;
+    float distance;
+    Vector2f closest_pt;
+    ClosestPointPathInfo path_info;
+    bool within_distance;
+};
+
+DEVICE
+Vector4f sample_color(const SceneData &scene,
+                      const Vector4f *background_color,
+                      const Vector2f &screen_pt,
+                      const Vector4f *d_color = nullptr,
+                      EdgeQuery *edge_query = nullptr,
+                      Vector4f *d_background_color = nullptr,
+                      float *d_translation = nullptr) {
+    if (edge_query != nullptr) {
+        edge_query->hit = false;
+    }
+
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    constexpr auto max_hit_shapes = 256;
+    constexpr auto max_bvh_stack_size = 64;
+    Fragment fragments[max_hit_shapes];
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    auto num_fragments = 0;
+    bvh_stack[stack_size++] = 2 * scene.num_shape_groups - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = scene.bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto group_id = node.child0;
+            const ShapeGroup &shape_group = scene.shape_groups[group_id];
+            if (shape_group.stroke_color != nullptr) {
+                if (within_distance(scene, group_id, pt, edge_query)) {
+                    auto color_alpha = sample_color(shape_group.stroke_color_type,
+                                                    shape_group.stroke_color,
+                                                    pt);
+                    Fragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.is_stroke = true;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+            if (shape_group.fill_color != nullptr) {
+                if (is_inside(scene, group_id, pt, edge_query)) {
+                    auto color_alpha = sample_color(shape_group.fill_color_type,
+                                                    shape_group.fill_color,
+                                                    pt);
+                    Fragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.is_stroke = false;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = scene.bvh_nodes[node.child0].box;
+            if (inside(b0, pt, scene.bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = scene.bvh_nodes[node.child1].box;
+            if (inside(b1, pt, scene.bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+    if (num_fragments <= 0) {
+        if (background_color != nullptr) {
+            if (d_background_color != nullptr) {
+                *d_background_color = *d_color;
+            }
+            return *background_color;
+        }
+        return Vector4f{0, 0, 0, 0};
+    }
+    // Sort the fragments from back to front (i.e. increasing order of group id)
+    // https://github.com/frigaut/yorick-imutil/blob/master/insort.c#L37
+    for (int i = 1; i < num_fragments; i++) {
+        auto j = i;
+        auto temp = fragments[j];
+        while (j > 0 && fragments[j - 1].group_id > temp.group_id) {
+            fragments[j] = fragments[j - 1];
+            j--;
+        }
+        fragments[j] = temp;
+    }
+    // Blend the color
+    Vector3f accum_color[max_hit_shapes];
+    float accum_alpha[max_hit_shapes];
+    // auto hit_opaque = false;
+    auto first_alpha = 0.f;
+    auto first_color = Vector3f{0, 0, 0};
+    if (background_color != nullptr) {
+        first_alpha = background_color->w;
+        first_color = Vector3f{background_color->x,
+                               background_color->y,
+                               background_color->z};
+    }
+    for (int i = 0; i < num_fragments; i++) {
+        const Fragment &fragment = fragments[i];
+        auto new_color = fragment.color;
+        auto new_alpha = fragment.alpha;
+        auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+        auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+        if (edge_query != nullptr) {
+            // Do we hit the target shape?
+            if (new_alpha >= 1.f && edge_query->hit) {
+                // A fully opaque shape in front of the target occludes it
+                edge_query->hit = false;
+            }
+            if (edge_query->shape_group_id == fragment.group_id) {
+                edge_query->hit = true;
+            }
+        }
+        // prev_color is alpha premultiplied, don't need to multiply with
+        // prev_alpha
+        accum_color[i] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+        accum_alpha[i] = prev_alpha * (1 - new_alpha) + new_alpha;
+    }
+    auto final_color = accum_color[num_fragments - 1];
+    auto final_alpha = accum_alpha[num_fragments - 1];
+    if (final_alpha > 1e-6f) {
+        final_color /= final_alpha;
+    }
+    assert(isfinite(final_color));
+    assert(isfinite(final_alpha));
+    if (d_color != nullptr) {
+        // Backward pass
+        auto d_final_color = Vector3f{(*d_color)[0], (*d_color)[1], (*d_color)[2]};
+        auto d_final_alpha = (*d_color)[3];
+        auto d_curr_color = d_final_color;
+        auto d_curr_alpha = d_final_alpha;
+        if (final_alpha > 1e-6f) {
+            // final_color = curr_color / final_alpha
+            d_curr_color = d_final_color / final_alpha;
+            d_curr_alpha -= sum(d_final_color * final_color) / final_alpha;
+        }
+        assert(isfinite(*d_color));
+        assert(isfinite(d_curr_color));
+        assert(isfinite(d_curr_alpha));
+        for (int i = num_fragments - 1; i >= 0; i--) {
+            // color[n] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+            // alpha[n] = prev_alpha * (1 - new_alpha) + new_alpha;
+            auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+            auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+            auto d_prev_alpha = d_curr_alpha * (1.f - fragments[i].alpha);
+            auto d_alpha_i = d_curr_alpha * (1.f - prev_alpha);
+            d_alpha_i += sum(d_curr_color * (fragments[i].color - prev_color));
+            auto d_prev_color = d_curr_color * (1 - fragments[i].alpha);
+            auto d_color_i = d_curr_color * fragments[i].alpha;
+            auto group_id = fragments[i].group_id;
+            if (fragments[i].is_stroke) {
+                d_sample_color(scene.shape_groups[group_id].stroke_color_type,
+                               scene.shape_groups[group_id].stroke_color,
+                               pt,
+                               Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                               scene.d_shape_groups[group_id].stroke_color,
+                               d_translation);
+            } else {
+                d_sample_color(scene.shape_groups[group_id].fill_color_type,
+                               scene.shape_groups[group_id].fill_color,
+                               pt,
+                               Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                               scene.d_shape_groups[group_id].fill_color,
+                               d_translation);
+            }
+            d_curr_color = d_prev_color;
+            d_curr_alpha = d_prev_alpha;
+        }
+        if (d_background_color != nullptr) {
+            d_background_color->x += d_curr_color.x;
+            d_background_color->y += d_curr_color.y;
+            d_background_color->z += d_curr_color.z;
+            d_background_color->w += d_curr_alpha;
+        }
+    }
+    return Vector4f{final_color[0], final_color[1], final_color[2], final_alpha};
+}
+
+DEVICE
+float sample_distance(const SceneData &scene,
+                      const Vector2f &screen_pt,
+                      float weight,
+                      const float *d_dist = nullptr,
+                      float *d_translation = nullptr) {
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    // for each shape
+    auto min_group_id = -1;
+    auto min_distance = 0.f;
+    auto min_shape_id = -1;
+    auto closest_pt = Vector2f{0, 0};
+    auto min_path_info = ClosestPointPathInfo{-1, -1, 0};
+    for (int group_id = scene.num_shape_groups - 1; group_id >= 0; group_id--) {
+        auto s = -1;
+        auto p = Vector2f{0, 0};
+        ClosestPointPathInfo local_path_info;
+        auto d = infinity<float>();
+        if (compute_distance(scene, group_id, pt, infinity<float>(), &s, &p, &local_path_info, &d)) {
+            if (min_group_id == -1 || d < min_distance) {
+                min_distance = d;
+                min_group_id = group_id;
+                min_shape_id = s;
+                closest_pt = p;
+                min_path_info = local_path_info;
+            }
+        }
+    }
+    if (min_group_id == -1) {
+        return min_distance;
+    }
+    min_distance *= weight;
+    auto inside = false;
+    const ShapeGroup &shape_group = scene.shape_groups[min_group_id];
+    if (shape_group.fill_color != nullptr) {
+        inside = is_inside(scene,
+                           min_group_id,
+                           pt,
+                           nullptr);
+        if (inside) {
+            min_distance = -min_distance;
+        }
+    }
+    assert((min_group_id >= 0 && min_shape_id >= 0) || scene.num_shape_groups == 0);
+    if (d_dist != nullptr) {
+        auto d_abs_dist = inside ? -(*d_dist) : (*d_dist);
+        const ShapeGroup &shape_group = scene.shape_groups[min_group_id];
+        const Shape &shape = scene.shapes[min_shape_id];
+        ShapeGroup &d_shape_group = scene.d_shape_groups[min_group_id];
+        Shape &d_shape = scene.d_shapes[min_shape_id];
+        d_compute_distance(shape_group.canvas_to_shape,
+                           shape_group.shape_to_canvas,
+                           shape,
+                           pt,
+                           closest_pt,
+                           min_path_info,
+                           d_abs_dist,
+                           d_shape_group.shape_to_canvas,
+                           d_shape,
+                           d_translation);
+    }
+    return min_distance;
+}
+
+// Gather d_color from d_image inside the filter kernel, normalize by
+// weight_image.
+DEVICE
+Vector4f gather_d_color(const Filter &filter,
+                        const float *d_color_image,
+                        const float *weight_image,
+                        int width,
+                        int height,
+                        const Vector2f &pt) {
+    auto x = int(pt.x);
+    auto y = int(pt.y);
+    auto radius = filter.radius;
+    assert(radius > 0);
+    auto ri = (int)ceil(radius);
+    auto d_color = Vector4f{0, 0, 0, 0};
+    for (int dy = -ri; dy <= ri; dy++) {
+        for (int dx = -ri; dx <= ri; dx++) {
+            auto xx = x + dx;
+            auto yy = y + dy;
+            if (xx >= 0 && xx < width && yy >= 0 && yy < height) {
+                auto xc = xx + 0.5f;
+                auto yc = yy + 0.5f;
+                auto filter_weight =
+                    compute_filter_weight(filter, xc - pt.x, yc - pt.y);
+                // pixel = \sum weight * color / \sum weight
+                auto weight_sum = weight_image[yy * width + xx];
+                if (weight_sum > 0) {
+                    d_color += (filter_weight / weight_sum) * Vector4f{
+                        d_color_image[4 * (yy * width + xx) + 0],
+                        d_color_image[4 * (yy * width + xx) + 1],
+                        d_color_image[4 * (yy * width + xx) + 2],
+                        d_color_image[4 * (yy * width + xx) + 3],
+                    };
+                }
+            }
+        }
+    }
+    return d_color;
+}
+
+DEVICE
+float smoothstep(float d) {
+    auto t = clamp((d + 1.f) / 2.f, 0.f, 1.f);
+    return t * t * (3 - 2 * t);
+}
+
+DEVICE
+float d_smoothstep(float d, float d_ret) {
+    if (d < -1.f || d > 1.f) {
+        return 0.f;
+    }
+    auto t = (d + 1.f) / 2.f;
+    // ret = t * t * (3 - 2 * t)
+    //     = 3 * t * t - 2 * t * t * t
+    auto d_t = d_ret * (6 * t - 6 * t * t);
+    return d_t / 2.f;
+}
+
+DEVICE
+Vector4f sample_color_prefiltered(const SceneData &scene,
+                                  const Vector4f *background_color,
+                                  const Vector2f &screen_pt,
+                                  const Vector4f *d_color = nullptr,
+                                  Vector4f *d_background_color = nullptr,
+                                  float *d_translation = nullptr) {
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    constexpr auto max_hit_shapes = 64;
+    constexpr auto max_bvh_stack_size = 64;
+    PrefilterFragment fragments[max_hit_shapes];
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    auto num_fragments = 0;
+    bvh_stack[stack_size++] = 2 * scene.num_shape_groups - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = scene.bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto group_id = node.child0;
+            const ShapeGroup &shape_group = scene.shape_groups[group_id];
+            if (shape_group.stroke_color != nullptr) {
+                auto min_shape_id = -1;
+                auto closest_pt = Vector2f{0, 0};
+                auto local_path_info = ClosestPointPathInfo{-1, -1, 0};
+                auto d = infinity<float>();
+                compute_distance(scene, group_id, pt, infinity<float>(),
+                                 &min_shape_id, &closest_pt, &local_path_info, &d);
+                assert(min_shape_id != -1);
+                const auto &shape = scene.shapes[min_shape_id];
+                auto w = smoothstep(fabs(d) + shape.stroke_width) -
+                         smoothstep(fabs(d) - shape.stroke_width);
+                if (w > 0) {
+                    auto color_alpha = sample_color(shape_group.stroke_color_type,
+                                                    shape_group.stroke_color,
+                                                    pt);
+                    color_alpha[3] *= w;
+
+                    PrefilterFragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.shape_id = min_shape_id;
+                    f.distance = d;
+                    f.closest_pt = closest_pt;
+                    f.is_stroke = true;
+                    f.path_info = local_path_info;
+                    f.within_distance = true;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+            if (shape_group.fill_color != nullptr) {
+                auto min_shape_id = -1;
+                auto closest_pt = Vector2f{0, 0};
+                auto local_path_info = ClosestPointPathInfo{-1, -1, 0};
+                auto d = infinity<float>();
+                auto found = compute_distance(scene,
+                                              group_id,
+                                              pt,
+                                              1.f,
+                                              &min_shape_id,
+                                              &closest_pt,
+                                              &local_path_info,
+                                              &d);
+                auto inside = is_inside(scene, group_id, pt, nullptr);
+                if (found || inside) {
+                    if (!inside) {
+                        d = -d;
+                    }
+                    auto w = smoothstep(d);
+                    if (w > 0) {
+                        auto color_alpha = sample_color(shape_group.fill_color_type,
+                                                        shape_group.fill_color,
+                                                        pt);
+                        color_alpha[3] *= w;
+
+                        PrefilterFragment f;
+                        f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                        f.alpha = color_alpha[3];
+                        f.group_id = group_id;
+                        f.shape_id = min_shape_id;
+                        f.distance = d;
+                        f.closest_pt = closest_pt;
+                        f.is_stroke = false;
+                        f.path_info = local_path_info;
+                        f.within_distance = found;
+                        assert(num_fragments < max_hit_shapes);
+                        fragments[num_fragments++] = f;
+                    }
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = scene.bvh_nodes[node.child0].box;
+            if (inside(b0, pt, scene.bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = scene.bvh_nodes[node.child1].box;
+            if (inside(b1, pt, scene.bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+    if (num_fragments <= 0) {
+        if (background_color != nullptr) {
+            if (d_background_color != nullptr) {
+                *d_background_color = *d_color;
+            }
+            return *background_color;
+        }
+        return Vector4f{0, 0, 0, 0};
+    }
+    // Sort the fragments from back to front (i.e. increasing order of group id)
+    // https://github.com/frigaut/yorick-imutil/blob/master/insort.c#L37
+    for (int i = 1; i < num_fragments; i++) {
+        auto j = i;
+        auto temp = fragments[j];
+        while (j > 0 && fragments[j - 1].group_id > temp.group_id) {
+            fragments[j] = fragments[j - 1];
+            j--;
+        }
+        fragments[j] = temp;
+    }
+    // Blend the color
+    Vector3f accum_color[max_hit_shapes];
+    float accum_alpha[max_hit_shapes];
+    auto first_alpha = 0.f;
+    auto first_color = Vector3f{0, 0, 0};
+    if (background_color != nullptr) {
+        first_alpha = background_color->w;
+        first_color = Vector3f{background_color->x,
+                               background_color->y,
+                               background_color->z};
+    }
+    for (int i = 0; i < num_fragments; i++) {
+        const PrefilterFragment &fragment = fragments[i];
+        auto new_color = fragment.color;
+        auto new_alpha = fragment.alpha;
+        auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+        auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+        // prev_color is alpha premultiplied, don't need to multiply with
+        // prev_alpha
+        accum_color[i] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+        accum_alpha[i] = prev_alpha * (1 - new_alpha) + new_alpha;
+    }
+    auto final_color = accum_color[num_fragments - 1];
+    auto final_alpha = accum_alpha[num_fragments - 1];
+    if (final_alpha > 1e-6f) {
+        final_color /= final_alpha;
+    }
+    assert(isfinite(final_color));
+    assert(isfinite(final_alpha));
+    if (d_color != nullptr) {
+        // Backward pass
+        auto d_final_color = Vector3f{(*d_color)[0], (*d_color)[1], (*d_color)[2]};
+        auto d_final_alpha = (*d_color)[3];
+        auto d_curr_color = d_final_color;
+        auto d_curr_alpha = d_final_alpha;
+        if (final_alpha > 1e-6f) {
+            // final_color = curr_color / final_alpha
+            d_curr_color = d_final_color / final_alpha;
+            d_curr_alpha -= sum(d_final_color * final_color) / final_alpha;
+        }
+        assert(isfinite(*d_color));
+        assert(isfinite(d_curr_color));
+        assert(isfinite(d_curr_alpha));
+        for (int i = num_fragments - 1; i >= 0; i--) {
+            // color[n] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+            // alpha[n] = prev_alpha * (1 - new_alpha) + new_alpha;
+            auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+            auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+            auto d_prev_alpha = d_curr_alpha * (1.f - fragments[i].alpha);
+            auto d_alpha_i = d_curr_alpha * (1.f - prev_alpha);
+            d_alpha_i += sum(d_curr_color * (fragments[i].color - prev_color));
+            auto d_prev_color = d_curr_color * (1 - fragments[i].alpha);
+            auto d_color_i = d_curr_color * fragments[i].alpha;
+            auto group_id = fragments[i].group_id;
+            if (fragments[i].is_stroke) {
+                const auto &shape = scene.shapes[fragments[i].shape_id];
+                auto d = fragments[i].distance;
+                auto abs_d_plus_width = fabs(d) + shape.stroke_width;
+                auto abs_d_minus_width = fabs(d) - shape.stroke_width;
+                auto w = smoothstep(abs_d_plus_width) -
+                         smoothstep(abs_d_minus_width);
+                if (w != 0) {
+                    auto d_w = w > 0 ? (fragments[i].alpha / w) * d_alpha_i : 0.f;
+                    d_alpha_i *= w;
+
+                    // Backprop to color
+                    d_sample_color(scene.shape_groups[group_id].stroke_color_type,
+                                   scene.shape_groups[group_id].stroke_color,
+                                   pt,
+                                   Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                                   scene.d_shape_groups[group_id].stroke_color,
+                                   d_translation);
+
+                    auto d_abs_d_plus_width = d_smoothstep(abs_d_plus_width, d_w);
+                    auto d_abs_d_minus_width = -d_smoothstep(abs_d_minus_width, d_w);
+
+                    auto d_d = d_abs_d_plus_width + d_abs_d_minus_width;
+                    if (d < 0) {
+                        d_d = -d_d;
+                    }
+                    auto d_stroke_width = d_abs_d_plus_width - d_abs_d_minus_width;
+
+                    const auto &shape_group = scene.shape_groups[group_id];
+                    ShapeGroup &d_shape_group = scene.d_shape_groups[group_id];
+                    Shape &d_shape = scene.d_shapes[fragments[i].shape_id];
+                    if (fabs(d_d) > 1e-10f) {
+                        d_compute_distance(shape_group.canvas_to_shape,
+                                           shape_group.shape_to_canvas,
+                                           shape,
+                                           pt,
+                                           fragments[i].closest_pt,
+                                           fragments[i].path_info,
+                                           d_d,
+                                           d_shape_group.shape_to_canvas,
+                                           d_shape,
+                                           d_translation);
+                    }
+                    atomic_add(&d_shape.stroke_width, d_stroke_width);
+                }
+            } else {
+                const auto &shape = scene.shapes[fragments[i].shape_id];
+                auto d = fragments[i].distance;
+                auto w = smoothstep(d);
+                if (w != 0) {
+                    // color_alpha[3] = color_alpha[3] * w;
+                    auto d_w = w > 0 ? (fragments[i].alpha / w) * d_alpha_i : 0.f;
+                    d_alpha_i *= w;
+
+                    d_sample_color(scene.shape_groups[group_id].fill_color_type,
+                                   scene.shape_groups[group_id].fill_color,
+                                   pt,
+                                   Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                                   scene.d_shape_groups[group_id].fill_color,
+                                   d_translation);
+
+                    // w = smoothstep(d)
+                    auto d_d = d_smoothstep(d, d_w);
+                    if (d < 0) {
+                        d_d = -d_d;
+                    }
+
+                    const auto &shape_group = scene.shape_groups[group_id];
+                    ShapeGroup &d_shape_group = scene.d_shape_groups[group_id];
+                    Shape &d_shape = scene.d_shapes[fragments[i].shape_id];
+                    if (fabs(d_d) > 1e-10f && fragments[i].within_distance) {
+                        d_compute_distance(shape_group.canvas_to_shape,
+                                           shape_group.shape_to_canvas,
+                                           shape,
+                                           pt,
+                                           fragments[i].closest_pt,
+                                           fragments[i].path_info,
+                                           d_d,
+                                           d_shape_group.shape_to_canvas,
+                                           d_shape,
+                                           d_translation);
+                    }
+                }
+            }
+            d_curr_color = d_prev_color;
+            d_curr_alpha = d_prev_alpha;
+        }
+        if (d_background_color != nullptr) {
+            d_background_color->x += d_curr_color.x;
+            d_background_color->y += d_curr_color.y;
+            d_background_color->z += d_curr_color.z;
+            d_background_color->w += d_curr_alpha;
+        }
+    }
+    return Vector4f{final_color[0], final_color[1], final_color[2], final_alpha};
+}
+
+struct weight_kernel {
+    DEVICE void operator()(int idx) {
+        auto rng_state = init_pcg32(idx, seed);
+        // height * width * num_samples_y * num_samples_x
+        auto sx = idx % num_samples_x;
+        auto sy = (idx / num_samples_x) % num_samples_y;
+        auto x = (idx / (num_samples_x * num_samples_y)) % width;
+        auto y = (idx / (num_samples_x * num_samples_y * width));
+        assert(y < height);
+        auto rx = next_pcg32_float(&rng_state);
+        auto ry = next_pcg32_float(&rng_state);
+        if (use_prefiltering) {
+            rx = ry = 0.5f;
+        }
+        auto pt = Vector2f{x + ((float)sx + rx) / num_samples_x,
+                           y + ((float)sy + ry) / num_samples_y};
+        auto radius = scene.filter->radius;
+        assert(radius >= 0);
+        auto ri = (int)ceil(radius);
+        for (int dy = -ri; dy <= ri; dy++) {
+            for (int dx = -ri; dx <= ri; dx++) {
+                auto xx = x + dx;
+                auto yy = y + dy;
+                if (xx >= 0 && xx < width && yy >= 0 && yy < height) {
+                    auto xc = xx + 0.5f;
+                    auto yc = yy + 0.5f;
+                    auto filter_weight = compute_filter_weight(*scene.filter,
+                                                               xc - pt.x,
+                                                               yc - pt.y);
+                    atomic_add(weight_image[yy * width + xx], filter_weight);
+                }
+            }
+        }
+    }
+
+    SceneData scene;
+    float *weight_image;
+    int width;
+    int height;
+    int num_samples_x;
+    int num_samples_y;
+    uint64_t seed;
+    bool use_prefiltering;
+};
+
+// We use a "mega kernel" for rendering
+struct render_kernel {
+    DEVICE void operator()(int idx) {
+        // height * width * num_samples_y * num_samples_x
+        auto pt = Vector2f{0, 0};
+        auto x = 0;
+        auto y = 0;
+        if (eval_positions == nullptr) {
+            auto rng_state = init_pcg32(idx, seed);
+            auto sx = idx % num_samples_x;
+            auto sy = (idx / num_samples_x) % num_samples_y;
+            x = (idx / (num_samples_x * num_samples_y)) % width;
+            y = (idx / (num_samples_x * num_samples_y * width));
+            assert(x < width && y < height);
+            auto rx = next_pcg32_float(&rng_state);
+            auto ry = next_pcg32_float(&rng_state);
+            if (use_prefiltering) {
+                rx = ry = 0.5f;
+            }
+            pt = Vector2f{x + ((float)sx + rx) / num_samples_x,
+                          y + ((float)sy + ry) / num_samples_y};
+        } else {
+            pt = Vector2f{eval_positions[2 * idx],
+                          eval_positions[2 * idx + 1]};
+            x = int(pt.x);
+            y = int(pt.y);
+        }
+
+        // normalize pt to [0, 1]
+        auto npt = pt;
+        npt.x /= width;
+        npt.y /= height;
+        auto num_samples = num_samples_x * num_samples_y;
+        if (render_image != nullptr || d_render_image != nullptr) {
+            Vector4f d_color = Vector4f{0, 0, 0, 0};
+            if (d_render_image != nullptr) {
+                // Gather d_color from d_render_image inside the filter kernel
+                // normalize using weight_image
+                d_color = gather_d_color(*scene.filter,
+                                         d_render_image,
+                                         weight_image,
+                                         width,
+                                         height,
+                                         pt);
+            }
+            auto color = Vector4f{0, 0, 0, 0};
+            if (use_prefiltering) {
+                color = sample_color_prefiltered(scene,
+                    background_image != nullptr ? (const Vector4f*)&background_image[4 * ((y * width) + x)] : nullptr,
+                    npt,
+                    d_render_image != nullptr ? &d_color : nullptr,
+                    d_background_image != nullptr ? (Vector4f*)&d_background_image[4 * ((y * width) + x)] : nullptr,
+                    d_translation != nullptr ? &d_translation[2 * (y * width + x)] : nullptr);
+            } else {
+                color = sample_color(scene,
+                    background_image != nullptr ? (const Vector4f*)&background_image[4 * ((y * width) + x)] : nullptr,
+                    npt,
+                    d_render_image != nullptr ? &d_color : nullptr,
+                    nullptr,
+                    d_background_image != nullptr ? (Vector4f*)&d_background_image[4 * ((y * width) + x)] : nullptr,
+                    d_translation != nullptr ? &d_translation[2 * (y * width + x)] : nullptr);
+            }
+            assert(isfinite(color));
+            // Splat color onto render_image
+            auto radius = scene.filter->radius;
+            assert(radius >= 0);
+            auto ri = (int)ceil(radius);
+            for (int dy = -ri; dy <= ri; dy++) {
+                for (int dx = -ri; dx <= ri; dx++) {
+                    auto xx = x + dx;
+                    auto yy = y + dy;
+                    if (xx >= 0 && xx < width && yy >= 0 && yy < height &&
+                            weight_image[yy * width + xx] > 0) {
+                        auto weight_sum = weight_image[yy * width + xx];
+                        auto xc = xx + 0.5f;
+                        auto yc = yy + 0.5f;
+                        auto filter_weight = compute_filter_weight(*scene.filter,
+                                                                   xc - pt.x,
+                                                                   yc - pt.y);
+                        auto weighted_color = filter_weight * color / weight_sum;
+                        if (render_image != nullptr) {
+                            atomic_add(render_image[4 * (yy * width + xx) + 0],
+                                       weighted_color[0]);
+                            atomic_add(render_image[4 * (yy * width + xx) + 1],
+                                       weighted_color[1]);
+                            atomic_add(render_image[4 * (yy * width + xx) + 2],
+                                       weighted_color[2]);
+                            atomic_add(render_image[4 * (yy * width + xx) + 3],
+                                       weighted_color[3]);
+                        }
+                        if (d_render_image != nullptr) {
+                            // Backprop to filter_weight
+                            // pixel = \sum weight * color / \sum weight
+                            auto d_pixel = Vector4f{
+                                d_render_image[4 * (yy * width + xx) + 0],
+                                d_render_image[4 * (yy * width + xx) + 1],
+                                d_render_image[4 * (yy * width + xx) + 2],
+                                d_render_image[4 * (yy * width + xx) + 3],
+                            };
+                            auto d_weight =
+                                (dot(d_pixel, color) * weight_sum -
+                                 filter_weight * dot(d_pixel, color) * (weight_sum - filter_weight)) /
+                                square(weight_sum);
+                            d_compute_filter_weight(*scene.filter,
+                                                    xc - pt.x,
+                                                    yc - pt.y,
+                                                    d_weight,
+                                                    scene.d_filter);
+                        }
+                    }
+                }
+            }
+        }
+        if (sdf_image != nullptr || d_sdf_image != nullptr) {
+            float d_dist = 0.f;
+            if (d_sdf_image != nullptr) {
+                if (eval_positions == nullptr) {
+                    d_dist = d_sdf_image[y * width + x];
+                } else {
+                    d_dist = d_sdf_image[idx];
+                }
+            }
+            auto weight = eval_positions == nullptr ? 1.f / num_samples : 1.f;
+            auto dist = sample_distance(scene, npt, weight,
+                d_sdf_image != nullptr ? &d_dist : nullptr, 
+                d_translation != nullptr ? &d_translation[2 * (y * width + x)] : nullptr);
+            if (sdf_image != nullptr) {
+                if (eval_positions == nullptr) {
+                    atomic_add(sdf_image[y * width + x], dist);
+                } else {
+                    atomic_add(sdf_image[idx], dist);
+                }
+            }
+        }
+    }
+
+    SceneData scene;
+    float *background_image;
+    float *render_image;
+    float *weight_image;
+    float *sdf_image;
+    float *d_background_image;
+    float *d_render_image;
+    float *d_sdf_image;
+    float *d_translation;
+    int width;
+    int height;
+    int num_samples_x;
+    int num_samples_y;
+    uint64_t seed;
+    bool use_prefiltering;
+    float *eval_positions;
+};
+
+struct BoundarySample {
+    Vector2f pt;
+    Vector2f local_pt;
+    Vector2f normal;
+    int shape_group_id;
+    int shape_id;
+    float t;
+    BoundaryData data;
+    float pdf;
+};
+
+struct sample_boundary_kernel {
+    DEVICE void operator()(int idx) {
+        boundary_samples[idx].pt = Vector2f{0, 0};
+        boundary_samples[idx].shape_id = -1;
+        boundary_ids[idx] = idx;
+        morton_codes[idx] = 0;
+
+        auto rng_state = init_pcg32(idx, seed);
+        auto u = next_pcg32_float(&rng_state);
+        // Sample a shape
+        auto sample_id = sample(scene.sample_shapes_cdf,
+                                scene.num_total_shapes,
+                                u);
+        assert(sample_id >= 0 && sample_id < scene.num_total_shapes);
+        auto shape_id = scene.sample_shape_id[sample_id];
+        assert(shape_id >= 0 && shape_id < scene.num_shapes);
+        auto shape_group_id = scene.sample_group_id[sample_id];
+        assert(shape_group_id >= 0 && shape_group_id < scene.num_shape_groups);
+        auto shape_pmf = scene.sample_shapes_pmf[shape_id];
+        if (shape_pmf <= 0) {
+            return;
+        }
+        // Sample a point on the boundary of the shape
+        auto boundary_pdf = 0.f;
+        auto normal = Vector2f{0, 0};
+        auto t = next_pcg32_float(&rng_state);
+        BoundaryData boundary_data;
+        const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+        auto local_boundary_pt = sample_boundary(
+            scene, shape_group_id, shape_id,
+            t, normal, boundary_pdf, boundary_data);
+        if (boundary_pdf <= 0) {
+            return;
+        }
+
+        // local_boundary_pt & normal are in shape's local space,
+        // transform them to canvas space
+        auto boundary_pt = xform_pt(shape_group.shape_to_canvas, local_boundary_pt);
+        normal = xform_normal(shape_group.canvas_to_shape, normal);
+        // Normalize boundary_pt to [0, 1)
+        boundary_pt.x /= scene.canvas_width;
+        boundary_pt.y /= scene.canvas_height;
+
+        boundary_samples[idx].pt = boundary_pt;
+        boundary_samples[idx].local_pt = local_boundary_pt;
+        boundary_samples[idx].normal = normal;
+        boundary_samples[idx].shape_group_id = shape_group_id;
+        boundary_samples[idx].shape_id = shape_id;
+        boundary_samples[idx].t = t;
+        boundary_samples[idx].data = boundary_data;
+        boundary_samples[idx].pdf = shape_pmf * boundary_pdf;
+        TVector2<uint32_t> p_i{boundary_pt.x * 1023, boundary_pt.y * 1023};
+        morton_codes[idx] = (expand_bits(p_i.x) << 1u) |
+                            (expand_bits(p_i.y) << 0u);
+    }
+
+    SceneData scene;
+    uint64_t seed;
+    BoundarySample *boundary_samples;
+    int *boundary_ids;
+    uint32_t *morton_codes;
+};
+
+struct render_edge_kernel {
+    DEVICE void operator()(int idx) {
+        auto bid = boundary_ids[idx];
+        if (boundary_samples[bid].shape_id == -1) {
+            return;
+        }
+        auto boundary_pt = boundary_samples[bid].pt;
+        auto local_boundary_pt = boundary_samples[bid].local_pt;
+        auto normal = boundary_samples[bid].normal;
+        auto shape_group_id = boundary_samples[bid].shape_group_id;
+        auto shape_id = boundary_samples[bid].shape_id;
+        auto t = boundary_samples[bid].t;
+        auto boundary_data = boundary_samples[bid].data;
+        auto pdf = boundary_samples[bid].pdf;
+
+        const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+
+        auto bx = int(boundary_pt.x * width);
+        auto by = int(boundary_pt.y * height);
+        if (bx < 0 || bx >= width || by < 0 || by >= height) {
+            return;
+        }
+
+        // Sample the two sides of the boundary
+        auto inside_query = EdgeQuery{shape_group_id, shape_id, false};
+        auto outside_query = EdgeQuery{shape_group_id, shape_id, false};
+        auto color_inside = sample_color(scene,
+            background_image != nullptr ? (const Vector4f *)&background_image[4 * ((by * width) + bx)] : nullptr,
+            boundary_pt - 1e-4f * normal,
+            nullptr, &inside_query);
+        auto color_outside = sample_color(scene,
+            background_image != nullptr ? (const Vector4f *)&background_image[4 * ((by * width) + bx)] : nullptr,
+            boundary_pt + 1e-4f * normal,
+            nullptr, &outside_query);
+        if (!inside_query.hit && !outside_query.hit) {
+            // occluded
+            return;
+        }
+        if (!inside_query.hit) {
+            normal = -normal;
+            swap_(inside_query, outside_query);
+            swap_(color_inside, color_outside);
+        }
+        // Boundary point in screen space
+        auto sboundary_pt = boundary_pt;
+        sboundary_pt.x *= width;
+        sboundary_pt.y *= height;
+        auto d_color = gather_d_color(*scene.filter,
+                                      d_render_image,
+                                      weight_image,
+                                      width,
+                                      height,
+                                      sboundary_pt);
+        // Normalization factor
+        d_color /= float(scene.canvas_width * scene.canvas_height);
+        
+        assert(isfinite(d_color));
+        assert(isfinite(pdf) && pdf > 0);
+        auto contrib = dot(color_inside - color_outside, d_color) / pdf;
+        ShapeGroup &d_shape_group = scene.d_shape_groups[shape_group_id];
+        accumulate_boundary_gradient(scene.shapes[shape_id],
+            contrib, t, normal, boundary_data, scene.d_shapes[shape_id],
+            shape_group.shape_to_canvas, local_boundary_pt, d_shape_group.shape_to_canvas);
+        // Don't need to backprop to filter weights:
+        // \int f'(x) g(x) dx doesn't contain discontinuities
+        // if f is continuous, even if g is discontinuous
+        if (d_translation != nullptr) {
+            // According to Reynold transport theorem,
+            // the Jacobian of the boundary integral is dot(velocity, normal)
+            // The velocity of the object translating x is (1, 0)
+            // The velocity of the object translating y is (0, 1)
+            atomic_add(&d_translation[2 * (by * width + bx) + 0], normal.x * contrib);
+            atomic_add(&d_translation[2 * (by * width + bx) + 1], normal.y * contrib);
+        }
+    }
+
+    SceneData scene;
+    const float *background_image;
+    const BoundarySample *boundary_samples;
+    const int *boundary_ids;
+    float *weight_image;
+    float *d_render_image;
+    float *d_translation;
+    int width;
+    int height;
+    int num_samples_x;
+    int num_samples_y;
+};
+
+void render(std::shared_ptr<Scene> scene,
+            ptr<float> background_image,
+            ptr<float> render_image,
+            ptr<float> render_sdf,
+            int width,
+            int height,
+            int num_samples_x,
+            int num_samples_y,
+            uint64_t seed,
+            ptr<float> d_background_image,
+            ptr<float> d_render_image,
+            ptr<float> d_render_sdf,
+            ptr<float> d_translation,
+            bool use_prefiltering,
+            ptr<float> eval_positions,
+            int num_eval_positions) {
+#ifdef __NVCC__
+    int old_device_id = -1;
+    if (scene->use_gpu) {
+        checkCuda(cudaGetDevice(&old_device_id));
+        if (scene->gpu_index != -1) {
+            checkCuda(cudaSetDevice(scene->gpu_index));
+        }
+    }
+#endif
+    parallel_init();
+
+    float *weight_image = nullptr;
+    // Allocate and zero the weight image
+    if (scene->use_gpu) {
+#ifdef __CUDACC__
+        if (eval_positions.get() == nullptr) {
+            checkCuda(cudaMallocManaged(&weight_image, width * height * sizeof(float)));
+            cudaMemset(weight_image, 0, width * height * sizeof(float));
+        }
+#else
+        assert(false);
+#endif
+    } else {
+        if (eval_positions.get() == nullptr) {
+            weight_image = (float*)malloc(width * height * sizeof(float));
+            memset(weight_image, 0, width * height * sizeof(float));
+        }
+    }
+
+    if (render_image.get() != nullptr || d_render_image.get() != nullptr ||
+        render_sdf.get() != nullptr || d_render_sdf.get() != nullptr) {
+        if (weight_image != nullptr) {
+            parallel_for(weight_kernel{
+                get_scene_data(*scene.get()),
+                weight_image,
+                width,
+                height,
+                num_samples_x,
+                num_samples_y,
+                seed
+            }, width * height * num_samples_x * num_samples_y, scene->use_gpu);
+        }
+
+        auto num_samples = eval_positions.get() == nullptr ?
+            width * height * num_samples_x * num_samples_y : num_eval_positions;
+        parallel_for(render_kernel{
+            get_scene_data(*scene.get()),
+            background_image.get(),
+            render_image.get(),
+            weight_image,
+            render_sdf.get(),
+            d_background_image.get(),
+            d_render_image.get(),
+            d_render_sdf.get(),
+            d_translation.get(),
+            width,
+            height,
+            num_samples_x,
+            num_samples_y,
+            seed,
+            use_prefiltering,
+            eval_positions.get()
+        }, num_samples, scene->use_gpu);
+    }
+
+    // Boundary sampling
+    if (!use_prefiltering && d_render_image.get() != nullptr) {
+        auto num_samples = width * height * num_samples_x * num_samples_y;
+        BoundarySample *boundary_samples = nullptr;
+        int *boundary_ids = nullptr; // for sorting
+        uint32_t *morton_codes = nullptr; // for sorting
+        // Allocate boundary samples
+        if (scene->use_gpu) {
+#ifdef __CUDACC__
+            checkCuda(cudaMallocManaged(&boundary_samples,
+                num_samples * sizeof(BoundarySample)));
+            checkCuda(cudaMallocManaged(&boundary_ids,
+                num_samples * sizeof(int)));
+            checkCuda(cudaMallocManaged(&morton_codes,
+                num_samples * sizeof(uint32_t)));
+#else
+            assert(false);
+    #endif
+        } else {
+            boundary_samples = (BoundarySample*)malloc(
+                num_samples * sizeof(BoundarySample));
+            boundary_ids = (int*)malloc(
+                num_samples * sizeof(int));
+            morton_codes = (uint32_t*)malloc(
+                num_samples * sizeof(uint32_t));
+        }
+        
+        // Edge sampling
+        // We sort the boundary samples for better thread coherency
+        parallel_for(sample_boundary_kernel{
+            get_scene_data(*scene.get()),
+            seed,
+            boundary_samples,
+            boundary_ids,
+            morton_codes
+        }, num_samples, scene->use_gpu);
+        if (scene->use_gpu) {
+            thrust::sort_by_key(thrust::device, morton_codes, morton_codes + num_samples, boundary_ids);
+        } else {
+            // Don't need to sort for CPU, we are not using SIMD hardware anyway.
+            // thrust::sort_by_key(thrust::host, morton_codes, morton_codes + num_samples, boundary_ids);
+        }
+        parallel_for(render_edge_kernel{
+            get_scene_data(*scene.get()),
+            background_image.get(),
+            boundary_samples,
+            boundary_ids,
+            weight_image,
+            d_render_image.get(),
+            d_translation.get(),
+            width,
+            height,
+            num_samples_x,
+            num_samples_y
+        }, num_samples, scene->use_gpu);
+        if (scene->use_gpu) {
+#ifdef __CUDACC__
+            checkCuda(cudaFree(boundary_samples));
+            checkCuda(cudaFree(boundary_ids));
+            checkCuda(cudaFree(morton_codes));
+#else
+            assert(false);
+#endif
+        } else {
+            free(boundary_samples);
+            free(boundary_ids);
+            free(morton_codes);
+        }
+    }
+
+    // Clean up weight image
+    if (scene->use_gpu) {
+#ifdef __CUDACC__
+        checkCuda(cudaFree(weight_image));
+#else
+        assert(false);
+#endif
+    } else {
+        free(weight_image);
+    }
+
+    if (scene->use_gpu) {
+        cuda_synchronize();
+    }
+
+    parallel_cleanup();
+#ifdef __NVCC__
+    if (old_device_id != -1) {
+        checkCuda(cudaSetDevice(old_device_id));
+    }
+#endif
+}
+
+PYBIND11_MODULE(diffvg, m) {
+    m.doc() = "Differential Vector Graphics";
+
+    py::class_<ptr<void>>(m, "void_ptr")
+        .def(py::init<std::size_t>())
+        .def("as_size_t", &ptr<void>::as_size_t);
+    py::class_<ptr<float>>(m, "float_ptr")
+        .def(py::init<std::size_t>());
+    py::class_<ptr<int>>(m, "int_ptr")
+        .def(py::init<std::size_t>());
+
+    py::class_<Vector2f>(m, "Vector2f")
+        .def(py::init<float, float>())
+        .def_readwrite("x", &Vector2f::x)
+        .def_readwrite("y", &Vector2f::y);
+
+    py::class_<Vector3f>(m, "Vector3f")
+        .def(py::init<float, float, float>())
+        .def_readwrite("x", &Vector3f::x)
+        .def_readwrite("y", &Vector3f::y)
+        .def_readwrite("z", &Vector3f::z);
+
+    py::class_<Vector4f>(m, "Vector4f")
+        .def(py::init<float, float, float, float>())
+        .def_readwrite("x", &Vector4f::x)
+        .def_readwrite("y", &Vector4f::y)
+        .def_readwrite("z", &Vector4f::z)
+        .def_readwrite("w", &Vector4f::w);
+
+    py::enum_<ShapeType>(m, "ShapeType")
+        .value("circle", ShapeType::Circle)
+        .value("ellipse", ShapeType::Ellipse)
+        .value("path", ShapeType::Path)
+        .value("rect", ShapeType::Rect);
+
+    py::class_<Circle>(m, "Circle")
+        .def(py::init<float, Vector2f>())
+        .def("get_ptr", &Circle::get_ptr)
+        .def_readonly("radius", &Circle::radius)
+        .def_readonly("center", &Circle::center);
+
+    py::class_<Ellipse>(m, "Ellipse")
+        .def(py::init<Vector2f, Vector2f>())
+        .def("get_ptr", &Ellipse::get_ptr)
+        .def_readonly("radius", &Ellipse::radius)
+        .def_readonly("center", &Ellipse::center);
+
+    py::class_<Path>(m, "Path")
+        .def(py::init<ptr<int>, ptr<float>, ptr<float>, int, int, bool, bool>())
+        .def("get_ptr", &Path::get_ptr)
+        .def("has_thickness", &Path::has_thickness)
+        .def("copy_to", &Path::copy_to)
+        .def_readonly("num_points", &Path::num_points);
+
+    py::class_<Rect>(m, "Rect")
+        .def(py::init<Vector2f, Vector2f>())
+        .def("get_ptr", &Rect::get_ptr)
+        .def_readonly("p_min", &Rect::p_min)
+        .def_readonly("p_max", &Rect::p_max);
+
+    py::enum_<ColorType>(m, "ColorType")
+        .value("constant", ColorType::Constant)
+        .value("linear_gradient", ColorType::LinearGradient)
+        .value("radial_gradient", ColorType::RadialGradient);
+
+    py::class_<Constant>(m, "Constant")
+        .def(py::init<Vector4f>())
+        .def("get_ptr", &Constant::get_ptr)
+        .def_readonly("color", &Constant::color);
+
+    py::class_<LinearGradient>(m, "LinearGradient")
+        .def(py::init<Vector2f, Vector2f, int, ptr<float>, ptr<float>>())
+        .def("get_ptr", &LinearGradient::get_ptr)
+        .def("copy_to", &LinearGradient::copy_to)
+        .def_readonly("begin", &LinearGradient::begin)
+        .def_readonly("end", &LinearGradient::end)
+        .def_readonly("num_stops", &LinearGradient::num_stops);
+
+    py::class_<RadialGradient>(m, "RadialGradient")
+        .def(py::init<Vector2f, Vector2f, int, ptr<float>, ptr<float>>())
+        .def("get_ptr", &RadialGradient::get_ptr)
+        .def("copy_to", &RadialGradient::copy_to)
+        .def_readonly("center", &RadialGradient::center)
+        .def_readonly("radius", &RadialGradient::radius)
+        .def_readonly("num_stops", &RadialGradient::num_stops);
+
+    py::class_<Shape>(m, "Shape")
+        .def(py::init<ShapeType, ptr<void>, float>())
+        .def("as_circle", &Shape::as_circle)
+        .def("as_ellipse", &Shape::as_ellipse)
+        .def("as_path", &Shape::as_path)
+        .def("as_rect", &Shape::as_rect)
+        .def_readonly("type", &Shape::type)
+        .def_readonly("stroke_width", &Shape::stroke_width);
+
+    py::class_<ShapeGroup>(m, "ShapeGroup")
+        .def(py::init<ptr<int>,
+                      int,
+                      ColorType,
+                      ptr<void>,
+                      ColorType,
+                      ptr<void>,
+                      bool,
+                      ptr<float>>())
+        .def("fill_color_as_constant", &ShapeGroup::fill_color_as_constant)
+        .def("fill_color_as_linear_gradient", &ShapeGroup::fill_color_as_linear_gradient)
+        .def("fill_color_as_radial_gradient", &ShapeGroup::fill_color_as_radial_gradient)
+        .def("stroke_color_as_constant", &ShapeGroup::stroke_color_as_constant)
+        .def("stroke_color_as_linear_gradient", &ShapeGroup::stroke_color_as_linear_gradient)
+        .def("stroke_color_as_radial_gradient", &ShapeGroup::fill_color_as_radial_gradient)
+        .def("has_fill_color", &ShapeGroup::has_fill_color)
+        .def("has_stroke_color", &ShapeGroup::has_stroke_color)
+        .def("copy_to", &ShapeGroup::copy_to)
+        .def_readonly("fill_color_type", &ShapeGroup::fill_color_type)
+        .def_readonly("stroke_color_type", &ShapeGroup::stroke_color_type);
+
+    py::enum_<FilterType>(m, "FilterType")
+        .value("box", FilterType::Box)
+        .value("tent", FilterType::Tent)
+        .value("parabolic", FilterType::RadialParabolic)
+        .value("hann", FilterType::Hann);
+
+    py::class_<Filter>(m, "Filter")
+        .def(py::init<FilterType,
+                      float>());
+
+    py::class_<Scene, std::shared_ptr<Scene>>(m, "Scene")
+        .def(py::init<int,
+                      int,
+                      const std::vector<const Shape*> &,
+                      const std::vector<const ShapeGroup*> &,
+                      const Filter &,
+                      bool,
+                      int>())
+        .def("get_d_shape", &Scene::get_d_shape)
+        .def("get_d_shape_group", &Scene::get_d_shape_group)
+        .def("get_d_filter_radius", &Scene::get_d_filter_radius)
+        .def_readonly("num_shapes", &Scene::num_shapes)
+        .def_readonly("num_shape_groups", &Scene::num_shape_groups);
+
+    m.def("render", &render, "");
+}
diff --git a/diffvg.h b/diffvg.h
new file mode 100644
index 0000000000000000000000000000000000000000..400e4dc3f60d89061fe3842e09688f130d49c557
--- /dev/null
+++ b/diffvg.h
@@ -0,0 +1,156 @@
+#pragma once
+
+#ifdef __NVCC__ 
+    #define DEVICE __device__ __host__ 
+#else
+    #define DEVICE
+#endif
+
+#ifndef __NVCC__
+    #include <cmath>
+    namespace {
+        inline float fmodf(float a, float b) {
+            return std::fmod(a, b);
+        }
+        inline double fmod(double a, double b) {
+            return std::fmod(a, b);
+        }
+    }
+    using std::isfinite;
+#endif
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#include <cstdint>
+#include <atomic>
+
+// We use Real for most of the internal computation.
+// However, for PyTorch interfaces, Optix Prime and Embree queries
+// we use float
+using Real = float;
+
+template <typename T>
+DEVICE
+inline T square(const T &x) {
+    return x * x;
+}
+
+template <typename T>
+DEVICE
+inline T cubic(const T &x) {
+    return x * x * x;
+}
+
+template <typename T>
+DEVICE
+inline T clamp(const T &v, const T &lo, const T &hi) {
+    if (v < lo) return lo;
+    else if (v > hi) return hi;
+    else return v;
+}
+
+DEVICE
+inline int modulo(int a, int b) {
+    auto r = a % b;
+    return (r < 0) ? r+b : r;
+}
+
+DEVICE
+inline float modulo(float a, float b) {
+    float r = ::fmodf(a, b);
+    return (r < 0.0f) ? r+b : r;
+}
+
+DEVICE
+inline double modulo(double a, double b) {
+    double r = ::fmod(a, b);
+    return (r < 0.0) ? r+b : r;
+}
+
+template <typename T>
+DEVICE
+inline T max(const T &a, const T &b) {
+    return a > b ? a : b;
+}
+
+template <typename T>
+DEVICE
+inline T min(const T &a, const T &b) {
+    return a < b ? a : b;
+}
+
+/// Return ceil(x/y) for integers x and y
+inline int idiv_ceil(int x, int y) {
+    return (x + y-1) / y;
+}
+
+template <typename T>
+DEVICE
+inline void swap_(T &a, T &b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+inline double log2(double x) {
+    return log(x) / log(Real(2));
+}
+
+template <typename T>
+DEVICE
+inline T safe_acos(const T &x) {
+    if (x >= 1) return T(0);
+    else if(x <= -1) return T(M_PI);
+    return acos(x);
+}
+
+// For Morton code computation. This can be made faster.
+DEVICE
+inline uint32_t expand_bits(uint32_t x) {
+    // Insert one zero after every bit given a 10-bit integer
+    constexpr uint64_t mask = 0x1u;
+    // We start from LSB (bit 31)
+    auto result = (x & (mask << 0u));
+    result |= ((x & (mask << 1u)) << 1u);
+    result |= ((x & (mask << 2u)) << 2u);
+    result |= ((x & (mask << 3u)) << 3u);
+    result |= ((x & (mask << 4u)) << 4u);
+    result |= ((x & (mask << 5u)) << 5u);
+    result |= ((x & (mask << 6u)) << 6u);
+    result |= ((x & (mask << 7u)) << 7u);
+    result |= ((x & (mask << 8u)) << 8u);
+    result |= ((x & (mask << 9u)) << 9u);
+    return result;
+}
+
+// DEVICE
+// inline int clz(uint64_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __clzll(x);
+// #else
+//     // TODO: use _BitScanReverse in windows
+//     return x == 0 ? 64 : __builtin_clzll(x);
+// #endif
+// }
+
+// DEVICE
+// inline int ffs(uint8_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __ffs(x);
+// #else
+//     // TODO: use _BitScanReverse in windows
+//     return __builtin_ffs(x);
+// #endif
+// }
+
+// DEVICE
+// inline int popc(uint8_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __popc(x);
+// #else
+//     // TODO: use _popcnt in windows
+//     return __builtin_popcount(x);
+// #endif
+// }
diff --git a/edge_query.h b/edge_query.h
new file mode 100644
index 0000000000000000000000000000000000000000..57f233a3203c1ea8d6b73f6624036578483442bb
--- /dev/null
+++ b/edge_query.h
@@ -0,0 +1,7 @@
+#pragma once
+
+struct EdgeQuery {
+	int shape_group_id;
+    int shape_id;
+    bool hit; // Do we hit the specified shape_group_id & shape_id?
+};
diff --git a/examples/1.png b/examples/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a5b458aae3c26c7614c63cee104dc6614b3a5a2
Binary files /dev/null and b/examples/1.png differ
diff --git a/examples/2.png b/examples/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3e1050726124b2c197dd05e1ffa44342e2acb36
Binary files /dev/null and b/examples/2.png differ
diff --git a/examples/3.jpg b/examples/3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7ccdcf7d82f4dae849ceec62f68d10a6acddbcdd
Binary files /dev/null and b/examples/3.jpg differ
diff --git a/examples/4.png b/examples/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..6355c30cb8be9014029029f9b69453bae47c8b80
Binary files /dev/null and b/examples/4.png differ
diff --git a/examples/5.png b/examples/5.png
new file mode 100644
index 0000000000000000000000000000000000000000..5705c2ff34aa0df1cffe65d5e5be7b41a607224c
Binary files /dev/null and b/examples/5.png differ
diff --git a/figures/smile.png b/figures/smile.png
new file mode 100644
index 0000000000000000000000000000000000000000..5705c2ff34aa0df1cffe65d5e5be7b41a607224c
Binary files /dev/null and b/figures/smile.png differ
diff --git a/filter.h b/filter.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dd0b62acb83e94da89696e9a8024c4b919f6749
--- /dev/null
+++ b/filter.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include "diffvg.h"
+#include "atomic.h"
+
+enum class FilterType {
+    Box,
+    Tent,
+    RadialParabolic, // 4/3(1 - (d/r))
+    Hann // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
+};
+
+struct Filter {
+    FilterType type;
+    float radius;
+};
+
+struct DFilter {
+    float radius;
+};
+
+DEVICE
+inline
+float compute_filter_weight(const Filter &filter,
+                            float dx,
+                            float dy) {
+    if (fabs(dx) > filter.radius || fabs(dy) > filter.radius) {
+        return 0;
+    }
+    if (filter.type == FilterType::Box) {
+        return 1.f / square(2 * filter.radius);
+    } else if (filter.type == FilterType::Tent) {
+        return (filter.radius - fabs(dx)) * (filter.radius - fabs(dy)) /
+               square(square(filter.radius));
+    } else if (filter.type == FilterType::RadialParabolic) {
+        return (4.f / 3.f) * (1 - square(dx / filter.radius)) *
+               (4.f / 3.f) * (1 - square(dy / filter.radius));
+    } else {
+        assert(filter.type == FilterType::Hann);
+        // normalize dx, dy to [0, 1]
+        auto ndx = (dx / (2*filter.radius)) + 0.5f;
+        auto ndy = (dy / (2*filter.radius)) + 0.5f;
+        // the normalization factor is R^2
+        return 0.5f * (1.f - cos(float(2 * M_PI) * ndx)) *
+               0.5f * (1.f - cos(float(2 * M_PI) * ndy)) /
+               square(filter.radius);
+    }
+}
+
+DEVICE
+inline
+void d_compute_filter_weight(const Filter &filter,
+                             float dx,
+                             float dy,
+                             float d_return,
+                             DFilter *d_filter) {
+    if (filter.type == FilterType::Box) {
+        // return 1.f / square(2 * filter.radius);
+        atomic_add(d_filter->radius,
+            d_return * (-2) * 2 * filter.radius / cubic(2 * filter.radius));
+    } else if (filter.type == FilterType::Tent) {
+        // return (filer.radius - fabs(dx)) * (filer.radius - fabs(dy)) /
+        //        square(square(filter.radius));
+        auto fx = filter.radius - fabs(dx);
+        auto fy = filter.radius - fabs(dy);
+        auto norm = 1 / square(filter.radius);
+        auto d_fx = d_return * fy * norm;
+        auto d_fy = d_return * fx * norm;
+        auto d_norm = d_return * fx * fy;
+        atomic_add(d_filter->radius,
+            d_fx + d_fy + (-4) * d_norm / pow(filter.radius, 5));
+    } else if (filter.type == FilterType::RadialParabolic) {
+        // return (4.f / 3.f) * (1 - square(dx / filter.radius)) *
+        //        (4.f / 3.f) * (1 - square(dy / filter.radius));
+        // auto d_square_x = d_return * (-4.f / 3.f);
+        // auto d_square_y = d_return * (-4.f / 3.f);
+        auto r3 = filter.radius * filter.radius * filter.radius;
+        auto d_radius = -(2 * square(dx) + 2 * square(dy)) / r3;
+        atomic_add(d_filter->radius, d_radius);
+    } else {
+        assert(filter.type == FilterType::Hann);
+        // // normalize dx, dy to [0, 1]
+        // auto ndx = (dx / (2*filter.radius)) + 0.5f;
+        // auto ndy = (dy / (2*filter.radius)) + 0.5f;
+        // // the normalization factor is R^2
+        // return 0.5f * (1.f - cos(float(2 * M_PI) * ndx)) *
+        //        0.5f * (1.f - cos(float(2 * M_PI) * ndy)) /
+        //        square(filter.radius);
+
+        // normalize dx, dy to [0, 1]
+        auto ndx = (dx / (2*filter.radius)) + 0.5f;
+        auto ndy = (dy / (2*filter.radius)) + 0.5f;
+        auto fx = 0.5f * (1.f - cos(float(2*M_PI) * ndx));
+        auto fy = 0.5f * (1.f - cos(float(2*M_PI) * ndy));
+        auto norm = 1 / square(filter.radius);
+        auto d_fx = d_return * fy * norm;
+        auto d_fy = d_return * fx * norm;
+        auto d_norm = d_return * fx * fy;
+        auto d_ndx = d_fx * 0.5f * sin(float(2*M_PI) * ndx) * float(2*M_PI);
+        auto d_ndy = d_fy * 0.5f * sin(float(2*M_PI) * ndy) * float(2*M_PI);
+        atomic_add(d_filter->radius,
+            d_ndx * (-2*dx / square(2*filter.radius)) +
+            d_ndy * (-2*dy / square(2*filter.radius)) +
+            (-2) * d_norm / cubic(filter.radius));
+    }
+}
diff --git a/icon/logo.ico b/icon/logo.ico
new file mode 100644
index 0000000000000000000000000000000000000000..11baf9d82d4cc010b86460dd965167e64f5a88a3
Binary files /dev/null and b/icon/logo.ico differ
diff --git a/img_example/Millenial-at-work.jpg b/img_example/Millenial-at-work.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..78cb50b60443c03873bd9ee35c8cd4541387fa34
Binary files /dev/null and b/img_example/Millenial-at-work.jpg differ
diff --git a/img_example/bus.jpg b/img_example/bus.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b43e311165c785f000eb7493ff8fb662d06a3f83
Binary files /dev/null and b/img_example/bus.jpg differ
diff --git a/img_example/zidane.jpg b/img_example/zidane.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92d72ea124760ce5dbf9425e3aa8f371e7481328
Binary files /dev/null and b/img_example/zidane.jpg differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ed8601b4b1d85741ab8d5c75adbbf425942d2b
--- /dev/null
+++ b/main.py
@@ -0,0 +1,1040 @@
+"""
+Here are some use cases:
+python main.py --config config/all.yaml --experiment experiment_8x1 --signature demo1 --target data/demo1.png
+"""
+import pydiffvg
+import torch
+import cv2
+import matplotlib.pyplot as plt
+import random
+import argparse
+import math
+import errno
+from tqdm import tqdm
+from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.nn.functional import adaptive_avg_pool2d
+import warnings
+warnings.filterwarnings("ignore")
+
+import PIL
+import PIL.Image
+import os
+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import shutil
+import copy
+# import skfmm
+from xing_loss import xing_loss
+
+import yaml
+from easydict import EasyDict as edict
+
+
+pydiffvg.set_print_timing(False)
+gamma = 1.0
+
+##########
+# helper #
+##########
+
+from utils import \
+    get_experiment_id, \
+    get_path_schedule, \
+    edict_2_dict, \
+    check_and_create_dir
+
+def get_bezier_circle(radius=1, segments=4, bias=None):
+    points = []
+    if bias is None:
+        bias = (random.random(), random.random())
+    avg_degree = 360 / (segments*3)
+    for i in range(0, segments*3):
+        point = (np.cos(np.deg2rad(i * avg_degree)),
+                    np.sin(np.deg2rad(i * avg_degree)))
+        points.append(point)
+    points = torch.tensor(points)
+    points = (points)*radius + torch.tensor(bias).unsqueeze(dim=0)
+    points = points.type(torch.FloatTensor)
+    return points
+
+def get_sdf(phi, method='skfmm', **kwargs):
+    if method == 'skfmm':
+        import skfmm
+        phi = (phi-0.5)*2
+        if (phi.max() <= 0) or (phi.min() >= 0):
+            return np.zeros(phi.shape).astype(np.float32)
+        sd = skfmm.distance(phi, dx=1)
+
+        flip_negative = kwargs.get('flip_negative', True)
+        if flip_negative:
+            sd = np.abs(sd)
+
+        truncate = kwargs.get('truncate', 10)
+        sd = np.clip(sd, -truncate, truncate)
+        # print(f"max sd value is: {sd.max()}")
+
+        zero2max = kwargs.get('zero2max', True)
+        if zero2max and flip_negative:
+            sd = sd.max() - sd
+        elif zero2max:
+            raise ValueError
+
+        normalize = kwargs.get('normalize', 'sum')
+        if normalize == 'sum':
+            sd /= sd.sum()
+        elif normalize == 'to1':
+            sd /= sd.max()
+        return sd
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument("--config", type=str)
+    parser.add_argument("--experiment", type=str)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--target", type=str, help="target image path")
+    parser.add_argument('--log_dir', metavar='DIR', default="log/debug")
+    parser.add_argument('--initial', type=str, default="random", choices=['random', 'circle'])
+    parser.add_argument('--signature', nargs='+', type=str)
+    parser.add_argument('--seginit', nargs='+', type=str)
+    parser.add_argument("--num_segments", type=int, default=4)
+    # parser.add_argument("--num_paths", type=str, default="1,1,1")
+    # parser.add_argument("--num_iter", type=int, default=500)
+    # parser.add_argument('--free', action='store_true')
+    # Please ensure that image resolution is divisible by pool_size; otherwise the performance would drop a lot.
+    # parser.add_argument('--pool_size', type=int, default=40, help="the pooled image size for next path initialization")
+    # parser.add_argument('--save_loss', action='store_true')
+    # parser.add_argument('--save_init', action='store_true')
+    # parser.add_argument('--save_image', action='store_true')
+    # parser.add_argument('--save_video', action='store_true')
+    # parser.add_argument('--print_weight', action='store_true')
+    # parser.add_argument('--circle_init_radius',  type=float)
+    cfg = edict()
+    args = parser.parse_args()
+    cfg.debug = args.debug
+    cfg.config = args.config
+    cfg.experiment = args.experiment
+    cfg.seed = args.seed
+    cfg.target = args.target
+    cfg.log_dir = args.log_dir
+    cfg.initial = args.initial
+    cfg.signature = args.signature
+    # set cfg num_segments in command
+    cfg.num_segments = args.num_segments
+    if args.seginit is not None:
+        cfg.seginit = edict()
+        cfg.seginit.type = args.seginit[0]
+        if cfg.seginit.type == 'circle':
+            cfg.seginit.radius = float(args.seginit[1])
+    return cfg
+
+def ycrcb_conversion(im, format='[bs x 3 x 2D]', reverse=False):
+    mat = torch.FloatTensor([
+        [ 65.481/255, 128.553/255,  24.966/255], # ranged_from [0, 219/255]
+        [-37.797/255, -74.203/255, 112.000/255], # ranged_from [-112/255, 112/255]
+        [112.000/255, -93.786/255, -18.214/255], # ranged_from [-112/255, 112/255]
+    ]).to(im.device)
+
+    if reverse:
+        mat = mat.inverse()
+
+    if format == '[bs x 3 x 2D]':
+        im = im.permute(0, 2, 3, 1)
+        im = torch.matmul(im, mat.T)
+        im = im.permute(0, 3, 1, 2).contiguous()
+        return im
+    elif format == '[2D x 3]':
+        im = torch.matmul(im, mat.T)
+        return im
+    else:
+        raise ValueError
+
+class random_coord_init():
+    def __init__(self, canvas_size):
+        self.canvas_size = canvas_size
+    def __call__(self):
+        h, w = self.canvas_size
+        return [npr.uniform(0, 1)*w, npr.uniform(0, 1)*h]
+
+class naive_coord_init():
+    def __init__(self, pred, gt, format='[bs x c x 2D]', replace_sampling=True):
+        if isinstance(pred, torch.Tensor):
+            pred = pred.detach().cpu().numpy()
+        if isinstance(gt, torch.Tensor):
+            gt = gt.detach().cpu().numpy()
+
+        if format == '[bs x c x 2D]':
+            self.map = ((pred[0] - gt[0])**2).sum(0)
+        elif format == ['[2D x c]']:
+            self.map = ((pred - gt)**2).sum(-1)
+        else:
+            raise ValueError
+        self.replace_sampling = replace_sampling
+
+    def __call__(self):
+        coord = np.where(self.map == self.map.max())
+        coord_h, coord_w = coord[0][0], coord[1][0]
+        if self.replace_sampling:
+            self.map[coord_h, coord_w] = -1
+        return [coord_w, coord_h]
+
+
+class sparse_coord_init():
+    def __init__(self, pred, gt, format='[bs x c x 2D]', quantile_interval=200, nodiff_thres=0.1):
+        if isinstance(pred, torch.Tensor):
+            pred = pred.detach().cpu().numpy()
+        if isinstance(gt, torch.Tensor):
+            gt = gt.detach().cpu().numpy()
+        if format == '[bs x c x 2D]':
+            self.map = ((pred[0] - gt[0])**2).sum(0)
+            self.reference_gt = copy.deepcopy(
+                np.transpose(gt[0], (1, 2, 0)))
+        elif format == ['[2D x c]']:
+            self.map = (np.abs(pred - gt)).sum(-1)
+            self.reference_gt = copy.deepcopy(gt[0])
+        else:
+            raise ValueError
+        # OptionA: Zero too small errors to avoid the error too small deadloop
+        self.map[self.map < nodiff_thres] = 0
+        quantile_interval = np.linspace(0., 1., quantile_interval)
+        quantized_interval = np.quantile(self.map, quantile_interval)
+        # remove redundant
+        quantized_interval = np.unique(quantized_interval)
+        quantized_interval = sorted(quantized_interval[1:-1])
+        self.map = np.digitize(self.map, quantized_interval, right=False)
+        self.map = np.clip(self.map, 0, 255).astype(np.uint8)
+        self.idcnt = {}
+        for idi in sorted(np.unique(self.map)):
+            self.idcnt[idi] = (self.map==idi).sum()
+        self.idcnt.pop(min(self.idcnt.keys()))
+        # remove smallest one to remove the correct region
+    def __call__(self):
+        if len(self.idcnt) == 0:
+            h, w = self.map.shape
+            return [npr.uniform(0, 1)*w, npr.uniform(0, 1)*h]
+        target_id = max(self.idcnt, key=self.idcnt.get)
+        _, component, cstats, ccenter = cv2.connectedComponentsWithStats(
+            (self.map==target_id).astype(np.uint8), connectivity=4)
+        # remove cid = 0, it is the invalid area
+        csize = [ci[-1] for ci in cstats[1:]]
+        target_cid = csize.index(max(csize))+1
+        center = ccenter[target_cid][::-1]
+        coord = np.stack(np.where(component == target_cid)).T
+        dist = np.linalg.norm(coord-center, axis=1)
+        target_coord_id = np.argmin(dist)
+        coord_h, coord_w = coord[target_coord_id]
+        # replace_sampling
+        self.idcnt[target_id] -= max(csize)
+        if self.idcnt[target_id] == 0:
+            self.idcnt.pop(target_id)
+        self.map[component == target_cid] = 0
+        return [coord_w, coord_h]
+
+
+def init_shapes(num_paths,
+                num_segments,
+                canvas_size,
+                seginit_cfg,
+                shape_cnt,
+                pos_init_method=None,
+                trainable_stroke=False,
+                gt=None,
+                **kwargs):
+    shapes = []
+    shape_groups = []
+    h, w = canvas_size
+
+    # change path init location
+    if pos_init_method is None:
+        pos_init_method = random_coord_init(canvas_size=canvas_size)
+
+    for i in range(num_paths):
+        num_control_points = [2] * num_segments
+
+        if seginit_cfg.type=="random":
+            points = []
+            p0 = pos_init_method()
+            color_ref = copy.deepcopy(p0)
+            points.append(p0)
+            for j in range(num_segments):
+                radius = seginit_cfg.radius
+                p1 = (p0[0] + radius * npr.uniform(-0.5, 0.5),
+                      p0[1] + radius * npr.uniform(-0.5, 0.5))
+                p2 = (p1[0] + radius * npr.uniform(-0.5, 0.5),
+                      p1[1] + radius * npr.uniform(-0.5, 0.5))
+                p3 = (p2[0] + radius * npr.uniform(-0.5, 0.5),
+                      p2[1] + radius * npr.uniform(-0.5, 0.5))
+                points.append(p1)
+                points.append(p2)
+                if j < num_segments - 1:
+                    points.append(p3)
+                    p0 = p3
+            points = torch.FloatTensor(points)
+
+        # circle points initialization
+        elif seginit_cfg.type=="circle":
+            radius = seginit_cfg.radius
+            if radius is None:
+                radius = npr.uniform(0.5, 1)
+            center = pos_init_method()
+            color_ref = copy.deepcopy(center)
+            points = get_bezier_circle(
+                radius=radius, segments=num_segments,
+                bias=center)
+
+        path = pydiffvg.Path(num_control_points = torch.LongTensor(num_control_points),
+                             points = points,
+                             stroke_width = torch.tensor(0.0),
+                             is_closed = True)
+        shapes.append(path)
+        # !!!!!!problem is here. the shape group shape_ids is wrong
+
+        if gt is not None:
+            wref, href = color_ref
+            wref = max(0, min(int(wref), w-1))
+            href = max(0, min(int(href), h-1))
+            fill_color_init = list(gt[0, :, href, wref]) + [1.]
+            fill_color_init = torch.FloatTensor(fill_color_init)
+            stroke_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+        else:
+            fill_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+            stroke_color_init = torch.FloatTensor(npr.uniform(size=[4]))
+
+        path_group = pydiffvg.ShapeGroup(
+            shape_ids = torch.LongTensor([shape_cnt+i]),
+            fill_color = fill_color_init,
+            stroke_color = stroke_color_init,
+        )
+        shape_groups.append(path_group)
+
+    point_var = []
+    color_var = []
+
+    for path in shapes:
+        path.points.requires_grad = True
+        point_var.append(path.points)
+    for group in shape_groups:
+        group.fill_color.requires_grad = True
+        color_var.append(group.fill_color)
+
+    if trainable_stroke:
+        stroke_width_var = []
+        stroke_color_var = []
+        for path in shapes:
+            path.stroke_width.requires_grad = True
+            stroke_width_var.append(path.stroke_width)
+        for group in shape_groups:
+            group.stroke_color.requires_grad = True
+            stroke_color_var.append(group.stroke_color)
+        return shapes, shape_groups, point_var, color_var, stroke_width_var, stroke_color_var
+    else:
+        return shapes, shape_groups, point_var, color_var
+
+class linear_decay_lrlambda_f(object):
+    def __init__(self, decay_every, decay_ratio):
+        self.decay_every = decay_every
+        self.decay_ratio = decay_ratio
+
+    def __call__(self, n):
+        decay_time = n//self.decay_every
+        decay_step = n %self.decay_every
+        lr_s = self.decay_ratio**decay_time
+        lr_e = self.decay_ratio**(decay_time+1)
+        r = decay_step/self.decay_every
+        lr = lr_s * (1-r) + lr_e * r
+        return lr
+
+def main_func(target, experiment, num_iter, cfg_arg):
+    with open(cfg_arg.config, 'r') as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    cfg_default = edict(cfg['default'])
+    cfg = edict(cfg[cfg_arg.experiment])
+    cfg.update(cfg_default)
+    cfg.update(cfg_arg)
+    cfg.exid = get_experiment_id(cfg.debug)
+
+    cfg.experiment_dir = \
+        osp.join(cfg.log_dir, '{}_{}'.format(cfg.exid, '_'.join(cfg.signature)))
+    cfg.target = target
+    cfg.experiment = experiment
+    cfg.num_iter = num_iter
+
+    configfile = osp.join(cfg.experiment_dir, 'config.yaml')
+    check_and_create_dir(configfile)
+    with open(osp.join(configfile), 'w') as f:
+        yaml.dump(edict_2_dict(cfg), f)
+
+    # Use GPU if available
+    pydiffvg.set_use_gpu(torch.cuda.is_available())
+    device = pydiffvg.get_device()
+
+    # gt = np.array(PIL.Image.open(cfg.target))
+    gt = np.array(cfg.target)
+    print(f"Input image shape is: {gt.shape}")
+    if len(gt.shape) == 2:
+        print("Converting the gray-scale image to RGB.")
+        gt = gt.unsqueeze(dim=-1).repeat(1,1,3)
+    if gt.shape[2] == 4:
+        print("Input image includes alpha channel, simply dropout alpha channel.")
+        gt = gt[:, :, :3]
+    gt = (gt/255).astype(np.float32)
+    gt = torch.FloatTensor(gt).permute(2, 0, 1)[None].to(device)
+    if cfg.use_ycrcb:
+        gt = ycrcb_conversion(gt)
+    h, w = gt.shape[2:]
+
+    path_schedule = get_path_schedule(**cfg.path_schedule)
+
+    if cfg.seed is not None:
+        random.seed(cfg.seed)
+        npr.seed(cfg.seed)
+        torch.manual_seed(cfg.seed)
+    render = pydiffvg.RenderFunction.apply
+
+    shapes_record, shape_groups_record = [], []
+
+    region_loss = None
+    loss_matrix = []
+
+    para_point, para_color = {}, {}
+    if cfg.trainable.stroke:
+        para_stroke_width, para_stroke_color = {}, {}
+
+    pathn_record = []
+    # Background
+    if cfg.trainable.bg:
+        # meancolor = gt.mean([2, 3])[0]
+        para_bg = torch.tensor([1., 1., 1.], requires_grad=True, device=device)
+    else:
+        if cfg.use_ycrcb:
+            para_bg = torch.tensor([219/255, 0, 0], requires_grad=False, device=device)
+        else:
+            para_bg = torch.tensor([1., 1., 1.], requires_grad=False, device=device)
+
+    ##################
+    # start_training #
+    ##################
+
+    loss_weight = None
+    loss_weight_keep = 0
+    if cfg.coord_init.type == 'naive':
+        pos_init_method = naive_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'sparse':
+        pos_init_method = sparse_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'random':
+        pos_init_method = random_coord_init([h, w])
+    else:
+        raise ValueError
+
+    lrlambda_f = linear_decay_lrlambda_f(cfg.num_iter, 0.4)
+    optim_schedular_dict = {}
+
+    for path_idx, pathn in enumerate(path_schedule):
+        loss_list = []
+        print("=> Adding [{}] paths, [{}] ...".format(pathn, cfg.seginit.type))
+        pathn_record.append(pathn)
+        pathn_record_str = '-'.join([str(i) for i in pathn_record])
+
+        # initialize new shapes related stuffs.
+        if cfg.trainable.stroke:
+            shapes, shape_groups, point_var, color_var, stroke_width_var, stroke_color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=True,
+                gt=gt, )
+            para_stroke_width[path_idx] = stroke_width_var
+            para_stroke_color[path_idx] = stroke_color_var
+        else:
+            shapes, shape_groups, point_var, color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=False,
+                gt=gt, )
+
+        shapes_record += shapes
+        shape_groups_record += shape_groups
+
+        if cfg.save.init:
+            filename = os.path.join(
+                cfg.experiment_dir, "svg-init",
+                "{}-init.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(
+                filename, w, h,
+                shapes_record, shape_groups_record)
+
+        para = {}
+        if (cfg.trainable.bg) and (path_idx == 0):
+            para['bg'] = [para_bg]
+        para['point'] = point_var
+        para['color'] = color_var
+        if cfg.trainable.stroke:
+            para['stroke_width'] = stroke_width_var
+            para['stroke_color'] = stroke_color_var
+
+        pg = [{'params' : para[ki], 'lr' : cfg.lr_base[ki]} for ki in sorted(para.keys())]
+        optim = torch.optim.Adam(pg)
+
+        if cfg.trainable.record:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=-1)
+        else:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=cfg.num_iter)
+        optim_schedular_dict[path_idx] = (optim, scheduler)
+
+        # Inner loop training
+        t_range = tqdm(range(cfg.num_iter))
+        for t in t_range:
+
+            for _, (optim, _) in optim_schedular_dict.items():
+                optim.zero_grad()
+
+            # Forward pass: render the image.
+            scene_args = pydiffvg.RenderFunction.serialize_scene(
+                w, h, shapes_record, shape_groups_record)
+            img = render(w, h, 2, 2, t, None, *scene_args)
+
+            # Compose img with white background
+            img = img[:, :, 3:4] * img[:, :, :3] + \
+                para_bg * (1 - img[:, :, 3:4])
+
+
+
+
+
+            if cfg.save.video:
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png",
+                    "{}-iter{}.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                if cfg.use_ycrcb:
+                    imshow = ycrcb_conversion(
+                        img, format='[2D x 3]', reverse=True).detach().cpu()
+                else:
+                    imshow = img.detach().cpu()
+                pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+            # ### added for app
+            # if t%30==0 and t !=0 :
+            #     # print(f"debug: {t}, {filename} {img.size()}")
+            #     return img.detach().cpu().numpy(), t
+
+            x = img.unsqueeze(0).permute(0, 3, 1, 2) # HWC -> NCHW
+
+            if cfg.use_ycrcb:
+                color_reweight = torch.FloatTensor([255/219, 255/224, 255/255]).to(device)
+                loss = ((x-gt)*(color_reweight.view(1, -1, 1, 1)))**2
+            else:
+                loss = ((x-gt)**2)
+
+            if cfg.loss.use_l1_loss:
+                loss = abs(x-gt)
+
+            if cfg.loss.use_distance_weighted_loss:
+                if cfg.use_ycrcb:
+                    raise ValueError
+                shapes_forsdf = copy.deepcopy(shapes)
+                shape_groups_forsdf = copy.deepcopy(shape_groups)
+                for si in shapes_forsdf:
+                    si.stroke_width = torch.FloatTensor([0]).to(device)
+                for sg_idx, sgi in enumerate(shape_groups_forsdf):
+                    sgi.fill_color = torch.FloatTensor([1, 1, 1, 1]).to(device)
+                    sgi.shape_ids = torch.LongTensor([sg_idx]).to(device)
+
+                sargs_forsdf = pydiffvg.RenderFunction.serialize_scene(
+                    w, h, shapes_forsdf, shape_groups_forsdf)
+                with torch.no_grad():
+                    im_forsdf = render(w, h, 2, 2, 0, None, *sargs_forsdf)
+                # use alpha channel is a trick to get 0-1 image
+                im_forsdf = (im_forsdf[:, :, 3]).detach().cpu().numpy()
+                loss_weight = get_sdf(im_forsdf, normalize='to1')
+                loss_weight += loss_weight_keep
+                loss_weight = np.clip(loss_weight, 0, 1)
+                loss_weight = torch.FloatTensor(loss_weight).to(device)
+
+            if cfg.save.loss:
+                save_loss = loss.squeeze(dim=0).mean(dim=0,keepdim=False).cpu().detach().numpy()
+                save_weight = loss_weight.cpu().detach().numpy()
+                save_weighted_loss = save_loss*save_weight
+                # normalize to [0,1]
+                save_loss = (save_loss - np.min(save_loss))/np.ptp(save_loss)
+                save_weight = (save_weight - np.min(save_weight))/np.ptp(save_weight)
+                save_weighted_loss = (save_weighted_loss - np.min(save_weighted_loss))/np.ptp(save_weighted_loss)
+
+                # save
+                plt.imshow(save_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-mseloss.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weight, cmap='Greys')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-sdfweight.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weighted_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-weightedloss.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+
+
+
+
+            if loss_weight is None:
+                loss = loss.sum(1).mean()
+            else:
+                loss = (loss.sum(1)*loss_weight).mean()
+
+            # if (cfg.loss.bis_loss_weight is not None)  and (cfg.loss.bis_loss_weight > 0):
+            #     loss_bis = bezier_intersection_loss(point_var[0]) * cfg.loss.bis_loss_weight
+            #     loss = loss + loss_bis
+            if (cfg.loss.xing_loss_weight is not None) \
+                    and (cfg.loss.xing_loss_weight > 0):
+                loss_xing = xing_loss(point_var) * cfg.loss.xing_loss_weight
+                loss = loss + loss_xing
+
+
+            loss_list.append(loss.item())
+            t_range.set_postfix({'loss': loss.item()})
+            loss.backward()
+
+            # step
+            for _, (optim, scheduler) in optim_schedular_dict.items():
+                optim.step()
+                scheduler.step()
+
+            for group in shape_groups_record:
+                group.fill_color.data.clamp_(0.0, 1.0)
+
+        if cfg.loss.use_distance_weighted_loss:
+            loss_weight_keep = loss_weight.detach().cpu().numpy() * 1
+
+        if not cfg.trainable.record:
+            for _, pi in pg.items():
+                for ppi in pi:
+                    pi.require_grad = False
+            optim_schedular_dict = {}
+
+        if cfg.save.image:
+            filename = os.path.join(
+                cfg.experiment_dir, "demo-png", "{}.png".format(pathn_record_str))
+            check_and_create_dir(filename)
+            if cfg.use_ycrcb:
+                imshow = ycrcb_conversion(
+                    img, format='[2D x 3]', reverse=True).detach().cpu()
+            else:
+                imshow = img.detach().cpu()
+            pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+        svg_app_file_name = ""
+        if cfg.save.output:
+            filename = os.path.join(
+                cfg.experiment_dir, "output-svg", "{}.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(filename, w, h, shapes_record, shape_groups_record)
+            svg_app_file_name = filename
+
+        loss_matrix.append(loss_list)
+
+        # calculate the pixel loss
+        # pixel_loss = ((x-gt)**2).sum(dim=1, keepdim=True).sqrt_() # [N,1,H, W]
+        # region_loss = adaptive_avg_pool2d(pixel_loss, cfg.region_loss_pool_size)
+        # loss_weight = torch.softmax(region_loss.reshape(1, 1, -1), dim=-1)\
+        #     .reshape_as(region_loss)
+
+        pos_init_method = naive_coord_init(x, gt)
+
+        if cfg.coord_init.type == 'naive':
+            pos_init_method = naive_coord_init(x, gt)
+        elif cfg.coord_init.type == 'sparse':
+            pos_init_method = sparse_coord_init(x, gt)
+        elif cfg.coord_init.type == 'random':
+            pos_init_method = random_coord_init([h, w])
+        else:
+            raise ValueError
+
+        if cfg.save.video:
+            print("saving iteration video...")
+            img_array = []
+            for ii in range(0, cfg.num_iter):
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png",
+                    "{}-iter{}.png".format(pathn_record_str, ii))
+                img = cv2.imread(filename)
+                # cv2.putText(
+                #     img, "Path:{} \nIteration:{}".format(pathn_record_str, ii),
+                #     (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
+                img_array.append(img)
+
+            videoname = os.path.join(
+                cfg.experiment_dir, "video-avi",
+                "{}.avi".format(pathn_record_str))
+            check_and_create_dir(videoname)
+            out = cv2.VideoWriter(
+                videoname,
+                # cv2.VideoWriter_fourcc(*'mp4v'),
+                cv2.VideoWriter_fourcc(*'FFV1'),
+                20.0, (w, h))
+            for iii in range(len(img_array)):
+                out.write(img_array[iii])
+            out.release()
+            # shutil.rmtree(os.path.join(cfg.experiment_dir, "video-png"))
+
+    print("The last loss is: {}".format(loss.item()))
+    return img.detach().cpu().numpy(), svg_app_file_name
+
+
+if __name__ == "__main__":
+
+    ###############
+    # make config #
+    ###############
+
+    cfg_arg = parse_args()
+    with open(cfg_arg.config, 'r') as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    cfg_default = edict(cfg['default'])
+    cfg = edict(cfg[cfg_arg.experiment])
+    cfg.update(cfg_default)
+    cfg.update(cfg_arg)
+    cfg.exid = get_experiment_id(cfg.debug)
+
+    cfg.experiment_dir = \
+        osp.join(cfg.log_dir, '{}_{}'.format(cfg.exid, '_'.join(cfg.signature)))
+    configfile = osp.join(cfg.experiment_dir, 'config.yaml')
+    check_and_create_dir(configfile)
+    with open(osp.join(configfile), 'w') as f:
+        yaml.dump(edict_2_dict(cfg), f)
+
+    # Use GPU if available
+    pydiffvg.set_use_gpu(torch.cuda.is_available())
+    device = pydiffvg.get_device()
+
+    gt = np.array(PIL.Image.open(cfg.target))
+    print(f"Input image shape is: {gt.shape}")
+    if len(gt.shape) == 2:
+        print("Converting the gray-scale image to RGB.")
+        gt = gt.unsqueeze(dim=-1).repeat(1,1,3)
+    if gt.shape[2] == 4:
+        print("Input image includes alpha channel, simply dropout alpha channel.")
+        gt = gt[:, :, :3]
+    gt = (gt/255).astype(np.float32)
+    gt = torch.FloatTensor(gt).permute(2, 0, 1)[None].to(device)
+    if cfg.use_ycrcb:
+        gt = ycrcb_conversion(gt)
+    h, w = gt.shape[2:]
+
+    path_schedule = get_path_schedule(**cfg.path_schedule)
+
+    if cfg.seed is not None:
+        random.seed(cfg.seed)
+        npr.seed(cfg.seed)
+        torch.manual_seed(cfg.seed)
+    render = pydiffvg.RenderFunction.apply
+
+    shapes_record, shape_groups_record = [], []
+
+    region_loss = None
+    loss_matrix = []
+
+    para_point, para_color = {}, {}
+    if cfg.trainable.stroke:
+        para_stroke_width, para_stroke_color = {}, {}
+
+    pathn_record = []
+    # Background
+    if cfg.trainable.bg:
+        # meancolor = gt.mean([2, 3])[0]
+        para_bg = torch.tensor([1., 1., 1.], requires_grad=True, device=device)
+    else:
+        if cfg.use_ycrcb:
+            para_bg = torch.tensor([219/255, 0, 0], requires_grad=False, device=device)
+        else:
+            para_bg = torch.tensor([1., 1., 1.], requires_grad=False, device=device)
+
+    ##################
+    # start_training #
+    ##################
+
+    loss_weight = None
+    loss_weight_keep = 0
+    if cfg.coord_init.type == 'naive':
+        pos_init_method = naive_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'sparse':
+        pos_init_method = sparse_coord_init(
+            para_bg.view(1, -1, 1, 1).repeat(1, 1, h, w), gt)
+    elif cfg.coord_init.type == 'random':
+        pos_init_method = random_coord_init([h, w])
+    else:
+        raise ValueError
+
+    lrlambda_f = linear_decay_lrlambda_f(cfg.num_iter, 0.4)
+    optim_schedular_dict = {}
+
+    for path_idx, pathn in enumerate(path_schedule):
+        loss_list = []
+        print("=> Adding [{}] paths, [{}] ...".format(pathn, cfg.seginit.type))
+        pathn_record.append(pathn)
+        pathn_record_str = '-'.join([str(i) for i in pathn_record])
+
+        # initialize new shapes related stuffs.
+        if cfg.trainable.stroke:
+            shapes, shape_groups, point_var, color_var, stroke_width_var, stroke_color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=True,
+                gt=gt, )
+            para_stroke_width[path_idx] = stroke_width_var
+            para_stroke_color[path_idx] = stroke_color_var
+        else:
+            shapes, shape_groups, point_var, color_var = init_shapes(
+                pathn, cfg.num_segments, (h, w),
+                cfg.seginit, len(shapes_record),
+                pos_init_method,
+                trainable_stroke=False,
+                gt=gt, )
+
+        shapes_record += shapes
+        shape_groups_record += shape_groups
+
+        if cfg.save.init:
+            filename = os.path.join(
+                cfg.experiment_dir, "svg-init",
+                "{}-init.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(
+                filename, w, h,
+                shapes_record, shape_groups_record)
+
+        para = {}
+        if (cfg.trainable.bg) and (path_idx == 0):
+            para['bg'] = [para_bg]
+        para['point'] = point_var
+        para['color'] = color_var
+        if cfg.trainable.stroke:
+            para['stroke_width'] = stroke_width_var
+            para['stroke_color'] = stroke_color_var
+
+        pg = [{'params' : para[ki], 'lr' : cfg.lr_base[ki]} for ki in sorted(para.keys())]
+        optim = torch.optim.Adam(pg)
+
+        if cfg.trainable.record:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=-1)
+        else:
+            scheduler = LambdaLR(
+                optim, lr_lambda=lrlambda_f, last_epoch=cfg.num_iter)
+        optim_schedular_dict[path_idx] = (optim, scheduler)
+
+        # Inner loop training
+        t_range = tqdm(range(cfg.num_iter))
+        for t in t_range:
+
+            for _, (optim, _) in optim_schedular_dict.items():
+                optim.zero_grad()
+
+            # Forward pass: render the image.
+            scene_args = pydiffvg.RenderFunction.serialize_scene(
+                w, h, shapes_record, shape_groups_record)
+            img = render(w, h, 2, 2, t, None, *scene_args)
+
+            # Compose img with white background
+            img = img[:, :, 3:4] * img[:, :, :3] + \
+                para_bg * (1 - img[:, :, 3:4])
+
+            if cfg.save.video:
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png",
+                    "{}-iter{}.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                if cfg.use_ycrcb:
+                    imshow = ycrcb_conversion(
+                        img, format='[2D x 3]', reverse=True).detach().cpu()
+                else:
+                    imshow = img.detach().cpu()
+                pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+            x = img.unsqueeze(0).permute(0, 3, 1, 2) # HWC -> NCHW
+
+            if cfg.use_ycrcb:
+                color_reweight = torch.FloatTensor([255/219, 255/224, 255/255]).to(device)
+                loss = ((x-gt)*(color_reweight.view(1, -1, 1, 1)))**2
+            else:
+                loss = ((x-gt)**2)
+
+            if cfg.loss.use_l1_loss:
+                loss = abs(x-gt)
+
+            if cfg.loss.use_distance_weighted_loss:
+                if cfg.use_ycrcb:
+                    raise ValueError
+                shapes_forsdf = copy.deepcopy(shapes)
+                shape_groups_forsdf = copy.deepcopy(shape_groups)
+                for si in shapes_forsdf:
+                    si.stroke_width = torch.FloatTensor([0]).to(device)
+                for sg_idx, sgi in enumerate(shape_groups_forsdf):
+                    sgi.fill_color = torch.FloatTensor([1, 1, 1, 1]).to(device)
+                    sgi.shape_ids = torch.LongTensor([sg_idx]).to(device)
+
+                sargs_forsdf = pydiffvg.RenderFunction.serialize_scene(
+                    w, h, shapes_forsdf, shape_groups_forsdf)
+                with torch.no_grad():
+                    im_forsdf = render(w, h, 2, 2, 0, None, *sargs_forsdf)
+                # use alpha channel is a trick to get 0-1 image
+                im_forsdf = (im_forsdf[:, :, 3]).detach().cpu().numpy()
+                loss_weight = get_sdf(im_forsdf, normalize='to1')
+                loss_weight += loss_weight_keep
+                loss_weight = np.clip(loss_weight, 0, 1)
+                loss_weight = torch.FloatTensor(loss_weight).to(device)
+
+            if cfg.save.loss:
+                save_loss = loss.squeeze(dim=0).mean(dim=0,keepdim=False).cpu().detach().numpy()
+                save_weight = loss_weight.cpu().detach().numpy()
+                save_weighted_loss = save_loss*save_weight
+                # normalize to [0,1]
+                save_loss = (save_loss - np.min(save_loss))/np.ptp(save_loss)
+                save_weight = (save_weight - np.min(save_weight))/np.ptp(save_weight)
+                save_weighted_loss = (save_weighted_loss - np.min(save_weighted_loss))/np.ptp(save_weighted_loss)
+
+                # save
+                plt.imshow(save_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-mseloss.png".format(pathn_record_str, t))
+                check_and_create_dir(filename)
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weight, cmap='Greys')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-sdfweight.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+                plt.imshow(save_weighted_loss, cmap='Reds')
+                plt.axis('off')
+                # plt.colorbar()
+                filename = os.path.join(cfg.experiment_dir, "loss", "{}-iter{}-weightedloss.png".format(pathn_record_str, t))
+                plt.savefig(filename, dpi=800)
+                plt.close()
+
+
+
+
+
+            if loss_weight is None:
+                loss = loss.sum(1).mean()
+            else:
+                loss = (loss.sum(1)*loss_weight).mean()
+
+            # if (cfg.loss.bis_loss_weight is not None)  and (cfg.loss.bis_loss_weight > 0):
+            #     loss_bis = bezier_intersection_loss(point_var[0]) * cfg.loss.bis_loss_weight
+            #     loss = loss + loss_bis
+            if (cfg.loss.xing_loss_weight is not None) \
+                    and (cfg.loss.xing_loss_weight > 0):
+                loss_xing = xing_loss(point_var) * cfg.loss.xing_loss_weight
+                loss = loss + loss_xing
+
+
+            loss_list.append(loss.item())
+            t_range.set_postfix({'loss': loss.item()})
+            loss.backward()
+
+            # step
+            for _, (optim, scheduler) in optim_schedular_dict.items():
+                optim.step()
+                scheduler.step()
+
+            for group in shape_groups_record:
+                group.fill_color.data.clamp_(0.0, 1.0)
+
+        if cfg.loss.use_distance_weighted_loss:
+            loss_weight_keep = loss_weight.detach().cpu().numpy() * 1
+
+        if not cfg.trainable.record:
+            for _, pi in pg.items():
+                for ppi in pi:
+                    pi.require_grad = False
+            optim_schedular_dict = {}
+
+        if cfg.save.image:
+            filename = os.path.join(
+                cfg.experiment_dir, "demo-png", "{}.png".format(pathn_record_str))
+            check_and_create_dir(filename)
+            if cfg.use_ycrcb:
+                imshow = ycrcb_conversion(
+                    img, format='[2D x 3]', reverse=True).detach().cpu()
+            else:
+                imshow = img.detach().cpu()
+            pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+        if cfg.save.output:
+            filename = os.path.join(
+                cfg.experiment_dir, "output-svg", "{}.svg".format(pathn_record_str))
+            check_and_create_dir(filename)
+            pydiffvg.save_svg(filename, w, h, shapes_record, shape_groups_record)
+
+        loss_matrix.append(loss_list)
+
+        # calculate the pixel loss
+        # pixel_loss = ((x-gt)**2).sum(dim=1, keepdim=True).sqrt_() # [N,1,H, W]
+        # region_loss = adaptive_avg_pool2d(pixel_loss, cfg.region_loss_pool_size)
+        # loss_weight = torch.softmax(region_loss.reshape(1, 1, -1), dim=-1)\
+        #     .reshape_as(region_loss)
+
+        pos_init_method = naive_coord_init(x, gt)
+
+        if cfg.coord_init.type == 'naive':
+            pos_init_method = naive_coord_init(x, gt)
+        elif cfg.coord_init.type == 'sparse':
+            pos_init_method = sparse_coord_init(x, gt)
+        elif cfg.coord_init.type == 'random':
+            pos_init_method = random_coord_init([h, w])
+        else:
+            raise ValueError
+
+        if cfg.save.video:
+            print("saving iteration video...")
+            img_array = []
+            for ii in range(0, cfg.num_iter):
+                filename = os.path.join(
+                    cfg.experiment_dir, "video-png", 
+                    "{}-iter{}.png".format(pathn_record_str, ii))
+                img = cv2.imread(filename)
+                # cv2.putText(
+                #     img, "Path:{} \nIteration:{}".format(pathn_record_str, ii), 
+                #     (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
+                img_array.append(img)
+
+            videoname = os.path.join(
+                cfg.experiment_dir, "video-avi", 
+                "{}.avi".format(pathn_record_str))
+            check_and_create_dir(videoname)
+            out = cv2.VideoWriter(
+                videoname, 
+                # cv2.VideoWriter_fourcc(*'mp4v'),
+                cv2.VideoWriter_fourcc(*'FFV1'), 
+                20.0, (w, h))
+            for iii in range(len(img_array)):
+                out.write(img_array[iii])
+            out.release()
+            # shutil.rmtree(os.path.join(cfg.experiment_dir, "video-png"))
+
+    print("The last loss is: {}".format(loss.item()))
diff --git a/matrix.h b/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..b53f484e2abf613c6d0c1b36890a332d778f24b5
--- /dev/null
+++ b/matrix.h
@@ -0,0 +1,544 @@
+#pragma once
+
+#include "diffvg.h"
+#include "vector.h"
+#include <iostream>
+
+template <typename T>
+struct TMatrix3x3 {
+    DEVICE
+    TMatrix3x3() {
+        for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+                data[i][j] = T(0);
+            }
+        }
+    }
+
+    template <typename T2>
+    DEVICE
+    TMatrix3x3(T2 *arr) {
+    	data[0][0] = arr[0];
+    	data[0][1] = arr[1];
+    	data[0][2] = arr[2];
+    	data[1][0] = arr[3];
+    	data[1][1] = arr[4];
+    	data[1][2] = arr[5];
+    	data[2][0] = arr[6];
+    	data[2][1] = arr[7];
+    	data[2][2] = arr[8];
+    }
+    DEVICE
+    TMatrix3x3(T v00, T v01, T v02,
+               T v10, T v11, T v12,
+               T v20, T v21, T v22) {
+        data[0][0] = v00;
+        data[0][1] = v01;
+        data[0][2] = v02;
+        data[1][0] = v10;
+        data[1][1] = v11;
+        data[1][2] = v12;
+        data[2][0] = v20;
+        data[2][1] = v21;
+        data[2][2] = v22;
+    }
+
+    DEVICE
+    const T& operator()(int i, int j) const {
+        return data[i][j];
+    }
+    DEVICE
+    T& operator()(int i, int j) {
+        return data[i][j];
+    }
+    DEVICE
+    static TMatrix3x3<T> identity() {
+        TMatrix3x3<T> m(1, 0, 0,
+                        0, 1, 0,
+                        0, 0, 1);
+        return m;
+    }
+
+    T data[3][3];
+};
+
+using Matrix3x3 = TMatrix3x3<Real>;
+using Matrix3x3f = TMatrix3x3<float>;
+
+template <typename T>
+struct TMatrix4x4 {
+    DEVICE TMatrix4x4() {
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                data[i][j] = T(0);
+            }
+        }
+    }
+
+    template <typename T2>
+    DEVICE TMatrix4x4(const T2 *arr) {
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                data[i][j] = (T)arr[i * 4 + j];
+            }
+        }
+    }
+
+    template <typename T2>
+    DEVICE TMatrix4x4(const TMatrix4x4<T2> &m) {
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                data[i][j] = T(m.data[i][j]);
+            }
+        }
+    }
+
+    template <typename T2>
+    DEVICE TMatrix4x4(T2 v00, T2 v01, T2 v02, T2 v03,
+                      T2 v10, T2 v11, T2 v12, T2 v13,
+                      T2 v20, T2 v21, T2 v22, T2 v23,
+                      T2 v30, T2 v31, T2 v32, T2 v33) {
+        data[0][0] = (T)v00;
+        data[0][1] = (T)v01;
+        data[0][2] = (T)v02;
+        data[0][3] = (T)v03;
+        data[1][0] = (T)v10;
+        data[1][1] = (T)v11;
+        data[1][2] = (T)v12;
+        data[1][3] = (T)v13;
+        data[2][0] = (T)v20;
+        data[2][1] = (T)v21;
+        data[2][2] = (T)v22;
+        data[2][3] = (T)v23;
+        data[3][0] = (T)v30;
+        data[3][1] = (T)v31;
+        data[3][2] = (T)v32;
+        data[3][3] = (T)v33;
+    }
+
+    DEVICE
+    const T& operator()(int i, int j) const {
+        return data[i][j];
+    }
+
+    DEVICE
+    T& operator()(int i, int j) {
+        return data[i][j];
+    }
+
+    DEVICE
+    static TMatrix4x4<T> identity() {
+        TMatrix4x4<T> m(1, 0, 0, 0,
+                        0, 1, 0, 0,
+                        0, 0, 1, 0,
+                        0, 0, 0, 1);
+        return m;
+    }
+
+    T data[4][4];
+};
+
+using Matrix4x4 = TMatrix4x4<Real>;
+using Matrix4x4f = TMatrix4x4<float>;
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TMatrix3x3<T0> &m0, const TMatrix3x3<T1> &m1) -> TMatrix3x3<decltype(m0(0, 0) + m1(0, 0))> {
+    TMatrix3x3<decltype(m0(0, 0) + m1(0, 0))> m;
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            m(i, j) = m0(i, j) + m1(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TMatrix3x3<T0> &m0, const TMatrix3x3<T1> &m1) -> TMatrix3x3<decltype(m0(0, 0) - m1(0, 0))> {
+    TMatrix3x3<decltype(m0(0, 0) - m1(0, 0))> m;
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            m(i, j) = m0(i, j) - m1(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+inline auto operator*(const TMatrix3x3<T> &m0, const TMatrix3x3<T> &m1) -> TMatrix3x3<T> {
+    TMatrix3x3<T> ret;
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            ret(i, j) = T(0);
+            for (int k = 0; k < 3; k++) {
+                ret(i, j) += m0(i, k) * m1(k, j);
+            }
+        }
+    }
+    return ret;
+}
+
+template <typename T>
+DEVICE
+inline auto operator*(const TVector3<T> &v, const TMatrix3x3<T> &m) -> TVector3<T> {
+    TVector3<T> ret;
+    for (int i = 0; i < 3; i++) {
+        ret[i] = T(0);
+        for (int j = 0; j < 3; j++) {
+            ret[i] += v[j] * m(j, i);
+        }
+    }
+    return ret;
+}
+
+template <typename T>
+DEVICE
+inline auto operator*(const TMatrix3x3<T> &m, const TVector3<T> &v) -> TVector3<T> {
+    TVector3<T> ret;
+    for (int i = 0; i < 3; i++) {
+        ret[i] = 0.f;
+        for (int j = 0; j < 3; j++) {
+            ret[i] += m(i, j) * v[j];
+        }
+    }
+    return ret;
+}
+
+template <typename T>
+DEVICE
+inline auto inverse(const TMatrix3x3<T> &m) -> TMatrix3x3<T> {
+    // computes the inverse of a matrix m
+    auto det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) -
+               m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) +
+               m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
+
+    auto invdet = 1 / det;
+
+    auto m_inv = TMatrix3x3<T>{};
+    m_inv(0, 0) = (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) * invdet;
+    m_inv(0, 1) = (m(0, 2) * m(2, 1) - m(0, 1) * m(2, 2)) * invdet;
+    m_inv(0, 2) = (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)) * invdet;
+    m_inv(1, 0) = (m(1, 2) * m(2, 0) - m(1, 0) * m(2, 2)) * invdet;
+    m_inv(1, 1) = (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0)) * invdet;
+    m_inv(1, 2) = (m(1, 0) * m(0, 2) - m(0, 0) * m(1, 2)) * invdet;
+    m_inv(2, 0) = (m(1, 0) * m(2, 1) - m(2, 0) * m(1, 1)) * invdet;
+    m_inv(2, 1) = (m(2, 0) * m(0, 1) - m(0, 0) * m(2, 1)) * invdet;
+    m_inv(2, 2) = (m(0, 0) * m(1, 1) - m(1, 0) * m(0, 1)) * invdet;
+    return m_inv;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TMatrix4x4<T0> &m0, const TMatrix4x4<T1> &m1) -> TMatrix4x4<decltype(m0(0, 0) + m1(0, 0))> {
+    TMatrix4x4<decltype(m0(0, 0) + m1(0, 0))> m;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m(i, j) = m0(i, j) + m1(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+TMatrix3x3<T> transpose(const TMatrix3x3<T> &m) {
+    return TMatrix3x3<T>(m(0, 0), m(1, 0), m(2, 0),
+                         m(0, 1), m(1, 1), m(2, 1),
+                         m(0, 2), m(1, 2), m(2, 2));
+}
+
+template <typename T>
+DEVICE
+TMatrix4x4<T> transpose(const TMatrix4x4<T> &m) {
+    return TMatrix4x4<T>(m(0, 0), m(1, 0), m(2, 0), m(3, 0),
+                         m(0, 1), m(1, 1), m(2, 1), m(3, 1),
+                         m(0, 2), m(1, 2), m(2, 2), m(3, 2),
+                         m(0, 3), m(1, 3), m(2, 3), m(3, 3));
+}
+
+template <typename T>
+DEVICE
+inline TMatrix3x3<T> operator-(const TMatrix3x3<T> &m0) {
+    TMatrix3x3<T> m;
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            m(i, j) = -m0(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator-(const TMatrix4x4<T> &m0) {
+    TMatrix4x4<T> m;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m(i, j) = -m0(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator-(const TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    TMatrix4x4<T> m;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m(i, j) = m0(i, j) - m1(i, j);
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix3x3<T>& operator+=(TMatrix3x3<T> &m0, const TMatrix3x3<T> &m1) {
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            m0(i, j) += m1(i, j);
+        }
+    }
+    return m0;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix4x4<T>& operator+=(TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m0(i, j) += m1(i, j);
+        }
+    }
+    return m0;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix4x4<T>& operator-=(TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m0(i, j) -= m1(i, j);
+        }
+    }
+    return m0;
+}
+
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator*(const TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    TMatrix4x4<T> m;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            for (int k = 0; k < 4; k++) {
+                m(i, j) += m0(i, k) * m1(k, j);
+            }
+        }
+    }
+    return m;
+}
+
+template <typename T>
+DEVICE
+TMatrix4x4<T> inverse(const TMatrix4x4<T> &m) {
+    // https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
+    TMatrix4x4<T> inv;
+
+    inv(0, 0) = m(1, 1) * m(2, 2) * m(3, 3) -
+                m(1, 1) * m(2, 3) * m(3, 2) -
+                m(2, 1) * m(1, 2) * m(3, 3) +
+                m(2, 1) * m(1, 3) * m(3, 2) +
+                m(3, 1) * m(1, 2) * m(2, 3) -
+                m(3, 1) * m(1, 3) * m(2, 2);
+
+    inv(1, 0) = -m(1, 0) * m(2, 2) * m(3, 3) +
+                 m(1, 0) * m(2, 3) * m(3, 2) +
+                 m(2, 0) * m(1, 2) * m(3, 3) -
+                 m(2, 0) * m(1, 3) * m(3, 2) -
+                 m(3, 0) * m(1, 2) * m(2, 3) +
+                 m(3, 0) * m(1, 3) * m(2, 2);
+
+    inv(2, 0) = m(1, 0) * m(2, 1) * m(3, 3) -
+                m(1, 0) * m(2, 3) * m(3, 1) -
+                m(2, 0) * m(1, 1) * m(3, 3) +
+                m(2, 0) * m(1, 3) * m(3, 1) +
+                m(3, 0) * m(1, 1) * m(2, 3) -
+                m(3, 0) * m(1, 3) * m(2, 1);
+
+    inv(3, 0) = -m(1, 0) * m(2, 1) * m(3, 2) +
+                 m(1, 0) * m(2, 2) * m(3, 1) +
+                 m(2, 0) * m(1, 1) * m(3, 2) -
+                 m(2, 0) * m(1, 2) * m(3, 1) -
+                 m(3, 0) * m(1, 1) * m(2, 2) +
+                 m(3, 0) * m(1, 2) * m(2, 1);
+
+    inv(0, 1) = -m(0, 1) * m(2, 2) * m(3, 3) +
+                 m(0, 1) * m(2, 3) * m(3, 2) +
+                 m(2, 1) * m(0, 2) * m(3, 3) -
+                 m(2, 1) * m(0, 3) * m(3, 2) -
+                 m(3, 1) * m(0, 2) * m(2, 3) +
+                 m(3, 1) * m(0, 3) * m(2, 2);
+
+    inv(1, 1) = m(0, 0) * m(2, 2) * m(3, 3) -
+                m(0, 0) * m(2, 3) * m(3, 2) -
+                m(2, 0) * m(0, 2) * m(3, 3) +
+                m(2, 0) * m(0, 3) * m(3, 2) +
+                m(3, 0) * m(0, 2) * m(2, 3) -
+                m(3, 0) * m(0, 3) * m(2, 2);
+
+    inv(2, 1) = -m(0, 0) * m(2, 1) * m(3, 3) +
+                 m(0, 0) * m(2, 3) * m(3, 1) +
+                 m(2, 0) * m(0, 1) * m(3, 3) -
+                 m(2, 0) * m(0, 3) * m(3, 1) -
+                 m(3, 0) * m(0, 1) * m(2, 3) +
+                 m(3, 0) * m(0, 3) * m(2, 1);
+
+    inv(3, 1) = m(0, 0) * m(2, 1) * m(3, 2) -
+                m(0, 0) * m(2, 2) * m(3, 1) -
+                m(2, 0) * m(0, 1) * m(3, 2) +
+                m(2, 0) * m(0, 2) * m(3, 1) +
+                m(3, 0) * m(0, 1) * m(2, 2) -
+                m(3, 0) * m(0, 2) * m(2, 1);
+
+    inv(0, 2) = m(0, 1) * m(1, 2) * m(3, 3) -
+                m(0, 1) * m(1, 3) * m(3, 2) -
+                m(1, 1) * m(0, 2) * m(3, 3) +
+                m(1, 1) * m(0, 3) * m(3, 2) +
+                m(3, 1) * m(0, 2) * m(1, 3) -
+                m(3, 1) * m(0, 3) * m(1, 2);
+
+    inv(1, 2) = -m(0, 0) * m(1, 2) * m(3, 3) +
+                 m(0, 0) * m(1, 3) * m(3, 2) +
+                 m(1, 0) * m(0, 2) * m(3, 3) -
+                 m(1, 0) * m(0, 3) * m(3, 2) -
+                 m(3, 0) * m(0, 2) * m(1, 3) +
+                 m(3, 0) * m(0, 3) * m(1, 2);
+
+    inv(2, 2) = m(0, 0) * m(1, 1) * m(3, 3) -
+                m(0, 0) * m(1, 3) * m(3, 1) -
+                m(1, 0) * m(0, 1) * m(3, 3) +
+                m(1, 0) * m(0, 3) * m(3, 1) +
+                m(3, 0) * m(0, 1) * m(1, 3) -
+                m(3, 0) * m(0, 3) * m(1, 1);
+
+    inv(3, 2) = -m(0, 0) * m(1, 1) * m(3, 2) +
+                 m(0, 0) * m(1, 2) * m(3, 1) +
+                 m(1, 0) * m(0, 1) * m(3, 2) -
+                 m(1, 0) * m(0, 2) * m(3, 1) -
+                 m(3, 0) * m(0, 1) * m(1, 2) +
+                 m(3, 0) * m(0, 2) * m(1, 1);
+
+    inv(0, 3) = -m(0, 1) * m(1, 2) * m(2, 3) +
+                 m(0, 1) * m(1, 3) * m(2, 2) +
+                 m(1, 1) * m(0, 2) * m(2, 3) -
+                 m(1, 1) * m(0, 3) * m(2, 2) -
+                 m(2, 1) * m(0, 2) * m(1, 3) +
+                 m(2, 1) * m(0, 3) * m(1, 2);
+
+    inv(1, 3) = m(0, 0) * m(1, 2) * m(2, 3) -
+                m(0, 0) * m(1, 3) * m(2, 2) -
+                m(1, 0) * m(0, 2) * m(2, 3) +
+                m(1, 0) * m(0, 3) * m(2, 2) +
+                m(2, 0) * m(0, 2) * m(1, 3) -
+                m(2, 0) * m(0, 3) * m(1, 2);
+
+    inv(2, 3) = -m(0, 0) * m(1, 1) * m(2, 3) +
+                 m(0, 0) * m(1, 3) * m(2, 1) +
+                 m(1, 0) * m(0, 1) * m(2, 3) -
+                 m(1, 0) * m(0, 3) * m(2, 1) -
+                 m(2, 0) * m(0, 1) * m(1, 3) +
+                 m(2, 0) * m(0, 3) * m(1, 1);
+
+    inv(3, 3) = m(0, 0) * m(1, 1) * m(2, 2) -
+                m(0, 0) * m(1, 2) * m(2, 1) -
+                m(1, 0) * m(0, 1) * m(2, 2) +
+                m(1, 0) * m(0, 2) * m(2, 1) +
+                m(2, 0) * m(0, 1) * m(1, 2) -
+                m(2, 0) * m(0, 2) * m(1, 1);
+
+    auto det = m(0, 0) * inv(0, 0) +
+               m(0, 1) * inv(1, 0) +
+               m(0, 2) * inv(2, 0) +
+               m(0, 3) * inv(3, 0);
+
+    if (det == 0) {
+        return TMatrix4x4<T>{};
+    }
+
+    auto inv_det = 1.0 / det;
+
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            inv(i, j) *= inv_det;
+        }
+    }
+
+    return inv;
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TMatrix3x3<T> &m) {
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            os << m(i, j) << " ";
+        }
+        os << std::endl;
+    }
+    return os;
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TMatrix4x4<T> &m) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            os << m(i, j) << " ";
+        }
+        os << std::endl;
+    }
+    return os;
+}
+
+template <typename T>
+DEVICE
+TVector2<T> xform_pt(const TMatrix3x3<T> &m, const TVector2<T> &pt) {
+    TVector3<T> t{m(0, 0) * pt[0] + m(0, 1) * pt[1] + m(0, 2),
+                  m(1, 0) * pt[0] + m(1, 1) * pt[1] + m(1, 2),
+                  m(2, 0) * pt[0] + m(2, 1) * pt[1] + m(2, 2)};
+    return TVector2<T>{t[0] / t[2], t[1] / t[2]};
+}
+
+template <typename T>
+DEVICE
+void d_xform_pt(const TMatrix3x3<T> &m, const TVector2<T> &pt,
+                const TVector2<T> &d_out,
+                TMatrix3x3<T> &d_m,
+                TVector2<T> &d_pt) {
+    TVector3<T> t{m(0, 0) * pt[0] + m(0, 1) * pt[1] + m(0, 2),
+                  m(1, 0) * pt[0] + m(1, 1) * pt[1] + m(1, 2),
+                  m(2, 0) * pt[0] + m(2, 1) * pt[1] + m(2, 2)};
+    auto out = TVector2<T>{t[0] / t[2], t[1] / t[2]};
+    TVector3<T> d_t{d_out[0] / t[2],
+                    d_out[1] / t[2],
+                    -(d_out[0] * out[0] + d_out[1] * out[1]) / t[2]};
+    d_m(0, 0) += d_t[0] * pt[0];
+    d_m(0, 1) += d_t[0] * pt[1];
+    d_m(0, 2) += d_t[0];
+    d_m(1, 0) += d_t[1] * pt[0];
+    d_m(1, 1) += d_t[1] * pt[1];
+    d_m(1, 2) += d_t[1];
+    d_m(2, 0) += d_t[2] * pt[0];
+    d_m(2, 1) += d_t[2] * pt[1];
+    d_m(2, 2) += d_t[2];
+    d_pt[0] += d_t[0] * m(0, 0) + d_t[1] * m(1, 0) + d_t[2] * m(2, 0);
+    d_pt[1] += d_t[0] * m(0, 1) + d_t[1] * m(1, 1) + d_t[2] * m(2, 1);
+}
+
+template <typename T>
+DEVICE
+TVector2<T> xform_normal(const TMatrix3x3<T> &m_inv, const TVector2<T> &n) {
+    return normalize(TVector2<T>{m_inv(0, 0) * n[0] + m_inv(1, 0) * n[1],
+                                 m_inv(0, 1) * n[0] + m_inv(1, 1) * n[1]});
+}
diff --git a/model_config/model_name_p5_all.csv b/model_config/model_name_p5_all.csv
new file mode 100644
index 0000000000000000000000000000000000000000..aafe25f261dd88008d5cc3b746778b7aa77156c9
--- /dev/null
+++ b/model_config/model_name_p5_all.csv
@@ -0,0 +1,5 @@
+yolov5n
+yolov5s
+yolov5m
+yolov5l
+yolov5x
\ No newline at end of file
diff --git a/model_config/model_name_p5_all.yaml b/model_config/model_name_p5_all.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b178207fb72c5c489d74eec1c45edd1c084b722f
--- /dev/null
+++ b/model_config/model_name_p5_all.yaml
@@ -0,0 +1 @@
+model_names: ["yolov5n", "yolov5s", "yolov5m", "yolov5l", "yolov5x"]
diff --git a/model_config/model_name_p5_n.csv b/model_config/model_name_p5_n.csv
new file mode 100644
index 0000000000000000000000000000000000000000..f13d40609e8f329c832630dd0a81a4fc56a99f8d
--- /dev/null
+++ b/model_config/model_name_p5_n.csv
@@ -0,0 +1 @@
+yolov5n
\ No newline at end of file
diff --git a/model_config/model_name_p5_n.yaml b/model_config/model_name_p5_n.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fa53bf5e8b0c78c9f9d6f797a663a4fb77bd6c5f
--- /dev/null
+++ b/model_config/model_name_p5_n.yaml
@@ -0,0 +1 @@
+model_names: ["yolov5n"]
\ No newline at end of file
diff --git a/model_config/model_name_p6_all.csv b/model_config/model_name_p6_all.csv
new file mode 100644
index 0000000000000000000000000000000000000000..1de274571ba45177344aab6bcfb97d46dda4b836
--- /dev/null
+++ b/model_config/model_name_p6_all.csv
@@ -0,0 +1,5 @@
+yolov5n6
+yolov5s6
+yolov5m6
+yolov5l6
+yolov5x6
\ No newline at end of file
diff --git a/model_config/model_name_p6_all.yaml b/model_config/model_name_p6_all.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3d4f1764da69d93442e37ee696c873e41a5ebe1
--- /dev/null
+++ b/model_config/model_name_p6_all.yaml
@@ -0,0 +1 @@
+model_names: ["yolov5n6", "yolov5s6", "yolov5m6", "yolov5l6", "yolov5x6"]
\ No newline at end of file
diff --git a/model_download/yolov5_model_p5_all.sh b/model_download/yolov5_model_p5_all.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a8e11f6c73445e2e7855d7b62c2b8ebbb7236e9d
--- /dev/null
+++ b/model_download/yolov5_model_p5_all.sh
@@ -0,0 +1,8 @@
+cd ./yolov5
+
+# 下载YOLOv5模型
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5n.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5m.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5l.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5x.pt
\ No newline at end of file
diff --git a/model_download/yolov5_model_p5_n.sh b/model_download/yolov5_model_p5_n.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2ff8cd2505a95c9f6469c47c3c890681f4df9ebe
--- /dev/null
+++ b/model_download/yolov5_model_p5_n.sh
@@ -0,0 +1,4 @@
+cd ./yolov5
+
+# 下载YOLOv5模型
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5n.pt
\ No newline at end of file
diff --git a/model_download/yolov5_model_p6_all.sh b/model_download/yolov5_model_p6_all.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dfe8d9014e46cf8f7df244095d0115df55e0a209
--- /dev/null
+++ b/model_download/yolov5_model_p6_all.sh
@@ -0,0 +1,8 @@
+cd ./yolov5
+
+# 下载YOLOv5模型
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5n6.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s6.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5m6.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5l6.pt
+wget -c -t 0 https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5x6.pt
\ No newline at end of file
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f359f073a1f1a020ae08d923becf10ca1e4afb57
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1 @@
+libgl1-mesa-glx
diff --git a/painterly_rendering.py b/painterly_rendering.py
new file mode 100644
index 0000000000000000000000000000000000000000..f08c9fe32927b05f6a99bf53fa30d3ba584b027d
--- /dev/null
+++ b/painterly_rendering.py
@@ -0,0 +1,223 @@
+"""
+Scream: python painterly_rendering.py imgs/scream.jpg --num_paths 2048 --max_width 4.0
+Fallingwater: python painterly_rendering.py imgs/fallingwater.jpg --num_paths 2048 --max_width 4.0
+Fallingwater: python painterly_rendering.py imgs/fallingwater.jpg --num_paths 2048 --max_width 4.0 --use_lpips_loss
+Baboon: python painterly_rendering.py imgs/baboon.png --num_paths 1024 --max_width 4.0 --num_iter 250
+Baboon Lpips: python painterly_rendering.py imgs/baboon.png --num_paths 1024 --max_width 4.0 --num_iter 500 --use_lpips_loss
+smile: python painterly_rendering.py ../LIVE/figures/smile.png --num_paths 5 --use_blob --num_iter 500
+"""
+import pydiffvg
+import torch
+import skimage
+import skimage.io
+import random
+import ttools.modules
+import argparse
+import math
+
+pydiffvg.set_print_timing(True)
+
+gamma = 1.0
+
+def main(args):
+    # Use GPU if available
+    pydiffvg.set_use_gpu(torch.cuda.is_available())
+    
+    perception_loss = ttools.modules.LPIPS().to(pydiffvg.get_device())
+    
+    #target = torch.from_numpy(skimage.io.imread('imgs/lena.png')).to(torch.float32) / 255.0
+    target = torch.from_numpy(skimage.io.imread(args.target)).to(torch.float32) / 255.0
+    target = target.pow(gamma)
+    target = target.to(pydiffvg.get_device())
+    target = target.unsqueeze(0)
+    target = target.permute(0, 3, 1, 2) # NHWC -> NCHW
+    #target = torch.nn.functional.interpolate(target, size = [256, 256], mode = 'area')
+    canvas_width, canvas_height = target.shape[3], target.shape[2]
+    num_paths = args.num_paths
+    max_width = args.max_width
+    
+    random.seed(1234)
+    torch.manual_seed(1234)
+    
+    shapes = []
+    shape_groups = []
+    if args.use_blob:
+        for i in range(num_paths):
+            num_segments = random.randint(3, 5)
+            num_control_points = torch.zeros(num_segments, dtype = torch.int32) + 2
+            points = []
+            p0 = (random.random(), random.random())
+            points.append(p0)
+            for j in range(num_segments):
+                radius = 0.05
+                p1 = (p0[0] + radius * (random.random() - 0.5), p0[1] + radius * (random.random() - 0.5))
+                p2 = (p1[0] + radius * (random.random() - 0.5), p1[1] + radius * (random.random() - 0.5))
+                p3 = (p2[0] + radius * (random.random() - 0.5), p2[1] + radius * (random.random() - 0.5))
+                points.append(p1)
+                points.append(p2)
+                if j < num_segments - 1:
+                    points.append(p3)
+                    p0 = p3
+            points = torch.tensor(points)
+            points[:, 0] *= canvas_width
+            points[:, 1] *= canvas_height
+            path = pydiffvg.Path(num_control_points = num_control_points,
+                                 points = points,
+                                 stroke_width = torch.tensor(1.0),
+                                 is_closed = True)
+            shapes.append(path)
+            path_group = pydiffvg.ShapeGroup(shape_ids = torch.tensor([len(shapes) - 1]),
+                                             fill_color = torch.tensor([random.random(),
+                                                                        random.random(),
+                                                                        random.random(),
+                                                                        random.random()]))
+            shape_groups.append(path_group)
+    else:
+        for i in range(num_paths):
+            num_segments = random.randint(1, 3)
+            num_control_points = torch.zeros(num_segments, dtype = torch.int32) + 2
+            points = []
+            p0 = (random.random(), random.random())
+            points.append(p0)
+            for j in range(num_segments):
+                radius = 0.05
+                p1 = (p0[0] + radius * (random.random() - 0.5), p0[1] + radius * (random.random() - 0.5))
+                p2 = (p1[0] + radius * (random.random() - 0.5), p1[1] + radius * (random.random() - 0.5))
+                p3 = (p2[0] + radius * (random.random() - 0.5), p2[1] + radius * (random.random() - 0.5))
+                points.append(p1)
+                points.append(p2)
+                points.append(p3)
+                p0 = p3
+            points = torch.tensor(points)
+            points[:, 0] *= canvas_width
+            points[:, 1] *= canvas_height
+            #points = torch.rand(3 * num_segments + 1, 2) * min(canvas_width, canvas_height)
+            path = pydiffvg.Path(num_control_points = num_control_points,
+                                 points = points,
+                                 stroke_width = torch.tensor(1.0),
+                                 is_closed = False)
+            shapes.append(path)
+            path_group = pydiffvg.ShapeGroup(shape_ids = torch.tensor([len(shapes) - 1]),
+                                             fill_color = None,
+                                             stroke_color = torch.tensor([random.random(),
+                                                                          random.random(),
+                                                                          random.random(),
+                                                                          random.random()]))
+            shape_groups.append(path_group)
+    
+    scene_args = pydiffvg.RenderFunction.serialize_scene(\
+        canvas_width, canvas_height, shapes, shape_groups)
+    
+    render = pydiffvg.RenderFunction.apply
+    img = render(canvas_width, # width
+                 canvas_height, # height
+                 2,   # num_samples_x
+                 2,   # num_samples_y
+                 0,   # seed
+                 None,
+                 *scene_args)
+    pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/init.png', gamma=gamma)
+
+    points_vars = []
+    stroke_width_vars = []
+    color_vars = []
+    for path in shapes:
+        path.points.requires_grad = True
+        points_vars.append(path.points)
+    if not args.use_blob:
+        for path in shapes:
+            path.stroke_width.requires_grad = True
+            stroke_width_vars.append(path.stroke_width)
+    if args.use_blob:
+        for group in shape_groups:
+            group.fill_color.requires_grad = True
+            color_vars.append(group.fill_color)
+    else:
+        for group in shape_groups:
+            group.stroke_color.requires_grad = True
+            color_vars.append(group.stroke_color)
+    
+    # Optimize
+    points_optim = torch.optim.Adam(points_vars, lr=1.0)
+    if len(stroke_width_vars) > 0:
+        width_optim = torch.optim.Adam(stroke_width_vars, lr=0.1)
+    color_optim = torch.optim.Adam(color_vars, lr=0.01)
+    # Adam iterations.
+    for t in range(args.num_iter):
+        print('iteration:', t)
+        points_optim.zero_grad()
+        if len(stroke_width_vars) > 0:
+            width_optim.zero_grad()
+        color_optim.zero_grad()
+        # Forward pass: render the image.
+        scene_args = pydiffvg.RenderFunction.serialize_scene(\
+            canvas_width, canvas_height, shapes, shape_groups)
+        img = render(canvas_width, # width
+                     canvas_height, # height
+                     2,   # num_samples_x
+                     2,   # num_samples_y
+                     t,   # seed
+                     None,
+                     *scene_args)
+        # Compose img with white background
+        img = img[:, :, 3:4] * img[:, :, :3] + torch.ones(img.shape[0], img.shape[1], 3, device = pydiffvg.get_device()) * (1 - img[:, :, 3:4])
+        # Save the intermediate render.
+        pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/iter_{}.png'.format(t), gamma=gamma)
+        img = img[:, :, :3]
+        # Convert img from HWC to NCHW
+        img = img.unsqueeze(0)
+        img = img.permute(0, 3, 1, 2) # NHWC -> NCHW
+        if args.use_lpips_loss:
+            loss = perception_loss(img, target) + (img.mean() - target.mean()).pow(2)
+        else:
+            loss = (img - target).pow(2).mean()
+        print('render loss:', loss.item())
+    
+        # Backpropagate the gradients.
+        loss.backward()
+
+        # Take a gradient descent step.
+        points_optim.step()
+        if len(stroke_width_vars) > 0:
+            width_optim.step()
+        color_optim.step()
+        if len(stroke_width_vars) > 0:
+            for path in shapes:
+                path.stroke_width.data.clamp_(1.0, max_width)
+        if args.use_blob:
+            for group in shape_groups:
+                group.fill_color.data.clamp_(0.0, 1.0)
+        else:
+            for group in shape_groups:
+                group.stroke_color.data.clamp_(0.0, 1.0)
+
+        if t % 10 == 0 or t == args.num_iter - 1:
+            pydiffvg.save_svg('results/painterly_rendering/iter_{}.svg'.format(t),
+                              canvas_width, canvas_height, shapes, shape_groups)
+    
+    # Render the final result.
+    img = render(target.shape[1], # width
+                 target.shape[0], # height
+                 2,   # num_samples_x
+                 2,   # num_samples_y
+                 0,   # seed
+                 None,
+                 *scene_args)
+    # Save the intermediate render.
+    pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/final.png'.format(t), gamma=gamma)
+    # Convert the intermediate renderings to a video.
+    from subprocess import call
+    call(["ffmpeg", "-framerate", "24", "-i",
+        "results/painterly_rendering/iter_%d.png", "-vb", "20M",
+        "results/painterly_rendering/out.mp4"])
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("target", help="target image path")
+    parser.add_argument("--num_paths", type=int, default=512)
+    parser.add_argument("--max_width", type=float, default=2.0)
+    parser.add_argument("--use_lpips_loss", dest='use_lpips_loss', action='store_true')
+    parser.add_argument("--num_iter", type=int, default=500)
+    parser.add_argument("--use_blob", dest='use_blob', action='store_true')
+    args = parser.parse_args()
+    main(args)
diff --git a/parallel.cpp b/parallel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..365fc5bb305f9cacc780fb5276905e37d3b37e34
--- /dev/null
+++ b/parallel.cpp
@@ -0,0 +1,273 @@
+#include "parallel.h"
+#include <list>
+#include <thread>
+#include <condition_variable>
+#include <vector>
+#include <cassert>
+
+// From https://github.com/mmp/pbrt-v3/blob/master/src/core/parallel.cpp
+
+static std::vector<std::thread> threads;
+static bool shutdownThreads = false;
+struct ParallelForLoop;
+static ParallelForLoop *workList = nullptr;
+static std::mutex workListMutex;
+
+struct ParallelForLoop {
+    ParallelForLoop(std::function<void(int64_t)> func1D, int64_t maxIndex, int chunkSize)
+        : func1D(std::move(func1D)), maxIndex(maxIndex), chunkSize(chunkSize) {
+    }
+    ParallelForLoop(const std::function<void(Vector2i)> &f, const Vector2i count)
+        : func2D(f), maxIndex(count[0] * count[1]), chunkSize(1) {
+        nX = count[0];
+    }
+
+    std::function<void(int64_t)> func1D;
+    std::function<void(Vector2i)> func2D;
+    const int64_t maxIndex;
+    const int chunkSize;
+    int64_t nextIndex = 0;
+    int activeWorkers = 0;
+    ParallelForLoop *next = nullptr;
+    int nX = -1;
+
+    bool Finished() const {
+        return nextIndex >= maxIndex && activeWorkers == 0;
+    }
+};
+
+void Barrier::Wait() {
+    std::unique_lock<std::mutex> lock(mutex);
+    assert(count > 0);
+    if (--count == 0) {
+        // This is the last thread to reach the barrier; wake up all of the
+        // other ones before exiting.
+        cv.notify_all();
+    } else {
+        // Otherwise there are still threads that haven't reached it. Give
+        // up the lock and wait to be notified.
+        cv.wait(lock, [this] { return count == 0; });
+    }
+}
+
+static std::condition_variable workListCondition;
+
+static void worker_thread_func(const int tIndex, std::shared_ptr<Barrier> barrier) {
+    ThreadIndex = tIndex;
+
+    // The main thread sets up a barrier so that it can be sure that all
+    // workers have called ProfilerWorkerThreadInit() before it continues
+    // (and actually starts the profiling system).
+    barrier->Wait();
+
+    // Release our reference to the Barrier so that it's freed once all of
+    // the threads have cleared it.
+    barrier.reset();
+
+    std::unique_lock<std::mutex> lock(workListMutex);
+    while (!shutdownThreads) {
+        if (!workList) {
+            // Sleep until there are more tasks to run
+            workListCondition.wait(lock);
+        } else {
+            // Get work from _workList_ and run loop iterations
+            ParallelForLoop &loop = *workList;
+
+            // Run a chunk of loop iterations for _loop_
+
+            // Find the set of loop iterations to run next
+            int64_t indexStart = loop.nextIndex;
+            int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+
+            // Update _loop_ to reflect iterations this thread will run
+            loop.nextIndex = indexEnd;
+            if (loop.nextIndex == loop.maxIndex)
+                workList = loop.next;
+            loop.activeWorkers++;
+
+            // Run loop indices in _[indexStart, indexEnd)_
+            lock.unlock();
+            for (int64_t index = indexStart; index < indexEnd; ++index) {
+                if (loop.func1D) {
+                    loop.func1D(index);
+                }
+                // Handle other types of loops
+                else {
+                    assert(loop.func2D != nullptr);
+                    loop.func2D(Vector2i{int(index % loop.nX),
+                                         int(index / loop.nX)});
+                }
+            }
+            lock.lock();
+
+            // Update _loop_ to reflect completion of iterations
+            loop.activeWorkers--;
+            if (loop.Finished()) {
+                workListCondition.notify_all();
+            }
+        }
+    }
+}
+
+void parallel_for_host(const std::function<void(int64_t)> &func,
+                       int64_t count,
+                       int chunkSize) {
+    // Run iterations immediately if not using threads or if _count_ is small
+    if (threads.empty() || count < chunkSize) {
+        for (int64_t i = 0; i < count; ++i) {
+            func(i);
+        }
+        return;
+    }
+
+    // Create and enqueue _ParallelForLoop_ for this loop
+    ParallelForLoop loop(func, count, chunkSize);
+    workListMutex.lock();
+    loop.next = workList;
+    workList = &loop;
+    workListMutex.unlock();
+
+    // Notify worker threads of work to be done
+    std::unique_lock<std::mutex> lock(workListMutex);
+    workListCondition.notify_all();
+
+    // Help out with parallel loop iterations in the current thread
+    while (!loop.Finished()) {
+        // Run a chunk of loop iterations for _loop_
+
+        // Find the set of loop iterations to run next
+        int64_t indexStart = loop.nextIndex;
+        int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+
+        // Update _loop_ to reflect iterations this thread will run
+        loop.nextIndex = indexEnd;
+        if (loop.nextIndex == loop.maxIndex) {
+            workList = loop.next;
+        }
+        loop.activeWorkers++;
+
+        // Run loop indices in _[indexStart, indexEnd)_
+        lock.unlock();
+        for (int64_t index = indexStart; index < indexEnd; ++index) {
+            if (loop.func1D) {
+                loop.func1D(index);
+            }
+            // Handle other types of loops
+            else {
+                assert(loop.func2D != nullptr);
+                loop.func2D(Vector2i{int(index % loop.nX),
+                                     int(index / loop.nX)});
+            }
+        }
+        lock.lock();
+
+        // Update _loop_ to reflect completion of iterations
+        loop.activeWorkers--;
+    }
+}
+
+thread_local int ThreadIndex;
+
+void parallel_for_host(
+        std::function<void(Vector2i)> func, const Vector2i count) {
+    // Launch worker threads if needed
+    if (threads.empty() || count.x * count.y <= 1) {
+        for (int y = 0; y < count.y; ++y) {
+            for (int x = 0; x < count.x; ++x) {
+                func(Vector2i{x, y});
+            }
+        }
+        return;
+    }
+
+    ParallelForLoop loop(std::move(func), count);
+    {
+        std::lock_guard<std::mutex> lock(workListMutex);
+        loop.next = workList;
+        workList = &loop;
+    }
+
+    std::unique_lock<std::mutex> lock(workListMutex);
+    workListCondition.notify_all();
+
+    // Help out with parallel loop iterations in the current thread
+    while (!loop.Finished()) {
+        // Run a chunk of loop iterations for _loop_
+
+        // Find the set of loop iterations to run next
+        int64_t indexStart = loop.nextIndex;
+        int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+
+        // Update _loop_ to reflect iterations this thread will run
+        loop.nextIndex = indexEnd;
+        if (loop.nextIndex == loop.maxIndex) {
+            workList = loop.next;
+        }
+        loop.activeWorkers++;
+
+        // Run loop indices in _[indexStart, indexEnd)_
+        lock.unlock();
+        for (int64_t index = indexStart; index < indexEnd; ++index) {
+            if (loop.func1D) {
+                loop.func1D(index);
+            }
+            // Handle other types of loops
+            else {
+                assert(loop.func2D != nullptr);
+                loop.func2D(Vector2i{int(index % loop.nX),
+                                     int(index / loop.nX)});
+            }
+        }
+        lock.lock();
+
+        // Update _loop_ to reflect completion of iterations
+        loop.activeWorkers--;
+    }
+}
+
+int num_system_cores() {
+    // return 1;
+    int ret = std::thread::hardware_concurrency();
+    if (ret == 0) {
+        return 16;
+    }
+    return ret;
+}
+
+void parallel_init() {
+    assert(threads.size() == 0);
+    int nThreads = num_system_cores();
+    ThreadIndex = 0;
+
+    // Create a barrier so that we can be sure all worker threads get past
+    // their call to ProfilerWorkerThreadInit() before we return from this
+    // function.  In turn, we can be sure that the profiling system isn't
+    // started until after all worker threads have done that.
+    std::shared_ptr<Barrier> barrier = std::make_shared<Barrier>(nThreads);
+
+    // Launch one fewer worker thread than the total number we want doing
+    // work, since the main thread helps out, too.
+    for (int i = 0; i < nThreads - 1; ++i) {
+        threads.push_back(std::thread(worker_thread_func, i + 1, barrier));
+    }
+
+    barrier->Wait();
+}
+
+void parallel_cleanup() {
+    if (threads.empty()) {
+        return;
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(workListMutex);
+        shutdownThreads = true;
+        workListCondition.notify_all();
+    }
+
+    for (std::thread &thread : threads) {
+        thread.join();
+    }
+    threads.erase(threads.begin(), threads.end());
+    shutdownThreads = false;
+}
diff --git a/parallel.h b/parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7f9c712e471616d01921157c290a50adac768d9
--- /dev/null
+++ b/parallel.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include "vector.h"
+
+#include <mutex>
+#include <condition_variable>
+#include <functional>
+#include <atomic>
+#include <cstdint>
+#include <cassert>
+#include <algorithm>
+// From https://github.com/mmp/pbrt-v3/blob/master/src/core/parallel.h
+
+class Barrier {
+  public:
+    Barrier(int count) : count(count) { assert(count > 0); }
+    ~Barrier() { assert(count == 0); }
+    void Wait();
+
+  private:
+    std::mutex mutex;
+    std::condition_variable cv;
+    int count;
+};
+
+void parallel_for_host(const std::function<void(int64_t)> &func,
+                       int64_t count,
+                       int chunkSize = 1);
+extern thread_local int ThreadIndex;
+void parallel_for_host(
+    std::function<void(Vector2i)> func, const Vector2i count);
+int num_system_cores();
+
+void parallel_init();
+void parallel_cleanup();
+
+#ifdef __CUDACC__
+template <typename T>
+__global__ void parallel_for_device_kernel(T functor, int count) {
+    auto idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= count) {
+        return;
+    }
+    functor(idx);
+}
+template <typename T>
+inline void parallel_for_device(T functor,
+                                int count,
+                                int work_per_thread = 256) {
+    if (count <= 0) {
+        return;
+    }
+    auto block_size = work_per_thread;
+    auto block_count = idiv_ceil(count, block_size);
+    parallel_for_device_kernel<T><<<block_count, block_size>>>(functor, count);
+}
+#endif
+
+template <typename T>
+inline void parallel_for(T functor,
+                         int count,
+                         bool use_gpu,
+                         int work_per_thread = -1) {
+    if (work_per_thread == -1) {
+        work_per_thread = use_gpu ? 64 : 256;
+    }
+    if (count <= 0) {
+        return;
+    }
+    if (use_gpu) {
+#ifdef __CUDACC__
+        auto block_size = work_per_thread;
+        auto block_count = idiv_ceil(count, block_size);
+        parallel_for_device_kernel<T><<<block_count, block_size>>>(functor, count);
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    } else {
+        auto num_threads = idiv_ceil(count, work_per_thread);
+        parallel_for_host([&](int thread_index) {
+            auto id_offset = work_per_thread * thread_index;
+            auto work_end = std::min(id_offset + work_per_thread, count);
+            for (int work_id = id_offset; work_id < work_end; work_id++) {
+                auto idx = work_id;
+                assert(idx < count);
+                functor(idx);
+            }
+        }, num_threads);
+    }
+}
diff --git a/pcg.h b/pcg.h
new file mode 100644
index 0000000000000000000000000000000000000000..55859a1e63d15d1d5d0b110c28561a064c5a446c
--- /dev/null
+++ b/pcg.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "diffvg.h"
+
+// http://www.pcg-random.org/download.html
+struct pcg32_state {
+    uint64_t state;
+    uint64_t inc;
+};
+
+DEVICE inline uint32_t next_pcg32(pcg32_state *rng) {
+    uint64_t oldstate = rng->state;
+    // Advance internal state
+    rng->state = oldstate * 6364136223846793005ULL + (rng->inc|1);
+    // Calculate output function (XSH RR), uses old state for max ILP
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot = oldstate >> 59u;
+    return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+}
+
+// https://github.com/wjakob/pcg32/blob/master/pcg32.h
+DEVICE inline float next_pcg32_float(pcg32_state *rng) {
+    union {
+        uint32_t u;
+        float f;
+    } x;
+    x.u = (next_pcg32(rng) >> 9) | 0x3f800000u;
+    return x.f - 1.0f;
+}
+
+// Initialize each pixel with a PCG rng with a different stream
+DEVICE inline pcg32_state init_pcg32(int idx, uint64_t seed) {
+    pcg32_state state;
+    state.state = 0U;
+    state.inc = (((uint64_t)idx + 1) << 1u) | 1u;
+    next_pcg32(&state);
+    state.state += (0x853c49e6748fea9bULL + seed);
+    next_pcg32(&state);
+    return state;
+}
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000000000000000000000000000000000000..82cc3991ead2fbbeed1cb9002412a321f2532697
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,1399 @@
+[[package]]
+name = "certifi"
+version = "2020.12.5"
+description = "Python package for providing Mozilla's CA Bundle."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "cffi"
+version = "1.14.5"
+description = "Foreign Function Interface for Python calling C code."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pycparser = "*"
+
+[[package]]
+name = "chardet"
+version = "4.0.0"
+description = "Universal encoding detector for Python 2 and 3"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[[package]]
+name = "cmake"
+version = "3.18.4.post1"
+description = "CMake is an open-source, cross-platform family of tools designed to build, test and package software"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "coloredlogs"
+version = "15.0"
+description = "Colored terminal output for Python's logging module"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.dependencies]
+humanfriendly = ">=9.1"
+
+[package.extras]
+cron = ["capturer (>=2.4)"]
+
+[[package]]
+name = "cssutils"
+version = "2.2.0"
+description = "A CSS Cascading Style Sheets library for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.extras]
+docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
+testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "mock", "pytest-black (>=0.3.7)", "pytest-mypy"]
+
+[[package]]
+name = "cycler"
+version = "0.10.0"
+description = "Composable style cycles"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+six = "*"
+
+[[package]]
+name = "decorator"
+version = "4.4.2"
+description = "Decorators for Humans"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*"
+
+[[package]]
+name = "greenlet"
+version = "1.0.0"
+description = "Lightweight in-process concurrent programming"
+category = "dev"
+optional = false
+python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*"
+
+[package.extras]
+docs = ["sphinx"]
+
+[[package]]
+name = "humanfriendly"
+version = "9.1"
+description = "Human friendly output for text interfaces using Python"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.dependencies]
+pyreadline = {version = "*", markers = "sys_platform == \"win32\""}
+
+[[package]]
+name = "idna"
+version = "2.10"
+description = "Internationalized Domain Names in Applications (IDNA)"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "imageio"
+version = "2.9.0"
+description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats."
+category = "dev"
+optional = false
+python-versions = ">=3.5"
+
+[package.dependencies]
+numpy = "*"
+pillow = "*"
+
+[package.extras]
+ffmpeg = ["imageio-ffmpeg"]
+fits = ["astropy"]
+full = ["astropy", "gdal", "imageio-ffmpeg", "itk"]
+gdal = ["gdal"]
+itk = ["itk"]
+
+[[package]]
+name = "imageio-ffmpeg"
+version = "0.4.3"
+description = "FFMPEG wrapper for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.4"
+
+[[package]]
+name = "jinja2"
+version = "2.11.3"
+description = "A very fast and expressive template engine."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.dependencies]
+MarkupSafe = ">=0.23"
+
+[package.extras]
+i18n = ["Babel (>=0.8)"]
+
+[[package]]
+name = "jsonpatch"
+version = "1.32"
+description = "Apply JSON-Patches (RFC 6902)"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.dependencies]
+jsonpointer = ">=1.9"
+
+[[package]]
+name = "jsonpointer"
+version = "2.1"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "kiwisolver"
+version = "1.3.1"
+description = "A fast implementation of the Cassowary constraint solver"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[[package]]
+name = "llvmlite"
+version = "0.36.0"
+description = "lightweight wrapper around basic LLVM functionality"
+category = "dev"
+optional = false
+python-versions = ">=3.6,<3.10"
+
+[[package]]
+name = "markupsafe"
+version = "1.1.1"
+description = "Safely add untrusted strings to HTML/XML markup."
+category = "dev"
+optional = false
+python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
+
+[[package]]
+name = "matplotlib"
+version = "3.4.1"
+description = "Python plotting package"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+cycler = ">=0.10"
+kiwisolver = ">=1.0.1"
+numpy = ">=1.16"
+pillow = ">=6.2.0"
+pyparsing = ">=2.2.1"
+python-dateutil = ">=2.7"
+
+[[package]]
+name = "networkx"
+version = "2.5.1"
+description = "Python package for creating and manipulating graphs and networks"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+decorator = ">=4.3,<5"
+
+[package.extras]
+all = ["numpy", "scipy", "pandas", "matplotlib", "pygraphviz", "pydot", "pyyaml", "lxml", "pytest"]
+gdal = ["gdal"]
+lxml = ["lxml"]
+matplotlib = ["matplotlib"]
+numpy = ["numpy"]
+pandas = ["pandas"]
+pydot = ["pydot"]
+pygraphviz = ["pygraphviz"]
+pytest = ["pytest"]
+pyyaml = ["pyyaml"]
+scipy = ["scipy"]
+
+[[package]]
+name = "numba"
+version = "0.53.1"
+description = "compiling Python code using LLVM"
+category = "dev"
+optional = false
+python-versions = ">=3.6,<3.10"
+
+[package.dependencies]
+llvmlite = ">=0.36.0rc1,<0.37"
+numpy = ">=1.15"
+
+[[package]]
+name = "numpy"
+version = "1.20.2"
+description = "NumPy is the fundamental package for array computing with Python."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
+[[package]]
+name = "pandas"
+version = "1.2.3"
+description = "Powerful data structures for data analysis, time series, and statistics"
+category = "dev"
+optional = false
+python-versions = ">=3.7.1"
+
+[package.dependencies]
+numpy = ">=1.16.5"
+python-dateutil = ">=2.7.3"
+pytz = ">=2017.3"
+
+[package.extras]
+test = ["pytest (>=5.0.1)", "pytest-xdist", "hypothesis (>=3.58)"]
+
+[[package]]
+name = "pillow"
+version = "8.2.0"
+description = "Python Imaging Library (Fork)"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[[package]]
+name = "py"
+version = "1.10.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pyaml"
+version = "20.4.0"
+description = "PyYAML-based module to produce pretty and readable YAML-serialized data"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+PyYAML = "*"
+
+[[package]]
+name = "pycparser"
+version = "2.20"
+description = "C parser in Python"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pygame"
+version = "2.0.1"
+description = "Python Game Development"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "pyparsing"
+version = "2.4.7"
+description = "Python parsing module"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "pyreadline"
+version = "2.1"
+description = "A python implmementation of GNU readline."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "python-dateutil"
+version = "2.8.1"
+description = "Extensions to the standard Python datetime module"
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
+
+[package.dependencies]
+six = ">=1.5"
+
+[[package]]
+name = "pytz"
+version = "2021.1"
+description = "World timezone definitions, modern and historical"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "pywavelets"
+version = "1.1.1"
+description = "PyWavelets, wavelet transform module"
+category = "dev"
+optional = false
+python-versions = ">=3.5"
+
+[package.dependencies]
+numpy = ">=1.13.3"
+
+[[package]]
+name = "pyyaml"
+version = "5.4.1"
+description = "YAML parser and emitter for Python"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+
+[[package]]
+name = "pyzmq"
+version = "22.0.3"
+description = "Python bindings for 0MQ"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+cffi = {version = "*", markers = "implementation_name == \"pypy\""}
+py = {version = "*", markers = "implementation_name == \"pypy\""}
+
+[[package]]
+name = "requests"
+version = "2.25.1"
+description = "Python HTTP for Humans."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+chardet = ">=3.0.2,<5"
+idna = ">=2.5,<3"
+urllib3 = ">=1.21.1,<1.27"
+
+[package.extras]
+security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
+
+[[package]]
+name = "scikit-image"
+version = "0.18.1"
+description = "Image processing in Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+imageio = ">=2.3.0"
+matplotlib = ">=2.0.0,<3.0.0 || >3.0.0"
+networkx = ">=2.0"
+numpy = ">=1.16.5"
+pillow = ">=4.3.0,<7.1.0 || >7.1.0,<7.1.1 || >7.1.1"
+PyWavelets = ">=1.1.1"
+scipy = ">=1.0.1"
+tifffile = ">=2019.7.26"
+
+[package.extras]
+data = ["pooch (>=1.3.0)"]
+docs = ["sphinx (>=1.8,<=2.4.4)", "sphinx-gallery (>=0.7.0,<0.8.0 || >0.8.0)", "numpydoc (>=1.0)", "sphinx-copybutton", "pytest-runner", "scikit-learn", "matplotlib (>=3.0.1)", "dask[array] (>=0.15.0,<2.17.0 || >2.17.0)", "cloudpickle (>=0.2.1)", "pandas (>=0.23.0)", "seaborn (>=0.7.1)", "pooch (>=1.3.0)", "tifffile (>=2020.5.30)", "myst-parser", "ipywidgets", "plotly (>=4.10.0)"]
+optional = ["simpleitk", "astropy (>=3.1.2)", "qtpy", "pyamg", "dask[array] (>=1.0.0,<2.17.0 || >2.17.0)", "cloudpickle (>=0.2.1)", "pooch (>=1.3.0)"]
+test = ["pytest (>=5.2.0)", "pytest-cov (>=2.7.0)", "pytest-localserver", "pytest-faulthandler", "flake8", "codecov", "pooch (>=1.3.0)"]
+
+[[package]]
+name = "scipy"
+version = "1.6.2"
+description = "SciPy: Scientific Library for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7,<3.10"
+
+[package.dependencies]
+numpy = ">=1.16.5,<1.23.0"
+
+[[package]]
+name = "seaborn"
+version = "0.11.1"
+description = "seaborn: statistical data visualization"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+matplotlib = ">=2.2"
+numpy = ">=1.15"
+pandas = ">=0.23"
+scipy = ">=1.0"
+
+[[package]]
+name = "six"
+version = "1.15.0"
+description = "Python 2 and 3 compatibility utilities"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "sqlalchemy"
+version = "1.4.6"
+description = "Database Abstraction Library"
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+
+[package.dependencies]
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\""}
+
+[package.extras]
+aiomysql = ["greenlet (!=0.4.17)", "aiomysql"]
+aiosqlite = ["greenlet (!=0.4.17)", "aiosqlite"]
+asyncio = ["greenlet (!=0.4.17)"]
+mariadb_connector = ["mariadb (>=1.0.1)"]
+mssql = ["pyodbc"]
+mssql_pymssql = ["pymssql"]
+mssql_pyodbc = ["pyodbc"]
+mypy = ["sqlalchemy2-stubs", "mypy (>=0.800)"]
+mysql = ["mysqlclient (>=1.4.0,<2)", "mysqlclient (>=1.4.0)"]
+mysql_connector = ["mysqlconnector"]
+oracle = ["cx_oracle (>=7,<8)", "cx_oracle (>=7)"]
+postgresql = ["psycopg2 (>=2.7)"]
+postgresql_asyncpg = ["greenlet (!=0.4.17)", "asyncpg"]
+postgresql_pg8000 = ["pg8000 (>=1.16.6)"]
+postgresql_psycopg2binary = ["psycopg2-binary"]
+postgresql_psycopg2cffi = ["psycopg2cffi"]
+pymysql = ["pymysql (<1)", "pymysql"]
+sqlcipher = ["sqlcipher3-binary"]
+
+[[package]]
+name = "svgpathtools"
+version = "1.4.1"
+description = "A collection of tools for manipulating and analyzing SVG Path objects and Bezier curves."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+numpy = "*"
+svgwrite = "*"
+
+[[package]]
+name = "svgwrite"
+version = "1.4.1"
+description = "A Python library to create SVG drawings."
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[[package]]
+name = "tifffile"
+version = "2021.3.31"
+description = "Read and write TIFF files"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+numpy = ">=1.15.1"
+
+[package.extras]
+all = ["imagecodecs (>=2021.3.31)", "matplotlib (>=3.2)", "lxml"]
+
+[[package]]
+name = "torch"
+version = "1.8.1"
+description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
+category = "dev"
+optional = false
+python-versions = ">=3.6.2"
+
+[package.dependencies]
+numpy = "*"
+typing-extensions = "*"
+
+[[package]]
+name = "torch-tools"
+version = "0.1.5"
+description = "A library of helpers to train, evaluate and visualize deep nets with PyTorch."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+coloredlogs = "*"
+imageio = "*"
+imageio-ffmpeg = "*"
+jinja2 = "*"
+numpy = "*"
+pyaml = "*"
+seaborn = "*"
+sqlalchemy = "*"
+torch = "*"
+torchvision = "*"
+tqdm = "*"
+visdom = "*"
+
+[package.extras]
+dev = ["sphinx", "pytest"]
+docs = ["sphinx"]
+tests = ["pytest"]
+
+[[package]]
+name = "torchfile"
+version = "0.1.0"
+description = "Torch7 binary serialized file parser"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "torchvision"
+version = "0.9.1"
+description = "image and video datasets and models for torch deep learning"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+numpy = "*"
+pillow = ">=4.1.1"
+torch = "1.8.1"
+
+[package.extras]
+scipy = ["scipy"]
+
+[[package]]
+name = "tornado"
+version = "6.1"
+description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed."
+category = "dev"
+optional = false
+python-versions = ">= 3.5"
+
+[[package]]
+name = "tqdm"
+version = "4.60.0"
+description = "Fast, Extensible Progress Meter"
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
+
+[package.extras]
+dev = ["py-make (>=0.1.0)", "twine", "wheel"]
+notebook = ["ipywidgets (>=6)"]
+telegram = ["requests"]
+
+[[package]]
+name = "typing-extensions"
+version = "3.7.4.3"
+description = "Backported and Experimental Type Hints for Python 3.5+"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "urllib3"
+version = "1.26.4"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+
+[package.extras]
+secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
+brotli = ["brotlipy (>=0.6.0)"]
+
+[[package]]
+name = "visdom"
+version = "0.1.8.9"
+description = "A tool for visualizing live, rich data for Torch and Numpy"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+jsonpatch = "*"
+numpy = ">=1.8"
+pillow = "*"
+pyzmq = "*"
+requests = "*"
+scipy = "*"
+six = "*"
+torchfile = "*"
+tornado = "*"
+websocket-client = "*"
+
+[[package]]
+name = "websocket-client"
+version = "0.58.0"
+description = "WebSocket client for Python with low level API options"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.dependencies]
+six = "*"
+
+[metadata]
+lock-version = "1.1"
+python-versions = "=3.8"
+content-hash = "fb1c03803c4a88e983ffd148d935ed43db7436d144c30678ecdc421d27534dae"
+
+[metadata.files]
+certifi = [
+    {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"},
+    {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"},
+]
+cffi = [
+    {file = "cffi-1.14.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:bb89f306e5da99f4d922728ddcd6f7fcebb3241fc40edebcb7284d7514741991"},
+    {file = "cffi-1.14.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:34eff4b97f3d982fb93e2831e6750127d1355a923ebaeeb565407b3d2f8d41a1"},
+    {file = "cffi-1.14.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:99cd03ae7988a93dd00bcd9d0b75e1f6c426063d6f03d2f90b89e29b25b82dfa"},
+    {file = "cffi-1.14.5-cp27-cp27m-win32.whl", hash = "sha256:65fa59693c62cf06e45ddbb822165394a288edce9e276647f0046e1ec26920f3"},
+    {file = "cffi-1.14.5-cp27-cp27m-win_amd64.whl", hash = "sha256:51182f8927c5af975fece87b1b369f722c570fe169f9880764b1ee3bca8347b5"},
+    {file = "cffi-1.14.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:43e0b9d9e2c9e5d152946b9c5fe062c151614b262fda2e7b201204de0b99e482"},
+    {file = "cffi-1.14.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:cbde590d4faaa07c72bf979734738f328d239913ba3e043b1e98fe9a39f8b2b6"},
+    {file = "cffi-1.14.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:5de7970188bb46b7bf9858eb6890aad302577a5f6f75091fd7cdd3ef13ef3045"},
+    {file = "cffi-1.14.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:a465da611f6fa124963b91bf432d960a555563efe4ed1cc403ba5077b15370aa"},
+    {file = "cffi-1.14.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:d42b11d692e11b6634f7613ad8df5d6d5f8875f5d48939520d351007b3c13406"},
+    {file = "cffi-1.14.5-cp35-cp35m-win32.whl", hash = "sha256:72d8d3ef52c208ee1c7b2e341f7d71c6fd3157138abf1a95166e6165dd5d4369"},
+    {file = "cffi-1.14.5-cp35-cp35m-win_amd64.whl", hash = "sha256:29314480e958fd8aab22e4a58b355b629c59bf5f2ac2492b61e3dc06d8c7a315"},
+    {file = "cffi-1.14.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:3d3dd4c9e559eb172ecf00a2a7517e97d1e96de2a5e610bd9b68cea3925b4892"},
+    {file = "cffi-1.14.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:48e1c69bbacfc3d932221851b39d49e81567a4d4aac3b21258d9c24578280058"},
+    {file = "cffi-1.14.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:69e395c24fc60aad6bb4fa7e583698ea6cc684648e1ffb7fe85e3c1ca131a7d5"},
+    {file = "cffi-1.14.5-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:9e93e79c2551ff263400e1e4be085a1210e12073a31c2011dbbda14bda0c6132"},
+    {file = "cffi-1.14.5-cp36-cp36m-win32.whl", hash = "sha256:58e3f59d583d413809d60779492342801d6e82fefb89c86a38e040c16883be53"},
+    {file = "cffi-1.14.5-cp36-cp36m-win_amd64.whl", hash = "sha256:005a36f41773e148deac64b08f233873a4d0c18b053d37da83f6af4d9087b813"},
+    {file = "cffi-1.14.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2894f2df484ff56d717bead0a5c2abb6b9d2bf26d6960c4604d5c48bbc30ee73"},
+    {file = "cffi-1.14.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:0857f0ae312d855239a55c81ef453ee8fd24136eaba8e87a2eceba644c0d4c06"},
+    {file = "cffi-1.14.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:cd2868886d547469123fadc46eac7ea5253ea7fcb139f12e1dfc2bbd406427d1"},
+    {file = "cffi-1.14.5-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:35f27e6eb43380fa080dccf676dece30bef72e4a67617ffda586641cd4508d49"},
+    {file = "cffi-1.14.5-cp37-cp37m-win32.whl", hash = "sha256:9ff227395193126d82e60319a673a037d5de84633f11279e336f9c0f189ecc62"},
+    {file = "cffi-1.14.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9cf8022fb8d07a97c178b02327b284521c7708d7c71a9c9c355c178ac4bbd3d4"},
+    {file = "cffi-1.14.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8b198cec6c72df5289c05b05b8b0969819783f9418e0409865dac47288d2a053"},
+    {file = "cffi-1.14.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:ad17025d226ee5beec591b52800c11680fca3df50b8b29fe51d882576e039ee0"},
+    {file = "cffi-1.14.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6c97d7350133666fbb5cf4abdc1178c812cb205dc6f41d174a7b0f18fb93337e"},
+    {file = "cffi-1.14.5-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8ae6299f6c68de06f136f1f9e69458eae58f1dacf10af5c17353eae03aa0d827"},
+    {file = "cffi-1.14.5-cp38-cp38-win32.whl", hash = "sha256:b85eb46a81787c50650f2392b9b4ef23e1f126313b9e0e9013b35c15e4288e2e"},
+    {file = "cffi-1.14.5-cp38-cp38-win_amd64.whl", hash = "sha256:1f436816fc868b098b0d63b8920de7d208c90a67212546d02f84fe78a9c26396"},
+    {file = "cffi-1.14.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1071534bbbf8cbb31b498d5d9db0f274f2f7a865adca4ae429e147ba40f73dea"},
+    {file = "cffi-1.14.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:9de2e279153a443c656f2defd67769e6d1e4163952b3c622dcea5b08a6405322"},
+    {file = "cffi-1.14.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:6e4714cc64f474e4d6e37cfff31a814b509a35cb17de4fb1999907575684479c"},
+    {file = "cffi-1.14.5-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:158d0d15119b4b7ff6b926536763dc0714313aa59e320ddf787502c70c4d4bee"},
+    {file = "cffi-1.14.5-cp39-cp39-win32.whl", hash = "sha256:afb29c1ba2e5a3736f1c301d9d0abe3ec8b86957d04ddfa9d7a6a42b9367e396"},
+    {file = "cffi-1.14.5-cp39-cp39-win_amd64.whl", hash = "sha256:f2d45f97ab6bb54753eab54fffe75aaf3de4ff2341c9daee1987ee1837636f1d"},
+    {file = "cffi-1.14.5.tar.gz", hash = "sha256:fd78e5fee591709f32ef6edb9a015b4aa1a5022598e36227500c8f4e02328d9c"},
+]
+chardet = [
+    {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"},
+    {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"},
+]
+cmake = [
+    {file = "cmake-3.18.4.post1-py2-none-macosx_10_6_x86_64.whl", hash = "sha256:10c46b0fd2c087b0cae611d1e734f065a1a8169d0b54ec834a9dff005c1857ca"},
+    {file = "cmake-3.18.4.post1-py2-none-manylinux1_i686.whl", hash = "sha256:65cd763dd232973a0deedf1f349e229fa3bf1357e0e2576da65ad118ff53b070"},
+    {file = "cmake-3.18.4.post1-py2-none-manylinux1_x86_64.whl", hash = "sha256:1c900642859c5970d81ae8821ae05a2af93d2630cd1c0f2bffc80e7abdbc087d"},
+    {file = "cmake-3.18.4.post1-py2-none-win32.whl", hash = "sha256:605c2a07c9ebf332319106bffb11941463d18e586902e3659c315cae9f0caaeb"},
+    {file = "cmake-3.18.4.post1-py2-none-win_amd64.whl", hash = "sha256:c1b14b302d3def2672968cd675031793e193382d0e4a00e2121af4b333d62ece"},
+    {file = "cmake-3.18.4.post1-py3-none-macosx_10_6_x86_64.whl", hash = "sha256:6dd3abb1afdd9a986a55977ef85a0d245ebf289cc704b687f061294c48c126ec"},
+    {file = "cmake-3.18.4.post1-py3-none-manylinux1_i686.whl", hash = "sha256:1c86369700f74363ee46de64e4167ac2d292a7c7f1606e372b8dcaf3108d0cc7"},
+    {file = "cmake-3.18.4.post1-py3-none-manylinux1_x86_64.whl", hash = "sha256:34f7ee67cef21b178a793fe760c979608d4ac66a1697cae6b382dbcc5d1ec485"},
+    {file = "cmake-3.18.4.post1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:e8ef8dab578e8ca85724b8506f230a5a5017ead67cb9da60fe1240fc9ab24135"},
+    {file = "cmake-3.18.4.post1-py3-none-win32.whl", hash = "sha256:5096f5d4541b5d0040bae9dbc364bb1c8cd9211e273c481baf9a1a3635be1d00"},
+    {file = "cmake-3.18.4.post1-py3-none-win_amd64.whl", hash = "sha256:ac062ac13591e4acbb6e919e5b1196a3b04f8d1022eb3ab4dbd20779ade9d5ab"},
+    {file = "cmake-3.18.4.post1.tar.gz", hash = "sha256:d7981ac85f1abb75c24eb14936d56dafbd327e7ba371d91007e38704af7b52b5"},
+]
+coloredlogs = [
+    {file = "coloredlogs-15.0-py2.py3-none-any.whl", hash = "sha256:b7f630a8297a66984b6bae0f6a1b0e0afb9f2f6838ea3bfa58f50d3d13e133d6"},
+    {file = "coloredlogs-15.0.tar.gz", hash = "sha256:5e78691e2673a8e294499e1832bb13efcfb44a86b92e18109fa18951093218ab"},
+]
+cssutils = [
+    {file = "cssutils-2.2.0-py3-none-any.whl", hash = "sha256:c76b08ac9ab92d258947dd74ca439c9bed8db8fbf3e1bfced7043abcfeeb0843"},
+    {file = "cssutils-2.2.0.tar.gz", hash = "sha256:5bef59f6b59bdccbea8e36cb292d2be1b6be1b485fc4a9f5886616f19eb31aaf"},
+]
+cycler = [
+    {file = "cycler-0.10.0-py2.py3-none-any.whl", hash = "sha256:1d8a5ae1ff6c5cf9b93e8811e581232ad8920aeec647c37316ceac982b08cb2d"},
+    {file = "cycler-0.10.0.tar.gz", hash = "sha256:cd7b2d1018258d7247a71425e9f26463dfb444d411c39569972f4ce586b0c9d8"},
+]
+decorator = [
+    {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
+    {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
+]
+greenlet = [
+    {file = "greenlet-1.0.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:1d1d4473ecb1c1d31ce8fd8d91e4da1b1f64d425c1dc965edc4ed2a63cfa67b2"},
+    {file = "greenlet-1.0.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:cfd06e0f0cc8db2a854137bd79154b61ecd940dce96fad0cba23fe31de0b793c"},
+    {file = "greenlet-1.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:eb333b90036358a0e2c57373f72e7648d7207b76ef0bd00a4f7daad1f79f5203"},
+    {file = "greenlet-1.0.0-cp27-cp27m-win32.whl", hash = "sha256:1a1ada42a1fd2607d232ae11a7b3195735edaa49ea787a6d9e6a53afaf6f3476"},
+    {file = "greenlet-1.0.0-cp27-cp27m-win_amd64.whl", hash = "sha256:f6f65bf54215e4ebf6b01e4bb94c49180a589573df643735107056f7a910275b"},
+    {file = "greenlet-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:f59eded163d9752fd49978e0bab7a1ff21b1b8d25c05f0995d140cc08ac83379"},
+    {file = "greenlet-1.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:875d4c60a6299f55df1c3bb870ebe6dcb7db28c165ab9ea6cdc5d5af36bb33ce"},
+    {file = "greenlet-1.0.0-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:1bb80c71de788b36cefb0c3bb6bfab306ba75073dbde2829c858dc3ad70f867c"},
+    {file = "greenlet-1.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b5f1b333015d53d4b381745f5de842f19fe59728b65f0fbb662dafbe2018c3a5"},
+    {file = "greenlet-1.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:5352c15c1d91d22902582e891f27728d8dac3bd5e0ee565b6a9f575355e6d92f"},
+    {file = "greenlet-1.0.0-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:2c65320774a8cd5fdb6e117c13afa91c4707548282464a18cf80243cf976b3e6"},
+    {file = "greenlet-1.0.0-cp35-cp35m-manylinux2014_ppc64le.whl", hash = "sha256:111cfd92d78f2af0bc7317452bd93a477128af6327332ebf3c2be7df99566683"},
+    {file = "greenlet-1.0.0-cp35-cp35m-win32.whl", hash = "sha256:cdb90267650c1edb54459cdb51dab865f6c6594c3a47ebd441bc493360c7af70"},
+    {file = "greenlet-1.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:eac8803c9ad1817ce3d8d15d1bb82c2da3feda6bee1153eec5c58fa6e5d3f770"},
+    {file = "greenlet-1.0.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:c93d1a71c3fe222308939b2e516c07f35a849c5047f0197442a4d6fbcb4128ee"},
+    {file = "greenlet-1.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:122c63ba795fdba4fc19c744df6277d9cfd913ed53d1a286f12189a0265316dd"},
+    {file = "greenlet-1.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:c5b22b31c947ad8b6964d4ed66776bcae986f73669ba50620162ba7c832a6b6a"},
+    {file = "greenlet-1.0.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:4365eccd68e72564c776418c53ce3c5af402bc526fe0653722bc89efd85bf12d"},
+    {file = "greenlet-1.0.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:da7d09ad0f24270b20f77d56934e196e982af0d0a2446120cb772be4e060e1a2"},
+    {file = "greenlet-1.0.0-cp36-cp36m-win32.whl", hash = "sha256:647ba1df86d025f5a34043451d7c4a9f05f240bee06277a524daad11f997d1e7"},
+    {file = "greenlet-1.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:e6e9fdaf6c90d02b95e6b0709aeb1aba5affbbb9ccaea5502f8638e4323206be"},
+    {file = "greenlet-1.0.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:62afad6e5fd70f34d773ffcbb7c22657e1d46d7fd7c95a43361de979f0a45aef"},
+    {file = "greenlet-1.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:d3789c1c394944084b5e57c192889985a9f23bd985f6d15728c745d380318128"},
+    {file = "greenlet-1.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:f5e2d36c86c7b03c94b8459c3bd2c9fe2c7dab4b258b8885617d44a22e453fb7"},
+    {file = "greenlet-1.0.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:292e801fcb3a0b3a12d8c603c7cf340659ea27fd73c98683e75800d9fd8f704c"},
+    {file = "greenlet-1.0.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:f3dc68272990849132d6698f7dc6df2ab62a88b0d36e54702a8fd16c0490e44f"},
+    {file = "greenlet-1.0.0-cp37-cp37m-win32.whl", hash = "sha256:7cd5a237f241f2764324396e06298b5dee0df580cf06ef4ada0ff9bff851286c"},
+    {file = "greenlet-1.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:0ddd77586553e3daf439aa88b6642c5f252f7ef79a39271c25b1d4bf1b7cbb85"},
+    {file = "greenlet-1.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:90b6a25841488cf2cb1c8623a53e6879573010a669455046df5f029d93db51b7"},
+    {file = "greenlet-1.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ed1d1351f05e795a527abc04a0d82e9aecd3bdf9f46662c36ff47b0b00ecaf06"},
+    {file = "greenlet-1.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:94620ed996a7632723a424bccb84b07e7b861ab7bb06a5aeb041c111dd723d36"},
+    {file = "greenlet-1.0.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f97d83049715fd9dec7911860ecf0e17b48d8725de01e45de07d8ac0bd5bc378"},
+    {file = "greenlet-1.0.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:0a77691f0080c9da8dfc81e23f4e3cffa5accf0f5b56478951016d7cfead9196"},
+    {file = "greenlet-1.0.0-cp38-cp38-win32.whl", hash = "sha256:e1128e022d8dce375362e063754e129750323b67454cac5600008aad9f54139e"},
+    {file = "greenlet-1.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:5d4030b04061fdf4cbc446008e238e44936d77a04b2b32f804688ad64197953c"},
+    {file = "greenlet-1.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:f8450d5ef759dbe59f84f2c9f77491bb3d3c44bc1a573746daf086e70b14c243"},
+    {file = "greenlet-1.0.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:df8053867c831b2643b2c489fe1d62049a98566b1646b194cc815f13e27b90df"},
+    {file = "greenlet-1.0.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:df3e83323268594fa9755480a442cabfe8d82b21aba815a71acf1bb6c1776218"},
+    {file = "greenlet-1.0.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:181300f826625b7fd1182205b830642926f52bd8cdb08b34574c9d5b2b1813f7"},
+    {file = "greenlet-1.0.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:58ca0f078d1c135ecf1879d50711f925ee238fe773dfe44e206d7d126f5bc664"},
+    {file = "greenlet-1.0.0-cp39-cp39-win32.whl", hash = "sha256:5f297cb343114b33a13755032ecf7109b07b9a0020e841d1c3cedff6602cc139"},
+    {file = "greenlet-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d69bbd9547d3bc49f8a545db7a0bd69f407badd2ff0f6e1a163680b5841d2b0"},
+    {file = "greenlet-1.0.0.tar.gz", hash = "sha256:719e169c79255816cdcf6dccd9ed2d089a72a9f6c42273aae12d55e8d35bdcf8"},
+]
+humanfriendly = [
+    {file = "humanfriendly-9.1-py2.py3-none-any.whl", hash = "sha256:d5c731705114b9ad673754f3317d9fa4c23212f36b29bdc4272a892eafc9bc72"},
+    {file = "humanfriendly-9.1.tar.gz", hash = "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d"},
+]
+idna = [
+    {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
+    {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
+]
+imageio = [
+    {file = "imageio-2.9.0-py3-none-any.whl", hash = "sha256:3604d751f03002e8e0e7650aa71d8d9148144a87daf17cb1f3228e80747f2e6b"},
+    {file = "imageio-2.9.0.tar.gz", hash = "sha256:52ddbaeca2dccf53ba2d6dec5676ca7bc3b2403ef8b37f7da78b7654bb3e10f0"},
+]
+imageio-ffmpeg = [
+    {file = "imageio-ffmpeg-0.4.3.tar.gz", hash = "sha256:f826260a3207b872f1a4ba87ec0c8e02c00afba4fd03348a59049bdd8215841e"},
+    {file = "imageio_ffmpeg-0.4.3-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:5b69f9390f791a0f1e125ae5244dcb8a6dbb7008195d047d529225a7ef351be4"},
+    {file = "imageio_ffmpeg-0.4.3-py3-none-manylinux2010_x86_64.whl", hash = "sha256:4b8067bf1ead54b03fa7fa37f80ce152b34426e128aa835131b04eb64ee52b32"},
+    {file = "imageio_ffmpeg-0.4.3-py3-none-win32.whl", hash = "sha256:3b00bb04e8649f60d5ede91aa47b754283d1fa9fd8d40803d9871c8afd72cd50"},
+    {file = "imageio_ffmpeg-0.4.3-py3-none-win_amd64.whl", hash = "sha256:b1b9cc52ead5ea0c6e9806bcce4d6d1b0d8d4e9baf017af090fb932cc37ad6b0"},
+]
+jinja2 = [
+    {file = "Jinja2-2.11.3-py2.py3-none-any.whl", hash = "sha256:03e47ad063331dd6a3f04a43eddca8a966a26ba0c5b7207a9a9e4e08f1b29419"},
+    {file = "Jinja2-2.11.3.tar.gz", hash = "sha256:a6d58433de0ae800347cab1fa3043cebbabe8baa9d29e668f1c768cb87a333c6"},
+]
+jsonpatch = [
+    {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"},
+    {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"},
+]
+jsonpointer = [
+    {file = "jsonpointer-2.1-py2.py3-none-any.whl", hash = "sha256:150f80c5badd02c757da6644852f612f88e8b4bc2f9852dcbf557c8738919686"},
+    {file = "jsonpointer-2.1.tar.gz", hash = "sha256:5a34b698db1eb79ceac454159d3f7c12a451a91f6334a4f638454327b7a89962"},
+]
+kiwisolver = [
+    {file = "kiwisolver-1.3.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fd34fbbfbc40628200730bc1febe30631347103fc8d3d4fa012c21ab9c11eca9"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:d3155d828dec1d43283bd24d3d3e0d9c7c350cdfcc0bd06c0ad1209c1bbc36d0"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:5a7a7dbff17e66fac9142ae2ecafb719393aaee6a3768c9de2fd425c63b53e21"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f8d6f8db88049a699817fd9178782867bf22283e3813064302ac59f61d95be05"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:5f6ccd3dd0b9739edcf407514016108e2280769c73a85b9e59aa390046dbf08b"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-win32.whl", hash = "sha256:225e2e18f271e0ed8157d7f4518ffbf99b9450fca398d561eb5c4a87d0986dd9"},
+    {file = "kiwisolver-1.3.1-cp36-cp36m-win_amd64.whl", hash = "sha256:cf8b574c7b9aa060c62116d4181f3a1a4e821b2ec5cbfe3775809474113748d4"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:232c9e11fd7ac3a470d65cd67e4359eee155ec57e822e5220322d7b2ac84fbf0"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:b38694dcdac990a743aa654037ff1188c7a9801ac3ccc548d3341014bc5ca278"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ca3820eb7f7faf7f0aa88de0e54681bddcb46e485beb844fcecbcd1c8bd01689"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:c8fd0f1ae9d92b42854b2979024d7597685ce4ada367172ed7c09edf2cef9cb8"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:1e1bc12fb773a7b2ffdeb8380609f4f8064777877b2225dec3da711b421fda31"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-win32.whl", hash = "sha256:72c99e39d005b793fb7d3d4e660aed6b6281b502e8c1eaf8ee8346023c8e03bc"},
+    {file = "kiwisolver-1.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:8be8d84b7d4f2ba4ffff3665bcd0211318aa632395a1a41553250484a871d454"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:31dfd2ac56edc0ff9ac295193eeaea1c0c923c0355bf948fbd99ed6018010b72"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:563c649cfdef27d081c84e72a03b48ea9408c16657500c312575ae9d9f7bc1c3"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:78751b33595f7f9511952e7e60ce858c6d64db2e062afb325985ddbd34b5c131"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a357fd4f15ee49b4a98b44ec23a34a95f1e00292a139d6015c11f55774ef10de"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:5989db3b3b34b76c09253deeaf7fbc2707616f130e166996606c284395da3f18"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-win32.whl", hash = "sha256:c08e95114951dc2090c4a630c2385bef681cacf12636fb0241accdc6b303fd81"},
+    {file = "kiwisolver-1.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:44a62e24d9b01ba94ae7a4a6c3fb215dc4af1dde817e7498d901e229aaf50e4e"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50af681a36b2a1dee1d3c169ade9fdc59207d3c31e522519181e12f1b3ba7000"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:a53d27d0c2a0ebd07e395e56a1fbdf75ffedc4a05943daf472af163413ce9598"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:834ee27348c4aefc20b479335fd422a2c69db55f7d9ab61721ac8cd83eb78882"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5c3e6455341008a054cccee8c5d24481bcfe1acdbc9add30aa95798e95c65621"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:acef3d59d47dd85ecf909c359d0fd2c81ed33bdff70216d3956b463e12c38a54"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-win32.whl", hash = "sha256:c5518d51a0735b1e6cee1fdce66359f8d2b59c3ca85dc2b0813a8aa86818a030"},
+    {file = "kiwisolver-1.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:b9edd0110a77fc321ab090aaa1cfcaba1d8499850a12848b81be2222eab648f6"},
+    {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0cd53f403202159b44528498de18f9285b04482bab2a6fc3f5dd8dbb9352e30d"},
+    {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:33449715e0101e4d34f64990352bce4095c8bf13bed1b390773fc0a7295967b3"},
+    {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-win32.whl", hash = "sha256:401a2e9afa8588589775fe34fc22d918ae839aaaf0c0e96441c0fdbce6d8ebe6"},
+    {file = "kiwisolver-1.3.1.tar.gz", hash = "sha256:950a199911a8d94683a6b10321f9345d5a3a8433ec58b217ace979e18f16e248"},
+]
+llvmlite = [
+    {file = "llvmlite-0.36.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cc0f9b9644b4ab0e4a5edb17f1531d791630c88858220d3cc688d6edf10da100"},
+    {file = "llvmlite-0.36.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f7918dbac02b1ebbfd7302ad8e8307d7877ab57d782d5f04b70ff9696b53c21b"},
+    {file = "llvmlite-0.36.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:7768658646c418b9b3beccb7044277a608bc8c62b82a85e73c7e5c065e4157c2"},
+    {file = "llvmlite-0.36.0-cp36-cp36m-win32.whl", hash = "sha256:05f807209a360d39526d98141b6f281b9c7c771c77a4d1fc22002440642c8de2"},
+    {file = "llvmlite-0.36.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d1fdd63c371626c25ad834e1c6297eb76cf2f093a40dbb401a87b6476ab4e34e"},
+    {file = "llvmlite-0.36.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7c4e7066447305d5095d0b0a9cae7b835d2f0fde143456b3124110eab0856426"},
+    {file = "llvmlite-0.36.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:9dad7e4bb042492914292aea3f4172eca84db731f9478250240955aedba95e08"},
+    {file = "llvmlite-0.36.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:1ce5bc0a638d874a08d4222be0a7e48e5df305d094c2ff8dec525ef32b581551"},
+    {file = "llvmlite-0.36.0-cp37-cp37m-win32.whl", hash = "sha256:dbedff0f6d417b374253a6bab39aa4b5364f1caab30c06ba8726904776fcf1cb"},
+    {file = "llvmlite-0.36.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3b17fc4b0dd17bd29d7297d054e2915fad535889907c3f65232ee21f483447c5"},
+    {file = "llvmlite-0.36.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b3a77e46e6053e2a86e607e87b97651dda81e619febb914824a927bff4e88737"},
+    {file = "llvmlite-0.36.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:048a7c117641c9be87b90005684e64a6f33ea0897ebab1df8a01214a10d6e79a"},
+    {file = "llvmlite-0.36.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:7db4b0eef93125af1c4092c64a3c73c7dc904101117ef53f8d78a1a499b8d5f4"},
+    {file = "llvmlite-0.36.0-cp38-cp38-win32.whl", hash = "sha256:50b1828bde514b31431b2bba1aa20b387f5625b81ad6e12fede430a04645e47a"},
+    {file = "llvmlite-0.36.0-cp38-cp38-win_amd64.whl", hash = "sha256:f608bae781b2d343e15e080c546468c5a6f35f57f0446923ea198dd21f23757e"},
+    {file = "llvmlite-0.36.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a3abc8a8889aeb06bf9c4a7e5df5bc7bb1aa0aedd91a599813809abeec80b5a"},
+    {file = "llvmlite-0.36.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:705f0323d931684428bb3451549603299bb5e17dd60fb979d67c3807de0debc1"},
+    {file = "llvmlite-0.36.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:5a6548b4899facb182145147185e9166c69826fb424895f227e6b7cf924a8da1"},
+    {file = "llvmlite-0.36.0-cp39-cp39-win32.whl", hash = "sha256:ff52fb9c2be66b95b0e67d56fce11038397e5be1ea410ee53f5f1175fdbb107a"},
+    {file = "llvmlite-0.36.0-cp39-cp39-win_amd64.whl", hash = "sha256:1dee416ea49fd338c74ec15c0c013e5273b0961528169af06ff90772614f7f6c"},
+    {file = "llvmlite-0.36.0.tar.gz", hash = "sha256:765128fdf5f149ed0b889ffbe2b05eb1717f8e20a5c87fa2b4018fbcce0fcfc9"},
+]
+markupsafe = [
+    {file = "MarkupSafe-1.1.1-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27m-win32.whl", hash = "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27m-win_amd64.whl", hash = "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f"},
+    {file = "MarkupSafe-1.1.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1"},
+    {file = "MarkupSafe-1.1.1-cp34-cp34m-macosx_10_6_intel.whl", hash = "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5"},
+    {file = "MarkupSafe-1.1.1-cp34-cp34m-manylinux1_i686.whl", hash = "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1"},
+    {file = "MarkupSafe-1.1.1-cp34-cp34m-manylinux1_x86_64.whl", hash = "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735"},
+    {file = "MarkupSafe-1.1.1-cp34-cp34m-win32.whl", hash = "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21"},
+    {file = "MarkupSafe-1.1.1-cp34-cp34m-win_amd64.whl", hash = "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235"},
+    {file = "MarkupSafe-1.1.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b"},
+    {file = "MarkupSafe-1.1.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f"},
+    {file = "MarkupSafe-1.1.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905"},
+    {file = "MarkupSafe-1.1.1-cp35-cp35m-win32.whl", hash = "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1"},
+    {file = "MarkupSafe-1.1.1-cp35-cp35m-win_amd64.whl", hash = "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d"},
+    {file = "MarkupSafe-1.1.1-cp36-cp36m-macosx_10_6_intel.whl", hash = "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff"},
+    {file = "MarkupSafe-1.1.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473"},
+    {file = "MarkupSafe-1.1.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e"},
+    {file = "MarkupSafe-1.1.1-cp36-cp36m-win32.whl", hash = "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66"},
+    {file = "MarkupSafe-1.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5"},
+    {file = "MarkupSafe-1.1.1-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d"},
+    {file = "MarkupSafe-1.1.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e"},
+    {file = "MarkupSafe-1.1.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6"},
+    {file = "MarkupSafe-1.1.1-cp37-cp37m-win32.whl", hash = "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2"},
+    {file = "MarkupSafe-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c"},
+    {file = "MarkupSafe-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15"},
+    {file = "MarkupSafe-1.1.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2"},
+    {file = "MarkupSafe-1.1.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42"},
+    {file = "MarkupSafe-1.1.1-cp38-cp38-win32.whl", hash = "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b"},
+    {file = "MarkupSafe-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"},
+    {file = "MarkupSafe-1.1.1.tar.gz", hash = "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b"},
+]
+matplotlib = [
+    {file = "matplotlib-3.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a54efd6fcad9cb3cd5ef2064b5a3eeb0b63c99f26c346bdcf66e7c98294d7cc"},
+    {file = "matplotlib-3.4.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:86dc94e44403fa0f2b1dd76c9794d66a34e821361962fe7c4e078746362e3b14"},
+    {file = "matplotlib-3.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:574306171b84cd6854c83dc87bc353cacc0f60184149fb00c9ea871eca8c1ecb"},
+    {file = "matplotlib-3.4.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:84a10e462120aa7d9eb6186b50917ed5a6286ee61157bfc17c5b47987d1a9068"},
+    {file = "matplotlib-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:81e6fe8b18ef5be67f40a1d4f07d5a4ed21d3878530193898449ddef7793952f"},
+    {file = "matplotlib-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c45e7bf89ea33a2adaef34774df4e692c7436a18a48bcb0e47a53e698a39fa39"},
+    {file = "matplotlib-3.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1f83a32e4b6045191f9d34e4dc68c0a17c870b57ef9cca518e516da591246e79"},
+    {file = "matplotlib-3.4.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:a18cc1ab4a35b845cf33b7880c979f5c609fd26c2d6e74ddfacb73dcc60dd956"},
+    {file = "matplotlib-3.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ac2a30a09984c2719f112a574b6543ccb82d020fd1b23b4d55bf4759ba8dd8f5"},
+    {file = "matplotlib-3.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a97781453ac79409ddf455fccf344860719d95142f9c334f2a8f3fff049ffec3"},
+    {file = "matplotlib-3.4.1-cp38-cp38-win32.whl", hash = "sha256:2eee37340ca1b353e0a43a33da79d0cd4bcb087064a0c3c3d1329cdea8fbc6f3"},
+    {file = "matplotlib-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:90dbc007f6389bcfd9ef4fe5d4c78c8d2efe4e0ebefd48b4f221cdfed5672be2"},
+    {file = "matplotlib-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f16660edf9a8bcc0f766f51c9e1b9d2dc6ceff6bf636d2dbd8eb925d5832dfd"},
+    {file = "matplotlib-3.4.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:a989022f89cda417f82dbf65e0a830832afd8af743d05d1414fb49549287ff04"},
+    {file = "matplotlib-3.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:be4430b33b25e127fc4ea239cc386389de420be4d63e71d5359c20b562951ce1"},
+    {file = "matplotlib-3.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7561fd541477d41f3aa09457c434dd1f7604f3bd26d7858d52018f5dfe1c06d1"},
+    {file = "matplotlib-3.4.1-cp39-cp39-win32.whl", hash = "sha256:9f374961a3996c2d1b41ba3145462c3708a89759e604112073ed6c8bdf9f622f"},
+    {file = "matplotlib-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:53ceb12ef44f8982b45adc7a0889a7e2df1d758e8b360f460e435abe8a8cd658"},
+    {file = "matplotlib-3.4.1.tar.gz", hash = "sha256:84d4c4f650f356678a5d658a43ca21a41fca13f9b8b00169c0b76e6a6a948908"},
+]
+networkx = [
+    {file = "networkx-2.5.1-py3-none-any.whl", hash = "sha256:0635858ed7e989f4c574c2328380b452df892ae85084144c73d8cd819f0c4e06"},
+    {file = "networkx-2.5.1.tar.gz", hash = "sha256:109cd585cac41297f71103c3c42ac6ef7379f29788eb54cb751be5a663bb235a"},
+]
+numba = [
+    {file = "numba-0.53.1-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:b23de6b6837c132087d06b8b92d343edb54b885873b824a037967fbd5272ebb7"},
+    {file = "numba-0.53.1-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:6545b9e9b0c112b81de7f88a3c787469a357eeff8211e90b8f45ee243d521cc2"},
+    {file = "numba-0.53.1-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:8fa5c963a43855050a868106a87cd614f3c3f459951c8fc468aec263ef80d063"},
+    {file = "numba-0.53.1-cp36-cp36m-win32.whl", hash = "sha256:aaa6ebf56afb0b6752607b9f3bf39e99b0efe3c1fa6849698373925ee6838fd7"},
+    {file = "numba-0.53.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b08b3df38aab769df79ed948d70f0a54a3cdda49d58af65369235c204ec5d0f3"},
+    {file = "numba-0.53.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:bf5c463b62d013e3f709cc8277adf2f4f4d8cc6757293e29c6db121b77e6b760"},
+    {file = "numba-0.53.1-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:74df02e73155f669e60dcff07c4eef4a03dbf5b388594db74142ab40914fe4f5"},
+    {file = "numba-0.53.1-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:5165709bf62f28667e10b9afe6df0ce1037722adab92d620f59cb8bbb8104641"},
+    {file = "numba-0.53.1-cp37-cp37m-win32.whl", hash = "sha256:2e96958ed2ca7e6d967b2ce29c8da0ca47117e1de28e7c30b2c8c57386506fa5"},
+    {file = "numba-0.53.1-cp37-cp37m-win_amd64.whl", hash = "sha256:276f9d1674fe08d95872d81b97267c6b39dd830f05eb992608cbede50fcf48a9"},
+    {file = "numba-0.53.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:4c4c8d102512ae472af52c76ad9522da718c392cb59f4cd6785d711fa5051a2a"},
+    {file = "numba-0.53.1-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:691adbeac17dbdf6ed7c759e9e33a522351f07d2065fe926b264b6b2c15fd89b"},
+    {file = "numba-0.53.1-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:94aab3e0e9e8754116325ce026e1b29ae72443c706a3104cf7f3368dc3012912"},
+    {file = "numba-0.53.1-cp38-cp38-win32.whl", hash = "sha256:aabeec89bb3e3162136eea492cea7ee8882ddcda2201f05caecdece192c40896"},
+    {file = "numba-0.53.1-cp38-cp38-win_amd64.whl", hash = "sha256:1895ebd256819ff22256cd6fe24aa8f7470b18acc73e7917e8e93c9ac7f565dc"},
+    {file = "numba-0.53.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:224d197a46a9e602a16780d87636e199e2cdef528caef084a4d8fd8909c2455c"},
+    {file = "numba-0.53.1-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:aba7acb247a09d7f12bd17a8e28bbb04e8adef9fc20ca29835d03b7894e1b49f"},
+    {file = "numba-0.53.1-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:bd126f1f49da6fc4b3169cf1d96f1c3b3f84a7badd11fe22da344b923a00e744"},
+    {file = "numba-0.53.1-cp39-cp39-win32.whl", hash = "sha256:0ef9d1f347b251282ae46e5a5033600aa2d0dfa1ee8c16cb8137b8cd6f79e221"},
+    {file = "numba-0.53.1-cp39-cp39-win_amd64.whl", hash = "sha256:17146885cbe4e89c9d4abd4fcb8886dee06d4591943dc4343500c36ce2fcfa69"},
+    {file = "numba-0.53.1.tar.gz", hash = "sha256:9cd4e5216acdc66c4e9dab2dfd22ddb5bef151185c070d4a3cd8e78638aff5b0"},
+]
+numpy = [
+    {file = "numpy-1.20.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e9459f40244bb02b2f14f6af0cd0732791d72232bbb0dc4bab57ef88e75f6935"},
+    {file = "numpy-1.20.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a8e6859913ec8eeef3dbe9aed3bf475347642d1cdd6217c30f28dee8903528e6"},
+    {file = "numpy-1.20.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:9cab23439eb1ebfed1aaec9cd42b7dc50fc96d5cd3147da348d9161f0501ada5"},
+    {file = "numpy-1.20.2-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:9c0fab855ae790ca74b27e55240fe4f2a36a364a3f1ebcfd1fb5ac4088f1cec3"},
+    {file = "numpy-1.20.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:61d5b4cf73622e4d0c6b83408a16631b670fc045afd6540679aa35591a17fe6d"},
+    {file = "numpy-1.20.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d15007f857d6995db15195217afdbddfcd203dfaa0ba6878a2f580eaf810ecd6"},
+    {file = "numpy-1.20.2-cp37-cp37m-win32.whl", hash = "sha256:d76061ae5cab49b83a8cf3feacefc2053fac672728802ac137dd8c4123397677"},
+    {file = "numpy-1.20.2-cp37-cp37m-win_amd64.whl", hash = "sha256:bad70051de2c50b1a6259a6df1daaafe8c480ca98132da98976d8591c412e737"},
+    {file = "numpy-1.20.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:719656636c48be22c23641859ff2419b27b6bdf844b36a2447cb39caceb00935"},
+    {file = "numpy-1.20.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:aa046527c04688af680217fffac61eec2350ef3f3d7320c07fd33f5c6e7b4d5f"},
+    {file = "numpy-1.20.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:2428b109306075d89d21135bdd6b785f132a1f5a3260c371cee1fae427e12727"},
+    {file = "numpy-1.20.2-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:e8e4fbbb7e7634f263c5b0150a629342cc19b47c5eba8d1cd4363ab3455ab576"},
+    {file = "numpy-1.20.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:edb1f041a9146dcf02cd7df7187db46ab524b9af2515f392f337c7cbbf5b52cd"},
+    {file = "numpy-1.20.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:c73a7975d77f15f7f68dacfb2bca3d3f479f158313642e8ea9058eea06637931"},
+    {file = "numpy-1.20.2-cp38-cp38-win32.whl", hash = "sha256:6c915ee7dba1071554e70a3664a839fbc033e1d6528199d4621eeaaa5487ccd2"},
+    {file = "numpy-1.20.2-cp38-cp38-win_amd64.whl", hash = "sha256:471c0571d0895c68da309dacee4e95a0811d0a9f9f532a48dc1bea5f3b7ad2b7"},
+    {file = "numpy-1.20.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4703b9e937df83f5b6b7447ca5912b5f5f297aba45f91dbbbc63ff9278c7aa98"},
+    {file = "numpy-1.20.2-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:abc81829c4039e7e4c30f7897938fa5d4916a09c2c7eb9b244b7a35ddc9656f4"},
+    {file = "numpy-1.20.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:377751954da04d4a6950191b20539066b4e19e3b559d4695399c5e8e3e683bf6"},
+    {file = "numpy-1.20.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:6e51e417d9ae2e7848314994e6fc3832c9d426abce9328cf7571eefceb43e6c9"},
+    {file = "numpy-1.20.2-cp39-cp39-win32.whl", hash = "sha256:780ae5284cb770ade51d4b4a7dce4faa554eb1d88a56d0e8b9f35fca9b0270ff"},
+    {file = "numpy-1.20.2-cp39-cp39-win_amd64.whl", hash = "sha256:924dc3f83de20437de95a73516f36e09918e9c9c18d5eac520062c49191025fb"},
+    {file = "numpy-1.20.2-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:97ce8b8ace7d3b9288d88177e66ee75480fb79b9cf745e91ecfe65d91a856042"},
+    {file = "numpy-1.20.2.zip", hash = "sha256:878922bf5ad7550aa044aa9301d417e2d3ae50f0f577de92051d739ac6096cee"},
+]
+pandas = [
+    {file = "pandas-1.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4d821b9b911fc1b7d428978d04ace33f0af32bb7549525c8a7b08444bce46b74"},
+    {file = "pandas-1.2.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9f5829e64507ad10e2561b60baf285c470f3c4454b007c860e77849b88865ae7"},
+    {file = "pandas-1.2.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:97b1954533b2a74c7e20d1342c4f01311d3203b48f2ebf651891e6a6eaf01104"},
+    {file = "pandas-1.2.3-cp37-cp37m-win32.whl", hash = "sha256:5e3c8c60541396110586bcbe6eccdc335a38e7de8c217060edaf4722260b158f"},
+    {file = "pandas-1.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8a051e957c5206f722e83f295f95a2cf053e890f9a1fba0065780a8c2d045f5d"},
+    {file = "pandas-1.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a93e34f10f67d81de706ce00bf8bb3798403cabce4ccb2de10c61b5ae8786ab5"},
+    {file = "pandas-1.2.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:46fc671c542a8392a4f4c13edc8527e3a10f6cb62912d856f82248feb747f06e"},
+    {file = "pandas-1.2.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:43e00770552595c2250d8d712ec8b6e08ca73089ac823122344f023efa4abea3"},
+    {file = "pandas-1.2.3-cp38-cp38-win32.whl", hash = "sha256:475b7772b6e18a93a43ea83517932deff33954a10d4fbae18d0c1aba4182310f"},
+    {file = "pandas-1.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:72ffcea00ae8ffcdbdefff800284311e155fbb5ed6758f1a6110fc1f8f8f0c1c"},
+    {file = "pandas-1.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:621c044a1b5e535cf7dcb3ab39fca6f867095c3ef223a524f18f60c7fee028ea"},
+    {file = "pandas-1.2.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0f27fd1adfa256388dc34895ca5437eaf254832223812afd817a6f73127f969c"},
+    {file = "pandas-1.2.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:dbb255975eb94143f2e6ec7dadda671d25147939047839cd6b8a4aff0379bb9b"},
+    {file = "pandas-1.2.3-cp39-cp39-win32.whl", hash = "sha256:d59842a5aa89ca03c2099312163ffdd06f56486050e641a45d926a072f04d994"},
+    {file = "pandas-1.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:09761bf5f8c741d47d4b8b9073288de1be39bbfccc281d70b889ade12b2aad29"},
+    {file = "pandas-1.2.3.tar.gz", hash = "sha256:df6f10b85aef7a5bb25259ad651ad1cc1d6bb09000595cab47e718cbac250b1d"},
+]
+pillow = [
+    {file = "Pillow-8.2.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:dc38f57d8f20f06dd7c3161c59ca2c86893632623f33a42d592f097b00f720a9"},
+    {file = "Pillow-8.2.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:a013cbe25d20c2e0c4e85a9daf438f85121a4d0344ddc76e33fd7e3965d9af4b"},
+    {file = "Pillow-8.2.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8bb1e155a74e1bfbacd84555ea62fa21c58e0b4e7e6b20e4447b8d07990ac78b"},
+    {file = "Pillow-8.2.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c5236606e8570542ed424849f7852a0ff0bce2c4c8d0ba05cc202a5a9c97dee9"},
+    {file = "Pillow-8.2.0-cp36-cp36m-win32.whl", hash = "sha256:12e5e7471f9b637762453da74e390e56cc43e486a88289995c1f4c1dc0bfe727"},
+    {file = "Pillow-8.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5afe6b237a0b81bd54b53f835a153770802f164c5570bab5e005aad693dab87f"},
+    {file = "Pillow-8.2.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:cb7a09e173903541fa888ba010c345893cd9fc1b5891aaf060f6ca77b6a3722d"},
+    {file = "Pillow-8.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:0d19d70ee7c2ba97631bae1e7d4725cdb2ecf238178096e8c82ee481e189168a"},
+    {file = "Pillow-8.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:083781abd261bdabf090ad07bb69f8f5599943ddb539d64497ed021b2a67e5a9"},
+    {file = "Pillow-8.2.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:c6b39294464b03457f9064e98c124e09008b35a62e3189d3513e5148611c9388"},
+    {file = "Pillow-8.2.0-cp37-cp37m-win32.whl", hash = "sha256:01425106e4e8cee195a411f729cff2a7d61813b0b11737c12bd5991f5f14bcd5"},
+    {file = "Pillow-8.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3b570f84a6161cf8865c4e08adf629441f56e32f180f7aa4ccbd2e0a5a02cba2"},
+    {file = "Pillow-8.2.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:031a6c88c77d08aab84fecc05c3cde8414cd6f8406f4d2b16fed1e97634cc8a4"},
+    {file = "Pillow-8.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:66cc56579fd91f517290ab02c51e3a80f581aba45fd924fcdee01fa06e635812"},
+    {file = "Pillow-8.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6c32cc3145928c4305d142ebec682419a6c0a8ce9e33db900027ddca1ec39178"},
+    {file = "Pillow-8.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:624b977355cde8b065f6d51b98497d6cd5fbdd4f36405f7a8790e3376125e2bb"},
+    {file = "Pillow-8.2.0-cp38-cp38-win32.whl", hash = "sha256:5cbf3e3b1014dddc45496e8cf38b9f099c95a326275885199f427825c6522232"},
+    {file = "Pillow-8.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:463822e2f0d81459e113372a168f2ff59723e78528f91f0bd25680ac185cf797"},
+    {file = "Pillow-8.2.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:95d5ef984eff897850f3a83883363da64aae1000e79cb3c321915468e8c6add5"},
+    {file = "Pillow-8.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b91c36492a4bbb1ee855b7d16fe51379e5f96b85692dc8210831fbb24c43e484"},
+    {file = "Pillow-8.2.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d68cb92c408261f806b15923834203f024110a2e2872ecb0bd2a110f89d3c602"},
+    {file = "Pillow-8.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f217c3954ce5fd88303fc0c317af55d5e0204106d86dea17eb8205700d47dec2"},
+    {file = "Pillow-8.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5b70110acb39f3aff6b74cf09bb4169b167e2660dabc304c1e25b6555fa781ef"},
+    {file = "Pillow-8.2.0-cp39-cp39-win32.whl", hash = "sha256:a7d5e9fad90eff8f6f6106d3b98b553a88b6f976e51fce287192a5d2d5363713"},
+    {file = "Pillow-8.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:238c197fc275b475e87c1453b05b467d2d02c2915fdfdd4af126145ff2e4610c"},
+    {file = "Pillow-8.2.0-pp36-pypy36_pp73-macosx_10_10_x86_64.whl", hash = "sha256:0e04d61f0064b545b989126197930807c86bcbd4534d39168f4aa5fda39bb8f9"},
+    {file = "Pillow-8.2.0-pp36-pypy36_pp73-manylinux2010_i686.whl", hash = "sha256:63728564c1410d99e6d1ae8e3b810fe012bc440952168af0a2877e8ff5ab96b9"},
+    {file = "Pillow-8.2.0-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:c03c07ed32c5324939b19e36ae5f75c660c81461e312a41aea30acdd46f93a7c"},
+    {file = "Pillow-8.2.0-pp37-pypy37_pp73-macosx_10_10_x86_64.whl", hash = "sha256:4d98abdd6b1e3bf1a1cbb14c3895226816e666749ac040c4e2554231068c639b"},
+    {file = "Pillow-8.2.0-pp37-pypy37_pp73-manylinux2010_i686.whl", hash = "sha256:aac00e4bc94d1b7813fe882c28990c1bc2f9d0e1aa765a5f2b516e8a6a16a9e4"},
+    {file = "Pillow-8.2.0-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:22fd0f42ad15dfdde6c581347eaa4adb9a6fc4b865f90b23378aa7914895e120"},
+    {file = "Pillow-8.2.0-pp37-pypy37_pp73-win32.whl", hash = "sha256:e98eca29a05913e82177b3ba3d198b1728e164869c613d76d0de4bde6768a50e"},
+    {file = "Pillow-8.2.0.tar.gz", hash = "sha256:a787ab10d7bb5494e5f76536ac460741788f1fbce851068d73a87ca7c35fc3e1"},
+]
+py = [
+    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
+    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
+]
+pyaml = [
+    {file = "pyaml-20.4.0-py2.py3-none-any.whl", hash = "sha256:67081749a82b72c45e5f7f812ee3a14a03b3f5c25ff36ec3b290514f8c4c4b99"},
+    {file = "pyaml-20.4.0.tar.gz", hash = "sha256:29a5c2a68660a799103d6949167bd6c7953d031449d08802386372de1db6ad71"},
+]
+pycparser = [
+    {file = "pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"},
+    {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"},
+]
+pygame = [
+    {file = "pygame-2.0.1-cp27-cp27m-macosx_10_9_intel.whl", hash = "sha256:49c2f58559c1fbf4ba258e4b141578ccb0e83da3d4f823894f6171a8f0d594ed"},
+    {file = "pygame-2.0.1-cp27-cp27m-win32.whl", hash = "sha256:0571dde0277483f5060c8ee43cbfd8df5776b12505e3948eee241c8ce9b93371"},
+    {file = "pygame-2.0.1-cp27-cp27m-win_amd64.whl", hash = "sha256:fd5ee0f42d59a290c049f91894e0739f62c2908e7edc028ffb847a105e68bfc3"},
+    {file = "pygame-2.0.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:21475405bcdeb20b8a796a3da6704ebb816e06b29749dd64ff619e80816b7932"},
+    {file = "pygame-2.0.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ad230911d61f448c09886d3c92b2eae44ca7530babe9c48e74e02a0622ce2d34"},
+    {file = "pygame-2.0.1-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:9aead3f2eed90260136b201f398965900c5335c974bb7b47c381d98e39284018"},
+    {file = "pygame-2.0.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:5aeeb6659a7fe7760a78e449566553ae8c949ae29dd907a8eb4171fa0a274c16"},
+    {file = "pygame-2.0.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3270cbcff40ca2b5622a145346298a33285c91b6d50097e0b85123d9a2bc7c9b"},
+    {file = "pygame-2.0.1-cp35-cp35m-win32.whl", hash = "sha256:72625dc949c6d08ba7ce7c37a33163bb498d90ca0d7e626db3cfbf486df4db1d"},
+    {file = "pygame-2.0.1-cp35-cp35m-win_amd64.whl", hash = "sha256:bf833c853a0568738ee5d88e1345c17bf3e8db626c36fb895327a35bb1827b0b"},
+    {file = "pygame-2.0.1-cp36-cp36m-macosx_10_9_intel.whl", hash = "sha256:44f3ff8224d7cb998642400371c685005c8316b55e87794cbf1f6407b88ec424"},
+    {file = "pygame-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:eab18df58dcc8512f1b694f7218146828d7e3dd3f4e73bfd6942a11810293fd5"},
+    {file = "pygame-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:19357c826ab94f9ae5b4ec5cb752cc806cfc29ea32cf7bdaacb65fa2615607e8"},
+    {file = "pygame-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:328d70d40bc9a6defb9f330f5e7f3d0726af1e7c2308ebca582e69480db2950d"},
+    {file = "pygame-2.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:81510ffb1c31a3827c6be047b1926d81caf36dc734564ca0e14903d6bce60c6f"},
+    {file = "pygame-2.0.1-cp37-cp37m-macosx_10_9_intel.whl", hash = "sha256:ece424c83a575c2e0ba25815871458d3bbade46d76b7997236fb51a0251229ab"},
+    {file = "pygame-2.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e72cdc97a49509ca2298350c2c3a0ac26bc8e943ce003a7d245df42e91439d5d"},
+    {file = "pygame-2.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:9fd0691c4fe58b932674bb6a91d2808790e8269c3183ef16052f13e1c602ac00"},
+    {file = "pygame-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:4f41252dfa1e8bb95f2ea51fba710827dde9820a535623d002a65621bafe7e3f"},
+    {file = "pygame-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:3b31d088129977885f72037c55cfa1140e9bcf3468e68b46141f6cc2b33d456b"},
+    {file = "pygame-2.0.1-cp38-cp38-macosx_10_9_intel.whl", hash = "sha256:30eb5c7adb0b3362024cec2c461be6978fbfc99c3bca974e438b1b540cd09438"},
+    {file = "pygame-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:845385caf99f8d941607791c60e560d24b4a35c70eef0b01c30cfde0b913ff92"},
+    {file = "pygame-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3c2676b4fd278d632037eacd3b0524ce1a592c048e8e5eb5830475f83585cb3a"},
+    {file = "pygame-2.0.1-cp38-cp38-win32.whl", hash = "sha256:4d3135a1f8c76c3fff1ef8b7a51e4c6523748e9bdbd7bca6daa69790ce0e798a"},
+    {file = "pygame-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:898874521a9be1f9dbc5b036a9755803287c2664e335afd3e10963f7f4ccb853"},
+    {file = "pygame-2.0.1-cp39-cp39-macosx_10_9_intel.whl", hash = "sha256:ed80b40da839d60f4c03915bb3638e3c96ea8c30e689d0cc309b7597d82cc217"},
+    {file = "pygame-2.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:fd6acd09d2a0fd3f616b18f977f399ed3dd95e2d6754f115837f026d19d62e10"},
+    {file = "pygame-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:d8059084ce54b2c3d7b2c8bacd5f6490db849b2d2d6e7368c160b08504c87e73"},
+    {file = "pygame-2.0.1-cp39-cp39-win32.whl", hash = "sha256:107d5f82f471baee4b9522a691cb320dd52dbf329ed7a0e9ab25f75cd3caf890"},
+    {file = "pygame-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:07ed57062be4bb9741f57dab1751d95574c091c9958ed7e39cdb246d50903283"},
+    {file = "pygame-2.0.1-pp27-pypy_73-manylinux2010_i686.whl", hash = "sha256:5f057e5aa4c383fcf18560dcae2c5593e37e3fc941083a0a00a17f7cf25ee522"},
+    {file = "pygame-2.0.1-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:c188ce4bf1544f2758e8b651f4349a0f3dc441e09d8ab7c4863db1ae8f084a32"},
+    {file = "pygame-2.0.1-pp27-pypy_73-win32.whl", hash = "sha256:7ea518d8eeb072c77c16977cdee3c59d9fffa750ed9c7c9c533ba520b6b08af7"},
+    {file = "pygame-2.0.1-pp36-pypy36_pp73-macosx_10_9_intel.whl", hash = "sha256:10ca736eecedadf492ba1191f9fa3a5e6f30db2b9f8882b3ee7706d5a89c14e0"},
+    {file = "pygame-2.0.1-pp36-pypy36_pp73-manylinux2010_i686.whl", hash = "sha256:5afc34f0af0cec09a20b6bb09090054fac5169ab01909e01b06e7e0752ab0153"},
+    {file = "pygame-2.0.1-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:2d9b0a66034fed390ee367a549435853502c9d4fe82ac0fa3a520f0ad5648e6e"},
+    {file = "pygame-2.0.1-pp36-pypy36_pp73-win32.whl", hash = "sha256:9de559462aaa68c40bb7625dcd587584b4eb85c4208528dc97b9ee7254945294"},
+    {file = "pygame-2.0.1-pp37-pypy37_pp73-macosx_10_9_intel.whl", hash = "sha256:9f48277de1daa83fd58a722b2e3423201b5eb39842227f32702fb78e4bba5a71"},
+    {file = "pygame-2.0.1-pp37-pypy37_pp73-manylinux2010_i686.whl", hash = "sha256:a4e35d89b6754941e82df1ce980a1c370943d3c076938d94ed1e48165dd6a11b"},
+    {file = "pygame-2.0.1-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:b812285d23b5644c643a6ae30553a772f935f47f61826660b108b8727936384b"},
+    {file = "pygame-2.0.1.tar.gz", hash = "sha256:8b1e7b63f47aafcdd8849933b206778747ef1802bd3d526aca45ed77141e4001"},
+]
+pyparsing = [
+    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
+    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
+]
+pyreadline = [
+    {file = "pyreadline-2.1.win-amd64.exe", hash = "sha256:9ce5fa65b8992dfa373bddc5b6e0864ead8f291c94fbfec05fbd5c836162e67b"},
+    {file = "pyreadline-2.1.win32.exe", hash = "sha256:65540c21bfe14405a3a77e4c085ecfce88724743a4ead47c66b84defcf82c32e"},
+    {file = "pyreadline-2.1.zip", hash = "sha256:4530592fc2e85b25b1a9f79664433da09237c1a270e4d78ea5aa3a2c7229e2d1"},
+]
+python-dateutil = [
+    {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"},
+    {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"},
+]
+pytz = [
+    {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"},
+    {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"},
+]
+pywavelets = [
+    {file = "PyWavelets-1.1.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:35959c041ec014648575085a97b498eafbbaa824f86f6e4a59bfdef8a3fe6308"},
+    {file = "PyWavelets-1.1.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:55e39ec848ceec13c9fa1598253ae9dd5c31d09dfd48059462860d2b908fb224"},
+    {file = "PyWavelets-1.1.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c06d2e340c7bf8b9ec71da2284beab8519a3908eab031f4ea126e8ccfc3fd567"},
+    {file = "PyWavelets-1.1.1-cp35-cp35m-win32.whl", hash = "sha256:be105382961745f88d8196bba5a69ee2c4455d87ad2a2e5d1eed6bd7fda4d3fd"},
+    {file = "PyWavelets-1.1.1-cp35-cp35m-win_amd64.whl", hash = "sha256:076ca8907001fdfe4205484f719d12b4a0262dfe6652fa1cfc3c5c362d14dc84"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:7947e51ca05489b85928af52a34fe67022ab5b81d4ae32a4109a99e883a0635e"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:9e2528823ccf5a0a1d23262dfefe5034dce89cd84e4e124dc553dfcdf63ebb92"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:80b924edbc012ded8aa8b91cb2fd6207fb1a9a3a377beb4049b8a07445cec6f0"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c2a799e79cee81a862216c47e5623c97b95f1abee8dd1f9eed736df23fb653fb"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-win32.whl", hash = "sha256:d510aef84d9852653d079c84f2f81a82d5d09815e625f35c95714e7364570ad4"},
+    {file = "PyWavelets-1.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:889d4c5c5205a9c90118c1980df526857929841df33e4cd1ff1eff77c6817a65"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:68b5c33741d26c827074b3d8f0251de1c3019bb9567b8d303eb093c822ce28f1"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:18a51b3f9416a2ae6e9a35c4af32cf520dd7895f2b69714f4aa2f4342fca47f9"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:cfe79844526dd92e3ecc9490b5031fca5f8ab607e1e858feba232b1b788ff0ea"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:2f7429eeb5bf9c7068002d0d7f094ed654c77a70ce5e6198737fd68ab85f8311"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-win32.whl", hash = "sha256:720dbcdd3d91c6dfead79c80bf8b00a1d8aa4e5d551dc528c6d5151e4efc3403"},
+    {file = "PyWavelets-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:bc5e87b72371da87c9bebc68e54882aada9c3114e640de180f62d5da95749cd3"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:98b2669c5af842a70cfab33a7043fcb5e7535a690a00cd251b44c9be0be418e5"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e02a0558e0c2ac8b8bbe6a6ac18c136767ec56b96a321e0dfde2173adfa5a504"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6162dc0ae04669ea04b4b51420777b9ea2d30b0a9d02901b2a3b4d61d159c2e9"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:39c74740718e420d38c78ca4498568fa57976d78d5096277358e0fa9629a7aea"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-win32.whl", hash = "sha256:79f5b54f9dc353e5ee47f0c3f02bebd2c899d49780633aa771fed43fa20b3149"},
+    {file = "PyWavelets-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:935ff247b8b78bdf77647fee962b1cc208c51a7b229db30b9ba5f6da3e675178"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6ebfefebb5c6494a3af41ad8c60248a95da267a24b79ed143723d4502b1fe4d7"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:6bc78fb9c42a716309b4ace56f51965d8b5662c3ba19d4591749f31773db1125"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:411e17ca6ed8cf5e18a7ca5ee06a91c25800cc6c58c77986202abf98d749273a"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:83c5e3eb78ce111c2f0b45f46106cc697c3cb6c4e5f51308e1f81b512c70c8fb"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-win32.whl", hash = "sha256:2b634a54241c190ee989a4af87669d377b37c91bcc9cf0efe33c10ff847f7841"},
+    {file = "PyWavelets-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:732bab78435c48be5d6bc75486ef629d7c8f112e07b313bf1f1a2220ab437277"},
+    {file = "PyWavelets-1.1.1.tar.gz", hash = "sha256:1a64b40f6acb4ffbaccce0545d7fc641744f95351f62e4c6aaa40549326008c9"},
+]
+pyyaml = [
+    {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"},
+    {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"},
+    {file = "PyYAML-5.4.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8"},
+    {file = "PyYAML-5.4.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185"},
+    {file = "PyYAML-5.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253"},
+    {file = "PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc"},
+    {file = "PyYAML-5.4.1-cp36-cp36m-win32.whl", hash = "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5"},
+    {file = "PyYAML-5.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df"},
+    {file = "PyYAML-5.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018"},
+    {file = "PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63"},
+    {file = "PyYAML-5.4.1-cp37-cp37m-win32.whl", hash = "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b"},
+    {file = "PyYAML-5.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf"},
+    {file = "PyYAML-5.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46"},
+    {file = "PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb"},
+    {file = "PyYAML-5.4.1-cp38-cp38-win32.whl", hash = "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc"},
+    {file = "PyYAML-5.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696"},
+    {file = "PyYAML-5.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77"},
+    {file = "PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183"},
+    {file = "PyYAML-5.4.1-cp39-cp39-win32.whl", hash = "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10"},
+    {file = "PyYAML-5.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db"},
+    {file = "PyYAML-5.4.1.tar.gz", hash = "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e"},
+]
+pyzmq = [
+    {file = "pyzmq-22.0.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:c0cde362075ee8f3d2b0353b283e203c2200243b5a15d5c5c03b78112a17e7d4"},
+    {file = "pyzmq-22.0.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:ff1ea14075bbddd6f29bf6beb8a46d0db779bcec6b9820909584081ec119f8fd"},
+    {file = "pyzmq-22.0.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:26380487eae4034d6c2a3fb8d0f2dff6dd0d9dd711894e8d25aa2d1938950a33"},
+    {file = "pyzmq-22.0.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:3e29f9cf85a40d521d048b55c63f59d6c772ac1c4bf51cdfc23b62a62e377c33"},
+    {file = "pyzmq-22.0.3-cp36-cp36m-win32.whl", hash = "sha256:4f34a173f813b38b83f058e267e30465ed64b22cd0cf6bad21148d3fa718f9bb"},
+    {file = "pyzmq-22.0.3-cp36-cp36m-win_amd64.whl", hash = "sha256:30df70f81fe210506aa354d7fd486a39b87d9f7f24c3d3f4f698ec5d96b8c084"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7026f0353977431fc884abd4ac28268894bd1a780ba84bb266d470b0ec26d2ed"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6d4163704201fff0f3ab0cd5d7a0ea1514ecfffd3926d62ec7e740a04d2012c7"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:763c175294d861869f18eb42901d500eda7d3fa4565f160b3b2fd2678ea0ebab"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:61e4bb6cd60caf1abcd796c3f48395e22c5b486eeca6f3a8797975c57d94b03e"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-win32.whl", hash = "sha256:b25e5d339550a850f7e919fe8cb4c8eabe4c917613db48dab3df19bfb9a28969"},
+    {file = "pyzmq-22.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:3ef50d74469b03725d781a2a03c57537d86847ccde587130fe35caafea8f75c6"},
+    {file = "pyzmq-22.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60e63577b85055e4cc43892fecd877b86695ee3ef12d5d10a3c5d6e77a7cc1a3"},
+    {file = "pyzmq-22.0.3-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:f5831eff6b125992ec65d973f5151c48003b6754030094723ac4c6e80a97c8c4"},
+    {file = "pyzmq-22.0.3-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:9221783dacb419604d5345d0e097bddef4459a9a95322de6c306bf1d9896559f"},
+    {file = "pyzmq-22.0.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b62ea18c0458a65ccd5be90f276f7a5a3f26a6dea0066d948ce2fa896051420f"},
+    {file = "pyzmq-22.0.3-cp38-cp38-win32.whl", hash = "sha256:81e7df0da456206201e226491aa1fc449da85328bf33bbeec2c03bb3a9f18324"},
+    {file = "pyzmq-22.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:f52070871a0fd90a99130babf21f8af192304ec1e995bec2a9533efc21ea4452"},
+    {file = "pyzmq-22.0.3-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:c5e29fe4678f97ce429f076a2a049a3d0b2660ada8f2c621e5dc9939426056dd"},
+    {file = "pyzmq-22.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d18ddc6741b51f3985978f2fda57ddcdae359662d7a6b395bc8ff2292fca14bd"},
+    {file = "pyzmq-22.0.3-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4231943514812dfb74f44eadcf85e8dd8cf302b4d0bce450ce1357cac88dbfdc"},
+    {file = "pyzmq-22.0.3-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:23a74de4b43c05c3044aeba0d1f3970def8f916151a712a3ac1e5cd9c0bc2902"},
+    {file = "pyzmq-22.0.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:532af3e6dddea62d9c49062ece5add998c9823c2419da943cf95589f56737de0"},
+    {file = "pyzmq-22.0.3-cp39-cp39-win32.whl", hash = "sha256:33acd2b9790818b9d00526135acf12790649d8d34b2b04d64558b469c9d86820"},
+    {file = "pyzmq-22.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:a558c5bc89d56d7253187dccc4e81b5bb0eac5ae9511eb4951910a1245d04622"},
+    {file = "pyzmq-22.0.3-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:581787c62eaa0e0db6c5413cedc393ebbadac6ddfd22e1cf9a60da23c4f1a4b2"},
+    {file = "pyzmq-22.0.3-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:38e3dca75d81bec4f2defa14b0a65b74545812bb519a8e89c8df96bbf4639356"},
+    {file = "pyzmq-22.0.3-pp36-pypy36_pp73-win32.whl", hash = "sha256:2f971431aaebe0a8b54ac018e041c2f0b949a43745444e4dadcc80d0f0ef8457"},
+    {file = "pyzmq-22.0.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:da7d4d4c778c86b60949d17531e60c54ed3726878de8a7f8a6d6e7f8cc8c3205"},
+    {file = "pyzmq-22.0.3-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:13465c1ff969cab328bc92f7015ce3843f6e35f8871ad79d236e4fbc85dbe4cb"},
+    {file = "pyzmq-22.0.3-pp37-pypy37_pp73-win32.whl", hash = "sha256:279cc9b51db48bec2db146f38e336049ac5a59e5f12fb3a8ad864e238c1c62e3"},
+    {file = "pyzmq-22.0.3.tar.gz", hash = "sha256:f7f63ce127980d40f3e6a5fdb87abf17ce1a7c2bd8bf2c7560e1bbce8ab1f92d"},
+]
+requests = [
+    {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
+    {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
+]
+scikit-image = [
+    {file = "scikit-image-0.18.1.tar.gz", hash = "sha256:fbb618ca911867bce45574c1639618cdfb5d94e207432b19bc19563d80d2f171"},
+    {file = "scikit_image-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1cd05c882ffb2a271a1f20b4afe937d63d55b8753c3d652f11495883a7800ebe"},
+    {file = "scikit_image-0.18.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e972c628ad9ba52c298b032368e29af9bd5eeb81ce33bc2d9b039a81661c99c5"},
+    {file = "scikit_image-0.18.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:1256017c513e8e1b8b9da73e5fd1e605d0077bbbc8e5c8d6c2cab36400131c6c"},
+    {file = "scikit_image-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:ec25e4110951d3a280421bb10dd510a082ba83d86e20d706294faf7899cdb3d5"},
+    {file = "scikit_image-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:2eea42706a25ae6e0cebaf1914e2ab1c04061b1f3c9966d76025d58a2e9188fc"},
+    {file = "scikit_image-0.18.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:76446e2402e64d7dba78eeae8aa86e92a0cafe5b1c9e6235bd8d067471ed2788"},
+    {file = "scikit_image-0.18.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:d5ad4a9b4c9797d4c4c48f45fa224c5ebff22b9b0af636c3ecb8addbb66c21e6"},
+    {file = "scikit_image-0.18.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:23f9178b21c752bfb4e4ea3a3fa0ff79bc5a401bc75ddb4661f2cebd1c2b0e24"},
+    {file = "scikit_image-0.18.1-cp38-cp38-win32.whl", hash = "sha256:d746540cafe7776c6d05a0b40ec744bb8d33d1ddc51faba601d26c02593d8bcc"},
+    {file = "scikit_image-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:30447af3f5b7c9491f2d3db5bc275493d1b91bf1dd16b67e2fd79a6bb95d8ee9"},
+    {file = "scikit_image-0.18.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ae6659b3a8bd4bba7e9dcbfd0064e443b32c7054bf09174749db896730fcf42e"},
+    {file = "scikit_image-0.18.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c058770c6ad6e0fe6c30f59970c9c65fa740ff014d121d8c341664cd792cf49"},
+    {file = "scikit_image-0.18.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:c700336a7f96109c74154090c5e693693a8e3fa09ed6156a5996cdc9a3bb1534"},
+    {file = "scikit_image-0.18.1-cp39-cp39-win32.whl", hash = "sha256:3515b890e771f99bbe1051a0dcfe0fc477da961da933c34f89808a0f1eeb7dc2"},
+    {file = "scikit_image-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:5f602779258807d03e72c0a439cfb221f647e628be166fb3594397435f13c76b"},
+]
+scipy = [
+    {file = "scipy-1.6.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:77f7a057724545b7e097bfdca5c6006bed8580768cd6621bb1330aedf49afba5"},
+    {file = "scipy-1.6.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e547f84cd52343ac2d56df0ab08d3e9cc202338e7d09fafe286d6c069ddacb31"},
+    {file = "scipy-1.6.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bc52d4d70863141bb7e2f8fd4d98e41d77375606cde50af65f1243ce2d7853e8"},
+    {file = "scipy-1.6.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:adf7cee8e5c92b05f2252af498f77c7214a2296d009fc5478fc432c2f8fb953b"},
+    {file = "scipy-1.6.2-cp37-cp37m-win32.whl", hash = "sha256:e3e9742bad925c421d39e699daa8d396c57535582cba90017d17f926b61c1552"},
+    {file = "scipy-1.6.2-cp37-cp37m-win_amd64.whl", hash = "sha256:ffdfb09315896c6e9ac739bb6e13a19255b698c24e6b28314426fd40a1180822"},
+    {file = "scipy-1.6.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6ca1058cb5bd45388041a7c3c11c4b2bd58867ac9db71db912501df77be2c4a4"},
+    {file = "scipy-1.6.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:993c86513272bc84c451349b10ee4376652ab21f312b0554fdee831d593b6c02"},
+    {file = "scipy-1.6.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:37f4c2fb904c0ba54163e03993ce3544c9c5cde104bcf90614f17d85bdfbb431"},
+    {file = "scipy-1.6.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:96620240b393d155097618bcd6935d7578e85959e55e3105490bbbf2f594c7ad"},
+    {file = "scipy-1.6.2-cp38-cp38-win32.whl", hash = "sha256:03f1fd3574d544456325dae502facdf5c9f81cbfe12808a5e67a737613b7ba8c"},
+    {file = "scipy-1.6.2-cp38-cp38-win_amd64.whl", hash = "sha256:0c81ea1a95b4c9e0a8424cf9484b7b8fa7ef57169d7bcc0dfcfc23e3d7c81a12"},
+    {file = "scipy-1.6.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c1d3f771c19af00e1a36f749bd0a0690cc64632783383bc68f77587358feb5a4"},
+    {file = "scipy-1.6.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:50e5bcd9d45262725e652611bb104ac0919fd25ecb78c22f5282afabd0b2e189"},
+    {file = "scipy-1.6.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:816951e73d253a41fa2fd5f956f8e8d9ac94148a9a2039e7db56994520582bf2"},
+    {file = "scipy-1.6.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:1fba8a214c89b995e3721670e66f7053da82e7e5d0fe6b31d8e4b19922a9315e"},
+    {file = "scipy-1.6.2-cp39-cp39-win32.whl", hash = "sha256:e89091e6a8e211269e23f049473b2fde0c0e5ae0dd5bd276c3fc91b97da83480"},
+    {file = "scipy-1.6.2-cp39-cp39-win_amd64.whl", hash = "sha256:d744657c27c128e357de2f0fd532c09c84cd6e4933e8232895a872e67059ac37"},
+    {file = "scipy-1.6.2.tar.gz", hash = "sha256:e9da33e21c9bc1b92c20b5328adb13e5f193b924c9b969cd700c8908f315aa59"},
+]
+seaborn = [
+    {file = "seaborn-0.11.1-py3-none-any.whl", hash = "sha256:4e1cce9489449a1c6ff3c567f2113cdb41122f727e27a984950d004a88ef3c5c"},
+    {file = "seaborn-0.11.1.tar.gz", hash = "sha256:44e78eaed937c5a87fc7a892c329a7cc091060b67ebd1d0d306b446a74ba01ad"},
+]
+six = [
+    {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
+    {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
+]
+sqlalchemy = [
+    {file = "SQLAlchemy-1.4.6-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:432e98e6fe0d24e8181eb4177e59cba9f8831dcaf272a0d2de75bc8b933952a0"},
+    {file = "SQLAlchemy-1.4.6-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:534c71caa87c7fdb136ce5073fb42b732a4eb390946f503d8e1d7ce6a4a79100"},
+    {file = "SQLAlchemy-1.4.6-cp27-cp27m-win32.whl", hash = "sha256:4eeff8b12c7d22be4de98721bba5a042875f4365e9fd20dc3916eec474ccb81e"},
+    {file = "SQLAlchemy-1.4.6-cp27-cp27m-win_amd64.whl", hash = "sha256:e5267cd2e51ddefbe10bb182c36ba41cdaa51c83a0fdfa63ed8cbe89cbcf0f33"},
+    {file = "SQLAlchemy-1.4.6-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:66467123c220689d55c6d51fdf88f7b0b62b8078823c5f6c0297ab47c22003d7"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:933427a5474e014d01bac93224cd4e2bc7bbc7ce531d0bd7e55e4f940cc8ce0d"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:43fef20dd1024409375cc646a4b5afaffb62f6488e41588cde2a1ed2e9432b5b"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:8a296bbf367867aee2ea8d5b391cb04fbdb3ca7277cd1649d9e8114620f3b090"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:2e65c1146f5b4151cc6e553d9847299c97f53640d94ba88b1c534e15cdc6ac38"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:88d75ea6b4330a6f5596a49904f21762ff89ca763db065d63b815ad8c3d68952"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-win32.whl", hash = "sha256:a69787f7fc87b84df7e2f27158476cdf39a79ebb95af1d6f696e474724af9ebe"},
+    {file = "SQLAlchemy-1.4.6-cp36-cp36m-win_amd64.whl", hash = "sha256:21becd8b45ec70b703239cf915104e47889c2aad96d0f68f597b9b547cbfd787"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:c719f0058951457a7761bb69c2e47781a9989ab4819b7a30b6b39141ad013a5f"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a76c10b467f7d385e4cffe2185d975336acf0dbf24ed702c46207df0fb64055e"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:a687e552ab4ffedcf3ec3bd5256ab3e753b4f605b467e9fa39690b2dadb5f607"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:013b659efe02f0f58e7f759602584899c921c178c6a972978f16460dcdd782d5"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:6c4af3aceeff6a0e2bd3657d8b25714a9f7c7c606e7ec52029284973094f84c1"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-win32.whl", hash = "sha256:2a042c27b1a32a87f4cead53bcdd28999324992650896094368a595165b31d97"},
+    {file = "SQLAlchemy-1.4.6-cp37-cp37m-win_amd64.whl", hash = "sha256:4387ebd5ae8bc2c716dbfc1ece769c867307eeecc192e72a4d2e7fa0fc092646"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:7d252dea33c1ee07b3d702fb4962963996ea40e5a2615dbe7646ccabd851ac76"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:d7684e0598acfbfb5110bea482d8c5e94f52001d6d66b5558177f41f49fb5930"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:cc3c0d87b11ae1dd1ccbd6fc7875a290b3f73b771254180c2e7b19c2aec7379b"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d42b8e2bffdf9e01d66cf46472b938493b854ea790a0fbe2e2e42624fc253b33"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:360a771b538463053383fb6ff7aceffb595248d7059bb9e003bf70562a66510d"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-win32.whl", hash = "sha256:86a7321636f851c6e8009901c5d67e97d82b86ee8c6f28a476691c41c3d71a95"},
+    {file = "SQLAlchemy-1.4.6-cp38-cp38-win_amd64.whl", hash = "sha256:bdeb300bb9adc02f98957cd0cf0c38d641bdd435b0927e39870a772e0a750bc0"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:4d71ee83441826fb48771e58cef51191500a87734b4acb6b698ca018479395bd"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:93f6fe67a76d7fa1cca3b9febb36e9f2dd76055230e2bfa317969532f34c03ab"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:345c201324066b789804411f07eea750e9f29872be052eba221ce76add647d50"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:2071ee6cd9390a9527a80ef03458fb58e0166bb299db2c62f9d688b6772d76a1"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cadb58aeadd9916e79e8f99a49d0c0a9e61ae2b24469c2b304a0699e41a25e59"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-win32.whl", hash = "sha256:2713b338d9c54d2c3c7ff4f7786a40a5ca85013c8ccea00327b034d42598e22e"},
+    {file = "SQLAlchemy-1.4.6-cp39-cp39-win_amd64.whl", hash = "sha256:b093bd6efb49332021714bed5752e784a34ae6d6896ec56ffdc32cc83275a215"},
+    {file = "SQLAlchemy-1.4.6.tar.gz", hash = "sha256:193c3ca465fbc68de071995a461ab535466f041089d372ee6a6f0aae7b9307e6"},
+]
+svgpathtools = [
+    {file = "svgpathtools-1.4.1-py2.py3-none-any.whl", hash = "sha256:6bc0082099ae1d64ae9649e837ccb10ae302520114da60eabf7360e5e46a12ed"},
+    {file = "svgpathtools-1.4.1.tar.gz", hash = "sha256:7aaba07923ad85b6628301e92e5e72fd0d0a5057620e1423509b3a0b609b7485"},
+]
+svgwrite = [
+    {file = "svgwrite-1.4.1-py3-none-any.whl", hash = "sha256:4b21652a1d9c543a6bf4f9f2a54146b214519b7540ca60cb99968ad09ef631d0"},
+    {file = "svgwrite-1.4.1.zip", hash = "sha256:e220a4bf189e7e214a55e8a11421d152b5b6fb1dd660c86a8b6b61fe8cc2ac48"},
+]
+tifffile = [
+    {file = "tifffile-2021.3.31-py3-none-any.whl", hash = "sha256:e0182c4f819688cad03788006512295875565127b7a7eeab0993304e2aa33c76"},
+    {file = "tifffile-2021.3.31.tar.gz", hash = "sha256:3a966053e09a89317e6c9bdf99db4bf5c4d3d611ca8ac455024d7824ea5772b3"},
+]
+torch = [
+    {file = "torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f23eeb1a48cc39209d986c418ad7e02227eee973da45c0c42d36b1aec72f4940"},
+    {file = "torch-1.8.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:4ace9c5bb94d5a7b9582cd089993201658466e9c59ff88bd4e9e08f6f072d1cf"},
+    {file = "torch-1.8.1-cp36-cp36m-win_amd64.whl", hash = "sha256:6ffa1e7ae079c7cb828712cb0cdaae5cc4fb87c16a607e6d14526b62c20bcc17"},
+    {file = "torch-1.8.1-cp36-none-macosx_10_9_x86_64.whl", hash = "sha256:16f2630d9604c4ee28ea7d6e388e2264cd7bc6031c6ecd796bae3f56b5efa9a3"},
+    {file = "torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:95b7bbbacc3f28fe438f418392ceeae146a01adc03b29d44917d55214ac234c9"},
+    {file = "torch-1.8.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:55137feb2f5a0dc7aced5bba690dcdb7652054ad3452b09a2bbb59f02a11e9ff"},
+    {file = "torch-1.8.1-cp37-cp37m-win_amd64.whl", hash = "sha256:8ad2252bf09833dcf46a536a78544e349b8256a370e03a98627ebfb118d9555b"},
+    {file = "torch-1.8.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:1388b30fbd262c1a053d6c9ace73bb0bd8f5871b4892b6f3e02d1d7bc9768563"},
+    {file = "torch-1.8.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:e7ad1649adb7dc2a450e70a3e51240b84fa4746c69c8f98989ce0c254f9fba3a"},
+    {file = "torch-1.8.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:3e4190c04dfd89c59bad06d5fe451446643a65e6d2607cc989eb1001ee76e12f"},
+    {file = "torch-1.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:5c2e9a33d44cdb93ebd739b127ffd7da786bf5f740539539195195b186a05f6c"},
+    {file = "torch-1.8.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c6ede2ae4dcd8214b63e047efabafa92493605205a947574cf358216ca4e440a"},
+    {file = "torch-1.8.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:ce7d435426f3dd14f95710d779aa46e9cd5e077d512488e813f7589fdc024f78"},
+    {file = "torch-1.8.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a50ea8ed900927fb30cadb63aa7a32fdd59c7d7abe5012348dfbe35a8355c083"},
+    {file = "torch-1.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:dac4d10494e74f7e553c92d7263e19ea501742c4825ddd26c4decfa27be95981"},
+    {file = "torch-1.8.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:225ee4238c019b28369c71977327deeeb2bd1c6b8557e6fcf631b8866bdc5447"},
+]
+torch-tools = [
+    {file = "torch-tools-0.1.5.tar.gz", hash = "sha256:9e6902158ed9bcbb57ee494c946e0c0a8197743408630a109bf673f4dc7fa3cc"},
+    {file = "torch_tools-0.1.5-py3-none-any.whl", hash = "sha256:f77b444df53c99629ba9fc8c3be370de3f2830877744d0cc376c93c2ebc1d15f"},
+]
+torchfile = [
+    {file = "torchfile-0.1.0.tar.gz", hash = "sha256:a53dfe134b737845a9f2cb24fe0585317874f965932cebdb0439d13c8da4136e"},
+]
+torchvision = [
+    {file = "torchvision-0.9.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:da4c4f7363b60b0637354974ea0a29dbc301f66c9f25d92ed5f10637909f3500"},
+    {file = "torchvision-0.9.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8a937cd3b53656e15de03671f8a638b5e8e4c100725b854d73bdb51e41455e9e"},
+    {file = "torchvision-0.9.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f16ceec2862faaffc8fc19bca20e0e79ffdab18a53e6cb75e42e33d090e80d04"},
+    {file = "torchvision-0.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:99cd75163938b4b3728815696d75c0df8b66390c489abed2365a530a040059a1"},
+    {file = "torchvision-0.9.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8aa438869e3033cbd8749d041d1ca7beb6171ca9f7f47b42e742fabd6900f8fc"},
+    {file = "torchvision-0.9.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a1421a26b21b8c098935c3375182470c4c4d99d5e14d81ec3ac14a35e7a85285"},
+    {file = "torchvision-0.9.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b14b5b7fed0b7dc6245c2608b9fd2262d5b375ba998e097b980a1046683ca7f6"},
+    {file = "torchvision-0.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:86e4facb1cf4670ab3d67b7a947f0c43cd0805ec269a5e22ad0b82be727bcb3b"},
+    {file = "torchvision-0.9.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d38d0d23c6ce6ba15eba094a9319393e429796ab2bab228fa3b996abc9e33c3f"},
+    {file = "torchvision-0.9.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0bfcc3ab99128081bfc9a5c3ab31f5227c4df3b802e6d4217dac104bf5ba8636"},
+    {file = "torchvision-0.9.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:85f21862e504590eb4a77b1d9a1742156a296af55827fb8c82296601922b7ac1"},
+    {file = "torchvision-0.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:dda0dcb914bcab1a43f823348736b8b1c926bf1fbe9cbb3be892fdbe2ab6d097"},
+    {file = "torchvision-0.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:091812c9fa405bef12aca9b9c3e671fcae7c0a4945b68705534ba8a401396ad1"},
+    {file = "torchvision-0.9.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:46b82b6cdccd2cb982819165b6ddaa097629315377ba6bbf77bdcb02c2e83692"},
+    {file = "torchvision-0.9.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:92c936584e03dfca39ff31bbc4a4fb54edb08fe8362e75dc08a2fa4b43266068"},
+    {file = "torchvision-0.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:42bec9b8e8a1dcd478751457191f317f843fa463555c141994c809c4b11ad60d"},
+]
+tornado = [
+    {file = "tornado-6.1-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:d371e811d6b156d82aa5f9a4e08b58debf97c302a35714f6f45e35139c332e32"},
+    {file = "tornado-6.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:0d321a39c36e5f2c4ff12b4ed58d41390460f798422c4504e09eb5678e09998c"},
+    {file = "tornado-6.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9de9e5188a782be6b1ce866e8a51bc76a0fbaa0e16613823fc38e4fc2556ad05"},
+    {file = "tornado-6.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:61b32d06ae8a036a6607805e6720ef00a3c98207038444ba7fd3d169cd998910"},
+    {file = "tornado-6.1-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:3e63498f680547ed24d2c71e6497f24bca791aca2fe116dbc2bd0ac7f191691b"},
+    {file = "tornado-6.1-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:6c77c9937962577a6a76917845d06af6ab9197702a42e1346d8ae2e76b5e3675"},
+    {file = "tornado-6.1-cp35-cp35m-win32.whl", hash = "sha256:6286efab1ed6e74b7028327365cf7346b1d777d63ab30e21a0f4d5b275fc17d5"},
+    {file = "tornado-6.1-cp35-cp35m-win_amd64.whl", hash = "sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68"},
+    {file = "tornado-6.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0a00ff4561e2929a2c37ce706cb8233b7907e0cdc22eab98888aca5dd3775feb"},
+    {file = "tornado-6.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:748290bf9112b581c525e6e6d3820621ff020ed95af6f17fedef416b27ed564c"},
+    {file = "tornado-6.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:e385b637ac3acaae8022e7e47dfa7b83d3620e432e3ecb9a3f7f58f150e50921"},
+    {file = "tornado-6.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:25ad220258349a12ae87ede08a7b04aca51237721f63b1808d39bdb4b2164558"},
+    {file = "tornado-6.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:65d98939f1a2e74b58839f8c4dab3b6b3c1ce84972ae712be02845e65391ac7c"},
+    {file = "tornado-6.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:e519d64089b0876c7b467274468709dadf11e41d65f63bba207e04217f47c085"},
+    {file = "tornado-6.1-cp36-cp36m-win32.whl", hash = "sha256:b87936fd2c317b6ee08a5741ea06b9d11a6074ef4cc42e031bc6403f82a32575"},
+    {file = "tornado-6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:cc0ee35043162abbf717b7df924597ade8e5395e7b66d18270116f8745ceb795"},
+    {file = "tornado-6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7250a3fa399f08ec9cb3f7b1b987955d17e044f1ade821b32e5f435130250d7f"},
+    {file = "tornado-6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:ed3ad863b1b40cd1d4bd21e7498329ccaece75db5a5bf58cd3c9f130843e7102"},
+    {file = "tornado-6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:dcef026f608f678c118779cd6591c8af6e9b4155c44e0d1bc0c87c036fb8c8c4"},
+    {file = "tornado-6.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:70dec29e8ac485dbf57481baee40781c63e381bebea080991893cd297742b8fd"},
+    {file = "tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d3f7594930c423fd9f5d1a76bee85a2c36fd8b4b16921cae7e965f22575e9c01"},
+    {file = "tornado-6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3447475585bae2e77ecb832fc0300c3695516a47d46cefa0528181a34c5b9d3d"},
+    {file = "tornado-6.1-cp37-cp37m-win32.whl", hash = "sha256:e7229e60ac41a1202444497ddde70a48d33909e484f96eb0da9baf8dc68541df"},
+    {file = "tornado-6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:cb5ec8eead331e3bb4ce8066cf06d2dfef1bfb1b2a73082dfe8a161301b76e37"},
+    {file = "tornado-6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:20241b3cb4f425e971cb0a8e4ffc9b0a861530ae3c52f2b0434e6c1b57e9fd95"},
+    {file = "tornado-6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c77da1263aa361938476f04c4b6c8916001b90b2c2fdd92d8d535e1af48fba5a"},
+    {file = "tornado-6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5"},
+    {file = "tornado-6.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:1e8225a1070cd8eec59a996c43229fe8f95689cb16e552d130b9793cb570a288"},
+    {file = "tornado-6.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:d14d30e7f46a0476efb0deb5b61343b1526f73ebb5ed84f23dc794bdb88f9d9f"},
+    {file = "tornado-6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8f959b26f2634a091bb42241c3ed8d3cedb506e7c27b8dd5c7b9f745318ddbb6"},
+    {file = "tornado-6.1-cp38-cp38-win32.whl", hash = "sha256:34ca2dac9e4d7afb0bed4677512e36a52f09caa6fded70b4e3e1c89dbd92c326"},
+    {file = "tornado-6.1-cp38-cp38-win_amd64.whl", hash = "sha256:6196a5c39286cc37c024cd78834fb9345e464525d8991c21e908cc046d1cc02c"},
+    {file = "tornado-6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f0ba29bafd8e7e22920567ce0d232c26d4d47c8b5cf4ed7b562b5db39fa199c5"},
+    {file = "tornado-6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:33892118b165401f291070100d6d09359ca74addda679b60390b09f8ef325ffe"},
+    {file = "tornado-6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7da13da6f985aab7f6f28debab00c67ff9cbacd588e8477034c0652ac141feea"},
+    {file = "tornado-6.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:e0791ac58d91ac58f694d8d2957884df8e4e2f6687cdf367ef7eb7497f79eaa2"},
+    {file = "tornado-6.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:66324e4e1beede9ac79e60f88de548da58b1f8ab4b2f1354d8375774f997e6c0"},
+    {file = "tornado-6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a48900ecea1cbb71b8c71c620dee15b62f85f7c14189bdeee54966fbd9a0c5bd"},
+    {file = "tornado-6.1-cp39-cp39-win32.whl", hash = "sha256:d3d20ea5782ba63ed13bc2b8c291a053c8d807a8fa927d941bd718468f7b950c"},
+    {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"},
+    {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"},
+]
+tqdm = [
+    {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"},
+    {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"},
+]
+typing-extensions = [
+    {file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"},
+    {file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"},
+    {file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"},
+]
+urllib3 = [
+    {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"},
+    {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"},
+]
+visdom = [
+    {file = "visdom-0.1.8.9.tar.gz", hash = "sha256:c73ad23723c24a48156899f78dd76bd4538eba3edf9120b6c65a9528fa677126"},
+]
+websocket-client = [
+    {file = "websocket_client-0.58.0-py2.py3-none-any.whl", hash = "sha256:44b5df8f08c74c3d82d28100fdc81f4536809ce98a17f0757557813275fbb663"},
+    {file = "websocket_client-0.58.0.tar.gz", hash = "sha256:63509b41d158ae5b7f67eb4ad20fecbb4eee99434e73e140354dc3ff8e09716f"},
+]
diff --git a/ptr.h b/ptr.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3f8e43e148d6b0b2abec6a1d4b830a81982f50b
--- /dev/null
+++ b/ptr.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cstddef>
+
+/**
+ * Python doesn't have a pointer type, therefore we create a pointer wrapper
+ * see https://stackoverflow.com/questions/48982143/returning-and-passing-around-raw-pod-pointers-arrays-with-python-c-and-pyb?rq=1
+ */
+template <typename T>
+class ptr {
+public:
+    ptr() : p(nullptr) {}
+    ptr(T* p) : p(p) {}
+    ptr(std::size_t p) : p((T*)p) {}
+    ptr(const ptr& other) : ptr(other.p) {}
+    T* operator->() const { return p; }
+    T* get() const { return p; }
+    void destroy() { delete p; }
+    bool is_null() const { return p == nullptr; }
+    size_t as_size_t() const {return (size_t)p;}
+private:
+    T* p;
+};
diff --git a/pybind11/.appveyor.yml b/pybind11/.appveyor.yml
new file mode 100644
index 0000000000000000000000000000000000000000..149a8a3dc9d0076811810036fdd781722fc83203
--- /dev/null
+++ b/pybind11/.appveyor.yml
@@ -0,0 +1,37 @@
+version: 1.0.{build}
+image:
+- Visual Studio 2015
+test: off
+skip_branch_with_pr: true
+build:
+  parallel: true
+platform:
+- x86
+environment:
+  matrix:
+  - PYTHON: 36
+    CONFIG: Debug
+  - PYTHON: 27
+    CONFIG: Debug
+install:
+- ps: |
+    $env:CMAKE_GENERATOR = "Visual Studio 14 2015"
+    if ($env:PLATFORM -eq "x64") { $env:PYTHON = "$env:PYTHON-x64" }
+    $env:PATH = "C:\Python$env:PYTHON\;C:\Python$env:PYTHON\Scripts\;$env:PATH"
+    python -W ignore -m pip install --upgrade pip wheel
+    python -W ignore -m pip install pytest numpy --no-warn-script-location
+- ps: |
+    Start-FileDownload 'https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip'
+    7z x eigen-3.3.7.zip -y > $null
+    $env:CMAKE_INCLUDE_PATH = "eigen-3.3.7;$env:CMAKE_INCLUDE_PATH"
+build_script:
+- cmake -G "%CMAKE_GENERATOR%" -A "%CMAKE_ARCH%"
+    -DCMAKE_CXX_STANDARD=14
+    -DPYBIND11_WERROR=ON
+    -DDOWNLOAD_CATCH=ON
+    -DCMAKE_SUPPRESS_REGENERATION=1
+    .
+- set MSBuildLogger="C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmake --build . --config %CONFIG% --target pytest -- /m /v:m /logger:%MSBuildLogger%
+- cmake --build . --config %CONFIG% --target cpptest -- /m /v:m /logger:%MSBuildLogger%
+on_failure: if exist "tests\test_cmake_build" type tests\test_cmake_build\*.log*
diff --git a/pybind11/.cmake-format.yaml b/pybind11/.cmake-format.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2a69f3f897da990a4255b593dc3fcdf4b89a764
--- /dev/null
+++ b/pybind11/.cmake-format.yaml
@@ -0,0 +1,73 @@
+parse:
+  additional_commands:
+    pybind11_add_module:
+      flags:
+        - THIN_LTO
+        - MODULE
+        - SHARED
+        - NO_EXTRAS
+        - EXCLUDE_FROM_ALL
+        - SYSTEM
+
+format:
+  line_width: 99
+  tab_size: 2
+
+  # If an argument group contains more than this many sub-groups
+  # (parg or kwarg groups) then force it to a vertical layout.
+  max_subgroups_hwrap: 2
+
+  # If a positional argument group contains more than this many
+  # arguments, then force it to a vertical layout.
+  max_pargs_hwrap: 6
+
+  # If a cmdline positional group consumes more than this many
+  # lines without nesting, then invalidate the layout (and nest)
+  max_rows_cmdline: 2
+  separate_ctrl_name_with_space: false
+  separate_fn_name_with_space: false
+  dangle_parens: false
+
+  # If the trailing parenthesis must be 'dangled' on its on
+  # 'line, then align it to this reference: `prefix`: the start'
+  # 'of the statement,  `prefix-indent`: the start of the'
+  # 'statement, plus one indentation  level, `child`: align to'
+  # the column of the arguments
+  dangle_align: prefix
+  # If the statement spelling length (including space and
+  # parenthesis) is smaller than this amount, then force reject
+  # nested layouts.
+  min_prefix_chars: 4
+
+  # If the statement spelling length (including space and
+  # parenthesis) is larger than the tab width by more than this
+  # amount, then force reject un-nested layouts.
+  max_prefix_chars: 10
+
+  # If a candidate layout is wrapped horizontally but it exceeds
+  # this many lines, then reject the layout.
+  max_lines_hwrap: 2
+
+  line_ending: unix
+
+  # Format command names consistently as 'lower' or 'upper' case
+  command_case: canonical
+
+  # Format keywords consistently as 'lower' or 'upper' case
+  # unchanged is valid too
+  keyword_case: 'upper'
+
+  # A list of command names which should always be wrapped
+  always_wrap: []
+
+  # If true, the argument lists which are known to be sortable
+  # will be sorted lexicographically
+  enable_sort: true
+
+  # If true, the parsers may infer whether or not an argument
+  # list is sortable (without annotation).
+  autosort: false
+
+# Causes a few issues - can be solved later, possibly.
+markup:
+  enable_markup: false
diff --git a/pybind11/.github/CONTRIBUTING.md b/pybind11/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..f61011d54059501e60f41d8da343aab10f259f6a
--- /dev/null
+++ b/pybind11/.github/CONTRIBUTING.md
@@ -0,0 +1,171 @@
+Thank you for your interest in this project! Please refer to the following
+sections on how to contribute code and bug reports.
+
+### Reporting bugs
+
+Before submitting a question or bug report, please take a moment of your time
+and ensure that your issue isn't already discussed in the project documentation
+provided at [pybind11.readthedocs.org][] or in the [issue tracker][]. You can
+also check [gitter][] to see if it came up before.
+
+Assuming that you have identified a previously unknown problem or an important
+question, it's essential that you submit a self-contained and minimal piece of
+code that reproduces the problem. In other words: no external dependencies,
+isolate the function(s) that cause breakage, submit matched and complete C++
+and Python snippets that can be easily compiled and run in isolation; or
+ideally make a small PR with a failing test case that can be used as a starting
+point.
+
+## Pull requests
+
+Contributions are submitted, reviewed, and accepted using GitHub pull requests.
+Please refer to [this article][using pull requests] for details and adhere to
+the following rules to make the process as smooth as possible:
+
+* Make a new branch for every feature you're working on.
+* Make small and clean pull requests that are easy to review but make sure they
+  do add value by themselves.
+* Add tests for any new functionality and run the test suite (`cmake --build
+  build --target pytest`) to ensure that no existing features break.
+* Please run [`pre-commit`][pre-commit] to check your code matches the
+  project style. (Note that `gawk` is required.) Use `pre-commit run
+  --all-files` before committing (or use installed-mode, check pre-commit docs)
+  to verify your code passes before pushing to save time.
+* This project has a strong focus on providing general solutions using a
+  minimal amount of code, thus small pull requests are greatly preferred.
+
+### Licensing of contributions
+
+pybind11 is provided under a BSD-style license that can be found in the
+``LICENSE`` file. By using, distributing, or contributing to this project, you
+agree to the terms and conditions of this license.
+
+You are under no obligation whatsoever to provide any bug fixes, patches, or
+upgrades to the features, functionality or performance of the source code
+("Enhancements") to anyone; however, if you choose to make your Enhancements
+available either publicly, or directly to the author of this software, without
+imposing a separate written license agreement for such Enhancements, then you
+hereby grant the following license: a non-exclusive, royalty-free perpetual
+license to install, use, modify, prepare derivative works, incorporate into
+other computer software, distribute, and sublicense such enhancements or
+derivative works thereof, in binary and source code form.
+
+
+## Development of pybind11
+
+To setup an ideal development environment, run the following commands on a
+system with CMake 3.14+:
+
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r tests/requirements.txt
+cmake -S . -B build -DDOWNLOAD_CATCH=ON -DDOWNLOAD_EIGEN=ON
+cmake --build build -j4
+```
+
+Tips:
+
+* You can use `virtualenv` (from PyPI) instead of `venv` (which is Python 3
+  only).
+* You can select any name for your environment folder; if it contains "env" it
+  will be ignored by git.
+* If you don’t have CMake 3.14+, just add “cmake” to the pip install command.
+* You can use `-DPYBIND11_FINDPYTHON=ON` to use FindPython on CMake 3.12+
+* In classic mode, you may need to set `-DPYTHON_EXECUTABLE=/path/to/python`.
+  FindPython uses `-DPython_ROOT_DIR=/path/to` or
+  `-DPython_EXECUTABLE=/path/to/python`.
+
+### Configuration options
+
+In CMake, configuration options are given with “-D”. Options are stored in the
+build directory, in the `CMakeCache.txt` file, so they are remembered for each
+build directory. Two selections are special - the generator, given with `-G`,
+and the compiler, which is selected based on environment variables `CXX` and
+similar, or `-DCMAKE_CXX_COMPILER=`. Unlike the others, these cannot be changed
+after the initial run.
+
+The valid options are:
+
+* `-DCMAKE_BUILD_TYPE`: Release, Debug, MinSizeRel, RelWithDebInfo
+* `-DPYBIND11_FINDPYTHON=ON`: Use CMake 3.12+’s FindPython instead of the
+  classic, deprecated, custom FindPythonLibs
+* `-DPYBIND11_NOPYTHON=ON`: Disable all Python searching (disables tests)
+* `-DBUILD_TESTING=ON`: Enable the tests
+* `-DDOWNLOAD_CATCH=ON`: Download catch to build the C++ tests
+* `-DOWNLOAD_EIGEN=ON`: Download Eigen for the NumPy tests
+* `-DPYBIND11_INSTALL=ON/OFF`: Enable the install target (on by default for the
+  master project)
+* `-DUSE_PYTHON_INSTALL_DIR=ON`: Try to install into the python dir
+
+
+<details><summary>A few standard CMake tricks: (click to expand)</summary><p>
+
+* Use `cmake --build build -v` to see the commands used to build the files.
+* Use `cmake build -LH` to list the CMake options with help.
+* Use `ccmake` if available to see a curses (terminal) gui, or `cmake-gui` for
+  a completely graphical interface (not present in the PyPI package).
+* Use `cmake --build build -j12` to build with 12 cores (for example).
+* Use `-G` and the name of a generator to use something different. `cmake
+  --help` lists the generators available.
+      - On Unix, setting `CMAKE_GENERATER=Ninja` in your environment will give
+        you automatic mulithreading on all your CMake projects!
+* Open the `CMakeLists.txt` with QtCreator to generate for that IDE.
+* You can use `-DCMAKE_EXPORT_COMPILE_COMMANDS=ON` to generate the `.json` file
+  that some tools expect.
+
+</p></details>
+
+
+To run the tests, you can "build" the check target:
+
+```bash
+cmake --build build --target check
+```
+
+`--target` can be spelled `-t` in CMake 3.15+. You can also run individual
+tests with these targets:
+
+* `pytest`: Python tests only
+* `cpptest`: C++ tests only
+* `test_cmake_build`: Install / subdirectory tests
+
+If you want to build just a subset of tests, use
+`-DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp"`. If this is
+empty, all tests will be built.
+
+### Formatting
+
+All formatting is handled by pre-commit.
+
+Install with brew (macOS) or pip (any OS):
+
+```bash
+# Any OS
+python3 -m pip install pre-commit
+
+# OR macOS with homebrew:
+brew install pre-commit
+```
+
+Then, you can run it on the items you've added to your staging area, or all
+files:
+
+```bash
+pre-commit run
+# OR
+pre-commit run --all-files
+```
+
+And, if you want to always use it, you can install it as a git hook (hence the
+name, pre-commit):
+
+```bash
+pre-commit install
+```
+
+[pre-commit]: https://pre-commit.com
+[pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/latest
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[gitter]: https://gitter.im/pybind/Lobby
+[using pull requests]: https://help.github.com/articles/using-pull-requests
diff --git a/pybind11/.github/ISSUE_TEMPLATE/bug-report.md b/pybind11/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae36ea65083643dfc6f252249141b94c7ecb65e7
--- /dev/null
+++ b/pybind11/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,28 @@
+---
+name: Bug Report
+about: File an issue about a bug
+title: "[BUG] "
+---
+
+
+Make sure you've completed the following steps before submitting your issue -- thank you!
+
+1. Make sure you've read the [documentation][]. Your issue may be addressed there.
+2. Search the [issue tracker][] to verify that this hasn't already been reported. +1 or comment there if it has.
+3. Consider asking first in the [Gitter chat room][].
+4. Include a self-contained and minimal piece of code that reproduces the problem. If that's not possible, try to make the description as clear as possible.
+    a. If possible, make a PR with a new, failing test to give us a starting point to work on!
+
+[documentation]: https://pybind11.readthedocs.io
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+
+*After reading, remove this checklist and the template text in parentheses below.*
+
+## Issue description
+
+(Provide a short description, state the expected behavior and what actually happens.)
+
+## Reproducible example code
+
+(The code should be minimal, have no external dependencies, isolate the function(s) that cause breakage. Submit matched and complete C++ and Python snippets that can be easily compiled and run to diagnose the issue.)
diff --git a/pybind11/.github/ISSUE_TEMPLATE/config.yml b/pybind11/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..20e743136f3f2efea7deadf3e08c5b104c22a7f3
--- /dev/null
+++ b/pybind11/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Gitter room
+    url: https://gitter.im/pybind/Lobby
+    about: A room for discussing pybind11 with an active community
diff --git a/pybind11/.github/ISSUE_TEMPLATE/feature-request.md b/pybind11/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 0000000000000000000000000000000000000000..5f6ec81ec972b13e38520ef2e37d85022c2642c9
--- /dev/null
+++ b/pybind11/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,16 @@
+---
+name: Feature Request
+about: File an issue about adding a feature
+title: "[FEAT] "
+---
+
+
+Make sure you've completed the following steps before submitting your issue -- thank you!
+
+1. Check if your feature has already been mentioned / rejected / planned in other issues.
+2. If those resources didn't help, consider asking in the [Gitter chat room][] to see if this is interesting / useful to a larger audience and possible to implement reasonably,
+4. If you have a useful feature that passes the previous items (or not suitable for chat), please fill in the details below.
+
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+
+*After reading, remove this checklist.*
diff --git a/pybind11/.github/ISSUE_TEMPLATE/question.md b/pybind11/.github/ISSUE_TEMPLATE/question.md
new file mode 100644
index 0000000000000000000000000000000000000000..b199b6ee8ad446994aed54f67b0d1c22049d53c1
--- /dev/null
+++ b/pybind11/.github/ISSUE_TEMPLATE/question.md
@@ -0,0 +1,21 @@
+---
+name: Question
+about: File an issue about unexplained behavior
+title: "[QUESTION] "
+---
+
+If you have a question, please check the following first:
+
+1. Check if your question has already been answered in the [FAQ][] section.
+2. Make sure you've read the [documentation][]. Your issue may be addressed there.
+3. If those resources didn't help and you only have a short question (not a bug report), consider asking in the [Gitter chat room][]
+4. Search the [issue tracker][], including the closed issues, to see if your question has already been asked/answered. +1 or comment if it has been asked but has no answer.
+5. If you have a more complex question which is not answered in the previous items (or not suitable for chat), please fill in the details below.
+6. Include a self-contained and minimal piece of code that illustrates your question. If that's not possible, try to make the description as clear as possible.
+
+[FAQ]: http://pybind11.readthedocs.io/en/latest/faq.html
+[documentation]: https://pybind11.readthedocs.io
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+
+*After reading, remove this checklist.*
diff --git a/pybind11/.github/workflows/ci.yml b/pybind11/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..825631beae41c182a250ffc10072e60e740736ef
--- /dev/null
+++ b/pybind11/.github/workflows/ci.yml
@@ -0,0 +1,359 @@
+name: CI
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - master
+      - stable
+      - v*
+
+jobs:
+  standard:
+    strategy:
+      matrix:
+        runs-on: [ubuntu-latest, windows-latest, macos-latest]
+        arch: [x64]
+        python:
+        - 2.7
+        - 3.5
+        - 3.8
+        - pypy2
+        - pypy3
+
+        include:
+          - runs-on: ubuntu-latest
+            python: 3.6
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+          - runs-on: windows-2016
+            python: 3.7
+            arch: x86
+            args2: >
+              -DCMAKE_CXX_FLAGS="/permissive- /EHsc /GR"
+          - runs-on: windows-latest
+            python: 3.6
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+          - runs-on: windows-latest
+            python: 3.7
+            arch: x64
+
+          - runs-on: ubuntu-latest
+            python: 3.9-dev
+            arch: x64
+          - runs-on: macos-latest
+            python: 3.9-dev
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+
+        exclude:
+            # Currently 32bit only, and we build 64bit
+          - runs-on: windows-latest
+            python: pypy2
+            arch: x64
+          - runs-on: windows-latest
+            python: pypy3
+            arch: x64
+
+            # Currently broken on embed_test
+          - runs-on: windows-latest
+            python: 3.8
+            arch: x64
+          - runs-on: windows-latest
+            python: 3.9-dev
+            arch: x64
+
+
+    name: "🐍 ${{ matrix.python }} • ${{ matrix.runs-on }} • ${{ matrix.arch }} ${{ matrix.args }}"
+    runs-on: ${{ matrix.runs-on }}
+    continue-on-error: ${{ endsWith(matrix.python, 'dev') }}
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Setup Python ${{ matrix.python }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python }}
+        architecture: ${{ matrix.arch }}
+
+    - name: Setup Boost (Windows / Linux latest)
+      run: echo "::set-env name=BOOST_ROOT::$BOOST_ROOT_1_72_0"
+
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.3
+
+    - name: Cache wheels
+      if: runner.os == 'macOS'
+      uses: actions/cache@v2
+      with:
+        # This path is specific to macOS - we really only need it for PyPy NumPy wheels
+        # See https://github.com/actions/cache/blob/master/examples.md#python---pip
+        # for ways to do this more generally
+        path: ~/Library/Caches/pip
+        # Look to see if there is a cache hit for the corresponding requirements file
+        key: ${{ runner.os }}-pip-${{ matrix.python }}-${{ matrix.arch }}-${{ hashFiles('tests/requirements.txt') }}
+
+    - name: Prepare env
+      run: python -m pip install -r tests/requirements.txt --prefer-binary
+
+    - name: Setup annotations
+      run: python -m pip install pytest-github-actions-annotate-failures
+
+    - name: Configure C++11 ${{ matrix.args }}
+      run: >
+        cmake -S . -B .
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=11
+        ${{ matrix.args }}
+
+    - name: Build C++11
+      run: cmake --build . -j 2
+
+    - name: Python tests C++11
+      run: cmake --build . --target pytest -j 2
+
+    - name: C++11 tests
+      run: cmake --build .  --target cpptest -j 2
+
+    - name: Interface test C++11
+      run: cmake --build . --target test_cmake_build
+
+    - name: Clean directory
+      run: git clean -fdx
+
+    - name: Configure ${{ matrix.args2 }}
+      run: >
+        cmake -S . -B build2
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=17
+        ${{ matrix.args }}
+        ${{ matrix.args2 }}
+
+    - name: Build
+      run: cmake --build build2 -j 2
+
+    - name: Python tests
+      run: cmake --build build2 --target pytest
+
+    - name: C++ tests
+      run: cmake --build build2 --target cpptest
+
+    - name: Interface test
+      run: cmake --build build2 --target test_cmake_build
+
+  clang:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        clang:
+          - 3.6
+          - 3.7
+          - 3.9
+          - 5
+          - 7
+          - 9
+          - dev
+
+    name: "🐍 3 • Clang ${{ matrix.clang }} • x64"
+    container: "silkeh/clang:${{ matrix.clang }}"
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Add wget and python3
+      run: apt-get update && apt-get install -y python3-dev python3-numpy python3-pytest libeigen3-dev
+
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+
+    - name: Build
+      run: cmake --build build -j 2
+
+    - name: Python tests
+      run: cmake --build build --target pytest
+
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+
+  gcc:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        gcc:
+          - 7
+          - latest
+
+    name: "🐍 3 • GCC ${{ matrix.gcc }} • x64"
+    container: "gcc:${{ matrix.gcc }}"
+
+    steps:
+    - uses: actions/checkout@v1
+
+    - name: Add Python 3
+      run: apt-get update; apt-get install -y python3-dev python3-numpy python3-pytest python3-pip libeigen3-dev
+
+    - name: Update pip
+      run: python3 -m pip install --upgrade pip
+
+    - name: Setup CMake 3.18
+      uses: jwlawson/actions-setup-cmake@v1.3
+      with:
+        cmake-version: 3.18
+
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DCMAKE_CXX_STANDARD=11
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+
+    - name: Build
+      run: cmake --build build -j 2
+
+    - name: Python tests
+      run: cmake --build build --target pytest
+
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+
+  centos:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        centos:
+          - 7  # GCC 4.8
+          - 8
+
+    name: "🐍 3 • CentOS ${{ matrix.centos }} • x64"
+    container: "centos:${{ matrix.centos }}"
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Add Python 3
+      run: yum update -y && yum install -y python3-devel gcc-c++ make git
+
+    - name: Update pip
+      run: python3 -m pip install --upgrade pip
+
+    - name: Install dependencies
+      run: python3 -m pip install cmake -r tests/requirements.txt --prefer-binary
+
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=11
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+
+    - name: Build
+      run: cmake --build build -j 2
+
+    - name: Python tests
+      run: cmake --build build --target pytest
+
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+
+  install-classic:
+    name: "🐍 3.5 • Debian • x86 •  Install"
+    runs-on: ubuntu-latest
+    container: i386/debian:stretch
+
+    steps:
+    - uses: actions/checkout@v1
+
+    - name: Install requirements
+      run: |
+        apt-get update
+        apt-get install -y git make cmake g++ libeigen3-dev python3-dev python3-pip
+        pip3 install "pytest==3.1.*"
+
+    - name: Configure for install
+      run: >
+        cmake .
+        -DPYBIND11_INSTALL=1 -DPYBIND11_TEST=0
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+
+    - name: Make and install
+      run: make install
+
+    - name: Copy tests to new directory
+      run: cp -a tests /pybind11-tests
+
+    - name: Make a new test directory
+      run: mkdir /build-tests
+
+    - name: Configure tests
+      run: >
+        cmake ../pybind11-tests
+        -DDOWNLOAD_CATCH=ON
+        -DPYBIND11_WERROR=ON
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+      working-directory: /build-tests
+
+    - name: Run tests
+      run: make pytest -j 2
+      working-directory: /build-tests
+
+
+  doxygen:
+    name: "Documentation build test"
+    runs-on: ubuntu-latest
+    container: alpine:3.12
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Install system requirements
+      run: apk add doxygen python3-dev
+
+    - name: Ensure pip
+      run: python3 -m ensurepip
+
+    - name: Install docs & setup requirements
+      run: python3 -m pip install -r docs/requirements.txt pytest setuptools
+
+    - name: Build docs
+      run: python3 -m sphinx -W -b html docs docs/.build
+
+    - name: Make SDist
+      run: python3 setup.py sdist
+
+    - name: Compare Dists (headers only)
+      run: |
+        python3 -m pip install --user -U ./dist/*
+        installed=$(python3 -c "import pybind11; print(pybind11.get_include(True) + '/pybind11')")
+        diff -rq $installed ./include/pybind11
diff --git a/pybind11/.github/workflows/configure.yml b/pybind11/.github/workflows/configure.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d472f4b1917060053e464baaebebd9e90a5172f0
--- /dev/null
+++ b/pybind11/.github/workflows/configure.yml
@@ -0,0 +1,78 @@
+name: Config
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - master
+      - stable
+      - v*
+
+jobs:
+  cmake:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x64]
+        cmake: [3.18]
+
+        include:
+        - runs-on: ubuntu-latest
+          arch: x64
+          cmake: 3.4
+
+        - runs-on: macos-latest
+          arch: x64
+          cmake: 3.7
+
+        - runs-on: windows-2016
+          arch: x86
+          cmake: 3.8
+
+        - runs-on: windows-2016
+          arch: x86
+          cmake: 3.18
+
+    name: 🐍 3.7 • CMake ${{ matrix.cmake }} • ${{ matrix.runs-on }}
+    runs-on: ${{ matrix.runs-on }}
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Setup Python 3.7
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.7
+        architecture: ${{ matrix.arch }}
+
+    - name: Prepare env
+      run: python -m pip install -r tests/requirements.txt
+
+    - name: Setup CMake ${{ matrix.cmake }}
+      uses: jwlawson/actions-setup-cmake@v1.3
+      with:
+        cmake-version: ${{ matrix.cmake }}
+
+    - name: Make build directories
+      run: mkdir "build dir"
+
+    - name: Configure
+      working-directory: build dir
+      shell: bash
+      run: >
+        cmake ..
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DPYTHON_EXECUTABLE=$(python -c "import sys; print(sys.executable)")
+
+    - name: Build
+      working-directory: build dir
+      if: github.event_name == 'workflow_dispatch'
+      run: cmake --build . --config Release
+
+    - name: Test
+      working-directory: build dir
+      if: github.event_name == 'workflow_dispatch'
+      run: cmake --build . --config Release --target check
diff --git a/pybind11/.github/workflows/format.yml b/pybind11/.github/workflows/format.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e92f96e6ef06662528c8acc2000710db91d3fe0a
--- /dev/null
+++ b/pybind11/.github/workflows/format.yml
@@ -0,0 +1,19 @@
+name: Format
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+    - master
+    - stable
+    - "v*"
+
+jobs:
+  pre-commit:
+    name: Format
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v2
+    - uses: pre-commit/action@v2.0.0
diff --git a/pybind11/.gitignore b/pybind11/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5613b367d257a8c155ff2c9bdc3a9cae99fe7376
--- /dev/null
+++ b/pybind11/.gitignore
@@ -0,0 +1,41 @@
+CMakeCache.txt
+CMakeFiles
+Makefile
+cmake_install.cmake
+cmake_uninstall.cmake
+.DS_Store
+*.so
+*.pyd
+*.dll
+*.sln
+*.sdf
+*.opensdf
+*.vcxproj
+*.vcxproj.user
+*.filters
+example.dir
+Win32
+x64
+Release
+Debug
+.vs
+CTestTestfile.cmake
+Testing
+autogen
+MANIFEST
+/.ninja_*
+/*.ninja
+/docs/.build
+*.py[co]
+*.egg-info
+*~
+.*.swp
+.DS_Store
+/dist
+/*build*
+.cache/
+sosize-*.txt
+pybind11Config*.cmake
+pybind11Targets.cmake
+/*env*
+/.vscode
diff --git a/pybind11/.gitmodules b/pybind11/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..4d698f93f828a5c7538ba9af2059a881ec99ac55
--- /dev/null
+++ b/pybind11/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tools/clang"]
+	path = tools/clang
+	url = ../../wjakob/clang-cindex-python3.git
diff --git a/pybind11/.pre-commit-config.yaml b/pybind11/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a046c6fcfe6b60666c7b4353372c06ab4a8d13c1
--- /dev/null
+++ b/pybind11/.pre-commit-config.yaml
@@ -0,0 +1,44 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.1.0
+  hooks:
+  - id: check-added-large-files
+  - id: check-case-conflict
+  - id: check-merge-conflict
+  - id: check-symlinks
+  - id: check-yaml
+  - id: debug-statements
+  - id: end-of-file-fixer
+  - id: mixed-line-ending
+  - id: requirements-txt-fixer
+  - id: trailing-whitespace
+  - id: fix-encoding-pragma
+
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+  rev: v1.1.9
+  hooks:
+  - id: remove-tabs
+
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.3
+  hooks:
+  - id: flake8
+    additional_dependencies: [flake8-bugbear, pep8-naming]
+    exclude: ^(docs/.*|tools/.*)$
+
+- repo: https://github.com/cheshirekow/cmake-format-precommit
+  rev: v0.6.11
+  hooks:
+  - id: cmake-format
+    additional_dependencies: [pyyaml]
+    types: [file]
+    files: (\.cmake|CMakeLists.txt)(.in)?$
+
+- repo: local
+  hooks:
+  - id: check-style
+    name: Classic check-style
+    language: system
+    types:
+    - c++
+    entry: ./tools/check-style.sh
diff --git a/pybind11/.readthedocs.yml b/pybind11/.readthedocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c9c61617ca9b13a3e31d33226c52ba9529872a0d
--- /dev/null
+++ b/pybind11/.readthedocs.yml
@@ -0,0 +1,3 @@
+python:
+  version: 3
+requirements_file: docs/requirements.txt
diff --git a/pybind11/CMakeLists.txt b/pybind11/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3b460494a86a7b7d7487bfd10d4b2560441d5a76
--- /dev/null
+++ b/pybind11/CMakeLists.txt
@@ -0,0 +1,271 @@
+# CMakeLists.txt -- Build system for the pybind11 modules
+#
+# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+# Extract project version from source
+file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/pybind11/detail/common.h"
+     pybind11_version_defines REGEX "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) ")
+
+foreach(ver ${pybind11_version_defines})
+  if(ver MATCHES [[#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$]])
+    set(PYBIND11_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}")
+  endif()
+endforeach()
+
+if(PYBIND11_VERSION_PATCH MATCHES [[([a-zA-Z]+)]])
+  set(pybind11_VERSION_TYPE "${CMAKE_MATCH_1}")
+endif()
+string(REGEX MATCH "[0-9]+" PYBIND11_VERSION_PATCH "${PYBIND11_VERSION_PATCH}")
+
+project(
+  pybind11
+  LANGUAGES CXX
+  VERSION "${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH}")
+
+# Standard includes
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+include(CMakeDependentOption)
+
+if(NOT pybind11_FIND_QUIETLY)
+  message(STATUS "pybind11 v${pybind11_VERSION} ${pybind11_VERSION_TYPE}")
+endif()
+
+# Check if pybind11 is being used directly or via add_subdirectory
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+  ### Warn if not an out-of-source builds
+  if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
+    set(lines
+        "You are building in-place. If that is not what you intended to "
+        "do, you can clean the source directory with:\n"
+        "rm -r CMakeCache.txt CMakeFiles/ cmake_uninstall.cmake pybind11Config.cmake "
+        "pybind11ConfigVersion.cmake tests/CMakeFiles/\n")
+    message(AUTHOR_WARNING ${lines})
+  endif()
+
+  set(PYBIND11_MASTER_PROJECT ON)
+
+  if(OSX AND CMAKE_VERSION VERSION_LESS 3.7)
+    # Bug in macOS CMake < 3.7 is unable to download catch
+    message(WARNING "CMAKE 3.7+ needed on macOS to download catch, and newer HIGHLY recommended")
+  elseif(WINDOWS AND CMAKE_VERSION VERSION_LESS 3.8)
+    # Only tested with 3.8+ in CI.
+    message(WARNING "CMAKE 3.8+ tested on Windows, previous versions untested")
+  endif()
+
+  message(STATUS "CMake ${CMAKE_VERSION}")
+
+  if(CMAKE_CXX_STANDARD)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  endif()
+else()
+  set(PYBIND11_MASTER_PROJECT OFF)
+  set(pybind11_system SYSTEM)
+endif()
+
+# Options
+option(PYBIND11_INSTALL "Install pybind11 header files?" ${PYBIND11_MASTER_PROJECT})
+option(PYBIND11_TEST "Build pybind11 test suite?" ${PYBIND11_MASTER_PROJECT})
+option(PYBIND11_NOPYTHON "Disable search for Python" OFF)
+
+cmake_dependent_option(
+  USE_PYTHON_INCLUDE_DIR
+  "Install pybind11 headers in Python include directory instead of default installation prefix"
+  OFF "PYBIND11_INSTALL" OFF)
+
+cmake_dependent_option(PYBIND11_FINDPYTHON "Force new FindPython" OFF
+                       "NOT CMAKE_VERSION VERSION_LESS 3.12" OFF)
+
+# NB: when adding a header don't forget to also add it to setup.py
+set(PYBIND11_HEADERS
+    include/pybind11/detail/class.h
+    include/pybind11/detail/common.h
+    include/pybind11/detail/descr.h
+    include/pybind11/detail/init.h
+    include/pybind11/detail/internals.h
+    include/pybind11/detail/typeid.h
+    include/pybind11/attr.h
+    include/pybind11/buffer_info.h
+    include/pybind11/cast.h
+    include/pybind11/chrono.h
+    include/pybind11/common.h
+    include/pybind11/complex.h
+    include/pybind11/options.h
+    include/pybind11/eigen.h
+    include/pybind11/embed.h
+    include/pybind11/eval.h
+    include/pybind11/iostream.h
+    include/pybind11/functional.h
+    include/pybind11/numpy.h
+    include/pybind11/operators.h
+    include/pybind11/pybind11.h
+    include/pybind11/pytypes.h
+    include/pybind11/stl.h
+    include/pybind11/stl_bind.h)
+
+# Compare with grep and warn if mismatched
+if(PYBIND11_MASTER_PROJECT AND NOT CMAKE_VERSION VERSION_LESS 3.12)
+  file(
+    GLOB_RECURSE _pybind11_header_check
+    LIST_DIRECTORIES false
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    CONFIGURE_DEPENDS "include/pybind11/*.h")
+  set(_pybind11_here_only ${PYBIND11_HEADERS})
+  set(_pybind11_disk_only ${_pybind11_header_check})
+  list(REMOVE_ITEM _pybind11_here_only ${_pybind11_header_check})
+  list(REMOVE_ITEM _pybind11_disk_only ${PYBIND11_HEADERS})
+  if(_pybind11_here_only)
+    message(AUTHOR_WARNING "PYBIND11_HEADERS has extra files:" ${_pybind11_here_only})
+  endif()
+  if(_pybind11_disk_only)
+    message(AUTHOR_WARNING "PYBIND11_HEADERS is missing files:" ${_pybind11_disk_only})
+  endif()
+endif()
+
+# CMake 3.12 added list(TRANSFORM <list> PREPEND
+# But we can't use it yet
+string(REPLACE "include/" "${CMAKE_CURRENT_SOURCE_DIR}/include/" PYBIND11_HEADERS
+               "${PYBIND11_HEADERS}")
+
+# Cache variables so pybind11_add_module can be used in parent projects
+set(PYBIND11_INCLUDE_DIR
+    "${CMAKE_CURRENT_LIST_DIR}/include"
+    CACHE INTERNAL "")
+
+# Note: when creating targets, you cannot use if statements at configure time -
+# you need generator expressions, because those will be placed in the target file.
+# You can also place ifs *in* the Config.in, but not here.
+
+# This section builds targets, but does *not* touch Python
+
+# Build the headers-only target (no Python included):
+# (long name used here to keep this from clashing in subdirectory mode)
+add_library(pybind11_headers INTERFACE)
+add_library(pybind11::pybind11_headers ALIAS pybind11_headers) # to match exported target
+add_library(pybind11::headers ALIAS pybind11_headers) # easier to use/remember
+
+include("${CMAKE_CURRENT_SOURCE_DIR}/tools/pybind11Common.cmake")
+
+if(NOT PYBIND11_MASTER_PROJECT AND NOT pybind11_FIND_QUIETLY)
+  message(STATUS "Using pybind11: (version \"${pybind11_VERSION}\" ${pybind11_VERSION_TYPE})")
+endif()
+
+# Relative directory setting
+if(USE_PYTHON_INCLUDE_DIR AND DEFINED Python_INCLUDE_DIRS)
+  file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${Python_INCLUDE_DIRS})
+elseif(USE_PYTHON_INCLUDE_DIR AND DEFINED PYTHON_INCLUDE_DIR)
+  file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${PYTHON_INCLUDE_DIRS})
+endif()
+
+# Fill in headers target
+target_include_directories(
+  pybind11_headers ${pybind11_system} INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
+                                                $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+target_compile_features(pybind11_headers INTERFACE cxx_inheriting_constructors cxx_user_literals
+                                                   cxx_right_angle_brackets)
+
+if(PYBIND11_INSTALL)
+  install(DIRECTORY ${PYBIND11_INCLUDE_DIR}/pybind11 DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+  # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
+  set(PYBIND11_CMAKECONFIG_INSTALL_DIR
+      "share/cmake/${PROJECT_NAME}"
+      CACHE STRING "install path for pybind11Config.cmake")
+
+  configure_package_config_file(
+    tools/${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
+    INSTALL_DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+
+  if(CMAKE_VERSION VERSION_LESS 3.14)
+    # Remove CMAKE_SIZEOF_VOID_P from ConfigVersion.cmake since the library does
+    # not depend on architecture specific settings or libraries.
+    set(_PYBIND11_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+    unset(CMAKE_SIZEOF_VOID_P)
+
+    write_basic_package_version_file(
+      ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+      VERSION ${PROJECT_VERSION}
+      COMPATIBILITY AnyNewerVersion)
+
+    set(CMAKE_SIZEOF_VOID_P ${_PYBIND11_CMAKE_SIZEOF_VOID_P})
+  else()
+    # CMake 3.14+ natively supports header-only libraries
+    write_basic_package_version_file(
+      ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+      VERSION ${PROJECT_VERSION}
+      COMPATIBILITY AnyNewerVersion ARCH_INDEPENDENT)
+  endif()
+
+  install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+          ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+          tools/FindPythonLibsNew.cmake
+          tools/pybind11Common.cmake
+          tools/pybind11Tools.cmake
+          tools/pybind11NewTools.cmake
+    DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+
+  if(NOT PYBIND11_EXPORT_NAME)
+    set(PYBIND11_EXPORT_NAME "${PROJECT_NAME}Targets")
+  endif()
+
+  install(TARGETS pybind11_headers EXPORT "${PYBIND11_EXPORT_NAME}")
+
+  install(
+    EXPORT "${PYBIND11_EXPORT_NAME}"
+    NAMESPACE "pybind11::"
+    DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+
+  # Uninstall target
+  if(PYBIND11_MASTER_PROJECT)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tools/cmake_uninstall.cmake.in"
+                   "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY)
+
+    add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P
+                                        ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
+  endif()
+endif()
+
+# BUILD_TESTING takes priority, but only if this is the master project
+if(PYBIND11_MASTER_PROJECT AND DEFINED BUILD_TESTING)
+  if(BUILD_TESTING)
+    if(_pybind11_nopython)
+      message(FATAL_ERROR "Cannot activate tests in NOPYTHON mode")
+    else()
+      add_subdirectory(tests)
+    endif()
+  endif()
+else()
+  if(PYBIND11_TEST)
+    if(_pybind11_nopython)
+      message(FATAL_ERROR "Cannot activate tests in NOPYTHON mode")
+    else()
+      add_subdirectory(tests)
+    endif()
+  endif()
+endif()
+
+# Better symmetry with find_package(pybind11 CONFIG) mode.
+if(NOT PYBIND11_MASTER_PROJECT)
+  set(pybind11_FOUND
+      TRUE
+      CACHE INTERNAL "true if pybind11 and all required components found on the system")
+  set(pybind11_INCLUDE_DIR
+      "${PYBIND11_INCLUDE_DIR}"
+      CACHE INTERNAL "Directory where pybind11 headers are located")
+endif()
diff --git a/pybind11/LICENSE b/pybind11/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e466b0dfda14f3a7c8ece512937eb99c8b7b6d68
--- /dev/null
+++ b/pybind11/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Please also refer to the file .github/CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.
diff --git a/pybind11/MANIFEST.in b/pybind11/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..6fe84ced8d4334e90d2539c6cde208e838a9ba7e
--- /dev/null
+++ b/pybind11/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include include/pybind11 *.h
+include LICENSE README.md .github/CONTRIBUTING.md
diff --git a/pybind11/README.md b/pybind11/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bae6cf2b5c46f7181e28bef6a1c2d4101086433d
--- /dev/null
+++ b/pybind11/README.md
@@ -0,0 +1,143 @@
+![pybind11 logo](https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png)
+
+# pybind11 — Seamless operability between C++11 and Python
+
+[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=master)](http://pybind11.readthedocs.org/en/master/?badge=master)
+[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=stable)](http://pybind11.readthedocs.org/en/stable/?badge=stable)
+[![Gitter chat](https://img.shields.io/gitter/room/gitterHQ/gitter.svg)](https://gitter.im/pybind/Lobby)
+[![CI](https://github.com/pybind/pybind11/workflows/CI/badge.svg)](https://github.com/pybind/pybind11/actions)
+[![Build status](https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true)](https://ci.appveyor.com/project/wjakob/pybind11)
+
+**pybind11** is a lightweight header-only library that exposes C++ types in
+Python and vice versa, mainly to create Python bindings of existing C++ code.
+Its goals and syntax are similar to the excellent [Boost.Python][] library by
+David Abrahams: to minimize boilerplate code in traditional extension modules
+by inferring type information using compile-time introspection.
+
+The main issue with Boost.Python—and the reason for creating such a similar
+project—is Boost. Boost is an enormously large and complex suite of utility
+libraries that works with almost every C++ compiler in existence. This
+compatibility has its cost: arcane template tricks and workarounds are
+necessary to support the oldest and buggiest of compiler specimens. Now that
+C++11-compatible compilers are widely available, this heavy machinery has
+become an excessively large and unnecessary dependency.
+
+Think of this library as a tiny self-contained version of Boost.Python with
+everything stripped away that isn't relevant for binding generation. Without
+comments, the core header files only require ~4K lines of code and depend on
+Python (2.7 or 3.5+, or PyPy) and the C++ standard library. This compact
+implementation was possible thanks to some of the new C++11 language features
+(specifically: tuples, lambda functions and variadic templates). Since its
+creation, this library has grown beyond Boost.Python in many ways, leading to
+dramatically simpler binding code in many common situations.
+
+Tutorial and reference documentation is provided at
+[pybind11.readthedocs.org][].  A PDF version of the manual is available
+[here][docs-pdf].
+
+## Core features
+pybind11 can map the following core C++ features to Python:
+
+- Functions accepting and returning custom data structures per value, reference, or pointer
+- Instance methods and static methods
+- Overloaded functions
+- Instance attributes and static attributes
+- Arbitrary exception types
+- Enumerations
+- Callbacks
+- Iterators and ranges
+- Custom operators
+- Single and multiple inheritance
+- STL data structures
+- Smart pointers with reference counting like `std::shared_ptr`
+- Internal references with correct reference counting
+- C++ classes with virtual (and pure virtual) methods can be extended in Python
+
+## Goodies
+In addition to the core functionality, pybind11 provides some extra goodies:
+
+- Python 2.7, 3.5+, and PyPy (tested on 7.3) are supported with an implementation-agnostic
+  interface.
+
+- It is possible to bind C++11 lambda functions with captured variables. The
+  lambda capture data is stored inside the resulting Python function object.
+
+- pybind11 uses C++11 move constructors and move assignment operators whenever
+  possible to efficiently transfer custom data types.
+
+- It's easy to expose the internal storage of custom data types through
+  Pythons' buffer protocols. This is handy e.g. for fast conversion between
+  C++ matrix classes like Eigen and NumPy without expensive copy operations.
+
+- pybind11 can automatically vectorize functions so that they are transparently
+  applied to all entries of one or more NumPy array arguments.
+
+- Python's slice-based access and assignment operations can be supported with
+  just a few lines of code.
+
+- Everything is contained in just a few header files; there is no need to link
+  against any additional libraries.
+
+- Binaries are generally smaller by a factor of at least 2 compared to
+  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
+  of PyRosetta, an enormous Boost.Python binding project,
+  [reported][pyrosetta-report] a binary size reduction of **5.4x** and compile
+  time reduction by **5.8x**.
+
+- Function signatures are precomputed at compile time (using `constexpr`),
+  leading to smaller binaries.
+
+- With little extra effort, C++ types can be pickled and unpickled similar to
+  regular Python objects.
+
+## Supported compilers
+
+1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
+2. GCC 4.8 or newer
+3. Microsoft Visual Studio 2015 Update 3 or newer
+4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11
+   v2.0 and a [workaround][intel-15-workaround])
+5. Cygwin/GCC (tested on 2.5.1)
+
+## About
+
+This project was created by [Wenzel Jakob](http://rgl.epfl.ch/people/wjakob).
+Significant features and/or improvements to the code were contributed by
+Jonas Adler,
+Lori A. Burns,
+Sylvain Corlay,
+Trent Houliston,
+Axel Huebl,
+@hulucc,
+Sergey Lyskov
+Johan Mabille,
+Tomasz Miąsko,
+Dean Moldovan,
+Ben Pritchard,
+Jason Rhinelander,
+Boris Schäling,
+Pim Schellart,
+Henry Schreiner,
+Ivan Smirnov, and
+Patrick Stewart.
+
+### Contributing
+
+See the [contributing guide][] for information on building and contributing to
+pybind11.
+
+
+### License
+
+pybind11 is provided under a BSD-style license that can be found in the
+[`LICENSE`][] file. By using, distributing, or contributing to this project,
+you agree to the terms and conditions of this license.
+
+
+[pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/master
+[docs-pdf]: https://media.readthedocs.org/pdf/pybind11/master/pybind11.pdf
+[Boost.Python]: http://www.boost.org/doc/libs/1_58_0/libs/python/doc/
+[pyrosetta-report]: http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
+[contributing guide]:  https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md
+[`LICENSE`]: https://github.com/pybind/pybind11/blob/master/LICENSE
+[intel-15-workaround]: https://github.com/pybind/pybind11/issues/276
diff --git a/pybind11/docs/Doxyfile b/pybind11/docs/Doxyfile
new file mode 100644
index 0000000000000000000000000000000000000000..24ece0d8dbac25c6c2ab076610f867beb2fe0c9a
--- /dev/null
+++ b/pybind11/docs/Doxyfile
@@ -0,0 +1,22 @@
+PROJECT_NAME           = pybind11
+INPUT                  = ../include/pybind11/
+RECURSIVE              = YES
+
+GENERATE_HTML          = NO
+GENERATE_LATEX         = NO
+GENERATE_XML           = YES
+XML_OUTPUT             = .build/doxygenxml
+XML_PROGRAMLISTING     = YES
+
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+EXPAND_AS_DEFINED      = PYBIND11_RUNTIME_EXCEPTION
+
+ALIASES                = "rst=\verbatim embed:rst"
+ALIASES               += "endrst=\endverbatim"
+
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = NO
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS \
+                         PY_MAJOR_VERSION=3
diff --git a/pybind11/docs/_static/theme_overrides.css b/pybind11/docs/_static/theme_overrides.css
new file mode 100644
index 0000000000000000000000000000000000000000..1071809fa0fecf7c28d3356f37363266e9128b81
--- /dev/null
+++ b/pybind11/docs/_static/theme_overrides.css
@@ -0,0 +1,11 @@
+.wy-table-responsive table td,
+.wy-table-responsive table th {
+    white-space: initial !important;
+}
+.rst-content table.docutils td {
+    vertical-align: top !important;
+}
+div[class^='highlight'] pre {
+    white-space: pre;
+    white-space: pre-wrap;
+}
diff --git a/pybind11/docs/advanced/cast/chrono.rst b/pybind11/docs/advanced/cast/chrono.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fbd46057aa392c86ae3747c2b21768367205ea49
--- /dev/null
+++ b/pybind11/docs/advanced/cast/chrono.rst
@@ -0,0 +1,81 @@
+Chrono
+======
+
+When including the additional header file :file:`pybind11/chrono.h` conversions
+from C++11 chrono datatypes to python datetime objects are automatically enabled.
+This header also enables conversions of python floats (often from sources such
+as ``time.monotonic()``, ``time.perf_counter()`` and ``time.process_time()``)
+into durations.
+
+An overview of clocks in C++11
+------------------------------
+
+A point of confusion when using these conversions is the differences between
+clocks provided in C++11. There are three clock types defined by the C++11
+standard and users can define their own if needed. Each of these clocks have
+different properties and when converting to and from python will give different
+results.
+
+The first clock defined by the standard is ``std::chrono::system_clock``. This
+clock measures the current date and time. However, this clock changes with to
+updates to the operating system time. For example, if your time is synchronised
+with a time server this clock will change. This makes this clock a poor choice
+for timing purposes but good for measuring the wall time.
+
+The second clock defined in the standard is ``std::chrono::steady_clock``.
+This clock ticks at a steady rate and is never adjusted. This makes it excellent
+for timing purposes, however the value in this clock does not correspond to the
+current date and time. Often this clock will be the amount of time your system
+has been on, although it does not have to be. This clock will never be the same
+clock as the system clock as the system clock can change but steady clocks
+cannot.
+
+The third clock defined in the standard is ``std::chrono::high_resolution_clock``.
+This clock is the clock that has the highest resolution out of the clocks in the
+system. It is normally a typedef to either the system clock or the steady clock
+but can be its own independent clock. This is important as when using these
+conversions as the types you get in python for this clock might be different
+depending on the system.
+If it is a typedef of the system clock, python will get datetime objects, but if
+it is a different clock they will be timedelta objects.
+
+Provided conversions
+--------------------
+
+.. rubric:: C++ to Python
+
+- ``std::chrono::system_clock::time_point`` → ``datetime.datetime``
+    System clock times are converted to python datetime instances. They are
+    in the local timezone, but do not have any timezone information attached
+    to them (they are naive datetime objects).
+
+- ``std::chrono::duration`` → ``datetime.timedelta``
+    Durations are converted to timedeltas, any precision in the duration
+    greater than microseconds is lost by rounding towards zero.
+
+- ``std::chrono::[other_clocks]::time_point`` → ``datetime.timedelta``
+    Any clock time that is not the system clock is converted to a time delta.
+    This timedelta measures the time from the clocks epoch to now.
+
+.. rubric:: Python to C++
+
+- ``datetime.datetime`` or ``datetime.date`` or ``datetime.time`` → ``std::chrono::system_clock::time_point``
+    Date/time objects are converted into system clock timepoints. Any
+    timezone information is ignored and the type is treated as a naive
+    object.
+
+- ``datetime.timedelta`` → ``std::chrono::duration``
+    Time delta are converted into durations with microsecond precision.
+
+- ``datetime.timedelta`` → ``std::chrono::[other_clocks]::time_point``
+    Time deltas that are converted into clock timepoints are treated as
+    the amount of time from the start of the clocks epoch.
+
+- ``float`` → ``std::chrono::duration``
+    Floats that are passed to C++ as durations be interpreted as a number of
+    seconds. These will be converted to the duration using ``duration_cast``
+    from the float.
+
+- ``float`` → ``std::chrono::[other_clocks]::time_point``
+    Floats that are passed to C++ as time points will be interpreted as the
+    number of seconds from the start of the clocks epoch.
diff --git a/pybind11/docs/advanced/cast/custom.rst b/pybind11/docs/advanced/cast/custom.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e4f99ac5b086355ff8e691a22ad5e16ea84dfed7
--- /dev/null
+++ b/pybind11/docs/advanced/cast/custom.rst
@@ -0,0 +1,91 @@
+Custom type casters
+===================
+
+In very rare cases, applications may require custom type casters that cannot be
+expressed using the abstractions provided by pybind11, thus requiring raw
+Python C API calls. This is fairly advanced usage and should only be pursued by
+experts who are familiar with the intricacies of Python reference counting.
+
+The following snippets demonstrate how this works for a very simple ``inty``
+type that that should be convertible from Python types that provide a
+``__int__(self)`` method.
+
+.. code-block:: cpp
+
+    struct inty { long long_value; };
+
+    void print(inty s) {
+        std::cout << s.long_value << std::endl;
+    }
+
+The following Python snippet demonstrates the intended usage from the Python side:
+
+.. code-block:: python
+
+    class A:
+        def __int__(self):
+            return 123
+
+    from example import print
+    print(A())
+
+To register the necessary conversion routines, it is necessary to add
+a partial overload to the ``pybind11::detail::type_caster<T>`` template.
+Although this is an implementation detail, adding partial overloads to this
+type is explicitly allowed.
+
+.. code-block:: cpp
+
+    namespace pybind11 { namespace detail {
+        template <> struct type_caster<inty> {
+        public:
+            /**
+             * This macro establishes the name 'inty' in
+             * function signatures and declares a local variable
+             * 'value' of type inty
+             */
+            PYBIND11_TYPE_CASTER(inty, _("inty"));
+
+            /**
+             * Conversion part 1 (Python->C++): convert a PyObject into a inty
+             * instance or return false upon failure. The second argument
+             * indicates whether implicit conversions should be applied.
+             */
+            bool load(handle src, bool) {
+                /* Extract PyObject from handle */
+                PyObject *source = src.ptr();
+                /* Try converting into a Python integer value */
+                PyObject *tmp = PyNumber_Long(source);
+                if (!tmp)
+                    return false;
+                /* Now try to convert into a C++ int */
+                value.long_value = PyLong_AsLong(tmp);
+                Py_DECREF(tmp);
+                /* Ensure return code was OK (to avoid out-of-range errors etc) */
+                return !(value.long_value == -1 && !PyErr_Occurred());
+            }
+
+            /**
+             * Conversion part 2 (C++ -> Python): convert an inty instance into
+             * a Python object. The second and third arguments are used to
+             * indicate the return value policy and parent object (for
+             * ``return_value_policy::reference_internal``) and are generally
+             * ignored by implicit casters.
+             */
+            static handle cast(inty src, return_value_policy /* policy */, handle /* parent */) {
+                return PyLong_FromLong(src.long_value);
+            }
+        };
+    }} // namespace pybind11::detail
+
+.. note::
+
+    A ``type_caster<T>`` defined with ``PYBIND11_TYPE_CASTER(T, ...)`` requires
+    that ``T`` is default-constructible (``value`` is first default constructed
+    and then ``load()`` assigns to it).
+
+.. warning::
+
+    When using custom type casters, it's important to declare them consistently
+    in every compilation unit of the Python extension module. Otherwise,
+    undefined behavior can ensue.
diff --git a/pybind11/docs/advanced/cast/eigen.rst b/pybind11/docs/advanced/cast/eigen.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59ba08c3c4297556f6aaa2e0c8db328eb1490e40
--- /dev/null
+++ b/pybind11/docs/advanced/cast/eigen.rst
@@ -0,0 +1,310 @@
+Eigen
+#####
+
+`Eigen <http://eigen.tuxfamily.org>`_ is C++ header-based library for dense and
+sparse linear algebra. Due to its popularity and widespread adoption, pybind11
+provides transparent conversion and limited mapping support between Eigen and
+Scientific Python linear algebra data types.
+
+To enable the built-in Eigen support you must include the optional header file
+:file:`pybind11/eigen.h`.
+
+Pass-by-value
+=============
+
+When binding a function with ordinary Eigen dense object arguments (for
+example, ``Eigen::MatrixXd``), pybind11 will accept any input value that is
+already (or convertible to) a ``numpy.ndarray`` with dimensions compatible with
+the Eigen type, copy its values into a temporary Eigen variable of the
+appropriate type, then call the function with this temporary variable.
+
+Sparse matrices are similarly copied to or from
+``scipy.sparse.csr_matrix``/``scipy.sparse.csc_matrix`` objects.
+
+Pass-by-reference
+=================
+
+One major limitation of the above is that every data conversion implicitly
+involves a copy, which can be both expensive (for large matrices) and disallows
+binding functions that change their (Matrix) arguments.  Pybind11 allows you to
+work around this by using Eigen's ``Eigen::Ref<MatrixType>`` class much as you
+would when writing a function taking a generic type in Eigen itself (subject to
+some limitations discussed below).
+
+When calling a bound function accepting a ``Eigen::Ref<const MatrixType>``
+type, pybind11 will attempt to avoid copying by using an ``Eigen::Map`` object
+that maps into the source ``numpy.ndarray`` data: this requires both that the
+data types are the same (e.g. ``dtype='float64'`` and ``MatrixType::Scalar`` is
+``double``); and that the storage is layout compatible.  The latter limitation
+is discussed in detail in the section below, and requires careful
+consideration: by default, numpy matrices and Eigen matrices are *not* storage
+compatible.
+
+If the numpy matrix cannot be used as is (either because its types differ, e.g.
+passing an array of integers to an Eigen parameter requiring doubles, or
+because the storage is incompatible), pybind11 makes a temporary copy and
+passes the copy instead.
+
+When a bound function parameter is instead ``Eigen::Ref<MatrixType>`` (note the
+lack of ``const``), pybind11 will only allow the function to be called if it
+can be mapped *and* if the numpy array is writeable (that is
+``a.flags.writeable`` is true).  Any access (including modification) made to
+the passed variable will be transparently carried out directly on the
+``numpy.ndarray``.
+
+This means you can can write code such as the following and have it work as
+expected:
+
+.. code-block:: cpp
+
+    void scale_by_2(Eigen::Ref<Eigen::VectorXd> v) {
+        v *= 2;
+    }
+
+Note, however, that you will likely run into limitations due to numpy and
+Eigen's difference default storage order for data; see the below section on
+:ref:`storage_orders` for details on how to bind code that won't run into such
+limitations.
+
+.. note::
+
+    Passing by reference is not supported for sparse types.
+
+Returning values to Python
+==========================
+
+When returning an ordinary dense Eigen matrix type to numpy (e.g.
+``Eigen::MatrixXd`` or ``Eigen::RowVectorXf``) pybind11 keeps the matrix and
+returns a numpy array that directly references the Eigen matrix: no copy of the
+data is performed.  The numpy array will have ``array.flags.owndata`` set to
+``False`` to indicate that it does not own the data, and the lifetime of the
+stored Eigen matrix will be tied to the returned ``array``.
+
+If you bind a function with a non-reference, ``const`` return type (e.g.
+``const Eigen::MatrixXd``), the same thing happens except that pybind11 also
+sets the numpy array's ``writeable`` flag to false.
+
+If you return an lvalue reference or pointer, the usual pybind11 rules apply,
+as dictated by the binding function's return value policy (see the
+documentation on :ref:`return_value_policies` for full details).  That means,
+without an explicit return value policy, lvalue references will be copied and
+pointers will be managed by pybind11.  In order to avoid copying, you should
+explicitly specify an appropriate return value policy, as in the following
+example:
+
+.. code-block:: cpp
+
+    class MyClass {
+        Eigen::MatrixXd big_mat = Eigen::MatrixXd::Zero(10000, 10000);
+    public:
+        Eigen::MatrixXd &getMatrix() { return big_mat; }
+        const Eigen::MatrixXd &viewMatrix() { return big_mat; }
+    };
+
+    // Later, in binding code:
+    py::class_<MyClass>(m, "MyClass")
+        .def(py::init<>())
+        .def("copy_matrix", &MyClass::getMatrix) // Makes a copy!
+        .def("get_matrix", &MyClass::getMatrix, py::return_value_policy::reference_internal)
+        .def("view_matrix", &MyClass::viewMatrix, py::return_value_policy::reference_internal)
+        ;
+
+.. code-block:: python
+
+    a = MyClass()
+    m = a.get_matrix()   # flags.writeable = True,  flags.owndata = False
+    v = a.view_matrix()  # flags.writeable = False, flags.owndata = False
+    c = a.copy_matrix()  # flags.writeable = True,  flags.owndata = True
+    # m[5,6] and v[5,6] refer to the same element, c[5,6] does not.
+
+Note in this example that ``py::return_value_policy::reference_internal`` is
+used to tie the life of the MyClass object to the life of the returned arrays.
+
+You may also return an ``Eigen::Ref``, ``Eigen::Map`` or other map-like Eigen
+object (for example, the return value of ``matrix.block()`` and related
+methods) that map into a dense Eigen type.  When doing so, the default
+behaviour of pybind11 is to simply reference the returned data: you must take
+care to ensure that this data remains valid!  You may ask pybind11 to
+explicitly *copy* such a return value by using the
+``py::return_value_policy::copy`` policy when binding the function.  You may
+also use ``py::return_value_policy::reference_internal`` or a
+``py::keep_alive`` to ensure the data stays valid as long as the returned numpy
+array does.
+
+When returning such a reference of map, pybind11 additionally respects the
+readonly-status of the returned value, marking the numpy array as non-writeable
+if the reference or map was itself read-only.
+
+.. note::
+
+    Sparse types are always copied when returned.
+
+.. _storage_orders:
+
+Storage orders
+==============
+
+Passing arguments via ``Eigen::Ref`` has some limitations that you must be
+aware of in order to effectively pass matrices by reference.  First and
+foremost is that the default ``Eigen::Ref<MatrixType>`` class requires
+contiguous storage along columns (for column-major types, the default in Eigen)
+or rows if ``MatrixType`` is specifically an ``Eigen::RowMajor`` storage type.
+The former, Eigen's default, is incompatible with ``numpy``'s default row-major
+storage, and so you will not be able to pass numpy arrays to Eigen by reference
+without making one of two changes.
+
+(Note that this does not apply to vectors (or column or row matrices): for such
+types the "row-major" and "column-major" distinction is meaningless).
+
+The first approach is to change the use of ``Eigen::Ref<MatrixType>`` to the
+more general ``Eigen::Ref<MatrixType, 0, Eigen::Stride<Eigen::Dynamic,
+Eigen::Dynamic>>`` (or similar type with a fully dynamic stride type in the
+third template argument).  Since this is a rather cumbersome type, pybind11
+provides a ``py::EigenDRef<MatrixType>`` type alias for your convenience (along
+with EigenDMap for the equivalent Map, and EigenDStride for just the stride
+type).
+
+This type allows Eigen to map into any arbitrary storage order.  This is not
+the default in Eigen for performance reasons: contiguous storage allows
+vectorization that cannot be done when storage is not known to be contiguous at
+compile time.  The default ``Eigen::Ref`` stride type allows non-contiguous
+storage along the outer dimension (that is, the rows of a column-major matrix
+or columns of a row-major matrix), but not along the inner dimension.
+
+This type, however, has the added benefit of also being able to map numpy array
+slices.  For example, the following (contrived) example uses Eigen with a numpy
+slice to multiply by 2 all coefficients that are both on even rows (0, 2, 4,
+...) and in columns 2, 5, or 8:
+
+.. code-block:: cpp
+
+    m.def("scale", [](py::EigenDRef<Eigen::MatrixXd> m, double c) { m *= c; });
+
+.. code-block:: python
+
+    # a = np.array(...)
+    scale_by_2(myarray[0::2, 2:9:3])
+
+The second approach to avoid copying is more intrusive: rearranging the
+underlying data types to not run into the non-contiguous storage problem in the
+first place.  In particular, that means using matrices with ``Eigen::RowMajor``
+storage, where appropriate, such as:
+
+.. code-block:: cpp
+
+    using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    // Use RowMatrixXd instead of MatrixXd
+
+Now bound functions accepting ``Eigen::Ref<RowMatrixXd>`` arguments will be
+callable with numpy's (default) arrays without involving a copying.
+
+You can, alternatively, change the storage order that numpy arrays use by
+adding the ``order='F'`` option when creating an array:
+
+.. code-block:: python
+
+    myarray = np.array(source, order='F')
+
+Such an object will be passable to a bound function accepting an
+``Eigen::Ref<MatrixXd>`` (or similar column-major Eigen type).
+
+One major caveat with this approach, however, is that it is not entirely as
+easy as simply flipping all Eigen or numpy usage from one to the other: some
+operations may alter the storage order of a numpy array.  For example, ``a2 =
+array.transpose()`` results in ``a2`` being a view of ``array`` that references
+the same data, but in the opposite storage order!
+
+While this approach allows fully optimized vectorized calculations in Eigen, it
+cannot be used with array slices, unlike the first approach.
+
+When *returning* a matrix to Python (either a regular matrix, a reference via
+``Eigen::Ref<>``, or a map/block into a matrix), no special storage
+consideration is required: the created numpy array will have the required
+stride that allows numpy to properly interpret the array, whatever its storage
+order.
+
+Failing rather than copying
+===========================
+
+The default behaviour when binding ``Eigen::Ref<const MatrixType>`` Eigen
+references is to copy matrix values when passed a numpy array that does not
+conform to the element type of ``MatrixType`` or does not have a compatible
+stride layout.  If you want to explicitly avoid copying in such a case, you
+should bind arguments using the ``py::arg().noconvert()`` annotation (as
+described in the :ref:`nonconverting_arguments` documentation).
+
+The following example shows an example of arguments that don't allow data
+copying to take place:
+
+.. code-block:: cpp
+
+    // The method and function to be bound:
+    class MyClass {
+        // ...
+        double some_method(const Eigen::Ref<const MatrixXd> &matrix) { /* ... */ }
+    };
+    float some_function(const Eigen::Ref<const MatrixXf> &big,
+                        const Eigen::Ref<const MatrixXf> &small) {
+        // ...
+    }
+
+    // The associated binding code:
+    using namespace pybind11::literals; // for "arg"_a
+    py::class_<MyClass>(m, "MyClass")
+        // ... other class definitions
+        .def("some_method", &MyClass::some_method, py::arg().noconvert());
+
+    m.def("some_function", &some_function,
+        "big"_a.noconvert(), // <- Don't allow copying for this arg
+        "small"_a            // <- This one can be copied if needed
+    );
+
+With the above binding code, attempting to call the the ``some_method(m)``
+method on a ``MyClass`` object, or attempting to call ``some_function(m, m2)``
+will raise a ``RuntimeError`` rather than making a temporary copy of the array.
+It will, however, allow the ``m2`` argument to be copied into a temporary if
+necessary.
+
+Note that explicitly specifying ``.noconvert()`` is not required for *mutable*
+Eigen references (e.g. ``Eigen::Ref<MatrixXd>`` without ``const`` on the
+``MatrixXd``): mutable references will never be called with a temporary copy.
+
+Vectors versus column/row matrices
+==================================
+
+Eigen and numpy have fundamentally different notions of a vector.  In Eigen, a
+vector is simply a matrix with the number of columns or rows set to 1 at
+compile time (for a column vector or row vector, respectively).  Numpy, in
+contrast, has comparable 2-dimensional 1xN and Nx1 arrays, but *also* has
+1-dimensional arrays of size N.
+
+When passing a 2-dimensional 1xN or Nx1 array to Eigen, the Eigen type must
+have matching dimensions: That is, you cannot pass a 2-dimensional Nx1 numpy
+array to an Eigen value expecting a row vector, or a 1xN numpy array as a
+column vector argument.
+
+On the other hand, pybind11 allows you to pass 1-dimensional arrays of length N
+as Eigen parameters.  If the Eigen type can hold a column vector of length N it
+will be passed as such a column vector.  If not, but the Eigen type constraints
+will accept a row vector, it will be passed as a row vector.  (The column
+vector takes precedence when both are supported, for example, when passing a
+1D numpy array to a MatrixXd argument).  Note that the type need not be
+explicitly a vector: it is permitted to pass a 1D numpy array of size 5 to an
+Eigen ``Matrix<double, Dynamic, 5>``: you would end up with a 1x5 Eigen matrix.
+Passing the same to an ``Eigen::MatrixXd`` would result in a 5x1 Eigen matrix.
+
+When returning an Eigen vector to numpy, the conversion is ambiguous: a row
+vector of length 4 could be returned as either a 1D array of length 4, or as a
+2D array of size 1x4.  When encountering such a situation, pybind11 compromises
+by considering the returned Eigen type: if it is a compile-time vector--that
+is, the type has either the number of rows or columns set to 1 at compile
+time--pybind11 converts to a 1D numpy array when returning the value.  For
+instances that are a vector only at run-time (e.g. ``MatrixXd``,
+``Matrix<float, Dynamic, 4>``), pybind11 returns the vector as a 2D array to
+numpy.  If this isn't want you want, you can use ``array.reshape(...)`` to get
+a view of the same data in the desired dimensions.
+
+.. seealso::
+
+    The file :file:`tests/test_eigen.cpp` contains a complete example that
+    shows how to pass Eigen sparse and dense data types in more detail.
diff --git a/pybind11/docs/advanced/cast/functional.rst b/pybind11/docs/advanced/cast/functional.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d9b46057598f0d182422accd088fbff9785b0b53
--- /dev/null
+++ b/pybind11/docs/advanced/cast/functional.rst
@@ -0,0 +1,109 @@
+Functional
+##########
+
+The following features must be enabled by including :file:`pybind11/functional.h`.
+
+
+Callbacks and passing anonymous functions
+=========================================
+
+The C++11 standard brought lambda functions and the generic polymorphic
+function wrapper ``std::function<>`` to the C++ programming language, which
+enable powerful new ways of working with functions. Lambda functions come in
+two flavors: stateless lambda function resemble classic function pointers that
+link to an anonymous piece of code, while stateful lambda functions
+additionally depend on captured variables that are stored in an anonymous
+*lambda closure object*.
+
+Here is a simple example of a C++ function that takes an arbitrary function
+(stateful or stateless) with signature ``int -> int`` as an argument and runs
+it with the value 10.
+
+.. code-block:: cpp
+
+    int func_arg(const std::function<int(int)> &f) {
+        return f(10);
+    }
+
+The example below is more involved: it takes a function of signature ``int -> int``
+and returns another function of the same kind. The return value is a stateful
+lambda function, which stores the value ``f`` in the capture object and adds 1 to
+its return value upon execution.
+
+.. code-block:: cpp
+
+    std::function<int(int)> func_ret(const std::function<int(int)> &f) {
+        return [f](int i) {
+            return f(i) + 1;
+        };
+    }
+
+This example demonstrates using python named parameters in C++ callbacks which
+requires using ``py::cpp_function`` as a wrapper. Usage is similar to defining
+methods of classes:
+
+.. code-block:: cpp
+
+    py::cpp_function func_cpp() {
+        return py::cpp_function([](int i) { return i+1; },
+           py::arg("number"));
+    }
+
+After including the extra header file :file:`pybind11/functional.h`, it is almost
+trivial to generate binding code for all of these functions.
+
+.. code-block:: cpp
+
+    #include <pybind11/functional.h>
+
+    PYBIND11_MODULE(example, m) {
+        m.def("func_arg", &func_arg);
+        m.def("func_ret", &func_ret);
+        m.def("func_cpp", &func_cpp);
+    }
+
+The following interactive session shows how to call them from Python.
+
+.. code-block:: pycon
+
+    $ python
+    >>> import example
+    >>> def square(i):
+    ...     return i * i
+    ...
+    >>> example.func_arg(square)
+    100L
+    >>> square_plus_1 = example.func_ret(square)
+    >>> square_plus_1(4)
+    17L
+    >>> plus_1 = func_cpp()
+    >>> plus_1(number=43)
+    44L
+
+.. warning::
+
+    Keep in mind that passing a function from C++ to Python (or vice versa)
+    will instantiate a piece of wrapper code that translates function
+    invocations between the two languages. Naturally, this translation
+    increases the computational cost of each function call somewhat. A
+    problematic situation can arise when a function is copied back and forth
+    between Python and C++ many times in a row, in which case the underlying
+    wrappers will accumulate correspondingly. The resulting long sequence of
+    C++ -> Python -> C++ -> ... roundtrips can significantly decrease
+    performance.
+
+    There is one exception: pybind11 detects case where a stateless function
+    (i.e. a function pointer or a lambda function without captured variables)
+    is passed as an argument to another C++ function exposed in Python. In this
+    case, there is no overhead. Pybind11 will extract the underlying C++
+    function pointer from the wrapped function to sidestep a potential C++ ->
+    Python -> C++ roundtrip. This is demonstrated in :file:`tests/test_callbacks.cpp`.
+
+.. note::
+
+    This functionality is very useful when generating bindings for callbacks in
+    C++ libraries (e.g. GUI libraries, asynchronous networking libraries, etc.).
+
+    The file :file:`tests/test_callbacks.cpp` contains a complete example
+    that demonstrates how to work with callbacks and anonymous functions in
+    more detail.
diff --git a/pybind11/docs/advanced/cast/index.rst b/pybind11/docs/advanced/cast/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..724585c9202f6fe7cd03daa5cf4df4e1daf0ffe7
--- /dev/null
+++ b/pybind11/docs/advanced/cast/index.rst
@@ -0,0 +1,41 @@
+Type conversions
+################
+
+Apart from enabling cross-language function calls, a fundamental problem
+that a binding tool like pybind11 must address is to provide access to
+native Python types in C++ and vice versa. There are three fundamentally
+different ways to do this—which approach is preferable for a particular type
+depends on the situation at hand.
+
+1. Use a native C++ type everywhere. In this case, the type must be wrapped
+   using pybind11-generated bindings so that Python can interact with it.
+
+2. Use a native Python type everywhere. It will need to be wrapped so that
+   C++ functions can interact with it.
+
+3. Use a native C++ type on the C++ side and a native Python type on the
+   Python side. pybind11 refers to this as a *type conversion*.
+
+   Type conversions are the most "natural" option in the sense that native
+   (non-wrapped) types are used everywhere. The main downside is that a copy
+   of the data must be made on every Python ↔ C++ transition: this is
+   needed since the C++ and Python versions of the same type generally won't
+   have the same memory layout.
+
+   pybind11 can perform many kinds of conversions automatically. An overview
+   is provided in the table ":ref:`conversion_table`".
+
+The following subsections discuss the differences between these options in more
+detail. The main focus in this section is on type conversions, which represent
+the last case of the above list.
+
+.. toctree::
+   :maxdepth: 1
+
+   overview
+   strings
+   stl
+   functional
+   chrono
+   eigen
+   custom
diff --git a/pybind11/docs/advanced/cast/overview.rst b/pybind11/docs/advanced/cast/overview.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b0e32a52f9c8bb2945e230416216c38e3c3a4a9b
--- /dev/null
+++ b/pybind11/docs/advanced/cast/overview.rst
@@ -0,0 +1,165 @@
+Overview
+########
+
+.. rubric:: 1. Native type in C++, wrapper in Python
+
+Exposing a custom C++ type using :class:`py::class_` was covered in detail
+in the :doc:`/classes` section. There, the underlying data structure is
+always the original C++ class while the :class:`py::class_` wrapper provides
+a Python interface. Internally, when an object like this is sent from C++ to
+Python, pybind11 will just add the outer wrapper layer over the native C++
+object. Getting it back from Python is just a matter of peeling off the
+wrapper.
+
+.. rubric:: 2. Wrapper in C++, native type in Python
+
+This is the exact opposite situation. Now, we have a type which is native to
+Python, like a ``tuple`` or a ``list``. One way to get this data into C++ is
+with the :class:`py::object` family of wrappers. These are explained in more
+detail in the :doc:`/advanced/pycpp/object` section. We'll just give a quick
+example here:
+
+.. code-block:: cpp
+
+    void print_list(py::list my_list) {
+        for (auto item : my_list)
+            std::cout << item << " ";
+    }
+
+.. code-block:: pycon
+
+    >>> print_list([1, 2, 3])
+    1 2 3
+
+The Python ``list`` is not converted in any way -- it's just wrapped in a C++
+:class:`py::list` class. At its core it's still a Python object. Copying a
+:class:`py::list` will do the usual reference-counting like in Python.
+Returning the object to Python will just remove the thin wrapper.
+
+.. rubric:: 3. Converting between native C++ and Python types
+
+In the previous two cases we had a native type in one language and a wrapper in
+the other. Now, we have native types on both sides and we convert between them.
+
+.. code-block:: cpp
+
+    void print_vector(const std::vector<int> &v) {
+        for (auto item : v)
+            std::cout << item << "\n";
+    }
+
+.. code-block:: pycon
+
+    >>> print_vector([1, 2, 3])
+    1 2 3
+
+In this case, pybind11 will construct a new ``std::vector<int>`` and copy each
+element from the Python ``list``. The newly constructed object will be passed
+to ``print_vector``. The same thing happens in the other direction: a new
+``list`` is made to match the value returned from C++.
+
+Lots of these conversions are supported out of the box, as shown in the table
+below. They are very convenient, but keep in mind that these conversions are
+fundamentally based on copying data. This is perfectly fine for small immutable
+types but it may become quite expensive for large data structures. This can be
+avoided by overriding the automatic conversion with a custom wrapper (i.e. the
+above-mentioned approach 1). This requires some manual effort and more details
+are available in the :ref:`opaque` section.
+
+.. _conversion_table:
+
+List of all builtin conversions
+-------------------------------
+
+The following basic data types are supported out of the box (some may require
+an additional extension header to be included). To pass other data structures
+as arguments and return values, refer to the section on binding :ref:`classes`.
+
++------------------------------------+---------------------------+-------------------------------+
+|  Data type                         |  Description              | Header file                   |
++====================================+===========================+===============================+
+| ``int8_t``, ``uint8_t``            | 8-bit integers            | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int16_t``, ``uint16_t``          | 16-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int32_t``, ``uint32_t``          | 32-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int64_t``, ``uint64_t``          | 64-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``ssize_t``, ``size_t``            | Platform-dependent size   | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``float``, ``double``              | Floating point types      | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``bool``                           | Two-state Boolean type    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char``                           | Character literal         | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char16_t``                       | UTF-16 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char32_t``                       | UTF-32 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``wchar_t``                        | Wide character literal    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char *``                   | UTF-8 string literal      | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char16_t *``               | UTF-16 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char32_t *``               | UTF-32 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const wchar_t *``                | Wide string literal       | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::string``                    | STL dynamic UTF-8 string  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::u16string``                 | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::u32string``                 | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::wstring``                   | STL dynamic wide string   | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::string_view``,              | STL C++17 string views    | :file:`pybind11/pybind11.h`   |
+| ``std::u16string_view``, etc.      |                           |                               |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::pair<T1, T2>``              | Pair of two custom types  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::tuple<...>``                | Arbitrary tuple of types  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::reference_wrapper<...>``    | Reference type wrapper    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::complex<T>``                | Complex numbers           | :file:`pybind11/complex.h`    |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::array<T, Size>``            | STL static array          | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::vector<T>``                 | STL dynamic array         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::deque<T>``                  | STL double-ended queue    | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::valarray<T>``               | STL value array           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::list<T>``                   | STL linked list           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::map<T1, T2>``               | STL ordered map           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::unordered_map<T1, T2>``     | STL unordered map         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::set<T>``                    | STL ordered set           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::unordered_set<T>``          | STL unordered set         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::optional<T>``               | STL optional type (C++17) | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::experimental::optional<T>`` | STL optional type (exp.)  | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::variant<...>``              | Type-safe union (C++17)   | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::function<...>``             | STL polymorphic function  | :file:`pybind11/functional.h` |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::chrono::duration<...>``     | STL time duration         | :file:`pybind11/chrono.h`     |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::chrono::time_point<...>``   | STL date/time             | :file:`pybind11/chrono.h`     |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::Matrix<...>``             | Eigen: dense matrix       | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::Map<...>``                | Eigen: mapped memory      | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::SparseMatrix<...>``       | Eigen: sparse matrix      | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+
diff --git a/pybind11/docs/advanced/cast/stl.rst b/pybind11/docs/advanced/cast/stl.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e48409f025d021b35e4e26f4fee754b2d858daa4
--- /dev/null
+++ b/pybind11/docs/advanced/cast/stl.rst
@@ -0,0 +1,240 @@
+STL containers
+##############
+
+Automatic conversion
+====================
+
+When including the additional header file :file:`pybind11/stl.h`, conversions
+between ``std::vector<>``/``std::deque<>``/``std::list<>``/``std::array<>``,
+``std::set<>``/``std::unordered_set<>``, and
+``std::map<>``/``std::unordered_map<>`` and the Python ``list``, ``set`` and
+``dict`` data structures are automatically enabled. The types ``std::pair<>``
+and ``std::tuple<>`` are already supported out of the box with just the core
+:file:`pybind11/pybind11.h` header.
+
+The major downside of these implicit conversions is that containers must be
+converted (i.e. copied) on every Python->C++ and C++->Python transition, which
+can have implications on the program semantics and performance. Please read the
+next sections for more details and alternative approaches that avoid this.
+
+.. note::
+
+    Arbitrary nesting of any of these types is possible.
+
+.. seealso::
+
+    The file :file:`tests/test_stl.cpp` contains a complete
+    example that demonstrates how to pass STL data types in more detail.
+
+.. _cpp17_container_casters:
+
+C++17 library containers
+========================
+
+The :file:`pybind11/stl.h` header also includes support for ``std::optional<>``
+and ``std::variant<>``. These require a C++17 compiler and standard library.
+In C++14 mode, ``std::experimental::optional<>`` is supported if available.
+
+Various versions of these containers also exist for C++11 (e.g. in Boost).
+pybind11 provides an easy way to specialize the ``type_caster`` for such
+types:
+
+.. code-block:: cpp
+
+    // `boost::optional` as an example -- can be any `std::optional`-like container
+    namespace pybind11 { namespace detail {
+        template <typename T>
+        struct type_caster<boost::optional<T>> : optional_caster<boost::optional<T>> {};
+    }}
+
+The above should be placed in a header file and included in all translation units
+where automatic conversion is needed. Similarly, a specialization can be provided
+for custom variant types:
+
+.. code-block:: cpp
+
+    // `boost::variant` as an example -- can be any `std::variant`-like container
+    namespace pybind11 { namespace detail {
+        template <typename... Ts>
+        struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> {};
+
+        // Specifies the function used to visit the variant -- `apply_visitor` instead of `visit`
+        template <>
+        struct visit_helper<boost::variant> {
+            template <typename... Args>
+            static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+                return boost::apply_visitor(args...);
+            }
+        };
+    }} // namespace pybind11::detail
+
+The ``visit_helper`` specialization is not required if your ``name::variant`` provides
+a ``name::visit()`` function. For any other function name, the specialization must be
+included to tell pybind11 how to visit the variant.
+
+.. note::
+
+    pybind11 only supports the modern implementation of ``boost::variant``
+    which makes use of variadic templates. This requires Boost 1.56 or newer.
+    Additionally, on Windows, MSVC 2017 is required because ``boost::variant``
+    falls back to the old non-variadic implementation on MSVC 2015.
+
+.. _opaque:
+
+Making opaque types
+===================
+
+pybind11 heavily relies on a template matching mechanism to convert parameters
+and return values that are constructed from STL data types such as vectors,
+linked lists, hash tables, etc. This even works in a recursive manner, for
+instance to deal with lists of hash maps of pairs of elementary and custom
+types, etc.
+
+However, a fundamental limitation of this approach is that internal conversions
+between Python and C++ types involve a copy operation that prevents
+pass-by-reference semantics. What does this mean?
+
+Suppose we bind the following function
+
+.. code-block:: cpp
+
+    void append_1(std::vector<int> &v) {
+       v.push_back(1);
+    }
+
+and call it from Python, the following happens:
+
+.. code-block:: pycon
+
+   >>> v = [5, 6]
+   >>> append_1(v)
+   >>> print(v)
+   [5, 6]
+
+As you can see, when passing STL data structures by reference, modifications
+are not propagated back the Python side. A similar situation arises when
+exposing STL data structures using the ``def_readwrite`` or ``def_readonly``
+functions:
+
+.. code-block:: cpp
+
+    /* ... definition ... */
+
+    class MyClass {
+        std::vector<int> contents;
+    };
+
+    /* ... binding code ... */
+
+    py::class_<MyClass>(m, "MyClass")
+        .def(py::init<>())
+        .def_readwrite("contents", &MyClass::contents);
+
+In this case, properties can be read and written in their entirety. However, an
+``append`` operation involving such a list type has no effect:
+
+.. code-block:: pycon
+
+   >>> m = MyClass()
+   >>> m.contents = [5, 6]
+   >>> print(m.contents)
+   [5, 6]
+   >>> m.contents.append(7)
+   >>> print(m.contents)
+   [5, 6]
+
+Finally, the involved copy operations can be costly when dealing with very
+large lists. To deal with all of the above situations, pybind11 provides a
+macro named ``PYBIND11_MAKE_OPAQUE(T)`` that disables the template-based
+conversion machinery of types, thus rendering them *opaque*. The contents of
+opaque objects are never inspected or extracted, hence they *can* be passed by
+reference. For instance, to turn ``std::vector<int>`` into an opaque type, add
+the declaration
+
+.. code-block:: cpp
+
+    PYBIND11_MAKE_OPAQUE(std::vector<int>);
+
+before any binding code (e.g. invocations to ``class_::def()``, etc.). This
+macro must be specified at the top level (and outside of any namespaces), since
+it instantiates a partial template overload. If your binding code consists of
+multiple compilation units, it must be present in every file (typically via a
+common header) preceding any usage of ``std::vector<int>``. Opaque types must
+also have a corresponding ``class_`` declaration to associate them with a name
+in Python, and to define a set of available operations, e.g.:
+
+.. code-block:: cpp
+
+    py::class_<std::vector<int>>(m, "IntVector")
+        .def(py::init<>())
+        .def("clear", &std::vector<int>::clear)
+        .def("pop_back", &std::vector<int>::pop_back)
+        .def("__len__", [](const std::vector<int> &v) { return v.size(); })
+        .def("__iter__", [](std::vector<int> &v) {
+           return py::make_iterator(v.begin(), v.end());
+        }, py::keep_alive<0, 1>()) /* Keep vector alive while iterator is used */
+        // ....
+
+.. seealso::
+
+    The file :file:`tests/test_opaque_types.cpp` contains a complete
+    example that demonstrates how to create and expose opaque types using
+    pybind11 in more detail.
+
+.. _stl_bind:
+
+Binding STL containers
+======================
+
+The ability to expose STL containers as native Python objects is a fairly
+common request, hence pybind11 also provides an optional header file named
+:file:`pybind11/stl_bind.h` that does exactly this. The mapped containers try
+to match the behavior of their native Python counterparts as much as possible.
+
+The following example showcases usage of :file:`pybind11/stl_bind.h`:
+
+.. code-block:: cpp
+
+    // Don't forget this
+    #include <pybind11/stl_bind.h>
+
+    PYBIND11_MAKE_OPAQUE(std::vector<int>);
+    PYBIND11_MAKE_OPAQUE(std::map<std::string, double>);
+
+    // ...
+
+    // later in binding code:
+    py::bind_vector<std::vector<int>>(m, "VectorInt");
+    py::bind_map<std::map<std::string, double>>(m, "MapStringDouble");
+
+When binding STL containers pybind11 considers the types of the container's
+elements to decide whether the container should be confined to the local module
+(via the :ref:`module_local` feature).  If the container element types are
+anything other than already-bound custom types bound without
+``py::module_local()`` the container binding will have ``py::module_local()``
+applied.  This includes converting types such as numeric types, strings, Eigen
+types; and types that have not yet been bound at the time of the stl container
+binding.  This module-local binding is designed to avoid potential conflicts
+between module bindings (for example, from two separate modules each attempting
+to bind ``std::vector<int>`` as a python type).
+
+It is possible to override this behavior to force a definition to be either
+module-local or global.  To do so, you can pass the attributes
+``py::module_local()`` (to make the binding module-local) or
+``py::module_local(false)`` (to make the binding global) into the
+``py::bind_vector`` or ``py::bind_map`` arguments:
+
+.. code-block:: cpp
+
+    py::bind_vector<std::vector<int>>(m, "VectorInt", py::module_local(false));
+
+Note, however, that such a global binding would make it impossible to load this
+module at the same time as any other pybind module that also attempts to bind
+the same container type (``std::vector<int>`` in the above example).
+
+See :ref:`module_local` for more details on module-local bindings.
+
+.. seealso::
+
+    The file :file:`tests/test_stl_binders.cpp` shows how to use the
+    convenience STL container wrappers.
diff --git a/pybind11/docs/advanced/cast/strings.rst b/pybind11/docs/advanced/cast/strings.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e25701ecabd80142f4fd705f5419ef7c10cc6c56
--- /dev/null
+++ b/pybind11/docs/advanced/cast/strings.rst
@@ -0,0 +1,305 @@
+Strings, bytes and Unicode conversions
+######################################
+
+.. note::
+
+    This section discusses string handling in terms of Python 3 strings. For
+    Python 2.7, replace all occurrences of ``str`` with ``unicode`` and
+    ``bytes`` with ``str``.  Python 2.7 users may find it best to use ``from
+    __future__ import unicode_literals`` to avoid unintentionally using ``str``
+    instead of ``unicode``.
+
+Passing Python strings to C++
+=============================
+
+When a Python ``str`` is passed from Python to a C++ function that accepts
+``std::string`` or ``char *`` as arguments, pybind11 will encode the Python
+string to UTF-8. All Python ``str`` can be encoded in UTF-8, so this operation
+does not fail.
+
+The C++ language is encoding agnostic. It is the responsibility of the
+programmer to track encodings. It's often easiest to simply `use UTF-8
+everywhere <http://utf8everywhere.org/>`_.
+
+.. code-block:: c++
+
+    m.def("utf8_test",
+        [](const std::string &s) {
+            cout << "utf-8 is icing on the cake.\n";
+            cout << s;
+        }
+    );
+    m.def("utf8_charptr",
+        [](const char *s) {
+            cout << "My favorite food is\n";
+            cout << s;
+        }
+    );
+
+.. code-block:: python
+
+    >>> utf8_test('🎂')
+    utf-8 is icing on the cake.
+    🎂
+
+    >>> utf8_charptr('🍕')
+    My favorite food is
+    🍕
+
+.. note::
+
+    Some terminal emulators do not support UTF-8 or emoji fonts and may not
+    display the example above correctly.
+
+The results are the same whether the C++ function accepts arguments by value or
+reference, and whether or not ``const`` is used.
+
+Passing bytes to C++
+--------------------
+
+A Python ``bytes`` object will be passed to C++ functions that accept
+``std::string`` or ``char*`` *without* conversion.  On Python 3, in order to
+make a function *only* accept ``bytes`` (and not ``str``), declare it as taking
+a ``py::bytes`` argument.
+
+
+Returning C++ strings to Python
+===============================
+
+When a C++ function returns a ``std::string`` or ``char*`` to a Python caller,
+**pybind11 will assume that the string is valid UTF-8** and will decode it to a
+native Python ``str``, using the same API as Python uses to perform
+``bytes.decode('utf-8')``. If this implicit conversion fails, pybind11 will
+raise a ``UnicodeDecodeError``.
+
+.. code-block:: c++
+
+    m.def("std_string_return",
+        []() {
+            return std::string("This string needs to be UTF-8 encoded");
+        }
+    );
+
+.. code-block:: python
+
+    >>> isinstance(example.std_string_return(), str)
+    True
+
+
+Because UTF-8 is inclusive of pure ASCII, there is never any issue with
+returning a pure ASCII string to Python. If there is any possibility that the
+string is not pure ASCII, it is necessary to ensure the encoding is valid
+UTF-8.
+
+.. warning::
+
+    Implicit conversion assumes that a returned ``char *`` is null-terminated.
+    If there is no null terminator a buffer overrun will occur.
+
+Explicit conversions
+--------------------
+
+If some C++ code constructs a ``std::string`` that is not a UTF-8 string, one
+can perform a explicit conversion and return a ``py::str`` object. Explicit
+conversion has the same overhead as implicit conversion.
+
+.. code-block:: c++
+
+    // This uses the Python C API to convert Latin-1 to Unicode
+    m.def("str_output",
+        []() {
+            std::string s = "Send your r\xe9sum\xe9 to Alice in HR"; // Latin-1
+            py::str py_s = PyUnicode_DecodeLatin1(s.data(), s.length());
+            return py_s;
+        }
+    );
+
+.. code-block:: python
+
+    >>> str_output()
+    'Send your résumé to Alice in HR'
+
+The `Python C API
+<https://docs.python.org/3/c-api/unicode.html#built-in-codecs>`_ provides
+several built-in codecs.
+
+
+One could also use a third party encoding library such as libiconv to transcode
+to UTF-8.
+
+Return C++ strings without conversion
+-------------------------------------
+
+If the data in a C++ ``std::string`` does not represent text and should be
+returned to Python as ``bytes``, then one can return the data as a
+``py::bytes`` object.
+
+.. code-block:: c++
+
+    m.def("return_bytes",
+        []() {
+            std::string s("\xba\xd0\xba\xd0");  // Not valid UTF-8
+            return py::bytes(s);  // Return the data without transcoding
+        }
+    );
+
+.. code-block:: python
+
+    >>> example.return_bytes()
+    b'\xba\xd0\xba\xd0'
+
+
+Note the asymmetry: pybind11 will convert ``bytes`` to ``std::string`` without
+encoding, but cannot convert ``std::string`` back to ``bytes`` implicitly.
+
+.. code-block:: c++
+
+    m.def("asymmetry",
+        [](std::string s) {  // Accepts str or bytes from Python
+            return s;  // Looks harmless, but implicitly converts to str
+        }
+    );
+
+.. code-block:: python
+
+    >>> isinstance(example.asymmetry(b"have some bytes"), str)
+    True
+
+    >>> example.asymmetry(b"\xba\xd0\xba\xd0")  # invalid utf-8 as bytes
+    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xba in position 0: invalid start byte
+
+
+Wide character strings
+======================
+
+When a Python ``str`` is passed to a C++ function expecting ``std::wstring``,
+``wchar_t*``, ``std::u16string`` or ``std::u32string``, the ``str`` will be
+encoded to UTF-16 or UTF-32 depending on how the C++ compiler implements each
+type, in the platform's native endianness. When strings of these types are
+returned, they are assumed to contain valid UTF-16 or UTF-32, and will be
+decoded to Python ``str``.
+
+.. code-block:: c++
+
+    #define UNICODE
+    #include <windows.h>
+
+    m.def("set_window_text",
+        [](HWND hwnd, std::wstring s) {
+            // Call SetWindowText with null-terminated UTF-16 string
+            ::SetWindowText(hwnd, s.c_str());
+        }
+    );
+    m.def("get_window_text",
+        [](HWND hwnd) {
+            const int buffer_size = ::GetWindowTextLength(hwnd) + 1;
+            auto buffer = std::make_unique< wchar_t[] >(buffer_size);
+
+            ::GetWindowText(hwnd, buffer.data(), buffer_size);
+
+            std::wstring text(buffer.get());
+
+            // wstring will be converted to Python str
+            return text;
+        }
+    );
+
+.. warning::
+
+    Wide character strings may not work as described on Python 2.7 or Python
+    3.3 compiled with ``--enable-unicode=ucs2``.
+
+Strings in multibyte encodings such as Shift-JIS must transcoded to a
+UTF-8/16/32 before being returned to Python.
+
+
+Character literals
+==================
+
+C++ functions that accept character literals as input will receive the first
+character of a Python ``str`` as their input. If the string is longer than one
+Unicode character, trailing characters will be ignored.
+
+When a character literal is returned from C++ (such as a ``char`` or a
+``wchar_t``), it will be converted to a ``str`` that represents the single
+character.
+
+.. code-block:: c++
+
+    m.def("pass_char", [](char c) { return c; });
+    m.def("pass_wchar", [](wchar_t w) { return w; });
+
+.. code-block:: python
+
+    >>> example.pass_char('A')
+    'A'
+
+While C++ will cast integers to character types (``char c = 0x65;``), pybind11
+does not convert Python integers to characters implicitly. The Python function
+``chr()`` can be used to convert integers to characters.
+
+.. code-block:: python
+
+    >>> example.pass_char(0x65)
+    TypeError
+
+    >>> example.pass_char(chr(0x65))
+    'A'
+
+If the desire is to work with an 8-bit integer, use ``int8_t`` or ``uint8_t``
+as the argument type.
+
+Grapheme clusters
+-----------------
+
+A single grapheme may be represented by two or more Unicode characters. For
+example 'é' is usually represented as U+00E9 but can also be expressed as the
+combining character sequence U+0065 U+0301 (that is, the letter 'e' followed by
+a combining acute accent). The combining character will be lost if the
+two-character sequence is passed as an argument, even though it renders as a
+single grapheme.
+
+.. code-block:: python
+
+    >>> example.pass_wchar('é')
+    'é'
+
+    >>> combining_e_acute = 'e' + '\u0301'
+
+    >>> combining_e_acute
+    'é'
+
+    >>> combining_e_acute == 'é'
+    False
+
+    >>> example.pass_wchar(combining_e_acute)
+    'e'
+
+Normalizing combining characters before passing the character literal to C++
+may resolve *some* of these issues:
+
+.. code-block:: python
+
+    >>> example.pass_wchar(unicodedata.normalize('NFC', combining_e_acute))
+    'é'
+
+In some languages (Thai for example), there are `graphemes that cannot be
+expressed as a single Unicode code point
+<http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>`_, so there is
+no way to capture them in a C++ character type.
+
+
+C++17 string views
+==================
+
+C++17 string views are automatically supported when compiling in C++17 mode.
+They follow the same rules for encoding and decoding as the corresponding STL
+string type (for example, a ``std::u16string_view`` argument will be passed
+UTF-16-encoded data, and a returned ``std::string_view`` will be decoded as
+UTF-8).
+
+References
+==========
+
+* `The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!) <https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/>`_
+* `C++ - Using STL Strings at Win32 API Boundaries <https://msdn.microsoft.com/en-ca/magazine/mt238407.aspx>`_
diff --git a/pybind11/docs/advanced/classes.rst b/pybind11/docs/advanced/classes.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f4efc68f8b4b5d8ed70527de064560395fe4ed00
--- /dev/null
+++ b/pybind11/docs/advanced/classes.rst
@@ -0,0 +1,1234 @@
+Classes
+#######
+
+This section presents advanced binding code for classes and it is assumed
+that you are already familiar with the basics from :doc:`/classes`.
+
+.. _overriding_virtuals:
+
+Overriding virtual functions in Python
+======================================
+
+Suppose that a C++ class or interface has a virtual function that we'd like to
+to override from within Python (we'll focus on the class ``Animal``; ``Dog`` is
+given as a specific example of how one would do this with traditional C++
+code).
+
+.. code-block:: cpp
+
+    class Animal {
+    public:
+        virtual ~Animal() { }
+        virtual std::string go(int n_times) = 0;
+    };
+
+    class Dog : public Animal {
+    public:
+        std::string go(int n_times) override {
+            std::string result;
+            for (int i=0; i<n_times; ++i)
+                result += "woof! ";
+            return result;
+        }
+    };
+
+Let's also suppose that we are given a plain function which calls the
+function ``go()`` on an arbitrary ``Animal`` instance.
+
+.. code-block:: cpp
+
+    std::string call_go(Animal *animal) {
+        return animal->go(3);
+    }
+
+Normally, the binding code for these classes would look as follows:
+
+.. code-block:: cpp
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Animal>(m, "Animal")
+            .def("go", &Animal::go);
+
+        py::class_<Dog, Animal>(m, "Dog")
+            .def(py::init<>());
+
+        m.def("call_go", &call_go);
+    }
+
+However, these bindings are impossible to extend: ``Animal`` is not
+constructible, and we clearly require some kind of "trampoline" that
+redirects virtual calls back to Python.
+
+Defining a new type of ``Animal`` from within Python is possible but requires a
+helper class that is defined as follows:
+
+.. code-block:: cpp
+
+    class PyAnimal : public Animal {
+    public:
+        /* Inherit the constructors */
+        using Animal::Animal;
+
+        /* Trampoline (need one for each virtual function) */
+        std::string go(int n_times) override {
+            PYBIND11_OVERLOAD_PURE(
+                std::string, /* Return type */
+                Animal,      /* Parent class */
+                go,          /* Name of function in C++ (must match Python name) */
+                n_times      /* Argument(s) */
+            );
+        }
+    };
+
+The macro :c:macro:`PYBIND11_OVERLOAD_PURE` should be used for pure virtual
+functions, and :c:macro:`PYBIND11_OVERLOAD` should be used for functions which have
+a default implementation.  There are also two alternate macros
+:c:macro:`PYBIND11_OVERLOAD_PURE_NAME` and :c:macro:`PYBIND11_OVERLOAD_NAME` which
+take a string-valued name argument between the *Parent class* and *Name of the
+function* slots, which defines the name of function in Python. This is required
+when the C++ and Python versions of the
+function have different names, e.g.  ``operator()`` vs ``__call__``.
+
+The binding code also needs a few minor adaptations (highlighted):
+
+.. code-block:: cpp
+    :emphasize-lines: 2,3
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Animal, PyAnimal /* <--- trampoline*/>(m, "Animal")
+            .def(py::init<>())
+            .def("go", &Animal::go);
+
+        py::class_<Dog, Animal>(m, "Dog")
+            .def(py::init<>());
+
+        m.def("call_go", &call_go);
+    }
+
+Importantly, pybind11 is made aware of the trampoline helper class by
+specifying it as an extra template argument to :class:`class_`. (This can also
+be combined with other template arguments such as a custom holder type; the
+order of template types does not matter).  Following this, we are able to
+define a constructor as usual.
+
+Bindings should be made against the actual class, not the trampoline helper class.
+
+.. code-block:: cpp
+    :emphasize-lines: 3
+
+    py::class_<Animal, PyAnimal /* <--- trampoline*/>(m, "Animal");
+        .def(py::init<>())
+        .def("go", &PyAnimal::go); /* <--- THIS IS WRONG, use &Animal::go */
+
+Note, however, that the above is sufficient for allowing python classes to
+extend ``Animal``, but not ``Dog``: see :ref:`virtual_and_inheritance` for the
+necessary steps required to providing proper overload support for inherited
+classes.
+
+The Python session below shows how to override ``Animal::go`` and invoke it via
+a virtual method call.
+
+.. code-block:: pycon
+
+    >>> from example import *
+    >>> d = Dog()
+    >>> call_go(d)
+    u'woof! woof! woof! '
+    >>> class Cat(Animal):
+    ...     def go(self, n_times):
+    ...             return "meow! " * n_times
+    ...
+    >>> c = Cat()
+    >>> call_go(c)
+    u'meow! meow! meow! '
+
+If you are defining a custom constructor in a derived Python class, you *must*
+ensure that you explicitly call the bound C++ constructor using ``__init__``,
+*regardless* of whether it is a default constructor or not. Otherwise, the
+memory for the C++ portion of the instance will be left uninitialized, which
+will generally leave the C++ instance in an invalid state and cause undefined
+behavior if the C++ instance is subsequently used.
+
+.. versionchanged:: 2.6
+   The default pybind11 metaclass will throw a ``TypeError`` when it detects
+   that ``__init__`` was not called by a derived class.
+
+Here is an example:
+
+.. code-block:: python
+
+    class Dachshund(Dog):
+        def __init__(self, name):
+            Dog.__init__(self) # Without this, a TypeError is raised.
+            self.name = name
+        def bark(self):
+            return "yap!"
+
+Note that a direct ``__init__`` constructor *should be called*, and ``super()``
+should not be used. For simple cases of linear inheritance, ``super()``
+may work, but once you begin mixing Python and C++ multiple inheritance,
+things will fall apart due to differences between Python's MRO and C++'s
+mechanisms.
+
+Please take a look at the :ref:`macro_notes` before using this feature.
+
+.. note::
+
+    When the overridden type returns a reference or pointer to a type that
+    pybind11 converts from Python (for example, numeric values, std::string,
+    and other built-in value-converting types), there are some limitations to
+    be aware of:
+
+    - because in these cases there is no C++ variable to reference (the value
+      is stored in the referenced Python variable), pybind11 provides one in
+      the PYBIND11_OVERLOAD macros (when needed) with static storage duration.
+      Note that this means that invoking the overloaded method on *any*
+      instance will change the referenced value stored in *all* instances of
+      that type.
+
+    - Attempts to modify a non-const reference will not have the desired
+      effect: it will change only the static cache variable, but this change
+      will not propagate to underlying Python instance, and the change will be
+      replaced the next time the overload is invoked.
+
+.. seealso::
+
+    The file :file:`tests/test_virtual_functions.cpp` contains a complete
+    example that demonstrates how to override virtual functions using pybind11
+    in more detail.
+
+.. _virtual_and_inheritance:
+
+Combining virtual functions and inheritance
+===========================================
+
+When combining virtual methods with inheritance, you need to be sure to provide
+an override for each method for which you want to allow overrides from derived
+python classes.  For example, suppose we extend the above ``Animal``/``Dog``
+example as follows:
+
+.. code-block:: cpp
+
+    class Animal {
+    public:
+        virtual std::string go(int n_times) = 0;
+        virtual std::string name() { return "unknown"; }
+    };
+    class Dog : public Animal {
+    public:
+        std::string go(int n_times) override {
+            std::string result;
+            for (int i=0; i<n_times; ++i)
+                result += bark() + " ";
+            return result;
+        }
+        virtual std::string bark() { return "woof!"; }
+    };
+
+then the trampoline class for ``Animal`` must, as described in the previous
+section, override ``go()`` and ``name()``, but in order to allow python code to
+inherit properly from ``Dog``, we also need a trampoline class for ``Dog`` that
+overrides both the added ``bark()`` method *and* the ``go()`` and ``name()``
+methods inherited from ``Animal`` (even though ``Dog`` doesn't directly
+override the ``name()`` method):
+
+.. code-block:: cpp
+
+    class PyAnimal : public Animal {
+    public:
+        using Animal::Animal; // Inherit constructors
+        std::string go(int n_times) override { PYBIND11_OVERLOAD_PURE(std::string, Animal, go, n_times); }
+        std::string name() override { PYBIND11_OVERLOAD(std::string, Animal, name, ); }
+    };
+    class PyDog : public Dog {
+    public:
+        using Dog::Dog; // Inherit constructors
+        std::string go(int n_times) override { PYBIND11_OVERLOAD(std::string, Dog, go, n_times); }
+        std::string name() override { PYBIND11_OVERLOAD(std::string, Dog, name, ); }
+        std::string bark() override { PYBIND11_OVERLOAD(std::string, Dog, bark, ); }
+    };
+
+.. note::
+
+    Note the trailing commas in the ``PYBIND11_OVERLOAD`` calls to ``name()``
+    and ``bark()``. These are needed to portably implement a trampoline for a
+    function that does not take any arguments. For functions that take
+    a nonzero number of arguments, the trailing comma must be omitted.
+
+A registered class derived from a pybind11-registered class with virtual
+methods requires a similar trampoline class, *even if* it doesn't explicitly
+declare or override any virtual methods itself:
+
+.. code-block:: cpp
+
+    class Husky : public Dog {};
+    class PyHusky : public Husky {
+    public:
+        using Husky::Husky; // Inherit constructors
+        std::string go(int n_times) override { PYBIND11_OVERLOAD_PURE(std::string, Husky, go, n_times); }
+        std::string name() override { PYBIND11_OVERLOAD(std::string, Husky, name, ); }
+        std::string bark() override { PYBIND11_OVERLOAD(std::string, Husky, bark, ); }
+    };
+
+There is, however, a technique that can be used to avoid this duplication
+(which can be especially helpful for a base class with several virtual
+methods).  The technique involves using template trampoline classes, as
+follows:
+
+.. code-block:: cpp
+
+    template <class AnimalBase = Animal> class PyAnimal : public AnimalBase {
+    public:
+        using AnimalBase::AnimalBase; // Inherit constructors
+        std::string go(int n_times) override { PYBIND11_OVERLOAD_PURE(std::string, AnimalBase, go, n_times); }
+        std::string name() override { PYBIND11_OVERLOAD(std::string, AnimalBase, name, ); }
+    };
+    template <class DogBase = Dog> class PyDog : public PyAnimal<DogBase> {
+    public:
+        using PyAnimal<DogBase>::PyAnimal; // Inherit constructors
+        // Override PyAnimal's pure virtual go() with a non-pure one:
+        std::string go(int n_times) override { PYBIND11_OVERLOAD(std::string, DogBase, go, n_times); }
+        std::string bark() override { PYBIND11_OVERLOAD(std::string, DogBase, bark, ); }
+    };
+
+This technique has the advantage of requiring just one trampoline method to be
+declared per virtual method and pure virtual method override.  It does,
+however, require the compiler to generate at least as many methods (and
+possibly more, if both pure virtual and overridden pure virtual methods are
+exposed, as above).
+
+The classes are then registered with pybind11 using:
+
+.. code-block:: cpp
+
+    py::class_<Animal, PyAnimal<>> animal(m, "Animal");
+    py::class_<Dog, Animal, PyDog<>> dog(m, "Dog");
+    py::class_<Husky, Dog, PyDog<Husky>> husky(m, "Husky");
+    // ... add animal, dog, husky definitions
+
+Note that ``Husky`` did not require a dedicated trampoline template class at
+all, since it neither declares any new virtual methods nor provides any pure
+virtual method implementations.
+
+With either the repeated-virtuals or templated trampoline methods in place, you
+can now create a python class that inherits from ``Dog``:
+
+.. code-block:: python
+
+    class ShihTzu(Dog):
+        def bark(self):
+            return "yip!"
+
+.. seealso::
+
+    See the file :file:`tests/test_virtual_functions.cpp` for complete examples
+    using both the duplication and templated trampoline approaches.
+
+.. _extended_aliases:
+
+Extended trampoline class functionality
+=======================================
+
+.. _extended_class_functionality_forced_trampoline:
+
+Forced trampoline class initialisation
+--------------------------------------
+The trampoline classes described in the previous sections are, by default, only
+initialized when needed.  More specifically, they are initialized when a python
+class actually inherits from a registered type (instead of merely creating an
+instance of the registered type), or when a registered constructor is only
+valid for the trampoline class but not the registered class.  This is primarily
+for performance reasons: when the trampoline class is not needed for anything
+except virtual method dispatching, not initializing the trampoline class
+improves performance by avoiding needing to do a run-time check to see if the
+inheriting python instance has an overloaded method.
+
+Sometimes, however, it is useful to always initialize a trampoline class as an
+intermediate class that does more than just handle virtual method dispatching.
+For example, such a class might perform extra class initialization, extra
+destruction operations, and might define new members and methods to enable a
+more python-like interface to a class.
+
+In order to tell pybind11 that it should *always* initialize the trampoline
+class when creating new instances of a type, the class constructors should be
+declared using ``py::init_alias<Args, ...>()`` instead of the usual
+``py::init<Args, ...>()``.  This forces construction via the trampoline class,
+ensuring member initialization and (eventual) destruction.
+
+.. seealso::
+
+    See the file :file:`tests/test_virtual_functions.cpp` for complete examples
+    showing both normal and forced trampoline instantiation.
+
+Different method signatures
+---------------------------
+The macro's introduced in :ref:`overriding_virtuals` cover most of the standard
+use cases when exposing C++ classes to Python. Sometimes it is hard or unwieldy
+to create a direct one-on-one mapping between the arguments and method return
+type.
+
+An example would be when the C++ signature contains output arguments using
+references (See also :ref:`faq_reference_arguments`). Another way of solving
+this is to use the method body of the trampoline class to do conversions to the
+input and return of the Python method.
+
+The main building block to do so is the :func:`get_overload`, this function
+allows retrieving a method implemented in Python from within the trampoline's
+methods. Consider for example a C++ method which has the signature
+``bool myMethod(int32_t& value)``, where the return indicates whether
+something should be done with the ``value``. This can be made convenient on the
+Python side by allowing the Python function to return ``None`` or an ``int``:
+
+.. code-block:: cpp
+
+    bool MyClass::myMethod(int32_t& value)
+    {
+        pybind11::gil_scoped_acquire gil;  // Acquire the GIL while in this scope.
+        // Try to look up the overloaded method on the Python side.
+        pybind11::function overload = pybind11::get_overload(this, "myMethod");
+        if (overload) {  // method is found
+            auto obj = overload(value);  // Call the Python function.
+            if (py::isinstance<py::int_>(obj)) {  // check if it returned a Python integer type
+                value = obj.cast<int32_t>();  // Cast it and assign it to the value.
+                return true;  // Return true; value should be used.
+            } else {
+                return false;  // Python returned none, return false.
+            }
+        }
+        return false;  // Alternatively return MyClass::myMethod(value);
+    }
+
+
+.. _custom_constructors:
+
+Custom constructors
+===================
+
+The syntax for binding constructors was previously introduced, but it only
+works when a constructor of the appropriate arguments actually exists on the
+C++ side.  To extend this to more general cases, pybind11 makes it possible
+to bind factory functions as constructors. For example, suppose you have a
+class like this:
+
+.. code-block:: cpp
+
+    class Example {
+    private:
+        Example(int); // private constructor
+    public:
+        // Factory function:
+        static Example create(int a) { return Example(a); }
+    };
+
+    py::class_<Example>(m, "Example")
+        .def(py::init(&Example::create));
+
+While it is possible to create a straightforward binding of the static
+``create`` method, it may sometimes be preferable to expose it as a constructor
+on the Python side. This can be accomplished by calling ``.def(py::init(...))``
+with the function reference returning the new instance passed as an argument.
+It is also possible to use this approach to bind a function returning a new
+instance by raw pointer or by the holder (e.g. ``std::unique_ptr``).
+
+The following example shows the different approaches:
+
+.. code-block:: cpp
+
+    class Example {
+    private:
+        Example(int); // private constructor
+    public:
+        // Factory function - returned by value:
+        static Example create(int a) { return Example(a); }
+
+        // These constructors are publicly callable:
+        Example(double);
+        Example(int, int);
+        Example(std::string);
+    };
+
+    py::class_<Example>(m, "Example")
+        // Bind the factory function as a constructor:
+        .def(py::init(&Example::create))
+        // Bind a lambda function returning a pointer wrapped in a holder:
+        .def(py::init([](std::string arg) {
+            return std::unique_ptr<Example>(new Example(arg));
+        }))
+        // Return a raw pointer:
+        .def(py::init([](int a, int b) { return new Example(a, b); }))
+        // You can mix the above with regular C++ constructor bindings as well:
+        .def(py::init<double>())
+        ;
+
+When the constructor is invoked from Python, pybind11 will call the factory
+function and store the resulting C++ instance in the Python instance.
+
+When combining factory functions constructors with :ref:`virtual function
+trampolines <overriding_virtuals>` there are two approaches.  The first is to
+add a constructor to the alias class that takes a base value by
+rvalue-reference.  If such a constructor is available, it will be used to
+construct an alias instance from the value returned by the factory function.
+The second option is to provide two factory functions to ``py::init()``: the
+first will be invoked when no alias class is required (i.e. when the class is
+being used but not inherited from in Python), and the second will be invoked
+when an alias is required.
+
+You can also specify a single factory function that always returns an alias
+instance: this will result in behaviour similar to ``py::init_alias<...>()``,
+as described in the :ref:`extended trampoline class documentation
+<extended_aliases>`.
+
+The following example shows the different factory approaches for a class with
+an alias:
+
+.. code-block:: cpp
+
+    #include <pybind11/factory.h>
+    class Example {
+    public:
+        // ...
+        virtual ~Example() = default;
+    };
+    class PyExample : public Example {
+    public:
+        using Example::Example;
+        PyExample(Example &&base) : Example(std::move(base)) {}
+    };
+    py::class_<Example, PyExample>(m, "Example")
+        // Returns an Example pointer.  If a PyExample is needed, the Example
+        // instance will be moved via the extra constructor in PyExample, above.
+        .def(py::init([]() { return new Example(); }))
+        // Two callbacks:
+        .def(py::init([]() { return new Example(); } /* no alias needed */,
+                      []() { return new PyExample(); } /* alias needed */))
+        // *Always* returns an alias instance (like py::init_alias<>())
+        .def(py::init([]() { return new PyExample(); }))
+        ;
+
+Brace initialization
+--------------------
+
+``pybind11::init<>`` internally uses C++11 brace initialization to call the
+constructor of the target class. This means that it can be used to bind
+*implicit* constructors as well:
+
+.. code-block:: cpp
+
+    struct Aggregate {
+        int a;
+        std::string b;
+    };
+
+    py::class_<Aggregate>(m, "Aggregate")
+        .def(py::init<int, const std::string &>());
+
+.. note::
+
+    Note that brace initialization preferentially invokes constructor overloads
+    taking a ``std::initializer_list``. In the rare event that this causes an
+    issue, you can work around it by using ``py::init(...)`` with a lambda
+    function that constructs the new object as desired.
+
+.. _classes_with_non_public_destructors:
+
+Non-public destructors
+======================
+
+If a class has a private or protected destructor (as might e.g. be the case in
+a singleton pattern), a compile error will occur when creating bindings via
+pybind11. The underlying issue is that the ``std::unique_ptr`` holder type that
+is responsible for managing the lifetime of instances will reference the
+destructor even if no deallocations ever take place. In order to expose classes
+with private or protected destructors, it is possible to override the holder
+type via a holder type argument to ``class_``. Pybind11 provides a helper class
+``py::nodelete`` that disables any destructor invocations. In this case, it is
+crucial that instances are deallocated on the C++ side to avoid memory leaks.
+
+.. code-block:: cpp
+
+    /* ... definition ... */
+
+    class MyClass {
+    private:
+        ~MyClass() { }
+    };
+
+    /* ... binding code ... */
+
+    py::class_<MyClass, std::unique_ptr<MyClass, py::nodelete>>(m, "MyClass")
+        .def(py::init<>())
+
+.. _destructors_that_call_python:
+
+Destructors that call Python
+============================
+
+If a Python function is invoked from a C++ destructor, an exception may be thrown
+of type :class:`error_already_set`. If this error is thrown out of a class destructor,
+``std::terminate()`` will be called, terminating the process. Class destructors
+must catch all exceptions of type :class:`error_already_set` to discard the Python
+exception using :func:`error_already_set::discard_as_unraisable`.
+
+Every Python function should be treated as *possibly throwing*. When a Python generator
+stops yielding items, Python will throw a ``StopIteration`` exception, which can pass
+though C++ destructors if the generator's stack frame holds the last reference to C++
+objects.
+
+For more information, see :ref:`the documentation on exceptions <unraisable_exceptions>`.
+
+.. code-block:: cpp
+
+    class MyClass {
+    public:
+        ~MyClass() {
+            try {
+                py::print("Even printing is dangerous in a destructor");
+                py::exec("raise ValueError('This is an unraisable exception')");
+            } catch (py::error_already_set &e) {
+                // error_context should be information about where/why the occurred,
+                // e.g. use __func__ to get the name of the current function
+                e.discard_as_unraisable(__func__);
+            }
+        }
+    };
+
+.. note::
+
+    pybind11 does not support C++ destructors marked ``noexcept(false)``.
+
+.. versionadded:: 2.6
+
+.. _implicit_conversions:
+
+Implicit conversions
+====================
+
+Suppose that instances of two types ``A`` and ``B`` are used in a project, and
+that an ``A`` can easily be converted into an instance of type ``B`` (examples of this
+could be a fixed and an arbitrary precision number type).
+
+.. code-block:: cpp
+
+    py::class_<A>(m, "A")
+        /// ... members ...
+
+    py::class_<B>(m, "B")
+        .def(py::init<A>())
+        /// ... members ...
+
+    m.def("func",
+        [](const B &) { /* .... */ }
+    );
+
+To invoke the function ``func`` using a variable ``a`` containing an ``A``
+instance, we'd have to write ``func(B(a))`` in Python. On the other hand, C++
+will automatically apply an implicit type conversion, which makes it possible
+to directly write ``func(a)``.
+
+In this situation (i.e. where ``B`` has a constructor that converts from
+``A``), the following statement enables similar implicit conversions on the
+Python side:
+
+.. code-block:: cpp
+
+    py::implicitly_convertible<A, B>();
+
+.. note::
+
+    Implicit conversions from ``A`` to ``B`` only work when ``B`` is a custom
+    data type that is exposed to Python via pybind11.
+
+    To prevent runaway recursion, implicit conversions are non-reentrant: an
+    implicit conversion invoked as part of another implicit conversion of the
+    same type (i.e. from ``A`` to ``B``) will fail.
+
+.. _static_properties:
+
+Static properties
+=================
+
+The section on :ref:`properties` discussed the creation of instance properties
+that are implemented in terms of C++ getters and setters.
+
+Static properties can also be created in a similar way to expose getters and
+setters of static class attributes. Note that the implicit ``self`` argument
+also exists in this case and is used to pass the Python ``type`` subclass
+instance. This parameter will often not be needed by the C++ side, and the
+following example illustrates how to instantiate a lambda getter function
+that ignores it:
+
+.. code-block:: cpp
+
+    py::class_<Foo>(m, "Foo")
+        .def_property_readonly_static("foo", [](py::object /* self */) { return Foo(); });
+
+Operator overloading
+====================
+
+Suppose that we're given the following ``Vector2`` class with a vector addition
+and scalar multiplication operation, all implemented using overloaded operators
+in C++.
+
+.. code-block:: cpp
+
+    class Vector2 {
+    public:
+        Vector2(float x, float y) : x(x), y(y) { }
+
+        Vector2 operator+(const Vector2 &v) const { return Vector2(x + v.x, y + v.y); }
+        Vector2 operator*(float value) const { return Vector2(x * value, y * value); }
+        Vector2& operator+=(const Vector2 &v) { x += v.x; y += v.y; return *this; }
+        Vector2& operator*=(float v) { x *= v; y *= v; return *this; }
+
+        friend Vector2 operator*(float f, const Vector2 &v) {
+            return Vector2(f * v.x, f * v.y);
+        }
+
+        std::string toString() const {
+            return "[" + std::to_string(x) + ", " + std::to_string(y) + "]";
+        }
+    private:
+        float x, y;
+    };
+
+The following snippet shows how the above operators can be conveniently exposed
+to Python.
+
+.. code-block:: cpp
+
+    #include <pybind11/operators.h>
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Vector2>(m, "Vector2")
+            .def(py::init<float, float>())
+            .def(py::self + py::self)
+            .def(py::self += py::self)
+            .def(py::self *= float())
+            .def(float() * py::self)
+            .def(py::self * float())
+            .def(-py::self)
+            .def("__repr__", &Vector2::toString);
+    }
+
+Note that a line like
+
+.. code-block:: cpp
+
+            .def(py::self * float())
+
+is really just short hand notation for
+
+.. code-block:: cpp
+
+    .def("__mul__", [](const Vector2 &a, float b) {
+        return a * b;
+    }, py::is_operator())
+
+This can be useful for exposing additional operators that don't exist on the
+C++ side, or to perform other types of customization. The ``py::is_operator``
+flag marker is needed to inform pybind11 that this is an operator, which
+returns ``NotImplemented`` when invoked with incompatible arguments rather than
+throwing a type error.
+
+.. note::
+
+    To use the more convenient ``py::self`` notation, the additional
+    header file :file:`pybind11/operators.h` must be included.
+
+.. seealso::
+
+    The file :file:`tests/test_operator_overloading.cpp` contains a
+    complete example that demonstrates how to work with overloaded operators in
+    more detail.
+
+.. _pickling:
+
+Pickling support
+================
+
+Python's ``pickle`` module provides a powerful facility to serialize and
+de-serialize a Python object graph into a binary data stream. To pickle and
+unpickle C++ classes using pybind11, a ``py::pickle()`` definition must be
+provided. Suppose the class in question has the following signature:
+
+.. code-block:: cpp
+
+    class Pickleable {
+    public:
+        Pickleable(const std::string &value) : m_value(value) { }
+        const std::string &value() const { return m_value; }
+
+        void setExtra(int extra) { m_extra = extra; }
+        int extra() const { return m_extra; }
+    private:
+        std::string m_value;
+        int m_extra = 0;
+    };
+
+Pickling support in Python is enabled by defining the ``__setstate__`` and
+``__getstate__`` methods [#f3]_. For pybind11 classes, use ``py::pickle()``
+to bind these two functions:
+
+.. code-block:: cpp
+
+    py::class_<Pickleable>(m, "Pickleable")
+        .def(py::init<std::string>())
+        .def("value", &Pickleable::value)
+        .def("extra", &Pickleable::extra)
+        .def("setExtra", &Pickleable::setExtra)
+        .def(py::pickle(
+            [](const Pickleable &p) { // __getstate__
+                /* Return a tuple that fully encodes the state of the object */
+                return py::make_tuple(p.value(), p.extra());
+            },
+            [](py::tuple t) { // __setstate__
+                if (t.size() != 2)
+                    throw std::runtime_error("Invalid state!");
+
+                /* Create a new C++ instance */
+                Pickleable p(t[0].cast<std::string>());
+
+                /* Assign any additional state */
+                p.setExtra(t[1].cast<int>());
+
+                return p;
+            }
+        ));
+
+The ``__setstate__`` part of the ``py::picke()`` definition follows the same
+rules as the single-argument version of ``py::init()``. The return type can be
+a value, pointer or holder type. See :ref:`custom_constructors` for details.
+
+An instance can now be pickled as follows:
+
+.. code-block:: python
+
+    try:
+        import cPickle as pickle  # Use cPickle on Python 2.7
+    except ImportError:
+        import pickle
+
+    p = Pickleable("test_value")
+    p.setExtra(15)
+    data = pickle.dumps(p, 2)
+
+
+.. note::
+    Note that only the cPickle module is supported on Python 2.7.
+
+    The second argument to ``dumps`` is also crucial: it selects the pickle
+    protocol version 2, since the older version 1 is not supported. Newer
+    versions are also fine—for instance, specify ``-1`` to always use the
+    latest available version. Beware: failure to follow these instructions
+    will cause important pybind11 memory allocation routines to be skipped
+    during unpickling, which will likely lead to memory corruption and/or
+    segmentation faults.
+
+.. seealso::
+
+    The file :file:`tests/test_pickling.cpp` contains a complete example
+    that demonstrates how to pickle and unpickle types using pybind11 in more
+    detail.
+
+.. [#f3] http://docs.python.org/3/library/pickle.html#pickling-class-instances
+
+Deepcopy support
+================
+
+Python normally uses references in assignments. Sometimes a real copy is needed
+to prevent changing all copies. The ``copy`` module [#f5]_ provides these
+capabilities.
+
+On Python 3, a class with pickle support is automatically also (deep)copy
+compatible. However, performance can be improved by adding custom
+``__copy__`` and ``__deepcopy__`` methods. With Python 2.7, these custom methods
+are mandatory for (deep)copy compatibility, because pybind11 only supports
+cPickle.
+
+For simple classes (deep)copy can be enabled by using the copy constructor,
+which should look as follows:
+
+.. code-block:: cpp
+
+    py::class_<Copyable>(m, "Copyable")
+        .def("__copy__",  [](const Copyable &self) {
+            return Copyable(self);
+        })
+        .def("__deepcopy__", [](const Copyable &self, py::dict) {
+            return Copyable(self);
+        }, "memo"_a);
+
+.. note::
+
+    Dynamic attributes will not be copied in this example.
+
+.. [#f5] https://docs.python.org/3/library/copy.html
+
+Multiple Inheritance
+====================
+
+pybind11 can create bindings for types that derive from multiple base types
+(aka. *multiple inheritance*). To do so, specify all bases in the template
+arguments of the ``class_`` declaration:
+
+.. code-block:: cpp
+
+    py::class_<MyType, BaseType1, BaseType2, BaseType3>(m, "MyType")
+       ...
+
+The base types can be specified in arbitrary order, and they can even be
+interspersed with alias types and holder types (discussed earlier in this
+document)---pybind11 will automatically find out which is which. The only
+requirement is that the first template argument is the type to be declared.
+
+It is also permitted to inherit multiply from exported C++ classes in Python,
+as well as inheriting from multiple Python and/or pybind11-exported classes.
+
+There is one caveat regarding the implementation of this feature:
+
+When only one base type is specified for a C++ type that actually has multiple
+bases, pybind11 will assume that it does not participate in multiple
+inheritance, which can lead to undefined behavior. In such cases, add the tag
+``multiple_inheritance`` to the class constructor:
+
+.. code-block:: cpp
+
+    py::class_<MyType, BaseType2>(m, "MyType", py::multiple_inheritance());
+
+The tag is redundant and does not need to be specified when multiple base types
+are listed.
+
+.. _module_local:
+
+Module-local class bindings
+===========================
+
+When creating a binding for a class, pybind11 by default makes that binding
+"global" across modules.  What this means is that a type defined in one module
+can be returned from any module resulting in the same Python type.  For
+example, this allows the following:
+
+.. code-block:: cpp
+
+    // In the module1.cpp binding code for module1:
+    py::class_<Pet>(m, "Pet")
+        .def(py::init<std::string>())
+        .def_readonly("name", &Pet::name);
+
+.. code-block:: cpp
+
+    // In the module2.cpp binding code for module2:
+    m.def("create_pet", [](std::string name) { return new Pet(name); });
+
+.. code-block:: pycon
+
+    >>> from module1 import Pet
+    >>> from module2 import create_pet
+    >>> pet1 = Pet("Kitty")
+    >>> pet2 = create_pet("Doggy")
+    >>> pet2.name()
+    'Doggy'
+
+When writing binding code for a library, this is usually desirable: this
+allows, for example, splitting up a complex library into multiple Python
+modules.
+
+In some cases, however, this can cause conflicts.  For example, suppose two
+unrelated modules make use of an external C++ library and each provide custom
+bindings for one of that library's classes.  This will result in an error when
+a Python program attempts to import both modules (directly or indirectly)
+because of conflicting definitions on the external type:
+
+.. code-block:: cpp
+
+    // dogs.cpp
+
+    // Binding for external library class:
+    py::class<pets::Pet>(m, "Pet")
+        .def("name", &pets::Pet::name);
+
+    // Binding for local extension class:
+    py::class<Dog, pets::Pet>(m, "Dog")
+        .def(py::init<std::string>());
+
+.. code-block:: cpp
+
+    // cats.cpp, in a completely separate project from the above dogs.cpp.
+
+    // Binding for external library class:
+    py::class<pets::Pet>(m, "Pet")
+        .def("get_name", &pets::Pet::name);
+
+    // Binding for local extending class:
+    py::class<Cat, pets::Pet>(m, "Cat")
+        .def(py::init<std::string>());
+
+.. code-block:: pycon
+
+    >>> import cats
+    >>> import dogs
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ImportError: generic_type: type "Pet" is already registered!
+
+To get around this, you can tell pybind11 to keep the external class binding
+localized to the module by passing the ``py::module_local()`` attribute into
+the ``py::class_`` constructor:
+
+.. code-block:: cpp
+
+    // Pet binding in dogs.cpp:
+    py::class<pets::Pet>(m, "Pet", py::module_local())
+        .def("name", &pets::Pet::name);
+
+.. code-block:: cpp
+
+    // Pet binding in cats.cpp:
+    py::class<pets::Pet>(m, "Pet", py::module_local())
+        .def("get_name", &pets::Pet::name);
+
+This makes the Python-side ``dogs.Pet`` and ``cats.Pet`` into distinct classes,
+avoiding the conflict and allowing both modules to be loaded.  C++ code in the
+``dogs`` module that casts or returns a ``Pet`` instance will result in a
+``dogs.Pet`` Python instance, while C++ code in the ``cats`` module will result
+in a ``cats.Pet`` Python instance.
+
+This does come with two caveats, however: First, external modules cannot return
+or cast a ``Pet`` instance to Python (unless they also provide their own local
+bindings).  Second, from the Python point of view they are two distinct classes.
+
+Note that the locality only applies in the C++ -> Python direction.  When
+passing such a ``py::module_local`` type into a C++ function, the module-local
+classes are still considered.  This means that if the following function is
+added to any module (including but not limited to the ``cats`` and ``dogs``
+modules above) it will be callable with either a ``dogs.Pet`` or ``cats.Pet``
+argument:
+
+.. code-block:: cpp
+
+    m.def("pet_name", [](const pets::Pet &pet) { return pet.name(); });
+
+For example, suppose the above function is added to each of ``cats.cpp``,
+``dogs.cpp`` and ``frogs.cpp`` (where ``frogs.cpp`` is some other module that
+does *not* bind ``Pets`` at all).
+
+.. code-block:: pycon
+
+    >>> import cats, dogs, frogs  # No error because of the added py::module_local()
+    >>> mycat, mydog = cats.Cat("Fluffy"), dogs.Dog("Rover")
+    >>> (cats.pet_name(mycat), dogs.pet_name(mydog))
+    ('Fluffy', 'Rover')
+    >>> (cats.pet_name(mydog), dogs.pet_name(mycat), frogs.pet_name(mycat))
+    ('Rover', 'Fluffy', 'Fluffy')
+
+It is possible to use ``py::module_local()`` registrations in one module even
+if another module registers the same type globally: within the module with the
+module-local definition, all C++ instances will be cast to the associated bound
+Python type.  In other modules any such values are converted to the global
+Python type created elsewhere.
+
+.. note::
+
+    STL bindings (as provided via the optional :file:`pybind11/stl_bind.h`
+    header) apply ``py::module_local`` by default when the bound type might
+    conflict with other modules; see :ref:`stl_bind` for details.
+
+.. note::
+
+    The localization of the bound types is actually tied to the shared object
+    or binary generated by the compiler/linker.  For typical modules created
+    with ``PYBIND11_MODULE()``, this distinction is not significant.  It is
+    possible, however, when :ref:`embedding` to embed multiple modules in the
+    same binary (see :ref:`embedding_modules`).  In such a case, the
+    localization will apply across all embedded modules within the same binary.
+
+.. seealso::
+
+    The file :file:`tests/test_local_bindings.cpp` contains additional examples
+    that demonstrate how ``py::module_local()`` works.
+
+Binding protected member functions
+==================================
+
+It's normally not possible to expose ``protected`` member functions to Python:
+
+.. code-block:: cpp
+
+    class A {
+    protected:
+        int foo() const { return 42; }
+    };
+
+    py::class_<A>(m, "A")
+        .def("foo", &A::foo); // error: 'foo' is a protected member of 'A'
+
+On one hand, this is good because non-``public`` members aren't meant to be
+accessed from the outside. But we may want to make use of ``protected``
+functions in derived Python classes.
+
+The following pattern makes this possible:
+
+.. code-block:: cpp
+
+    class A {
+    protected:
+        int foo() const { return 42; }
+    };
+
+    class Publicist : public A { // helper type for exposing protected functions
+    public:
+        using A::foo; // inherited with different access modifier
+    };
+
+    py::class_<A>(m, "A") // bind the primary class
+        .def("foo", &Publicist::foo); // expose protected methods via the publicist
+
+This works because ``&Publicist::foo`` is exactly the same function as
+``&A::foo`` (same signature and address), just with a different access
+modifier. The only purpose of the ``Publicist`` helper class is to make
+the function name ``public``.
+
+If the intent is to expose ``protected`` ``virtual`` functions which can be
+overridden in Python, the publicist pattern can be combined with the previously
+described trampoline:
+
+.. code-block:: cpp
+
+    class A {
+    public:
+        virtual ~A() = default;
+
+    protected:
+        virtual int foo() const { return 42; }
+    };
+
+    class Trampoline : public A {
+    public:
+        int foo() const override { PYBIND11_OVERLOAD(int, A, foo, ); }
+    };
+
+    class Publicist : public A {
+    public:
+        using A::foo;
+    };
+
+    py::class_<A, Trampoline>(m, "A") // <-- `Trampoline` here
+        .def("foo", &Publicist::foo); // <-- `Publicist` here, not `Trampoline`!
+
+.. note::
+
+    MSVC 2015 has a compiler bug (fixed in version 2017) which
+    requires a more explicit function binding in the form of
+    ``.def("foo", static_cast<int (A::*)() const>(&Publicist::foo));``
+    where ``int (A::*)() const`` is the type of ``A::foo``.
+
+Binding final classes
+=====================
+
+Some classes may not be appropriate to inherit from. In C++11, classes can
+use the ``final`` specifier to ensure that a class cannot be inherited from.
+The ``py::is_final`` attribute can be used to ensure that Python classes
+cannot inherit from a specified type. The underlying C++ type does not need
+to be declared final.
+
+.. code-block:: cpp
+
+    class IsFinal final {};
+
+    py::class_<IsFinal>(m, "IsFinal", py::is_final());
+
+When you try to inherit from such a class in Python, you will now get this
+error:
+
+.. code-block:: pycon
+
+    >>> class PyFinalChild(IsFinal):
+    ...     pass
+    TypeError: type 'IsFinal' is not an acceptable base type
+
+.. note:: This attribute is currently ignored on PyPy
+
+.. versionadded:: 2.6
+
+Custom automatic downcasters
+============================
+
+As explained in :ref:`inheritance`, pybind11 comes with built-in
+understanding of the dynamic type of polymorphic objects in C++; that
+is, returning a Pet to Python produces a Python object that knows it's
+wrapping a Dog, if Pet has virtual methods and pybind11 knows about
+Dog and this Pet is in fact a Dog. Sometimes, you might want to
+provide this automatic downcasting behavior when creating bindings for
+a class hierarchy that does not use standard C++ polymorphism, such as
+LLVM [#f4]_. As long as there's some way to determine at runtime
+whether a downcast is safe, you can proceed by specializing the
+``pybind11::polymorphic_type_hook`` template:
+
+.. code-block:: cpp
+
+    enum class PetKind { Cat, Dog, Zebra };
+    struct Pet {   // Not polymorphic: has no virtual methods
+        const PetKind kind;
+        int age = 0;
+      protected:
+        Pet(PetKind _kind) : kind(_kind) {}
+    };
+    struct Dog : Pet {
+        Dog() : Pet(PetKind::Dog) {}
+        std::string sound = "woof!";
+        std::string bark() const { return sound; }
+    };
+
+    namespace pybind11 {
+        template<> struct polymorphic_type_hook<Pet> {
+            static const void *get(const Pet *src, const std::type_info*& type) {
+                // note that src may be nullptr
+                if (src && src->kind == PetKind::Dog) {
+                    type = &typeid(Dog);
+                    return static_cast<const Dog*>(src);
+                }
+                return src;
+            }
+        };
+    } // namespace pybind11
+
+When pybind11 wants to convert a C++ pointer of type ``Base*`` to a
+Python object, it calls ``polymorphic_type_hook<Base>::get()`` to
+determine if a downcast is possible. The ``get()`` function should use
+whatever runtime information is available to determine if its ``src``
+parameter is in fact an instance of some class ``Derived`` that
+inherits from ``Base``. If it finds such a ``Derived``, it sets ``type
+= &typeid(Derived)`` and returns a pointer to the ``Derived`` object
+that contains ``src``. Otherwise, it just returns ``src``, leaving
+``type`` at its default value of nullptr. If you set ``type`` to a
+type that pybind11 doesn't know about, no downcasting will occur, and
+the original ``src`` pointer will be used with its static type
+``Base*``.
+
+It is critical that the returned pointer and ``type`` argument of
+``get()`` agree with each other: if ``type`` is set to something
+non-null, the returned pointer must point to the start of an object
+whose type is ``type``. If the hierarchy being exposed uses only
+single inheritance, a simple ``return src;`` will achieve this just
+fine, but in the general case, you must cast ``src`` to the
+appropriate derived-class pointer (e.g. using
+``static_cast<Derived>(src)``) before allowing it to be returned as a
+``void*``.
+
+.. [#f4] https://llvm.org/docs/HowToSetUpLLVMStyleRTTI.html
+
+.. note::
+
+    pybind11's standard support for downcasting objects whose types
+    have virtual methods is implemented using
+    ``polymorphic_type_hook`` too, using the standard C++ ability to
+    determine the most-derived type of a polymorphic object using
+    ``typeid()`` and to cast a base pointer to that most-derived type
+    (even if you don't know what it is) using ``dynamic_cast<void*>``.
+
+.. seealso::
+
+    The file :file:`tests/test_tagbased_polymorphic.cpp` contains a
+    more complete example, including a demonstration of how to provide
+    automatic downcasting for an entire class hierarchy without
+    writing one get() function for each class.
diff --git a/pybind11/docs/advanced/embedding.rst b/pybind11/docs/advanced/embedding.rst
new file mode 100644
index 0000000000000000000000000000000000000000..98a5c5219076872fbbd158bc2d99de294ee00789
--- /dev/null
+++ b/pybind11/docs/advanced/embedding.rst
@@ -0,0 +1,261 @@
+.. _embedding:
+
+Embedding the interpreter
+#########################
+
+While pybind11 is mainly focused on extending Python using C++, it's also
+possible to do the reverse: embed the Python interpreter into a C++ program.
+All of the other documentation pages still apply here, so refer to them for
+general pybind11 usage. This section will cover a few extra things required
+for embedding.
+
+Getting started
+===============
+
+A basic executable with an embedded interpreter can be created with just a few
+lines of CMake and the ``pybind11::embed`` target, as shown below. For more
+information, see :doc:`/compiling`.
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.4)
+    project(example)
+
+    find_package(pybind11 REQUIRED)  # or `add_subdirectory(pybind11)`
+
+    add_executable(example main.cpp)
+    target_link_libraries(example PRIVATE pybind11::embed)
+
+The essential structure of the ``main.cpp`` file looks like this:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h> // everything needed for embedding
+    namespace py = pybind11;
+
+    int main() {
+        py::scoped_interpreter guard{}; // start the interpreter and keep it alive
+
+        py::print("Hello, World!"); // use the Python API
+    }
+
+The interpreter must be initialized before using any Python API, which includes
+all the functions and classes in pybind11. The RAII guard class `scoped_interpreter`
+takes care of the interpreter lifetime. After the guard is destroyed, the interpreter
+shuts down and clears its memory. No Python functions can be called after this.
+
+Executing Python code
+=====================
+
+There are a few different ways to run Python code. One option is to use `eval`,
+`exec` or `eval_file`, as explained in :ref:`eval`. Here is a quick example in
+the context of an executable with an embedded interpreter:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        py::exec(R"(
+            kwargs = dict(name="World", number=42)
+            message = "Hello, {name}! The answer is {number}".format(**kwargs)
+            print(message)
+        )");
+    }
+
+Alternatively, similar results can be achieved using pybind11's API (see
+:doc:`/advanced/pycpp/index` for more details).
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+    using namespace py::literals;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto kwargs = py::dict("name"_a="World", "number"_a=42);
+        auto message = "Hello, {name}! The answer is {number}"_s.format(**kwargs);
+        py::print(message);
+    }
+
+The two approaches can also be combined:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    #include <iostream>
+
+    namespace py = pybind11;
+    using namespace py::literals;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto locals = py::dict("name"_a="World", "number"_a=42);
+        py::exec(R"(
+            message = "Hello, {name}! The answer is {number}".format(**locals())
+        )", py::globals(), locals);
+
+        auto message = locals["message"].cast<std::string>();
+        std::cout << message;
+    }
+
+Importing modules
+=================
+
+Python modules can be imported using `module::import()`:
+
+.. code-block:: cpp
+
+    py::module sys = py::module::import("sys");
+    py::print(sys.attr("path"));
+
+For convenience, the current working directory is included in ``sys.path`` when
+embedding the interpreter. This makes it easy to import local Python files:
+
+.. code-block:: python
+
+    """calc.py located in the working directory"""
+
+    def add(i, j):
+        return i + j
+
+
+.. code-block:: cpp
+
+    py::module calc = py::module::import("calc");
+    py::object result = calc.attr("add")(1, 2);
+    int n = result.cast<int>();
+    assert(n == 3);
+
+Modules can be reloaded using `module::reload()` if the source is modified e.g.
+by an external process. This can be useful in scenarios where the application
+imports a user defined data processing script which needs to be updated after
+changes by the user. Note that this function does not reload modules recursively.
+
+.. _embedding_modules:
+
+Adding embedded modules
+=======================
+
+Embedded binary modules can be added using the `PYBIND11_EMBEDDED_MODULE` macro.
+Note that the definition must be placed at global scope. They can be imported
+like any other module.
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    PYBIND11_EMBEDDED_MODULE(fast_calc, m) {
+        // `m` is a `py::module` which is used to bind functions and classes
+        m.def("add", [](int i, int j) {
+            return i + j;
+        });
+    }
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto fast_calc = py::module::import("fast_calc");
+        auto result = fast_calc.attr("add")(1, 2).cast<int>();
+        assert(result == 3);
+    }
+
+Unlike extension modules where only a single binary module can be created, on
+the embedded side an unlimited number of modules can be added using multiple
+`PYBIND11_EMBEDDED_MODULE` definitions (as long as they have unique names).
+
+These modules are added to Python's list of builtins, so they can also be
+imported in pure Python files loaded by the interpreter. Everything interacts
+naturally:
+
+.. code-block:: python
+
+    """py_module.py located in the working directory"""
+    import cpp_module
+
+    a = cpp_module.a
+    b = a + 1
+
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    PYBIND11_EMBEDDED_MODULE(cpp_module, m) {
+        m.attr("a") = 1;
+    }
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto py_module = py::module::import("py_module");
+
+        auto locals = py::dict("fmt"_a="{} + {} = {}", **py_module.attr("__dict__"));
+        assert(locals["a"].cast<int>() == 1);
+        assert(locals["b"].cast<int>() == 2);
+
+        py::exec(R"(
+            c = a + b
+            message = fmt.format(a, b, c)
+        )", py::globals(), locals);
+
+        assert(locals["c"].cast<int>() == 3);
+        assert(locals["message"].cast<std::string>() == "1 + 2 = 3");
+    }
+
+
+Interpreter lifetime
+====================
+
+The Python interpreter shuts down when `scoped_interpreter` is destroyed. After
+this, creating a new instance will restart the interpreter. Alternatively, the
+`initialize_interpreter` / `finalize_interpreter` pair of functions can be used
+to directly set the state at any time.
+
+Modules created with pybind11 can be safely re-initialized after the interpreter
+has been restarted. However, this may not apply to third-party extension modules.
+The issue is that Python itself cannot completely unload extension modules and
+there are several caveats with regard to interpreter restarting. In short, not
+all memory may be freed, either due to Python reference cycles or user-created
+global data. All the details can be found in the CPython documentation.
+
+.. warning::
+
+    Creating two concurrent `scoped_interpreter` guards is a fatal error. So is
+    calling `initialize_interpreter` for a second time after the interpreter
+    has already been initialized.
+
+    Do not use the raw CPython API functions ``Py_Initialize`` and
+    ``Py_Finalize`` as these do not properly handle the lifetime of
+    pybind11's internal data.
+
+
+Sub-interpreter support
+=======================
+
+Creating multiple copies of `scoped_interpreter` is not possible because it
+represents the main Python interpreter. Sub-interpreters are something different
+and they do permit the existence of multiple interpreters. This is an advanced
+feature of the CPython API and should be handled with care. pybind11 does not
+currently offer a C++ interface for sub-interpreters, so refer to the CPython
+documentation for all the details regarding this feature.
+
+We'll just mention a couple of caveats the sub-interpreters support in pybind11:
+
+ 1. Sub-interpreters will not receive independent copies of embedded modules.
+    Instead, these are shared and modifications in one interpreter may be
+    reflected in another.
+
+ 2. Managing multiple threads, multiple interpreters and the GIL can be
+    challenging and there are several caveats here, even within the pure
+    CPython API (please refer to the Python docs for details). As for
+    pybind11, keep in mind that `gil_scoped_release` and `gil_scoped_acquire`
+    do not take sub-interpreters into account.
diff --git a/pybind11/docs/advanced/exceptions.rst b/pybind11/docs/advanced/exceptions.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b7d36014a68239c7518d00364522fbdb03f183bc
--- /dev/null
+++ b/pybind11/docs/advanced/exceptions.rst
@@ -0,0 +1,285 @@
+Exceptions
+##########
+
+Built-in C++ to Python exception translation
+============================================
+
+When Python calls C++ code through pybind11, pybind11 provides a C++ exception handler
+that will trap C++ exceptions, translate them to the corresponding Python exception,
+and raise them so that Python code can handle them.
+
+pybind11 defines translations for ``std::exception`` and its standard
+subclasses, and several special exception classes that translate to specific
+Python exceptions. Note that these are not actually Python exceptions, so they
+cannot be examined using the Python C API. Instead, they are pure C++ objects
+that pybind11 will translate the corresponding Python exception when they arrive
+at its exception handler.
+
+.. tabularcolumns:: |p{0.5\textwidth}|p{0.45\textwidth}|
+
++--------------------------------------+--------------------------------------+
+|  Exception thrown by C++             |  Translated to Python exception type |
++======================================+======================================+
+| :class:`std::exception`              | ``RuntimeError``                     |
++--------------------------------------+--------------------------------------+
+| :class:`std::bad_alloc`              | ``MemoryError``                      |
++--------------------------------------+--------------------------------------+
+| :class:`std::domain_error`           | ``ValueError``                       |
++--------------------------------------+--------------------------------------+
+| :class:`std::invalid_argument`       | ``ValueError``                       |
++--------------------------------------+--------------------------------------+
+| :class:`std::length_error`           | ``ValueError``                       |
++--------------------------------------+--------------------------------------+
+| :class:`std::out_of_range`           | ``IndexError``                       |
++--------------------------------------+--------------------------------------+
+| :class:`std::range_error`            | ``ValueError``                       |
++--------------------------------------+--------------------------------------+
+| :class:`std::overflow_error`         | ``OverflowError``                    |
++--------------------------------------+--------------------------------------+
+| :class:`pybind11::stop_iteration`    | ``StopIteration`` (used to implement |
+|                                      | custom iterators)                    |
++--------------------------------------+--------------------------------------+
+| :class:`pybind11::index_error`       | ``IndexError`` (used to indicate out |
+|                                      | of bounds access in ``__getitem__``, |
+|                                      | ``__setitem__``, etc.)               |
++--------------------------------------+--------------------------------------+
+| :class:`pybind11::value_error`       | ``ValueError`` (used to indicate     |
+|                                      | wrong value passed in                |
+|                                      | ``container.remove(...)``)           |
++--------------------------------------+--------------------------------------+
+| :class:`pybind11::key_error`         | ``KeyError`` (used to indicate out   |
+|                                      | of bounds access in ``__getitem__``, |
+|                                      | ``__setitem__`` in dict-like         |
+|                                      | objects, etc.)                       |
++--------------------------------------+--------------------------------------+
+
+Exception translation is not bidirectional. That is, *catching* the C++
+exceptions defined above above will not trap exceptions that originate from
+Python. For that, catch :class:`pybind11::error_already_set`. See :ref:`below
+<handling_python_exceptions_cpp>` for further details.
+
+There is also a special exception :class:`cast_error` that is thrown by
+:func:`handle::call` when the input arguments cannot be converted to Python
+objects.
+
+Registering custom translators
+==============================
+
+If the default exception conversion policy described above is insufficient,
+pybind11 also provides support for registering custom exception translators.
+To register a simple exception conversion that translates a C++ exception into
+a new Python exception using the C++ exception's ``what()`` method, a helper
+function is available:
+
+.. code-block:: cpp
+
+    py::register_exception<CppExp>(module, "PyExp");
+
+This call creates a Python exception class with the name ``PyExp`` in the given
+module and automatically converts any encountered exceptions of type ``CppExp``
+into Python exceptions of type ``PyExp``.
+
+When more advanced exception translation is needed, the function
+``py::register_exception_translator(translator)`` can be used to register
+functions that can translate arbitrary exception types (and which may include
+additional logic to do so).  The function takes a stateless callable (e.g.  a
+function pointer or a lambda function without captured variables) with the call
+signature ``void(std::exception_ptr)``.
+
+When a C++ exception is thrown, the registered exception translators are tried
+in reverse order of registration (i.e. the last registered translator gets the
+first shot at handling the exception).
+
+Inside the translator, ``std::rethrow_exception`` should be used within
+a try block to re-throw the exception.  One or more catch clauses to catch
+the appropriate exceptions should then be used with each clause using
+``PyErr_SetString`` to set a Python exception or ``ex(string)`` to set
+the python exception to a custom exception type (see below).
+
+To declare a custom Python exception type, declare a ``py::exception`` variable
+and use this in the associated exception translator (note: it is often useful
+to make this a static declaration when using it inside a lambda expression
+without requiring capturing).
+
+The following example demonstrates this for a hypothetical exception classes
+``MyCustomException`` and ``OtherException``: the first is translated to a
+custom python exception ``MyCustomError``, while the second is translated to a
+standard python RuntimeError:
+
+.. code-block:: cpp
+
+    static py::exception<MyCustomException> exc(m, "MyCustomError");
+    py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const MyCustomException &e) {
+            exc(e.what());
+        } catch (const OtherException &e) {
+            PyErr_SetString(PyExc_RuntimeError, e.what());
+        }
+    });
+
+Multiple exceptions can be handled by a single translator, as shown in the
+example above. If the exception is not caught by the current translator, the
+previously registered one gets a chance.
+
+If none of the registered exception translators is able to handle the
+exception, it is handled by the default converter as described in the previous
+section.
+
+.. seealso::
+
+    The file :file:`tests/test_exceptions.cpp` contains examples
+    of various custom exception translators and custom exception types.
+
+.. note::
+
+    Call either ``PyErr_SetString`` or a custom exception's call
+    operator (``exc(string)``) for every exception caught in a custom exception
+    translator.  Failure to do so will cause Python to crash with ``SystemError:
+    error return without exception set``.
+
+    Exceptions that you do not plan to handle should simply not be caught, or
+    may be explicitly (re-)thrown to delegate it to the other,
+    previously-declared existing exception translators.
+
+.. _handling_python_exceptions_cpp:
+
+Handling exceptions from Python in C++
+======================================
+
+When C++ calls Python functions, such as in a callback function or when
+manipulating Python objects, and Python raises an ``Exception``, pybind11
+converts the Python exception into a C++ exception of type
+:class:`pybind11::error_already_set` whose payload contains a C++ string textual
+summary and the actual Python exception. ``error_already_set`` is used to
+propagate Python exception back to Python (or possibly, handle them in C++).
+
+.. tabularcolumns:: |p{0.5\textwidth}|p{0.45\textwidth}|
+
++--------------------------------------+--------------------------------------+
+|  Exception raised in Python          |  Thrown as C++ exception type        |
++======================================+======================================+
+| Any Python ``Exception``             | :class:`pybind11::error_already_set` |
++--------------------------------------+--------------------------------------+
+
+For example:
+
+.. code-block:: cpp
+
+    try {
+        // open("missing.txt", "r")
+        auto file = py::module::import("io").attr("open")("missing.txt", "r");
+        auto text = file.attr("read")();
+        file.attr("close")();
+    } catch (py::error_already_set &e) {
+        if (e.matches(PyExc_FileNotFoundError)) {
+            py::print("missing.txt not found");
+        } else if (e.match(PyExc_PermissionError)) {
+            py::print("missing.txt found but not accessible");
+        } else {
+            throw;
+        }
+    }
+
+Note that C++ to Python exception translation does not apply here, since that is
+a method for translating C++ exceptions to Python, not vice versa. The error raised
+from Python is always ``error_already_set``.
+
+This example illustrates this behavior:
+
+.. code-block:: cpp
+
+    try {
+        py::eval("raise ValueError('The Ring')");
+    } catch (py::value_error &boromir) {
+        // Boromir never gets the ring
+        assert(false);
+    } catch (py::error_already_set &frodo) {
+        // Frodo gets the ring
+        py::print("I will take the ring");
+    }
+
+    try {
+        // py::value_error is a request for pybind11 to raise a Python exception
+        throw py::value_error("The ball");
+    } catch (py::error_already_set &cat) {
+        // cat won't catch the ball since
+        // py::value_error is not a Python exception
+        assert(false);
+    } catch (py::value_error &dog) {
+        // dog will catch the ball
+        py::print("Run Spot run");
+        throw;  // Throw it again (pybind11 will raise ValueError)
+    }
+
+Handling errors from the Python C API
+=====================================
+
+Where possible, use :ref:`pybind11 wrappers <wrappers>` instead of calling
+the Python C API directly. When calling the Python C API directly, in
+addition to manually managing reference counts, one must follow the pybind11
+error protocol, which is outlined here.
+
+After calling the Python C API, if Python returns an error,
+``throw py::error_already_set();``, which allows pybind11 to deal with the
+exception and pass it back to the Python interpreter. This includes calls to
+the error setting functions such as ``PyErr_SetString``.
+
+.. code-block:: cpp
+
+    PyErr_SetString(PyExc_TypeError, "C API type error demo");
+    throw py::error_already_set();
+
+    // But it would be easier to simply...
+    throw py::type_error("pybind11 wrapper type error");
+
+Alternately, to ignore the error, call `PyErr_Clear
+<https://docs.python.org/3/c-api/exceptions.html#c.PyErr_Clear>`_.
+
+Any Python error must be thrown or cleared, or Python/pybind11 will be left in
+an invalid state.
+
+.. _unraisable_exceptions:
+
+Handling unraisable exceptions
+==============================
+
+If a Python function invoked from a C++ destructor or any function marked
+``noexcept(true)`` (collectively, "noexcept functions") throws an exception, there
+is no way to propagate the exception, as such functions may not throw.
+Should they throw or fail to catch any exceptions in their call graph,
+the C++ runtime calls ``std::terminate()`` to abort immediately.
+
+Similarly, Python exceptions raised in a class's ``__del__`` method do not
+propagate, but are logged by Python as an unraisable error. In Python 3.8+, a
+`system hook is triggered
+<https://docs.python.org/3/library/sys.html#sys.unraisablehook>`_
+and an auditing event is logged.
+
+Any noexcept function should have a try-catch block that traps
+class:`error_already_set` (or any other exception that can occur). Note that
+pybind11 wrappers around Python exceptions such as
+:class:`pybind11::value_error` are *not* Python exceptions; they are C++
+exceptions that pybind11 catches and converts to Python exceptions. Noexcept
+functions cannot propagate these exceptions either. A useful approach is to
+convert them to Python exceptions and then ``discard_as_unraisable`` as shown
+below.
+
+.. code-block:: cpp
+
+    void nonthrowing_func() noexcept(true) {
+        try {
+            // ...
+        } catch (py::error_already_set &eas) {
+            // Discard the Python error using Python APIs, using the C++ magic
+            // variable __func__. Python already knows the type and value and of the
+            // exception object.
+            eas.discard_as_unraisable(__func__);
+        } catch (const std::exception &e) {
+            // Log and discard C++ exceptions.
+            third_party::log(e);
+        }
+    }
+
+.. versionadded:: 2.6
diff --git a/pybind11/docs/advanced/functions.rst b/pybind11/docs/advanced/functions.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3e33c9cf7da88d9d9669f33fadcfc11ccf0d2698
--- /dev/null
+++ b/pybind11/docs/advanced/functions.rst
@@ -0,0 +1,537 @@
+Functions
+#########
+
+Before proceeding with this section, make sure that you are already familiar
+with the basics of binding functions and classes, as explained in :doc:`/basics`
+and :doc:`/classes`. The following guide is applicable to both free and member
+functions, i.e. *methods* in Python.
+
+.. _return_value_policies:
+
+Return value policies
+=====================
+
+Python and C++ use fundamentally different ways of managing the memory and
+lifetime of objects managed by them. This can lead to issues when creating
+bindings for functions that return a non-trivial type. Just by looking at the
+type information, it is not clear whether Python should take charge of the
+returned value and eventually free its resources, or if this is handled on the
+C++ side. For this reason, pybind11 provides a several *return value policy*
+annotations that can be passed to the :func:`module::def` and
+:func:`class_::def` functions. The default policy is
+:enum:`return_value_policy::automatic`.
+
+Return value policies are tricky, and it's very important to get them right.
+Just to illustrate what can go wrong, consider the following simple example:
+
+.. code-block:: cpp
+
+    /* Function declaration */
+    Data *get_data() { return _data; /* (pointer to a static data structure) */ }
+    ...
+
+    /* Binding code */
+    m.def("get_data", &get_data); // <-- KABOOM, will cause crash when called from Python
+
+What's going on here? When ``get_data()`` is called from Python, the return
+value (a native C++ type) must be wrapped to turn it into a usable Python type.
+In this case, the default return value policy (:enum:`return_value_policy::automatic`)
+causes pybind11 to assume ownership of the static ``_data`` instance.
+
+When Python's garbage collector eventually deletes the Python
+wrapper, pybind11 will also attempt to delete the C++ instance (via ``operator
+delete()``) due to the implied ownership. At this point, the entire application
+will come crashing down, though errors could also be more subtle and involve
+silent data corruption.
+
+In the above example, the policy :enum:`return_value_policy::reference` should have
+been specified so that the global data instance is only *referenced* without any
+implied transfer of ownership, i.e.:
+
+.. code-block:: cpp
+
+    m.def("get_data", &get_data, return_value_policy::reference);
+
+On the other hand, this is not the right policy for many other situations,
+where ignoring ownership could lead to resource leaks.
+As a developer using pybind11, it's important to be familiar with the different
+return value policies, including which situation calls for which one of them.
+The following table provides an overview of available policies:
+
+.. tabularcolumns:: |p{0.5\textwidth}|p{0.45\textwidth}|
+
++--------------------------------------------------+----------------------------------------------------------------------------+
+| Return value policy                              | Description                                                                |
++==================================================+============================================================================+
+| :enum:`return_value_policy::take_ownership`      | Reference an existing object (i.e. do not create a new copy) and take      |
+|                                                  | ownership. Python will call the destructor and delete operator when the    |
+|                                                  | object's reference count reaches zero. Undefined behavior ensues when the  |
+|                                                  | C++ side does the same, or when the data was not dynamically allocated.    |
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::copy`                | Create a new copy of the returned object, which will be owned by Python.   |
+|                                                  | This policy is comparably safe because the lifetimes of the two instances  |
+|                                                  | are decoupled.                                                             |
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::move`                | Use ``std::move`` to move the return value contents into a new instance    |
+|                                                  | that will be owned by Python. This policy is comparably safe because the   |
+|                                                  | lifetimes of the two instances (move source and destination) are decoupled.|
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::reference`           | Reference an existing object, but do not take ownership. The C++ side is   |
+|                                                  | responsible for managing the object's lifetime and deallocating it when    |
+|                                                  | it is no longer used. Warning: undefined behavior will ensue when the C++  |
+|                                                  | side deletes an object that is still referenced and used by Python.        |
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::reference_internal`  | Indicates that the lifetime of the return value is tied to the lifetime    |
+|                                                  | of a parent object, namely the implicit ``this``, or ``self`` argument of  |
+|                                                  | the called method or property. Internally, this policy works just like     |
+|                                                  | :enum:`return_value_policy::reference` but additionally applies a          |
+|                                                  | ``keep_alive<0, 1>`` *call policy* (described in the next section) that    |
+|                                                  | prevents the parent object from being garbage collected as long as the     |
+|                                                  | return value is referenced by Python. This is the default policy for       |
+|                                                  | property getters created via ``def_property``, ``def_readwrite``, etc.     |
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::automatic`           | **Default policy.** This policy falls back to the policy                   |
+|                                                  | :enum:`return_value_policy::take_ownership` when the return value is a     |
+|                                                  | pointer. Otherwise, it uses :enum:`return_value_policy::move` or           |
+|                                                  | :enum:`return_value_policy::copy` for rvalue and lvalue references,        |
+|                                                  | respectively. See above for a description of what all of these different   |
+|                                                  | policies do.                                                               |
++--------------------------------------------------+----------------------------------------------------------------------------+
+| :enum:`return_value_policy::automatic_reference` | As above, but use policy :enum:`return_value_policy::reference` when the   |
+|                                                  | return value is a pointer. This is the default conversion policy for       |
+|                                                  | function arguments when calling Python functions manually from C++ code    |
+|                                                  | (i.e. via handle::operator()). You probably won't need to use this.        |
++--------------------------------------------------+----------------------------------------------------------------------------+
+
+Return value policies can also be applied to properties:
+
+.. code-block:: cpp
+
+    class_<MyClass>(m, "MyClass")
+        .def_property("data", &MyClass::getData, &MyClass::setData,
+                      py::return_value_policy::copy);
+
+Technically, the code above applies the policy to both the getter and the
+setter function, however, the setter doesn't really care about *return*
+value policies which makes this a convenient terse syntax. Alternatively,
+targeted arguments can be passed through the :class:`cpp_function` constructor:
+
+.. code-block:: cpp
+
+    class_<MyClass>(m, "MyClass")
+        .def_property("data"
+            py::cpp_function(&MyClass::getData, py::return_value_policy::copy),
+            py::cpp_function(&MyClass::setData)
+        );
+
+.. warning::
+
+    Code with invalid return value policies might access uninitialized memory or
+    free data structures multiple times, which can lead to hard-to-debug
+    non-determinism and segmentation faults, hence it is worth spending the
+    time to understand all the different options in the table above.
+
+.. note::
+
+    One important aspect of the above policies is that they only apply to
+    instances which pybind11 has *not* seen before, in which case the policy
+    clarifies essential questions about the return value's lifetime and
+    ownership.  When pybind11 knows the instance already (as identified by its
+    type and address in memory), it will return the existing Python object
+    wrapper rather than creating a new copy.
+
+.. note::
+
+    The next section on :ref:`call_policies` discusses *call policies* that can be
+    specified *in addition* to a return value policy from the list above. Call
+    policies indicate reference relationships that can involve both return values
+    and parameters of functions.
+
+.. note::
+
+   As an alternative to elaborate call policies and lifetime management logic,
+   consider using smart pointers (see the section on :ref:`smart_pointers` for
+   details). Smart pointers can tell whether an object is still referenced from
+   C++ or Python, which generally eliminates the kinds of inconsistencies that
+   can lead to crashes or undefined behavior. For functions returning smart
+   pointers, it is not necessary to specify a return value policy.
+
+.. _call_policies:
+
+Additional call policies
+========================
+
+In addition to the above return value policies, further *call policies* can be
+specified to indicate dependencies between parameters or ensure a certain state
+for the function call.
+
+Keep alive
+----------
+
+In general, this policy is required when the C++ object is any kind of container
+and another object is being added to the container. ``keep_alive<Nurse, Patient>``
+indicates that the argument with index ``Patient`` should be kept alive at least
+until the argument with index ``Nurse`` is freed by the garbage collector. Argument
+indices start at one, while zero refers to the return value. For methods, index
+``1`` refers to the implicit ``this`` pointer, while regular arguments begin at
+index ``2``. Arbitrarily many call policies can be specified. When a ``Nurse``
+with value ``None`` is detected at runtime, the call policy does nothing.
+
+When the nurse is not a pybind11-registered type, the implementation internally
+relies on the ability to create a *weak reference* to the nurse object. When
+the nurse object is not a pybind11-registered type and does not support weak
+references, an exception will be thrown.
+
+Consider the following example: here, the binding code for a list append
+operation ties the lifetime of the newly added element to the underlying
+container:
+
+.. code-block:: cpp
+
+    py::class_<List>(m, "List")
+        .def("append", &List::append, py::keep_alive<1, 2>());
+
+For consistency, the argument indexing is identical for constructors. Index
+``1`` still refers to the implicit ``this`` pointer, i.e. the object which is
+being constructed. Index ``0`` refers to the return type which is presumed to
+be ``void`` when a constructor is viewed like a function. The following example
+ties the lifetime of the constructor element to the constructed object:
+
+.. code-block:: cpp
+
+    py::class_<Nurse>(m, "Nurse")
+        .def(py::init<Patient &>(), py::keep_alive<1, 2>());
+
+.. note::
+
+    ``keep_alive`` is analogous to the ``with_custodian_and_ward`` (if Nurse,
+    Patient != 0) and ``with_custodian_and_ward_postcall`` (if Nurse/Patient ==
+    0) policies from Boost.Python.
+
+Call guard
+----------
+
+The ``call_guard<T>`` policy allows any scope guard type ``T`` to be placed
+around the function call. For example, this definition:
+
+.. code-block:: cpp
+
+    m.def("foo", foo, py::call_guard<T>());
+
+is equivalent to the following pseudocode:
+
+.. code-block:: cpp
+
+    m.def("foo", [](args...) {
+        T scope_guard;
+        return foo(args...); // forwarded arguments
+    });
+
+The only requirement is that ``T`` is default-constructible, but otherwise any
+scope guard will work. This is very useful in combination with `gil_scoped_release`.
+See :ref:`gil`.
+
+Multiple guards can also be specified as ``py::call_guard<T1, T2, T3...>``. The
+constructor order is left to right and destruction happens in reverse.
+
+.. seealso::
+
+    The file :file:`tests/test_call_policies.cpp` contains a complete example
+    that demonstrates using `keep_alive` and `call_guard` in more detail.
+
+.. _python_objects_as_args:
+
+Python objects as arguments
+===========================
+
+pybind11 exposes all major Python types using thin C++ wrapper classes. These
+wrapper classes can also be used as parameters of functions in bindings, which
+makes it possible to directly work with native Python types on the C++ side.
+For instance, the following statement iterates over a Python ``dict``:
+
+.. code-block:: cpp
+
+    void print_dict(py::dict dict) {
+        /* Easily interact with Python types */
+        for (auto item : dict)
+            std::cout << "key=" << std::string(py::str(item.first)) << ", "
+                      << "value=" << std::string(py::str(item.second)) << std::endl;
+    }
+
+It can be exported:
+
+.. code-block:: cpp
+
+    m.def("print_dict", &print_dict);
+
+And used in Python as usual:
+
+.. code-block:: pycon
+
+    >>> print_dict({'foo': 123, 'bar': 'hello'})
+    key=foo, value=123
+    key=bar, value=hello
+
+For more information on using Python objects in C++, see :doc:`/advanced/pycpp/index`.
+
+Accepting \*args and \*\*kwargs
+===============================
+
+Python provides a useful mechanism to define functions that accept arbitrary
+numbers of arguments and keyword arguments:
+
+.. code-block:: python
+
+   def generic(*args, **kwargs):
+       ...  # do something with args and kwargs
+
+Such functions can also be created using pybind11:
+
+.. code-block:: cpp
+
+   void generic(py::args args, py::kwargs kwargs) {
+       /// .. do something with args
+       if (kwargs)
+           /// .. do something with kwargs
+   }
+
+   /// Binding code
+   m.def("generic", &generic);
+
+The class ``py::args`` derives from ``py::tuple`` and ``py::kwargs`` derives
+from ``py::dict``.
+
+You may also use just one or the other, and may combine these with other
+arguments as long as the ``py::args`` and ``py::kwargs`` arguments are the last
+arguments accepted by the function.
+
+Please refer to the other examples for details on how to iterate over these,
+and on how to cast their entries into C++ objects. A demonstration is also
+available in ``tests/test_kwargs_and_defaults.cpp``.
+
+.. note::
+
+    When combining \*args or \*\*kwargs with :ref:`keyword_args` you should
+    *not* include ``py::arg`` tags for the ``py::args`` and ``py::kwargs``
+    arguments.
+
+Default arguments revisited
+===========================
+
+The section on :ref:`default_args` previously discussed basic usage of default
+arguments using pybind11. One noteworthy aspect of their implementation is that
+default arguments are converted to Python objects right at declaration time.
+Consider the following example:
+
+.. code-block:: cpp
+
+    py::class_<MyClass>("MyClass")
+        .def("myFunction", py::arg("arg") = SomeType(123));
+
+In this case, pybind11 must already be set up to deal with values of the type
+``SomeType`` (via a prior instantiation of ``py::class_<SomeType>``), or an
+exception will be thrown.
+
+Another aspect worth highlighting is that the "preview" of the default argument
+in the function signature is generated using the object's ``__repr__`` method.
+If not available, the signature may not be very helpful, e.g.:
+
+.. code-block:: pycon
+
+    FUNCTIONS
+    ...
+    |  myFunction(...)
+    |      Signature : (MyClass, arg : SomeType = <SomeType object at 0x101b7b080>) -> NoneType
+    ...
+
+The first way of addressing this is by defining ``SomeType.__repr__``.
+Alternatively, it is possible to specify the human-readable preview of the
+default argument manually using the ``arg_v`` notation:
+
+.. code-block:: cpp
+
+    py::class_<MyClass>("MyClass")
+        .def("myFunction", py::arg_v("arg", SomeType(123), "SomeType(123)"));
+
+Sometimes it may be necessary to pass a null pointer value as a default
+argument. In this case, remember to cast it to the underlying type in question,
+like so:
+
+.. code-block:: cpp
+
+    py::class_<MyClass>("MyClass")
+        .def("myFunction", py::arg("arg") = (SomeType *) nullptr);
+
+Keyword-only arguments
+======================
+
+Python 3 introduced keyword-only arguments by specifying an unnamed ``*``
+argument in a function definition:
+
+.. code-block:: python
+
+    def f(a, *, b):  # a can be positional or via keyword; b must be via keyword
+        pass
+
+    f(a=1, b=2)  # good
+    f(b=2, a=1)  # good
+    f(1, b=2)    # good
+    f(1, 2)      # TypeError: f() takes 1 positional argument but 2 were given
+
+Pybind11 provides a ``py::kwonly`` object that allows you to implement
+the same behaviour by specifying the object between positional and keyword-only
+argument annotations when registering the function:
+
+.. code-block:: cpp
+
+    m.def("f", [](int a, int b) { /* ... */ },
+          py::arg("a"), py::kwonly(), py::arg("b"));
+
+Note that, as in Python, you cannot combine this with a ``py::args`` argument.
+This feature does *not* require Python 3 to work.
+
+.. versionadded:: 2.6
+
+.. _nonconverting_arguments:
+
+Non-converting arguments
+========================
+
+Certain argument types may support conversion from one type to another.  Some
+examples of conversions are:
+
+* :ref:`implicit_conversions` declared using ``py::implicitly_convertible<A,B>()``
+* Calling a method accepting a double with an integer argument
+* Calling a ``std::complex<float>`` argument with a non-complex python type
+  (for example, with a float).  (Requires the optional ``pybind11/complex.h``
+  header).
+* Calling a function taking an Eigen matrix reference with a numpy array of the
+  wrong type or of an incompatible data layout.  (Requires the optional
+  ``pybind11/eigen.h`` header).
+
+This behaviour is sometimes undesirable: the binding code may prefer to raise
+an error rather than convert the argument.  This behaviour can be obtained
+through ``py::arg`` by calling the ``.noconvert()`` method of the ``py::arg``
+object, such as:
+
+.. code-block:: cpp
+
+    m.def("floats_only", [](double f) { return 0.5 * f; }, py::arg("f").noconvert());
+    m.def("floats_preferred", [](double f) { return 0.5 * f; }, py::arg("f"));
+
+Attempting the call the second function (the one without ``.noconvert()``) with
+an integer will succeed, but attempting to call the ``.noconvert()`` version
+will fail with a ``TypeError``:
+
+.. code-block:: pycon
+
+    >>> floats_preferred(4)
+    2.0
+    >>> floats_only(4)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: floats_only(): incompatible function arguments. The following argument types are supported:
+        1. (f: float) -> float
+
+    Invoked with: 4
+
+You may, of course, combine this with the :var:`_a` shorthand notation (see
+:ref:`keyword_args`) and/or :ref:`default_args`.  It is also permitted to omit
+the argument name by using the ``py::arg()`` constructor without an argument
+name, i.e. by specifying ``py::arg().noconvert()``.
+
+.. note::
+
+    When specifying ``py::arg`` options it is necessary to provide the same
+    number of options as the bound function has arguments.  Thus if you want to
+    enable no-convert behaviour for just one of several arguments, you will
+    need to specify a ``py::arg()`` annotation for each argument with the
+    no-convert argument modified to ``py::arg().noconvert()``.
+
+.. _none_arguments:
+
+Allow/Prohibiting None arguments
+================================
+
+When a C++ type registered with :class:`py::class_` is passed as an argument to
+a function taking the instance as pointer or shared holder (e.g. ``shared_ptr``
+or a custom, copyable holder as described in :ref:`smart_pointers`), pybind
+allows ``None`` to be passed from Python which results in calling the C++
+function with ``nullptr`` (or an empty holder) for the argument.
+
+To explicitly enable or disable this behaviour, using the
+``.none`` method of the :class:`py::arg` object:
+
+.. code-block:: cpp
+
+    py::class_<Dog>(m, "Dog").def(py::init<>());
+    py::class_<Cat>(m, "Cat").def(py::init<>());
+    m.def("bark", [](Dog *dog) -> std::string {
+        if (dog) return "woof!"; /* Called with a Dog instance */
+        else return "(no dog)"; /* Called with None, dog == nullptr */
+    }, py::arg("dog").none(true));
+    m.def("meow", [](Cat *cat) -> std::string {
+        // Can't be called with None argument
+        return "meow";
+    }, py::arg("cat").none(false));
+
+With the above, the Python call ``bark(None)`` will return the string ``"(no
+dog)"``, while attempting to call ``meow(None)`` will raise a ``TypeError``:
+
+.. code-block:: pycon
+
+    >>> from animals import Dog, Cat, bark, meow
+    >>> bark(Dog())
+    'woof!'
+    >>> meow(Cat())
+    'meow'
+    >>> bark(None)
+    '(no dog)'
+    >>> meow(None)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: meow(): incompatible function arguments. The following argument types are supported:
+        1. (cat: animals.Cat) -> str
+
+    Invoked with: None
+
+The default behaviour when the tag is unspecified is to allow ``None``.
+
+.. note::
+
+    Even when ``.none(true)`` is specified for an argument, ``None`` will be converted to a
+    ``nullptr`` *only* for custom and :ref:`opaque <opaque>` types. Pointers to built-in types
+    (``double *``, ``int *``, ...) and STL types (``std::vector<T> *``, ...; if ``pybind11/stl.h``
+    is included) are copied when converted to C++ (see :doc:`/advanced/cast/overview`) and will
+    not allow ``None`` as argument.  To pass optional argument of these copied types consider
+    using ``std::optional<T>``
+
+Overload resolution order
+=========================
+
+When a function or method with multiple overloads is called from Python,
+pybind11 determines which overload to call in two passes.  The first pass
+attempts to call each overload without allowing argument conversion (as if
+every argument had been specified as ``py::arg().noconvert()`` as described
+above).
+
+If no overload succeeds in the no-conversion first pass, a second pass is
+attempted in which argument conversion is allowed (except where prohibited via
+an explicit ``py::arg().noconvert()`` attribute in the function definition).
+
+If the second pass also fails a ``TypeError`` is raised.
+
+Within each pass, overloads are tried in the order they were registered with
+pybind11.
+
+What this means in practice is that pybind11 will prefer any overload that does
+not require conversion of arguments to an overload that does, but otherwise prefers
+earlier-defined overloads to later-defined ones.
+
+.. note::
+
+    pybind11 does *not* further prioritize based on the number/pattern of
+    overloaded arguments.  That is, pybind11 does not prioritize a function
+    requiring one conversion over one requiring three, but only prioritizes
+    overloads requiring no conversion at all to overloads that require
+    conversion of at least one argument.
diff --git a/pybind11/docs/advanced/misc.rst b/pybind11/docs/advanced/misc.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0a73dae7e7f675736b0b0aea5aa740154988bf47
--- /dev/null
+++ b/pybind11/docs/advanced/misc.rst
@@ -0,0 +1,337 @@
+Miscellaneous
+#############
+
+.. _macro_notes:
+
+General notes regarding convenience macros
+==========================================
+
+pybind11 provides a few convenience macros such as
+:func:`PYBIND11_DECLARE_HOLDER_TYPE` and ``PYBIND11_OVERLOAD_*``. Since these
+are "just" macros that are evaluated in the preprocessor (which has no concept
+of types), they *will* get confused by commas in a template argument; for
+example, consider:
+
+.. code-block:: cpp
+
+    PYBIND11_OVERLOAD(MyReturnType<T1, T2>, Class<T3, T4>, func)
+
+The limitation of the C preprocessor interprets this as five arguments (with new
+arguments beginning after each comma) rather than three.  To get around this,
+there are two alternatives: you can use a type alias, or you can wrap the type
+using the ``PYBIND11_TYPE`` macro:
+
+.. code-block:: cpp
+
+    // Version 1: using a type alias
+    using ReturnType = MyReturnType<T1, T2>;
+    using ClassType = Class<T3, T4>;
+    PYBIND11_OVERLOAD(ReturnType, ClassType, func);
+
+    // Version 2: using the PYBIND11_TYPE macro:
+    PYBIND11_OVERLOAD(PYBIND11_TYPE(MyReturnType<T1, T2>),
+                      PYBIND11_TYPE(Class<T3, T4>), func)
+
+The ``PYBIND11_MAKE_OPAQUE`` macro does *not* require the above workarounds.
+
+.. _gil:
+
+Global Interpreter Lock (GIL)
+=============================
+
+When calling a C++ function from Python, the GIL is always held.
+The classes :class:`gil_scoped_release` and :class:`gil_scoped_acquire` can be
+used to acquire and release the global interpreter lock in the body of a C++
+function call. In this way, long-running C++ code can be parallelized using
+multiple Python threads. Taking :ref:`overriding_virtuals` as an example, this
+could be realized as follows (important changes highlighted):
+
+.. code-block:: cpp
+    :emphasize-lines: 8,9,31,32
+
+    class PyAnimal : public Animal {
+    public:
+        /* Inherit the constructors */
+        using Animal::Animal;
+
+        /* Trampoline (need one for each virtual function) */
+        std::string go(int n_times) {
+            /* Acquire GIL before calling Python code */
+            py::gil_scoped_acquire acquire;
+
+            PYBIND11_OVERLOAD_PURE(
+                std::string, /* Return type */
+                Animal,      /* Parent class */
+                go,          /* Name of function */
+                n_times      /* Argument(s) */
+            );
+        }
+    };
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Animal, PyAnimal> animal(m, "Animal");
+        animal
+            .def(py::init<>())
+            .def("go", &Animal::go);
+
+        py::class_<Dog>(m, "Dog", animal)
+            .def(py::init<>());
+
+        m.def("call_go", [](Animal *animal) -> std::string {
+            /* Release GIL before calling into (potentially long-running) C++ code */
+            py::gil_scoped_release release;
+            return call_go(animal);
+        });
+    }
+
+The ``call_go`` wrapper can also be simplified using the `call_guard` policy
+(see :ref:`call_policies`) which yields the same result:
+
+.. code-block:: cpp
+
+    m.def("call_go", &call_go, py::call_guard<py::gil_scoped_release>());
+
+
+Binding sequence data types, iterators, the slicing protocol, etc.
+==================================================================
+
+Please refer to the supplemental example for details.
+
+.. seealso::
+
+    The file :file:`tests/test_sequences_and_iterators.cpp` contains a
+    complete example that shows how to bind a sequence data type, including
+    length queries (``__len__``), iterators (``__iter__``), the slicing
+    protocol and other kinds of useful operations.
+
+
+Partitioning code over multiple extension modules
+=================================================
+
+It's straightforward to split binding code over multiple extension modules,
+while referencing types that are declared elsewhere. Everything "just" works
+without any special precautions. One exception to this rule occurs when
+extending a type declared in another extension module. Recall the basic example
+from Section :ref:`inheritance`.
+
+.. code-block:: cpp
+
+    py::class_<Pet> pet(m, "Pet");
+    pet.def(py::init<const std::string &>())
+       .def_readwrite("name", &Pet::name);
+
+    py::class_<Dog>(m, "Dog", pet /* <- specify parent */)
+        .def(py::init<const std::string &>())
+        .def("bark", &Dog::bark);
+
+Suppose now that ``Pet`` bindings are defined in a module named ``basic``,
+whereas the ``Dog`` bindings are defined somewhere else. The challenge is of
+course that the variable ``pet`` is not available anymore though it is needed
+to indicate the inheritance relationship to the constructor of ``class_<Dog>``.
+However, it can be acquired as follows:
+
+.. code-block:: cpp
+
+    py::object pet = (py::object) py::module::import("basic").attr("Pet");
+
+    py::class_<Dog>(m, "Dog", pet)
+        .def(py::init<const std::string &>())
+        .def("bark", &Dog::bark);
+
+Alternatively, you can specify the base class as a template parameter option to
+``class_``, which performs an automated lookup of the corresponding Python
+type. Like the above code, however, this also requires invoking the ``import``
+function once to ensure that the pybind11 binding code of the module ``basic``
+has been executed:
+
+.. code-block:: cpp
+
+    py::module::import("basic");
+
+    py::class_<Dog, Pet>(m, "Dog")
+        .def(py::init<const std::string &>())
+        .def("bark", &Dog::bark);
+
+Naturally, both methods will fail when there are cyclic dependencies.
+
+Note that pybind11 code compiled with hidden-by-default symbol visibility (e.g.
+via the command line flag ``-fvisibility=hidden`` on GCC/Clang), which is
+required for proper pybind11 functionality, can interfere with the ability to
+access types defined in another extension module.  Working around this requires
+manually exporting types that are accessed by multiple extension modules;
+pybind11 provides a macro to do just this:
+
+.. code-block:: cpp
+
+    class PYBIND11_EXPORT Dog : public Animal {
+        ...
+    };
+
+Note also that it is possible (although would rarely be required) to share arbitrary
+C++ objects between extension modules at runtime. Internal library data is shared
+between modules using capsule machinery [#f6]_ which can be also utilized for
+storing, modifying and accessing user-defined data. Note that an extension module
+will "see" other extensions' data if and only if they were built with the same
+pybind11 version. Consider the following example:
+
+.. code-block:: cpp
+
+    auto data = (MyData *) py::get_shared_data("mydata");
+    if (!data)
+        data = (MyData *) py::set_shared_data("mydata", new MyData(42));
+
+If the above snippet was used in several separately compiled extension modules,
+the first one to be imported would create a ``MyData`` instance and associate
+a ``"mydata"`` key with a pointer to it. Extensions that are imported later
+would be then able to access the data behind the same pointer.
+
+.. [#f6] https://docs.python.org/3/extending/extending.html#using-capsules
+
+Module Destructors
+==================
+
+pybind11 does not provide an explicit mechanism to invoke cleanup code at
+module destruction time. In rare cases where such functionality is required, it
+is possible to emulate it using Python capsules or weak references with a
+destruction callback.
+
+.. code-block:: cpp
+
+    auto cleanup_callback = []() {
+        // perform cleanup here -- this function is called with the GIL held
+    };
+
+    m.add_object("_cleanup", py::capsule(cleanup_callback));
+
+This approach has the potential downside that instances of classes exposed
+within the module may still be alive when the cleanup callback is invoked
+(whether this is acceptable will generally depend on the application).
+
+Alternatively, the capsule may also be stashed within a type object, which
+ensures that it not called before all instances of that type have been
+collected:
+
+.. code-block:: cpp
+
+    auto cleanup_callback = []() { /* ... */ };
+    m.attr("BaseClass").attr("_cleanup") = py::capsule(cleanup_callback);
+
+Both approaches also expose a potentially dangerous ``_cleanup`` attribute in
+Python, which may be undesirable from an API standpoint (a premature explicit
+call from Python might lead to undefined behavior). Yet another approach that
+avoids this issue involves weak reference with a cleanup callback:
+
+.. code-block:: cpp
+
+    // Register a callback function that is invoked when the BaseClass object is colelcted
+    py::cpp_function cleanup_callback(
+        [](py::handle weakref) {
+            // perform cleanup here -- this function is called with the GIL held
+
+            weakref.dec_ref(); // release weak reference
+        }
+    );
+
+    // Create a weak reference with a cleanup callback and initially leak it
+    (void) py::weakref(m.attr("BaseClass"), cleanup_callback).release();
+
+.. note::
+
+    PyPy (at least version 5.9) does not garbage collect objects when the
+    interpreter exits. An alternative approach (which also works on CPython) is to use
+    the :py:mod:`atexit` module [#f7]_, for example:
+
+    .. code-block:: cpp
+
+        auto atexit = py::module::import("atexit");
+        atexit.attr("register")(py::cpp_function([]() {
+            // perform cleanup here -- this function is called with the GIL held
+        }));
+
+    .. [#f7] https://docs.python.org/3/library/atexit.html
+
+
+Generating documentation using Sphinx
+=====================================
+
+Sphinx [#f4]_ has the ability to inspect the signatures and documentation
+strings in pybind11-based extension modules to automatically generate beautiful
+documentation in a variety formats. The python_example repository [#f5]_ contains a
+simple example repository which uses this approach.
+
+There are two potential gotchas when using this approach: first, make sure that
+the resulting strings do not contain any :kbd:`TAB` characters, which break the
+docstring parsing routines. You may want to use C++11 raw string literals,
+which are convenient for multi-line comments. Conveniently, any excess
+indentation will be automatically be removed by Sphinx. However, for this to
+work, it is important that all lines are indented consistently, i.e.:
+
+.. code-block:: cpp
+
+    // ok
+    m.def("foo", &foo, R"mydelimiter(
+        The foo function
+
+        Parameters
+        ----------
+    )mydelimiter");
+
+    // *not ok*
+    m.def("foo", &foo, R"mydelimiter(The foo function
+
+        Parameters
+        ----------
+    )mydelimiter");
+
+By default, pybind11 automatically generates and prepends a signature to the docstring of a function
+registered with ``module::def()`` and ``class_::def()``. Sometimes this
+behavior is not desirable, because you want to provide your own signature or remove
+the docstring completely to exclude the function from the Sphinx documentation.
+The class ``options`` allows you to selectively suppress auto-generated signatures:
+
+.. code-block:: cpp
+
+    PYBIND11_MODULE(example, m) {
+        py::options options;
+        options.disable_function_signatures();
+
+        m.def("add", [](int a, int b) { return a + b; }, "A function which adds two numbers");
+    }
+
+Note that changes to the settings affect only function bindings created during the
+lifetime of the ``options`` instance. When it goes out of scope at the end of the module's init function,
+the default settings are restored to prevent unwanted side effects.
+
+.. [#f4] http://www.sphinx-doc.org
+.. [#f5] http://github.com/pybind/python_example
+
+.. _avoiding-cpp-types-in-docstrings:
+
+Avoiding C++ types in docstrings
+================================
+
+Docstrings are generated at the time of the declaration, e.g. when ``.def(...)`` is called.
+At this point parameter and return types should be known to pybind11.
+If a custom type is not exposed yet through a ``py::class_`` constructor or a custom type caster,
+its C++ type name will be used instead to generate the signature in the docstring:
+
+.. code-block:: text
+
+     |  __init__(...)
+     |      __init__(self: example.Foo, arg0: ns::Bar) -> None
+                                              ^^^^^^^
+
+
+This limitation can be circumvented by ensuring that C++ classes are registered with pybind11
+before they are used as a parameter or return type of a function:
+
+.. code-block:: cpp
+
+    PYBIND11_MODULE(example, m) {
+
+        auto pyFoo = py::class_<ns::Foo>(m, "Foo");
+        auto pyBar = py::class_<ns::Bar>(m, "Bar");
+
+        pyFoo.def(py::init<const ns::Bar&>());
+        pyBar.def(py::init<const ns::Foo&>());
+    }
diff --git a/pybind11/docs/advanced/pycpp/index.rst b/pybind11/docs/advanced/pycpp/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6885bdcff1b56bbab5605873ccb1e0676864bb03
--- /dev/null
+++ b/pybind11/docs/advanced/pycpp/index.rst
@@ -0,0 +1,13 @@
+Python C++ interface
+####################
+
+pybind11 exposes Python types and functions using thin C++ wrappers, which
+makes it possible to conveniently call Python code from C++ without resorting
+to Python's C API.
+
+.. toctree::
+   :maxdepth: 2
+
+   object
+   numpy
+   utilities
diff --git a/pybind11/docs/advanced/pycpp/numpy.rst b/pybind11/docs/advanced/pycpp/numpy.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8e5c6092c471f1006672e4060def4fd31786622b
--- /dev/null
+++ b/pybind11/docs/advanced/pycpp/numpy.rst
@@ -0,0 +1,436 @@
+.. _numpy:
+
+NumPy
+#####
+
+Buffer protocol
+===============
+
+Python supports an extremely general and convenient approach for exchanging
+data between plugin libraries. Types can expose a buffer view [#f2]_, which
+provides fast direct access to the raw internal data representation. Suppose we
+want to bind the following simplistic Matrix class:
+
+.. code-block:: cpp
+
+    class Matrix {
+    public:
+        Matrix(size_t rows, size_t cols) : m_rows(rows), m_cols(cols) {
+            m_data = new float[rows*cols];
+        }
+        float *data() { return m_data; }
+        size_t rows() const { return m_rows; }
+        size_t cols() const { return m_cols; }
+    private:
+        size_t m_rows, m_cols;
+        float *m_data;
+    };
+
+The following binding code exposes the ``Matrix`` contents as a buffer object,
+making it possible to cast Matrices into NumPy arrays. It is even possible to
+completely avoid copy operations with Python expressions like
+``np.array(matrix_instance, copy = False)``.
+
+.. code-block:: cpp
+
+    py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
+       .def_buffer([](Matrix &m) -> py::buffer_info {
+            return py::buffer_info(
+                m.data(),                               /* Pointer to buffer */
+                sizeof(float),                          /* Size of one scalar */
+                py::format_descriptor<float>::format(), /* Python struct-style format descriptor */
+                2,                                      /* Number of dimensions */
+                { m.rows(), m.cols() },                 /* Buffer dimensions */
+                { sizeof(float) * m.cols(),             /* Strides (in bytes) for each index */
+                  sizeof(float) }
+            );
+        });
+
+Supporting the buffer protocol in a new type involves specifying the special
+``py::buffer_protocol()`` tag in the ``py::class_`` constructor and calling the
+``def_buffer()`` method with a lambda function that creates a
+``py::buffer_info`` description record on demand describing a given matrix
+instance. The contents of ``py::buffer_info`` mirror the Python buffer protocol
+specification.
+
+.. code-block:: cpp
+
+    struct buffer_info {
+        void *ptr;
+        ssize_t itemsize;
+        std::string format;
+        ssize_t ndim;
+        std::vector<ssize_t> shape;
+        std::vector<ssize_t> strides;
+    };
+
+To create a C++ function that can take a Python buffer object as an argument,
+simply use the type ``py::buffer`` as one of its arguments. Buffers can exist
+in a great variety of configurations, hence some safety checks are usually
+necessary in the function body. Below, you can see a basic example on how to
+define a custom constructor for the Eigen double precision matrix
+(``Eigen::MatrixXd``) type, which supports initialization from compatible
+buffer objects (e.g. a NumPy matrix).
+
+.. code-block:: cpp
+
+    /* Bind MatrixXd (or some other Eigen type) to Python */
+    typedef Eigen::MatrixXd Matrix;
+
+    typedef Matrix::Scalar Scalar;
+    constexpr bool rowMajor = Matrix::Flags & Eigen::RowMajorBit;
+
+    py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
+        .def(py::init([](py::buffer b) {
+            typedef Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic> Strides;
+
+            /* Request a buffer descriptor from Python */
+            py::buffer_info info = b.request();
+
+            /* Some sanity checks ... */
+            if (info.format != py::format_descriptor<Scalar>::format())
+                throw std::runtime_error("Incompatible format: expected a double array!");
+
+            if (info.ndim != 2)
+                throw std::runtime_error("Incompatible buffer dimension!");
+
+            auto strides = Strides(
+                info.strides[rowMajor ? 0 : 1] / (py::ssize_t)sizeof(Scalar),
+                info.strides[rowMajor ? 1 : 0] / (py::ssize_t)sizeof(Scalar));
+
+            auto map = Eigen::Map<Matrix, 0, Strides>(
+                static_cast<Scalar *>(info.ptr), info.shape[0], info.shape[1], strides);
+
+            return Matrix(map);
+        }));
+
+For reference, the ``def_buffer()`` call for this Eigen data type should look
+as follows:
+
+.. code-block:: cpp
+
+    .def_buffer([](Matrix &m) -> py::buffer_info {
+        return py::buffer_info(
+            m.data(),                                /* Pointer to buffer */
+            sizeof(Scalar),                          /* Size of one scalar */
+            py::format_descriptor<Scalar>::format(), /* Python struct-style format descriptor */
+            2,                                       /* Number of dimensions */
+            { m.rows(), m.cols() },                  /* Buffer dimensions */
+            { sizeof(Scalar) * (rowMajor ? m.cols() : 1),
+              sizeof(Scalar) * (rowMajor ? 1 : m.rows()) }
+                                                     /* Strides (in bytes) for each index */
+        );
+     })
+
+For a much easier approach of binding Eigen types (although with some
+limitations), refer to the section on :doc:`/advanced/cast/eigen`.
+
+.. seealso::
+
+    The file :file:`tests/test_buffers.cpp` contains a complete example
+    that demonstrates using the buffer protocol with pybind11 in more detail.
+
+.. [#f2] http://docs.python.org/3/c-api/buffer.html
+
+Arrays
+======
+
+By exchanging ``py::buffer`` with ``py::array`` in the above snippet, we can
+restrict the function so that it only accepts NumPy arrays (rather than any
+type of Python object satisfying the buffer protocol).
+
+In many situations, we want to define a function which only accepts a NumPy
+array of a certain data type. This is possible via the ``py::array_t<T>``
+template. For instance, the following function requires the argument to be a
+NumPy array containing double precision values.
+
+.. code-block:: cpp
+
+    void f(py::array_t<double> array);
+
+When it is invoked with a different type (e.g. an integer or a list of
+integers), the binding code will attempt to cast the input into a NumPy array
+of the requested type. Note that this feature requires the
+:file:`pybind11/numpy.h` header to be included.
+
+Data in NumPy arrays is not guaranteed to packed in a dense manner;
+furthermore, entries can be separated by arbitrary column and row strides.
+Sometimes, it can be useful to require a function to only accept dense arrays
+using either the C (row-major) or Fortran (column-major) ordering. This can be
+accomplished via a second template argument with values ``py::array::c_style``
+or ``py::array::f_style``.
+
+.. code-block:: cpp
+
+    void f(py::array_t<double, py::array::c_style | py::array::forcecast> array);
+
+The ``py::array::forcecast`` argument is the default value of the second
+template parameter, and it ensures that non-conforming arguments are converted
+into an array satisfying the specified requirements instead of trying the next
+function overload.
+
+Structured types
+================
+
+In order for ``py::array_t`` to work with structured (record) types, we first
+need to register the memory layout of the type. This can be done via
+``PYBIND11_NUMPY_DTYPE`` macro, called in the plugin definition code, which
+expects the type followed by field names:
+
+.. code-block:: cpp
+
+    struct A {
+        int x;
+        double y;
+    };
+
+    struct B {
+        int z;
+        A a;
+    };
+
+    // ...
+    PYBIND11_MODULE(test, m) {
+        // ...
+
+        PYBIND11_NUMPY_DTYPE(A, x, y);
+        PYBIND11_NUMPY_DTYPE(B, z, a);
+        /* now both A and B can be used as template arguments to py::array_t */
+    }
+
+The structure should consist of fundamental arithmetic types, ``std::complex``,
+previously registered substructures, and arrays of any of the above. Both C++
+arrays and ``std::array`` are supported. While there is a static assertion to
+prevent many types of unsupported structures, it is still the user's
+responsibility to use only "plain" structures that can be safely manipulated as
+raw memory without violating invariants.
+
+Vectorizing functions
+=====================
+
+Suppose we want to bind a function with the following signature to Python so
+that it can process arbitrary NumPy array arguments (vectors, matrices, general
+N-D arrays) in addition to its normal arguments:
+
+.. code-block:: cpp
+
+    double my_func(int x, float y, double z);
+
+After including the ``pybind11/numpy.h`` header, this is extremely simple:
+
+.. code-block:: cpp
+
+    m.def("vectorized_func", py::vectorize(my_func));
+
+Invoking the function like below causes 4 calls to be made to ``my_func`` with
+each of the array elements. The significant advantage of this compared to
+solutions like ``numpy.vectorize()`` is that the loop over the elements runs
+entirely on the C++ side and can be crunched down into a tight, optimized loop
+by the compiler. The result is returned as a NumPy array of type
+``numpy.dtype.float64``.
+
+.. code-block:: pycon
+
+    >>> x = np.array([[1, 3],[5, 7]])
+    >>> y = np.array([[2, 4],[6, 8]])
+    >>> z = 3
+    >>> result = vectorized_func(x, y, z)
+
+The scalar argument ``z`` is transparently replicated 4 times.  The input
+arrays ``x`` and ``y`` are automatically converted into the right types (they
+are of type  ``numpy.dtype.int64`` but need to be ``numpy.dtype.int32`` and
+``numpy.dtype.float32``, respectively).
+
+.. note::
+
+    Only arithmetic, complex, and POD types passed by value or by ``const &``
+    reference are vectorized; all other arguments are passed through as-is.
+    Functions taking rvalue reference arguments cannot be vectorized.
+
+In cases where the computation is too complicated to be reduced to
+``vectorize``, it will be necessary to create and access the buffer contents
+manually. The following snippet contains a complete example that shows how this
+works (the code is somewhat contrived, since it could have been done more
+simply using ``vectorize``).
+
+.. code-block:: cpp
+
+    #include <pybind11/pybind11.h>
+    #include <pybind11/numpy.h>
+
+    namespace py = pybind11;
+
+    py::array_t<double> add_arrays(py::array_t<double> input1, py::array_t<double> input2) {
+        py::buffer_info buf1 = input1.request(), buf2 = input2.request();
+
+        if (buf1.ndim != 1 || buf2.ndim != 1)
+            throw std::runtime_error("Number of dimensions must be one");
+
+        if (buf1.size != buf2.size)
+            throw std::runtime_error("Input shapes must match");
+
+        /* No pointer is passed, so NumPy will allocate the buffer */
+        auto result = py::array_t<double>(buf1.size);
+
+        py::buffer_info buf3 = result.request();
+
+        double *ptr1 = (double *) buf1.ptr,
+               *ptr2 = (double *) buf2.ptr,
+               *ptr3 = (double *) buf3.ptr;
+
+        for (size_t idx = 0; idx < buf1.shape[0]; idx++)
+            ptr3[idx] = ptr1[idx] + ptr2[idx];
+
+        return result;
+    }
+
+    PYBIND11_MODULE(test, m) {
+        m.def("add_arrays", &add_arrays, "Add two NumPy arrays");
+    }
+
+.. seealso::
+
+    The file :file:`tests/test_numpy_vectorize.cpp` contains a complete
+    example that demonstrates using :func:`vectorize` in more detail.
+
+Direct access
+=============
+
+For performance reasons, particularly when dealing with very large arrays, it
+is often desirable to directly access array elements without internal checking
+of dimensions and bounds on every access when indices are known to be already
+valid.  To avoid such checks, the ``array`` class and ``array_t<T>`` template
+class offer an unchecked proxy object that can be used for this unchecked
+access through the ``unchecked<N>`` and ``mutable_unchecked<N>`` methods,
+where ``N`` gives the required dimensionality of the array:
+
+.. code-block:: cpp
+
+    m.def("sum_3d", [](py::array_t<double> x) {
+        auto r = x.unchecked<3>(); // x must have ndim = 3; can be non-writeable
+        double sum = 0;
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                for (ssize_t k = 0; k < r.shape(2); k++)
+                    sum += r(i, j, k);
+        return sum;
+    });
+    m.def("increment_3d", [](py::array_t<double> x) {
+        auto r = x.mutable_unchecked<3>(); // Will throw if ndim != 3 or flags.writeable is false
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                for (ssize_t k = 0; k < r.shape(2); k++)
+                    r(i, j, k) += 1.0;
+    }, py::arg().noconvert());
+
+To obtain the proxy from an ``array`` object, you must specify both the data
+type and number of dimensions as template arguments, such as ``auto r =
+myarray.mutable_unchecked<float, 2>()``.
+
+If the number of dimensions is not known at compile time, you can omit the
+dimensions template parameter (i.e. calling ``arr_t.unchecked()`` or
+``arr.unchecked<T>()``.  This will give you a proxy object that works in the
+same way, but results in less optimizable code and thus a small efficiency
+loss in tight loops.
+
+Note that the returned proxy object directly references the array's data, and
+only reads its shape, strides, and writeable flag when constructed.  You must
+take care to ensure that the referenced array is not destroyed or reshaped for
+the duration of the returned object, typically by limiting the scope of the
+returned instance.
+
+The returned proxy object supports some of the same methods as ``py::array`` so
+that it can be used as a drop-in replacement for some existing, index-checked
+uses of ``py::array``:
+
+- ``r.ndim()`` returns the number of dimensions
+
+- ``r.data(1, 2, ...)`` and ``r.mutable_data(1, 2, ...)``` returns a pointer to
+  the ``const T`` or ``T`` data, respectively, at the given indices.  The
+  latter is only available to proxies obtained via ``a.mutable_unchecked()``.
+
+- ``itemsize()`` returns the size of an item in bytes, i.e. ``sizeof(T)``.
+
+- ``ndim()`` returns the number of dimensions.
+
+- ``shape(n)`` returns the size of dimension ``n``
+
+- ``size()`` returns the total number of elements (i.e. the product of the shapes).
+
+- ``nbytes()`` returns the number of bytes used by the referenced elements
+  (i.e. ``itemsize()`` times ``size()``).
+
+.. seealso::
+
+    The file :file:`tests/test_numpy_array.cpp` contains additional examples
+    demonstrating the use of this feature.
+
+Ellipsis
+========
+
+Python 3 provides a convenient ``...`` ellipsis notation that is often used to
+slice multidimensional arrays. For instance, the following snippet extracts the
+middle dimensions of a tensor with the first and last index set to zero.
+In Python 2, the syntactic sugar ``...`` is not available, but the singleton
+``Ellipsis`` (of type ``ellipsis``) can still be used directly.
+
+.. code-block:: python
+
+   a = # a NumPy array
+   b = a[0, ..., 0]
+
+The function ``py::ellipsis()`` function can be used to perform the same
+operation on the C++ side:
+
+.. code-block:: cpp
+
+   py::array a = /* A NumPy array */;
+   py::array b = a[py::make_tuple(0, py::ellipsis(), 0)];
+
+.. versionchanged:: 2.6
+   ``py::ellipsis()`` is now also avaliable in Python 2.
+
+Memory view
+===========
+
+For a case when we simply want to provide a direct accessor to C/C++ buffer
+without a concrete class object, we can return a ``memoryview`` object. Suppose
+we wish to expose a ``memoryview`` for 2x4 uint8_t array, we can do the
+following:
+
+.. code-block:: cpp
+
+    const uint8_t buffer[] = {
+        0, 1, 2, 3,
+        4, 5, 6, 7
+    };
+    m.def("get_memoryview2d", []() {
+        return py::memoryview::from_buffer(
+            buffer,                                    // buffer pointer
+            { 2, 4 },                                  // shape (rows, cols)
+            { sizeof(uint8_t) * 4, sizeof(uint8_t) }   // strides in bytes
+        );
+    })
+
+This approach is meant for providing a ``memoryview`` for a C/C++ buffer not
+managed by Python. The user is responsible for managing the lifetime of the
+buffer. Using a ``memoryview`` created in this way after deleting the buffer in
+C++ side results in undefined behavior.
+
+We can also use ``memoryview::from_memory`` for a simple 1D contiguous buffer:
+
+.. code-block:: cpp
+
+    m.def("get_memoryview1d", []() {
+        return py::memoryview::from_memory(
+            buffer,               // buffer pointer
+            sizeof(uint8_t) * 8   // buffer size
+        );
+    })
+
+.. note::
+
+    ``memoryview::from_memory`` is not available in Python 2.
+
+.. versionchanged:: 2.6
+    ``memoryview::from_memory`` added.
diff --git a/pybind11/docs/advanced/pycpp/object.rst b/pybind11/docs/advanced/pycpp/object.rst
new file mode 100644
index 0000000000000000000000000000000000000000..07525d0dc78f5e2ff2f83cd57bdb616413104341
--- /dev/null
+++ b/pybind11/docs/advanced/pycpp/object.rst
@@ -0,0 +1,180 @@
+Python types
+############
+
+.. _wrappers:
+
+Available wrappers
+==================
+
+All major Python types are available as thin C++ wrapper classes. These
+can also be used as function parameters -- see :ref:`python_objects_as_args`.
+
+Available types include :class:`handle`, :class:`object`, :class:`bool_`,
+:class:`int_`, :class:`float_`, :class:`str`, :class:`bytes`, :class:`tuple`,
+:class:`list`, :class:`dict`, :class:`slice`, :class:`none`, :class:`capsule`,
+:class:`iterable`, :class:`iterator`, :class:`function`, :class:`buffer`,
+:class:`array`, and :class:`array_t`.
+
+Casting back and forth
+======================
+
+In this kind of mixed code, it is often necessary to convert arbitrary C++
+types to Python, which can be done using :func:`py::cast`:
+
+.. code-block:: cpp
+
+    MyClass *cls = ..;
+    py::object obj = py::cast(cls);
+
+The reverse direction uses the following syntax:
+
+.. code-block:: cpp
+
+    py::object obj = ...;
+    MyClass *cls = obj.cast<MyClass *>();
+
+When conversion fails, both directions throw the exception :class:`cast_error`.
+
+.. _python_libs:
+
+Accessing Python libraries from C++
+===================================
+
+It is also possible to import objects defined in the Python standard
+library or available in the current Python environment (``sys.path``) and work
+with these in C++.
+
+This example obtains a reference to the Python ``Decimal`` class.
+
+.. code-block:: cpp
+
+    // Equivalent to "from decimal import Decimal"
+    py::object Decimal = py::module::import("decimal").attr("Decimal");
+
+.. code-block:: cpp
+
+    // Try to import scipy
+    py::object scipy = py::module::import("scipy");
+    return scipy.attr("__version__");
+
+.. _calling_python_functions:
+
+Calling Python functions
+========================
+
+It is also possible to call Python classes, functions and methods
+via ``operator()``.
+
+.. code-block:: cpp
+
+    // Construct a Python object of class Decimal
+    py::object pi = Decimal("3.14159");
+
+.. code-block:: cpp
+
+    // Use Python to make our directories
+    py::object os = py::module::import("os");
+    py::object makedirs = os.attr("makedirs");
+    makedirs("/tmp/path/to/somewhere");
+
+One can convert the result obtained from Python to a pure C++ version
+if a ``py::class_`` or type conversion is defined.
+
+.. code-block:: cpp
+
+    py::function f = <...>;
+    py::object result_py = f(1234, "hello", some_instance);
+    MyClass &result = result_py.cast<MyClass>();
+
+.. _calling_python_methods:
+
+Calling Python methods
+========================
+
+To call an object's method, one can again use ``.attr`` to obtain access to the
+Python method.
+
+.. code-block:: cpp
+
+    // Calculate e^π in decimal
+    py::object exp_pi = pi.attr("exp")();
+    py::print(py::str(exp_pi));
+
+In the example above ``pi.attr("exp")`` is a *bound method*: it will always call
+the method for that same instance of the class. Alternately one can create an
+*unbound method* via the Python class (instead of instance) and pass the ``self``
+object explicitly, followed by other arguments.
+
+.. code-block:: cpp
+
+    py::object decimal_exp = Decimal.attr("exp");
+
+    // Compute the e^n for n=0..4
+    for (int n = 0; n < 5; n++) {
+        py::print(decimal_exp(Decimal(n));
+    }
+
+Keyword arguments
+=================
+
+Keyword arguments are also supported. In Python, there is the usual call syntax:
+
+.. code-block:: python
+
+    def f(number, say, to):
+        ...  # function code
+
+    f(1234, say="hello", to=some_instance)  # keyword call in Python
+
+In C++, the same call can be made using:
+
+.. code-block:: cpp
+
+    using namespace pybind11::literals; // to bring in the `_a` literal
+    f(1234, "say"_a="hello", "to"_a=some_instance); // keyword call in C++
+
+Unpacking arguments
+===================
+
+Unpacking of ``*args`` and ``**kwargs`` is also possible and can be mixed with
+other arguments:
+
+.. code-block:: cpp
+
+    // * unpacking
+    py::tuple args = py::make_tuple(1234, "hello", some_instance);
+    f(*args);
+
+    // ** unpacking
+    py::dict kwargs = py::dict("number"_a=1234, "say"_a="hello", "to"_a=some_instance);
+    f(**kwargs);
+
+    // mixed keywords, * and ** unpacking
+    py::tuple args = py::make_tuple(1234);
+    py::dict kwargs = py::dict("to"_a=some_instance);
+    f(*args, "say"_a="hello", **kwargs);
+
+Generalized unpacking according to PEP448_ is also supported:
+
+.. code-block:: cpp
+
+    py::dict kwargs1 = py::dict("number"_a=1234);
+    py::dict kwargs2 = py::dict("to"_a=some_instance);
+    f(**kwargs1, "say"_a="hello", **kwargs2);
+
+.. seealso::
+
+    The file :file:`tests/test_pytypes.cpp` contains a complete
+    example that demonstrates passing native Python types in more detail. The
+    file :file:`tests/test_callbacks.cpp` presents a few examples of calling
+    Python functions from C++, including keywords arguments and unpacking.
+
+.. _PEP448: https://www.python.org/dev/peps/pep-0448/
+
+Handling exceptions
+===================
+
+Python exceptions from wrapper classes will be thrown as a ``py::error_already_set``.
+See :ref:`Handling exceptions from Python in C++
+<handling_python_exceptions_cpp>` for more information on handling exceptions
+raised when calling C++ wrapper classes.
diff --git a/pybind11/docs/advanced/pycpp/utilities.rst b/pybind11/docs/advanced/pycpp/utilities.rst
new file mode 100644
index 0000000000000000000000000000000000000000..369e7c94dbd69f3ce7bb2d837a53ea2853a04efc
--- /dev/null
+++ b/pybind11/docs/advanced/pycpp/utilities.rst
@@ -0,0 +1,144 @@
+Utilities
+#########
+
+Using Python's print function in C++
+====================================
+
+The usual way to write output in C++ is using ``std::cout`` while in Python one
+would use ``print``. Since these methods use different buffers, mixing them can
+lead to output order issues. To resolve this, pybind11 modules can use the
+:func:`py::print` function which writes to Python's ``sys.stdout`` for consistency.
+
+Python's ``print`` function is replicated in the C++ API including optional
+keyword arguments ``sep``, ``end``, ``file``, ``flush``. Everything works as
+expected in Python:
+
+.. code-block:: cpp
+
+    py::print(1, 2.0, "three"); // 1 2.0 three
+    py::print(1, 2.0, "three", "sep"_a="-"); // 1-2.0-three
+
+    auto args = py::make_tuple("unpacked", true);
+    py::print("->", *args, "end"_a="<-"); // -> unpacked True <-
+
+.. _ostream_redirect:
+
+Capturing standard output from ostream
+======================================
+
+Often, a library will use the streams ``std::cout`` and ``std::cerr`` to print,
+but this does not play well with Python's standard ``sys.stdout`` and ``sys.stderr``
+redirection. Replacing a library's printing with `py::print <print>` may not
+be feasible. This can be fixed using a guard around the library function that
+redirects output to the corresponding Python streams:
+
+.. code-block:: cpp
+
+    #include <pybind11/iostream.h>
+
+    ...
+
+    // Add a scoped redirect for your noisy code
+    m.def("noisy_func", []() {
+        py::scoped_ostream_redirect stream(
+            std::cout,                               // std::ostream&
+            py::module::import("sys").attr("stdout") // Python output
+        );
+        call_noisy_func();
+    });
+
+This method respects flushes on the output streams and will flush if needed
+when the scoped guard is destroyed. This allows the output to be redirected in
+real time, such as to a Jupyter notebook. The two arguments, the C++ stream and
+the Python output, are optional, and default to standard output if not given. An
+extra type, `py::scoped_estream_redirect <scoped_estream_redirect>`, is identical
+except for defaulting to ``std::cerr`` and ``sys.stderr``; this can be useful with
+`py::call_guard`, which allows multiple items, but uses the default constructor:
+
+.. code-block:: py
+
+    // Alternative: Call single function using call guard
+    m.def("noisy_func", &call_noisy_function,
+          py::call_guard<py::scoped_ostream_redirect,
+                         py::scoped_estream_redirect>());
+
+The redirection can also be done in Python with the addition of a context
+manager, using the `py::add_ostream_redirect() <add_ostream_redirect>` function:
+
+.. code-block:: cpp
+
+    py::add_ostream_redirect(m, "ostream_redirect");
+
+The name in Python defaults to ``ostream_redirect`` if no name is passed.  This
+creates the following context manager in Python:
+
+.. code-block:: python
+
+    with ostream_redirect(stdout=True, stderr=True):
+        noisy_function()
+
+It defaults to redirecting both streams, though you can use the keyword
+arguments to disable one of the streams if needed.
+
+.. note::
+
+    The above methods will not redirect C-level output to file descriptors, such
+    as ``fprintf``. For those cases, you'll need to redirect the file
+    descriptors either directly in C or with Python's ``os.dup2`` function
+    in an operating-system dependent way.
+
+.. _eval:
+
+Evaluating Python expressions from strings and files
+====================================================
+
+pybind11 provides the `eval`, `exec` and `eval_file` functions to evaluate
+Python expressions and statements. The following example illustrates how they
+can be used.
+
+.. code-block:: cpp
+
+    // At beginning of file
+    #include <pybind11/eval.h>
+
+    ...
+
+    // Evaluate in scope of main module
+    py::object scope = py::module::import("__main__").attr("__dict__");
+
+    // Evaluate an isolated expression
+    int result = py::eval("my_variable + 10", scope).cast<int>();
+
+    // Evaluate a sequence of statements
+    py::exec(
+        "print('Hello')\n"
+        "print('world!');",
+        scope);
+
+    // Evaluate the statements in an separate Python file on disk
+    py::eval_file("script.py", scope);
+
+C++11 raw string literals are also supported and quite handy for this purpose.
+The only requirement is that the first statement must be on a new line following
+the raw string delimiter ``R"(``, ensuring all lines have common leading indent:
+
+.. code-block:: cpp
+
+    py::exec(R"(
+        x = get_answer()
+        if x == 42:
+            print('Hello World!')
+        else:
+            print('Bye!')
+        )", scope
+    );
+
+.. note::
+
+    `eval` and `eval_file` accept a template parameter that describes how the
+    string/file should be interpreted. Possible choices include ``eval_expr``
+    (isolated expression), ``eval_single_statement`` (a single statement, return
+    value is always ``none``), and ``eval_statements`` (sequence of statements,
+    return value is always ``none``). `eval` defaults to  ``eval_expr``,
+    `eval_file` defaults to ``eval_statements`` and `exec` is just a shortcut
+    for ``eval<eval_statements>``.
diff --git a/pybind11/docs/advanced/smart_ptrs.rst b/pybind11/docs/advanced/smart_ptrs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..da57748ca585a92000198a6c607a087704b1f07c
--- /dev/null
+++ b/pybind11/docs/advanced/smart_ptrs.rst
@@ -0,0 +1,173 @@
+Smart pointers
+##############
+
+std::unique_ptr
+===============
+
+Given a class ``Example`` with Python bindings, it's possible to return
+instances wrapped in C++11 unique pointers, like so
+
+.. code-block:: cpp
+
+    std::unique_ptr<Example> create_example() { return std::unique_ptr<Example>(new Example()); }
+
+.. code-block:: cpp
+
+    m.def("create_example", &create_example);
+
+In other words, there is nothing special that needs to be done. While returning
+unique pointers in this way is allowed, it is *illegal* to use them as function
+arguments. For instance, the following function signature cannot be processed
+by pybind11.
+
+.. code-block:: cpp
+
+    void do_something_with_example(std::unique_ptr<Example> ex) { ... }
+
+The above signature would imply that Python needs to give up ownership of an
+object that is passed to this function, which is generally not possible (for
+instance, the object might be referenced elsewhere).
+
+std::shared_ptr
+===============
+
+The binding generator for classes, :class:`class_`, can be passed a template
+type that denotes a special *holder* type that is used to manage references to
+the object.  If no such holder type template argument is given, the default for
+a type named ``Type`` is ``std::unique_ptr<Type>``, which means that the object
+is deallocated when Python's reference count goes to zero.
+
+It is possible to switch to other types of reference counting wrappers or smart
+pointers, which is useful in codebases that rely on them. For instance, the
+following snippet causes ``std::shared_ptr`` to be used instead.
+
+.. code-block:: cpp
+
+    py::class_<Example, std::shared_ptr<Example> /* <- holder type */> obj(m, "Example");
+
+Note that any particular class can only be associated with a single holder type.
+
+One potential stumbling block when using holder types is that they need to be
+applied consistently. Can you guess what's broken about the following binding
+code?
+
+.. code-block:: cpp
+
+    class Child { };
+
+    class Parent {
+    public:
+       Parent() : child(std::make_shared<Child>()) { }
+       Child *get_child() { return child.get(); }  /* Hint: ** DON'T DO THIS ** */
+    private:
+        std::shared_ptr<Child> child;
+    };
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Child, std::shared_ptr<Child>>(m, "Child");
+
+        py::class_<Parent, std::shared_ptr<Parent>>(m, "Parent")
+           .def(py::init<>())
+           .def("get_child", &Parent::get_child);
+    }
+
+The following Python code will cause undefined behavior (and likely a
+segmentation fault).
+
+.. code-block:: python
+
+   from example import Parent
+   print(Parent().get_child())
+
+The problem is that ``Parent::get_child()`` returns a pointer to an instance of
+``Child``, but the fact that this instance is already managed by
+``std::shared_ptr<...>`` is lost when passing raw pointers. In this case,
+pybind11 will create a second independent ``std::shared_ptr<...>`` that also
+claims ownership of the pointer. In the end, the object will be freed **twice**
+since these shared pointers have no way of knowing about each other.
+
+There are two ways to resolve this issue:
+
+1. For types that are managed by a smart pointer class, never use raw pointers
+   in function arguments or return values. In other words: always consistently
+   wrap pointers into their designated holder types (such as
+   ``std::shared_ptr<...>``). In this case, the signature of ``get_child()``
+   should be modified as follows:
+
+.. code-block:: cpp
+
+    std::shared_ptr<Child> get_child() { return child; }
+
+2. Adjust the definition of ``Child`` by specifying
+   ``std::enable_shared_from_this<T>`` (see cppreference_ for details) as a
+   base class. This adds a small bit of information to ``Child`` that allows
+   pybind11 to realize that there is already an existing
+   ``std::shared_ptr<...>`` and communicate with it. In this case, the
+   declaration of ``Child`` should look as follows:
+
+.. _cppreference: http://en.cppreference.com/w/cpp/memory/enable_shared_from_this
+
+.. code-block:: cpp
+
+    class Child : public std::enable_shared_from_this<Child> { };
+
+.. _smart_pointers:
+
+Custom smart pointers
+=====================
+
+pybind11 supports ``std::unique_ptr`` and ``std::shared_ptr`` right out of the
+box. For any other custom smart pointer, transparent conversions can be enabled
+using a macro invocation similar to the following. It must be declared at the
+top namespace level before any binding code:
+
+.. code-block:: cpp
+
+    PYBIND11_DECLARE_HOLDER_TYPE(T, SmartPtr<T>);
+
+The first argument of :func:`PYBIND11_DECLARE_HOLDER_TYPE` should be a
+placeholder name that is used as a template parameter of the second argument.
+Thus, feel free to use any identifier, but use it consistently on both sides;
+also, don't use the name of a type that already exists in your codebase.
+
+The macro also accepts a third optional boolean parameter that is set to false
+by default. Specify
+
+.. code-block:: cpp
+
+    PYBIND11_DECLARE_HOLDER_TYPE(T, SmartPtr<T>, true);
+
+if ``SmartPtr<T>`` can always be initialized from a ``T*`` pointer without the
+risk of inconsistencies (such as multiple independent ``SmartPtr`` instances
+believing that they are the sole owner of the ``T*`` pointer). A common
+situation where ``true`` should be passed is when the ``T`` instances use
+*intrusive* reference counting.
+
+Please take a look at the :ref:`macro_notes` before using this feature.
+
+By default, pybind11 assumes that your custom smart pointer has a standard
+interface, i.e. provides a ``.get()`` member function to access the underlying
+raw pointer. If this is not the case, pybind11's ``holder_helper`` must be
+specialized:
+
+.. code-block:: cpp
+
+    // Always needed for custom holder types
+    PYBIND11_DECLARE_HOLDER_TYPE(T, SmartPtr<T>);
+
+    // Only needed if the type's `.get()` goes by another name
+    namespace pybind11 { namespace detail {
+        template <typename T>
+        struct holder_helper<SmartPtr<T>> { // <-- specialization
+            static const T *get(const SmartPtr<T> &p) { return p.getPointer(); }
+        };
+    }}
+
+The above specialization informs pybind11 that the custom ``SmartPtr`` class
+provides ``.get()`` functionality via ``.getPointer()``.
+
+.. seealso::
+
+    The file :file:`tests/test_smart_ptr.cpp` contains a complete example
+    that demonstrates how to work with custom reference-counting holder types
+    in more detail.
diff --git a/pybind11/docs/basics.rst b/pybind11/docs/basics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6bb5f98222f8ee7986b386e2af3ee4b1cb98940d
--- /dev/null
+++ b/pybind11/docs/basics.rst
@@ -0,0 +1,301 @@
+.. _basics:
+
+First steps
+###########
+
+This sections demonstrates the basic features of pybind11. Before getting
+started, make sure that development environment is set up to compile the
+included set of test cases.
+
+
+Compiling the test cases
+========================
+
+Linux/MacOS
+-----------
+
+On Linux  you'll need to install the **python-dev** or **python3-dev** packages as
+well as **cmake**. On Mac OS, the included python version works out of the box,
+but **cmake** must still be installed.
+
+After installing the prerequisites, run
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake ..
+   make check -j 4
+
+The last line will both compile and run the tests.
+
+Windows
+-------
+
+On Windows, only **Visual Studio 2015** and newer are supported since pybind11 relies
+on various C++11 language features that break older versions of Visual Studio.
+
+.. Note::
+
+    To use the C++17 in Visual Studio 2017 (MSVC 14.1), pybind11 requires the flag
+    ``/permissive-`` to be passed to the compiler `to enforce standard conformance`_. When
+    building with Visual Studio 2019, this is not strictly necessary, but still adviced.
+
+..  _`to enforce standard conformance`: https://docs.microsoft.com/en-us/cpp/build/reference/permissive-standards-conformance?view=vs-2017
+
+To compile and run the tests:
+
+.. code-block:: batch
+
+   mkdir build
+   cd build
+   cmake ..
+   cmake --build . --config Release --target check
+
+This will create a Visual Studio project, compile and run the target, all from the
+command line.
+
+.. Note::
+
+    If all tests fail, make sure that the Python binary and the testcases are compiled
+    for the same processor type and bitness (i.e. either **i386** or **x86_64**). You
+    can specify **x86_64** as the target architecture for the generated Visual Studio
+    project using ``cmake -A x64 ..``.
+
+.. seealso::
+
+    Advanced users who are already familiar with Boost.Python may want to skip
+    the tutorial and look at the test cases in the :file:`tests` directory,
+    which exercise all features of pybind11.
+
+Header and namespace conventions
+================================
+
+For brevity, all code examples assume that the following two lines are present:
+
+.. code-block:: cpp
+
+    #include <pybind11/pybind11.h>
+
+    namespace py = pybind11;
+
+Some features may require additional headers, but those will be specified as needed.
+
+.. _simple_example:
+
+Creating bindings for a simple function
+=======================================
+
+Let's start by creating Python bindings for an extremely simple function, which
+adds two numbers and returns their result:
+
+.. code-block:: cpp
+
+    int add(int i, int j) {
+        return i + j;
+    }
+
+For simplicity [#f1]_, we'll put both this function and the binding code into
+a file named :file:`example.cpp` with the following contents:
+
+.. code-block:: cpp
+
+    #include <pybind11/pybind11.h>
+
+    int add(int i, int j) {
+        return i + j;
+    }
+
+    PYBIND11_MODULE(example, m) {
+        m.doc() = "pybind11 example plugin"; // optional module docstring
+
+        m.def("add", &add, "A function which adds two numbers");
+    }
+
+.. [#f1] In practice, implementation and binding code will generally be located
+         in separate files.
+
+The :func:`PYBIND11_MODULE` macro creates a function that will be called when an
+``import`` statement is issued from within Python. The module name (``example``)
+is given as the first macro argument (it should not be in quotes). The second
+argument (``m``) defines a variable of type :class:`py::module <module>` which
+is the main interface for creating bindings. The method :func:`module::def`
+generates binding code that exposes the ``add()`` function to Python.
+
+.. note::
+
+    Notice how little code was needed to expose our function to Python: all
+    details regarding the function's parameters and return value were
+    automatically inferred using template metaprogramming. This overall
+    approach and the used syntax are borrowed from Boost.Python, though the
+    underlying implementation is very different.
+
+pybind11 is a header-only library, hence it is not necessary to link against
+any special libraries and there are no intermediate (magic) translation steps.
+On Linux, the above example can be compiled using the following command:
+
+.. code-block:: bash
+
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+For more details on the required compiler flags on Linux and MacOS, see
+:ref:`building_manually`. For complete cross-platform compilation instructions,
+refer to the :ref:`compiling` page.
+
+The `python_example`_ and `cmake_example`_ repositories are also a good place
+to start. They are both complete project examples with cross-platform build
+systems. The only difference between the two is that `python_example`_ uses
+Python's ``setuptools`` to build the module, while `cmake_example`_ uses CMake
+(which may be preferable for existing C++ projects).
+
+.. _python_example: https://github.com/pybind/python_example
+.. _cmake_example: https://github.com/pybind/cmake_example
+
+Building the above C++ code will produce a binary module file that can be
+imported to Python. Assuming that the compiled module is located in the
+current directory, the following interactive Python session shows how to
+load and execute the example:
+
+.. code-block:: pycon
+
+    $ python
+    Python 2.7.10 (default, Aug 22 2015, 20:33:39)
+    [GCC 4.2.1 Compatible Apple LLVM 7.0.0 (clang-700.0.59.1)] on darwin
+    Type "help", "copyright", "credits" or "license" for more information.
+    >>> import example
+    >>> example.add(1, 2)
+    3L
+    >>>
+
+.. _keyword_args:
+
+Keyword arguments
+=================
+
+With a simple code modification, it is possible to inform Python about the
+names of the arguments ("i" and "j" in this case).
+
+.. code-block:: cpp
+
+    m.def("add", &add, "A function which adds two numbers",
+          py::arg("i"), py::arg("j"));
+
+:class:`arg` is one of several special tag classes which can be used to pass
+metadata into :func:`module::def`. With this modified binding code, we can now
+call the function using keyword arguments, which is a more readable alternative
+particularly for functions taking many parameters:
+
+.. code-block:: pycon
+
+    >>> import example
+    >>> example.add(i=1, j=2)
+    3L
+
+The keyword names also appear in the function signatures within the documentation.
+
+.. code-block:: pycon
+
+    >>> help(example)
+
+    ....
+
+    FUNCTIONS
+        add(...)
+            Signature : (i: int, j: int) -> int
+
+            A function which adds two numbers
+
+A shorter notation for named arguments is also available:
+
+.. code-block:: cpp
+
+    // regular notation
+    m.def("add1", &add, py::arg("i"), py::arg("j"));
+    // shorthand
+    using namespace pybind11::literals;
+    m.def("add2", &add, "i"_a, "j"_a);
+
+The :var:`_a` suffix forms a C++11 literal which is equivalent to :class:`arg`.
+Note that the literal operator must first be made visible with the directive
+``using namespace pybind11::literals``. This does not bring in anything else
+from the ``pybind11`` namespace except for literals.
+
+.. _default_args:
+
+Default arguments
+=================
+
+Suppose now that the function to be bound has default arguments, e.g.:
+
+.. code-block:: cpp
+
+    int add(int i = 1, int j = 2) {
+        return i + j;
+    }
+
+Unfortunately, pybind11 cannot automatically extract these parameters, since they
+are not part of the function's type information. However, they are simple to specify
+using an extension of :class:`arg`:
+
+.. code-block:: cpp
+
+    m.def("add", &add, "A function which adds two numbers",
+          py::arg("i") = 1, py::arg("j") = 2);
+
+The default values also appear within the documentation.
+
+.. code-block:: pycon
+
+    >>> help(example)
+
+    ....
+
+    FUNCTIONS
+        add(...)
+            Signature : (i: int = 1, j: int = 2) -> int
+
+            A function which adds two numbers
+
+The shorthand notation is also available for default arguments:
+
+.. code-block:: cpp
+
+    // regular notation
+    m.def("add1", &add, py::arg("i") = 1, py::arg("j") = 2);
+    // shorthand
+    m.def("add2", &add, "i"_a=1, "j"_a=2);
+
+Exporting variables
+===================
+
+To expose a value from C++, use the ``attr`` function to register it in a
+module as shown below. Built-in types and general objects (more on that later)
+are automatically converted when assigned as attributes, and can be explicitly
+converted using the function ``py::cast``.
+
+.. code-block:: cpp
+
+    PYBIND11_MODULE(example, m) {
+        m.attr("the_answer") = 42;
+        py::object world = py::cast("World");
+        m.attr("what") = world;
+    }
+
+These are then accessible from Python:
+
+.. code-block:: pycon
+
+    >>> import example
+    >>> example.the_answer
+    42
+    >>> example.what
+    'World'
+
+.. _supported_types:
+
+Supported data types
+====================
+
+A large number of data types are supported out of the box and can be used
+seamlessly as functions arguments, return values or with ``py::cast`` in general.
+For a full overview, see the :doc:`advanced/cast/index` section.
diff --git a/pybind11/docs/benchmark.py b/pybind11/docs/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..023477212ee3ca34353067b196e9959144444f33
--- /dev/null
+++ b/pybind11/docs/benchmark.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+import random
+import os
+import time
+import datetime as dt
+
+nfns = 4  # Functions per class
+nargs = 4  # Arguments per function
+
+
+def generate_dummy_code_pybind11(nclasses=10):
+    decl = ""
+    bindings = ""
+
+    for cl in range(nclasses):
+        decl += "class cl%03i;\n" % cl
+    decl += '\n'
+
+    for cl in range(nclasses):
+        decl += "class cl%03i {\n" % cl
+        decl += "public:\n"
+        bindings += '    py::class_<cl%03i>(m, "cl%03i")\n' % (cl, cl)
+        for fn in range(nfns):
+            ret = random.randint(0, nclasses - 1)
+            params  = [random.randint(0, nclasses - 1) for i in range(nargs)]
+            decl += "    cl%03i *fn_%03i(" % (ret, fn)
+            decl += ", ".join("cl%03i *" % p for p in params)
+            decl += ");\n"
+            bindings += '        .def("fn_%03i", &cl%03i::fn_%03i)\n' % \
+                (fn, cl, fn)
+        decl += "};\n\n"
+        bindings += '        ;\n'
+
+    result = "#include <pybind11/pybind11.h>\n\n"
+    result += "namespace py = pybind11;\n\n"
+    result += decl + '\n'
+    result += "PYBIND11_MODULE(example, m) {\n"
+    result += bindings
+    result += "}"
+    return result
+
+
+def generate_dummy_code_boost(nclasses=10):
+    decl = ""
+    bindings = ""
+
+    for cl in range(nclasses):
+        decl += "class cl%03i;\n" % cl
+    decl += '\n'
+
+    for cl in range(nclasses):
+        decl += "class cl%03i {\n" % cl
+        decl += "public:\n"
+        bindings += '    py::class_<cl%03i>("cl%03i")\n' % (cl, cl)
+        for fn in range(nfns):
+            ret = random.randint(0, nclasses - 1)
+            params  = [random.randint(0, nclasses - 1) for i in range(nargs)]
+            decl += "    cl%03i *fn_%03i(" % (ret, fn)
+            decl += ", ".join("cl%03i *" % p for p in params)
+            decl += ");\n"
+            bindings += '        .def("fn_%03i", &cl%03i::fn_%03i, py::return_value_policy<py::manage_new_object>())\n' % \
+                (fn, cl, fn)
+        decl += "};\n\n"
+        bindings += '        ;\n'
+
+    result = "#include <boost/python.hpp>\n\n"
+    result += "namespace py = boost::python;\n\n"
+    result += decl + '\n'
+    result += "BOOST_PYTHON_MODULE(example) {\n"
+    result += bindings
+    result += "}"
+    return result
+
+
+for codegen in [generate_dummy_code_pybind11, generate_dummy_code_boost]:
+    print ("{")
+    for i in range(0, 10):
+        nclasses = 2 ** i
+        with open("test.cpp", "w") as f:
+            f.write(codegen(nclasses))
+        n1 = dt.datetime.now()
+        os.system("g++ -Os -shared -rdynamic -undefined dynamic_lookup "
+            "-fvisibility=hidden -std=c++14 test.cpp -I include "
+            "-I /System/Library/Frameworks/Python.framework/Headers -o test.so")
+        n2 = dt.datetime.now()
+        elapsed = (n2 - n1).total_seconds()
+        size = os.stat('test.so').st_size
+        print("   {%i, %f, %i}," % (nclasses * nfns, elapsed, size))
+    print ("}")
diff --git a/pybind11/docs/benchmark.rst b/pybind11/docs/benchmark.rst
new file mode 100644
index 0000000000000000000000000000000000000000..02c2ccde7dc00db1e57b73a7523521f9c39a5639
--- /dev/null
+++ b/pybind11/docs/benchmark.rst
@@ -0,0 +1,95 @@
+Benchmark
+=========
+
+The following is the result of a synthetic benchmark comparing both compilation
+time and module size of pybind11 against Boost.Python. A detailed report about a
+Boost.Python to pybind11 conversion of a real project is available here: [#f1]_.
+
+.. [#f1] http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
+
+Setup
+-----
+
+A python script (see the ``docs/benchmark.py`` file) was used to generate a set
+of files with dummy classes whose count increases for each successive benchmark
+(between 1 and 2048 classes in powers of two). Each class has four methods with
+a randomly generated signature with a return value and four arguments. (There
+was no particular reason for this setup other than the desire to generate many
+unique function signatures whose count could be controlled in a simple way.)
+
+Here is an example of the binding code for one class:
+
+.. code-block:: cpp
+
+    ...
+    class cl034 {
+    public:
+        cl279 *fn_000(cl084 *, cl057 *, cl065 *, cl042 *);
+        cl025 *fn_001(cl098 *, cl262 *, cl414 *, cl121 *);
+        cl085 *fn_002(cl445 *, cl297 *, cl145 *, cl421 *);
+        cl470 *fn_003(cl200 *, cl323 *, cl332 *, cl492 *);
+    };
+    ...
+
+    PYBIND11_MODULE(example, m) {
+        ...
+        py::class_<cl034>(m, "cl034")
+            .def("fn_000", &cl034::fn_000)
+            .def("fn_001", &cl034::fn_001)
+            .def("fn_002", &cl034::fn_002)
+            .def("fn_003", &cl034::fn_003)
+        ...
+    }
+
+The Boost.Python version looks almost identical except that a return value
+policy had to be specified as an argument to ``def()``. For both libraries,
+compilation was done with
+
+.. code-block:: bash
+
+    Apple LLVM version 7.0.2 (clang-700.1.81)
+
+and the following compilation flags
+
+.. code-block:: bash
+
+    g++ -Os -shared -rdynamic -undefined dynamic_lookup -fvisibility=hidden -std=c++14
+
+Compilation time
+----------------
+
+The following log-log plot shows how the compilation time grows for an
+increasing number of class and function declarations. pybind11 includes many
+fewer headers, which initially leads to shorter compilation times, but the
+performance is ultimately fairly similar (pybind11 is 19.8 seconds faster for
+the largest largest file with 2048 classes and a total of 8192 methods -- a
+modest **1.2x** speedup relative to Boost.Python, which required 116.35
+seconds).
+
+.. only:: not latex
+
+    .. image:: pybind11_vs_boost_python1.svg
+
+.. only:: latex
+
+    .. image:: pybind11_vs_boost_python1.png
+
+Module size
+-----------
+
+Differences between the two libraries become much more pronounced when
+considering the file size of the generated Python plugin: for the largest file,
+the binary generated by Boost.Python required 16.8 MiB, which was **2.17
+times** / **9.1 megabytes** larger than the output generated by pybind11. For
+very small inputs, Boost.Python has an edge in the plot below -- however, note
+that it stores many definitions in an external library, whose size was not
+included here, hence the comparison is slightly shifted in Boost.Python's
+favor.
+
+.. only:: not latex
+
+    .. image:: pybind11_vs_boost_python2.svg
+
+.. only:: latex
+
+    .. image:: pybind11_vs_boost_python2.png
diff --git a/pybind11/docs/changelog.rst b/pybind11/docs/changelog.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0e15621d2811054dfcc0e9ffcb5bd5ad38e40dae
--- /dev/null
+++ b/pybind11/docs/changelog.rst
@@ -0,0 +1,1361 @@
+.. _changelog:
+
+Changelog
+#########
+
+Starting with version 1.8.0, pybind11 releases use a `semantic versioning
+<http://semver.org>`_ policy.
+
+v2.6.0 (IN PROGRESS)
+--------------------
+
+See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
+
+* Keyword only argument supported in Python 2 or 3 with ``py::kwonly()``.
+  `#2100 <https://github.com/pybind/pybind11/pull/2100>`_
+
+* Perfect forwarding support for methods.
+  `#2048 <https://github.com/pybind/pybind11/pull/2048>`_
+
+* Added ``py::error_already_set::discard_as_unraisable()``.
+  `#2372 <https://github.com/pybind/pybind11/pull/2372>`_
+
+* ``py::hash`` is now public.
+  `#2217 <https://github.com/pybind/pybind11/pull/2217>`_
+
+* ``py::is_final()`` class modifier to block subclassing (CPython only).
+  `#2151 <https://github.com/pybind/pybind11/pull/2151>`_
+
+* ``py::memoryview``  update and documentation.
+  `#2223 <https://github.com/pybind/pybind11/pull/2223>`_
+
+* Minimum CMake required increased to 3.4.
+  `#2338 <https://github.com/pybind/pybind11/pull/2338>`_ and
+  `#2370 <https://github.com/pybind/pybind11/pull/2370>`_
+
+  * Full integration with CMake’s C++ standard system replaces
+    ``PYBIND11_CPP_STANDARD``.
+
+  * Generated config file is now portable to different Python/compiler/CMake
+    versions.
+
+  * Virtual environments prioritized if ``PYTHON_EXECUTABLE`` is not set
+    (``venv``, ``virtualenv``, and ``conda``) (similar to the new FindPython
+    mode).
+
+  * Other CMake features now natively supported, like
+    ``CMAKE_INTERPROCEDURAL_OPTIMIZATION``, ``set(CMAKE_CXX_VISIBILITY_PRESET
+    hidden)``.
+
+* Optional :ref:`find-python-mode` and :ref:`nopython-mode` with CMake.
+  `#2370 <https://github.com/pybind/pybind11/pull/2370>`_
+
+* Uninstall target added.
+  `#2265 <https://github.com/pybind/pybind11/pull/2265>`_ and
+  `#2346 <https://github.com/pybind/pybind11/pull/2346>`_
+
+Smaller or developer focused features:
+
+* Error now thrown when ``__init__`` is forgotten on subclasses.
+  `#2152 <https://github.com/pybind/pybind11/pull/2152>`_
+
+* If ``__eq__`` defined but not ``__hash__``, ``__hash__`` is now set to
+  ``None``.
+  `#2291 <https://github.com/pybind/pybind11/pull/2291>`_
+
+* ``py::ellipsis`` now also works on Python 2
+  `#2360 <https://github.com/pybind/pybind11/pull/2360>`_
+
+* Added missing signature for ``py::array``
+  `#2363 <https://github.com/pybind/pybind11/pull/2363>`_
+
+* Bugfixes related to more extensive testing
+  `#2321 <https://github.com/pybind/pybind11/pull/2321>`_
+
+* Pointer to ``std::tuple`` & ``std::pair`` supported in cast.
+  `#2334 <https://github.com/pybind/pybind11/pull/2334>`_
+
+* Small fixes in NumPy support. ``py::array`` now uses ``py::ssize_t`` as first
+  argument type.
+  `#2293 <https://github.com/pybind/pybind11/pull/2293>`_
+
+* PyPy fixes, including support for PyPy3 and PyPy 7.
+  `#2146 <https://github.com/pybind/pybind11/pull/2146>`_
+
+* CPython 3.9 fixes.
+  `#2253 <https://github.com/pybind/pybind11/pull/2253>`_
+
+* Debug Python interpreter support.
+  `#2025 <https://github.com/pybind/pybind11/pull/2025>`_
+
+
+
+v2.5.0 (Mar 31, 2020)
+-----------------------------------------------------
+
+* Use C++17 fold expressions in type casters, if available. This can
+  improve performance during overload resolution when functions have
+  multiple arguments.
+  `#2043 <https://github.com/pybind/pybind11/pull/2043>`_.
+
+* Changed include directory resolution in ``pybind11/__init__.py``
+  and installation in ``setup.py``. This fixes a number of open issues
+  where pybind11 headers could not be found in certain environments.
+  `#1995 <https://github.com/pybind/pybind11/pull/1995>`_.
+
+* C++20 ``char8_t`` and ``u8string`` support. `#2026
+  <https://github.com/pybind/pybind11/pull/2026>`_.
+
+* CMake: search for Python 3.9. `bb9c91
+  <https://github.com/pybind/pybind11/commit/bb9c91>`_.
+
+* Fixes for MSYS-based build environments.
+  `#2087 <https://github.com/pybind/pybind11/pull/2087>`_,
+  `#2053 <https://github.com/pybind/pybind11/pull/2053>`_.
+
+* STL bindings for ``std::vector<...>::clear``. `#2074
+  <https://github.com/pybind/pybind11/pull/2074>`_.
+
+* Read-only flag for ``py::buffer``. `#1466
+  <https://github.com/pybind/pybind11/pull/1466>`_.
+
+* Exception handling during module initialization.
+  `bf2b031 <https://github.com/pybind/pybind11/commit/bf2b031>`_.
+
+* Support linking against a CPython debug build.
+  `#2025 <https://github.com/pybind/pybind11/pull/2025>`_.
+
+* Fixed issues involving the availability and use of aligned ``new`` and
+  ``delete``. `#1988 <https://github.com/pybind/pybind11/pull/1988>`_,
+  `759221 <https://github.com/pybind/pybind11/commit/759221>`_.
+
+* Fixed a resource leak upon interpreter shutdown.
+  `#2020 <https://github.com/pybind/pybind11/pull/2020>`_.
+
+* Fixed error handling in the boolean caster.
+  `#1976 <https://github.com/pybind/pybind11/pull/1976>`_.
+
+v2.4.3 (Oct 15, 2019)
+-----------------------------------------------------
+
+* Adapt pybind11 to a C API convention change in Python 3.8. `#1950
+  <https://github.com/pybind/pybind11/pull/1950>`_.
+
+v2.4.2 (Sep 21, 2019)
+-----------------------------------------------------
+
+* Replaced usage of a C++14 only construct. `#1929
+  <https://github.com/pybind/pybind11/pull/1929>`_.
+
+* Made an ifdef future-proof for Python >= 4. `f3109d
+  <https://github.com/pybind/pybind11/commit/f3109d>`_.
+
+v2.4.1 (Sep 20, 2019)
+-----------------------------------------------------
+
+* Fixed a problem involving implicit conversion from enumerations to integers
+  on Python 3.8. `#1780 <https://github.com/pybind/pybind11/pull/1780>`_.
+
+v2.4.0 (Sep 19, 2019)
+-----------------------------------------------------
+
+* Try harder to keep pybind11-internal data structures separate when there
+  are potential ABI incompatibilities. Fixes crashes that occurred when loading
+  multiple pybind11 extensions that were e.g. compiled by GCC (libstdc++)
+  and Clang (libc++).
+  `#1588 <https://github.com/pybind/pybind11/pull/1588>`_ and
+  `c9f5a <https://github.com/pybind/pybind11/commit/c9f5a>`_.
+
+* Added support for ``__await__``, ``__aiter__``, and ``__anext__`` protocols.
+  `#1842 <https://github.com/pybind/pybind11/pull/1842>`_.
+
+* ``pybind11_add_module()``: don't strip symbols when compiling in
+  ``RelWithDebInfo`` mode. `#1980
+  <https://github.com/pybind/pybind11/pull/1980>`_.
+
+* ``enum_``: Reproduce Python behavior when comparing against invalid values
+  (e.g. ``None``, strings, etc.). Add back support for ``__invert__()``.
+  `#1912 <https://github.com/pybind/pybind11/pull/1912>`_,
+  `#1907 <https://github.com/pybind/pybind11/pull/1907>`_.
+
+* List insertion operation for ``py::list``.
+  Added ``.empty()`` to all collection types.
+  Added ``py::set::contains()`` and ``py::dict::contains()``.
+  `#1887 <https://github.com/pybind/pybind11/pull/1887>`_,
+  `#1884 <https://github.com/pybind/pybind11/pull/1884>`_,
+  `#1888 <https://github.com/pybind/pybind11/pull/1888>`_.
+
+* ``py::details::overload_cast_impl`` is available in C++11 mode, can be used
+  like ``overload_cast`` with an additional set of parantheses.
+  `#1581 <https://github.com/pybind/pybind11/pull/1581>`_.
+
+* Fixed ``get_include()`` on Conda.
+  `#1877 <https://github.com/pybind/pybind11/pull/1877>`_.
+
+* ``stl_bind.h``: negative indexing support.
+  `#1882 <https://github.com/pybind/pybind11/pull/1882>`_.
+
+* Minor CMake fix to add MinGW compatibility.
+  `#1851 <https://github.com/pybind/pybind11/pull/1851>`_.
+
+* GIL-related fixes.
+  `#1836 <https://github.com/pybind/pybind11/pull/1836>`_,
+  `8b90b <https://github.com/pybind/pybind11/commit/8b90b>`_.
+
+* Other very minor/subtle fixes and improvements.
+  `#1329 <https://github.com/pybind/pybind11/pull/1329>`_,
+  `#1910 <https://github.com/pybind/pybind11/pull/1910>`_,
+  `#1863 <https://github.com/pybind/pybind11/pull/1863>`_,
+  `#1847 <https://github.com/pybind/pybind11/pull/1847>`_,
+  `#1890 <https://github.com/pybind/pybind11/pull/1890>`_,
+  `#1860 <https://github.com/pybind/pybind11/pull/1860>`_,
+  `#1848 <https://github.com/pybind/pybind11/pull/1848>`_,
+  `#1821 <https://github.com/pybind/pybind11/pull/1821>`_,
+  `#1837 <https://github.com/pybind/pybind11/pull/1837>`_,
+  `#1833 <https://github.com/pybind/pybind11/pull/1833>`_,
+  `#1748 <https://github.com/pybind/pybind11/pull/1748>`_,
+  `#1852 <https://github.com/pybind/pybind11/pull/1852>`_.
+
+v2.3.0 (June 11, 2019)
+-----------------------------------------------------
+
+* Significantly reduced module binary size (10-20%) when compiled in C++11 mode
+  with GCC/Clang, or in any mode with MSVC. Function signatures are now always
+  precomputed at compile time (this was previously only available in C++14 mode
+  for non-MSVC compilers).
+  `#934 <https://github.com/pybind/pybind11/pull/934>`_.
+
+* Add basic support for tag-based static polymorphism, where classes
+  provide a method to returns the desired type of an instance.
+  `#1326 <https://github.com/pybind/pybind11/pull/1326>`_.
+
+* Python type wrappers (``py::handle``, ``py::object``, etc.)
+  now support map Python's number protocol onto C++ arithmetic
+  operators such as ``operator+``, ``operator/=``, etc.
+  `#1511 <https://github.com/pybind/pybind11/pull/1511>`_.
+
+* A number of improvements related to enumerations:
+
+   1. The ``enum_`` implementation was rewritten from scratch to reduce
+      code bloat. Rather than instantiating a full implementation for each
+      enumeration, most code is now contained in a generic base class.
+      `#1511 <https://github.com/pybind/pybind11/pull/1511>`_.
+
+   2. The ``value()``  method of ``py::enum_`` now accepts an optional
+      docstring that will be shown in the documentation of the associated
+      enumeration. `#1160 <https://github.com/pybind/pybind11/pull/1160>`_.
+
+   3. check for already existing enum value and throw an error if present.
+      `#1453 <https://github.com/pybind/pybind11/pull/1453>`_.
+
+* Support for over-aligned type allocation via C++17's aligned ``new``
+  statement. `#1582 <https://github.com/pybind/pybind11/pull/1582>`_.
+
+* Added ``py::ellipsis()`` method for slicing of multidimensional NumPy arrays
+  `#1502 <https://github.com/pybind/pybind11/pull/1502>`_.
+
+* Numerous Improvements to the ``mkdoc.py`` script for extracting documentation
+  from C++ header files.
+  `#1788 <https://github.com/pybind/pybind11/pull/1788>`_.
+
+* ``pybind11_add_module()``: allow including Python as a ``SYSTEM`` include path.
+  `#1416 <https://github.com/pybind/pybind11/pull/1416>`_.
+
+* ``pybind11/stl.h`` does not convert strings to ``vector<string>`` anymore.
+  `#1258 <https://github.com/pybind/pybind11/issues/1258>`_.
+
+* Mark static methods as such to fix auto-generated Sphinx documentation.
+  `#1732 <https://github.com/pybind/pybind11/pull/1732>`_.
+
+* Re-throw forced unwind exceptions (e.g. during pthread termination).
+  `#1208 <https://github.com/pybind/pybind11/pull/1208>`_.
+
+* Added ``__contains__`` method to the bindings of maps (``std::map``,
+  ``std::unordered_map``).
+  `#1767 <https://github.com/pybind/pybind11/pull/1767>`_.
+
+* Improvements to ``gil_scoped_acquire``.
+  `#1211 <https://github.com/pybind/pybind11/pull/1211>`_.
+
+* Type caster support for ``std::deque<T>``.
+  `#1609 <https://github.com/pybind/pybind11/pull/1609>`_.
+
+* Support for ``std::unique_ptr`` holders, whose deleters differ between a base and derived
+  class. `#1353 <https://github.com/pybind/pybind11/pull/1353>`_.
+
+* Construction of STL array/vector-like data structures from
+  iterators. Added an ``extend()`` operation.
+  `#1709 <https://github.com/pybind/pybind11/pull/1709>`_,
+
+* CMake build system improvements for projects that include non-C++
+  files (e.g. plain C, CUDA) in ``pybind11_add_module`` et al.
+  `#1678 <https://github.com/pybind/pybind11/pull/1678>`_.
+
+* Fixed asynchronous invocation and deallocation of Python functions
+  wrapped in ``std::function``.
+  `#1595 <https://github.com/pybind/pybind11/pull/1595>`_.
+
+* Fixes regarding return value policy propagation in STL type casters.
+  `#1603 <https://github.com/pybind/pybind11/pull/1603>`_.
+
+* Fixed scoped enum comparisons.
+  `#1571 <https://github.com/pybind/pybind11/pull/1571>`_.
+
+* Fixed iostream redirection for code that releases the GIL.
+  `#1368 <https://github.com/pybind/pybind11/pull/1368>`_,
+
+* A number of CI-related fixes.
+  `#1757 <https://github.com/pybind/pybind11/pull/1757>`_,
+  `#1744 <https://github.com/pybind/pybind11/pull/1744>`_,
+  `#1670 <https://github.com/pybind/pybind11/pull/1670>`_.
+
+v2.2.4 (September 11, 2018)
+-----------------------------------------------------
+
+* Use new Python 3.7 Thread Specific Storage (TSS) implementation if available.
+  `#1454 <https://github.com/pybind/pybind11/pull/1454>`_,
+  `#1517 <https://github.com/pybind/pybind11/pull/1517>`_.
+
+* Fixes for newer MSVC versions and C++17 mode.
+  `#1347 <https://github.com/pybind/pybind11/pull/1347>`_,
+  `#1462 <https://github.com/pybind/pybind11/pull/1462>`_.
+
+* Propagate return value policies to type-specific casters
+  when casting STL containers.
+  `#1455 <https://github.com/pybind/pybind11/pull/1455>`_.
+
+* Allow ostream-redirection of more than 1024 characters.
+  `#1479 <https://github.com/pybind/pybind11/pull/1479>`_.
+
+* Set ``Py_DEBUG`` define when compiling against a debug Python build.
+  `#1438 <https://github.com/pybind/pybind11/pull/1438>`_.
+
+* Untangle integer logic in number type caster to work for custom
+  types that may only be castable to a restricted set of builtin types.
+  `#1442 <https://github.com/pybind/pybind11/pull/1442>`_.
+
+* CMake build system: Remember Python version in cache file.
+  `#1434 <https://github.com/pybind/pybind11/pull/1434>`_.
+
+* Fix for custom smart pointers: use ``std::addressof`` to obtain holder
+  address instead of ``operator&``.
+  `#1435 <https://github.com/pybind/pybind11/pull/1435>`_.
+
+* Properly report exceptions thrown during module initialization.
+  `#1362 <https://github.com/pybind/pybind11/pull/1362>`_.
+
+* Fixed a segmentation fault when creating empty-shaped NumPy array.
+  `#1371 <https://github.com/pybind/pybind11/pull/1371>`_.
+
+* The version of Intel C++ compiler must be >= 2017, and this is now checked by
+  the header files. `#1363 <https://github.com/pybind/pybind11/pull/1363>`_.
+
+* A few minor typo fixes and improvements to the test suite, and
+  patches that silence compiler warnings.
+
+* Vectors now support construction from generators, as well as ``extend()`` from a
+  list or generator.
+  `#1496 <https://github.com/pybind/pybind11/pull/1496>`_.
+
+
+v2.2.3 (April 29, 2018)
+-----------------------------------------------------
+
+* The pybind11 header location detection was replaced by a new implementation
+  that no longer depends on ``pip`` internals (the recently released ``pip``
+  10 has restricted access to this API).
+  `#1190 <https://github.com/pybind/pybind11/pull/1190>`_.
+
+* Small adjustment to an implementation detail to work around a compiler segmentation fault in Clang 3.3/3.4.
+  `#1350 <https://github.com/pybind/pybind11/pull/1350>`_.
+
+* The minimal supported version of the Intel compiler was >= 17.0 since
+  pybind11 v2.1. This check is now explicit, and a compile-time error is raised
+  if the compiler meet the requirement.
+  `#1363 <https://github.com/pybind/pybind11/pull/1363>`_.
+
+* Fixed an endianness-related fault in the test suite.
+  `#1287 <https://github.com/pybind/pybind11/pull/1287>`_.
+
+v2.2.2 (February 7, 2018)
+-----------------------------------------------------
+
+* Fixed a segfault when combining embedded interpreter
+  shutdown/reinitialization with external loaded pybind11 modules.
+  `#1092 <https://github.com/pybind/pybind11/pull/1092>`_.
+
+* Eigen support: fixed a bug where Nx1/1xN numpy inputs couldn't be passed as
+  arguments to Eigen vectors (which for Eigen are simply compile-time fixed
+  Nx1/1xN matrices).
+  `#1106 <https://github.com/pybind/pybind11/pull/1106>`_.
+
+* Clarified to license by moving the licensing of contributions from
+  ``LICENSE`` into ``CONTRIBUTING.md``: the licensing of contributions is not
+  actually part of the software license as distributed.  This isn't meant to be
+  a substantial change in the licensing of the project, but addresses concerns
+  that the clause made the license non-standard.
+  `#1109 <https://github.com/pybind/pybind11/issues/1109>`_.
+
+* Fixed a regression introduced in 2.1 that broke binding functions with lvalue
+  character literal arguments.
+  `#1128 <https://github.com/pybind/pybind11/pull/1128>`_.
+
+* MSVC: fix for compilation failures under /permissive-, and added the flag to
+  the appveyor test suite.
+  `#1155 <https://github.com/pybind/pybind11/pull/1155>`_.
+
+* Fixed ``__qualname__`` generation, and in turn, fixes how class names
+  (especially nested class names) are shown in generated docstrings.
+  `#1171 <https://github.com/pybind/pybind11/pull/1171>`_.
+
+* Updated the FAQ with a suggested project citation reference.
+  `#1189 <https://github.com/pybind/pybind11/pull/1189>`_.
+
+* Added fixes for deprecation warnings when compiled under C++17 with
+  ``-Wdeprecated`` turned on, and add ``-Wdeprecated`` to the test suite
+  compilation flags.
+  `#1191 <https://github.com/pybind/pybind11/pull/1191>`_.
+
+* Fixed outdated PyPI URLs in ``setup.py``.
+  `#1213 <https://github.com/pybind/pybind11/pull/1213>`_.
+
+* Fixed a refcount leak for arguments that end up in a ``py::args`` argument
+  for functions with both fixed positional and ``py::args`` arguments.
+  `#1216 <https://github.com/pybind/pybind11/pull/1216>`_.
+
+* Fixed a potential segfault resulting from possible premature destruction of
+  ``py::args``/``py::kwargs`` arguments with overloaded functions.
+  `#1223 <https://github.com/pybind/pybind11/pull/1223>`_.
+
+* Fixed ``del map[item]`` for a ``stl_bind.h`` bound stl map.
+  `#1229 <https://github.com/pybind/pybind11/pull/1229>`_.
+
+* Fixed a regression from v2.1.x where the aggregate initialization could
+  unintentionally end up at a constructor taking a templated
+  ``std::initializer_list<T>`` argument.
+  `#1249 <https://github.com/pybind/pybind11/pull/1249>`_.
+
+* Fixed an issue where calling a function with a keep_alive policy on the same
+  nurse/patient pair would cause the internal patient storage to needlessly
+  grow (unboundedly, if the nurse is long-lived).
+  `#1251 <https://github.com/pybind/pybind11/issues/1251>`_.
+
+* Various other minor fixes.
+
+v2.2.1 (September 14, 2017)
+-----------------------------------------------------
+
+* Added ``py::module::reload()`` member function for reloading a module.
+  `#1040 <https://github.com/pybind/pybind11/pull/1040>`_.
+
+* Fixed a reference leak in the number converter.
+  `#1078 <https://github.com/pybind/pybind11/pull/1078>`_.
+
+* Fixed compilation with Clang on host GCC < 5 (old libstdc++ which isn't fully
+  C++11 compliant). `#1062 <https://github.com/pybind/pybind11/pull/1062>`_.
+
+* Fixed a regression where the automatic ``std::vector<bool>`` caster would
+  fail to compile. The same fix also applies to any container which returns
+  element proxies instead of references.
+  `#1053 <https://github.com/pybind/pybind11/pull/1053>`_.
+
+* Fixed a regression where the ``py::keep_alive`` policy could not be applied
+  to constructors. `#1065 <https://github.com/pybind/pybind11/pull/1065>`_.
+
+* Fixed a nullptr dereference when loading a ``py::module_local`` type
+  that's only registered in an external module.
+  `#1058 <https://github.com/pybind/pybind11/pull/1058>`_.
+
+* Fixed implicit conversion of accessors to types derived from ``py::object``.
+  `#1076 <https://github.com/pybind/pybind11/pull/1076>`_.
+
+* The ``name`` in ``PYBIND11_MODULE(name, variable)`` can now be a macro.
+  `#1082 <https://github.com/pybind/pybind11/pull/1082>`_.
+
+* Relaxed overly strict ``py::pickle()`` check for matching get and set types.
+  `#1064 <https://github.com/pybind/pybind11/pull/1064>`_.
+
+* Conversion errors now try to be more informative when it's likely that
+  a missing header is the cause (e.g. forgetting ``<pybind11/stl.h>``).
+  `#1077 <https://github.com/pybind/pybind11/pull/1077>`_.
+
+v2.2.0 (August 31, 2017)
+-----------------------------------------------------
+
+* Support for embedding the Python interpreter. See the
+  :doc:`documentation page </advanced/embedding>` for a
+  full overview of the new features.
+  `#774 <https://github.com/pybind/pybind11/pull/774>`_,
+  `#889 <https://github.com/pybind/pybind11/pull/889>`_,
+  `#892 <https://github.com/pybind/pybind11/pull/892>`_,
+  `#920 <https://github.com/pybind/pybind11/pull/920>`_.
+
+  .. code-block:: cpp
+
+      #include <pybind11/embed.h>
+      namespace py = pybind11;
+
+      int main() {
+          py::scoped_interpreter guard{}; // start the interpreter and keep it alive
+
+          py::print("Hello, World!"); // use the Python API
+      }
+
+* Support for inheriting from multiple C++ bases in Python.
+  `#693 <https://github.com/pybind/pybind11/pull/693>`_.
+
+  .. code-block:: python
+
+      from cpp_module import CppBase1, CppBase2
+
+      class PyDerived(CppBase1, CppBase2):
+          def __init__(self):
+              CppBase1.__init__(self)  # C++ bases must be initialized explicitly
+              CppBase2.__init__(self)
+
+* ``PYBIND11_MODULE`` is now the preferred way to create module entry points.
+  ``PYBIND11_PLUGIN`` is deprecated. See :ref:`macros` for details.
+  `#879 <https://github.com/pybind/pybind11/pull/879>`_.
+
+  .. code-block:: cpp
+
+      // new
+      PYBIND11_MODULE(example, m) {
+          m.def("add", [](int a, int b) { return a + b; });
+      }
+
+      // old
+      PYBIND11_PLUGIN(example) {
+          py::module m("example");
+          m.def("add", [](int a, int b) { return a + b; });
+          return m.ptr();
+      }
+
+* pybind11's headers and build system now more strictly enforce hidden symbol
+  visibility for extension modules. This should be seamless for most users,
+  but see the :doc:`upgrade` if you use a custom build system.
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_.
+
+* Support for ``py::module_local`` types which allow multiple modules to
+  export the same C++ types without conflicts. This is useful for opaque
+  types like ``std::vector<int>``. ``py::bind_vector`` and ``py::bind_map``
+  now default to ``py::module_local`` if their elements are builtins or
+  local types. See :ref:`module_local` for details.
+  `#949 <https://github.com/pybind/pybind11/pull/949>`_,
+  `#981 <https://github.com/pybind/pybind11/pull/981>`_,
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_,
+  `#997 <https://github.com/pybind/pybind11/pull/997>`_.
+
+* Custom constructors can now be added very easily using lambdas or factory
+  functions which return a class instance by value, pointer or holder. This
+  supersedes the old placement-new ``__init__`` technique.
+  See :ref:`custom_constructors` for details.
+  `#805 <https://github.com/pybind/pybind11/pull/805>`_,
+  `#1014 <https://github.com/pybind/pybind11/pull/1014>`_.
+
+  .. code-block:: cpp
+
+      struct Example {
+          Example(std::string);
+      };
+
+      py::class_<Example>(m, "Example")
+          .def(py::init<std::string>()) // existing constructor
+          .def(py::init([](int n) { // custom constructor
+              return std::make_unique<Example>(std::to_string(n));
+          }));
+
+* Similarly to custom constructors, pickling support functions are now bound
+  using the ``py::pickle()`` adaptor which improves type safety. See the
+  :doc:`upgrade` and :ref:`pickling` for details.
+  `#1038 <https://github.com/pybind/pybind11/pull/1038>`_.
+
+* Builtin support for converting C++17 standard library types and general
+  conversion improvements:
+
+  1. C++17 ``std::variant`` is supported right out of the box. C++11/14
+     equivalents (e.g. ``boost::variant``) can also be added with a simple
+     user-defined specialization. See :ref:`cpp17_container_casters` for details.
+     `#811 <https://github.com/pybind/pybind11/pull/811>`_,
+     `#845 <https://github.com/pybind/pybind11/pull/845>`_,
+     `#989 <https://github.com/pybind/pybind11/pull/989>`_.
+
+  2. Out-of-the-box support for C++17 ``std::string_view``.
+     `#906 <https://github.com/pybind/pybind11/pull/906>`_.
+
+  3. Improved compatibility of the builtin ``optional`` converter.
+     `#874 <https://github.com/pybind/pybind11/pull/874>`_.
+
+  4. The ``bool`` converter now accepts ``numpy.bool_`` and types which
+     define ``__bool__`` (Python 3.x) or ``__nonzero__`` (Python 2.7).
+     `#925 <https://github.com/pybind/pybind11/pull/925>`_.
+
+  5. C++-to-Python casters are now more efficient and move elements out
+     of rvalue containers whenever possible.
+     `#851 <https://github.com/pybind/pybind11/pull/851>`_,
+     `#936 <https://github.com/pybind/pybind11/pull/936>`_,
+     `#938 <https://github.com/pybind/pybind11/pull/938>`_.
+
+  6. Fixed ``bytes`` to ``std::string/char*`` conversion on Python 3.
+     `#817 <https://github.com/pybind/pybind11/pull/817>`_.
+
+  7. Fixed lifetime of temporary C++ objects created in Python-to-C++ conversions.
+     `#924 <https://github.com/pybind/pybind11/pull/924>`_.
+
+* Scope guard call policy for RAII types, e.g. ``py::call_guard<py::gil_scoped_release>()``,
+  ``py::call_guard<py::scoped_ostream_redirect>()``. See :ref:`call_policies` for details.
+  `#740 <https://github.com/pybind/pybind11/pull/740>`_.
+
+* Utility for redirecting C++ streams to Python (e.g. ``std::cout`` ->
+  ``sys.stdout``). Scope guard ``py::scoped_ostream_redirect`` in C++ and
+  a context manager in Python. See :ref:`ostream_redirect`.
+  `#1009 <https://github.com/pybind/pybind11/pull/1009>`_.
+
+* Improved handling of types and exceptions across module boundaries.
+  `#915 <https://github.com/pybind/pybind11/pull/915>`_,
+  `#951 <https://github.com/pybind/pybind11/pull/951>`_,
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_.
+
+* Fixed destruction order of ``py::keep_alive`` nurse/patient objects
+  in reference cycles.
+  `#856 <https://github.com/pybind/pybind11/pull/856>`_.
+
+* Numpy and buffer protocol related improvements:
+
+  1. Support for negative strides in Python buffer objects/numpy arrays. This
+     required changing integers from unsigned to signed for the related C++ APIs.
+     Note: If you have compiler warnings enabled, you may notice some new conversion
+     warnings after upgrading. These can be resolved with ``static_cast``.
+     `#782 <https://github.com/pybind/pybind11/pull/782>`_.
+
+  2. Support ``std::complex`` and arrays inside ``PYBIND11_NUMPY_DTYPE``.
+     `#831 <https://github.com/pybind/pybind11/pull/831>`_,
+     `#832 <https://github.com/pybind/pybind11/pull/832>`_.
+
+  3. Support for constructing ``py::buffer_info`` and ``py::arrays`` using
+     arbitrary containers or iterators instead of requiring a ``std::vector``.
+     `#788 <https://github.com/pybind/pybind11/pull/788>`_,
+     `#822 <https://github.com/pybind/pybind11/pull/822>`_,
+     `#860 <https://github.com/pybind/pybind11/pull/860>`_.
+
+  4. Explicitly check numpy version and require >= 1.7.0.
+     `#819 <https://github.com/pybind/pybind11/pull/819>`_.
+
+* Support for allowing/prohibiting ``None`` for specific arguments and improved
+  ``None`` overload resolution order. See :ref:`none_arguments` for details.
+  `#843 <https://github.com/pybind/pybind11/pull/843>`_.
+  `#859 <https://github.com/pybind/pybind11/pull/859>`_.
+
+* Added ``py::exec()`` as a shortcut for ``py::eval<py::eval_statements>()``
+  and support for C++11 raw string literals as input. See :ref:`eval`.
+  `#766 <https://github.com/pybind/pybind11/pull/766>`_,
+  `#827 <https://github.com/pybind/pybind11/pull/827>`_.
+
+* ``py::vectorize()`` ignores non-vectorizable arguments and supports
+  member functions.
+  `#762 <https://github.com/pybind/pybind11/pull/762>`_.
+
+* Support for bound methods as callbacks (``pybind11/functional.h``).
+  `#815 <https://github.com/pybind/pybind11/pull/815>`_.
+
+* Allow aliasing pybind11 methods: ``cls.attr("foo") = cls.attr("bar")``.
+  `#802 <https://github.com/pybind/pybind11/pull/802>`_.
+
+* Don't allow mixed static/non-static overloads.
+  `#804 <https://github.com/pybind/pybind11/pull/804>`_.
+
+* Fixed overriding static properties in derived classes.
+  `#784 <https://github.com/pybind/pybind11/pull/784>`_.
+
+* Added support for write only properties.
+  `#1144 <https://github.com/pybind/pybind11/pull/1144>`_.
+
+* Improved deduction of member functions of a derived class when its bases
+  aren't registered with pybind11.
+  `#855 <https://github.com/pybind/pybind11/pull/855>`_.
+
+  .. code-block:: cpp
+
+      struct Base {
+          int foo() { return 42; }
+      }
+
+      struct Derived : Base {}
+
+      // Now works, but previously required also binding `Base`
+      py::class_<Derived>(m, "Derived")
+          .def("foo", &Derived::foo); // function is actually from `Base`
+
+* The implementation of ``py::init<>`` now uses C++11 brace initialization
+  syntax to construct instances, which permits binding implicit constructors of
+  aggregate types. `#1015 <https://github.com/pybind/pybind11/pull/1015>`_.
+
+    .. code-block:: cpp
+
+        struct Aggregate {
+            int a;
+            std::string b;
+        };
+
+        py::class_<Aggregate>(m, "Aggregate")
+            .def(py::init<int, const std::string &>());
+
+* Fixed issues with multiple inheritance with offset base/derived pointers.
+  `#812 <https://github.com/pybind/pybind11/pull/812>`_,
+  `#866 <https://github.com/pybind/pybind11/pull/866>`_,
+  `#960 <https://github.com/pybind/pybind11/pull/960>`_.
+
+* Fixed reference leak of type objects.
+  `#1030 <https://github.com/pybind/pybind11/pull/1030>`_.
+
+* Improved support for the ``/std:c++14`` and ``/std:c++latest`` modes
+  on MSVC 2017.
+  `#841 <https://github.com/pybind/pybind11/pull/841>`_,
+  `#999 <https://github.com/pybind/pybind11/pull/999>`_.
+
+* Fixed detection of private operator new on MSVC.
+  `#893 <https://github.com/pybind/pybind11/pull/893>`_,
+  `#918 <https://github.com/pybind/pybind11/pull/918>`_.
+
+* Intel C++ compiler compatibility fixes.
+  `#937 <https://github.com/pybind/pybind11/pull/937>`_.
+
+* Fixed implicit conversion of `py::enum_` to integer types on Python 2.7.
+  `#821 <https://github.com/pybind/pybind11/pull/821>`_.
+
+* Added ``py::hash`` to fetch the hash value of Python objects, and
+  ``.def(hash(py::self))`` to provide the C++ ``std::hash`` as the Python
+  ``__hash__`` method.
+  `#1034 <https://github.com/pybind/pybind11/pull/1034>`_.
+
+* Fixed ``__truediv__`` on Python 2 and ``__itruediv__`` on Python 3.
+  `#867 <https://github.com/pybind/pybind11/pull/867>`_.
+
+* ``py::capsule`` objects now support the ``name`` attribute. This is useful
+  for interfacing with ``scipy.LowLevelCallable``.
+  `#902 <https://github.com/pybind/pybind11/pull/902>`_.
+
+* Fixed ``py::make_iterator``'s ``__next__()`` for past-the-end calls.
+  `#897 <https://github.com/pybind/pybind11/pull/897>`_.
+
+* Added ``error_already_set::matches()`` for checking Python exceptions.
+  `#772 <https://github.com/pybind/pybind11/pull/772>`_.
+
+* Deprecated ``py::error_already_set::clear()``. It's no longer needed
+  following a simplification of the ``py::error_already_set`` class.
+  `#954 <https://github.com/pybind/pybind11/pull/954>`_.
+
+* Deprecated ``py::handle::operator==()`` in favor of ``py::handle::is()``
+  `#825 <https://github.com/pybind/pybind11/pull/825>`_.
+
+* Deprecated ``py::object::borrowed``/``py::object::stolen``.
+  Use ``py::object::borrowed_t{}``/``py::object::stolen_t{}`` instead.
+  `#771 <https://github.com/pybind/pybind11/pull/771>`_.
+
+* Changed internal data structure versioning to avoid conflicts between
+  modules compiled with different revisions of pybind11.
+  `#1012 <https://github.com/pybind/pybind11/pull/1012>`_.
+
+* Additional compile-time and run-time error checking and more informative messages.
+  `#786 <https://github.com/pybind/pybind11/pull/786>`_,
+  `#794 <https://github.com/pybind/pybind11/pull/794>`_,
+  `#803 <https://github.com/pybind/pybind11/pull/803>`_.
+
+* Various minor improvements and fixes.
+  `#764 <https://github.com/pybind/pybind11/pull/764>`_,
+  `#791 <https://github.com/pybind/pybind11/pull/791>`_,
+  `#795 <https://github.com/pybind/pybind11/pull/795>`_,
+  `#840 <https://github.com/pybind/pybind11/pull/840>`_,
+  `#844 <https://github.com/pybind/pybind11/pull/844>`_,
+  `#846 <https://github.com/pybind/pybind11/pull/846>`_,
+  `#849 <https://github.com/pybind/pybind11/pull/849>`_,
+  `#858 <https://github.com/pybind/pybind11/pull/858>`_,
+  `#862 <https://github.com/pybind/pybind11/pull/862>`_,
+  `#871 <https://github.com/pybind/pybind11/pull/871>`_,
+  `#872 <https://github.com/pybind/pybind11/pull/872>`_,
+  `#881 <https://github.com/pybind/pybind11/pull/881>`_,
+  `#888 <https://github.com/pybind/pybind11/pull/888>`_,
+  `#899 <https://github.com/pybind/pybind11/pull/899>`_,
+  `#928 <https://github.com/pybind/pybind11/pull/928>`_,
+  `#931 <https://github.com/pybind/pybind11/pull/931>`_,
+  `#944 <https://github.com/pybind/pybind11/pull/944>`_,
+  `#950 <https://github.com/pybind/pybind11/pull/950>`_,
+  `#952 <https://github.com/pybind/pybind11/pull/952>`_,
+  `#962 <https://github.com/pybind/pybind11/pull/962>`_,
+  `#965 <https://github.com/pybind/pybind11/pull/965>`_,
+  `#970 <https://github.com/pybind/pybind11/pull/970>`_,
+  `#978 <https://github.com/pybind/pybind11/pull/978>`_,
+  `#979 <https://github.com/pybind/pybind11/pull/979>`_,
+  `#986 <https://github.com/pybind/pybind11/pull/986>`_,
+  `#1020 <https://github.com/pybind/pybind11/pull/1020>`_,
+  `#1027 <https://github.com/pybind/pybind11/pull/1027>`_,
+  `#1037 <https://github.com/pybind/pybind11/pull/1037>`_.
+
+* Testing improvements.
+  `#798 <https://github.com/pybind/pybind11/pull/798>`_,
+  `#882 <https://github.com/pybind/pybind11/pull/882>`_,
+  `#898 <https://github.com/pybind/pybind11/pull/898>`_,
+  `#900 <https://github.com/pybind/pybind11/pull/900>`_,
+  `#921 <https://github.com/pybind/pybind11/pull/921>`_,
+  `#923 <https://github.com/pybind/pybind11/pull/923>`_,
+  `#963 <https://github.com/pybind/pybind11/pull/963>`_.
+
+v2.1.1 (April 7, 2017)
+-----------------------------------------------------
+
+* Fixed minimum version requirement for MSVC 2015u3
+  `#773 <https://github.com/pybind/pybind11/pull/773>`_.
+
+v2.1.0 (March 22, 2017)
+-----------------------------------------------------
+
+* pybind11 now performs function overload resolution in two phases. The first
+  phase only considers exact type matches, while the second allows for implicit
+  conversions to take place. A special ``noconvert()`` syntax can be used to
+  completely disable implicit conversions for specific arguments.
+  `#643 <https://github.com/pybind/pybind11/pull/643>`_,
+  `#634 <https://github.com/pybind/pybind11/pull/634>`_,
+  `#650 <https://github.com/pybind/pybind11/pull/650>`_.
+
+* Fixed a regression where static properties no longer worked with classes
+  using multiple inheritance. The ``py::metaclass`` attribute is no longer
+  necessary (and deprecated as of this release) when binding classes with
+  static properties.
+  `#679 <https://github.com/pybind/pybind11/pull/679>`_,
+
+* Classes bound using ``pybind11`` can now use custom metaclasses.
+  `#679 <https://github.com/pybind/pybind11/pull/679>`_,
+
+* ``py::args`` and ``py::kwargs`` can now be mixed with other positional
+  arguments when binding functions using pybind11.
+  `#611 <https://github.com/pybind/pybind11/pull/611>`_.
+
+* Improved support for C++11 unicode string and character types; added
+  extensive documentation regarding pybind11's string conversion behavior.
+  `#624 <https://github.com/pybind/pybind11/pull/624>`_,
+  `#636 <https://github.com/pybind/pybind11/pull/636>`_,
+  `#715 <https://github.com/pybind/pybind11/pull/715>`_.
+
+* pybind11 can now avoid expensive copies when converting Eigen arrays to NumPy
+  arrays (and vice versa). `#610 <https://github.com/pybind/pybind11/pull/610>`_.
+
+* The "fast path" in ``py::vectorize`` now works for any full-size group of C or
+  F-contiguous arrays. The non-fast path is also faster since it no longer performs
+  copies of the input arguments (except when type conversions are necessary).
+  `#610 <https://github.com/pybind/pybind11/pull/610>`_.
+
+* Added fast, unchecked access to NumPy arrays via a proxy object.
+  `#746 <https://github.com/pybind/pybind11/pull/746>`_.
+
+* Transparent support for class-specific ``operator new`` and
+  ``operator delete`` implementations.
+  `#755 <https://github.com/pybind/pybind11/pull/755>`_.
+
+* Slimmer and more efficient STL-compatible iterator interface for sequence types.
+  `#662 <https://github.com/pybind/pybind11/pull/662>`_.
+
+* Improved custom holder type support.
+  `#607 <https://github.com/pybind/pybind11/pull/607>`_.
+
+* ``nullptr`` to ``None`` conversion fixed in various builtin type casters.
+  `#732 <https://github.com/pybind/pybind11/pull/732>`_.
+
+* ``enum_`` now exposes its members via a special ``__members__`` attribute.
+  `#666 <https://github.com/pybind/pybind11/pull/666>`_.
+
+* ``std::vector`` bindings created using ``stl_bind.h`` can now optionally
+  implement the buffer protocol. `#488 <https://github.com/pybind/pybind11/pull/488>`_.
+
+* Automated C++ reference documentation using doxygen and breathe.
+  `#598 <https://github.com/pybind/pybind11/pull/598>`_.
+
+* Added minimum compiler version assertions.
+  `#727 <https://github.com/pybind/pybind11/pull/727>`_.
+
+* Improved compatibility with C++1z.
+  `#677 <https://github.com/pybind/pybind11/pull/677>`_.
+
+* Improved ``py::capsule`` API. Can be used to implement cleanup
+  callbacks that are involved at module destruction time.
+  `#752 <https://github.com/pybind/pybind11/pull/752>`_.
+
+* Various minor improvements and fixes.
+  `#595 <https://github.com/pybind/pybind11/pull/595>`_,
+  `#588 <https://github.com/pybind/pybind11/pull/588>`_,
+  `#589 <https://github.com/pybind/pybind11/pull/589>`_,
+  `#603 <https://github.com/pybind/pybind11/pull/603>`_,
+  `#619 <https://github.com/pybind/pybind11/pull/619>`_,
+  `#648 <https://github.com/pybind/pybind11/pull/648>`_,
+  `#695 <https://github.com/pybind/pybind11/pull/695>`_,
+  `#720 <https://github.com/pybind/pybind11/pull/720>`_,
+  `#723 <https://github.com/pybind/pybind11/pull/723>`_,
+  `#729 <https://github.com/pybind/pybind11/pull/729>`_,
+  `#724 <https://github.com/pybind/pybind11/pull/724>`_,
+  `#742 <https://github.com/pybind/pybind11/pull/742>`_,
+  `#753 <https://github.com/pybind/pybind11/pull/753>`_.
+
+v2.0.1 (Jan 4, 2017)
+-----------------------------------------------------
+
+* Fix pointer to reference error in type_caster on MSVC
+  `#583 <https://github.com/pybind/pybind11/pull/583>`_.
+
+* Fixed a segmentation in the test suite due to a typo
+  `cd7eac <https://github.com/pybind/pybind11/commit/cd7eac>`_.
+
+v2.0.0 (Jan 1, 2017)
+-----------------------------------------------------
+
+* Fixed a reference counting regression affecting types with custom metaclasses
+  (introduced in v2.0.0-rc1).
+  `#571 <https://github.com/pybind/pybind11/pull/571>`_.
+
+* Quenched a CMake policy warning.
+  `#570 <https://github.com/pybind/pybind11/pull/570>`_.
+
+v2.0.0-rc1 (Dec 23, 2016)
+-----------------------------------------------------
+
+The pybind11 developers are excited to issue a release candidate of pybind11
+with a subsequent v2.0.0 release planned in early January next year.
+
+An incredible amount of effort by went into pybind11 over the last ~5 months,
+leading to a release that is jam-packed with exciting new features and numerous
+usability improvements. The following list links PRs or individual commits
+whenever applicable.
+
+Happy Christmas!
+
+* Support for binding C++ class hierarchies that make use of multiple
+  inheritance. `#410 <https://github.com/pybind/pybind11/pull/410>`_.
+
+* PyPy support: pybind11 now supports nightly builds of PyPy and will
+  interoperate with the future 5.7 release. No code changes are necessary,
+  everything "just" works as usual. Note that we only target the Python 2.7
+  branch for now; support for 3.x will be added once its ``cpyext`` extension
+  support catches up. A few minor features remain unsupported for the time
+  being (notably dynamic attributes in custom types).
+  `#527 <https://github.com/pybind/pybind11/pull/527>`_.
+
+* Significant work on the documentation -- in particular, the monolithic
+  ``advanced.rst`` file was restructured into a easier to read hierarchical
+  organization. `#448 <https://github.com/pybind/pybind11/pull/448>`_.
+
+* Many NumPy-related improvements:
+
+  1. Object-oriented API to access and modify NumPy ``ndarray`` instances,
+     replicating much of the corresponding NumPy C API functionality.
+     `#402 <https://github.com/pybind/pybind11/pull/402>`_.
+
+  2. NumPy array ``dtype`` array descriptors are now first-class citizens and
+     are exposed via a new class ``py::dtype``.
+
+  3. Structured dtypes can be registered using the ``PYBIND11_NUMPY_DTYPE()``
+     macro. Special ``array`` constructors accepting dtype objects were also
+     added.
+
+     One potential caveat involving this change: format descriptor strings
+     should now be accessed via ``format_descriptor::format()`` (however, for
+     compatibility purposes, the old syntax ``format_descriptor::value`` will
+     still work for non-structured data types). `#308
+     <https://github.com/pybind/pybind11/pull/308>`_.
+
+  4. Further improvements to support structured dtypes throughout the system.
+     `#472 <https://github.com/pybind/pybind11/pull/472>`_,
+     `#474 <https://github.com/pybind/pybind11/pull/474>`_,
+     `#459 <https://github.com/pybind/pybind11/pull/459>`_,
+     `#453 <https://github.com/pybind/pybind11/pull/453>`_,
+     `#452 <https://github.com/pybind/pybind11/pull/452>`_, and
+     `#505 <https://github.com/pybind/pybind11/pull/505>`_.
+
+  5. Fast access operators. `#497 <https://github.com/pybind/pybind11/pull/497>`_.
+
+  6. Constructors for arrays whose storage is owned by another object.
+     `#440 <https://github.com/pybind/pybind11/pull/440>`_.
+
+  7. Added constructors for ``array`` and ``array_t`` explicitly accepting shape
+     and strides; if strides are not provided, they are deduced assuming
+     C-contiguity. Also added simplified constructors for 1-dimensional case.
+
+  8. Added buffer/NumPy support for ``char[N]`` and ``std::array<char, N>`` types.
+
+  9. Added ``memoryview`` wrapper type which is constructible from ``buffer_info``.
+
+* Eigen: many additional conversions and support for non-contiguous
+  arrays/slices.
+  `#427 <https://github.com/pybind/pybind11/pull/427>`_,
+  `#315 <https://github.com/pybind/pybind11/pull/315>`_,
+  `#316 <https://github.com/pybind/pybind11/pull/316>`_,
+  `#312 <https://github.com/pybind/pybind11/pull/312>`_, and
+  `#267 <https://github.com/pybind/pybind11/pull/267>`_
+
+* Incompatible changes in ``class_<...>::class_()``:
+
+    1. Declarations of types that provide access via the buffer protocol must
+       now include the ``py::buffer_protocol()`` annotation as an argument to
+       the ``class_`` constructor.
+
+    2. Declarations of types that require a custom metaclass (i.e. all classes
+       which include static properties via commands such as
+       ``def_readwrite_static()``) must now include the ``py::metaclass()``
+       annotation as an argument to the ``class_`` constructor.
+
+       These two changes were necessary to make type definitions in pybind11
+       future-proof, and to support PyPy via its cpyext mechanism. `#527
+       <https://github.com/pybind/pybind11/pull/527>`_.
+
+
+    3. This version of pybind11 uses a redesigned mechanism for instantiating
+       trampoline classes that are used to override virtual methods from within
+       Python. This led to the following user-visible syntax change: instead of
+
+       .. code-block:: cpp
+
+           py::class_<TrampolineClass>("MyClass")
+             .alias<MyClass>()
+             ....
+
+       write
+
+       .. code-block:: cpp
+
+           py::class_<MyClass, TrampolineClass>("MyClass")
+             ....
+
+       Importantly, both the original and the trampoline class are now
+       specified as an arguments (in arbitrary order) to the ``py::class_``
+       template, and the ``alias<..>()`` call is gone. The new scheme has zero
+       overhead in cases when Python doesn't override any functions of the
+       underlying C++ class. `rev. 86d825
+       <https://github.com/pybind/pybind11/commit/86d825>`_.
+
+* Added ``eval`` and ``eval_file`` functions for evaluating expressions and
+  statements from a string or file. `rev. 0d3fc3
+  <https://github.com/pybind/pybind11/commit/0d3fc3>`_.
+
+* pybind11 can now create types with a modifiable dictionary.
+  `#437 <https://github.com/pybind/pybind11/pull/437>`_ and
+  `#444 <https://github.com/pybind/pybind11/pull/444>`_.
+
+* Support for translation of arbitrary C++ exceptions to Python counterparts.
+  `#296 <https://github.com/pybind/pybind11/pull/296>`_ and
+  `#273 <https://github.com/pybind/pybind11/pull/273>`_.
+
+* Report full backtraces through mixed C++/Python code, better reporting for
+  import errors, fixed GIL management in exception processing.
+  `#537 <https://github.com/pybind/pybind11/pull/537>`_,
+  `#494 <https://github.com/pybind/pybind11/pull/494>`_,
+  `rev. e72d95 <https://github.com/pybind/pybind11/commit/e72d95>`_, and
+  `rev. 099d6e <https://github.com/pybind/pybind11/commit/099d6e>`_.
+
+* Support for bit-level operations, comparisons, and serialization of C++
+  enumerations. `#503 <https://github.com/pybind/pybind11/pull/503>`_,
+  `#508 <https://github.com/pybind/pybind11/pull/508>`_,
+  `#380 <https://github.com/pybind/pybind11/pull/380>`_,
+  `#309 <https://github.com/pybind/pybind11/pull/309>`_.
+  `#311 <https://github.com/pybind/pybind11/pull/311>`_.
+
+* The ``class_`` constructor now accepts its template arguments in any order.
+  `#385 <https://github.com/pybind/pybind11/pull/385>`_.
+
+* Attribute and item accessors now have a more complete interface which makes
+  it possible to chain attributes as in
+  ``obj.attr("a")[key].attr("b").attr("method")(1, 2, 3)``. `#425
+  <https://github.com/pybind/pybind11/pull/425>`_.
+
+* Major redesign of the default and conversion constructors in ``pytypes.h``.
+  `#464 <https://github.com/pybind/pybind11/pull/464>`_.
+
+* Added built-in support for ``std::shared_ptr`` holder type. It is no longer
+  necessary to to include a declaration of the form
+  ``PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>)`` (though continuing to
+  do so won't cause an error).
+  `#454 <https://github.com/pybind/pybind11/pull/454>`_.
+
+* New ``py::overload_cast`` casting operator to select among multiple possible
+  overloads of a function. An example:
+
+    .. code-block:: cpp
+
+        py::class_<Pet>(m, "Pet")
+            .def("set", py::overload_cast<int>(&Pet::set), "Set the pet's age")
+            .def("set", py::overload_cast<const std::string &>(&Pet::set), "Set the pet's name");
+
+  This feature only works on C++14-capable compilers.
+  `#541 <https://github.com/pybind/pybind11/pull/541>`_.
+
+* C++ types are automatically cast to Python types, e.g. when assigning
+  them as an attribute. For instance, the following is now legal:
+
+    .. code-block:: cpp
+
+        py::module m = /* ... */
+        m.attr("constant") = 123;
+
+  (Previously, a ``py::cast`` call was necessary to avoid a compilation error.)
+  `#551 <https://github.com/pybind/pybind11/pull/551>`_.
+
+* Redesigned ``pytest``-based test suite. `#321 <https://github.com/pybind/pybind11/pull/321>`_.
+
+* Instance tracking to detect reference leaks in test suite. `#324 <https://github.com/pybind/pybind11/pull/324>`_
+
+* pybind11 can now distinguish between multiple different instances that are
+  located at the same memory address, but which have different types.
+  `#329 <https://github.com/pybind/pybind11/pull/329>`_.
+
+* Improved logic in ``move`` return value policy.
+  `#510 <https://github.com/pybind/pybind11/pull/510>`_,
+  `#297 <https://github.com/pybind/pybind11/pull/297>`_.
+
+* Generalized unpacking API to permit calling Python functions from C++ using
+  notation such as ``foo(a1, a2, *args, "ka"_a=1, "kb"_a=2, **kwargs)``. `#372 <https://github.com/pybind/pybind11/pull/372>`_.
+
+* ``py::print()`` function whose behavior matches that of the native Python
+  ``print()`` function. `#372 <https://github.com/pybind/pybind11/pull/372>`_.
+
+* Added ``py::dict`` keyword constructor:``auto d = dict("number"_a=42,
+  "name"_a="World");``. `#372 <https://github.com/pybind/pybind11/pull/372>`_.
+
+* Added ``py::str::format()`` method and ``_s`` literal: ``py::str s = "1 + 2
+  = {}"_s.format(3);``. `#372 <https://github.com/pybind/pybind11/pull/372>`_.
+
+* Added ``py::repr()`` function which is equivalent to Python's builtin
+  ``repr()``. `#333 <https://github.com/pybind/pybind11/pull/333>`_.
+
+* Improved construction and destruction logic for holder types. It is now
+  possible to reference instances with smart pointer holder types without
+  constructing the holder if desired. The ``PYBIND11_DECLARE_HOLDER_TYPE``
+  macro now accepts an optional second parameter to indicate whether the holder
+  type uses intrusive reference counting.
+  `#533 <https://github.com/pybind/pybind11/pull/533>`_ and
+  `#561 <https://github.com/pybind/pybind11/pull/561>`_.
+
+* Mapping a stateless C++ function to Python and back is now "for free" (i.e.
+  no extra indirections or argument conversion overheads). `rev. 954b79
+  <https://github.com/pybind/pybind11/commit/954b79>`_.
+
+* Bindings for ``std::valarray<T>``.
+  `#545 <https://github.com/pybind/pybind11/pull/545>`_.
+
+* Improved support for C++17 capable compilers.
+  `#562 <https://github.com/pybind/pybind11/pull/562>`_.
+
+* Bindings for ``std::optional<t>``.
+  `#475 <https://github.com/pybind/pybind11/pull/475>`_,
+  `#476 <https://github.com/pybind/pybind11/pull/476>`_,
+  `#479 <https://github.com/pybind/pybind11/pull/479>`_,
+  `#499 <https://github.com/pybind/pybind11/pull/499>`_, and
+  `#501 <https://github.com/pybind/pybind11/pull/501>`_.
+
+* ``stl_bind.h``: general improvements and support for ``std::map`` and
+  ``std::unordered_map``.
+  `#490 <https://github.com/pybind/pybind11/pull/490>`_,
+  `#282 <https://github.com/pybind/pybind11/pull/282>`_,
+  `#235 <https://github.com/pybind/pybind11/pull/235>`_.
+
+* The ``std::tuple``, ``std::pair``, ``std::list``, and ``std::vector`` type
+  casters now accept any Python sequence type as input. `rev. 107285
+  <https://github.com/pybind/pybind11/commit/107285>`_.
+
+* Improved CMake Python detection on multi-architecture Linux.
+  `#532 <https://github.com/pybind/pybind11/pull/532>`_.
+
+* Infrastructure to selectively disable or enable parts of the automatically
+  generated docstrings. `#486 <https://github.com/pybind/pybind11/pull/486>`_.
+
+* ``reference`` and ``reference_internal`` are now the default return value
+  properties for static and non-static properties, respectively. `#473
+  <https://github.com/pybind/pybind11/pull/473>`_. (the previous defaults
+  were ``automatic``). `#473 <https://github.com/pybind/pybind11/pull/473>`_.
+
+* Support for ``std::unique_ptr`` with non-default deleters or no deleter at
+  all (``py::nodelete``). `#384 <https://github.com/pybind/pybind11/pull/384>`_.
+
+* Deprecated ``handle::call()`` method. The new syntax to call Python
+  functions is simply ``handle()``. It can also be invoked explicitly via
+  ``handle::operator<X>()``, where ``X`` is an optional return value policy.
+
+* Print more informative error messages when ``make_tuple()`` or ``cast()``
+  fail. `#262 <https://github.com/pybind/pybind11/pull/262>`_.
+
+* Creation of holder types for classes deriving from
+  ``std::enable_shared_from_this<>`` now also works for ``const`` values.
+  `#260 <https://github.com/pybind/pybind11/pull/260>`_.
+
+* ``make_iterator()`` improvements for better compatibility with various
+  types (now uses prefix increment operator); it now also accepts iterators
+  with different begin/end types as long as they are equality comparable.
+  `#247 <https://github.com/pybind/pybind11/pull/247>`_.
+
+* ``arg()`` now accepts a wider range of argument types for default values.
+  `#244 <https://github.com/pybind/pybind11/pull/244>`_.
+
+* Support ``keep_alive`` where the nurse object may be ``None``. `#341
+  <https://github.com/pybind/pybind11/pull/341>`_.
+
+* Added constructors for ``str`` and ``bytes`` from zero-terminated char
+  pointers, and from char pointers and length. Added constructors for ``str``
+  from ``bytes`` and for ``bytes`` from ``str``, which will perform UTF-8
+  decoding/encoding as required.
+
+* Many other improvements of library internals without user-visible changes
+
+
+1.8.1 (July 12, 2016)
+----------------------
+* Fixed a rare but potentially very severe issue when the garbage collector ran
+  during pybind11 type creation.
+
+1.8.0 (June 14, 2016)
+----------------------
+* Redesigned CMake build system which exports a convenient
+  ``pybind11_add_module`` function to parent projects.
+* ``std::vector<>`` type bindings analogous to Boost.Python's ``indexing_suite``
+* Transparent conversion of sparse and dense Eigen matrices and vectors (``eigen.h``)
+* Added an ``ExtraFlags`` template argument to the NumPy ``array_t<>`` wrapper
+  to disable an enforced cast that may lose precision, e.g. to create overloads
+  for different precisions and complex vs real-valued matrices.
+* Prevent implicit conversion of floating point values to integral types in
+  function arguments
+* Fixed incorrect default return value policy for functions returning a shared
+  pointer
+* Don't allow registering a type via ``class_`` twice
+* Don't allow casting a ``None`` value into a C++ lvalue reference
+* Fixed a crash in ``enum_::operator==`` that was triggered by the ``help()`` command
+* Improved detection of whether or not custom C++ types can be copy/move-constructed
+* Extended ``str`` type to also work with ``bytes`` instances
+* Added a ``"name"_a`` user defined string literal that is equivalent to ``py::arg("name")``.
+* When specifying function arguments via ``py::arg``, the test that verifies
+  the number of arguments now runs at compile time.
+* Added ``[[noreturn]]`` attribute to ``pybind11_fail()`` to quench some
+  compiler warnings
+* List function arguments in exception text when the dispatch code cannot find
+  a matching overload
+* Added ``PYBIND11_OVERLOAD_NAME`` and ``PYBIND11_OVERLOAD_PURE_NAME`` macros which
+  can be used to override virtual methods whose name differs in C++ and Python
+  (e.g. ``__call__`` and ``operator()``)
+* Various minor ``iterator`` and ``make_iterator()`` improvements
+* Transparently support ``__bool__`` on Python 2.x and Python 3.x
+* Fixed issue with destructor of unpickled object not being called
+* Minor CMake build system improvements on Windows
+* New ``pybind11::args`` and ``pybind11::kwargs`` types to create functions which
+  take an arbitrary number of arguments and keyword arguments
+* New syntax to call a Python function from C++ using ``*args`` and ``*kwargs``
+* The functions ``def_property_*`` now correctly process docstring arguments (these
+  formerly caused a segmentation fault)
+* Many ``mkdoc.py`` improvements (enumerations, template arguments, ``DOC()``
+  macro accepts more arguments)
+* Cygwin support
+* Documentation improvements (pickling support, ``keep_alive``, macro usage)
+
+1.7 (April 30, 2016)
+----------------------
+* Added a new ``move`` return value policy that triggers C++11 move semantics.
+  The automatic return value policy falls back to this case whenever a rvalue
+  reference is encountered
+* Significantly more general GIL state routines that are used instead of
+  Python's troublesome ``PyGILState_Ensure`` and ``PyGILState_Release`` API
+* Redesign of opaque types that drastically simplifies their usage
+* Extended ability to pass values of type ``[const] void *``
+* ``keep_alive`` fix: don't fail when there is no patient
+* ``functional.h``: acquire the GIL before calling a Python function
+* Added Python RAII type wrappers ``none`` and ``iterable``
+* Added ``*args`` and ``*kwargs`` pass-through parameters to
+  ``pybind11.get_include()`` function
+* Iterator improvements and fixes
+* Documentation on return value policies and opaque types improved
+
+1.6 (April 30, 2016)
+----------------------
+* Skipped due to upload to PyPI gone wrong and inability to recover
+  (https://github.com/pypa/packaging-problems/issues/74)
+
+1.5 (April 21, 2016)
+----------------------
+* For polymorphic types, use RTTI to try to return the closest type registered with pybind11
+* Pickling support for serializing and unserializing C++ instances to a byte stream in Python
+* Added a convenience routine ``make_iterator()`` which turns a range indicated
+  by a pair of C++ iterators into a iterable Python object
+* Added ``len()`` and a variadic ``make_tuple()`` function
+* Addressed a rare issue that could confuse the current virtual function
+  dispatcher and another that could lead to crashes in multi-threaded
+  applications
+* Added a ``get_include()`` function to the Python module that returns the path
+  of the directory containing the installed pybind11 header files
+* Documentation improvements: import issues, symbol visibility, pickling, limitations
+* Added casting support for ``std::reference_wrapper<>``
+
+1.4 (April 7, 2016)
+--------------------------
+* Transparent type conversion for ``std::wstring`` and ``wchar_t``
+* Allow passing ``nullptr``-valued strings
+* Transparent passing of ``void *`` pointers using capsules
+* Transparent support for returning values wrapped in ``std::unique_ptr<>``
+* Improved docstring generation for compatibility with Sphinx
+* Nicer debug error message when default parameter construction fails
+* Support for "opaque" types that bypass the transparent conversion layer for STL containers
+* Redesigned type casting interface to avoid ambiguities that could occasionally cause compiler errors
+* Redesigned property implementation; fixes crashes due to an unfortunate default return value policy
+* Anaconda package generation support
+
+1.3 (March 8, 2016)
+--------------------------
+
+* Added support for the Intel C++ compiler (v15+)
+* Added support for the STL unordered set/map data structures
+* Added support for the STL linked list data structure
+* NumPy-style broadcasting support in ``pybind11::vectorize``
+* pybind11 now displays more verbose error messages when ``arg::operator=()`` fails
+* pybind11 internal data structures now live in a version-dependent namespace to avoid ABI issues
+* Many, many bugfixes involving corner cases and advanced usage
+
+1.2 (February 7, 2016)
+--------------------------
+
+* Optional: efficient generation of function signatures at compile time using C++14
+* Switched to a simpler and more general way of dealing with function default
+  arguments. Unused keyword arguments in function calls are now detected and
+  cause errors as expected
+* New ``keep_alive`` call policy analogous to Boost.Python's ``with_custodian_and_ward``
+* New ``pybind11::base<>`` attribute to indicate a subclass relationship
+* Improved interface for RAII type wrappers in ``pytypes.h``
+* Use RAII type wrappers consistently within pybind11 itself. This
+  fixes various potential refcount leaks when exceptions occur
+* Added new ``bytes`` RAII type wrapper (maps to ``string`` in Python 2.7)
+* Made handle and related RAII classes const correct, using them more
+  consistently everywhere now
+* Got rid of the ugly ``__pybind11__`` attributes on the Python side---they are
+  now stored in a C++ hash table that is not visible in Python
+* Fixed refcount leaks involving NumPy arrays and bound functions
+* Vastly improved handling of shared/smart pointers
+* Removed an unnecessary copy operation in ``pybind11::vectorize``
+* Fixed naming clashes when both pybind11 and NumPy headers are included
+* Added conversions for additional exception types
+* Documentation improvements (using multiple extension modules, smart pointers,
+  other minor clarifications)
+* unified infrastructure for parsing variadic arguments in ``class_`` and cpp_function
+* Fixed license text (was: ZLIB, should have been: 3-clause BSD)
+* Python 3.2 compatibility
+* Fixed remaining issues when accessing types in another plugin module
+* Added enum comparison and casting methods
+* Improved SFINAE-based detection of whether types are copy-constructible
+* Eliminated many warnings about unused variables and the use of ``offsetof()``
+* Support for ``std::array<>`` conversions
+
+1.1 (December 7, 2015)
+--------------------------
+
+* Documentation improvements (GIL, wrapping functions, casting, fixed many typos)
+* Generalized conversion of integer types
+* Improved support for casting function objects
+* Improved support for ``std::shared_ptr<>`` conversions
+* Initial support for ``std::set<>`` conversions
+* Fixed type resolution issue for types defined in a separate plugin module
+* Cmake build system improvements
+* Factored out generic functionality to non-templated code (smaller code size)
+* Added a code size / compile time benchmark vs Boost.Python
+* Added an appveyor CI script
+
+1.0 (October 15, 2015)
+------------------------
+* Initial release
diff --git a/pybind11/docs/classes.rst b/pybind11/docs/classes.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d44a5931d0d4a293d8069ebb3edf860783d7266
--- /dev/null
+++ b/pybind11/docs/classes.rst
@@ -0,0 +1,532 @@
+.. _classes:
+
+Object-oriented code
+####################
+
+Creating bindings for a custom type
+===================================
+
+Let's now look at a more complex example where we'll create bindings for a
+custom C++ data structure named ``Pet``. Its definition is given below:
+
+.. code-block:: cpp
+
+    struct Pet {
+        Pet(const std::string &name) : name(name) { }
+        void setName(const std::string &name_) { name = name_; }
+        const std::string &getName() const { return name; }
+
+        std::string name;
+    };
+
+The binding code for ``Pet`` looks as follows:
+
+.. code-block:: cpp
+
+    #include <pybind11/pybind11.h>
+
+    namespace py = pybind11;
+
+    PYBIND11_MODULE(example, m) {
+        py::class_<Pet>(m, "Pet")
+            .def(py::init<const std::string &>())
+            .def("setName", &Pet::setName)
+            .def("getName", &Pet::getName);
+    }
+
+:class:`class_` creates bindings for a C++ *class* or *struct*-style data
+structure. :func:`init` is a convenience function that takes the types of a
+constructor's parameters as template arguments and wraps the corresponding
+constructor (see the :ref:`custom_constructors` section for details). An
+interactive Python session demonstrating this example is shown below:
+
+.. code-block:: pycon
+
+    % python
+    >>> import example
+    >>> p = example.Pet('Molly')
+    >>> print(p)
+    <example.Pet object at 0x10cd98060>
+    >>> p.getName()
+    u'Molly'
+    >>> p.setName('Charly')
+    >>> p.getName()
+    u'Charly'
+
+.. seealso::
+
+    Static member functions can be bound in the same way using
+    :func:`class_::def_static`.
+
+Keyword and default arguments
+=============================
+It is possible to specify keyword and default arguments using the syntax
+discussed in the previous chapter. Refer to the sections :ref:`keyword_args`
+and :ref:`default_args` for details.
+
+Binding lambda functions
+========================
+
+Note how ``print(p)`` produced a rather useless summary of our data structure in the example above:
+
+.. code-block:: pycon
+
+    >>> print(p)
+    <example.Pet object at 0x10cd98060>
+
+To address this, we could bind a utility function that returns a human-readable
+summary to the special method slot named ``__repr__``. Unfortunately, there is no
+suitable functionality in the ``Pet`` data structure, and it would be nice if
+we did not have to change it. This can easily be accomplished by binding a
+Lambda function instead:
+
+.. code-block:: cpp
+
+        py::class_<Pet>(m, "Pet")
+            .def(py::init<const std::string &>())
+            .def("setName", &Pet::setName)
+            .def("getName", &Pet::getName)
+            .def("__repr__",
+                [](const Pet &a) {
+                    return "<example.Pet named '" + a.name + "'>";
+                }
+            );
+
+Both stateless [#f1]_ and stateful lambda closures are supported by pybind11.
+With the above change, the same Python code now produces the following output:
+
+.. code-block:: pycon
+
+    >>> print(p)
+    <example.Pet named 'Molly'>
+
+.. [#f1] Stateless closures are those with an empty pair of brackets ``[]`` as the capture object.
+
+.. _properties:
+
+Instance and static fields
+==========================
+
+We can also directly expose the ``name`` field using the
+:func:`class_::def_readwrite` method. A similar :func:`class_::def_readonly`
+method also exists for ``const`` fields.
+
+.. code-block:: cpp
+
+        py::class_<Pet>(m, "Pet")
+            .def(py::init<const std::string &>())
+            .def_readwrite("name", &Pet::name)
+            // ... remainder ...
+
+This makes it possible to write
+
+.. code-block:: pycon
+
+    >>> p = example.Pet('Molly')
+    >>> p.name
+    u'Molly'
+    >>> p.name = 'Charly'
+    >>> p.name
+    u'Charly'
+
+Now suppose that ``Pet::name`` was a private internal variable
+that can only be accessed via setters and getters.
+
+.. code-block:: cpp
+
+    class Pet {
+    public:
+        Pet(const std::string &name) : name(name) { }
+        void setName(const std::string &name_) { name = name_; }
+        const std::string &getName() const { return name; }
+    private:
+        std::string name;
+    };
+
+In this case, the method :func:`class_::def_property`
+(:func:`class_::def_property_readonly` for read-only data) can be used to
+provide a field-like interface within Python that will transparently call
+the setter and getter functions:
+
+.. code-block:: cpp
+
+        py::class_<Pet>(m, "Pet")
+            .def(py::init<const std::string &>())
+            .def_property("name", &Pet::getName, &Pet::setName)
+            // ... remainder ...
+
+Write only properties can be defined by passing ``nullptr`` as the
+input for the read function.
+
+.. seealso::
+
+    Similar functions :func:`class_::def_readwrite_static`,
+    :func:`class_::def_readonly_static` :func:`class_::def_property_static`,
+    and :func:`class_::def_property_readonly_static` are provided for binding
+    static variables and properties. Please also see the section on
+    :ref:`static_properties` in the advanced part of the documentation.
+
+Dynamic attributes
+==================
+
+Native Python classes can pick up new attributes dynamically:
+
+.. code-block:: pycon
+
+    >>> class Pet:
+    ...     name = 'Molly'
+    ...
+    >>> p = Pet()
+    >>> p.name = 'Charly'  # overwrite existing
+    >>> p.age = 2  # dynamically add a new attribute
+
+By default, classes exported from C++ do not support this and the only writable
+attributes are the ones explicitly defined using :func:`class_::def_readwrite`
+or :func:`class_::def_property`.
+
+.. code-block:: cpp
+
+    py::class_<Pet>(m, "Pet")
+        .def(py::init<>())
+        .def_readwrite("name", &Pet::name);
+
+Trying to set any other attribute results in an error:
+
+.. code-block:: pycon
+
+    >>> p = example.Pet()
+    >>> p.name = 'Charly'  # OK, attribute defined in C++
+    >>> p.age = 2  # fail
+    AttributeError: 'Pet' object has no attribute 'age'
+
+To enable dynamic attributes for C++ classes, the :class:`py::dynamic_attr` tag
+must be added to the :class:`py::class_` constructor:
+
+.. code-block:: cpp
+
+    py::class_<Pet>(m, "Pet", py::dynamic_attr())
+        .def(py::init<>())
+        .def_readwrite("name", &Pet::name);
+
+Now everything works as expected:
+
+.. code-block:: pycon
+
+    >>> p = example.Pet()
+    >>> p.name = 'Charly'  # OK, overwrite value in C++
+    >>> p.age = 2  # OK, dynamically add a new attribute
+    >>> p.__dict__  # just like a native Python class
+    {'age': 2}
+
+Note that there is a small runtime cost for a class with dynamic attributes.
+Not only because of the addition of a ``__dict__``, but also because of more
+expensive garbage collection tracking which must be activated to resolve
+possible circular references. Native Python classes incur this same cost by
+default, so this is not anything to worry about. By default, pybind11 classes
+are more efficient than native Python classes. Enabling dynamic attributes
+just brings them on par.
+
+.. _inheritance:
+
+Inheritance and automatic downcasting
+=====================================
+
+Suppose now that the example consists of two data structures with an
+inheritance relationship:
+
+.. code-block:: cpp
+
+    struct Pet {
+        Pet(const std::string &name) : name(name) { }
+        std::string name;
+    };
+
+    struct Dog : Pet {
+        Dog(const std::string &name) : Pet(name) { }
+        std::string bark() const { return "woof!"; }
+    };
+
+There are two different ways of indicating a hierarchical relationship to
+pybind11: the first specifies the C++ base class as an extra template
+parameter of the :class:`class_`:
+
+.. code-block:: cpp
+
+    py::class_<Pet>(m, "Pet")
+       .def(py::init<const std::string &>())
+       .def_readwrite("name", &Pet::name);
+
+    // Method 1: template parameter:
+    py::class_<Dog, Pet /* <- specify C++ parent type */>(m, "Dog")
+        .def(py::init<const std::string &>())
+        .def("bark", &Dog::bark);
+
+Alternatively, we can also assign a name to the previously bound ``Pet``
+:class:`class_` object and reference it when binding the ``Dog`` class:
+
+.. code-block:: cpp
+
+    py::class_<Pet> pet(m, "Pet");
+    pet.def(py::init<const std::string &>())
+       .def_readwrite("name", &Pet::name);
+
+    // Method 2: pass parent class_ object:
+    py::class_<Dog>(m, "Dog", pet /* <- specify Python parent type */)
+        .def(py::init<const std::string &>())
+        .def("bark", &Dog::bark);
+
+Functionality-wise, both approaches are equivalent. Afterwards, instances will
+expose fields and methods of both types:
+
+.. code-block:: pycon
+
+    >>> p = example.Dog('Molly')
+    >>> p.name
+    u'Molly'
+    >>> p.bark()
+    u'woof!'
+
+The C++ classes defined above are regular non-polymorphic types with an
+inheritance relationship. This is reflected in Python:
+
+.. code-block:: cpp
+
+    // Return a base pointer to a derived instance
+    m.def("pet_store", []() { return std::unique_ptr<Pet>(new Dog("Molly")); });
+
+.. code-block:: pycon
+
+    >>> p = example.pet_store()
+    >>> type(p)  # `Dog` instance behind `Pet` pointer
+    Pet          # no pointer downcasting for regular non-polymorphic types
+    >>> p.bark()
+    AttributeError: 'Pet' object has no attribute 'bark'
+
+The function returned a ``Dog`` instance, but because it's a non-polymorphic
+type behind a base pointer, Python only sees a ``Pet``. In C++, a type is only
+considered polymorphic if it has at least one virtual function and pybind11
+will automatically recognize this:
+
+.. code-block:: cpp
+
+    struct PolymorphicPet {
+        virtual ~PolymorphicPet() = default;
+    };
+
+    struct PolymorphicDog : PolymorphicPet {
+        std::string bark() const { return "woof!"; }
+    };
+
+    // Same binding code
+    py::class_<PolymorphicPet>(m, "PolymorphicPet");
+    py::class_<PolymorphicDog, PolymorphicPet>(m, "PolymorphicDog")
+        .def(py::init<>())
+        .def("bark", &PolymorphicDog::bark);
+
+    // Again, return a base pointer to a derived instance
+    m.def("pet_store2", []() { return std::unique_ptr<PolymorphicPet>(new PolymorphicDog); });
+
+.. code-block:: pycon
+
+    >>> p = example.pet_store2()
+    >>> type(p)
+    PolymorphicDog  # automatically downcast
+    >>> p.bark()
+    u'woof!'
+
+Given a pointer to a polymorphic base, pybind11 performs automatic downcasting
+to the actual derived type. Note that this goes beyond the usual situation in
+C++: we don't just get access to the virtual functions of the base, we get the
+concrete derived type including functions and attributes that the base type may
+not even be aware of.
+
+.. seealso::
+
+    For more information about polymorphic behavior see :ref:`overriding_virtuals`.
+
+
+Overloaded methods
+==================
+
+Sometimes there are several overloaded C++ methods with the same name taking
+different kinds of input arguments:
+
+.. code-block:: cpp
+
+    struct Pet {
+        Pet(const std::string &name, int age) : name(name), age(age) { }
+
+        void set(int age_) { age = age_; }
+        void set(const std::string &name_) { name = name_; }
+
+        std::string name;
+        int age;
+    };
+
+Attempting to bind ``Pet::set`` will cause an error since the compiler does not
+know which method the user intended to select. We can disambiguate by casting
+them to function pointers. Binding multiple functions to the same Python name
+automatically creates a chain of function overloads that will be tried in
+sequence.
+
+.. code-block:: cpp
+
+    py::class_<Pet>(m, "Pet")
+       .def(py::init<const std::string &, int>())
+       .def("set", (void (Pet::*)(int)) &Pet::set, "Set the pet's age")
+       .def("set", (void (Pet::*)(const std::string &)) &Pet::set, "Set the pet's name");
+
+The overload signatures are also visible in the method's docstring:
+
+.. code-block:: pycon
+
+    >>> help(example.Pet)
+
+    class Pet(__builtin__.object)
+     |  Methods defined here:
+     |
+     |  __init__(...)
+     |      Signature : (Pet, str, int) -> NoneType
+     |
+     |  set(...)
+     |      1. Signature : (Pet, int) -> NoneType
+     |
+     |      Set the pet's age
+     |
+     |      2. Signature : (Pet, str) -> NoneType
+     |
+     |      Set the pet's name
+
+If you have a C++14 compatible compiler [#cpp14]_, you can use an alternative
+syntax to cast the overloaded function:
+
+.. code-block:: cpp
+
+    py::class_<Pet>(m, "Pet")
+        .def("set", py::overload_cast<int>(&Pet::set), "Set the pet's age")
+        .def("set", py::overload_cast<const std::string &>(&Pet::set), "Set the pet's name");
+
+Here, ``py::overload_cast`` only requires the parameter types to be specified.
+The return type and class are deduced. This avoids the additional noise of
+``void (Pet::*)()`` as seen in the raw cast. If a function is overloaded based
+on constness, the ``py::const_`` tag should be used:
+
+.. code-block:: cpp
+
+    struct Widget {
+        int foo(int x, float y);
+        int foo(int x, float y) const;
+    };
+
+    py::class_<Widget>(m, "Widget")
+       .def("foo_mutable", py::overload_cast<int, float>(&Widget::foo))
+       .def("foo_const",   py::overload_cast<int, float>(&Widget::foo, py::const_));
+
+If you prefer the ``py::overload_cast`` syntax but have a C++11 compatible compiler only,
+you can use ``py::detail::overload_cast_impl`` with an additional set of parentheses:
+
+.. code-block:: cpp
+
+    template <typename... Args>
+    using overload_cast_ = pybind11::detail::overload_cast_impl<Args...>;
+
+    py::class_<Pet>(m, "Pet")
+        .def("set", overload_cast_<int>()(&Pet::set), "Set the pet's age")
+        .def("set", overload_cast_<const std::string &>()(&Pet::set), "Set the pet's name");
+
+.. [#cpp14] A compiler which supports the ``-std=c++14`` flag
+            or Visual Studio 2015 Update 2 and newer.
+
+.. note::
+
+    To define multiple overloaded constructors, simply declare one after the
+    other using the ``.def(py::init<...>())`` syntax. The existing machinery
+    for specifying keyword and default arguments also works.
+
+Enumerations and internal types
+===============================
+
+Let's now suppose that the example class contains an internal enumeration type,
+e.g.:
+
+.. code-block:: cpp
+
+    struct Pet {
+        enum Kind {
+            Dog = 0,
+            Cat
+        };
+
+        Pet(const std::string &name, Kind type) : name(name), type(type) { }
+
+        std::string name;
+        Kind type;
+    };
+
+The binding code for this example looks as follows:
+
+.. code-block:: cpp
+
+    py::class_<Pet> pet(m, "Pet");
+
+    pet.def(py::init<const std::string &, Pet::Kind>())
+        .def_readwrite("name", &Pet::name)
+        .def_readwrite("type", &Pet::type);
+
+    py::enum_<Pet::Kind>(pet, "Kind")
+        .value("Dog", Pet::Kind::Dog)
+        .value("Cat", Pet::Kind::Cat)
+        .export_values();
+
+To ensure that the ``Kind`` type is created within the scope of ``Pet``, the
+``pet`` :class:`class_` instance must be supplied to the :class:`enum_`.
+constructor. The :func:`enum_::export_values` function exports the enum entries
+into the parent scope, which should be skipped for newer C++11-style strongly
+typed enums.
+
+.. code-block:: pycon
+
+    >>> p = Pet('Lucy', Pet.Cat)
+    >>> p.type
+    Kind.Cat
+    >>> int(p.type)
+    1L
+
+The entries defined by the enumeration type are exposed in the ``__members__`` property:
+
+.. code-block:: pycon
+
+    >>> Pet.Kind.__members__
+    {'Dog': Kind.Dog, 'Cat': Kind.Cat}
+
+The ``name`` property returns the name of the enum value as a unicode string.
+
+.. note::
+
+    It is also possible to use ``str(enum)``, however these accomplish different
+    goals. The following shows how these two approaches differ.
+
+    .. code-block:: pycon
+
+        >>> p = Pet( "Lucy", Pet.Cat )
+        >>> pet_type = p.type
+        >>> pet_type
+        Pet.Cat
+        >>> str(pet_type)
+        'Pet.Cat'
+        >>> pet_type.name
+        'Cat'
+
+.. note::
+
+    When the special tag ``py::arithmetic()`` is specified to the ``enum_``
+    constructor, pybind11 creates an enumeration that also supports rudimentary
+    arithmetic and bit-level operations like comparisons, and, or, xor, negation,
+    etc.
+
+    .. code-block:: cpp
+
+        py::enum_<Pet::Kind>(pet, "Kind", py::arithmetic())
+           ...
+
+    By default, these are omitted to conserve space.
diff --git a/pybind11/docs/compiling.rst b/pybind11/docs/compiling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..72b0c1eecf352e4ae6657cd6c5293542eba63ec5
--- /dev/null
+++ b/pybind11/docs/compiling.rst
@@ -0,0 +1,400 @@
+.. _compiling:
+
+Build systems
+#############
+
+Building with setuptools
+========================
+
+For projects on PyPI, building with setuptools is the way to go. Sylvain Corlay
+has kindly provided an example project which shows how to set up everything,
+including automatic generation of documentation using Sphinx. Please refer to
+the [python_example]_ repository.
+
+.. [python_example] https://github.com/pybind/python_example
+
+Building with cppimport
+========================
+
+[cppimport]_ is a small Python import hook that determines whether there is a C++
+source file whose name matches the requested module. If there is, the file is
+compiled as a Python extension using pybind11 and placed in the same folder as
+the C++ source file. Python is then able to find the module and load it.
+
+.. [cppimport] https://github.com/tbenthompson/cppimport
+
+.. _cmake:
+
+Building with CMake
+===================
+
+For C++ codebases that have an existing CMake-based build system, a Python
+extension module can be created with just a few lines of code:
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.4...3.18)
+    project(example LANGUAGES CXX)
+
+    add_subdirectory(pybind11)
+    pybind11_add_module(example example.cpp)
+
+This assumes that the pybind11 repository is located in a subdirectory named
+:file:`pybind11` and that the code is located in a file named :file:`example.cpp`.
+The CMake command ``add_subdirectory`` will import the pybind11 project which
+provides the ``pybind11_add_module`` function. It will take care of all the
+details needed to build a Python extension module on any platform.
+
+A working sample project, including a way to invoke CMake from :file:`setup.py` for
+PyPI integration, can be found in the [cmake_example]_  repository.
+
+.. [cmake_example] https://github.com/pybind/cmake_example
+
+.. versionchanged:: 2.6
+   CMake 3.4+ is required.
+
+pybind11_add_module
+-------------------
+
+To ease the creation of Python extension modules, pybind11 provides a CMake
+function with the following signature:
+
+.. code-block:: cmake
+
+    pybind11_add_module(<name> [MODULE | SHARED] [EXCLUDE_FROM_ALL]
+                        [NO_EXTRAS] [THIN_LTO] source1 [source2 ...])
+
+This function behaves very much like CMake's builtin ``add_library`` (in fact,
+it's a wrapper function around that command). It will add a library target
+called ``<name>`` to be built from the listed source files. In addition, it
+will take care of all the Python-specific compiler and linker flags as well
+as the OS- and Python-version-specific file extension. The produced target
+``<name>`` can be further manipulated with regular CMake commands.
+
+``MODULE`` or ``SHARED`` may be given to specify the type of library. If no
+type is given, ``MODULE`` is used by default which ensures the creation of a
+Python-exclusive module. Specifying ``SHARED`` will create a more traditional
+dynamic library which can also be linked from elsewhere. ``EXCLUDE_FROM_ALL``
+removes this target from the default build (see CMake docs for details).
+
+Since pybind11 is a template library, ``pybind11_add_module`` adds compiler
+flags to ensure high quality code generation without bloat arising from long
+symbol names and duplication of code in different translation units. It
+sets default visibility to *hidden*, which is required for some pybind11
+features and functionality when attempting to load multiple pybind11 modules
+compiled under different pybind11 versions.  It also adds additional flags
+enabling LTO (Link Time Optimization) and strip unneeded symbols. See the
+:ref:`FAQ entry <faq:symhidden>` for a more detailed explanation. These
+latter optimizations are never applied in ``Debug`` mode.  If ``NO_EXTRAS`` is
+given, they will always be disabled, even in ``Release`` mode. However, this
+will result in code bloat and is generally not recommended.
+
+As stated above, LTO is enabled by default. Some newer compilers also support
+different flavors of LTO such as `ThinLTO`_. Setting ``THIN_LTO`` will cause
+the function to prefer this flavor if available. The function falls back to
+regular LTO if ``-flto=thin`` is not available. If
+``CMAKE_INTERPROCEDURAL_OPTIMIZATION`` is set (either ON or OFF), then that
+will be respected instead of the built-in flag search.
+
+.. _ThinLTO: http://clang.llvm.org/docs/ThinLTO.html
+
+Configuration variables
+-----------------------
+
+By default, pybind11 will compile modules with the compiler default or the
+minimum standard required by pybind11, whichever is higher.  You can set the
+standard explicitly with
+`CMAKE_CXX_STANDARD <https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_STANDARD.html>`_:
+
+.. code-block:: cmake
+
+    set(CMAKE_CXX_STANDARD 14)  # or 11, 14, 17, 20
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)  # optional, ensure standard is supported
+    set(CMAKE_CXX_EXTENSIONS OFF)  # optional, keep compiler extensionsn off
+
+
+The variables can also be set when calling CMake from the command line using
+the ``-D<variable>=<value>`` flag. You can also manually set ``CXX_STANDARD``
+on a target or use ``target_compile_features`` on your targets - anything that
+CMake supports.
+
+Classic Python support: The target Python version can be selected by setting
+``PYBIND11_PYTHON_VERSION`` or an exact Python installation can be specified
+with ``PYTHON_EXECUTABLE``.  For example:
+
+.. code-block:: bash
+
+    cmake -DPYBIND11_PYTHON_VERSION=3.6 ..
+
+    # Another method:
+    cmake -DPYTHON_EXECUTABLE=/path/to/python ..
+
+    # This often is a good way to get the current Python, works in environments:
+    cmake -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)") ..
+
+
+find_package vs. add_subdirectory
+---------------------------------
+
+For CMake-based projects that don't include the pybind11 repository internally,
+an external installation can be detected through ``find_package(pybind11)``.
+See the `Config file`_ docstring for details of relevant CMake variables.
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.4...3.18)
+    project(example LANGUAGES CXX)
+
+    find_package(pybind11 REQUIRED)
+    pybind11_add_module(example example.cpp)
+
+Note that ``find_package(pybind11)`` will only work correctly if pybind11
+has been correctly installed on the system, e. g. after downloading or cloning
+the pybind11 repository  :
+
+.. code-block:: bash
+
+    # Classic CMake
+    cd pybind11
+    mkdir build
+    cd build
+    cmake ..
+    make install
+
+    # CMake 3.15+
+    cd pybind11
+    cmake -S . -B build
+    cmake --build build -j 2  # Build on 2 cores
+    cmake --install build
+
+Once detected, the aforementioned ``pybind11_add_module`` can be employed as
+before. The function usage and configuration variables are identical no matter
+if pybind11 is added as a subdirectory or found as an installed package. You
+can refer to the same [cmake_example]_ repository for a full sample project
+-- just swap out ``add_subdirectory`` for ``find_package``.
+
+.. _Config file: https://github.com/pybind/pybind11/blob/master/tools/pybind11Config.cmake.in
+
+
+.. _find-python-mode:
+
+FindPython mode
+---------------
+
+CMake 3.12+ (3.15+ recommended) added a new module called FindPython that had a
+highly improved search algorithm and modern targets and tools. If you use
+FindPython, pybind11 will detect this and use the existing targets instead:
+
+.. code-block:: cmake
+
+    cmake_minumum_required(VERSION 3.15...3.18)
+    project(example LANGUAGES CXX)
+
+    find_package(Python COMPONENTS Interpreter Development REQUIRED)
+    find_package(pybind11 CONFIG REQUIRED)
+    # or add_subdirectory(pybind11)
+
+    pybind11_add_module(example example.cpp)
+
+You can also use the targets (as listed below) with FindPython. If you define
+``PYBIND11_FINDPYTHON``, pybind11 will perform the FindPython step for you
+(mostly useful when building pybind11's own tests, or as a way to change search
+algorithms from the CMake invocation, with ``-DPYBIND11_FINDPYTHON=ON``.
+
+.. warning::
+
+    If you use FindPython2 and FindPython3 to dual-target Python, use the
+    individual targets listed below, and avoid targets that directly include
+    Python parts.
+
+There are `many ways to hint or force a discovery of a specific Python
+installation <https://cmake.org/cmake/help/latest/module/FindPython.html>`_),
+setting ``Python_ROOT_DIR`` may be the most common one (though with
+virtualenv/venv support, and Conda support, this tends to find the correct
+Python version more often than the old system did).
+
+.. versionadded:: 2.6
+
+Advanced: interface library targets
+-----------------------------------
+
+Pybind11 supports modern CMake usage patterns with a set of interface targets,
+available in all modes. The targets provided are:
+
+   ``pybind11::headers``
+     Just the pybind11 headers and minimum compile requirements
+
+   ``pybind11::python2_no_register``
+     Quiets the warning/error when mixing C++14 or higher and Python 2
+
+   ``pybind11::pybind11``
+     Python headers + ``pybind11::headers`` + ``pybind11::python2_no_register`` (Python 2 only)
+
+   ``pybind11::python_link_helper``
+     Just the "linking" part of pybind11:module
+
+   ``pybind11::module``
+     Everything for extension modules - ``pybind11::pybind11`` + ``Python::Module`` (FindPython CMake 3.15+) or ``pybind11::python_link_helper``
+
+   ``pybind11::embed``
+     Everything for embedding the Python interpreter - ``pybind11::pybind11`` + ``Python::Embed`` (FindPython) or Python libs
+
+   ``pybind11::lto`` / ``pybind11::thin_lto``
+     An alternative to `INTERPROCEDURAL_OPTIMIZATION` for adding link-time optimization.
+
+   ``pybind11::windows_extras``
+     ``/bigobj`` and ``/mp`` for MSVC.
+
+Two helper functions are also provided:
+
+    ``pybind11_strip(target)``
+      Strips a target (uses ``CMAKE_STRIP`` after the target is built)
+
+    ``pybind11_extension(target)``
+      Sets the correct extension (with SOABI) for a target.
+
+You can use these targets to build complex applications. For example, the
+``add_python_module`` function is identical to:
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.4)
+    project(example LANGUAGES CXX)
+
+    find_package(pybind11 REQUIRED)  # or add_subdirectory(pybind11)
+
+    add_library(example MODULE main.cpp)
+
+    target_link_libraries(example PRIVATE pybind11::module pybind11::lto pybind11::windows_extras)
+
+    pybind11_extension(example)
+    pybind11_strip(example)
+
+    set_target_properties(example PROPERTIES CXX_VISIBILITY_PRESET "hidden"
+                                             CUDA_VISIBILITY_PRESET "hidden")
+
+Instead of setting properties, you can set ``CMAKE_*`` variables to initialize these correctly.
+
+.. warning::
+
+    Since pybind11 is a metatemplate library, it is crucial that certain
+    compiler flags are provided to ensure high quality code generation. In
+    contrast to the ``pybind11_add_module()`` command, the CMake interface
+    provides a *composable* set of targets to ensure that you retain flexibility.
+    It can be expecially important to provide or set these properties; the
+    :ref:`FAQ <faq:symhidden>` contains an explanation on why these are needed.
+
+.. versionadded:: 2.6
+
+.. _nopython-mode:
+
+Advanced: NOPYTHON mode
+-----------------------
+
+If you want complete control, you can set ``PYBIND11_NOPYTHON`` to completely
+disable Python integration (this also happens if you run ``FindPython2`` and
+``FindPython3`` without running ``FindPython``). This gives you complete
+freedom to integrate into an existing system (like `Scikit-Build's
+<https://scikit-build.readthedocs.io>`_ ``PythonExtensions``).
+``pybind11_add_module`` and ``pybind11_extension`` will be unavailable, and the
+targets will be missing any Python specific behavior.
+
+.. versionadded:: 2.6
+
+Embedding the Python interpreter
+--------------------------------
+
+In addition to extension modules, pybind11 also supports embedding Python into
+a C++ executable or library. In CMake, simply link with the ``pybind11::embed``
+target. It provides everything needed to get the interpreter running. The Python
+headers and libraries are attached to the target. Unlike ``pybind11::module``,
+there is no need to manually set any additional properties here. For more
+information about usage in C++, see :doc:`/advanced/embedding`.
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.4...3.18)
+    project(example LANGUAGES CXX)
+
+    find_package(pybind11 REQUIRED)  # or add_subdirectory(pybind11)
+
+    add_executable(example main.cpp)
+    target_link_libraries(example PRIVATE pybind11::embed)
+
+.. _building_manually:
+
+Building manually
+=================
+
+pybind11 is a header-only library, hence it is not necessary to link against
+any special libraries and there are no intermediate (magic) translation steps.
+
+On Linux, you can compile an example such as the one given in
+:ref:`simple_example` using the following command:
+
+.. code-block:: bash
+
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+The flags given here assume that you're using Python 3. For Python 2, just
+change the executable appropriately (to ``python`` or ``python2``).
+
+The ``python3 -m pybind11 --includes`` command fetches the include paths for
+both pybind11 and Python headers. This assumes that pybind11 has been installed
+using ``pip`` or ``conda``. If it hasn't, you can also manually specify
+``-I <path-to-pybind11>/include`` together with the Python includes path
+``python3-config --includes``.
+
+Note that Python 2.7 modules don't use a special suffix, so you should simply
+use ``example.so`` instead of ``example`python3-config --extension-suffix```.
+Besides, the ``--extension-suffix`` option may or may not be available, depending
+on the distribution; in the latter case, the module extension can be manually
+set to ``.so``.
+
+On Mac OS: the build command is almost the same but it also requires passing
+the ``-undefined dynamic_lookup`` flag so as to ignore missing symbols when
+building the module:
+
+.. code-block:: bash
+
+    $ c++ -O3 -Wall -shared -std=c++11 -undefined dynamic_lookup `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+In general, it is advisable to include several additional build parameters
+that can considerably reduce the size of the created binary. Refer to section
+:ref:`cmake` for a detailed example of a suitable cross-platform CMake-based
+build system that works on all platforms including Windows.
+
+.. note::
+
+    On Linux and macOS, it's better to (intentionally) not link against
+    ``libpython``. The symbols will be resolved when the extension library
+    is loaded into a Python binary. This is preferable because you might
+    have several different installations of a given Python version (e.g. the
+    system-provided Python, and one that ships with a piece of commercial
+    software). In this way, the plugin will work with both versions, instead
+    of possibly importing a second Python library into a process that already
+    contains one (which will lead to a segfault).
+
+Generating binding code automatically
+=====================================
+
+The ``Binder`` project is a tool for automatic generation of pybind11 binding
+code by introspecting existing C++ codebases using LLVM/Clang. See the
+[binder]_ documentation for details.
+
+.. [binder] http://cppbinder.readthedocs.io/en/latest/about.html
+
+[AutoWIG]_ is a Python library that wraps automatically compiled libraries into
+high-level languages. It parses C++ code using LLVM/Clang technologies and
+generates the wrappers using the Mako templating engine. The approach is automatic,
+extensible, and applies to very complex C++ libraries, composed of thousands of
+classes or incorporating modern meta-programming constructs.
+
+.. [AutoWIG] https://github.com/StatisKit/AutoWIG
+
+[robotpy-build]_ is a is a pure python, cross platform build tool that aims to
+simplify creation of python wheels for pybind11 projects, and provide
+cross-project dependency management. Additionally, it is able to autogenerate
+customizable pybind11-based wrappers by parsing C++ header files.
+
+.. [robotpy-build] https://robotpy-build.readthedocs.io
diff --git a/pybind11/docs/conf.py b/pybind11/docs/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..0946f30e2e1ddea55a7d4c4069b8a989a29fe5e9
--- /dev/null
+++ b/pybind11/docs/conf.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# pybind11 documentation build configuration file, created by
+# sphinx-quickstart on Sun Oct 11 19:23:48 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+import subprocess
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe']
+
+breathe_projects = {'pybind11': '.build/doxygenxml/'}
+breathe_default_project = 'pybind11'
+breathe_domain_by_extension = {'h': 'cpp'}
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['.templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'pybind11'
+copyright = '2017, Wenzel Jakob'
+author = 'Wenzel Jakob'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '2.5'
+# The full version, including alpha/beta/rc tags.
+release = '2.5.dev1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['.build', 'release.rst']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+default_role = 'any'
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+#pygments_style = 'monokai'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+
+if not on_rtd:  # only import and set the theme if we're building docs locally
+    import sphinx_rtd_theme
+    html_theme = 'sphinx_rtd_theme'
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+    html_context = {
+        'css_files': [
+            '_static/theme_overrides.css'
+        ]
+    }
+else:
+    html_context = {
+        'css_files': [
+            '//media.readthedocs.org/css/sphinx_rtd_theme.css',
+            '//media.readthedocs.org/css/readthedocs-doc-embed.css',
+            '_static/theme_overrides.css'
+        ]
+    }
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pybind11doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+'preamble': r'\DeclareUnicodeCharacter{00A0}{}',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'pybind11.tex', 'pybind11 Documentation',
+   'Wenzel Jakob', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+# latex_logo = 'pybind11-logo.png'
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pybind11', 'pybind11 Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'pybind11', 'pybind11 Documentation',
+   author, 'pybind11', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
+
+primary_domain = 'cpp'
+highlight_language = 'cpp'
+
+
+def generate_doxygen_xml(app):
+    build_dir = os.path.join(app.confdir, '.build')
+    if not os.path.exists(build_dir):
+        os.mkdir(build_dir)
+
+    try:
+        subprocess.call(['doxygen', '--version'])
+        retcode = subprocess.call(['doxygen'], cwd=app.confdir)
+        if retcode < 0:
+            sys.stderr.write("doxygen error code: {}\n".format(-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: {}\n".format(e))
+
+
+def setup(app):
+    """Add hook for building doxygen xml when needed"""
+    app.connect("builder-inited", generate_doxygen_xml)
diff --git a/pybind11/docs/faq.rst b/pybind11/docs/faq.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b68562910ad271c2417a15f35aa81cba03312e45
--- /dev/null
+++ b/pybind11/docs/faq.rst
@@ -0,0 +1,324 @@
+Frequently asked questions
+##########################
+
+"ImportError: dynamic module does not define init function"
+===========================================================
+
+1. Make sure that the name specified in PYBIND11_MODULE is identical to the
+filename of the extension library (without suffixes such as .so)
+
+2. If the above did not fix the issue, you are likely using an incompatible
+version of Python (for instance, the extension library was compiled against
+Python 2, while the interpreter is running on top of some version of Python
+3, or vice versa).
+
+"Symbol not found: ``__Py_ZeroStruct`` / ``_PyInstanceMethod_Type``"
+========================================================================
+
+See the first answer.
+
+"SystemError: dynamic module not initialized properly"
+======================================================
+
+See the first answer.
+
+The Python interpreter immediately crashes when importing my module
+===================================================================
+
+See the first answer.
+
+CMake doesn't detect the right Python version
+=============================================
+
+The CMake-based build system will try to automatically detect the installed
+version of Python and link against that. When this fails, or when there are
+multiple versions of Python and it finds the wrong one, delete
+``CMakeCache.txt`` and then invoke CMake as follows:
+
+.. code-block:: bash
+
+    cmake -DPYTHON_EXECUTABLE:FILEPATH=<path-to-python-executable> .
+
+.. _faq_reference_arguments:
+
+Limitations involving reference arguments
+=========================================
+
+In C++, it's fairly common to pass arguments using mutable references or
+mutable pointers, which allows both read and write access to the value
+supplied by the caller. This is sometimes done for efficiency reasons, or to
+realize functions that have multiple return values. Here are two very basic
+examples:
+
+.. code-block:: cpp
+
+    void increment(int &i) { i++; }
+    void increment_ptr(int *i) { (*i)++; }
+
+In Python, all arguments are passed by reference, so there is no general
+issue in binding such code from Python.
+
+However, certain basic Python types (like ``str``, ``int``, ``bool``,
+``float``, etc.) are **immutable**. This means that the following attempt
+to port the function to Python doesn't have the same effect on the value
+provided by the caller -- in fact, it does nothing at all.
+
+.. code-block:: python
+
+    def increment(i):
+        i += 1 # nope..
+
+pybind11 is also affected by such language-level conventions, which means that
+binding ``increment`` or ``increment_ptr`` will also create Python functions
+that don't modify their arguments.
+
+Although inconvenient, one workaround is to encapsulate the immutable types in
+a custom type that does allow modifications.
+
+An other alternative involves binding a small wrapper lambda function that
+returns a tuple with all output arguments (see the remainder of the
+documentation for examples on binding lambda functions). An example:
+
+.. code-block:: cpp
+
+    int foo(int &i) { i++; return 123; }
+
+and the binding code
+
+.. code-block:: cpp
+
+   m.def("foo", [](int i) { int rv = foo(i); return std::make_tuple(rv, i); });
+
+
+How can I reduce the build time?
+================================
+
+It's good practice to split binding code over multiple files, as in the
+following example:
+
+:file:`example.cpp`:
+
+.. code-block:: cpp
+
+    void init_ex1(py::module &);
+    void init_ex2(py::module &);
+    /* ... */
+
+    PYBIND11_MODULE(example, m) {
+        init_ex1(m);
+        init_ex2(m);
+        /* ... */
+    }
+
+:file:`ex1.cpp`:
+
+.. code-block:: cpp
+
+    void init_ex1(py::module &m) {
+        m.def("add", [](int a, int b) { return a + b; });
+    }
+
+:file:`ex2.cpp`:
+
+.. code-block:: cpp
+
+    void init_ex2(py::module &m) {
+        m.def("sub", [](int a, int b) { return a - b; });
+    }
+
+:command:`python`:
+
+.. code-block:: pycon
+
+    >>> import example
+    >>> example.add(1, 2)
+    3
+    >>> example.sub(1, 1)
+    0
+
+As shown above, the various ``init_ex`` functions should be contained in
+separate files that can be compiled independently from one another, and then
+linked together into the same final shared object.  Following this approach
+will:
+
+1. reduce memory requirements per compilation unit.
+
+2. enable parallel builds (if desired).
+
+3. allow for faster incremental builds. For instance, when a single class
+   definition is changed, only a subset of the binding code will generally need
+   to be recompiled.
+
+"recursive template instantiation exceeded maximum depth of 256"
+================================================================
+
+If you receive an error about excessive recursive template evaluation, try
+specifying a larger value, e.g. ``-ftemplate-depth=1024`` on GCC/Clang. The
+culprit is generally the generation of function signatures at compile time
+using C++14 template metaprogramming.
+
+.. _`faq:hidden_visibility`:
+
+"‘SomeClass’ declared with greater visibility than the type of its field ‘SomeClass::member’ [-Wattributes]"
+============================================================================================================
+
+This error typically indicates that you are compiling without the required
+``-fvisibility`` flag.  pybind11 code internally forces hidden visibility on
+all internal code, but if non-hidden (and thus *exported*) code attempts to
+include a pybind type (for example, ``py::object`` or ``py::list``) you can run
+into this warning.
+
+To avoid it, make sure you are specifying ``-fvisibility=hidden`` when
+compiling pybind code.
+
+As to why ``-fvisibility=hidden`` is necessary, because pybind modules could
+have been compiled under different versions of pybind itself, it is also
+important that the symbols defined in one module do not clash with the
+potentially-incompatible symbols defined in another.  While Python extension
+modules are usually loaded with localized symbols (under POSIX systems
+typically using ``dlopen`` with the ``RTLD_LOCAL`` flag), this Python default
+can be changed, but even if it isn't it is not always enough to guarantee
+complete independence of the symbols involved when not using
+``-fvisibility=hidden``.
+
+Additionally, ``-fvisiblity=hidden`` can deliver considerably binary size
+savings.  (See the following section for more details).
+
+
+.. _`faq:symhidden`:
+
+How can I create smaller binaries?
+==================================
+
+To do its job, pybind11 extensively relies on a programming technique known as
+*template metaprogramming*, which is a way of performing computation at compile
+time using type information. Template metaprogamming usually instantiates code
+involving significant numbers of deeply nested types that are either completely
+removed or reduced to just a few instructions during the compiler's optimization
+phase. However, due to the nested nature of these types, the resulting symbol
+names in the compiled extension library can be extremely long. For instance,
+the included test suite contains the following symbol:
+
+.. only:: html
+
+    .. code-block:: none
+
+        _​_​Z​N​8​p​y​b​i​n​d​1​1​1​2​c​p​p​_​f​u​n​c​t​i​o​n​C​1​I​v​8​E​x​a​m​p​l​e​2​J​R​N​S​t​3​_​_​1​6​v​e​c​t​o​r​I​N​S​3​_​1​2​b​a​s​i​c​_​s​t​r​i​n​g​I​w​N​S​3​_​1​1​c​h​a​r​_​t​r​a​i​t​s​I​w​E​E​N​S​3​_​9​a​l​l​o​c​a​t​o​r​I​w​E​E​E​E​N​S​8​_​I​S​A​_​E​E​E​E​E​J​N​S​_​4​n​a​m​e​E​N​S​_​7​s​i​b​l​i​n​g​E​N​S​_​9​i​s​_​m​e​t​h​o​d​E​A​2​8​_​c​E​E​E​M​T​0​_​F​T​_​D​p​T​1​_​E​D​p​R​K​T​2​_
+
+.. only:: not html
+
+    .. code-block:: cpp
+
+        __ZN8pybind1112cpp_functionC1Iv8Example2JRNSt3__16vectorINS3_12basic_stringIwNS3_11char_traitsIwEENS3_9allocatorIwEEEENS8_ISA_EEEEEJNS_4nameENS_7siblingENS_9is_methodEA28_cEEEMT0_FT_DpT1_EDpRKT2_
+
+which is the mangled form of the following function type:
+
+.. code-block:: cpp
+
+    pybind11::cpp_function::cpp_function<void, Example2, std::__1::vector<std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> >, std::__1::allocator<std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> > > >&, pybind11::name, pybind11::sibling, pybind11::is_method, char [28]>(void (Example2::*)(std::__1::vector<std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> >, std::__1::allocator<std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> > > >&), pybind11::name const&, pybind11::sibling const&, pybind11::is_method const&, char const (&) [28])
+
+The memory needed to store just the mangled name of this function (196 bytes)
+is larger than the actual piece of code (111 bytes) it represents! On the other
+hand, it's silly to even give this function a name -- after all, it's just a
+tiny cog in a bigger piece of machinery that is not exposed to the outside
+world. So we'll generally only want to export symbols for those functions which
+are actually called from the outside.
+
+This can be achieved by specifying the parameter ``-fvisibility=hidden`` to GCC
+and Clang, which sets the default symbol visibility to *hidden*, which has a
+tremendous impact on the final binary size of the resulting extension library.
+(On Visual Studio, symbols are already hidden by default, so nothing needs to
+be done there.)
+
+In addition to decreasing binary size, ``-fvisibility=hidden`` also avoids
+potential serious issues when loading multiple modules and is required for
+proper pybind operation.  See the previous FAQ entry for more details.
+
+Working with ancient Visual Studio 2008 builds on Windows
+=========================================================
+
+The official Windows distributions of Python are compiled using truly
+ancient versions of Visual Studio that lack good C++11 support. Some users
+implicitly assume that it would be impossible to load a plugin built with
+Visual Studio 2015 into a Python distribution that was compiled using Visual
+Studio 2008. However, no such issue exists: it's perfectly legitimate to
+interface DLLs that are built with different compilers and/or C libraries.
+Common gotchas to watch out for involve not ``free()``-ing memory region
+that that were ``malloc()``-ed in another shared library, using data
+structures with incompatible ABIs, and so on. pybind11 is very careful not
+to make these types of mistakes.
+
+How can I properly handle Ctrl-C in long-running functions?
+===========================================================
+
+Ctrl-C is received by the Python interpreter, and holds it until the GIL
+is released, so a long-running function won't be interrupted.
+
+To interrupt from inside your function, you can use the ``PyErr_CheckSignals()``
+function, that will tell if a signal has been raised on the Python side.  This
+function merely checks a flag, so its impact is negligible. When a signal has
+been received, you must either explicitly interrupt execution by throwing
+``py::error_already_set`` (which will propagate the existing
+``KeyboardInterrupt``), or clear the error (which you usually will not want):
+
+.. code-block:: cpp
+
+    PYBIND11_MODULE(example, m)
+    {
+        m.def("long running_func", []()
+        {
+            for (;;) {
+                if (PyErr_CheckSignals() != 0)
+                    throw py::error_already_set();
+                // Long running iteration
+            }
+        });
+    }
+
+Inconsistent detection of Python version in CMake and pybind11
+==============================================================
+
+The functions ``find_package(PythonInterp)`` and ``find_package(PythonLibs)`` provided by CMake
+for Python version detection are not used by pybind11 due to unreliability and limitations that make
+them unsuitable for pybind11's needs. Instead pybind provides its own, more reliable Python detection
+CMake code. Conflicts can arise, however, when using pybind11 in a project that *also* uses the CMake
+Python detection in a system with several Python versions installed.
+
+This difference may cause inconsistencies and errors if *both* mechanisms are used in the same project. Consider the following
+Cmake code executed in a system with Python 2.7 and 3.x installed:
+
+.. code-block:: cmake
+
+    find_package(PythonInterp)
+    find_package(PythonLibs)
+    find_package(pybind11)
+
+It will detect Python 2.7 and pybind11 will pick it as well.
+
+In contrast this code:
+
+.. code-block:: cmake
+
+    find_package(pybind11)
+    find_package(PythonInterp)
+    find_package(PythonLibs)
+
+will detect Python 3.x for pybind11 and may crash on ``find_package(PythonLibs)`` afterwards.
+
+It is advised to avoid using ``find_package(PythonInterp)`` and ``find_package(PythonLibs)`` from CMake and rely
+on pybind11 in detecting Python version. If this is not possible CMake machinery should be called *before* including pybind11.
+
+How to cite this project?
+=========================
+
+We suggest the following BibTeX template to cite pybind11 in scientific
+discourse:
+
+.. code-block:: bash
+
+    @misc{pybind11,
+       author = {Wenzel Jakob and Jason Rhinelander and Dean Moldovan},
+       year = {2017},
+       note = {https://github.com/pybind/pybind11},
+       title = {pybind11 -- Seamless operability between C++11 and Python}
+    }
diff --git a/pybind11/docs/index.rst b/pybind11/docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d236611b7224454415f395194cb72f783a42af37
--- /dev/null
+++ b/pybind11/docs/index.rst
@@ -0,0 +1,47 @@
+.. only: not latex
+
+    .. image:: pybind11-logo.png
+
+pybind11 --- Seamless operability between C++11 and Python
+==========================================================
+
+.. only: not latex
+
+    Contents:
+
+.. toctree::
+   :maxdepth: 1
+
+   intro
+   changelog
+   upgrade
+
+.. toctree::
+   :caption: The Basics
+   :maxdepth: 2
+
+   basics
+   classes
+   compiling
+
+.. toctree::
+   :caption: Advanced Topics
+   :maxdepth: 2
+
+   advanced/functions
+   advanced/classes
+   advanced/exceptions
+   advanced/smart_ptrs
+   advanced/cast/index
+   advanced/pycpp/index
+   advanced/embedding
+   advanced/misc
+
+.. toctree::
+   :caption: Extra Information
+   :maxdepth: 1
+
+   faq
+   benchmark
+   limitations
+   reference
diff --git a/pybind11/docs/intro.rst b/pybind11/docs/intro.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10e1799a19d4a2be8efb8f58515b290ef36514f8
--- /dev/null
+++ b/pybind11/docs/intro.rst
@@ -0,0 +1,93 @@
+.. image:: pybind11-logo.png
+
+About this project
+==================
+**pybind11** is a lightweight header-only library that exposes C++ types in Python
+and vice versa, mainly to create Python bindings of existing C++ code. Its
+goals and syntax are similar to the excellent `Boost.Python`_ library by David
+Abrahams: to minimize boilerplate code in traditional extension modules by
+inferring type information using compile-time introspection.
+
+.. _Boost.Python: http://www.boost.org/doc/libs/release/libs/python/doc/index.html
+
+The main issue with Boost.Python—and the reason for creating such a similar
+project—is Boost. Boost is an enormously large and complex suite of utility
+libraries that works with almost every C++ compiler in existence. This
+compatibility has its cost: arcane template tricks and workarounds are
+necessary to support the oldest and buggiest of compiler specimens. Now that
+C++11-compatible compilers are widely available, this heavy machinery has
+become an excessively large and unnecessary dependency.
+Think of this library as a tiny self-contained version of Boost.Python with
+everything stripped away that isn't relevant for binding generation. Without
+comments, the core header files only require ~4K lines of code and depend on
+Python (2.7 or 3.x, or PyPy2.7 >= 5.7) and the C++ standard library. This
+compact implementation was possible thanks to some of the new C++11 language
+features (specifically: tuples, lambda functions and variadic templates). Since
+its creation, this library has grown beyond Boost.Python in many ways, leading
+to dramatically simpler binding code in many common situations.
+
+Core features
+*************
+The following core C++ features can be mapped to Python
+
+- Functions accepting and returning custom data structures per value, reference, or pointer
+- Instance methods and static methods
+- Overloaded functions
+- Instance attributes and static attributes
+- Arbitrary exception types
+- Enumerations
+- Callbacks
+- Iterators and ranges
+- Custom operators
+- Single and multiple inheritance
+- STL data structures
+- Smart pointers with reference counting like ``std::shared_ptr``
+- Internal references with correct reference counting
+- C++ classes with virtual (and pure virtual) methods can be extended in Python
+
+Goodies
+*******
+In addition to the core functionality, pybind11 provides some extra goodies:
+
+- Python 2.7, 3.x, and PyPy (PyPy2.7 >= 5.7) are supported with an
+  implementation-agnostic interface.
+
+- It is possible to bind C++11 lambda functions with captured variables. The
+  lambda capture data is stored inside the resulting Python function object.
+
+- pybind11 uses C++11 move constructors and move assignment operators whenever
+  possible to efficiently transfer custom data types.
+
+- It's easy to expose the internal storage of custom data types through
+  Pythons' buffer protocols. This is handy e.g. for fast conversion between
+  C++ matrix classes like Eigen and NumPy without expensive copy operations.
+
+- pybind11 can automatically vectorize functions so that they are transparently
+  applied to all entries of one or more NumPy array arguments.
+
+- Python's slice-based access and assignment operations can be supported with
+  just a few lines of code.
+
+- Everything is contained in just a few header files; there is no need to link
+  against any additional libraries.
+
+- Binaries are generally smaller by a factor of at least 2 compared to
+  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
+  of `PyRosetta`_, an enormous Boost.Python binding project, reported a binary
+  size reduction of **5.4x** and compile time reduction by **5.8x**.
+
+- Function signatures are precomputed at compile time (using ``constexpr``),
+  leading to smaller binaries.
+
+- With little extra effort, C++ types can be pickled and unpickled similar to
+  regular Python objects.
+
+.. _PyRosetta: http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
+
+Supported compilers
+*******************
+
+1. Clang/LLVM (any non-ancient version with C++11 support)
+2. GCC 4.8 or newer
+3. Microsoft Visual Studio 2015 or newer
+4. Intel C++ compiler v17 or newer (v16 with pybind11 v2.0 and v15 with pybind11 v2.0 and a `workaround <https://github.com/pybind/pybind11/issues/276>`_ )
diff --git a/pybind11/docs/limitations.rst b/pybind11/docs/limitations.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59474f82fd9f9f5834d35430cc283f8b57ed10dc
--- /dev/null
+++ b/pybind11/docs/limitations.rst
@@ -0,0 +1,19 @@
+Limitations
+###########
+
+pybind11 strives to be a general solution to binding generation, but it also has
+certain limitations:
+
+- pybind11 casts away ``const``-ness in function arguments and return values.
+  This is in line with the Python language, which has no concept of ``const``
+  values. This means that some additional care is needed to avoid bugs that
+  would be caught by the type checker in a traditional C++ program.
+
+- The NumPy interface ``pybind11::array`` greatly simplifies accessing
+  numerical data from C++ (and vice versa), but it's not a full-blown array
+  class like ``Eigen::Array`` or ``boost.multi_array``.
+
+These features could be implemented but would lead to a significant increase in
+complexity. I've decided to draw the line here to keep this project simple and
+compact. Users who absolutely require these features are encouraged to fork
+pybind11.
diff --git a/pybind11/docs/pybind11-logo.png b/pybind11/docs/pybind11-logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..4cbad54f797d3ced04d4048f282df5e4336d4af4
Binary files /dev/null and b/pybind11/docs/pybind11-logo.png differ
diff --git a/pybind11/docs/pybind11_vs_boost_python1.png b/pybind11/docs/pybind11_vs_boost_python1.png
new file mode 100644
index 0000000000000000000000000000000000000000..833231f240809884fb6eb4079db528b9b3c0a9ac
Binary files /dev/null and b/pybind11/docs/pybind11_vs_boost_python1.png differ
diff --git a/pybind11/docs/pybind11_vs_boost_python1.svg b/pybind11/docs/pybind11_vs_boost_python1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..5bf950e6fdc81676d9a9774926a623b4f6a2e2a8
--- /dev/null
+++ b/pybind11/docs/pybind11_vs_boost_python1.svg
@@ -0,0 +1,427 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="468pt" height="252pt" viewBox="0 0 468 252" version="1.1">
+<defs>
+<g>
+<symbol overflow="visible" id="glyph0-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph0-1">
+<path style="stroke:none;" d="M 3.726562 0 L 2.847656 0 L 2.847656 -5.601562 C 2.636719 -5.398438 2.359375 -5.195312 2.015625 -4.996094 C 1.671875 -4.792969 1.363281 -4.640625 1.089844 -4.539062 L 1.089844 -5.390625 C 1.582031 -5.621094 2.011719 -5.902344 2.378906 -6.230469 C 2.746094 -6.558594 3.007812 -6.878906 3.160156 -7.1875 L 3.726562 -7.1875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-2">
+<path style="stroke:none;" d="M 0.414062 -3.53125 C 0.414062 -4.375 0.503906 -5.058594 0.675781 -5.574219 C 0.851562 -6.089844 1.109375 -6.488281 1.453125 -6.765625 C 1.796875 -7.046875 2.226562 -7.1875 2.75 -7.1875 C 3.132812 -7.1875 3.46875 -7.109375 3.757812 -6.957031 C 4.046875 -6.800781 4.289062 -6.578125 4.476562 -6.285156 C 4.664062 -5.996094 4.8125 -5.640625 4.921875 -5.222656 C 5.03125 -4.804688 5.082031 -4.238281 5.082031 -3.53125 C 5.082031 -2.691406 4.996094 -2.011719 4.824219 -1.496094 C 4.652344 -0.980469 4.394531 -0.582031 4.050781 -0.300781 C 3.707031 -0.0195312 3.273438 0.121094 2.75 0.121094 C 2.058594 0.121094 1.515625 -0.125 1.125 -0.621094 C 0.652344 -1.214844 0.414062 -2.1875 0.414062 -3.53125 Z M 1.320312 -3.53125 C 1.320312 -2.355469 1.457031 -1.574219 1.730469 -1.183594 C 2.007812 -0.796875 2.34375 -0.601562 2.75 -0.601562 C 3.152344 -0.601562 3.492188 -0.796875 3.765625 -1.1875 C 4.042969 -1.578125 4.179688 -2.359375 4.179688 -3.53125 C 4.179688 -4.710938 4.042969 -5.492188 3.765625 -5.878906 C 3.492188 -6.265625 3.148438 -6.460938 2.738281 -6.460938 C 2.335938 -6.460938 2.011719 -6.289062 1.773438 -5.945312 C 1.46875 -5.511719 1.320312 -4.707031 1.320312 -3.53125 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-3">
+<path style="stroke:none;" d="M 5.035156 -0.84375 L 5.035156 0 L 0.304688 0 C 0.296875 -0.210938 0.332031 -0.414062 0.40625 -0.609375 C 0.527344 -0.933594 0.71875 -1.25 0.984375 -1.5625 C 1.25 -1.875 1.632812 -2.234375 2.132812 -2.648438 C 2.910156 -3.285156 3.4375 -3.789062 3.710938 -4.164062 C 3.984375 -4.535156 4.121094 -4.886719 4.121094 -5.21875 C 4.121094 -5.566406 3.996094 -5.863281 3.746094 -6.101562 C 3.5 -6.339844 3.171875 -6.460938 2.773438 -6.460938 C 2.351562 -6.460938 2.011719 -6.332031 1.757812 -6.078125 C 1.503906 -5.824219 1.375 -5.472656 1.371094 -5.023438 L 0.46875 -5.117188 C 0.53125 -5.789062 0.761719 -6.304688 1.167969 -6.65625 C 1.570312 -7.011719 2.113281 -7.1875 2.792969 -7.1875 C 3.480469 -7.1875 4.023438 -6.996094 4.421875 -6.617188 C 4.824219 -6.234375 5.023438 -5.761719 5.023438 -5.199219 C 5.023438 -4.914062 4.964844 -4.632812 4.847656 -4.355469 C 4.730469 -4.078125 4.535156 -3.789062 4.265625 -3.480469 C 3.992188 -3.175781 3.542969 -2.753906 2.910156 -2.222656 C 2.382812 -1.78125 2.042969 -1.480469 1.894531 -1.320312 C 1.746094 -1.164062 1.621094 -1.003906 1.523438 -0.84375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-4">
+<path style="stroke:none;" d="M 0.414062 -1.875 L 1.335938 -1.953125 C 1.40625 -1.503906 1.566406 -1.167969 1.8125 -0.941406 C 2.0625 -0.714844 2.363281 -0.601562 2.714844 -0.601562 C 3.136719 -0.601562 3.496094 -0.761719 3.789062 -1.078125 C 4.082031 -1.398438 4.226562 -1.820312 4.226562 -2.347656 C 4.226562 -2.851562 4.085938 -3.246094 3.804688 -3.535156 C 3.523438 -3.824219 3.15625 -3.96875 2.699219 -3.96875 C 2.417969 -3.96875 2.160156 -3.90625 1.933594 -3.777344 C 1.707031 -3.648438 1.527344 -3.480469 1.398438 -3.277344 L 0.570312 -3.382812 L 1.265625 -7.0625 L 4.824219 -7.0625 L 4.824219 -6.21875 L 1.96875 -6.21875 L 1.582031 -4.296875 C 2.011719 -4.597656 2.460938 -4.746094 2.933594 -4.746094 C 3.558594 -4.746094 4.085938 -4.53125 4.515625 -4.097656 C 4.945312 -3.664062 5.160156 -3.105469 5.160156 -2.425781 C 5.160156 -1.777344 4.972656 -1.21875 4.59375 -0.746094 C 4.136719 -0.167969 3.507812 0.121094 2.714844 0.121094 C 2.0625 0.121094 1.53125 -0.0585938 1.121094 -0.425781 C 0.710938 -0.789062 0.472656 -1.273438 0.414062 -1.875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-5">
+<path style="stroke:none;" d="M 0.820312 0 L 0.820312 -7.15625 L 5.648438 -7.15625 L 5.648438 -6.3125 L 1.765625 -6.3125 L 1.765625 -4.097656 L 5.125 -4.097656 L 5.125 -3.25 L 1.765625 -3.25 L 1.765625 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-6">
+<path style="stroke:none;" d="M 4.058594 0 L 4.058594 -0.761719 C 3.65625 -0.175781 3.105469 0.117188 2.414062 0.117188 C 2.105469 0.117188 1.820312 0.0585938 1.554688 -0.0585938 C 1.289062 -0.175781 1.09375 -0.324219 0.964844 -0.5 C 0.835938 -0.679688 0.746094 -0.894531 0.695312 -1.152344 C 0.65625 -1.324219 0.640625 -1.597656 0.640625 -1.972656 L 0.640625 -5.1875 L 1.519531 -5.1875 L 1.519531 -2.308594 C 1.519531 -1.851562 1.535156 -1.542969 1.570312 -1.382812 C 1.625 -1.152344 1.746094 -0.96875 1.921875 -0.835938 C 2.101562 -0.703125 2.324219 -0.640625 2.585938 -0.640625 C 2.851562 -0.640625 3.097656 -0.707031 3.328125 -0.84375 C 3.5625 -0.976562 3.726562 -1.160156 3.820312 -1.394531 C 3.917969 -1.625 3.964844 -1.964844 3.964844 -2.40625 L 3.964844 -5.1875 L 4.84375 -5.1875 L 4.84375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-7">
+<path style="stroke:none;" d="M 0.660156 0 L 0.660156 -5.1875 L 1.449219 -5.1875 L 1.449219 -4.449219 C 1.832031 -5.019531 2.382812 -5.304688 3.101562 -5.304688 C 3.414062 -5.304688 3.699219 -5.246094 3.960938 -5.132812 C 4.222656 -5.023438 4.421875 -4.875 4.550781 -4.691406 C 4.679688 -4.507812 4.773438 -4.292969 4.824219 -4.042969 C 4.855469 -3.878906 4.875 -3.59375 4.875 -3.1875 L 4.875 0 L 3.992188 0 L 3.992188 -3.15625 C 3.992188 -3.511719 3.960938 -3.78125 3.890625 -3.957031 C 3.824219 -4.132812 3.703125 -4.277344 3.527344 -4.382812 C 3.351562 -4.488281 3.148438 -4.539062 2.914062 -4.539062 C 2.539062 -4.539062 2.21875 -4.421875 1.945312 -4.183594 C 1.671875 -3.945312 1.539062 -3.496094 1.539062 -2.832031 L 1.539062 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-8">
+<path style="stroke:none;" d="M 4.042969 -1.898438 L 4.90625 -1.789062 C 4.8125 -1.191406 4.570312 -0.726562 4.183594 -0.386719 C 3.792969 -0.0507812 3.316406 0.117188 2.75 0.117188 C 2.039062 0.117188 1.46875 -0.113281 1.039062 -0.578125 C 0.605469 -1.042969 0.390625 -1.707031 0.390625 -2.574219 C 0.390625 -3.132812 0.484375 -3.625 0.667969 -4.042969 C 0.855469 -4.460938 1.136719 -4.777344 1.515625 -4.988281 C 1.894531 -5.199219 2.308594 -5.304688 2.753906 -5.304688 C 3.316406 -5.304688 3.777344 -5.160156 4.136719 -4.875 C 4.492188 -4.589844 4.722656 -4.1875 4.824219 -3.664062 L 3.96875 -3.53125 C 3.886719 -3.878906 3.746094 -4.140625 3.539062 -4.316406 C 3.332031 -4.492188 3.082031 -4.578125 2.789062 -4.578125 C 2.34375 -4.578125 1.984375 -4.421875 1.710938 -4.105469 C 1.433594 -3.789062 1.292969 -3.285156 1.292969 -2.597656 C 1.292969 -1.902344 1.425781 -1.394531 1.695312 -1.078125 C 1.960938 -0.761719 2.308594 -0.605469 2.738281 -0.605469 C 3.085938 -0.605469 3.371094 -0.710938 3.601562 -0.921875 C 3.835938 -1.132812 3.980469 -1.460938 4.042969 -1.898438 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-9">
+<path style="stroke:none;" d="M 2.578125 -0.785156 L 2.703125 -0.0078125 C 2.457031 0.0429688 2.234375 0.0703125 2.039062 0.0703125 C 1.722656 0.0703125 1.476562 0.0195312 1.296875 -0.0820312 C 1.121094 -0.183594 1 -0.316406 0.929688 -0.480469 C 0.855469 -0.644531 0.820312 -0.992188 0.820312 -1.519531 L 0.820312 -4.5 L 0.175781 -4.5 L 0.175781 -5.1875 L 0.820312 -5.1875 L 0.820312 -6.46875 L 1.695312 -6.996094 L 1.695312 -5.1875 L 2.578125 -5.1875 L 2.578125 -4.5 L 1.695312 -4.5 L 1.695312 -1.46875 C 1.695312 -1.21875 1.710938 -1.058594 1.742188 -0.984375 C 1.773438 -0.914062 1.820312 -0.859375 1.890625 -0.816406 C 1.960938 -0.773438 2.0625 -0.75 2.191406 -0.75 C 2.289062 -0.75 2.417969 -0.761719 2.578125 -0.785156 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-10">
+<path style="stroke:none;" d="M 0.664062 -6.148438 L 0.664062 -7.15625 L 1.542969 -7.15625 L 1.542969 -6.148438 Z M 0.664062 0 L 0.664062 -5.1875 L 1.542969 -5.1875 L 1.542969 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-11">
+<path style="stroke:none;" d="M 0.332031 -2.59375 C 0.332031 -3.554688 0.597656 -4.265625 1.132812 -4.726562 C 1.578125 -5.109375 2.121094 -5.304688 2.765625 -5.304688 C 3.476562 -5.304688 4.058594 -5.070312 4.511719 -4.601562 C 4.964844 -4.132812 5.191406 -3.488281 5.191406 -2.664062 C 5.191406 -2 5.089844 -1.472656 4.890625 -1.089844 C 4.691406 -0.707031 4.398438 -0.410156 4.015625 -0.199219 C 3.632812 0.0117188 3.214844 0.117188 2.765625 0.117188 C 2.039062 0.117188 1.449219 -0.117188 1.003906 -0.582031 C 0.554688 -1.046875 0.332031 -1.71875 0.332031 -2.59375 Z M 1.234375 -2.59375 C 1.234375 -1.929688 1.378906 -1.429688 1.671875 -1.101562 C 1.960938 -0.769531 2.324219 -0.605469 2.765625 -0.605469 C 3.199219 -0.605469 3.5625 -0.773438 3.851562 -1.101562 C 4.140625 -1.433594 4.289062 -1.941406 4.289062 -2.621094 C 4.289062 -3.261719 4.140625 -3.75 3.851562 -4.078125 C 3.558594 -4.410156 3.195312 -4.574219 2.765625 -4.574219 C 2.324219 -4.574219 1.960938 -4.410156 1.671875 -4.082031 C 1.382812 -3.753906 1.234375 -3.257812 1.234375 -2.59375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-12">
+<path style="stroke:none;" d="M 0.308594 -1.546875 L 1.175781 -1.683594 C 1.226562 -1.335938 1.363281 -1.070312 1.585938 -0.882812 C 1.808594 -0.699219 2.117188 -0.605469 2.519531 -0.605469 C 2.921875 -0.605469 3.222656 -0.6875 3.417969 -0.851562 C 3.613281 -1.015625 3.710938 -1.210938 3.710938 -1.429688 C 3.710938 -1.628906 3.625 -1.785156 3.453125 -1.898438 C 3.332031 -1.976562 3.03125 -2.078125 2.554688 -2.195312 C 1.910156 -2.359375 1.460938 -2.5 1.214844 -2.621094 C 0.964844 -2.738281 0.777344 -2.902344 0.648438 -3.113281 C 0.519531 -3.324219 0.453125 -3.554688 0.453125 -3.808594 C 0.453125 -4.039062 0.507812 -4.253906 0.613281 -4.449219 C 0.71875 -4.648438 0.863281 -4.8125 1.046875 -4.941406 C 1.183594 -5.042969 1.367188 -5.128906 1.605469 -5.199219 C 1.839844 -5.269531 2.09375 -5.304688 2.363281 -5.304688 C 2.769531 -5.304688 3.128906 -5.242188 3.433594 -5.125 C 3.742188 -5.007812 3.96875 -4.851562 4.117188 -4.652344 C 4.261719 -4.453125 4.363281 -4.183594 4.417969 -3.847656 L 3.558594 -3.730469 C 3.519531 -3.996094 3.40625 -4.207031 3.21875 -4.355469 C 3.03125 -4.503906 2.769531 -4.578125 2.425781 -4.578125 C 2.023438 -4.578125 1.734375 -4.511719 1.5625 -4.378906 C 1.390625 -4.246094 1.304688 -4.089844 1.304688 -3.910156 C 1.304688 -3.796875 1.339844 -3.695312 1.410156 -3.601562 C 1.484375 -3.507812 1.59375 -3.429688 1.75 -3.367188 C 1.835938 -3.335938 2.09375 -3.261719 2.523438 -3.144531 C 3.144531 -2.976562 3.578125 -2.84375 3.824219 -2.738281 C 4.070312 -2.632812 4.265625 -2.476562 4.40625 -2.273438 C 4.546875 -2.074219 4.613281 -1.824219 4.613281 -1.523438 C 4.613281 -1.230469 4.527344 -0.953125 4.359375 -0.695312 C 4.1875 -0.4375 3.941406 -0.238281 3.617188 -0.09375 C 3.296875 0.046875 2.929688 0.117188 2.523438 0.117188 C 1.851562 0.117188 1.335938 -0.0234375 0.984375 -0.304688 C 0.632812 -0.582031 0.40625 -0.996094 0.308594 -1.546875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph1-1">
+<path style="stroke:none;" d="M -2.300781 -0.449219 L -2.378906 -1.34375 C -2.019531 -1.386719 -1.726562 -1.484375 -1.496094 -1.636719 C -1.265625 -1.792969 -1.082031 -2.03125 -0.941406 -2.359375 C -0.796875 -2.683594 -0.726562 -3.050781 -0.726562 -3.457031 C -0.726562 -3.820312 -0.78125 -4.136719 -0.890625 -4.414062 C -0.996094 -4.691406 -1.144531 -4.898438 -1.332031 -5.03125 C -1.519531 -5.167969 -1.722656 -5.234375 -1.945312 -5.234375 C -2.167969 -5.234375 -2.363281 -5.167969 -2.53125 -5.039062 C -2.699219 -4.910156 -2.839844 -4.695312 -2.953125 -4.394531 C -3.027344 -4.203125 -3.144531 -3.777344 -3.304688 -3.121094 C -3.460938 -2.460938 -3.609375 -2 -3.75 -1.738281 C -3.929688 -1.398438 -4.152344 -1.140625 -4.417969 -0.972656 C -4.683594 -0.804688 -4.980469 -0.722656 -5.308594 -0.722656 C -5.667969 -0.722656 -6.007812 -0.824219 -6.320312 -1.03125 C -6.632812 -1.234375 -6.875 -1.535156 -7.035156 -1.929688 C -7.199219 -2.324219 -7.28125 -2.761719 -7.28125 -3.242188 C -7.28125 -3.773438 -7.195312 -4.242188 -7.023438 -4.644531 C -6.851562 -5.050781 -6.601562 -5.363281 -6.269531 -5.582031 C -5.9375 -5.800781 -5.5625 -5.917969 -5.140625 -5.933594 L -5.074219 -5.023438 C -5.527344 -4.976562 -5.867188 -4.808594 -6.097656 -4.527344 C -6.328125 -4.246094 -6.445312 -3.832031 -6.445312 -3.28125 C -6.445312 -2.707031 -6.339844 -2.289062 -6.128906 -2.027344 C -5.921875 -1.765625 -5.667969 -1.636719 -5.371094 -1.636719 C -5.113281 -1.636719 -4.902344 -1.726562 -4.734375 -1.914062 C -4.570312 -2.097656 -4.398438 -2.574219 -4.226562 -3.34375 C -4.050781 -4.113281 -3.898438 -4.640625 -3.769531 -4.925781 C -3.578125 -5.34375 -3.335938 -5.652344 -3.039062 -5.851562 C -2.746094 -6.046875 -2.40625 -6.148438 -2.023438 -6.148438 C -1.640625 -6.148438 -1.28125 -6.039062 -0.945312 -5.820312 C -0.609375 -5.601562 -0.347656 -5.289062 -0.160156 -4.878906 C 0.0273438 -4.472656 0.121094 -4.011719 0.121094 -3.5 C 0.121094 -2.851562 0.0273438 -2.308594 -0.160156 -1.871094 C -0.351562 -1.433594 -0.632812 -1.089844 -1.011719 -0.84375 C -1.390625 -0.59375 -1.820312 -0.460938 -2.300781 -0.449219 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-2">
+<path style="stroke:none;" d="M -1.671875 -4.210938 L -1.558594 -5.117188 C -1.027344 -4.972656 -0.617188 -4.707031 -0.320312 -4.320312 C -0.0273438 -3.933594 0.117188 -3.4375 0.117188 -2.835938 C 0.117188 -2.078125 -0.117188 -1.476562 -0.582031 -1.03125 C -1.050781 -0.585938 -1.707031 -0.367188 -2.546875 -0.367188 C -3.421875 -0.367188 -4.097656 -0.589844 -4.578125 -1.039062 C -5.0625 -1.488281 -5.304688 -2.070312 -5.304688 -2.789062 C -5.304688 -3.480469 -5.066406 -4.046875 -4.59375 -4.488281 C -4.121094 -4.925781 -3.457031 -5.148438 -2.601562 -5.148438 C -2.550781 -5.148438 -2.472656 -5.144531 -2.367188 -5.140625 L -2.367188 -1.273438 C -1.796875 -1.304688 -1.363281 -1.46875 -1.058594 -1.757812 C -0.757812 -2.046875 -0.605469 -2.410156 -0.605469 -2.84375 C -0.605469 -3.164062 -0.691406 -3.4375 -0.859375 -3.667969 C -1.027344 -3.894531 -1.296875 -4.074219 -1.671875 -4.210938 Z M -3.089844 -1.324219 L -3.089844 -4.21875 C -3.527344 -4.179688 -3.855469 -4.070312 -4.070312 -3.886719 C -4.410156 -3.605469 -4.578125 -3.242188 -4.578125 -2.796875 C -4.578125 -2.394531 -4.445312 -2.054688 -4.175781 -1.78125 C -3.90625 -1.503906 -3.542969 -1.351562 -3.089844 -1.324219 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-3">
+<path style="stroke:none;" d="M -1.898438 -4.042969 L -1.789062 -4.90625 C -1.191406 -4.8125 -0.726562 -4.570312 -0.386719 -4.183594 C -0.0507812 -3.792969 0.117188 -3.316406 0.117188 -2.75 C 0.117188 -2.039062 -0.113281 -1.46875 -0.578125 -1.039062 C -1.042969 -0.605469 -1.707031 -0.390625 -2.574219 -0.390625 C -3.132812 -0.390625 -3.625 -0.484375 -4.042969 -0.667969 C -4.460938 -0.855469 -4.777344 -1.136719 -4.988281 -1.515625 C -5.199219 -1.894531 -5.304688 -2.308594 -5.304688 -2.753906 C -5.304688 -3.316406 -5.160156 -3.777344 -4.875 -4.136719 C -4.589844 -4.492188 -4.1875 -4.722656 -3.664062 -4.824219 L -3.53125 -3.96875 C -3.878906 -3.886719 -4.140625 -3.746094 -4.316406 -3.539062 C -4.492188 -3.332031 -4.578125 -3.082031 -4.578125 -2.789062 C -4.578125 -2.34375 -4.421875 -1.984375 -4.105469 -1.710938 C -3.789062 -1.433594 -3.285156 -1.292969 -2.597656 -1.292969 C -1.902344 -1.292969 -1.394531 -1.425781 -1.078125 -1.695312 C -0.761719 -1.960938 -0.605469 -2.308594 -0.605469 -2.738281 C -0.605469 -3.085938 -0.710938 -3.371094 -0.921875 -3.601562 C -1.132812 -3.835938 -1.460938 -3.980469 -1.898438 -4.042969 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-4">
+<path style="stroke:none;" d="M -2.59375 -0.332031 C -3.554688 -0.332031 -4.265625 -0.597656 -4.726562 -1.132812 C -5.109375 -1.578125 -5.304688 -2.121094 -5.304688 -2.765625 C -5.304688 -3.476562 -5.070312 -4.058594 -4.601562 -4.511719 C -4.132812 -4.964844 -3.488281 -5.191406 -2.664062 -5.191406 C -2 -5.191406 -1.472656 -5.089844 -1.089844 -4.890625 C -0.707031 -4.691406 -0.410156 -4.398438 -0.199219 -4.015625 C 0.0117188 -3.632812 0.117188 -3.214844 0.117188 -2.765625 C 0.117188 -2.039062 -0.117188 -1.449219 -0.582031 -1.003906 C -1.046875 -0.554688 -1.71875 -0.332031 -2.59375 -0.332031 Z M -2.59375 -1.234375 C -1.929688 -1.234375 -1.429688 -1.378906 -1.101562 -1.671875 C -0.769531 -1.960938 -0.605469 -2.324219 -0.605469 -2.765625 C -0.605469 -3.199219 -0.773438 -3.5625 -1.101562 -3.851562 C -1.433594 -4.140625 -1.941406 -4.289062 -2.621094 -4.289062 C -3.261719 -4.289062 -3.75 -4.140625 -4.078125 -3.851562 C -4.410156 -3.558594 -4.574219 -3.195312 -4.574219 -2.765625 C -4.574219 -2.324219 -4.410156 -1.960938 -4.082031 -1.671875 C -3.753906 -1.382812 -3.257812 -1.234375 -2.59375 -1.234375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-5">
+<path style="stroke:none;" d="M 0 -0.660156 L -5.1875 -0.660156 L -5.1875 -1.449219 L -4.449219 -1.449219 C -5.019531 -1.832031 -5.304688 -2.382812 -5.304688 -3.101562 C -5.304688 -3.414062 -5.246094 -3.699219 -5.132812 -3.960938 C -5.023438 -4.222656 -4.875 -4.421875 -4.691406 -4.550781 C -4.507812 -4.679688 -4.292969 -4.773438 -4.042969 -4.824219 C -3.878906 -4.855469 -3.59375 -4.875 -3.1875 -4.875 L 0 -4.875 L 0 -3.992188 L -3.15625 -3.992188 C -3.511719 -3.992188 -3.78125 -3.960938 -3.957031 -3.890625 C -4.132812 -3.824219 -4.277344 -3.703125 -4.382812 -3.527344 C -4.488281 -3.351562 -4.539062 -3.148438 -4.539062 -2.914062 C -4.539062 -2.539062 -4.421875 -2.21875 -4.183594 -1.945312 C -3.945312 -1.671875 -3.496094 -1.539062 -2.832031 -1.539062 L 0 -1.539062 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-6">
+<path style="stroke:none;" d="M 0 -4.023438 L -0.65625 -4.023438 C -0.140625 -3.695312 0.117188 -3.210938 0.117188 -2.574219 C 0.117188 -2.160156 0.00390625 -1.78125 -0.226562 -1.433594 C -0.453125 -1.085938 -0.769531 -0.816406 -1.179688 -0.628906 C -1.585938 -0.4375 -2.058594 -0.34375 -2.585938 -0.34375 C -3.105469 -0.34375 -3.574219 -0.429688 -3.996094 -0.601562 C -4.417969 -0.773438 -4.742188 -1.03125 -4.964844 -1.375 C -5.191406 -1.722656 -5.304688 -2.109375 -5.304688 -2.535156 C -5.304688 -2.847656 -5.238281 -3.125 -5.105469 -3.367188 C -4.972656 -3.613281 -4.800781 -3.8125 -4.589844 -3.964844 L -7.15625 -3.964844 L -7.15625 -4.839844 L 0 -4.839844 Z M -2.585938 -1.246094 C -1.921875 -1.246094 -1.425781 -1.386719 -1.097656 -1.664062 C -0.769531 -1.945312 -0.605469 -2.273438 -0.605469 -2.65625 C -0.605469 -3.039062 -0.761719 -3.367188 -1.078125 -3.636719 C -1.390625 -3.90625 -1.871094 -4.039062 -2.515625 -4.039062 C -3.226562 -4.039062 -3.746094 -3.902344 -4.078125 -3.628906 C -4.410156 -3.355469 -4.574219 -3.015625 -4.574219 -2.617188 C -4.574219 -2.226562 -4.414062 -1.898438 -4.097656 -1.636719 C -3.777344 -1.375 -3.273438 -1.246094 -2.585938 -1.246094 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-7">
+<path style="stroke:none;" d="M -1.546875 -0.308594 L -1.683594 -1.175781 C -1.335938 -1.226562 -1.070312 -1.363281 -0.882812 -1.585938 C -0.699219 -1.808594 -0.605469 -2.117188 -0.605469 -2.519531 C -0.605469 -2.921875 -0.6875 -3.222656 -0.851562 -3.417969 C -1.015625 -3.613281 -1.210938 -3.710938 -1.429688 -3.710938 C -1.628906 -3.710938 -1.785156 -3.625 -1.898438 -3.453125 C -1.976562 -3.332031 -2.078125 -3.03125 -2.195312 -2.554688 C -2.359375 -1.910156 -2.5 -1.460938 -2.621094 -1.214844 C -2.738281 -0.964844 -2.902344 -0.777344 -3.113281 -0.648438 C -3.324219 -0.519531 -3.554688 -0.453125 -3.808594 -0.453125 C -4.039062 -0.453125 -4.253906 -0.507812 -4.449219 -0.613281 C -4.648438 -0.71875 -4.8125 -0.863281 -4.941406 -1.046875 C -5.042969 -1.183594 -5.128906 -1.367188 -5.199219 -1.605469 C -5.269531 -1.839844 -5.304688 -2.09375 -5.304688 -2.363281 C -5.304688 -2.769531 -5.242188 -3.128906 -5.125 -3.433594 C -5.007812 -3.742188 -4.851562 -3.96875 -4.652344 -4.117188 C -4.453125 -4.261719 -4.183594 -4.363281 -3.847656 -4.417969 L -3.730469 -3.558594 C -3.996094 -3.519531 -4.207031 -3.40625 -4.355469 -3.21875 C -4.503906 -3.03125 -4.578125 -2.769531 -4.578125 -2.425781 C -4.578125 -2.023438 -4.511719 -1.734375 -4.378906 -1.5625 C -4.246094 -1.390625 -4.089844 -1.304688 -3.910156 -1.304688 C -3.796875 -1.304688 -3.695312 -1.339844 -3.601562 -1.410156 C -3.507812 -1.484375 -3.429688 -1.59375 -3.367188 -1.75 C -3.335938 -1.835938 -3.261719 -2.09375 -3.144531 -2.523438 C -2.976562 -3.144531 -2.84375 -3.578125 -2.738281 -3.824219 C -2.632812 -4.070312 -2.476562 -4.265625 -2.273438 -4.40625 C -2.074219 -4.546875 -1.824219 -4.613281 -1.523438 -4.613281 C -1.230469 -4.613281 -0.953125 -4.527344 -0.695312 -4.359375 C -0.4375 -4.1875 -0.238281 -3.941406 -0.09375 -3.617188 C 0.046875 -3.296875 0.117188 -2.929688 0.117188 -2.523438 C 0.117188 -1.851562 -0.0234375 -1.335938 -0.304688 -0.984375 C -0.582031 -0.632812 -0.996094 -0.40625 -1.546875 -0.308594 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph2-1">
+<path style="stroke:none;" d="M 7.054688 -3.011719 L 8.191406 -2.726562 C 7.953125 -1.792969 7.523438 -1.078125 6.90625 -0.589844 C 6.285156 -0.0976562 5.53125 0.148438 4.632812 0.148438 C 3.707031 0.148438 2.957031 -0.0429688 2.375 -0.417969 C 1.796875 -0.796875 1.355469 -1.34375 1.050781 -2.054688 C 0.75 -2.769531 0.597656 -3.539062 0.597656 -4.359375 C 0.597656 -5.253906 0.769531 -6.035156 1.109375 -6.699219 C 1.453125 -7.367188 1.9375 -7.871094 2.570312 -8.21875 C 3.199219 -8.5625 3.894531 -8.734375 4.652344 -8.734375 C 5.511719 -8.734375 6.234375 -8.515625 6.820312 -8.078125 C 7.40625 -7.640625 7.8125 -7.027344 8.046875 -6.234375 L 6.925781 -5.96875 C 6.726562 -6.59375 6.4375 -7.050781 6.058594 -7.335938 C 5.679688 -7.621094 5.203125 -7.765625 4.628906 -7.765625 C 3.96875 -7.765625 3.417969 -7.605469 2.972656 -7.289062 C 2.53125 -6.972656 2.21875 -6.546875 2.039062 -6.015625 C 1.859375 -5.480469 1.769531 -4.929688 1.769531 -4.367188 C 1.769531 -3.636719 1.875 -2.996094 2.089844 -2.453125 C 2.300781 -1.90625 2.632812 -1.5 3.082031 -1.230469 C 3.53125 -0.960938 4.015625 -0.828125 4.539062 -0.828125 C 5.175781 -0.828125 5.71875 -1.007812 6.15625 -1.375 C 6.597656 -1.742188 6.898438 -2.289062 7.054688 -3.011719 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-2">
+<path style="stroke:none;" d="M 0.398438 -3.109375 C 0.398438 -4.261719 0.71875 -5.117188 1.359375 -5.671875 C 1.894531 -6.132812 2.546875 -6.363281 3.316406 -6.363281 C 4.171875 -6.363281 4.871094 -6.082031 5.414062 -5.523438 C 5.957031 -4.960938 6.226562 -4.1875 6.226562 -3.199219 C 6.226562 -2.398438 6.109375 -1.769531 5.867188 -1.308594 C 5.628906 -0.851562 5.277344 -0.492188 4.820312 -0.242188 C 4.359375 0.0117188 3.859375 0.140625 3.316406 0.140625 C 2.445312 0.140625 1.742188 -0.140625 1.203125 -0.695312 C 0.667969 -1.253906 0.398438 -2.0625 0.398438 -3.109375 Z M 1.484375 -3.109375 C 1.484375 -2.3125 1.65625 -1.71875 2.003906 -1.320312 C 2.351562 -0.925781 2.789062 -0.726562 3.316406 -0.726562 C 3.839844 -0.726562 4.273438 -0.925781 4.625 -1.324219 C 4.972656 -1.722656 5.144531 -2.328125 5.144531 -3.148438 C 5.144531 -3.917969 4.96875 -4.5 4.621094 -4.894531 C 4.269531 -5.292969 3.835938 -5.492188 3.316406 -5.492188 C 2.789062 -5.492188 2.351562 -5.292969 2.003906 -4.898438 C 1.65625 -4.503906 1.484375 -3.90625 1.484375 -3.109375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-3">
+<path style="stroke:none;" d="M 0.789062 0 L 0.789062 -6.222656 L 1.734375 -6.222656 L 1.734375 -5.351562 C 1.929688 -5.65625 2.1875 -5.898438 2.515625 -6.085938 C 2.839844 -6.269531 3.207031 -6.363281 3.621094 -6.363281 C 4.082031 -6.363281 4.460938 -6.265625 4.753906 -6.078125 C 5.050781 -5.886719 5.257812 -5.617188 5.378906 -5.273438 C 5.871094 -6 6.511719 -6.363281 7.300781 -6.363281 C 7.917969 -6.363281 8.390625 -6.191406 8.726562 -5.851562 C 9.058594 -5.507812 9.222656 -4.984375 9.222656 -4.273438 L 9.222656 0 L 8.171875 0 L 8.171875 -3.921875 C 8.171875 -4.34375 8.140625 -4.644531 8.070312 -4.832031 C 8.003906 -5.015625 7.878906 -5.164062 7.699219 -5.28125 C 7.519531 -5.394531 7.308594 -5.449219 7.066406 -5.449219 C 6.628906 -5.449219 6.265625 -5.304688 5.976562 -5.011719 C 5.6875 -4.722656 5.542969 -4.257812 5.542969 -3.617188 L 5.542969 0 L 4.488281 0 L 4.488281 -4.042969 C 4.488281 -4.511719 4.402344 -4.863281 4.230469 -5.097656 C 4.058594 -5.332031 3.777344 -5.449219 3.386719 -5.449219 C 3.089844 -5.449219 2.816406 -5.371094 2.5625 -5.214844 C 2.3125 -5.058594 2.128906 -4.828125 2.015625 -4.53125 C 1.902344 -4.230469 1.84375 -3.796875 1.84375 -3.226562 L 1.84375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-4">
+<path style="stroke:none;" d="M 0.789062 2.382812 L 0.789062 -6.222656 L 1.75 -6.222656 L 1.75 -5.414062 C 1.976562 -5.730469 2.234375 -5.96875 2.519531 -6.125 C 2.804688 -6.285156 3.148438 -6.363281 3.554688 -6.363281 C 4.085938 -6.363281 4.554688 -6.226562 4.960938 -5.953125 C 5.367188 -5.679688 5.675781 -5.292969 5.882812 -4.796875 C 6.089844 -4.296875 6.195312 -3.75 6.195312 -3.15625 C 6.195312 -2.519531 6.078125 -1.949219 5.851562 -1.4375 C 5.621094 -0.929688 5.289062 -0.539062 4.855469 -0.265625 C 4.417969 0.00390625 3.960938 0.140625 3.480469 0.140625 C 3.128906 0.140625 2.8125 0.0664062 2.535156 -0.0820312 C 2.253906 -0.230469 2.023438 -0.417969 1.84375 -0.644531 L 1.84375 2.382812 Z M 1.746094 -3.078125 C 1.746094 -2.277344 1.90625 -1.683594 2.234375 -1.300781 C 2.558594 -0.917969 2.949219 -0.726562 3.410156 -0.726562 C 3.878906 -0.726562 4.28125 -0.925781 4.613281 -1.320312 C 4.949219 -1.71875 5.117188 -2.332031 5.117188 -3.164062 C 5.117188 -3.957031 4.953125 -4.550781 4.625 -4.945312 C 4.300781 -5.339844 3.910156 -5.539062 3.457031 -5.539062 C 3.007812 -5.539062 2.609375 -5.328125 2.265625 -4.90625 C 1.917969 -4.488281 1.746094 -3.875 1.746094 -3.078125 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-5">
+<path style="stroke:none;" d="M 0.796875 -7.375 L 0.796875 -8.589844 L 1.851562 -8.589844 L 1.851562 -7.375 Z M 0.796875 0 L 0.796875 -6.222656 L 1.851562 -6.222656 L 1.851562 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-6">
+<path style="stroke:none;" d="M 0.765625 0 L 0.765625 -8.589844 L 1.820312 -8.589844 L 1.820312 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-7">
+<path style="stroke:none;" d="M 4.851562 -0.765625 C 4.460938 -0.433594 4.085938 -0.203125 3.722656 -0.0625 C 3.363281 0.0742188 2.976562 0.140625 2.5625 0.140625 C 1.878906 0.140625 1.351562 -0.0273438 0.984375 -0.359375 C 0.617188 -0.695312 0.433594 -1.121094 0.433594 -1.640625 C 0.433594 -1.945312 0.503906 -2.222656 0.640625 -2.476562 C 0.78125 -2.726562 0.960938 -2.929688 1.1875 -3.082031 C 1.410156 -3.234375 1.664062 -3.351562 1.945312 -3.429688 C 2.152344 -3.484375 2.464844 -3.535156 2.882812 -3.585938 C 3.734375 -3.6875 4.359375 -3.808594 4.765625 -3.949219 C 4.769531 -4.09375 4.769531 -4.1875 4.769531 -4.226562 C 4.769531 -4.65625 4.671875 -4.957031 4.46875 -5.132812 C 4.199219 -5.371094 3.800781 -5.492188 3.269531 -5.492188 C 2.773438 -5.492188 2.40625 -5.402344 2.171875 -5.230469 C 1.933594 -5.054688 1.757812 -4.75 1.648438 -4.304688 L 0.617188 -4.445312 C 0.710938 -4.886719 0.863281 -5.246094 1.078125 -5.515625 C 1.292969 -5.789062 1.601562 -5.996094 2.007812 -6.144531 C 2.414062 -6.289062 2.886719 -6.363281 3.421875 -6.363281 C 3.953125 -6.363281 4.382812 -6.300781 4.71875 -6.175781 C 5.050781 -6.050781 5.292969 -5.894531 5.449219 -5.703125 C 5.605469 -5.515625 5.714844 -5.273438 5.777344 -4.984375 C 5.8125 -4.804688 5.828125 -4.484375 5.828125 -4.015625 L 5.828125 -2.609375 C 5.828125 -1.628906 5.851562 -1.007812 5.898438 -0.746094 C 5.941406 -0.488281 6.03125 -0.238281 6.164062 0 L 5.0625 0 C 4.953125 -0.21875 4.882812 -0.476562 4.851562 -0.765625 Z M 4.765625 -3.125 C 4.382812 -2.96875 3.804688 -2.835938 3.039062 -2.726562 C 2.605469 -2.664062 2.300781 -2.59375 2.121094 -2.515625 C 1.941406 -2.4375 1.804688 -2.320312 1.703125 -2.171875 C 1.605469 -2.019531 1.558594 -1.851562 1.558594 -1.671875 C 1.558594 -1.390625 1.664062 -1.15625 1.878906 -0.96875 C 2.089844 -0.78125 2.402344 -0.6875 2.8125 -0.6875 C 3.21875 -0.6875 3.578125 -0.773438 3.898438 -0.953125 C 4.214844 -1.128906 4.445312 -1.375 4.59375 -1.679688 C 4.707031 -1.917969 4.765625 -2.273438 4.765625 -2.734375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-8">
+<path style="stroke:none;" d="M 3.09375 -0.945312 L 3.246094 -0.0117188 C 2.949219 0.0507812 2.683594 0.0820312 2.449219 0.0820312 C 2.066406 0.0820312 1.769531 0.0234375 1.558594 -0.101562 C 1.347656 -0.222656 1.199219 -0.378906 1.113281 -0.578125 C 1.027344 -0.773438 0.984375 -1.1875 0.984375 -1.820312 L 0.984375 -5.402344 L 0.210938 -5.402344 L 0.210938 -6.222656 L 0.984375 -6.222656 L 0.984375 -7.765625 L 2.03125 -8.398438 L 2.03125 -6.222656 L 3.09375 -6.222656 L 3.09375 -5.402344 L 2.03125 -5.402344 L 2.03125 -1.765625 C 2.03125 -1.464844 2.050781 -1.269531 2.089844 -1.183594 C 2.125 -1.097656 2.1875 -1.03125 2.269531 -0.976562 C 2.355469 -0.925781 2.476562 -0.902344 2.632812 -0.902344 C 2.75 -0.902344 2.902344 -0.914062 3.09375 -0.945312 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-9">
+<path style="stroke:none;" d="M 0.789062 0 L 0.789062 -6.222656 L 1.742188 -6.222656 L 1.742188 -5.335938 C 2.199219 -6.019531 2.859375 -6.363281 3.71875 -6.363281 C 4.09375 -6.363281 4.441406 -6.296875 4.753906 -6.160156 C 5.070312 -6.027344 5.304688 -5.851562 5.460938 -5.632812 C 5.617188 -5.414062 5.726562 -5.152344 5.789062 -4.851562 C 5.828125 -4.65625 5.847656 -4.3125 5.847656 -3.828125 L 5.847656 0 L 4.792969 0 L 4.792969 -3.785156 C 4.792969 -4.214844 4.75 -4.535156 4.671875 -4.75 C 4.589844 -4.960938 4.441406 -5.132812 4.234375 -5.257812 C 4.023438 -5.386719 3.78125 -5.449219 3.5 -5.449219 C 3.050781 -5.449219 2.660156 -5.304688 2.335938 -5.023438 C 2.007812 -4.738281 1.84375 -4.195312 1.84375 -3.398438 L 1.84375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-10">
+<path style="stroke:none;" d="M 5.050781 -2.003906 L 6.140625 -1.867188 C 5.96875 -1.230469 5.648438 -0.738281 5.1875 -0.386719 C 4.722656 -0.0351562 4.125 0.140625 3.40625 0.140625 C 2.496094 0.140625 1.773438 -0.140625 1.238281 -0.699219 C 0.707031 -1.261719 0.4375 -2.046875 0.4375 -3.058594 C 0.4375 -4.105469 0.710938 -4.917969 1.25 -5.496094 C 1.789062 -6.074219 2.484375 -6.363281 3.34375 -6.363281 C 4.175781 -6.363281 4.859375 -6.078125 5.382812 -5.515625 C 5.910156 -4.949219 6.175781 -4.148438 6.175781 -3.125 C 6.175781 -3.0625 6.171875 -2.96875 6.171875 -2.84375 L 1.53125 -2.84375 C 1.570312 -2.160156 1.761719 -1.632812 2.109375 -1.273438 C 2.457031 -0.910156 2.890625 -0.726562 3.410156 -0.726562 C 3.796875 -0.726562 4.125 -0.828125 4.398438 -1.03125 C 4.671875 -1.234375 4.890625 -1.558594 5.050781 -2.003906 Z M 1.585938 -3.710938 L 5.0625 -3.710938 C 5.015625 -4.234375 4.882812 -4.625 4.664062 -4.886719 C 4.328125 -5.292969 3.890625 -5.496094 3.359375 -5.496094 C 2.875 -5.496094 2.464844 -5.335938 2.136719 -5.007812 C 1.804688 -4.683594 1.625 -4.25 1.585938 -3.710938 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-11">
+<path style="stroke:none;" d="M 1.042969 0 L 1.042969 -5.402344 L 0.109375 -5.402344 L 0.109375 -6.222656 L 1.042969 -6.222656 L 1.042969 -6.882812 C 1.042969 -7.300781 1.078125 -7.613281 1.15625 -7.816406 C 1.257812 -8.089844 1.433594 -8.3125 1.691406 -8.480469 C 1.945312 -8.652344 2.304688 -8.734375 2.765625 -8.734375 C 3.0625 -8.734375 3.390625 -8.703125 3.75 -8.632812 L 3.59375 -7.710938 C 3.375 -7.75 3.164062 -7.769531 2.96875 -7.769531 C 2.648438 -7.769531 2.421875 -7.703125 2.289062 -7.5625 C 2.15625 -7.425781 2.09375 -7.171875 2.09375 -6.796875 L 2.09375 -6.222656 L 3.304688 -6.222656 L 3.304688 -5.402344 L 2.09375 -5.402344 L 2.09375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-12">
+<path style="stroke:none;" d="M 4.828125 0 L 4.828125 -0.785156 C 4.433594 -0.167969 3.851562 0.140625 3.085938 0.140625 C 2.589844 0.140625 2.136719 0.00390625 1.71875 -0.269531 C 1.304688 -0.542969 0.980469 -0.925781 0.753906 -1.414062 C 0.523438 -1.90625 0.410156 -2.46875 0.410156 -3.105469 C 0.410156 -3.726562 0.515625 -4.289062 0.71875 -4.796875 C 0.925781 -5.300781 1.238281 -5.6875 1.652344 -5.960938 C 2.066406 -6.230469 2.53125 -6.363281 3.039062 -6.363281 C 3.414062 -6.363281 3.75 -6.285156 4.042969 -6.125 C 4.335938 -5.96875 4.574219 -5.761719 4.757812 -5.507812 L 4.757812 -8.589844 L 5.804688 -8.589844 L 5.804688 0 Z M 1.492188 -3.105469 C 1.492188 -2.308594 1.664062 -1.710938 2 -1.320312 C 2.335938 -0.925781 2.730469 -0.726562 3.1875 -0.726562 C 3.648438 -0.726562 4.039062 -0.914062 4.363281 -1.292969 C 4.683594 -1.667969 4.84375 -2.242188 4.84375 -3.015625 C 4.84375 -3.867188 4.679688 -4.492188 4.351562 -4.890625 C 4.023438 -5.289062 3.621094 -5.492188 3.140625 -5.492188 C 2.671875 -5.492188 2.28125 -5.296875 1.964844 -4.914062 C 1.652344 -4.53125 1.492188 -3.929688 1.492188 -3.105469 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-13">
+<path style="stroke:none;" d="M 4.867188 0 L 4.867188 -0.914062 C 4.382812 -0.210938 3.726562 0.140625 2.894531 0.140625 C 2.527344 0.140625 2.183594 0.0703125 1.867188 -0.0703125 C 1.546875 -0.210938 1.3125 -0.386719 1.15625 -0.601562 C 1.003906 -0.8125 0.894531 -1.074219 0.832031 -1.382812 C 0.789062 -1.589844 0.765625 -1.917969 0.765625 -2.367188 L 0.765625 -6.222656 L 1.820312 -6.222656 L 1.820312 -2.773438 C 1.820312 -2.222656 1.84375 -1.851562 1.886719 -1.65625 C 1.953125 -1.378906 2.09375 -1.164062 2.308594 -1.003906 C 2.523438 -0.847656 2.789062 -0.765625 3.105469 -0.765625 C 3.421875 -0.765625 3.71875 -0.847656 3.996094 -1.011719 C 4.273438 -1.171875 4.46875 -1.394531 4.585938 -1.671875 C 4.699219 -1.953125 4.757812 -2.359375 4.757812 -2.890625 L 4.757812 -6.222656 L 5.8125 -6.222656 L 5.8125 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph3-1">
+<path style="stroke:none;" d="M 0.953125 0 L 0.953125 -9.304688 L 4.445312 -9.304688 C 5.15625 -9.304688 5.722656 -9.210938 6.152344 -9.023438 C 6.582031 -8.835938 6.921875 -8.546875 7.164062 -8.152344 C 7.40625 -7.761719 7.527344 -7.351562 7.527344 -6.925781 C 7.527344 -6.527344 7.421875 -6.152344 7.203125 -5.800781 C 6.988281 -5.449219 6.664062 -5.167969 6.226562 -4.953125 C 6.789062 -4.785156 7.222656 -4.503906 7.523438 -4.105469 C 7.828125 -3.707031 7.980469 -3.238281 7.980469 -2.699219 C 7.980469 -2.261719 7.886719 -1.855469 7.703125 -1.480469 C 7.519531 -1.105469 7.292969 -0.820312 7.019531 -0.617188 C 6.75 -0.414062 6.410156 -0.257812 6 -0.15625 C 5.59375 -0.0507812 5.09375 0 4.5 0 Z M 2.183594 -5.394531 L 4.195312 -5.394531 C 4.742188 -5.394531 5.132812 -5.429688 5.371094 -5.503906 C 5.683594 -5.597656 5.917969 -5.75 6.078125 -5.96875 C 6.238281 -6.183594 6.316406 -6.453125 6.316406 -6.78125 C 6.316406 -7.089844 6.242188 -7.359375 6.09375 -7.59375 C 5.945312 -7.828125 5.734375 -7.992188 5.460938 -8.078125 C 5.183594 -8.164062 4.710938 -8.207031 4.042969 -8.207031 L 2.183594 -8.207031 Z M 2.183594 -1.097656 L 4.5 -1.097656 C 4.898438 -1.097656 5.175781 -1.113281 5.339844 -1.140625 C 5.621094 -1.191406 5.859375 -1.277344 6.050781 -1.398438 C 6.242188 -1.515625 6.394531 -1.6875 6.519531 -1.914062 C 6.640625 -2.140625 6.703125 -2.402344 6.703125 -2.699219 C 6.703125 -3.046875 6.613281 -3.347656 6.4375 -3.601562 C 6.257812 -3.859375 6.011719 -4.039062 5.695312 -4.140625 C 5.382812 -4.246094 4.929688 -4.296875 4.335938 -4.296875 L 2.183594 -4.296875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-2">
+<path style="stroke:none;" d="M 0.429688 -3.371094 C 0.429688 -4.617188 0.777344 -5.542969 1.472656 -6.144531 C 2.050781 -6.644531 2.757812 -6.894531 3.59375 -6.894531 C 4.519531 -6.894531 5.277344 -6.589844 5.867188 -5.984375 C 6.453125 -5.375 6.746094 -4.535156 6.746094 -3.464844 C 6.746094 -2.597656 6.617188 -1.914062 6.355469 -1.417969 C 6.097656 -0.921875 5.71875 -0.535156 5.222656 -0.261719 C 4.722656 0.015625 4.179688 0.152344 3.59375 0.152344 C 2.648438 0.152344 1.886719 -0.148438 1.304688 -0.753906 C 0.722656 -1.359375 0.429688 -2.230469 0.429688 -3.371094 Z M 1.605469 -3.371094 C 1.605469 -2.507812 1.792969 -1.859375 2.171875 -1.429688 C 2.546875 -1 3.023438 -0.789062 3.59375 -0.789062 C 4.160156 -0.789062 4.632812 -1.003906 5.007812 -1.433594 C 5.382812 -1.867188 5.574219 -2.523438 5.574219 -3.410156 C 5.574219 -4.242188 5.382812 -4.875 5.003906 -5.304688 C 4.625 -5.734375 4.15625 -5.949219 3.59375 -5.949219 C 3.023438 -5.949219 2.546875 -5.734375 2.171875 -5.304688 C 1.792969 -4.878906 1.605469 -4.234375 1.605469 -3.371094 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-3">
+<path style="stroke:none;" d="M 0.398438 -2.011719 L 1.53125 -2.191406 C 1.59375 -1.738281 1.769531 -1.390625 2.058594 -1.148438 C 2.347656 -0.90625 2.753906 -0.789062 3.273438 -0.789062 C 3.800781 -0.789062 4.1875 -0.894531 4.445312 -1.109375 C 4.699219 -1.320312 4.824219 -1.570312 4.824219 -1.859375 C 4.824219 -2.117188 4.710938 -2.320312 4.488281 -2.46875 C 4.332031 -2.570312 3.941406 -2.699219 3.320312 -2.855469 C 2.480469 -3.066406 1.902344 -3.25 1.578125 -3.40625 C 1.253906 -3.558594 1.007812 -3.773438 0.839844 -4.046875 C 0.671875 -4.320312 0.589844 -4.621094 0.589844 -4.953125 C 0.589844 -5.253906 0.660156 -5.53125 0.796875 -5.785156 C 0.933594 -6.042969 1.121094 -6.253906 1.359375 -6.421875 C 1.535156 -6.554688 1.777344 -6.667969 2.085938 -6.757812 C 2.390625 -6.847656 2.722656 -6.894531 3.070312 -6.894531 C 3.601562 -6.894531 4.066406 -6.816406 4.464844 -6.664062 C 4.867188 -6.511719 5.160156 -6.304688 5.351562 -6.046875 C 5.542969 -5.785156 5.671875 -5.4375 5.746094 -5 L 4.628906 -4.851562 C 4.578125 -5.195312 4.429688 -5.46875 4.1875 -5.664062 C 3.945312 -5.859375 3.597656 -5.953125 3.15625 -5.953125 C 2.628906 -5.953125 2.253906 -5.867188 2.03125 -5.695312 C 1.808594 -5.519531 1.695312 -5.316406 1.695312 -5.085938 C 1.695312 -4.9375 1.742188 -4.804688 1.835938 -4.683594 C 1.929688 -4.5625 2.074219 -4.460938 2.273438 -4.378906 C 2.386719 -4.335938 2.722656 -4.242188 3.28125 -4.085938 C 4.089844 -3.871094 4.652344 -3.695312 4.972656 -3.558594 C 5.292969 -3.421875 5.542969 -3.21875 5.726562 -2.957031 C 5.90625 -2.695312 6 -2.371094 6 -1.980469 C 6 -1.601562 5.886719 -1.242188 5.664062 -0.90625 C 5.441406 -0.570312 5.121094 -0.308594 4.703125 -0.125 C 4.285156 0.0585938 3.8125 0.152344 3.28125 0.152344 C 2.40625 0.152344 1.738281 -0.03125 1.277344 -0.394531 C 0.820312 -0.757812 0.527344 -1.296875 0.398438 -2.011719 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-4">
+<path style="stroke:none;" d="M 3.351562 -1.023438 L 3.515625 -0.0117188 C 3.195312 0.0546875 2.90625 0.0898438 2.652344 0.0898438 C 2.238281 0.0898438 1.917969 0.0234375 1.6875 -0.109375 C 1.460938 -0.238281 1.300781 -0.410156 1.207031 -0.625 C 1.113281 -0.839844 1.066406 -1.289062 1.066406 -1.972656 L 1.066406 -5.851562 L 0.226562 -5.851562 L 0.226562 -6.742188 L 1.066406 -6.742188 L 1.066406 -8.410156 L 2.203125 -9.097656 L 2.203125 -6.742188 L 3.351562 -6.742188 L 3.351562 -5.851562 L 2.203125 -5.851562 L 2.203125 -1.910156 C 2.203125 -1.585938 2.222656 -1.375 2.261719 -1.28125 C 2.304688 -1.1875 2.367188 -1.113281 2.460938 -1.058594 C 2.550781 -1.003906 2.679688 -0.976562 2.851562 -0.976562 C 2.976562 -0.976562 3.144531 -0.992188 3.351562 -1.023438 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-5">
+<path style="stroke:none;" d="M 1.179688 0 L 1.179688 -1.300781 L 2.480469 -1.300781 L 2.480469 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-6">
+<path style="stroke:none;" d="M 1.003906 0 L 1.003906 -9.304688 L 4.511719 -9.304688 C 5.128906 -9.304688 5.601562 -9.277344 5.929688 -9.21875 C 6.386719 -9.140625 6.769531 -8.996094 7.078125 -8.78125 C 7.386719 -8.566406 7.636719 -8.269531 7.824219 -7.882812 C 8.011719 -7.5 8.105469 -7.074219 8.105469 -6.613281 C 8.105469 -5.824219 7.855469 -5.152344 7.351562 -4.605469 C 6.847656 -4.058594 5.9375 -3.78125 4.621094 -3.78125 L 2.234375 -3.78125 L 2.234375 0 Z M 2.234375 -4.882812 L 4.640625 -4.882812 C 5.4375 -4.882812 6 -5.03125 6.335938 -5.324219 C 6.667969 -5.621094 6.835938 -6.039062 6.835938 -6.578125 C 6.835938 -6.964844 6.738281 -7.296875 6.542969 -7.574219 C 6.34375 -7.851562 6.085938 -8.035156 5.765625 -8.125 C 5.558594 -8.179688 5.171875 -8.207031 4.613281 -8.207031 L 2.234375 -8.207031 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-7">
+<path style="stroke:none;" d="M 0.804688 2.597656 L 0.679688 1.523438 C 0.929688 1.589844 1.148438 1.625 1.332031 1.625 C 1.585938 1.625 1.789062 1.582031 1.941406 1.5 C 2.09375 1.414062 2.21875 1.296875 2.316406 1.140625 C 2.390625 1.027344 2.503906 0.746094 2.664062 0.292969 C 2.6875 0.230469 2.722656 0.136719 2.765625 0.0117188 L 0.210938 -6.742188 L 1.441406 -6.742188 L 2.84375 -2.835938 C 3.027344 -2.34375 3.1875 -1.820312 3.332031 -1.277344 C 3.464844 -1.800781 3.621094 -2.3125 3.800781 -2.8125 L 5.242188 -6.742188 L 6.386719 -6.742188 L 3.820312 0.113281 C 3.546875 0.855469 3.332031 1.363281 3.179688 1.644531 C 2.976562 2.019531 2.746094 2.296875 2.480469 2.472656 C 2.21875 2.648438 1.90625 2.734375 1.542969 2.734375 C 1.324219 2.734375 1.078125 2.6875 0.804688 2.597656 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-8">
+<path style="stroke:none;" d="M 0.855469 0 L 0.855469 -9.304688 L 2 -9.304688 L 2 -5.96875 C 2.53125 -6.585938 3.207031 -6.894531 4.019531 -6.894531 C 4.519531 -6.894531 4.953125 -6.796875 5.320312 -6.597656 C 5.6875 -6.402344 5.949219 -6.128906 6.109375 -5.78125 C 6.269531 -5.433594 6.347656 -4.933594 6.347656 -4.273438 L 6.347656 0 L 5.203125 0 L 5.203125 -4.273438 C 5.203125 -4.84375 5.082031 -5.257812 4.832031 -5.519531 C 4.585938 -5.78125 4.234375 -5.910156 3.78125 -5.910156 C 3.445312 -5.910156 3.125 -5.820312 2.828125 -5.644531 C 2.53125 -5.46875 2.316406 -5.234375 2.191406 -4.933594 C 2.0625 -4.632812 2 -4.21875 2 -3.6875 L 2 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-9">
+<path style="stroke:none;" d="M 0.855469 0 L 0.855469 -6.742188 L 1.886719 -6.742188 L 1.886719 -5.78125 C 2.382812 -6.523438 3.09375 -6.894531 4.03125 -6.894531 C 4.4375 -6.894531 4.808594 -6.820312 5.152344 -6.675781 C 5.492188 -6.527344 5.746094 -6.335938 5.914062 -6.101562 C 6.085938 -5.863281 6.203125 -5.582031 6.273438 -5.257812 C 6.3125 -5.046875 6.335938 -4.675781 6.335938 -4.144531 L 6.335938 0 L 5.191406 0 L 5.191406 -4.101562 C 5.191406 -4.566406 5.148438 -4.914062 5.058594 -5.144531 C 4.96875 -5.375 4.8125 -5.558594 4.585938 -5.695312 C 4.359375 -5.835938 4.09375 -5.902344 3.789062 -5.902344 C 3.304688 -5.902344 2.882812 -5.75 2.53125 -5.441406 C 2.175781 -5.132812 2 -4.546875 2 -3.679688 L 2 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-10">
+<path style="stroke:none;" d="M 0.855469 2.582031 L 0.855469 -6.742188 L 1.898438 -6.742188 L 1.898438 -5.867188 C 2.144531 -6.207031 2.421875 -6.464844 2.730469 -6.636719 C 3.039062 -6.808594 3.414062 -6.894531 3.851562 -6.894531 C 4.429688 -6.894531 4.9375 -6.746094 5.375 -6.449219 C 5.816406 -6.152344 6.148438 -5.734375 6.375 -5.195312 C 6.597656 -4.65625 6.710938 -4.066406 6.710938 -3.421875 C 6.710938 -2.730469 6.585938 -2.109375 6.339844 -1.558594 C 6.089844 -1.007812 5.730469 -0.582031 5.257812 -0.289062 C 4.785156 0.00390625 4.289062 0.152344 3.769531 0.152344 C 3.390625 0.152344 3.046875 0.0703125 2.746094 -0.0898438 C 2.441406 -0.25 2.195312 -0.453125 2 -0.699219 L 2 2.582031 Z M 1.890625 -3.332031 C 1.890625 -2.464844 2.066406 -1.824219 2.417969 -1.410156 C 2.769531 -0.996094 3.195312 -0.789062 3.695312 -0.789062 C 4.203125 -0.789062 4.636719 -1 5 -1.429688 C 5.359375 -1.859375 5.542969 -2.527344 5.542969 -3.429688 C 5.542969 -4.289062 5.363281 -4.929688 5.011719 -5.359375 C 4.65625 -5.785156 4.234375 -6 3.746094 -6 C 3.257812 -6 2.828125 -5.769531 2.453125 -5.316406 C 2.078125 -4.859375 1.890625 -4.199219 1.890625 -3.332031 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-11">
+<path style="stroke:none;" d="M 1.910156 0 L 0.851562 0 L 0.851562 -9.304688 L 1.992188 -9.304688 L 1.992188 -5.984375 C 2.476562 -6.589844 3.089844 -6.894531 3.839844 -6.894531 C 4.253906 -6.894531 4.648438 -6.808594 5.019531 -6.644531 C 5.390625 -6.476562 5.691406 -6.242188 5.933594 -5.9375 C 6.171875 -5.636719 6.359375 -5.269531 6.492188 -4.84375 C 6.628906 -4.414062 6.695312 -3.957031 6.695312 -3.472656 C 6.695312 -2.316406 6.410156 -1.425781 5.839844 -0.792969 C 5.269531 -0.164062 4.582031 0.152344 3.78125 0.152344 C 2.988281 0.152344 2.363281 -0.179688 1.910156 -0.84375 Z M 1.898438 -3.421875 C 1.898438 -2.613281 2.007812 -2.027344 2.226562 -1.667969 C 2.585938 -1.082031 3.074219 -0.789062 3.6875 -0.789062 C 4.1875 -0.789062 4.617188 -1.003906 4.984375 -1.4375 C 5.347656 -1.871094 5.527344 -2.519531 5.527344 -3.375 C 5.527344 -4.257812 5.355469 -4.90625 5.003906 -5.324219 C 4.65625 -5.742188 4.234375 -5.953125 3.738281 -5.953125 C 3.238281 -5.953125 2.808594 -5.738281 2.445312 -5.304688 C 2.082031 -4.871094 1.898438 -4.242188 1.898438 -3.421875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-12">
+<path style="stroke:none;" d="M 0.863281 -7.992188 L 0.863281 -9.304688 L 2.007812 -9.304688 L 2.007812 -7.992188 Z M 0.863281 0 L 0.863281 -6.742188 L 2.007812 -6.742188 L 2.007812 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-13">
+<path style="stroke:none;" d="M 5.230469 0 L 5.230469 -0.851562 C 4.804688 -0.183594 4.175781 0.152344 3.34375 0.152344 C 2.808594 0.152344 2.3125 0.00390625 1.863281 -0.292969 C 1.414062 -0.589844 1.0625 -1 0.816406 -1.53125 C 0.570312 -2.0625 0.445312 -2.675781 0.445312 -3.363281 C 0.445312 -4.035156 0.554688 -4.648438 0.78125 -5.195312 C 1.003906 -5.742188 1.339844 -6.164062 1.789062 -6.457031 C 2.238281 -6.75 2.738281 -6.894531 3.292969 -6.894531 C 3.699219 -6.894531 4.0625 -6.808594 4.378906 -6.636719 C 4.695312 -6.464844 4.957031 -6.242188 5.15625 -5.96875 L 5.15625 -9.304688 L 6.289062 -9.304688 L 6.289062 0 Z M 1.617188 -3.363281 C 1.617188 -2.5 1.800781 -1.855469 2.164062 -1.429688 C 2.527344 -1 2.957031 -0.789062 3.453125 -0.789062 C 3.953125 -0.789062 4.375 -0.992188 4.726562 -1.398438 C 5.074219 -1.808594 5.25 -2.429688 5.25 -3.269531 C 5.25 -4.191406 5.070312 -4.867188 4.714844 -5.300781 C 4.359375 -5.730469 3.921875 -5.949219 3.402344 -5.949219 C 2.894531 -5.949219 2.46875 -5.742188 2.128906 -5.324219 C 1.789062 -4.910156 1.617188 -4.257812 1.617188 -3.363281 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-14">
+<path style="stroke:none;" d="M 4.84375 0 L 3.699219 0 L 3.699219 -7.28125 C 3.425781 -7.019531 3.066406 -6.757812 2.617188 -6.492188 C 2.171875 -6.230469 1.769531 -6.035156 1.414062 -5.902344 L 1.414062 -7.007812 C 2.054688 -7.308594 2.613281 -7.671875 3.089844 -8.101562 C 3.570312 -8.527344 3.90625 -8.941406 4.105469 -9.34375 L 4.84375 -9.34375 Z "/>
+</symbol>
+</g>
+<clipPath id="clip1">
+  <path d="M 89 21 L 91 21 L 91 221 L 89 221 Z "/>
+</clipPath>
+<clipPath id="clip2">
+  <path d="M 201 21 L 203 21 L 203 221 L 201 221 Z "/>
+</clipPath>
+<clipPath id="clip3">
+  <path d="M 313 21 L 315 21 L 315 221 L 313 221 Z "/>
+</clipPath>
+<clipPath id="clip4">
+  <path d="M 33 180 L 355 180 L 355 182 L 33 182 Z "/>
+</clipPath>
+<clipPath id="clip5">
+  <path d="M 33 146 L 355 146 L 355 148 L 33 148 Z "/>
+</clipPath>
+<clipPath id="clip6">
+  <path d="M 33 120 L 355 120 L 355 122 L 33 122 Z "/>
+</clipPath>
+<clipPath id="clip7">
+  <path d="M 33 95 L 355 95 L 355 97 L 33 97 Z "/>
+</clipPath>
+<clipPath id="clip8">
+  <path d="M 33 61 L 355 61 L 355 63 L 33 63 Z "/>
+</clipPath>
+</defs>
+<g id="surface11">
+<g clip-path="url(#clip1)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 90.226562 220.007812 L 90.226562 21 "/>
+</g>
+<g clip-path="url(#clip2)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 201.996094 220.007812 L 201.996094 21 "/>
+</g>
+<g clip-path="url(#clip3)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 313.761719 220.007812 L 313.761719 21 "/>
+</g>
+<g clip-path="url(#clip4)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 33 180.734375 L 355 180.734375 "/>
+</g>
+<g clip-path="url(#clip5)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 33 146.960938 L 355 146.960938 "/>
+</g>
+<g clip-path="url(#clip6)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 33 121.410156 L 355 121.410156 "/>
+</g>
+<g clip-path="url(#clip7)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 33 95.859375 L 355 95.859375 "/>
+</g>
+<g clip-path="url(#clip8)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 33 62.082031 L 355 62.082031 "/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 49.128906 169 C 49.128906 168.167969 48.800781 167.375 48.214844 166.785156 C 47.625 166.199219 46.832031 165.871094 46 165.871094 C 45.167969 165.871094 44.375 166.199219 43.785156 166.785156 C 43.199219 167.375 42.871094 168.167969 42.871094 169 C 42.871094 169.832031 43.199219 170.625 43.785156 171.214844 C 44.375 171.800781 45.167969 172.128906 46 172.128906 C 46.832031 172.128906 47.625 171.800781 48.214844 171.214844 C 48.800781 170.625 49.128906 169.832031 49.128906 169 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 82.128906 169 C 82.128906 168.167969 81.800781 167.375 81.214844 166.785156 C 80.625 166.199219 79.832031 165.871094 79 165.871094 C 78.167969 165.871094 77.375 166.199219 76.785156 166.785156 C 76.199219 167.375 75.871094 168.167969 75.871094 169 C 75.871094 169.832031 76.199219 170.625 76.785156 171.214844 C 77.375 171.800781 78.167969 172.128906 79 172.128906 C 79.832031 172.128906 80.625 171.800781 81.214844 171.214844 C 81.800781 170.625 82.128906 169.832031 82.128906 169 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 116.128906 167 C 116.128906 166.167969 115.800781 165.375 115.214844 164.785156 C 114.625 164.199219 113.832031 163.871094 113 163.871094 C 112.167969 163.871094 111.375 164.199219 110.785156 164.785156 C 110.199219 165.375 109.871094 166.167969 109.871094 167 C 109.871094 167.832031 110.199219 168.625 110.785156 169.214844 C 111.375 169.800781 112.167969 170.128906 113 170.128906 C 113.832031 170.128906 114.625 169.800781 115.214844 169.214844 C 115.800781 168.625 116.128906 167.832031 116.128906 167 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 150.128906 159 C 150.128906 158.167969 149.800781 157.375 149.214844 156.785156 C 148.625 156.199219 147.832031 155.871094 147 155.871094 C 146.167969 155.871094 145.375 156.199219 144.785156 156.785156 C 144.199219 157.375 143.871094 158.167969 143.871094 159 C 143.871094 159.832031 144.199219 160.625 144.785156 161.214844 C 145.375 161.800781 146.167969 162.128906 147 162.128906 C 147.832031 162.128906 148.625 161.800781 149.214844 161.214844 C 149.800781 160.625 150.128906 159.832031 150.128906 159 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 183.128906 146 C 183.128906 145.167969 182.800781 144.375 182.214844 143.785156 C 181.625 143.199219 180.832031 142.871094 180 142.871094 C 179.167969 142.871094 178.375 143.199219 177.785156 143.785156 C 177.199219 144.375 176.871094 145.167969 176.871094 146 C 176.871094 146.832031 177.199219 147.625 177.785156 148.214844 C 178.375 148.800781 179.167969 149.128906 180 149.128906 C 180.832031 149.128906 181.625 148.800781 182.214844 148.214844 C 182.800781 147.625 183.128906 146.832031 183.128906 146 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 217.128906 128 C 217.128906 127.167969 216.800781 126.375 216.214844 125.785156 C 215.625 125.199219 214.832031 124.871094 214 124.871094 C 213.167969 124.871094 212.375 125.199219 211.785156 125.785156 C 211.199219 126.375 210.871094 127.167969 210.871094 128 C 210.871094 128.832031 211.199219 129.625 211.785156 130.214844 C 212.375 130.800781 213.167969 131.128906 214 131.128906 C 214.832031 131.128906 215.625 130.800781 216.214844 130.214844 C 216.800781 129.625 217.128906 128.832031 217.128906 128 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 251.128906 108 C 251.128906 107.167969 250.800781 106.375 250.214844 105.785156 C 249.625 105.199219 248.832031 104.871094 248 104.871094 C 247.167969 104.871094 246.375 105.199219 245.785156 105.785156 C 245.199219 106.375 244.871094 107.167969 244.871094 108 C 244.871094 108.832031 245.199219 109.625 245.785156 110.214844 C 246.375 110.800781 247.167969 111.128906 248 111.128906 C 248.832031 111.128906 249.625 110.800781 250.214844 110.214844 C 250.800781 109.625 251.128906 108.832031 251.128906 108 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 284.128906 85 C 284.128906 84.167969 283.800781 83.375 283.214844 82.785156 C 282.625 82.199219 281.832031 81.871094 281 81.871094 C 280.167969 81.871094 279.375 82.199219 278.785156 82.785156 C 278.199219 83.375 277.871094 84.167969 277.871094 85 C 277.871094 85.832031 278.199219 86.625 278.785156 87.214844 C 279.375 87.800781 280.167969 88.128906 281 88.128906 C 281.832031 88.128906 282.625 87.800781 283.214844 87.214844 C 283.800781 86.625 284.128906 85.832031 284.128906 85 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 318.128906 59 C 318.128906 58.167969 317.800781 57.375 317.214844 56.785156 C 316.625 56.199219 315.832031 55.871094 315 55.871094 C 314.167969 55.871094 313.375 56.199219 312.785156 56.785156 C 312.199219 57.375 311.871094 58.167969 311.871094 59 C 311.871094 59.832031 312.199219 60.625 312.785156 61.214844 C 313.375 61.800781 314.167969 62.128906 315 62.128906 C 315.832031 62.128906 316.625 61.800781 317.214844 61.214844 C 317.800781 60.625 318.128906 59.832031 318.128906 59 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 352.128906 31 C 352.128906 30.167969 351.800781 29.375 351.214844 28.785156 C 350.625 28.199219 349.832031 27.871094 349 27.871094 C 348.167969 27.871094 347.375 28.199219 346.785156 28.785156 C 346.199219 29.375 345.871094 30.167969 345.871094 31 C 345.871094 31.832031 346.199219 32.625 346.785156 33.214844 C 347.375 33.800781 348.167969 34.128906 349 34.128906 C 349.832031 34.128906 350.625 33.800781 351.214844 33.214844 C 351.800781 32.625 352.128906 31.832031 352.128906 31 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 49.128906 205 C 49.128906 204.167969 48.800781 203.375 48.214844 202.785156 C 47.625 202.199219 46.832031 201.871094 46 201.871094 C 45.167969 201.871094 44.375 202.199219 43.785156 202.785156 C 43.199219 203.375 42.871094 204.167969 42.871094 205 C 42.871094 205.832031 43.199219 206.625 43.785156 207.214844 C 44.375 207.800781 45.167969 208.128906 46 208.128906 C 46.832031 208.128906 47.625 207.800781 48.214844 207.214844 C 48.800781 206.625 49.128906 205.832031 49.128906 205 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 82.128906 199 C 82.128906 198.167969 81.800781 197.375 81.214844 196.785156 C 80.625 196.199219 79.832031 195.871094 79 195.871094 C 78.167969 195.871094 77.375 196.199219 76.785156 196.785156 C 76.199219 197.375 75.871094 198.167969 75.871094 199 C 75.871094 199.832031 76.199219 200.625 76.785156 201.214844 C 77.375 201.800781 78.167969 202.128906 79 202.128906 C 79.832031 202.128906 80.625 201.800781 81.214844 201.214844 C 81.800781 200.625 82.128906 199.832031 82.128906 199 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 116.128906 190 C 116.128906 189.167969 115.800781 188.375 115.214844 187.785156 C 114.625 187.199219 113.832031 186.871094 113 186.871094 C 112.167969 186.871094 111.375 187.199219 110.785156 187.785156 C 110.199219 188.375 109.871094 189.167969 109.871094 190 C 109.871094 190.832031 110.199219 191.625 110.785156 192.214844 C 111.375 192.800781 112.167969 193.128906 113 193.128906 C 113.832031 193.128906 114.625 192.800781 115.214844 192.214844 C 115.800781 191.625 116.128906 190.832031 116.128906 190 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 150.128906 177 C 150.128906 176.167969 149.800781 175.375 149.214844 174.785156 C 148.625 174.199219 147.832031 173.871094 147 173.871094 C 146.167969 173.871094 145.375 174.199219 144.785156 174.785156 C 144.199219 175.375 143.871094 176.167969 143.871094 177 C 143.871094 177.832031 144.199219 178.625 144.785156 179.214844 C 145.375 179.800781 146.167969 180.128906 147 180.128906 C 147.832031 180.128906 148.625 179.800781 149.214844 179.214844 C 149.800781 178.625 150.128906 177.832031 150.128906 177 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 183.128906 159 C 183.128906 158.167969 182.800781 157.375 182.214844 156.785156 C 181.625 156.199219 180.832031 155.871094 180 155.871094 C 179.167969 155.871094 178.375 156.199219 177.785156 156.785156 C 177.199219 157.375 176.871094 158.167969 176.871094 159 C 176.871094 159.832031 177.199219 160.625 177.785156 161.214844 C 178.375 161.800781 179.167969 162.128906 180 162.128906 C 180.832031 162.128906 181.625 161.800781 182.214844 161.214844 C 182.800781 160.625 183.128906 159.832031 183.128906 159 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 217.128906 138 C 217.128906 137.167969 216.800781 136.375 216.214844 135.785156 C 215.625 135.199219 214.832031 134.871094 214 134.871094 C 213.167969 134.871094 212.375 135.199219 211.785156 135.785156 C 211.199219 136.375 210.871094 137.167969 210.871094 138 C 210.871094 138.832031 211.199219 139.625 211.785156 140.214844 C 212.375 140.800781 213.167969 141.128906 214 141.128906 C 214.832031 141.128906 215.625 140.800781 216.214844 140.214844 C 216.800781 139.625 217.128906 138.832031 217.128906 138 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 251.128906 115 C 251.128906 114.167969 250.800781 113.375 250.214844 112.785156 C 249.625 112.199219 248.832031 111.871094 248 111.871094 C 247.167969 111.871094 246.375 112.199219 245.785156 112.785156 C 245.199219 113.375 244.871094 114.167969 244.871094 115 C 244.871094 115.832031 245.199219 116.625 245.785156 117.214844 C 246.375 117.800781 247.167969 118.128906 248 118.128906 C 248.832031 118.128906 249.625 117.800781 250.214844 117.214844 C 250.800781 116.625 251.128906 115.832031 251.128906 115 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 284.128906 91 C 284.128906 90.167969 283.800781 89.375 283.214844 88.785156 C 282.625 88.199219 281.832031 87.871094 281 87.871094 C 280.167969 87.871094 279.375 88.199219 278.785156 88.785156 C 278.199219 89.375 277.871094 90.167969 277.871094 91 C 277.871094 91.832031 278.199219 92.625 278.785156 93.214844 C 279.375 93.800781 280.167969 94.128906 281 94.128906 C 281.832031 94.128906 282.625 93.800781 283.214844 93.214844 C 283.800781 92.625 284.128906 91.832031 284.128906 91 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 318.128906 65 C 318.128906 64.167969 317.800781 63.375 317.214844 62.785156 C 316.625 62.199219 315.832031 61.871094 315 61.871094 C 314.167969 61.871094 313.375 62.199219 312.785156 62.785156 C 312.199219 63.375 311.871094 64.167969 311.871094 65 C 311.871094 65.832031 312.199219 66.625 312.785156 67.214844 C 313.375 67.800781 314.167969 68.128906 315 68.128906 C 315.832031 68.128906 316.625 67.800781 317.214844 67.214844 C 317.800781 66.625 318.128906 65.832031 318.128906 65 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 352.128906 38 C 352.128906 37.167969 351.800781 36.375 351.214844 35.785156 C 350.625 35.199219 349.832031 34.871094 349 34.871094 C 348.167969 34.871094 347.375 35.199219 346.785156 35.785156 C 346.199219 36.375 345.871094 37.167969 345.871094 38 C 345.871094 38.832031 346.199219 39.625 346.785156 40.214844 C 347.375 40.800781 348.167969 41.128906 349 41.128906 C 349.832031 41.128906 350.625 40.800781 351.214844 40.214844 C 351.800781 39.625 352.128906 38.832031 352.128906 38 Z "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 220.007812 L 33 220.007812 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 220.007812 L 33 21 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 21 L 355 21 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 21 L 355 220.007812 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 90.226562 220.007812 L 90.226562 216.785156 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="84.725786" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="90.28731" y="232.006944"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 201.996094 220.007812 L 201.996094 216.785156 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="193.49443" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="199.055953" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="204.617477" y="232.006944"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 313.761719 220.007812 L 313.761719 216.785156 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="302.763074" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="308.324597" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="313.88612" y="232.006944"/>
+  <use xlink:href="#glyph0-2" x="319.447644" y="232.006944"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 45.75 220.007812 L 45.75 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 56.578125 220.007812 L 56.578125 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 65.429688 220.007812 L 65.429688 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 72.914062 220.007812 L 72.914062 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 79.394531 220.007812 L 79.394531 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 85.113281 220.007812 L 85.113281 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 123.871094 220.007812 L 123.871094 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 143.554688 220.007812 L 143.554688 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 157.515625 220.007812 L 157.515625 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 168.347656 220.007812 L 168.347656 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 177.199219 220.007812 L 177.199219 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 184.679688 220.007812 L 184.679688 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 191.164062 220.007812 L 191.164062 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 196.878906 220.007812 L 196.878906 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 235.640625 220.007812 L 235.640625 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 255.320312 220.007812 L 255.320312 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 269.285156 220.007812 L 269.285156 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 280.117188 220.007812 L 280.117188 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 288.96875 220.007812 L 288.96875 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 296.449219 220.007812 L 296.449219 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 302.929688 220.007812 L 302.929688 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 308.648438 220.007812 L 308.648438 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 347.410156 220.007812 L 347.410156 218.398438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 180.734375 L 36.21875 180.734375 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-3" x="24" y="183.235493"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 146.960938 L 36.21875 146.960938 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-4" x="24" y="149.45964"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 121.410156 L 36.21875 121.410156 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="19" y="123.909193"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="123.909193"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 95.859375 L 36.21875 95.859375 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-3" x="19" y="98.358747"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="98.358747"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 62.082031 L 36.21875 62.082031 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-4" x="19" y="64.582894"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="64.582894"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 206.285156 L 34.609375 206.285156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 165.789062 L 34.609375 165.789062 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 155.183594 L 34.609375 155.183594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 140.238281 L 34.609375 140.238281 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 134.558594 L 34.609375 134.558594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 129.632812 L 34.609375 129.632812 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 125.292969 L 34.609375 125.292969 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 80.914062 L 34.609375 80.914062 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 70.308594 L 34.609375 70.308594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 33 36.53125 L 34.609375 36.53125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 90.226562 21 L 90.226562 24.21875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 201.996094 21 L 201.996094 24.21875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 313.761719 21 L 313.761719 24.21875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 45.75 21 L 45.75 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 56.578125 21 L 56.578125 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 65.429688 21 L 65.429688 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 72.914062 21 L 72.914062 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 79.394531 21 L 79.394531 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 85.113281 21 L 85.113281 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 123.871094 21 L 123.871094 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 143.554688 21 L 143.554688 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 157.515625 21 L 157.515625 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 168.347656 21 L 168.347656 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 177.199219 21 L 177.199219 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 184.679688 21 L 184.679688 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 191.164062 21 L 191.164062 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 196.878906 21 L 196.878906 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 235.640625 21 L 235.640625 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 255.320312 21 L 255.320312 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 269.285156 21 L 269.285156 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 280.117188 21 L 280.117188 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 288.96875 21 L 288.96875 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 296.449219 21 L 296.449219 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 302.929688 21 L 302.929688 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 308.648438 21 L 308.648438 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 347.410156 21 L 347.410156 22.609375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 180.734375 L 351.78125 180.734375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 146.960938 L 351.78125 146.960938 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 121.410156 L 351.78125 121.410156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 95.859375 L 351.78125 95.859375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 62.082031 L 351.78125 62.082031 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 206.285156 L 353.390625 206.285156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 165.789062 L 353.390625 165.789062 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 155.183594 L 353.390625 155.183594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 140.238281 L 353.390625 140.238281 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 134.558594 L 353.390625 134.558594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 129.632812 L 353.390625 129.632812 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 125.292969 L 353.390625 125.292969 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 80.914062 L 353.390625 80.914062 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 70.308594 L 353.390625 70.308594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 36.53125 L 353.390625 36.53125 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-5" x="172.5" y="251.006944"/>
+  <use xlink:href="#glyph0-6" x="178.608398" y="251.006944"/>
+  <use xlink:href="#glyph0-7" x="184.169922" y="251.006944"/>
+  <use xlink:href="#glyph0-8" x="189.731445" y="251.006944"/>
+  <use xlink:href="#glyph0-9" x="194.731445" y="251.006944"/>
+  <use xlink:href="#glyph0-10" x="197.509766" y="251.006944"/>
+  <use xlink:href="#glyph0-11" x="199.731445" y="251.006944"/>
+  <use xlink:href="#glyph0-7" x="205.292969" y="251.006944"/>
+  <use xlink:href="#glyph0-12" x="210.854492" y="251.006944"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph1-1" x="9" y="140.003472"/>
+  <use xlink:href="#glyph1-2" x="9" y="133.33355"/>
+  <use xlink:href="#glyph1-3" x="9" y="127.772027"/>
+  <use xlink:href="#glyph1-4" x="9" y="122.772027"/>
+  <use xlink:href="#glyph1-5" x="9" y="117.210503"/>
+  <use xlink:href="#glyph1-6" x="9" y="111.64898"/>
+  <use xlink:href="#glyph1-7" x="9" y="106.087457"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph2-1" x="122" y="12"/>
+  <use xlink:href="#glyph2-2" x="130.666016" y="12"/>
+  <use xlink:href="#glyph2-3" x="137.339844" y="12"/>
+  <use xlink:href="#glyph2-4" x="147.335938" y="12"/>
+  <use xlink:href="#glyph2-5" x="154.009766" y="12"/>
+  <use xlink:href="#glyph2-6" x="156.675781" y="12"/>
+  <use xlink:href="#glyph2-7" x="159.341797" y="12"/>
+  <use xlink:href="#glyph2-8" x="166.015625" y="12"/>
+  <use xlink:href="#glyph2-5" x="169.349609" y="12"/>
+  <use xlink:href="#glyph2-2" x="172.015625" y="12"/>
+  <use xlink:href="#glyph2-9" x="178.689453" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph2-8" x="188" y="12"/>
+  <use xlink:href="#glyph2-5" x="191.333984" y="12"/>
+  <use xlink:href="#glyph2-3" x="194" y="12"/>
+  <use xlink:href="#glyph2-10" x="203.996094" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph2-2" x="214" y="12"/>
+  <use xlink:href="#glyph2-11" x="220.673828" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph2-3" x="227" y="12"/>
+  <use xlink:href="#glyph2-2" x="236.996094" y="12"/>
+  <use xlink:href="#glyph2-12" x="243.669922" y="12"/>
+  <use xlink:href="#glyph2-13" x="250.34375" y="12"/>
+  <use xlink:href="#glyph2-6" x="257.017578" y="12"/>
+  <use xlink:href="#glyph2-10" x="259.683594" y="12"/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 382.148438 116 C 382.148438 115.164062 381.816406 114.363281 381.226562 113.773438 C 380.636719 113.183594 379.835938 112.851562 379 112.851562 C 378.164062 112.851562 377.363281 113.183594 376.773438 113.773438 C 376.183594 114.363281 375.851562 115.164062 375.851562 116 C 375.851562 116.835938 376.183594 117.636719 376.773438 118.226562 C 377.363281 118.816406 378.164062 119.148438 379 119.148438 C 379.835938 119.148438 380.636719 118.816406 381.226562 118.226562 C 381.816406 117.636719 382.148438 116.835938 382.148438 116 Z "/>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph3-1" x="391" y="120.28418"/>
+  <use xlink:href="#glyph3-2" x="399.670898" y="120.28418"/>
+  <use xlink:href="#glyph3-2" x="406.901367" y="120.28418"/>
+  <use xlink:href="#glyph3-3" x="414.131836" y="120.28418"/>
+  <use xlink:href="#glyph3-4" x="420.631836" y="120.28418"/>
+</g>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph3-5" x="424" y="120.28418"/>
+</g>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph3-6" x="428" y="120.28418"/>
+  <use xlink:href="#glyph3-7" x="436.670898" y="120.28418"/>
+  <use xlink:href="#glyph3-4" x="443.170898" y="120.28418"/>
+  <use xlink:href="#glyph3-8" x="446.783203" y="120.28418"/>
+  <use xlink:href="#glyph3-2" x="454.013672" y="120.28418"/>
+  <use xlink:href="#glyph3-9" x="461.244141" y="120.28418"/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 382.148438 139 C 382.148438 138.164062 381.816406 137.363281 381.226562 136.773438 C 380.636719 136.183594 379.835938 135.851562 379 135.851562 C 378.164062 135.851562 377.363281 136.183594 376.773438 136.773438 C 376.183594 137.363281 375.851562 138.164062 375.851562 139 C 375.851562 139.835938 376.183594 140.636719 376.773438 141.226562 C 377.363281 141.816406 378.164062 142.148438 379 142.148438 C 379.835938 142.148438 380.636719 141.816406 381.226562 141.226562 C 381.816406 140.636719 382.148438 139.835938 382.148438 139 Z "/>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph3-10" x="391" y="143.28418"/>
+  <use xlink:href="#glyph3-7" x="398.230469" y="143.28418"/>
+  <use xlink:href="#glyph3-11" x="404.730469" y="143.28418"/>
+  <use xlink:href="#glyph3-12" x="411.960938" y="143.28418"/>
+  <use xlink:href="#glyph3-9" x="414.849609" y="143.28418"/>
+  <use xlink:href="#glyph3-13" x="422.080078" y="143.28418"/>
+  <use xlink:href="#glyph3-14" x="429.310547" y="143.28418"/>
+  <use xlink:href="#glyph3-14" x="436.541016" y="143.28418"/>
+</g>
+</g>
+</svg>
diff --git a/pybind11/docs/pybind11_vs_boost_python2.png b/pybind11/docs/pybind11_vs_boost_python2.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f17272c50663957d6ae6d8e23fdd5a15757e71f
Binary files /dev/null and b/pybind11/docs/pybind11_vs_boost_python2.png differ
diff --git a/pybind11/docs/pybind11_vs_boost_python2.svg b/pybind11/docs/pybind11_vs_boost_python2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..5ed6530ca112cbe643d5dd6d6fde385c4edea6b5
--- /dev/null
+++ b/pybind11/docs/pybind11_vs_boost_python2.svg
@@ -0,0 +1,427 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="468pt" height="246pt" viewBox="0 0 468 246" version="1.1">
+<defs>
+<g>
+<symbol overflow="visible" id="glyph0-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph0-1">
+<path style="stroke:none;" d="M 3.726562 0 L 2.847656 0 L 2.847656 -5.601562 C 2.636719 -5.398438 2.359375 -5.195312 2.015625 -4.996094 C 1.671875 -4.792969 1.363281 -4.640625 1.089844 -4.539062 L 1.089844 -5.390625 C 1.582031 -5.621094 2.011719 -5.902344 2.378906 -6.230469 C 2.746094 -6.558594 3.007812 -6.878906 3.160156 -7.1875 L 3.726562 -7.1875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-2">
+<path style="stroke:none;" d="M 0.414062 -3.53125 C 0.414062 -4.375 0.503906 -5.058594 0.675781 -5.574219 C 0.851562 -6.089844 1.109375 -6.488281 1.453125 -6.765625 C 1.796875 -7.046875 2.226562 -7.1875 2.75 -7.1875 C 3.132812 -7.1875 3.46875 -7.109375 3.757812 -6.957031 C 4.046875 -6.800781 4.289062 -6.578125 4.476562 -6.285156 C 4.664062 -5.996094 4.8125 -5.640625 4.921875 -5.222656 C 5.03125 -4.804688 5.082031 -4.238281 5.082031 -3.53125 C 5.082031 -2.691406 4.996094 -2.011719 4.824219 -1.496094 C 4.652344 -0.980469 4.394531 -0.582031 4.050781 -0.300781 C 3.707031 -0.0195312 3.273438 0.121094 2.75 0.121094 C 2.058594 0.121094 1.515625 -0.125 1.125 -0.621094 C 0.652344 -1.214844 0.414062 -2.1875 0.414062 -3.53125 Z M 1.320312 -3.53125 C 1.320312 -2.355469 1.457031 -1.574219 1.730469 -1.183594 C 2.007812 -0.796875 2.34375 -0.601562 2.75 -0.601562 C 3.152344 -0.601562 3.492188 -0.796875 3.765625 -1.1875 C 4.042969 -1.578125 4.179688 -2.359375 4.179688 -3.53125 C 4.179688 -4.710938 4.042969 -5.492188 3.765625 -5.878906 C 3.492188 -6.265625 3.148438 -6.460938 2.738281 -6.460938 C 2.335938 -6.460938 2.011719 -6.289062 1.773438 -5.945312 C 1.46875 -5.511719 1.320312 -4.707031 1.320312 -3.53125 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-3">
+<path style="stroke:none;" d="M 0.820312 0 L 0.820312 -7.15625 L 5.648438 -7.15625 L 5.648438 -6.3125 L 1.765625 -6.3125 L 1.765625 -4.097656 L 5.125 -4.097656 L 5.125 -3.25 L 1.765625 -3.25 L 1.765625 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-4">
+<path style="stroke:none;" d="M 4.058594 0 L 4.058594 -0.761719 C 3.65625 -0.175781 3.105469 0.117188 2.414062 0.117188 C 2.105469 0.117188 1.820312 0.0585938 1.554688 -0.0585938 C 1.289062 -0.175781 1.09375 -0.324219 0.964844 -0.5 C 0.835938 -0.679688 0.746094 -0.894531 0.695312 -1.152344 C 0.65625 -1.324219 0.640625 -1.597656 0.640625 -1.972656 L 0.640625 -5.1875 L 1.519531 -5.1875 L 1.519531 -2.308594 C 1.519531 -1.851562 1.535156 -1.542969 1.570312 -1.382812 C 1.625 -1.152344 1.746094 -0.96875 1.921875 -0.835938 C 2.101562 -0.703125 2.324219 -0.640625 2.585938 -0.640625 C 2.851562 -0.640625 3.097656 -0.707031 3.328125 -0.84375 C 3.5625 -0.976562 3.726562 -1.160156 3.820312 -1.394531 C 3.917969 -1.625 3.964844 -1.964844 3.964844 -2.40625 L 3.964844 -5.1875 L 4.84375 -5.1875 L 4.84375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-5">
+<path style="stroke:none;" d="M 0.660156 0 L 0.660156 -5.1875 L 1.449219 -5.1875 L 1.449219 -4.449219 C 1.832031 -5.019531 2.382812 -5.304688 3.101562 -5.304688 C 3.414062 -5.304688 3.699219 -5.246094 3.960938 -5.132812 C 4.222656 -5.023438 4.421875 -4.875 4.550781 -4.691406 C 4.679688 -4.507812 4.773438 -4.292969 4.824219 -4.042969 C 4.855469 -3.878906 4.875 -3.59375 4.875 -3.1875 L 4.875 0 L 3.992188 0 L 3.992188 -3.15625 C 3.992188 -3.511719 3.960938 -3.78125 3.890625 -3.957031 C 3.824219 -4.132812 3.703125 -4.277344 3.527344 -4.382812 C 3.351562 -4.488281 3.148438 -4.539062 2.914062 -4.539062 C 2.539062 -4.539062 2.21875 -4.421875 1.945312 -4.183594 C 1.671875 -3.945312 1.539062 -3.496094 1.539062 -2.832031 L 1.539062 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-6">
+<path style="stroke:none;" d="M 4.042969 -1.898438 L 4.90625 -1.789062 C 4.8125 -1.191406 4.570312 -0.726562 4.183594 -0.386719 C 3.792969 -0.0507812 3.316406 0.117188 2.75 0.117188 C 2.039062 0.117188 1.46875 -0.113281 1.039062 -0.578125 C 0.605469 -1.042969 0.390625 -1.707031 0.390625 -2.574219 C 0.390625 -3.132812 0.484375 -3.625 0.667969 -4.042969 C 0.855469 -4.460938 1.136719 -4.777344 1.515625 -4.988281 C 1.894531 -5.199219 2.308594 -5.304688 2.753906 -5.304688 C 3.316406 -5.304688 3.777344 -5.160156 4.136719 -4.875 C 4.492188 -4.589844 4.722656 -4.1875 4.824219 -3.664062 L 3.96875 -3.53125 C 3.886719 -3.878906 3.746094 -4.140625 3.539062 -4.316406 C 3.332031 -4.492188 3.082031 -4.578125 2.789062 -4.578125 C 2.34375 -4.578125 1.984375 -4.421875 1.710938 -4.105469 C 1.433594 -3.789062 1.292969 -3.285156 1.292969 -2.597656 C 1.292969 -1.902344 1.425781 -1.394531 1.695312 -1.078125 C 1.960938 -0.761719 2.308594 -0.605469 2.738281 -0.605469 C 3.085938 -0.605469 3.371094 -0.710938 3.601562 -0.921875 C 3.835938 -1.132812 3.980469 -1.460938 4.042969 -1.898438 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-7">
+<path style="stroke:none;" d="M 2.578125 -0.785156 L 2.703125 -0.0078125 C 2.457031 0.0429688 2.234375 0.0703125 2.039062 0.0703125 C 1.722656 0.0703125 1.476562 0.0195312 1.296875 -0.0820312 C 1.121094 -0.183594 1 -0.316406 0.929688 -0.480469 C 0.855469 -0.644531 0.820312 -0.992188 0.820312 -1.519531 L 0.820312 -4.5 L 0.175781 -4.5 L 0.175781 -5.1875 L 0.820312 -5.1875 L 0.820312 -6.46875 L 1.695312 -6.996094 L 1.695312 -5.1875 L 2.578125 -5.1875 L 2.578125 -4.5 L 1.695312 -4.5 L 1.695312 -1.46875 C 1.695312 -1.21875 1.710938 -1.058594 1.742188 -0.984375 C 1.773438 -0.914062 1.820312 -0.859375 1.890625 -0.816406 C 1.960938 -0.773438 2.0625 -0.75 2.191406 -0.75 C 2.289062 -0.75 2.417969 -0.761719 2.578125 -0.785156 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-8">
+<path style="stroke:none;" d="M 0.664062 -6.148438 L 0.664062 -7.15625 L 1.542969 -7.15625 L 1.542969 -6.148438 Z M 0.664062 0 L 0.664062 -5.1875 L 1.542969 -5.1875 L 1.542969 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-9">
+<path style="stroke:none;" d="M 0.332031 -2.59375 C 0.332031 -3.554688 0.597656 -4.265625 1.132812 -4.726562 C 1.578125 -5.109375 2.121094 -5.304688 2.765625 -5.304688 C 3.476562 -5.304688 4.058594 -5.070312 4.511719 -4.601562 C 4.964844 -4.132812 5.191406 -3.488281 5.191406 -2.664062 C 5.191406 -2 5.089844 -1.472656 4.890625 -1.089844 C 4.691406 -0.707031 4.398438 -0.410156 4.015625 -0.199219 C 3.632812 0.0117188 3.214844 0.117188 2.765625 0.117188 C 2.039062 0.117188 1.449219 -0.117188 1.003906 -0.582031 C 0.554688 -1.046875 0.332031 -1.71875 0.332031 -2.59375 Z M 1.234375 -2.59375 C 1.234375 -1.929688 1.378906 -1.429688 1.671875 -1.101562 C 1.960938 -0.769531 2.324219 -0.605469 2.765625 -0.605469 C 3.199219 -0.605469 3.5625 -0.773438 3.851562 -1.101562 C 4.140625 -1.433594 4.289062 -1.941406 4.289062 -2.621094 C 4.289062 -3.261719 4.140625 -3.75 3.851562 -4.078125 C 3.558594 -4.410156 3.195312 -4.574219 2.765625 -4.574219 C 2.324219 -4.574219 1.960938 -4.410156 1.671875 -4.082031 C 1.382812 -3.753906 1.234375 -3.257812 1.234375 -2.59375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph0-10">
+<path style="stroke:none;" d="M 0.308594 -1.546875 L 1.175781 -1.683594 C 1.226562 -1.335938 1.363281 -1.070312 1.585938 -0.882812 C 1.808594 -0.699219 2.117188 -0.605469 2.519531 -0.605469 C 2.921875 -0.605469 3.222656 -0.6875 3.417969 -0.851562 C 3.613281 -1.015625 3.710938 -1.210938 3.710938 -1.429688 C 3.710938 -1.628906 3.625 -1.785156 3.453125 -1.898438 C 3.332031 -1.976562 3.03125 -2.078125 2.554688 -2.195312 C 1.910156 -2.359375 1.460938 -2.5 1.214844 -2.621094 C 0.964844 -2.738281 0.777344 -2.902344 0.648438 -3.113281 C 0.519531 -3.324219 0.453125 -3.554688 0.453125 -3.808594 C 0.453125 -4.039062 0.507812 -4.253906 0.613281 -4.449219 C 0.71875 -4.648438 0.863281 -4.8125 1.046875 -4.941406 C 1.183594 -5.042969 1.367188 -5.128906 1.605469 -5.199219 C 1.839844 -5.269531 2.09375 -5.304688 2.363281 -5.304688 C 2.769531 -5.304688 3.128906 -5.242188 3.433594 -5.125 C 3.742188 -5.007812 3.96875 -4.851562 4.117188 -4.652344 C 4.261719 -4.453125 4.363281 -4.183594 4.417969 -3.847656 L 3.558594 -3.730469 C 3.519531 -3.996094 3.40625 -4.207031 3.21875 -4.355469 C 3.03125 -4.503906 2.769531 -4.578125 2.425781 -4.578125 C 2.023438 -4.578125 1.734375 -4.511719 1.5625 -4.378906 C 1.390625 -4.246094 1.304688 -4.089844 1.304688 -3.910156 C 1.304688 -3.796875 1.339844 -3.695312 1.410156 -3.601562 C 1.484375 -3.507812 1.59375 -3.429688 1.75 -3.367188 C 1.835938 -3.335938 2.09375 -3.261719 2.523438 -3.144531 C 3.144531 -2.976562 3.578125 -2.84375 3.824219 -2.738281 C 4.070312 -2.632812 4.265625 -2.476562 4.40625 -2.273438 C 4.546875 -2.074219 4.613281 -1.824219 4.613281 -1.523438 C 4.613281 -1.230469 4.527344 -0.953125 4.359375 -0.695312 C 4.1875 -0.4375 3.941406 -0.238281 3.617188 -0.09375 C 3.296875 0.046875 2.929688 0.117188 2.523438 0.117188 C 1.851562 0.117188 1.335938 -0.0234375 0.984375 -0.304688 C 0.632812 -0.582031 0.40625 -0.996094 0.308594 -1.546875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph1-1">
+<path style="stroke:none;" d="M 0.375 -1.6875 L 1.203125 -1.757812 C 1.265625 -1.351562 1.410156 -1.050781 1.632812 -0.847656 C 1.855469 -0.644531 2.125 -0.539062 2.445312 -0.539062 C 2.824219 -0.539062 3.148438 -0.683594 3.410156 -0.972656 C 3.671875 -1.257812 3.804688 -1.640625 3.804688 -2.113281 C 3.804688 -2.566406 3.679688 -2.921875 3.425781 -3.179688 C 3.171875 -3.441406 2.839844 -3.574219 2.429688 -3.574219 C 2.175781 -3.574219 1.945312 -3.515625 1.742188 -3.398438 C 1.535156 -3.28125 1.375 -3.132812 1.257812 -2.949219 L 0.515625 -3.046875 L 1.136719 -6.355469 L 4.34375 -6.355469 L 4.34375 -5.597656 L 1.769531 -5.597656 L 1.421875 -3.867188 C 1.808594 -4.136719 2.214844 -4.273438 2.640625 -4.273438 C 3.203125 -4.273438 3.679688 -4.078125 4.066406 -3.6875 C 4.453125 -3.296875 4.644531 -2.796875 4.644531 -2.183594 C 4.644531 -1.601562 4.476562 -1.097656 4.136719 -0.671875 C 3.722656 -0.152344 3.15625 0.109375 2.445312 0.109375 C 1.859375 0.109375 1.378906 -0.0546875 1.007812 -0.382812 C 0.636719 -0.710938 0.425781 -1.144531 0.375 -1.6875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-2">
+<path style="stroke:none;" d="M 4.476562 -4.863281 L 3.691406 -4.804688 C 3.621094 -5.113281 3.523438 -5.339844 3.390625 -5.480469 C 3.179688 -5.707031 2.914062 -5.820312 2.601562 -5.820312 C 2.351562 -5.820312 2.128906 -5.75 1.9375 -5.609375 C 1.6875 -5.425781 1.492188 -5.160156 1.347656 -4.8125 C 1.203125 -4.464844 1.132812 -3.96875 1.125 -3.320312 C 1.316406 -3.613281 1.546875 -3.828125 1.824219 -3.96875 C 2.097656 -4.109375 2.386719 -4.179688 2.6875 -4.179688 C 3.214844 -4.179688 3.664062 -3.984375 4.035156 -3.597656 C 4.40625 -3.210938 4.59375 -2.707031 4.59375 -2.09375 C 4.59375 -1.6875 4.503906 -1.3125 4.332031 -0.964844 C 4.15625 -0.617188 3.917969 -0.351562 3.613281 -0.167969 C 3.308594 0.015625 2.960938 0.109375 2.574219 0.109375 C 1.914062 0.109375 1.378906 -0.132812 0.960938 -0.617188 C 0.546875 -1.101562 0.339844 -1.902344 0.339844 -3.015625 C 0.339844 -4.261719 0.570312 -5.164062 1.027344 -5.730469 C 1.429688 -6.222656 1.96875 -6.46875 2.648438 -6.46875 C 3.15625 -6.46875 3.570312 -6.328125 3.894531 -6.042969 C 4.21875 -5.757812 4.414062 -5.367188 4.476562 -4.863281 Z M 1.25 -2.085938 C 1.25 -1.8125 1.304688 -1.554688 1.421875 -1.304688 C 1.539062 -1.054688 1.699219 -0.867188 1.90625 -0.734375 C 2.113281 -0.605469 2.332031 -0.539062 2.5625 -0.539062 C 2.894531 -0.539062 3.183594 -0.675781 3.421875 -0.945312 C 3.664062 -1.214844 3.785156 -1.582031 3.785156 -2.042969 C 3.785156 -2.488281 3.664062 -2.839844 3.429688 -3.097656 C 3.191406 -3.351562 2.890625 -3.480469 2.53125 -3.480469 C 2.171875 -3.480469 1.871094 -3.351562 1.621094 -3.097656 C 1.371094 -2.839844 1.25 -2.503906 1.25 -2.085938 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph1-3">
+<path style="stroke:none;" d="M 0.425781 -5.597656 L 0.425781 -6.359375 L 4.597656 -6.359375 L 4.597656 -5.742188 C 4.1875 -5.304688 3.78125 -4.726562 3.378906 -4.003906 C 2.976562 -3.28125 2.664062 -2.535156 2.445312 -1.769531 C 2.285156 -1.230469 2.183594 -0.640625 2.140625 0 L 1.328125 0 C 1.335938 -0.507812 1.4375 -1.117188 1.625 -1.835938 C 1.816406 -2.554688 2.089844 -3.246094 2.445312 -3.914062 C 2.800781 -4.578125 3.179688 -5.140625 3.582031 -5.597656 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph2-1">
+<path style="stroke:none;" d="M 0 -0.734375 L -7.15625 -0.734375 L -7.15625 -3.417969 C -7.15625 -3.964844 -7.085938 -4.402344 -6.941406 -4.734375 C -6.796875 -5.0625 -6.574219 -5.324219 -6.273438 -5.511719 C -5.972656 -5.699219 -5.65625 -5.789062 -5.328125 -5.789062 C -5.023438 -5.789062 -4.734375 -5.707031 -4.460938 -5.542969 C -4.191406 -5.375 -3.976562 -5.125 -3.808594 -4.789062 C -3.679688 -5.222656 -3.464844 -5.554688 -3.160156 -5.789062 C -2.851562 -6.023438 -2.492188 -6.136719 -2.074219 -6.136719 C -1.738281 -6.136719 -1.429688 -6.066406 -1.140625 -5.925781 C -0.851562 -5.785156 -0.628906 -5.609375 -0.472656 -5.398438 C -0.316406 -5.191406 -0.199219 -4.929688 -0.121094 -4.617188 C -0.0390625 -4.304688 0 -3.917969 0 -3.460938 Z M -4.148438 -1.679688 L -4.148438 -3.226562 C -4.148438 -3.648438 -4.179688 -3.949219 -4.234375 -4.132812 C -4.304688 -4.371094 -4.421875 -4.554688 -4.589844 -4.675781 C -4.757812 -4.796875 -4.964844 -4.859375 -5.214844 -4.859375 C -5.453125 -4.859375 -5.660156 -4.800781 -5.84375 -4.6875 C -6.023438 -4.574219 -6.148438 -4.410156 -6.214844 -4.199219 C -6.28125 -3.988281 -6.3125 -3.625 -6.3125 -3.109375 L -6.3125 -1.679688 Z M -0.84375 -1.679688 L -0.84375 -3.460938 C -0.84375 -3.765625 -0.855469 -3.984375 -0.878906 -4.105469 C -0.917969 -4.324219 -0.984375 -4.507812 -1.074219 -4.652344 C -1.164062 -4.800781 -1.296875 -4.921875 -1.472656 -5.015625 C -1.648438 -5.109375 -1.847656 -5.15625 -2.074219 -5.15625 C -2.34375 -5.15625 -2.574219 -5.085938 -2.769531 -4.953125 C -2.96875 -4.816406 -3.105469 -4.625 -3.1875 -4.382812 C -3.265625 -4.140625 -3.304688 -3.789062 -3.304688 -3.335938 L -3.304688 -1.679688 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-2">
+<path style="stroke:none;" d="M 1.996094 -0.621094 L 1.171875 -0.523438 C 1.222656 -0.714844 1.25 -0.882812 1.25 -1.023438 C 1.25 -1.21875 1.21875 -1.375 1.152344 -1.492188 C 1.085938 -1.609375 0.996094 -1.707031 0.878906 -1.78125 C 0.789062 -1.835938 0.574219 -1.925781 0.226562 -2.050781 C 0.175781 -2.066406 0.105469 -2.09375 0.0078125 -2.128906 L -5.1875 -0.160156 L -5.1875 -1.109375 L -2.183594 -2.1875 C -1.800781 -2.328125 -1.402344 -2.453125 -0.980469 -2.5625 C -1.382812 -2.664062 -1.777344 -2.785156 -2.164062 -2.925781 L -5.1875 -4.03125 L -5.1875 -4.914062 L 0.0859375 -2.9375 C 0.65625 -2.726562 1.050781 -2.5625 1.265625 -2.445312 C 1.554688 -2.289062 1.765625 -2.109375 1.902344 -1.910156 C 2.039062 -1.707031 2.105469 -1.464844 2.105469 -1.1875 C 2.105469 -1.015625 2.070312 -0.828125 1.996094 -0.621094 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-3">
+<path style="stroke:none;" d="M -0.785156 -2.578125 L -0.0078125 -2.703125 C 0.0429688 -2.457031 0.0703125 -2.234375 0.0703125 -2.039062 C 0.0703125 -1.722656 0.0195312 -1.476562 -0.0820312 -1.296875 C -0.183594 -1.121094 -0.316406 -1 -0.480469 -0.929688 C -0.644531 -0.855469 -0.992188 -0.820312 -1.519531 -0.820312 L -4.5 -0.820312 L -4.5 -0.175781 L -5.1875 -0.175781 L -5.1875 -0.820312 L -6.46875 -0.820312 L -6.996094 -1.695312 L -5.1875 -1.695312 L -5.1875 -2.578125 L -4.5 -2.578125 L -4.5 -1.695312 L -1.46875 -1.695312 C -1.21875 -1.695312 -1.058594 -1.710938 -0.984375 -1.742188 C -0.914062 -1.773438 -0.859375 -1.820312 -0.816406 -1.890625 C -0.773438 -1.960938 -0.75 -2.0625 -0.75 -2.191406 C -0.75 -2.289062 -0.761719 -2.417969 -0.785156 -2.578125 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-4">
+<path style="stroke:none;" d="M -1.671875 -4.210938 L -1.558594 -5.117188 C -1.027344 -4.972656 -0.617188 -4.707031 -0.320312 -4.320312 C -0.0273438 -3.933594 0.117188 -3.4375 0.117188 -2.835938 C 0.117188 -2.078125 -0.117188 -1.476562 -0.582031 -1.03125 C -1.050781 -0.585938 -1.707031 -0.367188 -2.546875 -0.367188 C -3.421875 -0.367188 -4.097656 -0.589844 -4.578125 -1.039062 C -5.0625 -1.488281 -5.304688 -2.070312 -5.304688 -2.789062 C -5.304688 -3.480469 -5.066406 -4.046875 -4.59375 -4.488281 C -4.121094 -4.925781 -3.457031 -5.148438 -2.601562 -5.148438 C -2.550781 -5.148438 -2.472656 -5.144531 -2.367188 -5.140625 L -2.367188 -1.273438 C -1.796875 -1.304688 -1.363281 -1.46875 -1.058594 -1.757812 C -0.757812 -2.046875 -0.605469 -2.410156 -0.605469 -2.84375 C -0.605469 -3.164062 -0.691406 -3.4375 -0.859375 -3.667969 C -1.027344 -3.894531 -1.296875 -4.074219 -1.671875 -4.210938 Z M -3.089844 -1.324219 L -3.089844 -4.21875 C -3.527344 -4.179688 -3.855469 -4.070312 -4.070312 -3.886719 C -4.410156 -3.605469 -4.578125 -3.242188 -4.578125 -2.796875 C -4.578125 -2.394531 -4.445312 -2.054688 -4.175781 -1.78125 C -3.90625 -1.503906 -3.542969 -1.351562 -3.089844 -1.324219 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-5">
+<path style="stroke:none;" d="M -1.546875 -0.308594 L -1.683594 -1.175781 C -1.335938 -1.226562 -1.070312 -1.363281 -0.882812 -1.585938 C -0.699219 -1.808594 -0.605469 -2.117188 -0.605469 -2.519531 C -0.605469 -2.921875 -0.6875 -3.222656 -0.851562 -3.417969 C -1.015625 -3.613281 -1.210938 -3.710938 -1.429688 -3.710938 C -1.628906 -3.710938 -1.785156 -3.625 -1.898438 -3.453125 C -1.976562 -3.332031 -2.078125 -3.03125 -2.195312 -2.554688 C -2.359375 -1.910156 -2.5 -1.460938 -2.621094 -1.214844 C -2.738281 -0.964844 -2.902344 -0.777344 -3.113281 -0.648438 C -3.324219 -0.519531 -3.554688 -0.453125 -3.808594 -0.453125 C -4.039062 -0.453125 -4.253906 -0.507812 -4.449219 -0.613281 C -4.648438 -0.71875 -4.8125 -0.863281 -4.941406 -1.046875 C -5.042969 -1.183594 -5.128906 -1.367188 -5.199219 -1.605469 C -5.269531 -1.839844 -5.304688 -2.09375 -5.304688 -2.363281 C -5.304688 -2.769531 -5.242188 -3.128906 -5.125 -3.433594 C -5.007812 -3.742188 -4.851562 -3.96875 -4.652344 -4.117188 C -4.453125 -4.261719 -4.183594 -4.363281 -3.847656 -4.417969 L -3.730469 -3.558594 C -3.996094 -3.519531 -4.207031 -3.40625 -4.355469 -3.21875 C -4.503906 -3.03125 -4.578125 -2.769531 -4.578125 -2.425781 C -4.578125 -2.023438 -4.511719 -1.734375 -4.378906 -1.5625 C -4.246094 -1.390625 -4.089844 -1.304688 -3.910156 -1.304688 C -3.796875 -1.304688 -3.695312 -1.339844 -3.601562 -1.410156 C -3.507812 -1.484375 -3.429688 -1.59375 -3.367188 -1.75 C -3.335938 -1.835938 -3.261719 -2.09375 -3.144531 -2.523438 C -2.976562 -3.144531 -2.84375 -3.578125 -2.738281 -3.824219 C -2.632812 -4.070312 -2.476562 -4.265625 -2.273438 -4.40625 C -2.074219 -4.546875 -1.824219 -4.613281 -1.523438 -4.613281 C -1.230469 -4.613281 -0.953125 -4.527344 -0.695312 -4.359375 C -0.4375 -4.1875 -0.238281 -3.941406 -0.09375 -3.617188 C 0.046875 -3.296875 0.117188 -2.929688 0.117188 -2.523438 C 0.117188 -1.851562 -0.0234375 -1.335938 -0.304688 -0.984375 C -0.582031 -0.632812 -0.996094 -0.40625 -1.546875 -0.308594 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph3-1">
+<path style="stroke:none;" d="M 0.984375 0 L 0.984375 -8.589844 L 6.78125 -8.589844 L 6.78125 -7.578125 L 2.121094 -7.578125 L 2.121094 -4.914062 L 6.152344 -4.914062 L 6.152344 -3.902344 L 2.121094 -3.902344 L 2.121094 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-2">
+<path style="stroke:none;" d="M 0.796875 -7.375 L 0.796875 -8.589844 L 1.851562 -8.589844 L 1.851562 -7.375 Z M 0.796875 0 L 0.796875 -6.222656 L 1.851562 -6.222656 L 1.851562 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-3">
+<path style="stroke:none;" d="M 0.765625 0 L 0.765625 -8.589844 L 1.820312 -8.589844 L 1.820312 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-4">
+<path style="stroke:none;" d="M 5.050781 -2.003906 L 6.140625 -1.867188 C 5.96875 -1.230469 5.648438 -0.738281 5.1875 -0.386719 C 4.722656 -0.0351562 4.125 0.140625 3.40625 0.140625 C 2.496094 0.140625 1.773438 -0.140625 1.238281 -0.699219 C 0.707031 -1.261719 0.4375 -2.046875 0.4375 -3.058594 C 0.4375 -4.105469 0.710938 -4.917969 1.25 -5.496094 C 1.789062 -6.074219 2.484375 -6.363281 3.34375 -6.363281 C 4.175781 -6.363281 4.859375 -6.078125 5.382812 -5.515625 C 5.910156 -4.949219 6.175781 -4.148438 6.175781 -3.125 C 6.175781 -3.0625 6.171875 -2.96875 6.171875 -2.84375 L 1.53125 -2.84375 C 1.570312 -2.160156 1.761719 -1.632812 2.109375 -1.273438 C 2.457031 -0.910156 2.890625 -0.726562 3.410156 -0.726562 C 3.796875 -0.726562 4.125 -0.828125 4.398438 -1.03125 C 4.671875 -1.234375 4.890625 -1.558594 5.050781 -2.003906 Z M 1.585938 -3.710938 L 5.0625 -3.710938 C 5.015625 -4.234375 4.882812 -4.625 4.664062 -4.886719 C 4.328125 -5.292969 3.890625 -5.496094 3.359375 -5.496094 C 2.875 -5.496094 2.464844 -5.335938 2.136719 -5.007812 C 1.804688 -4.683594 1.625 -4.25 1.585938 -3.710938 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-5">
+<path style="stroke:none;" d="M 0.367188 -1.859375 L 1.414062 -2.023438 C 1.472656 -1.605469 1.632812 -1.28125 1.902344 -1.0625 C 2.167969 -0.839844 2.542969 -0.726562 3.023438 -0.726562 C 3.507812 -0.726562 3.867188 -0.824219 4.101562 -1.023438 C 4.335938 -1.21875 4.453125 -1.453125 4.453125 -1.71875 C 4.453125 -1.957031 4.351562 -2.140625 4.140625 -2.28125 C 3.996094 -2.375 3.640625 -2.492188 3.0625 -2.636719 C 2.289062 -2.832031 1.753906 -3 1.457031 -3.144531 C 1.15625 -3.285156 0.929688 -3.484375 0.777344 -3.734375 C 0.621094 -3.988281 0.546875 -4.265625 0.546875 -4.570312 C 0.546875 -4.847656 0.609375 -5.105469 0.734375 -5.339844 C 0.863281 -5.578125 1.035156 -5.773438 1.253906 -5.929688 C 1.417969 -6.050781 1.640625 -6.152344 1.925781 -6.238281 C 2.207031 -6.320312 2.511719 -6.363281 2.835938 -6.363281 C 3.324219 -6.363281 3.753906 -6.292969 4.121094 -6.152344 C 4.492188 -6.011719 4.765625 -5.820312 4.9375 -5.582031 C 5.113281 -5.339844 5.234375 -5.019531 5.304688 -4.617188 L 4.273438 -4.476562 C 4.226562 -4.796875 4.089844 -5.046875 3.863281 -5.226562 C 3.640625 -5.40625 3.320312 -5.496094 2.914062 -5.496094 C 2.429688 -5.496094 2.082031 -5.414062 1.875 -5.257812 C 1.667969 -5.097656 1.5625 -4.90625 1.5625 -4.695312 C 1.5625 -4.558594 1.609375 -4.433594 1.695312 -4.324219 C 1.78125 -4.210938 1.914062 -4.117188 2.097656 -4.042969 C 2.203125 -4.003906 2.515625 -3.914062 3.03125 -3.773438 C 3.777344 -3.574219 4.296875 -3.410156 4.589844 -3.285156 C 4.886719 -3.15625 5.117188 -2.972656 5.285156 -2.730469 C 5.453125 -2.488281 5.539062 -2.1875 5.539062 -1.828125 C 5.539062 -1.476562 5.433594 -1.144531 5.230469 -0.835938 C 5.023438 -0.523438 4.726562 -0.285156 4.34375 -0.113281 C 3.957031 0.0546875 3.515625 0.140625 3.03125 0.140625 C 2.222656 0.140625 1.605469 -0.0273438 1.179688 -0.363281 C 0.757812 -0.699219 0.484375 -1.195312 0.367188 -1.859375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-6">
+<path style="stroke:none;" d="M 0.234375 0 L 0.234375 -0.855469 L 4.195312 -5.402344 C 3.746094 -5.378906 3.351562 -5.367188 3.007812 -5.367188 L 0.46875 -5.367188 L 0.46875 -6.222656 L 5.554688 -6.222656 L 5.554688 -5.523438 L 2.1875 -1.578125 L 1.535156 -0.855469 C 2.007812 -0.890625 2.453125 -0.90625 2.867188 -0.90625 L 5.742188 -0.90625 L 5.742188 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-7">
+<path style="stroke:none;" d="M 0.398438 -3.109375 C 0.398438 -4.261719 0.71875 -5.117188 1.359375 -5.671875 C 1.894531 -6.132812 2.546875 -6.363281 3.316406 -6.363281 C 4.171875 -6.363281 4.871094 -6.082031 5.414062 -5.523438 C 5.957031 -4.960938 6.226562 -4.1875 6.226562 -3.199219 C 6.226562 -2.398438 6.109375 -1.769531 5.867188 -1.308594 C 5.628906 -0.851562 5.277344 -0.492188 4.820312 -0.242188 C 4.359375 0.0117188 3.859375 0.140625 3.316406 0.140625 C 2.445312 0.140625 1.742188 -0.140625 1.203125 -0.695312 C 0.667969 -1.253906 0.398438 -2.0625 0.398438 -3.109375 Z M 1.484375 -3.109375 C 1.484375 -2.3125 1.65625 -1.71875 2.003906 -1.320312 C 2.351562 -0.925781 2.789062 -0.726562 3.316406 -0.726562 C 3.839844 -0.726562 4.273438 -0.925781 4.625 -1.324219 C 4.972656 -1.722656 5.144531 -2.328125 5.144531 -3.148438 C 5.144531 -3.917969 4.96875 -4.5 4.621094 -4.894531 C 4.269531 -5.292969 3.835938 -5.492188 3.316406 -5.492188 C 2.789062 -5.492188 2.351562 -5.292969 2.003906 -4.898438 C 1.65625 -4.503906 1.484375 -3.90625 1.484375 -3.109375 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-8">
+<path style="stroke:none;" d="M 1.042969 0 L 1.042969 -5.402344 L 0.109375 -5.402344 L 0.109375 -6.222656 L 1.042969 -6.222656 L 1.042969 -6.882812 C 1.042969 -7.300781 1.078125 -7.613281 1.15625 -7.816406 C 1.257812 -8.089844 1.433594 -8.3125 1.691406 -8.480469 C 1.945312 -8.652344 2.304688 -8.734375 2.765625 -8.734375 C 3.0625 -8.734375 3.390625 -8.703125 3.75 -8.632812 L 3.59375 -7.710938 C 3.375 -7.75 3.164062 -7.769531 2.96875 -7.769531 C 2.648438 -7.769531 2.421875 -7.703125 2.289062 -7.5625 C 2.15625 -7.425781 2.09375 -7.171875 2.09375 -6.796875 L 2.09375 -6.222656 L 3.304688 -6.222656 L 3.304688 -5.402344 L 2.09375 -5.402344 L 2.09375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-9">
+<path style="stroke:none;" d="M 0.789062 0 L 0.789062 -6.222656 L 1.734375 -6.222656 L 1.734375 -5.351562 C 1.929688 -5.65625 2.1875 -5.898438 2.515625 -6.085938 C 2.839844 -6.269531 3.207031 -6.363281 3.621094 -6.363281 C 4.082031 -6.363281 4.460938 -6.265625 4.753906 -6.078125 C 5.050781 -5.886719 5.257812 -5.617188 5.378906 -5.273438 C 5.871094 -6 6.511719 -6.363281 7.300781 -6.363281 C 7.917969 -6.363281 8.390625 -6.191406 8.726562 -5.851562 C 9.058594 -5.507812 9.222656 -4.984375 9.222656 -4.273438 L 9.222656 0 L 8.171875 0 L 8.171875 -3.921875 C 8.171875 -4.34375 8.140625 -4.644531 8.070312 -4.832031 C 8.003906 -5.015625 7.878906 -5.164062 7.699219 -5.28125 C 7.519531 -5.394531 7.308594 -5.449219 7.066406 -5.449219 C 6.628906 -5.449219 6.265625 -5.304688 5.976562 -5.011719 C 5.6875 -4.722656 5.542969 -4.257812 5.542969 -3.617188 L 5.542969 0 L 4.488281 0 L 4.488281 -4.042969 C 4.488281 -4.511719 4.402344 -4.863281 4.230469 -5.097656 C 4.058594 -5.332031 3.777344 -5.449219 3.386719 -5.449219 C 3.089844 -5.449219 2.816406 -5.371094 2.5625 -5.214844 C 2.3125 -5.058594 2.128906 -4.828125 2.015625 -4.53125 C 1.902344 -4.230469 1.84375 -3.796875 1.84375 -3.226562 L 1.84375 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-10">
+<path style="stroke:none;" d="M 4.828125 0 L 4.828125 -0.785156 C 4.433594 -0.167969 3.851562 0.140625 3.085938 0.140625 C 2.589844 0.140625 2.136719 0.00390625 1.71875 -0.269531 C 1.304688 -0.542969 0.980469 -0.925781 0.753906 -1.414062 C 0.523438 -1.90625 0.410156 -2.46875 0.410156 -3.105469 C 0.410156 -3.726562 0.515625 -4.289062 0.71875 -4.796875 C 0.925781 -5.300781 1.238281 -5.6875 1.652344 -5.960938 C 2.066406 -6.230469 2.53125 -6.363281 3.039062 -6.363281 C 3.414062 -6.363281 3.75 -6.285156 4.042969 -6.125 C 4.335938 -5.96875 4.574219 -5.761719 4.757812 -5.507812 L 4.757812 -8.589844 L 5.804688 -8.589844 L 5.804688 0 Z M 1.492188 -3.105469 C 1.492188 -2.308594 1.664062 -1.710938 2 -1.320312 C 2.335938 -0.925781 2.730469 -0.726562 3.1875 -0.726562 C 3.648438 -0.726562 4.039062 -0.914062 4.363281 -1.292969 C 4.683594 -1.667969 4.84375 -2.242188 4.84375 -3.015625 C 4.84375 -3.867188 4.679688 -4.492188 4.351562 -4.890625 C 4.023438 -5.289062 3.621094 -5.492188 3.140625 -5.492188 C 2.671875 -5.492188 2.28125 -5.296875 1.964844 -4.914062 C 1.652344 -4.53125 1.492188 -3.929688 1.492188 -3.105469 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph3-11">
+<path style="stroke:none;" d="M 4.867188 0 L 4.867188 -0.914062 C 4.382812 -0.210938 3.726562 0.140625 2.894531 0.140625 C 2.527344 0.140625 2.183594 0.0703125 1.867188 -0.0703125 C 1.546875 -0.210938 1.3125 -0.386719 1.15625 -0.601562 C 1.003906 -0.8125 0.894531 -1.074219 0.832031 -1.382812 C 0.789062 -1.589844 0.765625 -1.917969 0.765625 -2.367188 L 0.765625 -6.222656 L 1.820312 -6.222656 L 1.820312 -2.773438 C 1.820312 -2.222656 1.84375 -1.851562 1.886719 -1.65625 C 1.953125 -1.378906 2.09375 -1.164062 2.308594 -1.003906 C 2.523438 -0.847656 2.789062 -0.765625 3.105469 -0.765625 C 3.421875 -0.765625 3.71875 -0.847656 3.996094 -1.011719 C 4.273438 -1.171875 4.46875 -1.394531 4.585938 -1.671875 C 4.699219 -1.953125 4.757812 -2.359375 4.757812 -2.890625 L 4.757812 -6.222656 L 5.8125 -6.222656 L 5.8125 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-0">
+<path style="stroke:none;" d=""/>
+</symbol>
+<symbol overflow="visible" id="glyph4-1">
+<path style="stroke:none;" d="M 0.953125 0 L 0.953125 -9.304688 L 4.445312 -9.304688 C 5.15625 -9.304688 5.722656 -9.210938 6.152344 -9.023438 C 6.582031 -8.835938 6.921875 -8.546875 7.164062 -8.152344 C 7.40625 -7.761719 7.527344 -7.351562 7.527344 -6.925781 C 7.527344 -6.527344 7.421875 -6.152344 7.203125 -5.800781 C 6.988281 -5.449219 6.664062 -5.167969 6.226562 -4.953125 C 6.789062 -4.785156 7.222656 -4.503906 7.523438 -4.105469 C 7.828125 -3.707031 7.980469 -3.238281 7.980469 -2.699219 C 7.980469 -2.261719 7.886719 -1.855469 7.703125 -1.480469 C 7.519531 -1.105469 7.292969 -0.820312 7.019531 -0.617188 C 6.75 -0.414062 6.410156 -0.257812 6 -0.15625 C 5.59375 -0.0507812 5.09375 0 4.5 0 Z M 2.183594 -5.394531 L 4.195312 -5.394531 C 4.742188 -5.394531 5.132812 -5.429688 5.371094 -5.503906 C 5.683594 -5.597656 5.917969 -5.75 6.078125 -5.96875 C 6.238281 -6.183594 6.316406 -6.453125 6.316406 -6.78125 C 6.316406 -7.089844 6.242188 -7.359375 6.09375 -7.59375 C 5.945312 -7.828125 5.734375 -7.992188 5.460938 -8.078125 C 5.183594 -8.164062 4.710938 -8.207031 4.042969 -8.207031 L 2.183594 -8.207031 Z M 2.183594 -1.097656 L 4.5 -1.097656 C 4.898438 -1.097656 5.175781 -1.113281 5.339844 -1.140625 C 5.621094 -1.191406 5.859375 -1.277344 6.050781 -1.398438 C 6.242188 -1.515625 6.394531 -1.6875 6.519531 -1.914062 C 6.640625 -2.140625 6.703125 -2.402344 6.703125 -2.699219 C 6.703125 -3.046875 6.613281 -3.347656 6.4375 -3.601562 C 6.257812 -3.859375 6.011719 -4.039062 5.695312 -4.140625 C 5.382812 -4.246094 4.929688 -4.296875 4.335938 -4.296875 L 2.183594 -4.296875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-2">
+<path style="stroke:none;" d="M 0.429688 -3.371094 C 0.429688 -4.617188 0.777344 -5.542969 1.472656 -6.144531 C 2.050781 -6.644531 2.757812 -6.894531 3.59375 -6.894531 C 4.519531 -6.894531 5.277344 -6.589844 5.867188 -5.984375 C 6.453125 -5.375 6.746094 -4.535156 6.746094 -3.464844 C 6.746094 -2.597656 6.617188 -1.914062 6.355469 -1.417969 C 6.097656 -0.921875 5.71875 -0.535156 5.222656 -0.261719 C 4.722656 0.015625 4.179688 0.152344 3.59375 0.152344 C 2.648438 0.152344 1.886719 -0.148438 1.304688 -0.753906 C 0.722656 -1.359375 0.429688 -2.230469 0.429688 -3.371094 Z M 1.605469 -3.371094 C 1.605469 -2.507812 1.792969 -1.859375 2.171875 -1.429688 C 2.546875 -1 3.023438 -0.789062 3.59375 -0.789062 C 4.160156 -0.789062 4.632812 -1.003906 5.007812 -1.433594 C 5.382812 -1.867188 5.574219 -2.523438 5.574219 -3.410156 C 5.574219 -4.242188 5.382812 -4.875 5.003906 -5.304688 C 4.625 -5.734375 4.15625 -5.949219 3.59375 -5.949219 C 3.023438 -5.949219 2.546875 -5.734375 2.171875 -5.304688 C 1.792969 -4.878906 1.605469 -4.234375 1.605469 -3.371094 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-3">
+<path style="stroke:none;" d="M 0.398438 -2.011719 L 1.53125 -2.191406 C 1.59375 -1.738281 1.769531 -1.390625 2.058594 -1.148438 C 2.347656 -0.90625 2.753906 -0.789062 3.273438 -0.789062 C 3.800781 -0.789062 4.1875 -0.894531 4.445312 -1.109375 C 4.699219 -1.320312 4.824219 -1.570312 4.824219 -1.859375 C 4.824219 -2.117188 4.710938 -2.320312 4.488281 -2.46875 C 4.332031 -2.570312 3.941406 -2.699219 3.320312 -2.855469 C 2.480469 -3.066406 1.902344 -3.25 1.578125 -3.40625 C 1.253906 -3.558594 1.007812 -3.773438 0.839844 -4.046875 C 0.671875 -4.320312 0.589844 -4.621094 0.589844 -4.953125 C 0.589844 -5.253906 0.660156 -5.53125 0.796875 -5.785156 C 0.933594 -6.042969 1.121094 -6.253906 1.359375 -6.421875 C 1.535156 -6.554688 1.777344 -6.667969 2.085938 -6.757812 C 2.390625 -6.847656 2.722656 -6.894531 3.070312 -6.894531 C 3.601562 -6.894531 4.066406 -6.816406 4.464844 -6.664062 C 4.867188 -6.511719 5.160156 -6.304688 5.351562 -6.046875 C 5.542969 -5.785156 5.671875 -5.4375 5.746094 -5 L 4.628906 -4.851562 C 4.578125 -5.195312 4.429688 -5.46875 4.1875 -5.664062 C 3.945312 -5.859375 3.597656 -5.953125 3.15625 -5.953125 C 2.628906 -5.953125 2.253906 -5.867188 2.03125 -5.695312 C 1.808594 -5.519531 1.695312 -5.316406 1.695312 -5.085938 C 1.695312 -4.9375 1.742188 -4.804688 1.835938 -4.683594 C 1.929688 -4.5625 2.074219 -4.460938 2.273438 -4.378906 C 2.386719 -4.335938 2.722656 -4.242188 3.28125 -4.085938 C 4.089844 -3.871094 4.652344 -3.695312 4.972656 -3.558594 C 5.292969 -3.421875 5.542969 -3.21875 5.726562 -2.957031 C 5.90625 -2.695312 6 -2.371094 6 -1.980469 C 6 -1.601562 5.886719 -1.242188 5.664062 -0.90625 C 5.441406 -0.570312 5.121094 -0.308594 4.703125 -0.125 C 4.285156 0.0585938 3.8125 0.152344 3.28125 0.152344 C 2.40625 0.152344 1.738281 -0.03125 1.277344 -0.394531 C 0.820312 -0.757812 0.527344 -1.296875 0.398438 -2.011719 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-4">
+<path style="stroke:none;" d="M 3.351562 -1.023438 L 3.515625 -0.0117188 C 3.195312 0.0546875 2.90625 0.0898438 2.652344 0.0898438 C 2.238281 0.0898438 1.917969 0.0234375 1.6875 -0.109375 C 1.460938 -0.238281 1.300781 -0.410156 1.207031 -0.625 C 1.113281 -0.839844 1.066406 -1.289062 1.066406 -1.972656 L 1.066406 -5.851562 L 0.226562 -5.851562 L 0.226562 -6.742188 L 1.066406 -6.742188 L 1.066406 -8.410156 L 2.203125 -9.097656 L 2.203125 -6.742188 L 3.351562 -6.742188 L 3.351562 -5.851562 L 2.203125 -5.851562 L 2.203125 -1.910156 C 2.203125 -1.585938 2.222656 -1.375 2.261719 -1.28125 C 2.304688 -1.1875 2.367188 -1.113281 2.460938 -1.058594 C 2.550781 -1.003906 2.679688 -0.976562 2.851562 -0.976562 C 2.976562 -0.976562 3.144531 -0.992188 3.351562 -1.023438 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-5">
+<path style="stroke:none;" d="M 1.179688 0 L 1.179688 -1.300781 L 2.480469 -1.300781 L 2.480469 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-6">
+<path style="stroke:none;" d="M 1.003906 0 L 1.003906 -9.304688 L 4.511719 -9.304688 C 5.128906 -9.304688 5.601562 -9.277344 5.929688 -9.21875 C 6.386719 -9.140625 6.769531 -8.996094 7.078125 -8.78125 C 7.386719 -8.566406 7.636719 -8.269531 7.824219 -7.882812 C 8.011719 -7.5 8.105469 -7.074219 8.105469 -6.613281 C 8.105469 -5.824219 7.855469 -5.152344 7.351562 -4.605469 C 6.847656 -4.058594 5.9375 -3.78125 4.621094 -3.78125 L 2.234375 -3.78125 L 2.234375 0 Z M 2.234375 -4.882812 L 4.640625 -4.882812 C 5.4375 -4.882812 6 -5.03125 6.335938 -5.324219 C 6.667969 -5.621094 6.835938 -6.039062 6.835938 -6.578125 C 6.835938 -6.964844 6.738281 -7.296875 6.542969 -7.574219 C 6.34375 -7.851562 6.085938 -8.035156 5.765625 -8.125 C 5.558594 -8.179688 5.171875 -8.207031 4.613281 -8.207031 L 2.234375 -8.207031 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-7">
+<path style="stroke:none;" d="M 0.804688 2.597656 L 0.679688 1.523438 C 0.929688 1.589844 1.148438 1.625 1.332031 1.625 C 1.585938 1.625 1.789062 1.582031 1.941406 1.5 C 2.09375 1.414062 2.21875 1.296875 2.316406 1.140625 C 2.390625 1.027344 2.503906 0.746094 2.664062 0.292969 C 2.6875 0.230469 2.722656 0.136719 2.765625 0.0117188 L 0.210938 -6.742188 L 1.441406 -6.742188 L 2.84375 -2.835938 C 3.027344 -2.34375 3.1875 -1.820312 3.332031 -1.277344 C 3.464844 -1.800781 3.621094 -2.3125 3.800781 -2.8125 L 5.242188 -6.742188 L 6.386719 -6.742188 L 3.820312 0.113281 C 3.546875 0.855469 3.332031 1.363281 3.179688 1.644531 C 2.976562 2.019531 2.746094 2.296875 2.480469 2.472656 C 2.21875 2.648438 1.90625 2.734375 1.542969 2.734375 C 1.324219 2.734375 1.078125 2.6875 0.804688 2.597656 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-8">
+<path style="stroke:none;" d="M 0.855469 0 L 0.855469 -9.304688 L 2 -9.304688 L 2 -5.96875 C 2.53125 -6.585938 3.207031 -6.894531 4.019531 -6.894531 C 4.519531 -6.894531 4.953125 -6.796875 5.320312 -6.597656 C 5.6875 -6.402344 5.949219 -6.128906 6.109375 -5.78125 C 6.269531 -5.433594 6.347656 -4.933594 6.347656 -4.273438 L 6.347656 0 L 5.203125 0 L 5.203125 -4.273438 C 5.203125 -4.84375 5.082031 -5.257812 4.832031 -5.519531 C 4.585938 -5.78125 4.234375 -5.910156 3.78125 -5.910156 C 3.445312 -5.910156 3.125 -5.820312 2.828125 -5.644531 C 2.53125 -5.46875 2.316406 -5.234375 2.191406 -4.933594 C 2.0625 -4.632812 2 -4.21875 2 -3.6875 L 2 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-9">
+<path style="stroke:none;" d="M 0.855469 0 L 0.855469 -6.742188 L 1.886719 -6.742188 L 1.886719 -5.78125 C 2.382812 -6.523438 3.09375 -6.894531 4.03125 -6.894531 C 4.4375 -6.894531 4.808594 -6.820312 5.152344 -6.675781 C 5.492188 -6.527344 5.746094 -6.335938 5.914062 -6.101562 C 6.085938 -5.863281 6.203125 -5.582031 6.273438 -5.257812 C 6.3125 -5.046875 6.335938 -4.675781 6.335938 -4.144531 L 6.335938 0 L 5.191406 0 L 5.191406 -4.101562 C 5.191406 -4.566406 5.148438 -4.914062 5.058594 -5.144531 C 4.96875 -5.375 4.8125 -5.558594 4.585938 -5.695312 C 4.359375 -5.835938 4.09375 -5.902344 3.789062 -5.902344 C 3.304688 -5.902344 2.882812 -5.75 2.53125 -5.441406 C 2.175781 -5.132812 2 -4.546875 2 -3.679688 L 2 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-10">
+<path style="stroke:none;" d="M 0.855469 2.582031 L 0.855469 -6.742188 L 1.898438 -6.742188 L 1.898438 -5.867188 C 2.144531 -6.207031 2.421875 -6.464844 2.730469 -6.636719 C 3.039062 -6.808594 3.414062 -6.894531 3.851562 -6.894531 C 4.429688 -6.894531 4.9375 -6.746094 5.375 -6.449219 C 5.816406 -6.152344 6.148438 -5.734375 6.375 -5.195312 C 6.597656 -4.65625 6.710938 -4.066406 6.710938 -3.421875 C 6.710938 -2.730469 6.585938 -2.109375 6.339844 -1.558594 C 6.089844 -1.007812 5.730469 -0.582031 5.257812 -0.289062 C 4.785156 0.00390625 4.289062 0.152344 3.769531 0.152344 C 3.390625 0.152344 3.046875 0.0703125 2.746094 -0.0898438 C 2.441406 -0.25 2.195312 -0.453125 2 -0.699219 L 2 2.582031 Z M 1.890625 -3.332031 C 1.890625 -2.464844 2.066406 -1.824219 2.417969 -1.410156 C 2.769531 -0.996094 3.195312 -0.789062 3.695312 -0.789062 C 4.203125 -0.789062 4.636719 -1 5 -1.429688 C 5.359375 -1.859375 5.542969 -2.527344 5.542969 -3.429688 C 5.542969 -4.289062 5.363281 -4.929688 5.011719 -5.359375 C 4.65625 -5.785156 4.234375 -6 3.746094 -6 C 3.257812 -6 2.828125 -5.769531 2.453125 -5.316406 C 2.078125 -4.859375 1.890625 -4.199219 1.890625 -3.332031 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-11">
+<path style="stroke:none;" d="M 1.910156 0 L 0.851562 0 L 0.851562 -9.304688 L 1.992188 -9.304688 L 1.992188 -5.984375 C 2.476562 -6.589844 3.089844 -6.894531 3.839844 -6.894531 C 4.253906 -6.894531 4.648438 -6.808594 5.019531 -6.644531 C 5.390625 -6.476562 5.691406 -6.242188 5.933594 -5.9375 C 6.171875 -5.636719 6.359375 -5.269531 6.492188 -4.84375 C 6.628906 -4.414062 6.695312 -3.957031 6.695312 -3.472656 C 6.695312 -2.316406 6.410156 -1.425781 5.839844 -0.792969 C 5.269531 -0.164062 4.582031 0.152344 3.78125 0.152344 C 2.988281 0.152344 2.363281 -0.179688 1.910156 -0.84375 Z M 1.898438 -3.421875 C 1.898438 -2.613281 2.007812 -2.027344 2.226562 -1.667969 C 2.585938 -1.082031 3.074219 -0.789062 3.6875 -0.789062 C 4.1875 -0.789062 4.617188 -1.003906 4.984375 -1.4375 C 5.347656 -1.871094 5.527344 -2.519531 5.527344 -3.375 C 5.527344 -4.257812 5.355469 -4.90625 5.003906 -5.324219 C 4.65625 -5.742188 4.234375 -5.953125 3.738281 -5.953125 C 3.238281 -5.953125 2.808594 -5.738281 2.445312 -5.304688 C 2.082031 -4.871094 1.898438 -4.242188 1.898438 -3.421875 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-12">
+<path style="stroke:none;" d="M 0.863281 -7.992188 L 0.863281 -9.304688 L 2.007812 -9.304688 L 2.007812 -7.992188 Z M 0.863281 0 L 0.863281 -6.742188 L 2.007812 -6.742188 L 2.007812 0 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-13">
+<path style="stroke:none;" d="M 5.230469 0 L 5.230469 -0.851562 C 4.804688 -0.183594 4.175781 0.152344 3.34375 0.152344 C 2.808594 0.152344 2.3125 0.00390625 1.863281 -0.292969 C 1.414062 -0.589844 1.0625 -1 0.816406 -1.53125 C 0.570312 -2.0625 0.445312 -2.675781 0.445312 -3.363281 C 0.445312 -4.035156 0.554688 -4.648438 0.78125 -5.195312 C 1.003906 -5.742188 1.339844 -6.164062 1.789062 -6.457031 C 2.238281 -6.75 2.738281 -6.894531 3.292969 -6.894531 C 3.699219 -6.894531 4.0625 -6.808594 4.378906 -6.636719 C 4.695312 -6.464844 4.957031 -6.242188 5.15625 -5.96875 L 5.15625 -9.304688 L 6.289062 -9.304688 L 6.289062 0 Z M 1.617188 -3.363281 C 1.617188 -2.5 1.800781 -1.855469 2.164062 -1.429688 C 2.527344 -1 2.957031 -0.789062 3.453125 -0.789062 C 3.953125 -0.789062 4.375 -0.992188 4.726562 -1.398438 C 5.074219 -1.808594 5.25 -2.429688 5.25 -3.269531 C 5.25 -4.191406 5.070312 -4.867188 4.714844 -5.300781 C 4.359375 -5.730469 3.921875 -5.949219 3.402344 -5.949219 C 2.894531 -5.949219 2.46875 -5.742188 2.128906 -5.324219 C 1.789062 -4.910156 1.617188 -4.257812 1.617188 -3.363281 Z "/>
+</symbol>
+<symbol overflow="visible" id="glyph4-14">
+<path style="stroke:none;" d="M 4.84375 0 L 3.699219 0 L 3.699219 -7.28125 C 3.425781 -7.019531 3.066406 -6.757812 2.617188 -6.492188 C 2.171875 -6.230469 1.769531 -6.035156 1.414062 -5.902344 L 1.414062 -7.007812 C 2.054688 -7.308594 2.613281 -7.671875 3.089844 -8.101562 C 3.570312 -8.527344 3.90625 -8.941406 4.105469 -9.34375 L 4.84375 -9.34375 Z "/>
+</symbol>
+</g>
+<clipPath id="clip1">
+  <path d="M 94 19 L 96 19 L 96 215 L 94 215 Z "/>
+</clipPath>
+<clipPath id="clip2">
+  <path d="M 204 19 L 206 19 L 206 215 L 204 215 Z "/>
+</clipPath>
+<clipPath id="clip3">
+  <path d="M 314 19 L 316 19 L 316 215 L 314 215 Z "/>
+</clipPath>
+<clipPath id="clip4">
+  <path d="M 39 171 L 355 171 L 355 173 L 39 173 Z "/>
+</clipPath>
+<clipPath id="clip5">
+  <path d="M 39 107 L 355 107 L 355 109 L 39 109 Z "/>
+</clipPath>
+<clipPath id="clip6">
+  <path d="M 39 44 L 355 44 L 355 46 L 39 46 Z "/>
+</clipPath>
+</defs>
+<g id="surface18">
+<g clip-path="url(#clip1)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 95.160156 214.296875 L 95.160156 19 "/>
+</g>
+<g clip-path="url(#clip2)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 204.84375 214.296875 L 204.84375 19 "/>
+</g>
+<g clip-path="url(#clip3)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 314.53125 214.296875 L 314.53125 19 "/>
+</g>
+<g clip-path="url(#clip4)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 39 171.960938 L 355 171.960938 "/>
+</g>
+<g clip-path="url(#clip5)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 39 108.234375 L 355 108.234375 "/>
+</g>
+<g clip-path="url(#clip6)" clip-rule="nonzero">
+<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:0.5;stroke-dasharray:1,2;stroke-miterlimit:3.25;" d="M 39 44.511719 L 355 44.511719 "/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 55.070312 201 C 55.070312 200.183594 54.75 199.402344 54.171875 198.828125 C 53.597656 198.25 52.816406 197.929688 52 197.929688 C 51.183594 197.929688 50.402344 198.25 49.828125 198.828125 C 49.25 199.402344 48.929688 200.183594 48.929688 201 C 48.929688 201.816406 49.25 202.597656 49.828125 203.171875 C 50.402344 203.75 51.183594 204.070312 52 204.070312 C 52.816406 204.070312 53.597656 203.75 54.171875 203.171875 C 54.75 202.597656 55.070312 201.816406 55.070312 201 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 88.070312 176 C 88.070312 175.183594 87.75 174.402344 87.171875 173.828125 C 86.597656 173.25 85.816406 172.929688 85 172.929688 C 84.183594 172.929688 83.402344 173.25 82.828125 173.828125 C 82.25 174.402344 81.929688 175.183594 81.929688 176 C 81.929688 176.816406 82.25 177.597656 82.828125 178.171875 C 83.402344 178.75 84.183594 179.070312 85 179.070312 C 85.816406 179.070312 86.597656 178.75 87.171875 178.171875 C 87.75 177.597656 88.070312 176.816406 88.070312 176 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 121.070312 160 C 121.070312 159.183594 120.75 158.402344 120.171875 157.828125 C 119.597656 157.25 118.816406 156.929688 118 156.929688 C 117.183594 156.929688 116.402344 157.25 115.828125 157.828125 C 115.25 158.402344 114.929688 159.183594 114.929688 160 C 114.929688 160.816406 115.25 161.597656 115.828125 162.171875 C 116.402344 162.75 117.183594 163.070312 118 163.070312 C 118.816406 163.070312 119.597656 162.75 120.171875 162.171875 C 120.75 161.597656 121.070312 160.816406 121.070312 160 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 154.070312 142 C 154.070312 141.183594 153.75 140.402344 153.171875 139.828125 C 152.597656 139.25 151.816406 138.929688 151 138.929688 C 150.183594 138.929688 149.402344 139.25 148.828125 139.828125 C 148.25 140.402344 147.929688 141.183594 147.929688 142 C 147.929688 142.816406 148.25 143.597656 148.828125 144.171875 C 149.402344 144.75 150.183594 145.070312 151 145.070312 C 151.816406 145.070312 152.597656 144.75 153.171875 144.171875 C 153.75 143.597656 154.070312 142.816406 154.070312 142 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 187.070312 124 C 187.070312 123.183594 186.75 122.402344 186.171875 121.828125 C 185.597656 121.25 184.816406 120.929688 184 120.929688 C 183.183594 120.929688 182.402344 121.25 181.828125 121.828125 C 181.25 122.402344 180.929688 123.183594 180.929688 124 C 180.929688 124.816406 181.25 125.597656 181.828125 126.171875 C 182.402344 126.75 183.183594 127.070312 184 127.070312 C 184.816406 127.070312 185.597656 126.75 186.171875 126.171875 C 186.75 125.597656 187.070312 124.816406 187.070312 124 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 220.070312 105 C 220.070312 104.183594 219.75 103.402344 219.171875 102.828125 C 218.597656 102.25 217.816406 101.929688 217 101.929688 C 216.183594 101.929688 215.402344 102.25 214.828125 102.828125 C 214.25 103.402344 213.929688 104.183594 213.929688 105 C 213.929688 105.816406 214.25 106.597656 214.828125 107.171875 C 215.402344 107.75 216.183594 108.070312 217 108.070312 C 217.816406 108.070312 218.597656 107.75 219.171875 107.171875 C 219.75 106.597656 220.070312 105.816406 220.070312 105 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 253.070312 86 C 253.070312 85.183594 252.75 84.402344 252.171875 83.828125 C 251.597656 83.25 250.816406 82.929688 250 82.929688 C 249.183594 82.929688 248.402344 83.25 247.828125 83.828125 C 247.25 84.402344 246.929688 85.183594 246.929688 86 C 246.929688 86.816406 247.25 87.597656 247.828125 88.171875 C 248.402344 88.75 249.183594 89.070312 250 89.070312 C 250.816406 89.070312 251.597656 88.75 252.171875 88.171875 C 252.75 87.597656 253.070312 86.816406 253.070312 86 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 286.070312 67 C 286.070312 66.183594 285.75 65.402344 285.171875 64.828125 C 284.597656 64.25 283.816406 63.929688 283 63.929688 C 282.183594 63.929688 281.402344 64.25 280.828125 64.828125 C 280.25 65.402344 279.929688 66.183594 279.929688 67 C 279.929688 67.816406 280.25 68.597656 280.828125 69.171875 C 281.402344 69.75 282.183594 70.070312 283 70.070312 C 283.816406 70.070312 284.597656 69.75 285.171875 69.171875 C 285.75 68.597656 286.070312 67.816406 286.070312 67 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 319.070312 48 C 319.070312 47.183594 318.75 46.402344 318.171875 45.828125 C 317.597656 45.25 316.816406 44.929688 316 44.929688 C 315.183594 44.929688 314.402344 45.25 313.828125 45.828125 C 313.25 46.402344 312.929688 47.183594 312.929688 48 C 312.929688 48.816406 313.25 49.597656 313.828125 50.171875 C 314.402344 50.75 315.183594 51.070312 316 51.070312 C 316.816406 51.070312 317.597656 50.75 318.171875 50.171875 C 318.75 49.597656 319.070312 48.816406 319.070312 48 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 352.070312 29 C 352.070312 28.183594 351.75 27.402344 351.171875 26.828125 C 350.597656 26.25 349.816406 25.929688 349 25.929688 C 348.183594 25.929688 347.402344 26.25 346.828125 26.828125 C 346.25 27.402344 345.929688 28.183594 345.929688 29 C 345.929688 29.816406 346.25 30.597656 346.828125 31.171875 C 347.402344 31.75 348.183594 32.070312 349 32.070312 C 349.816406 32.070312 350.597656 31.75 351.171875 31.171875 C 351.75 30.597656 352.070312 29.816406 352.070312 29 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 55.070312 191 C 55.070312 190.183594 54.75 189.402344 54.171875 188.828125 C 53.597656 188.25 52.816406 187.929688 52 187.929688 C 51.183594 187.929688 50.402344 188.25 49.828125 188.828125 C 49.25 189.402344 48.929688 190.183594 48.929688 191 C 48.929688 191.816406 49.25 192.597656 49.828125 193.171875 C 50.402344 193.75 51.183594 194.070312 52 194.070312 C 52.816406 194.070312 53.597656 193.75 54.171875 193.171875 C 54.75 192.597656 55.070312 191.816406 55.070312 191 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 88.070312 181 C 88.070312 180.183594 87.75 179.402344 87.171875 178.828125 C 86.597656 178.25 85.816406 177.929688 85 177.929688 C 84.183594 177.929688 83.402344 178.25 82.828125 178.828125 C 82.25 179.402344 81.929688 180.183594 81.929688 181 C 81.929688 181.816406 82.25 182.597656 82.828125 183.171875 C 83.402344 183.75 84.183594 184.070312 85 184.070312 C 85.816406 184.070312 86.597656 183.75 87.171875 183.171875 C 87.75 182.597656 88.070312 181.816406 88.070312 181 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 121.070312 170 C 121.070312 169.183594 120.75 168.402344 120.171875 167.828125 C 119.597656 167.25 118.816406 166.929688 118 166.929688 C 117.183594 166.929688 116.402344 167.25 115.828125 167.828125 C 115.25 168.402344 114.929688 169.183594 114.929688 170 C 114.929688 170.816406 115.25 171.597656 115.828125 172.171875 C 116.402344 172.75 117.183594 173.070312 118 173.070312 C 118.816406 173.070312 119.597656 172.75 120.171875 172.171875 C 120.75 171.597656 121.070312 170.816406 121.070312 170 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 154.070312 157 C 154.070312 156.183594 153.75 155.402344 153.171875 154.828125 C 152.597656 154.25 151.816406 153.929688 151 153.929688 C 150.183594 153.929688 149.402344 154.25 148.828125 154.828125 C 148.25 155.402344 147.929688 156.183594 147.929688 157 C 147.929688 157.816406 148.25 158.597656 148.828125 159.171875 C 149.402344 159.75 150.183594 160.070312 151 160.070312 C 151.816406 160.070312 152.597656 159.75 153.171875 159.171875 C 153.75 158.597656 154.070312 157.816406 154.070312 157 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 187.070312 142 C 187.070312 141.183594 186.75 140.402344 186.171875 139.828125 C 185.597656 139.25 184.816406 138.929688 184 138.929688 C 183.183594 138.929688 182.402344 139.25 181.828125 139.828125 C 181.25 140.402344 180.929688 141.183594 180.929688 142 C 180.929688 142.816406 181.25 143.597656 181.828125 144.171875 C 182.402344 144.75 183.183594 145.070312 184 145.070312 C 184.816406 145.070312 185.597656 144.75 186.171875 144.171875 C 186.75 143.597656 187.070312 142.816406 187.070312 142 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 220.070312 125 C 220.070312 124.183594 219.75 123.402344 219.171875 122.828125 C 218.597656 122.25 217.816406 121.929688 217 121.929688 C 216.183594 121.929688 215.402344 122.25 214.828125 122.828125 C 214.25 123.402344 213.929688 124.183594 213.929688 125 C 213.929688 125.816406 214.25 126.597656 214.828125 127.171875 C 215.402344 127.75 216.183594 128.070312 217 128.070312 C 217.816406 128.070312 218.597656 127.75 219.171875 127.171875 C 219.75 126.597656 220.070312 125.816406 220.070312 125 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 253.070312 107 C 253.070312 106.183594 252.75 105.402344 252.171875 104.828125 C 251.597656 104.25 250.816406 103.929688 250 103.929688 C 249.183594 103.929688 248.402344 104.25 247.828125 104.828125 C 247.25 105.402344 246.929688 106.183594 246.929688 107 C 246.929688 107.816406 247.25 108.597656 247.828125 109.171875 C 248.402344 109.75 249.183594 110.070312 250 110.070312 C 250.816406 110.070312 251.597656 109.75 252.171875 109.171875 C 252.75 108.597656 253.070312 107.816406 253.070312 107 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 286.070312 88 C 286.070312 87.183594 285.75 86.402344 285.171875 85.828125 C 284.597656 85.25 283.816406 84.929688 283 84.929688 C 282.183594 84.929688 281.402344 85.25 280.828125 85.828125 C 280.25 86.402344 279.929688 87.183594 279.929688 88 C 279.929688 88.816406 280.25 89.597656 280.828125 90.171875 C 281.402344 90.75 282.183594 91.070312 283 91.070312 C 283.816406 91.070312 284.597656 90.75 285.171875 90.171875 C 285.75 89.597656 286.070312 88.816406 286.070312 88 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 319.070312 69 C 319.070312 68.183594 318.75 67.402344 318.171875 66.828125 C 317.597656 66.25 316.816406 65.929688 316 65.929688 C 315.183594 65.929688 314.402344 66.25 313.828125 66.828125 C 313.25 67.402344 312.929688 68.183594 312.929688 69 C 312.929688 69.816406 313.25 70.597656 313.828125 71.171875 C 314.402344 71.75 315.183594 72.070312 316 72.070312 C 316.816406 72.070312 317.597656 71.75 318.171875 71.171875 C 318.75 70.597656 319.070312 69.816406 319.070312 69 Z "/>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 352.070312 50 C 352.070312 49.183594 351.75 48.402344 351.171875 47.828125 C 350.597656 47.25 349.816406 46.929688 349 46.929688 C 348.183594 46.929688 347.402344 47.25 346.828125 47.828125 C 346.25 48.402344 345.929688 49.183594 345.929688 50 C 345.929688 50.816406 346.25 51.597656 346.828125 52.171875 C 347.402344 52.75 348.183594 53.070312 349 53.070312 C 349.816406 53.070312 350.597656 52.75 351.171875 52.171875 C 351.75 51.597656 352.070312 50.816406 352.070312 50 Z "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 214.296875 L 39 214.296875 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 214.296875 L 39 19 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 19 L 355 19 "/>
+<path style="fill:none;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 19 L 355 214.296875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 95.160156 214.296875 L 95.160156 211.140625 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="89.659467" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="95.220991" y="226.29874"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 204.84375 214.296875 L 204.84375 211.140625 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="196.345465" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="201.906989" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="207.468512" y="226.29874"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 314.53125 214.296875 L 314.53125 211.140625 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="303.531463" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="309.092987" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="314.65451" y="226.29874"/>
+  <use xlink:href="#glyph0-2" x="320.216034" y="226.29874"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 51.511719 214.296875 L 51.511719 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 62.140625 214.296875 L 62.140625 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 70.824219 214.296875 L 70.824219 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 78.167969 214.296875 L 78.167969 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 84.53125 214.296875 L 84.53125 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 90.140625 214.296875 L 90.140625 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 128.179688 214.296875 L 128.179688 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 147.492188 214.296875 L 147.492188 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 161.195312 214.296875 L 161.195312 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 171.828125 214.296875 L 171.828125 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 180.511719 214.296875 L 180.511719 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 187.855469 214.296875 L 187.855469 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 194.214844 214.296875 L 194.214844 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 199.828125 214.296875 L 199.828125 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 237.863281 214.296875 L 237.863281 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 257.179688 214.296875 L 257.179688 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 270.882812 214.296875 L 270.882812 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 281.511719 214.296875 L 281.511719 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 290.199219 214.296875 L 290.199219 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 297.539062 214.296875 L 297.539062 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 303.902344 214.296875 L 303.902344 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 309.511719 214.296875 L 309.511719 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 347.550781 214.296875 L 347.550781 212.71875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 171.960938 L 42.160156 171.960938 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="19" y="176.961688"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="176.961688"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph1-1" x="30" y="172.81325"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 108.234375 L 42.160156 108.234375 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="19" y="113.23603"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="113.23603"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph1-2" x="30" y="109.087593"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 44.511719 L 42.160156 44.511719 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-1" x="19" y="49.510373"/>
+  <use xlink:href="#glyph0-2" x="24.561523" y="49.510373"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph1-3" x="30" y="45.361936"/>
+</g>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 205.28125 L 40.578125 205.28125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 197.320312 L 40.578125 197.320312 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 191.144531 L 40.578125 191.144531 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 186.097656 L 40.578125 186.097656 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 181.832031 L 40.578125 181.832031 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 178.136719 L 40.578125 178.136719 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 174.878906 L 40.578125 174.878906 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 152.777344 L 40.578125 152.777344 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 141.558594 L 40.578125 141.558594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 133.59375 L 40.578125 133.59375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 127.417969 L 40.578125 127.417969 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 122.375 L 40.578125 122.375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 118.105469 L 40.578125 118.105469 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 114.410156 L 40.578125 114.410156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 111.152344 L 40.578125 111.152344 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 89.050781 L 40.578125 89.050781 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 77.832031 L 40.578125 77.832031 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 69.871094 L 40.578125 69.871094 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 63.695312 L 40.578125 63.695312 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 58.648438 L 40.578125 58.648438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 54.382812 L 40.578125 54.382812 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 50.6875 L 40.578125 50.6875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 47.425781 L 40.578125 47.425781 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 39 25.328125 L 40.578125 25.328125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 95.160156 19 L 95.160156 22.160156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 204.84375 19 L 204.84375 22.160156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 314.53125 19 L 314.53125 22.160156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 51.511719 19 L 51.511719 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 62.140625 19 L 62.140625 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 70.824219 19 L 70.824219 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 78.167969 19 L 78.167969 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 84.53125 19 L 84.53125 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 90.140625 19 L 90.140625 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 128.179688 19 L 128.179688 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 147.492188 19 L 147.492188 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 161.195312 19 L 161.195312 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 171.828125 19 L 171.828125 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 180.511719 19 L 180.511719 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 187.855469 19 L 187.855469 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 194.214844 19 L 194.214844 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 199.828125 19 L 199.828125 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 237.863281 19 L 237.863281 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 257.179688 19 L 257.179688 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 270.882812 19 L 270.882812 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 281.511719 19 L 281.511719 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 290.199219 19 L 290.199219 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 297.539062 19 L 297.539062 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 303.902344 19 L 303.902344 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 309.511719 19 L 309.511719 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 347.550781 19 L 347.550781 20.578125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 171.960938 L 351.839844 171.960938 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 108.234375 L 351.839844 108.234375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 44.511719 L 351.839844 44.511719 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 205.28125 L 353.421875 205.28125 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 197.320312 L 353.421875 197.320312 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 191.144531 L 353.421875 191.144531 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 186.097656 L 353.421875 186.097656 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 181.832031 L 353.421875 181.832031 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 178.136719 L 353.421875 178.136719 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 174.878906 L 353.421875 174.878906 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 152.777344 L 353.421875 152.777344 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 141.558594 L 353.421875 141.558594 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 133.59375 L 353.421875 133.59375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 127.417969 L 353.421875 127.417969 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 122.375 L 353.421875 122.375 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 118.105469 L 353.421875 118.105469 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 114.410156 L 353.421875 114.410156 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 111.152344 L 353.421875 111.152344 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 89.050781 L 353.421875 89.050781 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 77.832031 L 353.421875 77.832031 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 69.871094 L 353.421875 69.871094 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 63.695312 L 353.421875 63.695312 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 58.648438 L 353.421875 58.648438 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 54.382812 L 353.421875 54.382812 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 50.6875 L 353.421875 50.6875 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 47.425781 L 353.421875 47.425781 "/>
+<path style="fill:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(39.99939%,39.99939%,39.99939%);stroke-opacity:1;stroke-miterlimit:3.25;" d="M 355 25.328125 L 353.421875 25.328125 "/>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph0-3" x="175.5" y="245.29874"/>
+  <use xlink:href="#glyph0-4" x="181.608398" y="245.29874"/>
+  <use xlink:href="#glyph0-5" x="187.169922" y="245.29874"/>
+  <use xlink:href="#glyph0-6" x="192.731445" y="245.29874"/>
+  <use xlink:href="#glyph0-7" x="197.731445" y="245.29874"/>
+  <use xlink:href="#glyph0-8" x="200.509766" y="245.29874"/>
+  <use xlink:href="#glyph0-9" x="202.731445" y="245.29874"/>
+  <use xlink:href="#glyph0-5" x="208.292969" y="245.29874"/>
+  <use xlink:href="#glyph0-10" x="213.854492" y="245.29874"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph2-1" x="9" y="129.14937"/>
+  <use xlink:href="#glyph2-2" x="9" y="122.479448"/>
+  <use xlink:href="#glyph2-3" x="9" y="117.479448"/>
+  <use xlink:href="#glyph2-4" x="9" y="114.701128"/>
+  <use xlink:href="#glyph2-5" x="9" y="109.139605"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph3-1" x="148" y="12"/>
+  <use xlink:href="#glyph3-2" x="155.330078" y="12"/>
+  <use xlink:href="#glyph3-3" x="157.996094" y="12"/>
+  <use xlink:href="#glyph3-4" x="160.662109" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph3-5" x="170" y="12"/>
+  <use xlink:href="#glyph3-2" x="176" y="12"/>
+  <use xlink:href="#glyph3-6" x="178.666016" y="12"/>
+  <use xlink:href="#glyph3-4" x="184.666016" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph3-7" x="194" y="12"/>
+  <use xlink:href="#glyph3-8" x="200.673828" y="12"/>
+</g>
+<g style="fill:rgb(39.99939%,39.99939%,39.99939%);fill-opacity:1;">
+  <use xlink:href="#glyph3-9" x="207" y="12"/>
+  <use xlink:href="#glyph3-7" x="216.996094" y="12"/>
+  <use xlink:href="#glyph3-10" x="223.669922" y="12"/>
+  <use xlink:href="#glyph3-11" x="230.34375" y="12"/>
+  <use xlink:href="#glyph3-3" x="237.017578" y="12"/>
+  <use xlink:href="#glyph3-4" x="239.683594" y="12"/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(36.84082%,50.67749%,70.979309%);fill-opacity:1;" d="M 382.148438 113 C 382.148438 112.164062 381.816406 111.363281 381.226562 110.773438 C 380.636719 110.183594 379.835938 109.851562 379 109.851562 C 378.164062 109.851562 377.363281 110.183594 376.773438 110.773438 C 376.183594 111.363281 375.851562 112.164062 375.851562 113 C 375.851562 113.835938 376.183594 114.636719 376.773438 115.226562 C 377.363281 115.816406 378.164062 116.148438 379 116.148438 C 379.835938 116.148438 380.636719 115.816406 381.226562 115.226562 C 381.816406 114.636719 382.148438 113.835938 382.148438 113 Z "/>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph4-1" x="391" y="117.28418"/>
+  <use xlink:href="#glyph4-2" x="399.670898" y="117.28418"/>
+  <use xlink:href="#glyph4-2" x="406.901367" y="117.28418"/>
+  <use xlink:href="#glyph4-3" x="414.131836" y="117.28418"/>
+  <use xlink:href="#glyph4-4" x="420.631836" y="117.28418"/>
+</g>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph4-5" x="424" y="117.28418"/>
+</g>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph4-6" x="428" y="117.28418"/>
+  <use xlink:href="#glyph4-7" x="436.670898" y="117.28418"/>
+  <use xlink:href="#glyph4-4" x="443.170898" y="117.28418"/>
+  <use xlink:href="#glyph4-8" x="446.783203" y="117.28418"/>
+  <use xlink:href="#glyph4-2" x="454.013672" y="117.28418"/>
+  <use xlink:href="#glyph4-9" x="461.244141" y="117.28418"/>
+</g>
+<path style=" stroke:none;fill-rule:evenodd;fill:rgb(88.070679%,61.103821%,14.204407%);fill-opacity:1;" d="M 382.148438 136 C 382.148438 135.164062 381.816406 134.363281 381.226562 133.773438 C 380.636719 133.183594 379.835938 132.851562 379 132.851562 C 378.164062 132.851562 377.363281 133.183594 376.773438 133.773438 C 376.183594 134.363281 375.851562 135.164062 375.851562 136 C 375.851562 136.835938 376.183594 137.636719 376.773438 138.226562 C 377.363281 138.816406 378.164062 139.148438 379 139.148438 C 379.835938 139.148438 380.636719 138.816406 381.226562 138.226562 C 381.816406 137.636719 382.148438 136.835938 382.148438 136 Z "/>
+<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
+  <use xlink:href="#glyph4-10" x="391" y="140.28418"/>
+  <use xlink:href="#glyph4-7" x="398.230469" y="140.28418"/>
+  <use xlink:href="#glyph4-11" x="404.730469" y="140.28418"/>
+  <use xlink:href="#glyph4-12" x="411.960938" y="140.28418"/>
+  <use xlink:href="#glyph4-9" x="414.849609" y="140.28418"/>
+  <use xlink:href="#glyph4-13" x="422.080078" y="140.28418"/>
+  <use xlink:href="#glyph4-14" x="429.310547" y="140.28418"/>
+  <use xlink:href="#glyph4-14" x="436.541016" y="140.28418"/>
+</g>
+</g>
+</svg>
diff --git a/pybind11/docs/reference.rst b/pybind11/docs/reference.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a9fbe60015ca466e71a37f6f9cd9866bfa7adfe5
--- /dev/null
+++ b/pybind11/docs/reference.rst
@@ -0,0 +1,117 @@
+.. _reference:
+
+.. warning::
+
+    Please be advised that the reference documentation discussing pybind11
+    internals is currently incomplete. Please refer to the previous sections
+    and the pybind11 header files for the nitty gritty details.
+
+Reference
+#########
+
+.. _macros:
+
+Macros
+======
+
+.. doxygendefine:: PYBIND11_MODULE
+
+.. _core_types:
+
+Convenience classes for arbitrary Python types
+==============================================
+
+Common member functions
+-----------------------
+
+.. doxygenclass:: object_api
+    :members:
+
+Without reference counting
+--------------------------
+
+.. doxygenclass:: handle
+    :members:
+
+With reference counting
+-----------------------
+
+.. doxygenclass:: object
+    :members:
+
+.. doxygenfunction:: reinterpret_borrow
+
+.. doxygenfunction:: reinterpret_steal
+
+Convenience classes for specific Python types
+=============================================
+
+.. doxygenclass:: module
+    :members:
+
+.. doxygengroup:: pytypes
+    :members:
+
+.. _extras:
+
+Passing extra arguments to ``def`` or ``class_``
+================================================
+
+.. doxygengroup:: annotations
+    :members:
+
+Embedding the interpreter
+=========================
+
+.. doxygendefine:: PYBIND11_EMBEDDED_MODULE
+
+.. doxygenfunction:: initialize_interpreter
+
+.. doxygenfunction:: finalize_interpreter
+
+.. doxygenclass:: scoped_interpreter
+
+Redirecting C++ streams
+=======================
+
+.. doxygenclass:: scoped_ostream_redirect
+
+.. doxygenclass:: scoped_estream_redirect
+
+.. doxygenfunction:: add_ostream_redirect
+
+Python built-in functions
+=========================
+
+.. doxygengroup:: python_builtins
+    :members:
+
+Inheritance
+===========
+
+See :doc:`/classes` and :doc:`/advanced/classes` for more detail.
+
+.. doxygendefine:: PYBIND11_OVERLOAD
+
+.. doxygendefine:: PYBIND11_OVERLOAD_PURE
+
+.. doxygendefine:: PYBIND11_OVERLOAD_NAME
+
+.. doxygendefine:: PYBIND11_OVERLOAD_PURE_NAME
+
+.. doxygenfunction:: get_overload
+
+Exceptions
+==========
+
+.. doxygenclass:: error_already_set
+    :members:
+
+.. doxygenclass:: builtin_exception
+    :members:
+
+
+Literals
+========
+
+.. doxygennamespace:: literals
diff --git a/pybind11/docs/release.rst b/pybind11/docs/release.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9846f971a6ff88e40ceeaf16e14227ba3b6ae63c
--- /dev/null
+++ b/pybind11/docs/release.rst
@@ -0,0 +1,21 @@
+To release a new version of pybind11:
+
+- Update the version number and push to pypi
+    - Update ``pybind11/_version.py`` (set release version, remove 'dev').
+    - Update ``PYBIND11_VERSION_MAJOR`` etc. in ``include/pybind11/detail/common.h``.
+    - Ensure that all the information in ``setup.py`` is up-to-date.
+    - Update version in ``docs/conf.py``.
+    - Tag release date in ``docs/changelog.rst``.
+    - ``git add`` and ``git commit``.
+    - if new minor version: ``git checkout -b vX.Y``, ``git push -u origin vX.Y``
+    - ``git tag -a vX.Y.Z -m 'vX.Y.Z release'``.
+    - ``git push``
+    - ``git push --tags``.
+    - ``python setup.py sdist upload``.
+    - ``python setup.py bdist_wheel upload``.
+- Get back to work
+    - Update ``_version.py`` (add 'dev' and increment minor).
+    - Update version in ``docs/conf.py``
+    - Update version macros in ``include/pybind11/common.h``
+    - ``git add`` and ``git commit``.
+      ``git push``
diff --git a/pybind11/docs/requirements.txt b/pybind11/docs/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f4c3dc2e0b658f610497d65b039025ac917cfb5b
--- /dev/null
+++ b/pybind11/docs/requirements.txt
@@ -0,0 +1,5 @@
+breathe==4.20.0
+commonmark==0.9.1
+recommonmark==0.6.0
+sphinx==3.2.1
+sphinx_rtd_theme==0.5.0
diff --git a/pybind11/docs/upgrade.rst b/pybind11/docs/upgrade.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7c3f1c32808f3c07236b3e925153a6ac1c317f2e
--- /dev/null
+++ b/pybind11/docs/upgrade.rst
@@ -0,0 +1,459 @@
+Upgrade guide
+#############
+
+This is a companion guide to the :doc:`changelog`. While the changelog briefly
+lists all of the new features, improvements and bug fixes, this upgrade guide
+focuses only the subset which directly impacts your experience when upgrading
+to a new version. But it goes into more detail. This includes things like
+deprecated APIs and their replacements, build system changes, general code
+modernization and other useful information.
+
+.. _upgrade-guide-2.6:
+
+v2.6
+====
+
+An error is now thrown when ``__init__`` is forgotten on subclasses. This was
+incorrect before, but was not checked. Add a call to ``__init__`` if it is
+missing.
+
+If ``__eq__`` defined but not ``__hash__``, ``__hash__`` is now set to
+``None``, as in normal CPython. You should add ``__hash__`` if you intended the
+class to be hashable, possibly using the new ``py::hash`` shortcut.
+
+CMake support:
+--------------
+
+The minimum required version of CMake is now 3.4.  Several details of the CMake
+support have been deprecated; warnings will be shown if you need to change
+something. The changes are:
+
+* ``PYBIND11_CPP_STANDARD=<platform-flag>`` is deprecated, please use
+  ``CMAKE_CXX_STANDARD=<number>`` instead, or any other valid CMake CXX or CUDA
+  standard selection method, like ``target_compile_features``.
+
+* If you do not request a standard, PyBind11 targets will compile with the
+  compiler default, but not less than C++11, instead of forcing C++14 always.
+  If you depend on the old behavior, please use ``set(CMAKE_CXX_STANDARD 14)``
+  instead.
+
+* Direct ``pybind11::module`` usage should always be accompanied by at least
+  ``set(CMAKE_CXX_VISIBILITY_PRESET hidden)`` or similar - it used to try to
+  manually force this compiler flag (but not correctly on all compilers or with
+  CUDA).
+
+* ``pybind11_add_module``'s ``SYSTEM`` argument is deprecated and does nothing;
+  linking now behaves like other imported libraries consistently in both
+  config and submodule mode, and behaves like a ``SYSTEM`` library by
+  default.
+
+* If ``PYTHON_EXECUTABLE`` is not set, virtual environments (``venv``,
+  ``virtualenv``, and ``conda``) are prioritized over the standard search
+  (similar to the new FindPython mode).
+
+In addition, the following changes may be of interest:
+
+* ``CMAKE_INTERPROCEDURAL_OPTIMIZATION`` will be respected by
+  ``pybind11_add_module`` if set instead of linking to ``pybind11::lto`` or
+  ``pybind11::thin_lto``.
+
+* Using ``find_package(Python COMPONENTS Interpreter Development)`` before
+  pybind11 will cause pybind11 to use the new Python mechanisms instead of its
+  own custom search, based on a patched version of classic
+  FindPythonInterp/FindPythonLibs. In the future, this may become the default.
+
+
+
+v2.2
+====
+
+Deprecation of the ``PYBIND11_PLUGIN`` macro
+--------------------------------------------
+
+``PYBIND11_MODULE`` is now the preferred way to create module entry points.
+The old macro emits a compile-time deprecation warning.
+
+.. code-block:: cpp
+
+    // old
+    PYBIND11_PLUGIN(example) {
+        py::module m("example", "documentation string");
+
+        m.def("add", [](int a, int b) { return a + b; });
+
+        return m.ptr();
+    }
+
+    // new
+    PYBIND11_MODULE(example, m) {
+        m.doc() = "documentation string"; // optional
+
+        m.def("add", [](int a, int b) { return a + b; });
+    }
+
+
+New API for defining custom constructors and pickling functions
+---------------------------------------------------------------
+
+The old placement-new custom constructors have been deprecated. The new approach
+uses ``py::init()`` and factory functions to greatly improve type safety.
+
+Placement-new can be called accidentally with an incompatible type (without any
+compiler errors or warnings), or it can initialize the same object multiple times
+if not careful with the Python-side ``__init__`` calls. The new-style custom
+constructors prevent such mistakes. See :ref:`custom_constructors` for details.
+
+.. code-block:: cpp
+
+    // old -- deprecated (runtime warning shown only in debug mode)
+    py::class<Foo>(m, "Foo")
+        .def("__init__", [](Foo &self, ...) {
+            new (&self) Foo(...); // uses placement-new
+        });
+
+    // new
+    py::class<Foo>(m, "Foo")
+        .def(py::init([](...) { // Note: no `self` argument
+            return new Foo(...); // return by raw pointer
+            // or: return std::make_unique<Foo>(...); // return by holder
+            // or: return Foo(...); // return by value (move constructor)
+        }));
+
+Mirroring the custom constructor changes, ``py::pickle()`` is now the preferred
+way to get and set object state. See :ref:`pickling` for details.
+
+.. code-block:: cpp
+
+    // old -- deprecated (runtime warning shown only in debug mode)
+    py::class<Foo>(m, "Foo")
+        ...
+        .def("__getstate__", [](const Foo &self) {
+            return py::make_tuple(self.value1(), self.value2(), ...);
+        })
+        .def("__setstate__", [](Foo &self, py::tuple t) {
+            new (&self) Foo(t[0].cast<std::string>(), ...);
+        });
+
+    // new
+    py::class<Foo>(m, "Foo")
+        ...
+        .def(py::pickle(
+            [](const Foo &self) { // __getstate__
+                return py::make_tuple(f.value1(), f.value2(), ...); // unchanged
+            },
+            [](py::tuple t) { // __setstate__, note: no `self` argument
+                return new Foo(t[0].cast<std::string>(), ...);
+                // or: return std::make_unique<Foo>(...); // return by holder
+                // or: return Foo(...); // return by value (move constructor)
+            }
+        ));
+
+For both the constructors and pickling, warnings are shown at module
+initialization time (on import, not when the functions are called).
+They're only visible when compiled in debug mode. Sample warning:
+
+.. code-block:: none
+
+    pybind11-bound class 'mymodule.Foo' is using an old-style placement-new '__init__'
+    which has been deprecated. See the upgrade guide in pybind11's docs.
+
+
+Stricter enforcement of hidden symbol visibility for pybind11 modules
+---------------------------------------------------------------------
+
+pybind11 now tries to actively enforce hidden symbol visibility for modules.
+If you're using either one of pybind11's :doc:`CMake or Python build systems
+<compiling>` (the two example repositories) and you haven't been exporting any
+symbols, there's nothing to be concerned about. All the changes have been done
+transparently in the background. If you were building manually or relied on
+specific default visibility, read on.
+
+Setting default symbol visibility to *hidden* has always been recommended for
+pybind11 (see :ref:`faq:symhidden`). On Linux and macOS, hidden symbol
+visibility (in conjunction with the ``strip`` utility) yields much smaller
+module binaries. `CPython's extension docs`_ also recommend hiding symbols
+by default, with the goal of avoiding symbol name clashes between modules.
+Starting with v2.2, pybind11 enforces this more strictly: (1) by declaring
+all symbols inside the ``pybind11`` namespace as hidden and (2) by including
+the ``-fvisibility=hidden`` flag on Linux and macOS (only for extension
+modules, not for embedding the interpreter).
+
+.. _CPython's extension docs: https://docs.python.org/3/extending/extending.html#providing-a-c-api-for-an-extension-module
+
+The namespace-scope hidden visibility is done automatically in pybind11's
+headers and it's generally transparent to users. It ensures that:
+
+* Modules compiled with different pybind11 versions don't clash with each other.
+
+* Some new features, like ``py::module_local`` bindings, can work as intended.
+
+The ``-fvisibility=hidden`` flag applies the same visibility to user bindings
+outside of the ``pybind11`` namespace. It's now set automatic by pybind11's
+CMake and Python build systems, but this needs to be done manually by users
+of other build systems. Adding this flag:
+
+* Minimizes the chances of symbol conflicts between modules. E.g. if two
+  unrelated modules were statically linked to different (ABI-incompatible)
+  versions of the same third-party library, a symbol clash would be likely
+  (and would end with unpredictable results).
+
+* Produces smaller binaries on Linux and macOS, as pointed out previously.
+
+Within pybind11's CMake build system, ``pybind11_add_module`` has always been
+setting the ``-fvisibility=hidden`` flag in release mode. From now on, it's
+being applied unconditionally, even in debug mode and it can no longer be opted
+out of with the ``NO_EXTRAS`` option. The ``pybind11::module`` target now also
+adds this flag to it's interface. The ``pybind11::embed`` target is unchanged.
+
+The most significant change here is for the ``pybind11::module`` target. If you
+were previously relying on default visibility, i.e. if your Python module was
+doubling as a shared library with dependents, you'll need to either export
+symbols manually (recommended for cross-platform libraries) or factor out the
+shared library (and have the Python module link to it like the other
+dependents). As a temporary workaround, you can also restore default visibility
+using the CMake code below, but this is not recommended in the long run:
+
+.. code-block:: cmake
+
+    target_link_libraries(mymodule PRIVATE pybind11::module)
+
+    add_library(restore_default_visibility INTERFACE)
+    target_compile_options(restore_default_visibility INTERFACE -fvisibility=default)
+    target_link_libraries(mymodule PRIVATE restore_default_visibility)
+
+
+Local STL container bindings
+----------------------------
+
+Previous pybind11 versions could only bind types globally -- all pybind11
+modules, even unrelated ones, would have access to the same exported types.
+However, this would also result in a conflict if two modules exported the
+same C++ type, which is especially problematic for very common types, e.g.
+``std::vector<int>``. :ref:`module_local` were added to resolve this (see
+that section for a complete usage guide).
+
+``py::class_`` still defaults to global bindings (because these types are
+usually unique across modules), however in order to avoid clashes of opaque
+types, ``py::bind_vector`` and ``py::bind_map`` will now bind STL containers
+as ``py::module_local`` if their elements are: builtins (``int``, ``float``,
+etc.), not bound using ``py::class_``, or bound as ``py::module_local``. For
+example, this change allows multiple modules to bind ``std::vector<int>``
+without causing conflicts. See :ref:`stl_bind` for more details.
+
+When upgrading to this version, if you have multiple modules which depend on
+a single global binding of an STL container, note that all modules can still
+accept foreign  ``py::module_local`` types in the direction of Python-to-C++.
+The locality only affects the C++-to-Python direction. If this is needed in
+multiple modules, you'll need to either:
+
+* Add a copy of the same STL binding to all of the modules which need it.
+
+* Restore the global status of that single binding by marking it
+  ``py::module_local(false)``.
+
+The latter is an easy workaround, but in the long run it would be best to
+localize all common type bindings in order to avoid conflicts with
+third-party modules.
+
+
+Negative strides for Python buffer objects and numpy arrays
+-----------------------------------------------------------
+
+Support for negative strides required changing the integer type from unsigned
+to signed in the interfaces of ``py::buffer_info`` and ``py::array``. If you
+have compiler warnings enabled, you may notice some new conversion warnings
+after upgrading. These can be resolved using ``static_cast``.
+
+
+Deprecation of some ``py::object`` APIs
+---------------------------------------
+
+To compare ``py::object`` instances by pointer, you should now use
+``obj1.is(obj2)`` which is equivalent to ``obj1 is obj2`` in Python.
+Previously, pybind11 used ``operator==`` for this (``obj1 == obj2``), but
+that could be confusing and is now deprecated (so that it can eventually
+be replaced with proper rich object comparison in a future release).
+
+For classes which inherit from ``py::object``, ``borrowed`` and ``stolen``
+were previously available as protected constructor tags. Now the types
+should be used directly instead: ``borrowed_t{}`` and ``stolen_t{}``
+(`#771 <https://github.com/pybind/pybind11/pull/771>`_).
+
+
+Stricter compile-time error checking
+------------------------------------
+
+Some error checks have been moved from run time to compile time. Notably,
+automatic conversion of ``std::shared_ptr<T>`` is not possible when ``T`` is
+not directly registered with ``py::class_<T>`` (e.g. ``std::shared_ptr<int>``
+or ``std::shared_ptr<std::vector<T>>`` are not automatically convertible).
+Attempting to bind a function with such arguments now results in a compile-time
+error instead of waiting to fail at run time.
+
+``py::init<...>()`` constructor definitions are also stricter and now prevent
+bindings which could cause unexpected behavior:
+
+.. code-block:: cpp
+
+    struct Example {
+        Example(int &);
+    };
+
+    py::class_<Example>(m, "Example")
+        .def(py::init<int &>()); // OK, exact match
+        // .def(py::init<int>()); // compile-time error, mismatch
+
+A non-``const`` lvalue reference is not allowed to bind to an rvalue. However,
+note that a constructor taking ``const T &`` can still be registered using
+``py::init<T>()`` because a ``const`` lvalue reference can bind to an rvalue.
+
+v2.1
+====
+
+Minimum compiler versions are enforced at compile time
+------------------------------------------------------
+
+The minimums also apply to v2.0 but the check is now explicit and a compile-time
+error is raised if the compiler does not meet the requirements:
+
+* GCC >= 4.8
+* clang >= 3.3 (appleclang >= 5.0)
+* MSVC >= 2015u3
+* Intel C++ >= 15.0
+
+
+The ``py::metaclass`` attribute is not required for static properties
+---------------------------------------------------------------------
+
+Binding classes with static properties is now possible by default. The
+zero-parameter version of ``py::metaclass()`` is deprecated. However, a new
+one-parameter ``py::metaclass(python_type)`` version was added for rare
+cases when a custom metaclass is needed to override pybind11's default.
+
+.. code-block:: cpp
+
+    // old -- emits a deprecation warning
+    py::class_<Foo>(m, "Foo", py::metaclass())
+        .def_property_readonly_static("foo", ...);
+
+    // new -- static properties work without the attribute
+    py::class_<Foo>(m, "Foo")
+        .def_property_readonly_static("foo", ...);
+
+    // new -- advanced feature, override pybind11's default metaclass
+    py::class_<Bar>(m, "Bar", py::metaclass(custom_python_type))
+        ...
+
+
+v2.0
+====
+
+Breaking changes in ``py::class_``
+----------------------------------
+
+These changes were necessary to make type definitions in pybind11
+future-proof, to support PyPy via its ``cpyext`` mechanism (`#527
+<https://github.com/pybind/pybind11/pull/527>`_), and to improve efficiency
+(`rev. 86d825 <https://github.com/pybind/pybind11/commit/86d825>`_).
+
+1. Declarations of types that provide access via the buffer protocol must
+   now include the ``py::buffer_protocol()`` annotation as an argument to
+   the ``py::class_`` constructor.
+
+   .. code-block:: cpp
+
+       py::class_<Matrix>("Matrix", py::buffer_protocol())
+           .def(py::init<...>())
+           .def_buffer(...);
+
+2. Classes which include static properties (e.g. ``def_readwrite_static()``)
+   must now include the ``py::metaclass()`` attribute. Note: this requirement
+   has since been removed in v2.1. If you're upgrading from 1.x, it's
+   recommended to skip directly to v2.1 or newer.
+
+3. This version of pybind11 uses a redesigned mechanism for instantiating
+   trampoline classes that are used to override virtual methods from within
+   Python. This led to the following user-visible syntax change:
+
+   .. code-block:: cpp
+
+       // old v1.x syntax
+       py::class_<TrampolineClass>("MyClass")
+           .alias<MyClass>()
+           ...
+
+       // new v2.x syntax
+       py::class_<MyClass, TrampolineClass>("MyClass")
+           ...
+
+   Importantly, both the original and the trampoline class are now specified
+   as arguments to the ``py::class_`` template, and the ``alias<..>()`` call
+   is gone. The new scheme has zero overhead in cases when Python doesn't
+   override any functions of the underlying C++ class.
+   `rev. 86d825 <https://github.com/pybind/pybind11/commit/86d825>`_.
+
+   The class type must be the first template argument given to ``py::class_``
+   while the trampoline can be mixed in arbitrary order with other arguments
+   (see the following section).
+
+
+Deprecation of the ``py::base<T>()`` attribute
+----------------------------------------------
+
+``py::base<T>()`` was deprecated in favor of specifying ``T`` as a template
+argument to ``py::class_``. This new syntax also supports multiple inheritance.
+Note that, while the type being exported must be the first argument in the
+``py::class_<Class, ...>`` template, the order of the following types (bases,
+holder and/or trampoline) is not important.
+
+.. code-block:: cpp
+
+    // old v1.x
+    py::class_<Derived>("Derived", py::base<Base>());
+
+    // new v2.x
+    py::class_<Derived, Base>("Derived");
+
+    // new -- multiple inheritance
+    py::class_<Derived, Base1, Base2>("Derived");
+
+    // new -- apart from `Derived` the argument order can be arbitrary
+    py::class_<Derived, Base1, Holder, Base2, Trampoline>("Derived");
+
+
+Out-of-the-box support for ``std::shared_ptr``
+----------------------------------------------
+
+The relevant type caster is now built in, so it's no longer necessary to
+include a declaration of the form:
+
+.. code-block:: cpp
+
+    PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>)
+
+Continuing to do so won’t cause an error or even a deprecation warning,
+but it's completely redundant.
+
+
+Deprecation of a few ``py::object`` APIs
+----------------------------------------
+
+All of the old-style calls emit deprecation warnings.
+
++---------------------------------------+---------------------------------------------+
+|  Old syntax                           |  New syntax                                 |
++=======================================+=============================================+
+| ``obj.call(args...)``                 | ``obj(args...)``                            |
++---------------------------------------+---------------------------------------------+
+| ``obj.str()``                         | ``py::str(obj)``                            |
++---------------------------------------+---------------------------------------------+
+| ``auto l = py::list(obj); l.check()`` | ``py::isinstance<py::list>(obj)``           |
++---------------------------------------+---------------------------------------------+
+| ``py::object(ptr, true)``             | ``py::reinterpret_borrow<py::object>(ptr)`` |
++---------------------------------------+---------------------------------------------+
+| ``py::object(ptr, false)``            | ``py::reinterpret_steal<py::object>(ptr)``  |
++---------------------------------------+---------------------------------------------+
+| ``if (obj.attr("foo"))``              | ``if (py::hasattr(obj, "foo"))``            |
++---------------------------------------+---------------------------------------------+
+| ``if (obj["bar"])``                   | ``if (obj.contains("bar"))``                |
++---------------------------------------+---------------------------------------------+
diff --git a/pybind11/include/pybind11/attr.h b/pybind11/include/pybind11/attr.h
new file mode 100644
index 0000000000000000000000000000000000000000..54065fc9e10a075e1a2de5d6095e88d4b0a4aca2
--- /dev/null
+++ b/pybind11/include/pybind11/attr.h
@@ -0,0 +1,528 @@
+/*
+    pybind11/attr.h: Infrastructure for processing custom
+    type and function attributes
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "cast.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// \addtogroup annotations
+/// @{
+
+/// Annotation for methods
+struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
+
+/// Annotation for operators
+struct is_operator { };
+
+/// Annotation for classes that cannot be subclassed
+struct is_final { };
+
+/// Annotation for parent scope
+struct scope { handle value; scope(const handle &s) : value(s) { } };
+
+/// Annotation for documentation
+struct doc { const char *value; doc(const char *value) : value(value) { } };
+
+/// Annotation for function names
+struct name { const char *value; name(const char *value) : value(value) { } };
+
+/// Annotation indicating that a function is an overload associated with a given "sibling"
+struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
+
+/// Annotation indicating that a class derives from another given type
+template <typename T> struct base {
+    PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
+    base() { }
+};
+
+/// Keep patient alive while nurse lives
+template <size_t Nurse, size_t Patient> struct keep_alive { };
+
+/// Annotation indicating that a class is involved in a multiple inheritance relationship
+struct multiple_inheritance { };
+
+/// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
+struct dynamic_attr { };
+
+/// Annotation which enables the buffer protocol for a type
+struct buffer_protocol { };
+
+/// Annotation which requests that a special metaclass is created for a type
+struct metaclass {
+    handle value;
+
+    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
+    metaclass() {}
+
+    /// Override pybind11's default metaclass
+    explicit metaclass(handle value) : value(value) { }
+};
+
+/// Annotation that marks a class as local to the module:
+struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
+
+/// Annotation to mark enums as an arithmetic type
+struct arithmetic { };
+
+/** \rst
+    A call policy which places one or more guard variables (``Ts...``) around the function call.
+
+    For example, this definition:
+
+    .. code-block:: cpp
+
+        m.def("foo", foo, py::call_guard<T>());
+
+    is equivalent to the following pseudocode:
+
+    .. code-block:: cpp
+
+        m.def("foo", [](args...) {
+            T scope_guard;
+            return foo(args...); // forwarded arguments
+        });
+ \endrst */
+template <typename... Ts> struct call_guard;
+
+template <> struct call_guard<> { using type = detail::void_type; };
+
+template <typename T>
+struct call_guard<T> {
+    static_assert(std::is_default_constructible<T>::value,
+                  "The guard type must be default constructible");
+
+    using type = T;
+};
+
+template <typename T, typename... Ts>
+struct call_guard<T, Ts...> {
+    struct type {
+        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
+        typename call_guard<Ts...>::type next{};
+    };
+};
+
+/// @} annotations
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+/* Forward declarations */
+enum op_id : int;
+enum op_type : int;
+struct undefined_t;
+template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
+inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+
+/// Internal data structure which holds metadata about a keyword argument
+struct argument_record {
+    const char *name;  ///< Argument name
+    const char *descr; ///< Human-readable version of the argument value
+    handle value;      ///< Associated Python object
+    bool convert : 1;  ///< True if the argument is allowed to convert when loading
+    bool none : 1;     ///< True if None is allowed when loading
+
+    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
+        : name(name), descr(descr), value(value), convert(convert), none(none) { }
+};
+
+/// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
+struct function_record {
+    function_record()
+        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
+          is_operator(false), is_method(false),
+          has_args(false), has_kwargs(false), has_kwonly_args(false) { }
+
+    /// Function name
+    char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
+
+    // User-specified documentation string
+    char *doc = nullptr;
+
+    /// Human-readable version of the function signature
+    char *signature = nullptr;
+
+    /// List of registered keyword arguments
+    std::vector<argument_record> args;
+
+    /// Pointer to lambda function which converts arguments and performs the actual call
+    handle (*impl) (function_call &) = nullptr;
+
+    /// Storage for the wrapped function pointer and captured data, if any
+    void *data[3] = { };
+
+    /// Pointer to custom destructor for 'data' (if needed)
+    void (*free_data) (function_record *ptr) = nullptr;
+
+    /// Return value policy associated with this function
+    return_value_policy policy = return_value_policy::automatic;
+
+    /// True if name == '__init__'
+    bool is_constructor : 1;
+
+    /// True if this is a new-style `__init__` defined in `detail/init.h`
+    bool is_new_style_constructor : 1;
+
+    /// True if this is a stateless function pointer
+    bool is_stateless : 1;
+
+    /// True if this is an operator (__add__), etc.
+    bool is_operator : 1;
+
+    /// True if this is a method
+    bool is_method : 1;
+
+    /// True if the function has a '*args' argument
+    bool has_args : 1;
+
+    /// True if the function has a '**kwargs' argument
+    bool has_kwargs : 1;
+
+    /// True once a 'py::kwonly' is encountered (any following args are keyword-only)
+    bool has_kwonly_args : 1;
+
+    /// Number of arguments (including py::args and/or py::kwargs, if present)
+    std::uint16_t nargs;
+
+    /// Number of trailing arguments (counted in `nargs`) that are keyword-only
+    std::uint16_t nargs_kwonly = 0;
+
+    /// Python method object
+    PyMethodDef *def = nullptr;
+
+    /// Python handle to the parent scope (a class or a module)
+    handle scope;
+
+    /// Python handle to the sibling function representing an overload chain
+    handle sibling;
+
+    /// Pointer to next overload
+    function_record *next = nullptr;
+};
+
+/// Special data structure which (temporarily) holds metadata about a bound class
+struct type_record {
+    PYBIND11_NOINLINE type_record()
+        : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false),
+          default_holder(true), module_local(false), is_final(false) { }
+
+    /// Handle to the parent scope
+    handle scope;
+
+    /// Name of the class
+    const char *name = nullptr;
+
+    // Pointer to RTTI type_info data structure
+    const std::type_info *type = nullptr;
+
+    /// How large is the underlying C++ type?
+    size_t type_size = 0;
+
+    /// What is the alignment of the underlying C++ type?
+    size_t type_align = 0;
+
+    /// How large is the type's holder?
+    size_t holder_size = 0;
+
+    /// The global operator new can be overridden with a class-specific variant
+    void *(*operator_new)(size_t) = nullptr;
+
+    /// Function pointer to class_<..>::init_instance
+    void (*init_instance)(instance *, const void *) = nullptr;
+
+    /// Function pointer to class_<..>::dealloc
+    void (*dealloc)(detail::value_and_holder &) = nullptr;
+
+    /// List of base classes of the newly created type
+    list bases;
+
+    /// Optional docstring
+    const char *doc = nullptr;
+
+    /// Custom metaclass (optional)
+    handle metaclass;
+
+    /// Multiple inheritance marker
+    bool multiple_inheritance : 1;
+
+    /// Does the class manage a __dict__?
+    bool dynamic_attr : 1;
+
+    /// Does the class implement the buffer protocol?
+    bool buffer_protocol : 1;
+
+    /// Is the default (unique_ptr) holder type used?
+    bool default_holder : 1;
+
+    /// Is the class definition local to the module shared object?
+    bool module_local : 1;
+
+    /// Is the class inheritable from python classes?
+    bool is_final : 1;
+
+    PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) {
+        auto base_info = detail::get_type_info(base, false);
+        if (!base_info) {
+            std::string tname(base.name());
+            detail::clean_type_id(tname);
+            pybind11_fail("generic_type: type \"" + std::string(name) +
+                          "\" referenced unknown base type \"" + tname + "\"");
+        }
+
+        if (default_holder != base_info->default_holder) {
+            std::string tname(base.name());
+            detail::clean_type_id(tname);
+            pybind11_fail("generic_type: type \"" + std::string(name) + "\" " +
+                    (default_holder ? "does not have" : "has") +
+                    " a non-default holder type while its base \"" + tname + "\" " +
+                    (base_info->default_holder ? "does not" : "does"));
+        }
+
+        bases.append((PyObject *) base_info->type);
+
+        if (base_info->type->tp_dictoffset != 0)
+            dynamic_attr = true;
+
+        if (caster)
+            base_info->implicit_casts.emplace_back(type, caster);
+    }
+};
+
+inline function_call::function_call(const function_record &f, handle p) :
+        func(f), parent(p) {
+    args.reserve(f.nargs);
+    args_convert.reserve(f.nargs);
+}
+
+/// Tag for a new-style `__init__` defined in `detail/init.h`
+struct is_new_style_constructor { };
+
+/**
+ * Partial template specializations to process custom attributes provided to
+ * cpp_function_ and class_. These are either used to initialize the respective
+ * fields in the type_record and function_record data structures or executed at
+ * runtime to deal with custom call policies (e.g. keep_alive).
+ */
+template <typename T, typename SFINAE = void> struct process_attribute;
+
+template <typename T> struct process_attribute_default {
+    /// Default implementation: do nothing
+    static void init(const T &, function_record *) { }
+    static void init(const T &, type_record *) { }
+    static void precall(function_call &) { }
+    static void postcall(function_call &, handle) { }
+};
+
+/// Process an attribute specifying the function's name
+template <> struct process_attribute<name> : process_attribute_default<name> {
+    static void init(const name &n, function_record *r) { r->name = const_cast<char *>(n.value); }
+};
+
+/// Process an attribute specifying the function's docstring
+template <> struct process_attribute<doc> : process_attribute_default<doc> {
+    static void init(const doc &n, function_record *r) { r->doc = const_cast<char *>(n.value); }
+};
+
+/// Process an attribute specifying the function's docstring (provided as a C-style string)
+template <> struct process_attribute<const char *> : process_attribute_default<const char *> {
+    static void init(const char *d, function_record *r) { r->doc = const_cast<char *>(d); }
+    static void init(const char *d, type_record *r) { r->doc = const_cast<char *>(d); }
+};
+template <> struct process_attribute<char *> : process_attribute<const char *> { };
+
+/// Process an attribute indicating the function's return value policy
+template <> struct process_attribute<return_value_policy> : process_attribute_default<return_value_policy> {
+    static void init(const return_value_policy &p, function_record *r) { r->policy = p; }
+};
+
+/// Process an attribute which indicates that this is an overloaded function associated with a given sibling
+template <> struct process_attribute<sibling> : process_attribute_default<sibling> {
+    static void init(const sibling &s, function_record *r) { r->sibling = s.value; }
+};
+
+/// Process an attribute which indicates that this function is a method
+template <> struct process_attribute<is_method> : process_attribute_default<is_method> {
+    static void init(const is_method &s, function_record *r) { r->is_method = true; r->scope = s.class_; }
+};
+
+/// Process an attribute which indicates the parent scope of a method
+template <> struct process_attribute<scope> : process_attribute_default<scope> {
+    static void init(const scope &s, function_record *r) { r->scope = s.value; }
+};
+
+/// Process an attribute which indicates that this function is an operator
+template <> struct process_attribute<is_operator> : process_attribute_default<is_operator> {
+    static void init(const is_operator &, function_record *r) { r->is_operator = true; }
+};
+
+template <> struct process_attribute<is_new_style_constructor> : process_attribute_default<is_new_style_constructor> {
+    static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; }
+};
+
+inline void process_kwonly_arg(const arg &a, function_record *r) {
+    if (!a.name || strlen(a.name) == 0)
+        pybind11_fail("arg(): cannot specify an unnamed argument after an kwonly() annotation");
+    ++r->nargs_kwonly;
+}
+
+/// Process a keyword argument attribute (*without* a default value)
+template <> struct process_attribute<arg> : process_attribute_default<arg> {
+    static void init(const arg &a, function_record *r) {
+        if (r->is_method && r->args.empty())
+            r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/);
+        r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
+
+        if (r->has_kwonly_args) process_kwonly_arg(a, r);
+    }
+};
+
+/// Process a keyword argument attribute (*with* a default value)
+template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
+    static void init(const arg_v &a, function_record *r) {
+        if (r->is_method && r->args.empty())
+            r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/);
+
+        if (!a.value) {
+#if !defined(NDEBUG)
+            std::string descr("'");
+            if (a.name) descr += std::string(a.name) + ": ";
+            descr += a.type + "'";
+            if (r->is_method) {
+                if (r->name)
+                    descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'";
+                else
+                    descr += " in method of '" + (std::string) str(r->scope) + "'";
+            } else if (r->name) {
+                descr += " in function '" + (std::string) r->name + "'";
+            }
+            pybind11_fail("arg(): could not convert default argument "
+                          + descr + " into a Python object (type not registered yet?)");
+#else
+            pybind11_fail("arg(): could not convert default argument "
+                          "into a Python object (type not registered yet?). "
+                          "Compile in debug mode for more information.");
+#endif
+        }
+        r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
+
+        if (r->has_kwonly_args) process_kwonly_arg(a, r);
+    }
+};
+
+/// Process a keyword-only-arguments-follow pseudo argument
+template <> struct process_attribute<kwonly> : process_attribute_default<kwonly> {
+    static void init(const kwonly &, function_record *r) {
+        r->has_kwonly_args = true;
+    }
+};
+
+/// Process a parent class attribute.  Single inheritance only (class_ itself already guarantees that)
+template <typename T>
+struct process_attribute<T, enable_if_t<is_pyobject<T>::value>> : process_attribute_default<handle> {
+    static void init(const handle &h, type_record *r) { r->bases.append(h); }
+};
+
+/// Process a parent class attribute (deprecated, does not support multiple inheritance)
+template <typename T>
+struct process_attribute<base<T>> : process_attribute_default<base<T>> {
+    static void init(const base<T> &, type_record *r) { r->add_base(typeid(T), nullptr); }
+};
+
+/// Process a multiple inheritance attribute
+template <>
+struct process_attribute<multiple_inheritance> : process_attribute_default<multiple_inheritance> {
+    static void init(const multiple_inheritance &, type_record *r) { r->multiple_inheritance = true; }
+};
+
+template <>
+struct process_attribute<dynamic_attr> : process_attribute_default<dynamic_attr> {
+    static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; }
+};
+
+template <>
+struct process_attribute<is_final> : process_attribute_default<is_final> {
+    static void init(const is_final &, type_record *r) { r->is_final = true; }
+};
+
+template <>
+struct process_attribute<buffer_protocol> : process_attribute_default<buffer_protocol> {
+    static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; }
+};
+
+template <>
+struct process_attribute<metaclass> : process_attribute_default<metaclass> {
+    static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; }
+};
+
+template <>
+struct process_attribute<module_local> : process_attribute_default<module_local> {
+    static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
+};
+
+/// Process an 'arithmetic' attribute for enums (does nothing here)
+template <>
+struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
+
+template <typename... Ts>
+struct process_attribute<call_guard<Ts...>> : process_attribute_default<call_guard<Ts...>> { };
+
+/**
+ * Process a keep_alive call policy -- invokes keep_alive_impl during the
+ * pre-call handler if both Nurse, Patient != 0 and use the post-call handler
+ * otherwise
+ */
+template <size_t Nurse, size_t Patient> struct process_attribute<keep_alive<Nurse, Patient>> : public process_attribute_default<keep_alive<Nurse, Patient>> {
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void postcall(function_call &, handle) { }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void precall(function_call &) { }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); }
+};
+
+/// Recursively iterate over variadic template arguments
+template <typename... Args> struct process_attributes {
+    static void init(const Args&... args, function_record *r) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
+        ignore_unused(unused);
+    }
+    static void init(const Args&... args, type_record *r) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
+        ignore_unused(unused);
+    }
+    static void precall(function_call &call) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(call), 0) ... };
+        ignore_unused(unused);
+    }
+    static void postcall(function_call &call, handle fn_ret) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0) ... };
+        ignore_unused(unused);
+    }
+};
+
+template <typename T>
+using is_call_guard = is_instantiation<call_guard, T>;
+
+/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found)
+template <typename... Extra>
+using extract_guard_t = typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
+
+/// Check the number of named arguments at compile time
+template <typename... Extra,
+          size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
+          size_t self  = constexpr_sum(std::is_same<is_method, Extra>::value...)>
+constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
+    return named == 0 || (self + named + has_args + has_kwargs) == nargs;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/buffer_info.h b/pybind11/include/pybind11/buffer_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..8349a46b8b92f87e9f641b30b7b86617b7f85d50
--- /dev/null
+++ b/pybind11/include/pybind11/buffer_info.h
@@ -0,0 +1,116 @@
+/*
+    pybind11/buffer_info.h: Python buffer object interface
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// Information record describing a Python buffer object
+struct buffer_info {
+    void *ptr = nullptr;          // Pointer to the underlying storage
+    ssize_t itemsize = 0;         // Size of individual items in bytes
+    ssize_t size = 0;             // Total number of entries
+    std::string format;           // For homogeneous buffers, this should be set to format_descriptor<T>::format()
+    ssize_t ndim = 0;             // Number of dimensions
+    std::vector<ssize_t> shape;   // Shape of the tensor (1 entry per dimension)
+    std::vector<ssize_t> strides; // Number of bytes between adjacent entries (for each per dimension)
+    bool readonly = false;        // flag to indicate if the underlying storage may be written to
+
+    buffer_info() { }
+
+    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
+                detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in, bool readonly=false)
+    : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
+      shape(std::move(shape_in)), strides(std::move(strides_in)), readonly(readonly) {
+        if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size())
+            pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length");
+        for (size_t i = 0; i < (size_t) ndim; ++i)
+            size *= shape[i];
+    }
+
+    template <typename T>
+    buffer_info(T *ptr, detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in, bool readonly=false)
+    : buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor<T>::format(), static_cast<ssize_t>(shape_in->size()), std::move(shape_in), std::move(strides_in), readonly) { }
+
+    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size, bool readonly=false)
+    : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}, readonly) { }
+
+    template <typename T>
+    buffer_info(T *ptr, ssize_t size, bool readonly=false)
+    : buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size, readonly) { }
+
+    template <typename T>
+    buffer_info(const T *ptr, ssize_t size, bool readonly=true)
+    : buffer_info(const_cast<T*>(ptr), sizeof(T), format_descriptor<T>::format(), size, readonly) { }
+
+    explicit buffer_info(Py_buffer *view, bool ownview = true)
+    : buffer_info(view->buf, view->itemsize, view->format, view->ndim,
+            {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}, view->readonly) {
+        this->m_view = view;
+        this->ownview = ownview;
+    }
+
+    buffer_info(const buffer_info &) = delete;
+    buffer_info& operator=(const buffer_info &) = delete;
+
+    buffer_info(buffer_info &&other) {
+        (*this) = std::move(other);
+    }
+
+    buffer_info& operator=(buffer_info &&rhs) {
+        ptr = rhs.ptr;
+        itemsize = rhs.itemsize;
+        size = rhs.size;
+        format = std::move(rhs.format);
+        ndim = rhs.ndim;
+        shape = std::move(rhs.shape);
+        strides = std::move(rhs.strides);
+        std::swap(m_view, rhs.m_view);
+        std::swap(ownview, rhs.ownview);
+        readonly = rhs.readonly;
+        return *this;
+    }
+
+    ~buffer_info() {
+        if (m_view && ownview) { PyBuffer_Release(m_view); delete m_view; }
+    }
+
+    Py_buffer *view() const { return m_view; }
+    Py_buffer *&view() { return m_view; }
+private:
+    struct private_ctr_tag { };
+
+    buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
+                detail::any_container<ssize_t> &&shape_in, detail::any_container<ssize_t> &&strides_in, bool readonly)
+    : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in), readonly) { }
+
+    Py_buffer *m_view = nullptr;
+    bool ownview = false;
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename T, typename SFINAE = void> struct compare_buffer_info {
+    static bool compare(const buffer_info& b) {
+        return b.format == format_descriptor<T>::format() && b.itemsize == (ssize_t) sizeof(T);
+    }
+};
+
+template <typename T> struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
+    static bool compare(const buffer_info& b) {
+        return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor<T>::value ||
+            ((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned<T>::value ? "L" : "l")) ||
+            ((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned<T>::value ? "N" : "n")));
+    }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/cast.h b/pybind11/include/pybind11/cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..5711004df9f575c66ec7ca389cbd995675ac69e0
--- /dev/null
+++ b/pybind11/include/pybind11/cast.h
@@ -0,0 +1,2210 @@
+/*
+    pybind11/cast.h: Partial template specializations to cast between
+    C++ and Python types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pytypes.h"
+#include "detail/typeid.h"
+#include "detail/descr.h"
+#include "detail/internals.h"
+#include <array>
+#include <limits>
+#include <tuple>
+#include <type_traits>
+
+#if defined(PYBIND11_CPP17)
+#  if defined(__has_include)
+#    if __has_include(<string_view>)
+#      define PYBIND11_HAS_STRING_VIEW
+#    endif
+#  elif defined(_MSC_VER)
+#    define PYBIND11_HAS_STRING_VIEW
+#  endif
+#endif
+#ifdef PYBIND11_HAS_STRING_VIEW
+#include <string_view>
+#endif
+
+#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
+#  define PYBIND11_HAS_U8STRING
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// A life support system for temporary objects created by `type_caster::load()`.
+/// Adding a patient will keep it alive up until the enclosing function returns.
+class loader_life_support {
+public:
+    /// A new patient frame is created when a function is entered
+    loader_life_support() {
+        get_internals().loader_patient_stack.push_back(nullptr);
+    }
+
+    /// ... and destroyed after it returns
+    ~loader_life_support() {
+        auto &stack = get_internals().loader_patient_stack;
+        if (stack.empty())
+            pybind11_fail("loader_life_support: internal error");
+
+        auto ptr = stack.back();
+        stack.pop_back();
+        Py_CLEAR(ptr);
+
+        // A heuristic to reduce the stack's capacity (e.g. after long recursive calls)
+        if (stack.capacity() > 16 && stack.size() != 0 && stack.capacity() / stack.size() > 2)
+            stack.shrink_to_fit();
+    }
+
+    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
+    /// at argument preparation time or by `py::cast()` at execution time.
+    PYBIND11_NOINLINE static void add_patient(handle h) {
+        auto &stack = get_internals().loader_patient_stack;
+        if (stack.empty())
+            throw cast_error("When called outside a bound function, py::cast() cannot "
+                             "do Python -> C++ conversions which require the creation "
+                             "of temporary values");
+
+        auto &list_ptr = stack.back();
+        if (list_ptr == nullptr) {
+            list_ptr = PyList_New(1);
+            if (!list_ptr)
+                pybind11_fail("loader_life_support: error allocating list");
+            PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
+        } else {
+            auto result = PyList_Append(list_ptr, h.ptr());
+            if (result == -1)
+                pybind11_fail("loader_life_support: error adding patient");
+        }
+    }
+};
+
+// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
+// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
+// just created.
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type);
+
+// Populates a just-created cache entry.
+PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
+    std::vector<PyTypeObject *> check;
+    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
+        check.push_back((PyTypeObject *) parent.ptr());
+
+    auto const &type_dict = get_internals().registered_types_py;
+    for (size_t i = 0; i < check.size(); i++) {
+        auto type = check[i];
+        // Ignore Python2 old-style class super type:
+        if (!PyType_Check((PyObject *) type)) continue;
+
+        // Check `type` in the current set of registered python types:
+        auto it = type_dict.find(type);
+        if (it != type_dict.end()) {
+            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
+            // pybind bases, but we have to make sure we haven't already seen the type(s) before: we
+            // want to follow Python/virtual C++ rules that there should only be one instance of a
+            // common base.
+            for (auto *tinfo : it->second) {
+                // NB: Could use a second set here, rather than doing a linear search, but since
+                // having a large number of immediate pybind11-registered types seems fairly
+                // unlikely, that probably isn't worthwhile.
+                bool found = false;
+                for (auto *known : bases) {
+                    if (known == tinfo) { found = true; break; }
+                }
+                if (!found) bases.push_back(tinfo);
+            }
+        }
+        else if (type->tp_bases) {
+            // It's some python type, so keep follow its bases classes to look for one or more
+            // registered types
+            if (i + 1 == check.size()) {
+                // When we're at the end, we can pop off the current element to avoid growing
+                // `check` when adding just one base (which is typical--i.e. when there is no
+                // multiple inheritance)
+                check.pop_back();
+                i--;
+            }
+            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
+                check.push_back((PyTypeObject *) parent.ptr());
+        }
+    }
+}
+
+/**
+ * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
+ * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
+ * derived class that uses single inheritance.  Will contain as many types as required for a Python
+ * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
+ * pybind-registered classes.  Will be empty if neither the type nor any base classes are
+ * pybind-registered.
+ *
+ * The value is cached for the lifetime of the Python type.
+ */
+inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
+    auto ins = all_type_info_get_cache(type);
+    if (ins.second)
+        // New cache entry: populate it
+        all_type_info_populate(type, ins.first->second);
+
+    return ins.first->second;
+}
+
+/**
+ * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
+ * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
+ * `all_type_info` instead if you want to support multiple bases.
+ */
+PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) {
+    auto &bases = all_type_info(type);
+    if (bases.size() == 0)
+        return nullptr;
+    if (bases.size() > 1)
+        pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
+    return bases.front();
+}
+
+inline detail::type_info *get_local_type_info(const std::type_index &tp) {
+    auto &locals = registered_local_types_cpp();
+    auto it = locals.find(tp);
+    if (it != locals.end())
+        return it->second;
+    return nullptr;
+}
+
+inline detail::type_info *get_global_type_info(const std::type_index &tp) {
+    auto &types = get_internals().registered_types_cpp;
+    auto it = types.find(tp);
+    if (it != types.end())
+        return it->second;
+    return nullptr;
+}
+
+/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr.
+PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp,
+                                                          bool throw_if_missing = false) {
+    if (auto ltype = get_local_type_info(tp))
+        return ltype;
+    if (auto gtype = get_global_type_info(tp))
+        return gtype;
+
+    if (throw_if_missing) {
+        std::string tname = tp.name();
+        detail::clean_type_id(tname);
+        pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\"");
+    }
+    return nullptr;
+}
+
+PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool throw_if_missing) {
+    detail::type_info *type_info = get_type_info(tp, throw_if_missing);
+    return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
+}
+
+struct value_and_holder {
+    instance *inst = nullptr;
+    size_t index = 0u;
+    const detail::type_info *type = nullptr;
+    void **vh = nullptr;
+
+    // Main constructor for a found value/holder:
+    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) :
+        inst{i}, index{index}, type{type},
+        vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]}
+    {}
+
+    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
+    value_and_holder() {}
+
+    // Used for past-the-end iterator
+    value_and_holder(size_t index) : index{index} {}
+
+    template <typename V = void> V *&value_ptr() const {
+        return reinterpret_cast<V *&>(vh[0]);
+    }
+    // True if this `value_and_holder` has a non-null value pointer
+    explicit operator bool() const { return value_ptr(); }
+
+    template <typename H> H &holder() const {
+        return reinterpret_cast<H &>(vh[1]);
+    }
+    bool holder_constructed() const {
+        return inst->simple_layout
+            ? inst->simple_holder_constructed
+            : inst->nonsimple.status[index] & instance::status_holder_constructed;
+    }
+    void set_holder_constructed(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_holder_constructed = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_holder_constructed;
+        else
+            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed;
+    }
+    bool instance_registered() const {
+        return inst->simple_layout
+            ? inst->simple_instance_registered
+            : inst->nonsimple.status[index] & instance::status_instance_registered;
+    }
+    void set_instance_registered(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_instance_registered = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_instance_registered;
+        else
+            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered;
+    }
+};
+
+// Container for accessing and iterating over an instance's values/holders
+struct values_and_holders {
+private:
+    instance *inst;
+    using type_vec = std::vector<detail::type_info *>;
+    const type_vec &tinfo;
+
+public:
+    values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
+
+    struct iterator {
+    private:
+        instance *inst = nullptr;
+        const type_vec *types = nullptr;
+        value_and_holder curr;
+        friend struct values_and_holders;
+        iterator(instance *inst, const type_vec *tinfo)
+            : inst{inst}, types{tinfo},
+            curr(inst /* instance */,
+                 types->empty() ? nullptr : (*types)[0] /* type info */,
+                 0, /* vpos: (non-simple types only): the first vptr comes first */
+                 0 /* index */)
+        {}
+        // Past-the-end iterator:
+        iterator(size_t end) : curr(end) {}
+    public:
+        bool operator==(const iterator &other) const { return curr.index == other.curr.index; }
+        bool operator!=(const iterator &other) const { return curr.index != other.curr.index; }
+        iterator &operator++() {
+            if (!inst->simple_layout)
+                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
+            ++curr.index;
+            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
+            return *this;
+        }
+        value_and_holder &operator*() { return curr; }
+        value_and_holder *operator->() { return &curr; }
+    };
+
+    iterator begin() { return iterator(inst, &tinfo); }
+    iterator end() { return iterator(tinfo.size()); }
+
+    iterator find(const type_info *find_type) {
+        auto it = begin(), endit = end();
+        while (it != endit && it->type != find_type) ++it;
+        return it;
+    }
+
+    size_t size() { return tinfo.size(); }
+};
+
+/**
+ * Extracts C++ value and holder pointer references from an instance (which may contain multiple
+ * values/holders for python-side multiple inheritance) that match the given type.  Throws an error
+ * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance.  If
+ * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned,
+ * regardless of type (and the resulting .type will be nullptr).
+ *
+ * The returned object should be short-lived: in particular, it must not outlive the called-upon
+ * instance.
+ */
+PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) {
+    // Optimize common case:
+    if (!find_type || Py_TYPE(this) == find_type->type)
+        return value_and_holder(this, find_type, 0, 0);
+
+    detail::values_and_holders vhs(this);
+    auto it = vhs.find(find_type);
+    if (it != vhs.end())
+        return *it;
+
+    if (!throw_if_missing)
+        return value_and_holder();
+
+#if defined(NDEBUG)
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
+            "type is not a pybind11 base of the given instance "
+            "(compile in debug mode for type details)");
+#else
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
+            std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" +
+            std::string(Py_TYPE(this)->tp_name) + "' instance");
+#endif
+}
+
+PYBIND11_NOINLINE inline void instance::allocate_layout() {
+    auto &tinfo = all_type_info(Py_TYPE(this));
+
+    const size_t n_types = tinfo.size();
+
+    if (n_types == 0)
+        pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types");
+
+    simple_layout =
+        n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
+
+    // Simple path: no python-side multiple inheritance, and a small-enough holder
+    if (simple_layout) {
+        simple_value_holder[0] = nullptr;
+        simple_holder_constructed = false;
+        simple_instance_registered = false;
+    }
+    else { // multiple base types or a too-large holder
+        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
+        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool
+        // values that tracks whether each associated holder has been initialized.  Each [block] is
+        // padded, if necessary, to an integer multiple of sizeof(void *).
+        size_t space = 0;
+        for (auto t : tinfo) {
+            space += 1; // value pointer
+            space += t->holder_size_in_ptrs; // holder instance
+        }
+        size_t flags_at = space;
+        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
+
+        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
+        // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
+        // they default to using pymalloc, which is designed to be efficient for small allocations
+        // like the one we're doing here; in earlier versions (and for larger allocations) they are
+        // just wrappers around malloc.
+#if PY_VERSION_HEX >= 0x03050000
+        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+#else
+        nonsimple.values_and_holders = (void **) PyMem_New(void *, space);
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+        std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
+#endif
+        nonsimple.status = reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
+    }
+    owned = true;
+}
+
+PYBIND11_NOINLINE inline void instance::deallocate_layout() {
+    if (!simple_layout)
+        PyMem_Free(nonsimple.values_and_holders);
+}
+
+PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) {
+    handle type = detail::get_type_handle(tp, false);
+    if (!type)
+        return false;
+    return isinstance(obj, type);
+}
+
+PYBIND11_NOINLINE inline std::string error_string() {
+    if (!PyErr_Occurred()) {
+        PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred");
+        return "Unknown internal error occurred";
+    }
+
+    error_scope scope; // Preserve error state
+
+    std::string errorString;
+    if (scope.type) {
+        errorString += handle(scope.type).attr("__name__").cast<std::string>();
+        errorString += ": ";
+    }
+    if (scope.value)
+        errorString += (std::string) str(scope.value);
+
+    PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace);
+
+#if PY_MAJOR_VERSION >= 3
+    if (scope.trace != nullptr)
+        PyException_SetTraceback(scope.value, scope.trace);
+#endif
+
+#if !defined(PYPY_VERSION)
+    if (scope.trace) {
+        PyTracebackObject *trace = (PyTracebackObject *) scope.trace;
+
+        /* Get the deepest trace possible */
+        while (trace->tb_next)
+            trace = trace->tb_next;
+
+        PyFrameObject *frame = trace->tb_frame;
+        errorString += "\n\nAt:\n";
+        while (frame) {
+            int lineno = PyFrame_GetLineNumber(frame);
+            errorString +=
+                "  " + handle(frame->f_code->co_filename).cast<std::string>() +
+                "(" + std::to_string(lineno) + "): " +
+                handle(frame->f_code->co_name).cast<std::string>() + "\n";
+            frame = frame->f_back;
+        }
+    }
+#endif
+
+    return errorString;
+}
+
+PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail::type_info *type ) {
+    auto &instances = get_internals().registered_instances;
+    auto range = instances.equal_range(ptr);
+    for (auto it = range.first; it != range.second; ++it) {
+        for (const auto &vh : values_and_holders(it->second)) {
+            if (vh.type == type)
+                return handle((PyObject *) it->second);
+        }
+    }
+    return handle();
+}
+
+inline PyThreadState *get_thread_state_unchecked() {
+#if defined(PYPY_VERSION)
+    return PyThreadState_GET();
+#elif PY_VERSION_HEX < 0x03000000
+    return _PyThreadState_Current;
+#elif PY_VERSION_HEX < 0x03050000
+    return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current);
+#elif PY_VERSION_HEX < 0x03050200
+    return (PyThreadState*) _PyThreadState_Current.value;
+#else
+    return _PyThreadState_UncheckedGet();
+#endif
+}
+
+// Forward declarations
+inline void keep_alive_impl(handle nurse, handle patient);
+inline PyObject *make_new_instance(PyTypeObject *type);
+
+class type_caster_generic {
+public:
+    PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info)
+        : typeinfo(get_type_info(type_info)), cpptype(&type_info) { }
+
+    type_caster_generic(const type_info *typeinfo)
+        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) { }
+
+    bool load(handle src, bool convert) {
+        return load_impl<type_caster_generic>(src, convert);
+    }
+
+    PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent,
+                                         const detail::type_info *tinfo,
+                                         void *(*copy_constructor)(const void *),
+                                         void *(*move_constructor)(const void *),
+                                         const void *existing_holder = nullptr) {
+        if (!tinfo) // no type info: error will be set already
+            return handle();
+
+        void *src = const_cast<void *>(_src);
+        if (src == nullptr)
+            return none().release();
+
+        auto it_instances = get_internals().registered_instances.equal_range(src);
+        for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
+            for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
+                if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype))
+                    return handle((PyObject *) it_i->second).inc_ref();
+            }
+        }
+
+        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
+        auto wrapper = reinterpret_cast<instance *>(inst.ptr());
+        wrapper->owned = false;
+        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
+
+        switch (policy) {
+            case return_value_policy::automatic:
+            case return_value_policy::take_ownership:
+                valueptr = src;
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::automatic_reference:
+            case return_value_policy::reference:
+                valueptr = src;
+                wrapper->owned = false;
+                break;
+
+            case return_value_policy::copy:
+                if (copy_constructor)
+                    valueptr = copy_constructor(src);
+                else {
+#if defined(NDEBUG)
+                    throw cast_error("return_value_policy = copy, but type is "
+                                     "non-copyable! (compile in debug mode for details)");
+#else
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = copy, but type " +
+                                     type_name + " is non-copyable!");
+#endif
+                }
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::move:
+                if (move_constructor)
+                    valueptr = move_constructor(src);
+                else if (copy_constructor)
+                    valueptr = copy_constructor(src);
+                else {
+#if defined(NDEBUG)
+                    throw cast_error("return_value_policy = move, but type is neither "
+                                     "movable nor copyable! "
+                                     "(compile in debug mode for details)");
+#else
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = move, but type " +
+                                     type_name + " is neither movable nor copyable!");
+#endif
+                }
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::reference_internal:
+                valueptr = src;
+                wrapper->owned = false;
+                keep_alive_impl(inst, parent);
+                break;
+
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        }
+
+        tinfo->init_instance(wrapper, existing_holder);
+
+        return inst.release();
+    }
+
+    // Base methods for generic caster; there are overridden in copyable_holder_caster
+    void load_value(value_and_holder &&v_h) {
+        auto *&vptr = v_h.value_ptr();
+        // Lazy allocation for unallocated values:
+        if (vptr == nullptr) {
+            auto *type = v_h.type ? v_h.type : typeinfo;
+            if (type->operator_new) {
+                vptr = type->operator_new(type->type_size);
+            } else {
+                #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
+                    if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
+                        vptr = ::operator new(type->type_size,
+                                              std::align_val_t(type->type_align));
+                    else
+                #endif
+                vptr = ::operator new(type->type_size);
+            }
+        }
+        value = vptr;
+    }
+    bool try_implicit_casts(handle src, bool convert) {
+        for (auto &cast : typeinfo->implicit_casts) {
+            type_caster_generic sub_caster(*cast.first);
+            if (sub_caster.load(src, convert)) {
+                value = cast.second(sub_caster.value);
+                return true;
+            }
+        }
+        return false;
+    }
+    bool try_direct_conversions(handle src) {
+        for (auto &converter : *typeinfo->direct_conversions) {
+            if (converter(src.ptr(), value))
+                return true;
+        }
+        return false;
+    }
+    void check_holder_compat() {}
+
+    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {
+        auto caster = type_caster_generic(ti);
+        if (caster.load(src, false))
+            return caster.value;
+        return nullptr;
+    }
+
+    /// Try to load with foreign typeinfo, if available. Used when there is no
+    /// native typeinfo, or when the native one wasn't able to produce a value.
+    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
+        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
+        const auto pytype = src.get_type();
+        if (!hasattr(pytype, local_key))
+            return false;
+
+        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
+        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type
+        if (foreign_typeinfo->module_local_load == &local_load
+            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))
+            return false;
+
+        if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
+            value = result;
+            return true;
+        }
+        return false;
+    }
+
+    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
+    // bits of code between here and copyable_holder_caster where the two classes need different
+    // logic (without having to resort to virtual inheritance).
+    template <typename ThisT>
+    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
+        if (!src) return false;
+        if (!typeinfo) return try_load_foreign_module_local(src);
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            value = nullptr;
+            return true;
+        }
+
+        auto &this_ = static_cast<ThisT &>(*this);
+        this_.check_holder_compat();
+
+        PyTypeObject *srctype = Py_TYPE(src.ptr());
+
+        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
+        // the instance's value pointer to the target type:
+        if (srctype == typeinfo->type) {
+            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+            return true;
+        }
+        // Case 2: We have a derived class
+        else if (PyType_IsSubtype(srctype, typeinfo->type)) {
+            auto &bases = all_type_info(srctype);
+            bool no_cpp_mi = typeinfo->simple_type;
+
+            // Case 2a: the python type is a Python-inherited derived class that inherits from just
+            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
+            // the right type and we can use reinterpret_cast.
+            // (This is essentially the same as case 2b, but because not using multiple inheritance
+            // is extremely common, we handle it specially to avoid the loop iterator and type
+            // pointer lookup overhead)
+            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
+                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+                return true;
+            }
+            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see if
+            // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we
+            // can safely reinterpret_cast to the relevant pointer.
+            else if (bases.size() > 1) {
+                for (auto base : bases) {
+                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) {
+                        this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
+                        return true;
+                    }
+                }
+            }
+
+            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match
+            // in the registered bases, above, so try implicit casting (needed for proper C++ casting
+            // when MI is involved).
+            if (this_.try_implicit_casts(src, convert))
+                return true;
+        }
+
+        // Perform an implicit conversion
+        if (convert) {
+            for (auto &converter : typeinfo->implicit_conversions) {
+                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
+                if (load_impl<ThisT>(temp, false)) {
+                    loader_life_support::add_patient(temp);
+                    return true;
+                }
+            }
+            if (this_.try_direct_conversions(src))
+                return true;
+        }
+
+        // Failed to match local typeinfo. Try again with global.
+        if (typeinfo->module_local) {
+            if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
+                typeinfo = gtype;
+                return load(src, false);
+            }
+        }
+
+        // Global typeinfo has precedence over foreign module_local
+        return try_load_foreign_module_local(src);
+    }
+
+
+    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
+    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
+    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
+    PYBIND11_NOINLINE static std::pair<const void *, const type_info *> src_and_type(
+            const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) {
+        if (auto *tpi = get_type_info(cast_type))
+            return {src, const_cast<const type_info *>(tpi)};
+
+        // Not found, set error:
+        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
+        detail::clean_type_id(tname);
+        std::string msg = "Unregistered type : " + tname;
+        PyErr_SetString(PyExc_TypeError, msg.c_str());
+        return {nullptr, nullptr};
+    }
+
+    const type_info *typeinfo = nullptr;
+    const std::type_info *cpptype = nullptr;
+    void *value = nullptr;
+};
+
+/**
+ * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
+ * needs to provide `operator T*()` and `operator T&()` operators.
+ *
+ * If the type supports moving the value away via an `operator T&&() &&` method, it should use
+ * `movable_cast_op_type` instead.
+ */
+template <typename T>
+using cast_op_type =
+    conditional_t<std::is_pointer<remove_reference_t<T>>::value,
+        typename std::add_pointer<intrinsic_t<T>>::type,
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
+
+/**
+ * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
+ * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
+ * called in appropriate contexts where the value can be moved rather than copied.
+ *
+ * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro.
+ */
+template <typename T>
+using movable_cast_op_type =
+    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
+        typename std::add_pointer<intrinsic_t<T>>::type,
+    conditional_t<std::is_rvalue_reference<T>::value,
+        typename std::add_rvalue_reference<intrinsic_t<T>>::type,
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
+
+// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
+// T is non-copyable, but code containing such a copy constructor fails to actually compile.
+template <typename T, typename SFINAE = void> struct is_copy_constructible : std::is_copy_constructible<T> {};
+
+// Specialization for types that appear to be copy constructible but also look like stl containers
+// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if
+// so, copy constructability depends on whether the value_type is copy constructible.
+template <typename Container> struct is_copy_constructible<Container, enable_if_t<all_of<
+        std::is_copy_constructible<Container>,
+        std::is_same<typename Container::value_type &, typename Container::reference>,
+        // Avoid infinite recursion
+        negation<std::is_same<Container, typename Container::value_type>>
+    >::value>> : is_copy_constructible<typename Container::value_type> {};
+
+// Likewise for std::pair
+// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't themselves
+// copy constructible, but this can not be relied upon when T1 or T2 are themselves containers).
+template <typename T1, typename T2> struct is_copy_constructible<std::pair<T1, T2>>
+    : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
+
+// The same problems arise with std::is_copy_assignable, so we use the same workaround.
+template <typename T, typename SFINAE = void> struct is_copy_assignable : std::is_copy_assignable<T> {};
+template <typename Container> struct is_copy_assignable<Container, enable_if_t<all_of<
+        std::is_copy_assignable<Container>,
+        std::is_same<typename Container::value_type &, typename Container::reference>
+    >::value>> : is_copy_assignable<typename Container::value_type> {};
+template <typename T1, typename T2> struct is_copy_assignable<std::pair<T1, T2>>
+    : all_of<is_copy_assignable<T1>, is_copy_assignable<T2>> {};
+
+PYBIND11_NAMESPACE_END(detail)
+
+// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object pointed
+// to by `src` actually is an instance of some class derived from `itype`.
+// If so, it sets `tinfo` to point to the std::type_info representing that derived
+// type, and returns a pointer to the start of the most-derived object of that type
+// (in which `src` is a subobject; this will be the same address as `src` in most
+// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src`
+// and leaves `tinfo` at its default value of nullptr.
+//
+// The default polymorphic_type_hook just returns src. A specialization for polymorphic
+// types determines the runtime type of the passed object and adjusts the this-pointer
+// appropriately via dynamic_cast<void*>. This is what enables a C++ Animal* to appear
+// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is
+// registered with pybind11, and this Animal is in fact a Dog).
+//
+// You may specialize polymorphic_type_hook yourself for types that want to appear
+// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern
+// in performance-sensitive applications, used most notably in LLVM.)
+//
+// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with
+// std::enable_if. User provided specializations will always have higher priority than
+// the default implementation and specialization provided in polymorphic_type_hook_base.
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook_base
+{
+    static const void *get(const itype *src, const std::type_info*&) { return src; }
+};
+template <typename itype>
+struct polymorphic_type_hook_base<itype, detail::enable_if_t<std::is_polymorphic<itype>::value>>
+{
+    static const void *get(const itype *src, const std::type_info*& type) {
+        type = src ? &typeid(*src) : nullptr;
+        return dynamic_cast<const void*>(src);
+    }
+};
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook : public polymorphic_type_hook_base<itype> {};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// Generic type caster for objects stored on the heap
+template <typename type> class type_caster_base : public type_caster_generic {
+    using itype = intrinsic_t<type>;
+
+public:
+    static constexpr auto name = _<type>();
+
+    type_caster_base() : type_caster_base(typeid(type)) { }
+    explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { }
+
+    static handle cast(const itype &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast(&src, policy, parent);
+    }
+
+    static handle cast(itype &&src, return_value_policy, handle parent) {
+        return cast(&src, return_value_policy::move, parent);
+    }
+
+    // Returns a (pointer, type_info) pair taking care of necessary type lookup for a
+    // polymorphic type (using RTTI by default, but can be overridden by specializing
+    // polymorphic_type_hook). If the instance isn't derived, returns the base version.
+    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
+        auto &cast_type = typeid(itype);
+        const std::type_info *instance_type = nullptr;
+        const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
+        if (instance_type && !same_type(cast_type, *instance_type)) {
+            // This is a base pointer to a derived type. If the derived type is registered
+            // with pybind11, we want to make the full derived object available.
+            // In the typical case where itype is polymorphic, we get the correct
+            // derived pointer (which may be != base pointer) by a dynamic_cast to
+            // most derived type. If itype is not polymorphic, we won't get here
+            // except via a user-provided specialization of polymorphic_type_hook,
+            // and the user has promised that no this-pointer adjustment is
+            // required in that case, so it's OK to use static_cast.
+            if (const auto *tpi = get_type_info(*instance_type))
+                return {vsrc, tpi};
+        }
+        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so
+        // don't do a cast
+        return type_caster_generic::src_and_type(src, cast_type, instance_type);
+    }
+
+    static handle cast(const itype *src, return_value_policy policy, handle parent) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(
+            st.first, policy, parent, st.second,
+            make_copy_constructor(src), make_move_constructor(src));
+    }
+
+    static handle cast_holder(const itype *src, const void *holder) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(
+            st.first, return_value_policy::take_ownership, {}, st.second,
+            nullptr, nullptr, holder);
+    }
+
+    template <typename T> using cast_op_type = detail::cast_op_type<T>;
+
+    operator itype*() { return (type *) value; }
+    operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); }
+
+protected:
+    using Constructor = void *(*)(const void *);
+
+    /* Only enabled when the types are {copy,move}-constructible *and* when the type
+       does not have a private operator new implementation. */
+    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
+    static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(*reinterpret_cast<const T *>(arg));
+        };
+    }
+
+    template <typename T, typename = enable_if_t<std::is_move_constructible<T>::value>>
+    static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast<T *>(x))), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
+        };
+    }
+
+    static Constructor make_copy_constructor(...) { return nullptr; }
+    static Constructor make_move_constructor(...) { return nullptr; }
+};
+
+template <typename type, typename SFINAE = void> class type_caster : public type_caster_base<type> { };
+template <typename type> using make_caster = type_caster<intrinsic_t<type>>;
+
+// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T
+template <typename T> typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &caster) {
+    return caster.operator typename make_caster<T>::template cast_op_type<T>();
+}
+template <typename T> typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>
+cast_op(make_caster<T> &&caster) {
+    return std::move(caster).operator
+        typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>();
+}
+
+template <typename type> class type_caster<std::reference_wrapper<type>> {
+private:
+    using caster_t = make_caster<type>;
+    caster_t subcaster;
+    using subcaster_cast_op_type = typename caster_t::template cast_op_type<type>;
+    static_assert(std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value,
+            "std::reference_wrapper<T> caster requires T to have a caster with an `T &` operator");
+public:
+    bool load(handle src, bool convert) { return subcaster.load(src, convert); }
+    static constexpr auto name = caster_t::name;
+    static handle cast(const std::reference_wrapper<type> &src, return_value_policy policy, handle parent) {
+        // It is definitely wrong to take ownership of this pointer, so mask that rvp
+        if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic)
+            policy = return_value_policy::automatic_reference;
+        return caster_t::cast(&src.get(), policy, parent);
+    }
+    template <typename T> using cast_op_type = std::reference_wrapper<type>;
+    operator std::reference_wrapper<type>() { return subcaster.operator subcaster_cast_op_type&(); }
+};
+
+#define PYBIND11_TYPE_CASTER(type, py_name) \
+    protected: \
+        type value; \
+    public: \
+        static constexpr auto name = py_name; \
+        template <typename T_, enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0> \
+        static handle cast(T_ *src, return_value_policy policy, handle parent) { \
+            if (!src) return none().release(); \
+            if (policy == return_value_policy::take_ownership) { \
+                auto h = cast(std::move(*src), policy, parent); delete src; return h; \
+            } else { \
+                return cast(*src, policy, parent); \
+            } \
+        } \
+        operator type*() { return &value; } \
+        operator type&() { return value; } \
+        operator type&&() && { return std::move(value); } \
+        template <typename T_> using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
+
+
+template <typename CharT> using is_std_char_type = any_of<
+    std::is_same<CharT, char>, /* std::string */
+#if defined(PYBIND11_HAS_U8STRING)
+    std::is_same<CharT, char8_t>, /* std::u8string */
+#endif
+    std::is_same<CharT, char16_t>, /* std::u16string */
+    std::is_same<CharT, char32_t>, /* std::u32string */
+    std::is_same<CharT, wchar_t> /* std::wstring */
+>;
+
+template <typename T>
+struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
+    using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
+    using _py_type_1 = conditional_t<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>;
+    using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
+public:
+
+    bool load(handle src, bool convert) {
+        py_type py_value;
+
+        if (!src)
+            return false;
+
+        if (std::is_floating_point<T>::value) {
+            if (convert || PyFloat_Check(src.ptr()))
+                py_value = (py_type) PyFloat_AsDouble(src.ptr());
+            else
+                return false;
+        } else if (PyFloat_Check(src.ptr())) {
+            return false;
+        } else if (std::is_unsigned<py_type>::value) {
+            py_value = as_unsigned<py_type>(src.ptr());
+        } else { // signed integer:
+            py_value = sizeof(T) <= sizeof(long)
+                ? (py_type) PyLong_AsLong(src.ptr())
+                : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
+        }
+
+        bool py_err = py_value == (py_type) -1 && PyErr_Occurred();
+
+        // Protect std::numeric_limits::min/max with parentheses
+        if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
+                       (py_value < (py_type) (std::numeric_limits<T>::min)() ||
+                        py_value > (py_type) (std::numeric_limits<T>::max)()))) {
+            bool type_error = py_err && PyErr_ExceptionMatches(
+#if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION)
+                PyExc_SystemError
+#else
+                PyExc_TypeError
+#endif
+            );
+            PyErr_Clear();
+            if (type_error && convert && PyNumber_Check(src.ptr())) {
+                auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
+                                                     ? PyNumber_Float(src.ptr())
+                                                     : PyNumber_Long(src.ptr()));
+                PyErr_Clear();
+                return load(tmp, false);
+            }
+            return false;
+        }
+
+        value = (T) py_value;
+        return true;
+    }
+
+    template<typename U = T>
+    static typename std::enable_if<std::is_floating_point<U>::value, handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyFloat_FromDouble((double) src);
+    }
+
+    template<typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value && (sizeof(U) <= sizeof(long)), handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PYBIND11_LONG_FROM_SIGNED((long) src);
+    }
+
+    template<typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src);
+    }
+
+    template<typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value && (sizeof(U) > sizeof(long)), handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyLong_FromLongLong((long long) src);
+    }
+
+    template<typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value && (sizeof(U) > sizeof(unsigned long)), handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyLong_FromUnsignedLongLong((unsigned long long) src);
+    }
+
+    PYBIND11_TYPE_CASTER(T, _<std::is_integral<T>::value>("int", "float"));
+};
+
+template<typename T> struct void_caster {
+public:
+    bool load(handle src, bool) {
+        if (src && src.is_none())
+            return true;
+        return false;
+    }
+    static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
+        return none().inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(T, _("None"));
+};
+
+template <> class type_caster<void_type> : public void_caster<void_type> {};
+
+template <> class type_caster<void> : public type_caster<void_type> {
+public:
+    using type_caster<void_type>::cast;
+
+    bool load(handle h, bool) {
+        if (!h) {
+            return false;
+        } else if (h.is_none()) {
+            value = nullptr;
+            return true;
+        }
+
+        /* Check if this is a capsule */
+        if (isinstance<capsule>(h)) {
+            value = reinterpret_borrow<capsule>(h);
+            return true;
+        }
+
+        /* Check if this is a C++ type */
+        auto &bases = all_type_info((PyTypeObject *) h.get_type().ptr());
+        if (bases.size() == 1) { // Only allowing loading from a single-value type
+            value = values_and_holders(reinterpret_cast<instance *>(h.ptr())).begin()->value_ptr();
+            return true;
+        }
+
+        /* Fail */
+        return false;
+    }
+
+    static handle cast(const void *ptr, return_value_policy /* policy */, handle /* parent */) {
+        if (ptr)
+            return capsule(ptr).release();
+        else
+            return none().inc_ref();
+    }
+
+    template <typename T> using cast_op_type = void*&;
+    operator void *&() { return value; }
+    static constexpr auto name = _("capsule");
+private:
+    void *value = nullptr;
+};
+
+template <> class type_caster<std::nullptr_t> : public void_caster<std::nullptr_t> { };
+
+template <> class type_caster<bool> {
+public:
+    bool load(handle src, bool convert) {
+        if (!src) return false;
+        else if (src.ptr() == Py_True) { value = true; return true; }
+        else if (src.ptr() == Py_False) { value = false; return true; }
+        else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) {
+            // (allow non-implicit conversion for numpy booleans)
+
+            Py_ssize_t res = -1;
+            if (src.is_none()) {
+                res = 0;  // None is implicitly converted to False
+            }
+            #if defined(PYPY_VERSION)
+            // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr exists
+            else if (hasattr(src, PYBIND11_BOOL_ATTR)) {
+                res = PyObject_IsTrue(src.ptr());
+            }
+            #else
+            // Alternate approach for CPython: this does the same as the above, but optimized
+            // using the CPython API so as to avoid an unneeded attribute lookup.
+            else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) {
+                if (PYBIND11_NB_BOOL(tp_as_number)) {
+                    res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr());
+                }
+            }
+            #endif
+            if (res == 0 || res == 1) {
+                value = (bool) res;
+                return true;
+            } else {
+                PyErr_Clear();
+            }
+        }
+        return false;
+    }
+    static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) {
+        return handle(src ? Py_True : Py_False).inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(bool, _("bool"));
+};
+
+// Helper class for UTF-{8,16,32} C++ stl strings:
+template <typename StringType, bool IsView = false> struct string_caster {
+    using CharT = typename StringType::value_type;
+
+    // Simplify life by being able to assume standard char sizes (the standard only guarantees
+    // minimums, but Python requires exact sizes)
+    static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
+#if defined(PYBIND11_HAS_U8STRING)
+    static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1");
+#endif
+    static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
+    static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
+    // wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
+    static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
+            "Unsupported wchar_t size != 2/4");
+    static constexpr size_t UTF_N = 8 * sizeof(CharT);
+
+    bool load(handle src, bool) {
+#if PY_MAJOR_VERSION < 3
+        object temp;
+#endif
+        handle load_src = src;
+        if (!src) {
+            return false;
+        } else if (!PyUnicode_Check(load_src.ptr())) {
+#if PY_MAJOR_VERSION >= 3
+            return load_bytes(load_src);
+#else
+            if (std::is_same<CharT, char>::value) {
+                return load_bytes(load_src);
+            }
+
+            // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
+            if (!PYBIND11_BYTES_CHECK(load_src.ptr()))
+                return false;
+
+            temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
+            if (!temp) { PyErr_Clear(); return false; }
+            load_src = temp;
+#endif
+        }
+
+        object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
+            load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
+        if (!utfNbytes) { PyErr_Clear(); return false; }
+
+        const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
+        size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
+        if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
+        value = StringType(buffer, length);
+
+        // If we're loading a string_view we need to keep the encoded Python object alive:
+        if (IsView)
+            loader_life_support::add_patient(utfNbytes);
+
+        return true;
+    }
+
+    static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
+        const char *buffer = reinterpret_cast<const char *>(src.data());
+        ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
+        handle s = decode_utfN(buffer, nbytes);
+        if (!s) throw error_already_set();
+        return s;
+    }
+
+    PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
+
+private:
+    static handle decode_utfN(const char *buffer, ssize_t nbytes) {
+#if !defined(PYPY_VERSION)
+        return
+            UTF_N == 8  ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) :
+            UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) :
+                          PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
+#else
+        // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version
+        // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a
+        // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just
+        // passing the encoding as a string value, which works properly:
+        return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr);
+#endif
+    }
+
+    // When loading into a std::string or char*, accept a bytes object as-is (i.e.
+    // without any encoding/decoding attempt).  For other C++ char sizes this is a no-op.
+    // which supports loading a unicode from a str, doesn't take this path.
+    template <typename C = CharT>
+    bool load_bytes(enable_if_t<std::is_same<C, char>::value, handle> src) {
+        if (PYBIND11_BYTES_CHECK(src.ptr())) {
+            // We were passed a Python 3 raw bytes; accept it into a std::string or char*
+            // without any encoding attempt.
+            const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr());
+            if (bytes) {
+                value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr()));
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    template <typename C = CharT>
+    bool load_bytes(enable_if_t<!std::is_same<C, char>::value, handle>) { return false; }
+};
+
+template <typename CharT, class Traits, class Allocator>
+struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string<CharT, Traits, Allocator>> {};
+
+#ifdef PYBIND11_HAS_STRING_VIEW
+template <typename CharT, class Traits>
+struct type_caster<std::basic_string_view<CharT, Traits>, enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string_view<CharT, Traits>, true> {};
+#endif
+
+// Type caster for C-style strings.  We basically use a std::string type caster, but also add the
+// ability to use None as a nullptr char* (which the string caster doesn't allow).
+template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
+    using StringType = std::basic_string<CharT>;
+    using StringCaster = type_caster<StringType>;
+    StringCaster str_caster;
+    bool none = false;
+    CharT one_char = 0;
+public:
+    bool load(handle src, bool convert) {
+        if (!src) return false;
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            none = true;
+            return true;
+        }
+        return str_caster.load(src, convert);
+    }
+
+    static handle cast(const CharT *src, return_value_policy policy, handle parent) {
+        if (src == nullptr) return pybind11::none().inc_ref();
+        return StringCaster::cast(StringType(src), policy, parent);
+    }
+
+    static handle cast(CharT src, return_value_policy policy, handle parent) {
+        if (std::is_same<char, CharT>::value) {
+            handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
+            if (!s) throw error_already_set();
+            return s;
+        }
+        return StringCaster::cast(StringType(1, src), policy, parent);
+    }
+
+    operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
+    operator CharT&() {
+        if (none)
+            throw value_error("Cannot convert None to a character");
+
+        auto &value = static_cast<StringType &>(str_caster);
+        size_t str_len = value.size();
+        if (str_len == 0)
+            throw value_error("Cannot convert empty string to a character");
+
+        // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
+        // is too high, and one for multiple unicode characters (caught later), so we need to figure
+        // out how long the first encoded character is in bytes to distinguish between these two
+        // errors.  We also allow want to allow unicode characters U+0080 through U+00FF, as those
+        // can fit into a single char value.
+        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+            unsigned char v0 = static_cast<unsigned char>(value[0]);
+            size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
+                (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
+                (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
+                4; // 0b11110xxx - start of 4-byte sequence
+
+            if (char0_bytes == str_len) {
+                // If we have a 128-255 value, we can decode it into a single char:
+                if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
+                    one_char = static_cast<CharT>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
+                    return one_char;
+                }
+                // Otherwise we have a single character, but it's > U+00FF
+                throw value_error("Character code point not in range(0x100)");
+            }
+        }
+
+        // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
+        // surrogate pair with total length 2 instantly indicates a range error (but not a "your
+        // string was too long" error).
+        else if (StringCaster::UTF_N == 16 && str_len == 2) {
+            one_char = static_cast<CharT>(value[0]);
+            if (one_char >= 0xD800 && one_char < 0xE000)
+                throw value_error("Character code point not in range(0x10000)");
+        }
+
+        if (str_len != 1)
+            throw value_error("Expected a character, but multi-character string found");
+
+        one_char = value[0];
+        return one_char;
+    }
+
+    static constexpr auto name = _(PYBIND11_STRING_NAME);
+    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+};
+
+// Base implementation for std::tuple and std::pair
+template <template<typename...> class Tuple, typename... Ts> class tuple_caster {
+    using type = Tuple<Ts...>;
+    static constexpr auto size = sizeof...(Ts);
+    using indices = make_index_sequence<size>;
+public:
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<sequence>(src))
+            return false;
+        const auto seq = reinterpret_borrow<sequence>(src);
+        if (seq.size() != size)
+            return false;
+        return load_impl(seq, convert, indices{});
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        return cast_impl(std::forward<T>(src), policy, parent, indices{});
+    }
+
+    // copied from the PYBIND11_TYPE_CASTER macro
+    template <typename T>
+    static handle cast(T *src, return_value_policy policy, handle parent) {
+        if (!src) return none().release();
+        if (policy == return_value_policy::take_ownership) {
+            auto h = cast(std::move(*src), policy, parent); delete src; return h;
+        } else {
+            return cast(*src, policy, parent);
+        }
+    }
+
+    static constexpr auto name = _("Tuple[") + concat(make_caster<Ts>::name...) + _("]");
+
+    template <typename T> using cast_op_type = type;
+
+    operator type() & { return implicit_cast(indices{}); }
+    operator type() && { return std::move(*this).implicit_cast(indices{}); }
+
+protected:
+    template <size_t... Is>
+    type implicit_cast(index_sequence<Is...>) & { return type(cast_op<Ts>(std::get<Is>(subcasters))...); }
+    template <size_t... Is>
+    type implicit_cast(index_sequence<Is...>) && { return type(cast_op<Ts>(std::move(std::get<Is>(subcasters)))...); }
+
+    static constexpr bool load_impl(const sequence &, bool, index_sequence<>) { return true; }
+
+    template <size_t... Is>
+    bool load_impl(const sequence &seq, bool convert, index_sequence<Is...>) {
+#ifdef __cpp_fold_expressions
+        if ((... || !std::get<Is>(subcasters).load(seq[Is], convert)))
+            return false;
+#else
+        for (bool r : {std::get<Is>(subcasters).load(seq[Is], convert)...})
+            if (!r)
+                return false;
+#endif
+        return true;
+    }
+
+    /* Implementation: Convert a C++ tuple into a Python tuple */
+    template <typename T, size_t... Is>
+    static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence<Is...>) {
+        std::array<object, size> entries{{
+            reinterpret_steal<object>(make_caster<Ts>::cast(std::get<Is>(std::forward<T>(src)), policy, parent))...
+        }};
+        for (const auto &entry: entries)
+            if (!entry)
+                return handle();
+        tuple result(size);
+        int counter = 0;
+        for (auto & entry: entries)
+            PyTuple_SET_ITEM(result.ptr(), counter++, entry.release().ptr());
+        return result.release();
+    }
+
+    Tuple<make_caster<Ts>...> subcasters;
+};
+
+template <typename T1, typename T2> class type_caster<std::pair<T1, T2>>
+    : public tuple_caster<std::pair, T1, T2> {};
+
+template <typename... Ts> class type_caster<std::tuple<Ts...>>
+    : public tuple_caster<std::tuple, Ts...> {};
+
+/// Helper class which abstracts away certain actions. Users can provide specializations for
+/// custom holders, but it's only necessary if the type has a non-standard interface.
+template <typename T>
+struct holder_helper {
+    static auto get(const T &p) -> decltype(p.get()) { return p.get(); }
+};
+
+/// Type caster for holder types like std::shared_ptr, etc.
+template <typename type, typename holder_type>
+struct copyable_holder_caster : public type_caster_base<type> {
+public:
+    using base = type_caster_base<type>;
+    static_assert(std::is_base_of<base, type_caster<type>>::value,
+            "Holder classes are only supported for custom types");
+    using base::base;
+    using base::cast;
+    using base::typeinfo;
+    using base::value;
+
+    bool load(handle src, bool convert) {
+        return base::template load_impl<copyable_holder_caster<type, holder_type>>(src, convert);
+    }
+
+    explicit operator type*() { return this->value; }
+    // static_cast works around compiler error with MSVC 17 and CUDA 10.2
+    // see issue #2180
+    explicit operator type&() { return *(static_cast<type *>(this->value)); }
+    explicit operator holder_type*() { return std::addressof(holder); }
+
+    // Workaround for Intel compiler bug
+    // see pybind11 issue 94
+    #if defined(__ICC) || defined(__INTEL_COMPILER)
+    operator holder_type&() { return holder; }
+    #else
+    explicit operator holder_type&() { return holder; }
+    #endif
+
+    static handle cast(const holder_type &src, return_value_policy, handle) {
+        const auto *ptr = holder_helper<holder_type>::get(src);
+        return type_caster_base<type>::cast_holder(ptr, &src);
+    }
+
+protected:
+    friend class type_caster_generic;
+    void check_holder_compat() {
+        if (typeinfo->default_holder)
+            throw cast_error("Unable to load a custom holder type from a default-holder instance");
+    }
+
+    bool load_value(value_and_holder &&v_h) {
+        if (v_h.holder_constructed()) {
+            value = v_h.value_ptr();
+            holder = v_h.template holder<holder_type>();
+            return true;
+        } else {
+            throw cast_error("Unable to cast from non-held to held instance (T& to Holder<T>) "
+#if defined(NDEBUG)
+                             "(compile in debug mode for type information)");
+#else
+                             "of type '" + type_id<holder_type>() + "''");
+#endif
+        }
+    }
+
+    template <typename T = holder_type, detail::enable_if_t<!std::is_constructible<T, const T &, type*>::value, int> = 0>
+    bool try_implicit_casts(handle, bool) { return false; }
+
+    template <typename T = holder_type, detail::enable_if_t<std::is_constructible<T, const T &, type*>::value, int> = 0>
+    bool try_implicit_casts(handle src, bool convert) {
+        for (auto &cast : typeinfo->implicit_casts) {
+            copyable_holder_caster sub_caster(*cast.first);
+            if (sub_caster.load(src, convert)) {
+                value = cast.second(sub_caster.value);
+                holder = holder_type(sub_caster.holder, (type *) value);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static bool try_direct_conversions(handle) { return false; }
+
+
+    holder_type holder;
+};
+
+/// Specialize for the common std::shared_ptr, so users don't need to
+template <typename T>
+class type_caster<std::shared_ptr<T>> : public copyable_holder_caster<T, std::shared_ptr<T>> { };
+
+template <typename type, typename holder_type>
+struct move_only_holder_caster {
+    static_assert(std::is_base_of<type_caster_base<type>, type_caster<type>>::value,
+            "Holder classes are only supported for custom types");
+
+    static handle cast(holder_type &&src, return_value_policy, handle) {
+        auto *ptr = holder_helper<holder_type>::get(src);
+        return type_caster_base<type>::cast_holder(ptr, std::addressof(src));
+    }
+    static constexpr auto name = type_caster_base<type>::name;
+};
+
+template <typename type, typename deleter>
+class type_caster<std::unique_ptr<type, deleter>>
+    : public move_only_holder_caster<type, std::unique_ptr<type, deleter>> { };
+
+template <typename type, typename holder_type>
+using type_caster_holder = conditional_t<is_copy_constructible<holder_type>::value,
+                                         copyable_holder_caster<type, holder_type>,
+                                         move_only_holder_caster<type, holder_type>>;
+
+template <typename T, bool Value = false> struct always_construct_holder { static constexpr bool value = Value; };
+
+/// Create a specialization for custom holder types (silently ignores std::shared_ptr)
+#define PYBIND11_DECLARE_HOLDER_TYPE(type, holder_type, ...) \
+    namespace pybind11 { namespace detail { \
+    template <typename type> \
+    struct always_construct_holder<holder_type> : always_construct_holder<void, ##__VA_ARGS__>  { }; \
+    template <typename type> \
+    class type_caster<holder_type, enable_if_t<!is_shared_ptr<holder_type>::value>> \
+        : public type_caster_holder<type, holder_type> { }; \
+    }}
+
+// PYBIND11_DECLARE_HOLDER_TYPE holder types:
+template <typename base, typename holder> struct is_holder_type :
+    std::is_base_of<detail::type_caster_holder<base, holder>, detail::type_caster<holder>> {};
+// Specialization for always-supported unique_ptr holders:
+template <typename base, typename deleter> struct is_holder_type<base, std::unique_ptr<base, deleter>> :
+    std::true_type {};
+
+template <typename T> struct handle_type_name { static constexpr auto name = _<T>(); };
+template <> struct handle_type_name<bytes> { static constexpr auto name = _(PYBIND11_BYTES_NAME); };
+template <> struct handle_type_name<int_> { static constexpr auto name = _("int"); };
+template <> struct handle_type_name<iterable> { static constexpr auto name = _("Iterable"); };
+template <> struct handle_type_name<iterator> { static constexpr auto name = _("Iterator"); };
+template <> struct handle_type_name<none> { static constexpr auto name = _("None"); };
+template <> struct handle_type_name<args> { static constexpr auto name = _("*args"); };
+template <> struct handle_type_name<kwargs> { static constexpr auto name = _("**kwargs"); };
+
+template <typename type>
+struct pyobject_caster {
+    template <typename T = type, enable_if_t<std::is_same<T, handle>::value, int> = 0>
+    bool load(handle src, bool /* convert */) { value = src; return static_cast<bool>(value); }
+
+    template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
+    bool load(handle src, bool /* convert */) {
+        if (!isinstance<type>(src))
+            return false;
+        value = reinterpret_borrow<type>(src);
+        return true;
+    }
+
+    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
+        return src.inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
+};
+
+template <typename T>
+class type_caster<T, enable_if_t<is_pyobject<T>::value>> : public pyobject_caster<T> { };
+
+// Our conditions for enabling moving are quite restrictive:
+// At compile time:
+// - T needs to be a non-const, non-pointer, non-reference type
+// - type_caster<T>::operator T&() must exist
+// - the type must be move constructible (obviously)
+// At run-time:
+// - if the type is non-copy-constructible, the object must be the sole owner of the type (i.e. it
+//   must have ref_count() == 1)h
+// If any of the above are not satisfied, we fall back to copying.
+template <typename T> using move_is_plain_type = satisfies_none_of<T,
+    std::is_void, std::is_pointer, std::is_reference, std::is_const
+>;
+template <typename T, typename SFINAE = void> struct move_always : std::false_type {};
+template <typename T> struct move_always<T, enable_if_t<all_of<
+    move_is_plain_type<T>,
+    negation<is_copy_constructible<T>>,
+    std::is_move_constructible<T>,
+    std::is_same<decltype(std::declval<make_caster<T>>().operator T&()), T&>
+>::value>> : std::true_type {};
+template <typename T, typename SFINAE = void> struct move_if_unreferenced : std::false_type {};
+template <typename T> struct move_if_unreferenced<T, enable_if_t<all_of<
+    move_is_plain_type<T>,
+    negation<move_always<T>>,
+    std::is_move_constructible<T>,
+    std::is_same<decltype(std::declval<make_caster<T>>().operator T&()), T&>
+>::value>> : std::true_type {};
+template <typename T> using move_never = none_of<move_always<T>, move_if_unreferenced<T>>;
+
+// Detect whether returning a `type` from a cast on type's type_caster is going to result in a
+// reference or pointer to a local variable of the type_caster.  Basically, only
+// non-reference/pointer `type`s and reference/pointers from a type_caster_generic are safe;
+// everything else returns a reference/pointer to a local variable.
+template <typename type> using cast_is_temporary_value_reference = bool_constant<
+    (std::is_reference<type>::value || std::is_pointer<type>::value) &&
+    !std::is_base_of<type_caster_generic, make_caster<type>>::value &&
+    !std::is_same<intrinsic_t<type>, void>::value
+>;
+
+// When a value returned from a C++ function is being cast back to Python, we almost always want to
+// force `policy = move`, regardless of the return value policy the function/method was declared
+// with.
+template <typename Return, typename SFINAE = void> struct return_value_policy_override {
+    static return_value_policy policy(return_value_policy p) { return p; }
+};
+
+template <typename Return> struct return_value_policy_override<Return,
+        detail::enable_if_t<std::is_base_of<type_caster_generic, make_caster<Return>>::value, void>> {
+    static return_value_policy policy(return_value_policy p) {
+        return !std::is_lvalue_reference<Return>::value &&
+               !std::is_pointer<Return>::value
+                   ? return_value_policy::move : p;
+    }
+};
+
+// Basic python -> C++ casting; throws if casting fails
+template <typename T, typename SFINAE> type_caster<T, SFINAE> &load_type(type_caster<T, SFINAE> &conv, const handle &handle) {
+    if (!conv.load(handle, true)) {
+#if defined(NDEBUG)
+        throw cast_error("Unable to cast Python instance to C++ type (compile in debug mode for details)");
+#else
+        throw cast_error("Unable to cast Python instance of type " +
+            (std::string) str(handle.get_type()) + " to C++ type '" + type_id<T>() + "'");
+#endif
+    }
+    return conv;
+}
+// Wrapper around the above that also constructs and returns a type_caster
+template <typename T> make_caster<T> load_type(const handle &handle) {
+    make_caster<T> conv;
+    load_type(conv, handle);
+    return conv;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+
+// pytype -> C++ type
+template <typename T, detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
+T cast(const handle &handle) {
+    using namespace detail;
+    static_assert(!cast_is_temporary_value_reference<T>::value,
+            "Unable to cast type to reference: value is local to type caster");
+    return cast_op<T>(load_type<T>(handle));
+}
+
+// pytype -> pytype (calls converting constructor)
+template <typename T, detail::enable_if_t<detail::is_pyobject<T>::value, int> = 0>
+T cast(const handle &handle) { return T(reinterpret_borrow<object>(handle)); }
+
+// C++ type -> py::object
+template <typename T, detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
+object cast(T &&value, return_value_policy policy = return_value_policy::automatic_reference,
+            handle parent = handle()) {
+    using no_ref_T = typename std::remove_reference<T>::type;
+    if (policy == return_value_policy::automatic)
+        policy = std::is_pointer<no_ref_T>::value ? return_value_policy::take_ownership :
+                 std::is_lvalue_reference<T>::value ? return_value_policy::copy : return_value_policy::move;
+    else if (policy == return_value_policy::automatic_reference)
+        policy = std::is_pointer<no_ref_T>::value ? return_value_policy::reference :
+                 std::is_lvalue_reference<T>::value ? return_value_policy::copy : return_value_policy::move;
+    return reinterpret_steal<object>(detail::make_caster<T>::cast(std::forward<T>(value), policy, parent));
+}
+
+template <typename T> T handle::cast() const { return pybind11::cast<T>(*this); }
+template <> inline void handle::cast() const { return; }
+
+template <typename T>
+detail::enable_if_t<!detail::move_never<T>::value, T> move(object &&obj) {
+    if (obj.ref_count() > 1)
+#if defined(NDEBUG)
+        throw cast_error("Unable to cast Python instance to C++ rvalue: instance has multiple references"
+            " (compile in debug mode for details)");
+#else
+        throw cast_error("Unable to move from Python " + (std::string) str(obj.get_type()) +
+                " instance to C++ " + type_id<T>() + " instance: instance has multiple references");
+#endif
+
+    // Move into a temporary and return that, because the reference may be a local value of `conv`
+    T ret = std::move(detail::load_type<T>(obj).operator T&());
+    return ret;
+}
+
+// Calling cast() on an rvalue calls pybind11::cast with the object rvalue, which does:
+// - If we have to move (because T has no copy constructor), do it.  This will fail if the moved
+//   object has multiple references, but trying to copy will fail to compile.
+// - If both movable and copyable, check ref count: if 1, move; otherwise copy
+// - Otherwise (not movable), copy.
+template <typename T> detail::enable_if_t<detail::move_always<T>::value, T> cast(object &&object) {
+    return move<T>(std::move(object));
+}
+template <typename T> detail::enable_if_t<detail::move_if_unreferenced<T>::value, T> cast(object &&object) {
+    if (object.ref_count() > 1)
+        return cast<T>(object);
+    else
+        return move<T>(std::move(object));
+}
+template <typename T> detail::enable_if_t<detail::move_never<T>::value, T> cast(object &&object) {
+    return cast<T>(object);
+}
+
+template <typename T> T object::cast() const & { return pybind11::cast<T>(*this); }
+template <typename T> T object::cast() && { return pybind11::cast<T>(std::move(*this)); }
+template <> inline void object::cast() const & { return; }
+template <> inline void object::cast() && { return; }
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Declared in pytypes.h:
+template <typename T, enable_if_t<!is_pyobject<T>::value, int>>
+object object_or_cast(T &&o) { return pybind11::cast(std::forward<T>(o)); }
+
+struct overload_unused {}; // Placeholder type for the unneeded (and dead code) static variable in the OVERLOAD_INT macro
+template <typename ret_type> using overload_caster_t = conditional_t<
+    cast_is_temporary_value_reference<ret_type>::value, make_caster<ret_type>, overload_unused>;
+
+// Trampoline use: for reference/pointer types to value-converted values, we do a value cast, then
+// store the result in the given variable.  For other types, this is a no-op.
+template <typename T> enable_if_t<cast_is_temporary_value_reference<T>::value, T> cast_ref(object &&o, make_caster<T> &caster) {
+    return cast_op<T>(load_type(caster, o));
+}
+template <typename T> enable_if_t<!cast_is_temporary_value_reference<T>::value, T> cast_ref(object &&, overload_unused &) {
+    pybind11_fail("Internal error: cast_ref fallback invoked"); }
+
+// Trampoline use: Having a pybind11::cast with an invalid reference type is going to static_assert, even
+// though if it's in dead code, so we provide a "trampoline" to pybind11::cast that only does anything in
+// cases where pybind11::cast is valid.
+template <typename T> enable_if_t<!cast_is_temporary_value_reference<T>::value, T> cast_safe(object &&o) {
+    return pybind11::cast<T>(std::move(o)); }
+template <typename T> enable_if_t<cast_is_temporary_value_reference<T>::value, T> cast_safe(object &&) {
+    pybind11_fail("Internal error: cast_safe fallback invoked"); }
+template <> inline void cast_safe<void>(object &&) {}
+
+PYBIND11_NAMESPACE_END(detail)
+
+template <return_value_policy policy = return_value_policy::automatic_reference>
+tuple make_tuple() { return tuple(0); }
+
+template <return_value_policy policy = return_value_policy::automatic_reference,
+          typename... Args> tuple make_tuple(Args&&... args_) {
+    constexpr size_t size = sizeof...(Args);
+    std::array<object, size> args {
+        { reinterpret_steal<object>(detail::make_caster<Args>::cast(
+            std::forward<Args>(args_), policy, nullptr))... }
+    };
+    for (size_t i = 0; i < args.size(); i++) {
+        if (!args[i]) {
+#if defined(NDEBUG)
+            throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)");
+#else
+            std::array<std::string, size> argtypes { {type_id<Args>()...} };
+            throw cast_error("make_tuple(): unable to convert argument of type '" +
+                argtypes[i] + "' to Python object");
+#endif
+        }
+    }
+    tuple result(size);
+    int counter = 0;
+    for (auto &arg_value : args)
+        PyTuple_SET_ITEM(result.ptr(), counter++, arg_value.release().ptr());
+    return result;
+}
+
+/// \ingroup annotations
+/// Annotation for arguments
+struct arg {
+    /// Constructs an argument with the name of the argument; if null or omitted, this is a positional argument.
+    constexpr explicit arg(const char *name = nullptr) : name(name), flag_noconvert(false), flag_none(true) { }
+    /// Assign a value to this argument
+    template <typename T> arg_v operator=(T &&value) const;
+    /// Indicate that the type should not be converted in the type caster
+    arg &noconvert(bool flag = true) { flag_noconvert = flag; return *this; }
+    /// Indicates that the argument should/shouldn't allow None (e.g. for nullable pointer args)
+    arg &none(bool flag = true) { flag_none = flag; return *this; }
+
+    const char *name; ///< If non-null, this is a named kwargs argument
+    bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a supporting type caster!)
+    bool flag_none : 1; ///< If set (the default), allow None to be passed to this argument
+};
+
+/// \ingroup annotations
+/// Annotation for arguments with values
+struct arg_v : arg {
+private:
+    template <typename T>
+    arg_v(arg &&base, T &&x, const char *descr = nullptr)
+        : arg(base),
+          value(reinterpret_steal<object>(
+              detail::make_caster<T>::cast(x, return_value_policy::automatic, {})
+          )),
+          descr(descr)
+#if !defined(NDEBUG)
+        , type(type_id<T>())
+#endif
+    { }
+
+public:
+    /// Direct construction with name, default, and description
+    template <typename T>
+    arg_v(const char *name, T &&x, const char *descr = nullptr)
+        : arg_v(arg(name), std::forward<T>(x), descr) { }
+
+    /// Called internally when invoking `py::arg("a") = value`
+    template <typename T>
+    arg_v(const arg &base, T &&x, const char *descr = nullptr)
+        : arg_v(arg(base), std::forward<T>(x), descr) { }
+
+    /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg&
+    arg_v &noconvert(bool flag = true) { arg::noconvert(flag); return *this; }
+
+    /// Same as `arg::nonone()`, but returns *this as arg_v&, not arg&
+    arg_v &none(bool flag = true) { arg::none(flag); return *this; }
+
+    /// The default value
+    object value;
+    /// The (optional) description of the default value
+    const char *descr;
+#if !defined(NDEBUG)
+    /// The C++ type name of the default value (only available when compiled in debug mode)
+    std::string type;
+#endif
+};
+
+/// \ingroup annotations
+/// Annotation indicating that all following arguments are keyword-only; the is the equivalent of an
+/// unnamed '*' argument (in Python 3)
+struct kwonly {};
+
+template <typename T>
+arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward<T>(value)}; }
+
+/// Alias for backward compatibility -- to be removed in version 2.0
+template <typename /*unused*/> using arg_t = arg_v;
+
+inline namespace literals {
+/** \rst
+    String literal version of `arg`
+ \endrst */
+constexpr arg operator"" _a(const char *name, size_t) { return arg(name); }
+}
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// forward declaration (definition in attr.h)
+struct function_record;
+
+/// Internal data associated with a single function call
+struct function_call {
+    function_call(const function_record &f, handle p); // Implementation in attr.h
+
+    /// The function data:
+    const function_record &func;
+
+    /// Arguments passed to the function:
+    std::vector<handle> args;
+
+    /// The `convert` value the arguments should be loaded with
+    std::vector<bool> args_convert;
+
+    /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
+    /// present, are also in `args` but without a reference).
+    object args_ref, kwargs_ref;
+
+    /// The parent, if any
+    handle parent;
+
+    /// If this is a call to an initializer, this argument contains `self`
+    handle init_self;
+};
+
+
+/// Helper class which loads arguments for C++ functions called from Python
+template <typename... Args>
+class argument_loader {
+    using indices = make_index_sequence<sizeof...(Args)>;
+
+    template <typename Arg> using argument_is_args   = std::is_same<intrinsic_t<Arg>, args>;
+    template <typename Arg> using argument_is_kwargs = std::is_same<intrinsic_t<Arg>, kwargs>;
+    // Get args/kwargs argument positions relative to the end of the argument list:
+    static constexpr auto args_pos = constexpr_first<argument_is_args, Args...>() - (int) sizeof...(Args),
+                        kwargs_pos = constexpr_first<argument_is_kwargs, Args...>() - (int) sizeof...(Args);
+
+    static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1;
+
+    static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function");
+
+public:
+    static constexpr bool has_kwargs = kwargs_pos < 0;
+    static constexpr bool has_args = args_pos < 0;
+
+    static constexpr auto arg_names = concat(type_descr(make_caster<Args>::name)...);
+
+    bool load_args(function_call &call) {
+        return load_impl_sequence(call, indices{});
+    }
+
+    template <typename Return, typename Guard, typename Func>
+    enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) && {
+        return std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
+    }
+
+    template <typename Return, typename Guard, typename Func>
+    enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) && {
+        std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
+        return void_type();
+    }
+
+private:
+
+    static bool load_impl_sequence(function_call &, index_sequence<>) { return true; }
+
+    template <size_t... Is>
+    bool load_impl_sequence(function_call &call, index_sequence<Is...>) {
+#ifdef __cpp_fold_expressions
+        if ((... || !std::get<Is>(argcasters).load(call.args[Is], call.args_convert[Is])))
+            return false;
+#else
+        for (bool r : {std::get<Is>(argcasters).load(call.args[Is], call.args_convert[Is])...})
+            if (!r)
+                return false;
+#endif
+        return true;
+    }
+
+    template <typename Return, typename Func, size_t... Is, typename Guard>
+    Return call_impl(Func &&f, index_sequence<Is...>, Guard &&) && {
+        return std::forward<Func>(f)(cast_op<Args>(std::move(std::get<Is>(argcasters)))...);
+    }
+
+    std::tuple<make_caster<Args>...> argcasters;
+};
+
+/// Helper class which collects only positional arguments for a Python function call.
+/// A fancier version below can collect any argument, but this one is optimal for simple calls.
+template <return_value_policy policy>
+class simple_collector {
+public:
+    template <typename... Ts>
+    explicit simple_collector(Ts &&...values)
+        : m_args(pybind11::make_tuple<policy>(std::forward<Ts>(values)...)) { }
+
+    const tuple &args() const & { return m_args; }
+    dict kwargs() const { return {}; }
+
+    tuple args() && { return std::move(m_args); }
+
+    /// Call a Python function and pass the collected arguments
+    object call(PyObject *ptr) const {
+        PyObject *result = PyObject_CallObject(ptr, m_args.ptr());
+        if (!result)
+            throw error_already_set();
+        return reinterpret_steal<object>(result);
+    }
+
+private:
+    tuple m_args;
+};
+
+/// Helper class which collects positional, keyword, * and ** arguments for a Python function call
+template <return_value_policy policy>
+class unpacking_collector {
+public:
+    template <typename... Ts>
+    explicit unpacking_collector(Ts &&...values) {
+        // Tuples aren't (easily) resizable so a list is needed for collection,
+        // but the actual function call strictly requires a tuple.
+        auto args_list = list();
+        int _[] = { 0, (process(args_list, std::forward<Ts>(values)), 0)... };
+        ignore_unused(_);
+
+        m_args = std::move(args_list);
+    }
+
+    const tuple &args() const & { return m_args; }
+    const dict &kwargs() const & { return m_kwargs; }
+
+    tuple args() && { return std::move(m_args); }
+    dict kwargs() && { return std::move(m_kwargs); }
+
+    /// Call a Python function and pass the collected arguments
+    object call(PyObject *ptr) const {
+        PyObject *result = PyObject_Call(ptr, m_args.ptr(), m_kwargs.ptr());
+        if (!result)
+            throw error_already_set();
+        return reinterpret_steal<object>(result);
+    }
+
+private:
+    template <typename T>
+    void process(list &args_list, T &&x) {
+        auto o = reinterpret_steal<object>(detail::make_caster<T>::cast(std::forward<T>(x), policy, {}));
+        if (!o) {
+#if defined(NDEBUG)
+            argument_cast_error();
+#else
+            argument_cast_error(std::to_string(args_list.size()), type_id<T>());
+#endif
+        }
+        args_list.append(o);
+    }
+
+    void process(list &args_list, detail::args_proxy ap) {
+        for (const auto &a : ap)
+            args_list.append(a);
+    }
+
+    void process(list &/*args_list*/, arg_v a) {
+        if (!a.name)
+#if defined(NDEBUG)
+            nameless_argument_error();
+#else
+            nameless_argument_error(a.type);
+#endif
+
+        if (m_kwargs.contains(a.name)) {
+#if defined(NDEBUG)
+            multiple_values_error();
+#else
+            multiple_values_error(a.name);
+#endif
+        }
+        if (!a.value) {
+#if defined(NDEBUG)
+            argument_cast_error();
+#else
+            argument_cast_error(a.name, a.type);
+#endif
+        }
+        m_kwargs[a.name] = a.value;
+    }
+
+    void process(list &/*args_list*/, detail::kwargs_proxy kp) {
+        if (!kp)
+            return;
+        for (const auto &k : reinterpret_borrow<dict>(kp)) {
+            if (m_kwargs.contains(k.first)) {
+#if defined(NDEBUG)
+                multiple_values_error();
+#else
+                multiple_values_error(str(k.first));
+#endif
+            }
+            m_kwargs[k.first] = k.second;
+        }
+    }
+
+    [[noreturn]] static void nameless_argument_error() {
+        throw type_error("Got kwargs without a name; only named arguments "
+                         "may be passed via py::arg() to a python function call. "
+                         "(compile in debug mode for details)");
+    }
+    [[noreturn]] static void nameless_argument_error(std::string type) {
+        throw type_error("Got kwargs without a name of type '" + type + "'; only named "
+                         "arguments may be passed via py::arg() to a python function call. ");
+    }
+    [[noreturn]] static void multiple_values_error() {
+        throw type_error("Got multiple values for keyword argument "
+                         "(compile in debug mode for details)");
+    }
+
+    [[noreturn]] static void multiple_values_error(std::string name) {
+        throw type_error("Got multiple values for keyword argument '" + name + "'");
+    }
+
+    [[noreturn]] static void argument_cast_error() {
+        throw cast_error("Unable to convert call argument to Python object "
+                         "(compile in debug mode for details)");
+    }
+
+    [[noreturn]] static void argument_cast_error(std::string name, std::string type) {
+        throw cast_error("Unable to convert call argument '" + name
+                         + "' of type '" + type + "' to Python object");
+    }
+
+private:
+    tuple m_args;
+    dict m_kwargs;
+};
+
+/// Collect only positional arguments for a Python function call
+template <return_value_policy policy, typename... Args,
+          typename = enable_if_t<all_of<is_positional<Args>...>::value>>
+simple_collector<policy> collect_arguments(Args &&...args) {
+    return simple_collector<policy>(std::forward<Args>(args)...);
+}
+
+/// Collect all arguments, including keywords and unpacking (only instantiated when needed)
+template <return_value_policy policy, typename... Args,
+          typename = enable_if_t<!all_of<is_positional<Args>...>::value>>
+unpacking_collector<policy> collect_arguments(Args &&...args) {
+    // Following argument order rules for generalized unpacking according to PEP 448
+    static_assert(
+        constexpr_last<is_positional, Args...>() < constexpr_first<is_keyword_or_ds, Args...>()
+        && constexpr_last<is_s_unpacking, Args...>() < constexpr_first<is_ds_unpacking, Args...>(),
+        "Invalid function call: positional args must precede keywords and ** unpacking; "
+        "* unpacking must precede ** unpacking"
+    );
+    return unpacking_collector<policy>(std::forward<Args>(args)...);
+}
+
+template <typename Derived>
+template <return_value_policy policy, typename... Args>
+object object_api<Derived>::operator()(Args &&...args) const {
+    return detail::collect_arguments<policy>(std::forward<Args>(args)...).call(derived().ptr());
+}
+
+template <typename Derived>
+template <return_value_policy policy, typename... Args>
+object object_api<Derived>::call(Args &&...args) const {
+    return operator()<policy>(std::forward<Args>(args)...);
+}
+
+PYBIND11_NAMESPACE_END(detail)
+
+#define PYBIND11_MAKE_OPAQUE(...) \
+    namespace pybind11 { namespace detail { \
+        template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \
+    }}
+
+/// Lets you pass a type containing a `,` through a macro parameter without needing a separate
+/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType<A, B>), PYBIND11_TYPE(Parent<C, D>), f, arg)`
+#define PYBIND11_TYPE(...) __VA_ARGS__
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/chrono.h b/pybind11/include/pybind11/chrono.h
new file mode 100644
index 0000000000000000000000000000000000000000..6127c659bdcef2da89d9fb80568f1c570bbb6534
--- /dev/null
+++ b/pybind11/include/pybind11/chrono.h
@@ -0,0 +1,191 @@
+/*
+    pybind11/chrono.h: Transparent conversion between std::chrono and python's datetime
+
+    Copyright (c) 2016 Trent Houliston <trent@houliston.me> and
+                       Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <cmath>
+#include <ctime>
+#include <chrono>
+#include <datetime.h>
+
+// Backport the PyDateTime_DELTA functions from Python3.3 if required
+#ifndef PyDateTime_DELTA_GET_DAYS
+#define PyDateTime_DELTA_GET_DAYS(o)         (((PyDateTime_Delta*)o)->days)
+#endif
+#ifndef PyDateTime_DELTA_GET_SECONDS
+#define PyDateTime_DELTA_GET_SECONDS(o)      (((PyDateTime_Delta*)o)->seconds)
+#endif
+#ifndef PyDateTime_DELTA_GET_MICROSECONDS
+#define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds)
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename type> class duration_caster {
+public:
+    typedef typename type::rep rep;
+    typedef typename type::period period;
+
+    typedef std::chrono::duration<uint_fast32_t, std::ratio<86400>> days;
+
+    bool load(handle src, bool) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        if (!src) return false;
+        // If invoked with datetime.delta object
+        if (PyDelta_Check(src.ptr())) {
+            value = type(duration_cast<duration<rep, period>>(
+                  days(PyDateTime_DELTA_GET_DAYS(src.ptr()))
+                + seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr()))
+                + microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr()))));
+            return true;
+        }
+        // If invoked with a float we assume it is seconds and convert
+        else if (PyFloat_Check(src.ptr())) {
+            value = type(duration_cast<duration<rep, period>>(duration<double>(PyFloat_AsDouble(src.ptr()))));
+            return true;
+        }
+        else return false;
+    }
+
+    // If this is a duration just return it back
+    static const std::chrono::duration<rep, period>& get_duration(const std::chrono::duration<rep, period> &src) {
+        return src;
+    }
+
+    // If this is a time_point get the time_since_epoch
+    template <typename Clock> static std::chrono::duration<rep, period> get_duration(const std::chrono::time_point<Clock, std::chrono::duration<rep, period>> &src) {
+        return src.time_since_epoch();
+    }
+
+    static handle cast(const type &src, return_value_policy /* policy */, handle /* parent */) {
+        using namespace std::chrono;
+
+        // Use overloaded function to get our duration from our source
+        // Works out if it is a duration or time_point and get the duration
+        auto d = get_duration(src);
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        // Declare these special duration types so the conversions happen with the correct primitive types (int)
+        using dd_t = duration<int, std::ratio<86400>>;
+        using ss_t = duration<int, std::ratio<1>>;
+        using us_t = duration<int, std::micro>;
+
+        auto dd = duration_cast<dd_t>(d);
+        auto subd = d - dd;
+        auto ss = duration_cast<ss_t>(subd);
+        auto us = duration_cast<us_t>(subd - ss);
+        return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
+};
+
+// This is for casting times on the system clock into datetime.datetime instances
+template <typename Duration> class type_caster<std::chrono::time_point<std::chrono::system_clock, Duration>> {
+public:
+    typedef std::chrono::time_point<std::chrono::system_clock, Duration> type;
+    bool load(handle src, bool) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        if (!src) return false;
+
+        std::tm cal;
+        microseconds msecs;
+
+        if (PyDateTime_Check(src.ptr())) {
+            cal.tm_sec   = PyDateTime_DATE_GET_SECOND(src.ptr());
+            cal.tm_min   = PyDateTime_DATE_GET_MINUTE(src.ptr());
+            cal.tm_hour  = PyDateTime_DATE_GET_HOUR(src.ptr());
+            cal.tm_mday  = PyDateTime_GET_DAY(src.ptr());
+            cal.tm_mon   = PyDateTime_GET_MONTH(src.ptr()) - 1;
+            cal.tm_year  = PyDateTime_GET_YEAR(src.ptr()) - 1900;
+            cal.tm_isdst = -1;
+            msecs        = microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr()));
+        } else if (PyDate_Check(src.ptr())) {
+            cal.tm_sec   = 0;
+            cal.tm_min   = 0;
+            cal.tm_hour  = 0;
+            cal.tm_mday  = PyDateTime_GET_DAY(src.ptr());
+            cal.tm_mon   = PyDateTime_GET_MONTH(src.ptr()) - 1;
+            cal.tm_year  = PyDateTime_GET_YEAR(src.ptr()) - 1900;
+            cal.tm_isdst = -1;
+            msecs        = microseconds(0);
+        } else if (PyTime_Check(src.ptr())) {
+            cal.tm_sec   = PyDateTime_TIME_GET_SECOND(src.ptr());
+            cal.tm_min   = PyDateTime_TIME_GET_MINUTE(src.ptr());
+            cal.tm_hour  = PyDateTime_TIME_GET_HOUR(src.ptr());
+            cal.tm_mday  = 1;   // This date (day, month, year) = (1, 0, 70)
+            cal.tm_mon   = 0;   // represents 1-Jan-1970, which is the first
+            cal.tm_year  = 70;  // earliest available date for Python's datetime
+            cal.tm_isdst = -1;
+            msecs        = microseconds(PyDateTime_TIME_GET_MICROSECOND(src.ptr()));
+        }
+        else return false;
+
+        value = system_clock::from_time_t(std::mktime(&cal)) + msecs;
+        return true;
+    }
+
+    static handle cast(const std::chrono::time_point<std::chrono::system_clock, Duration> &src, return_value_policy /* policy */, handle /* parent */) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        // Get out microseconds, and make sure they are positive, to avoid bug in eastern hemisphere time zones
+        // (cfr. https://github.com/pybind/pybind11/issues/2417)
+        using us_t = duration<int, std::micro>;
+        auto us = duration_cast<us_t>(src.time_since_epoch() % seconds(1));
+        if (us.count() < 0)
+            us += seconds(1);
+
+        // Subtract microseconds BEFORE `system_clock::to_time_t`, because:
+        // > If std::time_t has lower precision, it is implementation-defined whether the value is rounded or truncated.
+        // (https://en.cppreference.com/w/cpp/chrono/system_clock/to_time_t)
+        std::time_t tt = system_clock::to_time_t(time_point_cast<system_clock::duration>(src - us));
+        // this function uses static memory so it's best to copy it out asap just in case
+        // otherwise other code that is using localtime may break this (not just python code)
+        std::tm localtime = *std::localtime(&tt);
+
+        return PyDateTime_FromDateAndTime(localtime.tm_year + 1900,
+                                          localtime.tm_mon + 1,
+                                          localtime.tm_mday,
+                                          localtime.tm_hour,
+                                          localtime.tm_min,
+                                          localtime.tm_sec,
+                                          us.count());
+    }
+    PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
+};
+
+// Other clocks that are not the system clock are not measured as datetime.datetime objects
+// since they are not measured on calendar time. So instead we just make them timedeltas
+// Or if they have passed us a time as a float we convert that
+template <typename Clock, typename Duration> class type_caster<std::chrono::time_point<Clock, Duration>>
+: public duration_caster<std::chrono::time_point<Clock, Duration>> {
+};
+
+template <typename Rep, typename Period> class type_caster<std::chrono::duration<Rep, Period>>
+: public duration_caster<std::chrono::duration<Rep, Period>> {
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/common.h b/pybind11/include/pybind11/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c8a4f1e88e493ee08d24e668639c8d495fd49b1
--- /dev/null
+++ b/pybind11/include/pybind11/common.h
@@ -0,0 +1,2 @@
+#include "detail/common.h"
+#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."
diff --git a/pybind11/include/pybind11/complex.h b/pybind11/include/pybind11/complex.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8327eb37307490b658becf3d151132ddb5df531
--- /dev/null
+++ b/pybind11/include/pybind11/complex.h
@@ -0,0 +1,65 @@
+/*
+    pybind11/complex.h: Complex number support
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <complex>
+
+/// glibc defines I as a macro which breaks things, e.g., boost template names
+#ifdef I
+#  undef I
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+template <typename T> struct format_descriptor<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr const char c = format_descriptor<T>::c;
+    static constexpr const char value[3] = { 'Z', c, '\0' };
+    static std::string format() { return std::string(value); }
+};
+
+#ifndef PYBIND11_CPP17
+
+template <typename T> constexpr const char format_descriptor<
+    std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
+
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename T> struct is_fmt_numeric<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr bool value = true;
+    static constexpr int index = is_fmt_numeric<T>::index + 3;
+};
+
+template <typename T> class type_caster<std::complex<T>> {
+public:
+    bool load(handle src, bool convert) {
+        if (!src)
+            return false;
+        if (!convert && !PyComplex_Check(src.ptr()))
+            return false;
+        Py_complex result = PyComplex_AsCComplex(src.ptr());
+        if (result.real == -1.0 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return false;
+        }
+        value = std::complex<T>((T) result.real, (T) result.imag);
+        return true;
+    }
+
+    static handle cast(const std::complex<T> &src, return_value_policy /* policy */, handle /* parent */) {
+        return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
+    }
+
+    PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
+};
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/class.h b/pybind11/include/pybind11/detail/class.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d36744f2736d79c6fb9c6d93a1ce44f89e3b60e
--- /dev/null
+++ b/pybind11/include/pybind11/detail/class.h
@@ -0,0 +1,668 @@
+/*
+    pybind11/detail/class.h: Python C API implementation details for py::class_
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../attr.h"
+#include "../options.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if PY_VERSION_HEX >= 0x03030000 && !defined(PYPY_VERSION)
+#  define PYBIND11_BUILTIN_QUALNAME
+#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
+#else
+// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable function type
+// signatures; in 3.3+ this macro expands to nothing:
+#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj)
+#endif
+
+inline PyTypeObject *type_incref(PyTypeObject *type) {
+    Py_INCREF(type);
+    return type;
+}
+
+#if !defined(PYPY_VERSION)
+
+/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
+extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
+    return PyProperty_Type.tp_descr_get(self, cls, cls);
+}
+
+/// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
+extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
+    PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
+    return PyProperty_Type.tp_descr_set(self, cls, value);
+}
+
+/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
+    methods are modified to always use the object type instead of a concrete instance.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    constexpr auto *name = "pybind11_static_property";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type)
+        pybind11_fail("make_static_property_type(): error allocating type!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyProperty_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+    type->tp_descr_get = pybind11_static_get;
+    type->tp_descr_set = pybind11_static_set;
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    return type;
+}
+
+#else // PYPY
+
+/** PyPy has some issues with the above C API, so we evaluate Python code instead.
+    This function will only be called once so performance isn't really a concern.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    auto d = dict();
+    PyObject *result = PyRun_String(R"(\
+        class pybind11_static_property(property):
+            def __get__(self, obj, cls):
+                return property.__get__(self, cls, cls)
+
+            def __set__(self, obj, value):
+                cls = obj if isinstance(obj, type) else type(obj)
+                property.__set__(self, cls, value)
+        )", Py_file_input, d.ptr(), d.ptr()
+    );
+    if (result == nullptr)
+        throw error_already_set();
+    Py_DECREF(result);
+    return (PyTypeObject *) d["pybind11_static_property"].cast<object>().release().ptr();
+}
+
+#endif // PYPY
+
+/** Types with static properties need to handle `Type.static_prop = x` in a specific way.
+    By default, Python replaces the `static_property` itself, but for wrapped C++ types
+    we need to call `static_property.__set__()` in order to propagate the new value to
+    the underlying C++ data structure. */
+extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) {
+    // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw
+    // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`).
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+
+    // The following assignment combinations are possible:
+    //   1. `Type.static_prop = value`             --> descr_set: `Type.static_prop.__set__(value)`
+    //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing `static_prop`
+    //   3. `Type.regular_attribute = value`       --> setattro:  regular attribute assignment
+    const auto static_prop = (PyObject *) get_internals().static_property_type;
+    const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop)
+                                && !PyObject_IsInstance(value, static_prop);
+    if (call_descr_set) {
+        // Call `static_property.__set__()` instead of replacing the `static_property`.
+#if !defined(PYPY_VERSION)
+        return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
+#else
+        if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
+            Py_DECREF(result);
+            return 0;
+        } else {
+            return -1;
+        }
+#endif
+    } else {
+        // Replace existing attribute.
+        return PyType_Type.tp_setattro(obj, name, value);
+    }
+}
+
+#if PY_MAJOR_VERSION >= 3
+/**
+ * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing
+ * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function,
+ * when called on a class, or a PyMethod, when called on an instance.  Override that behaviour here
+ * to do a special case bypass for PyInstanceMethod_Types.
+ */
+extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) {
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+    if (descr && PyInstanceMethod_Check(descr)) {
+        Py_INCREF(descr);
+        return descr;
+    }
+    else {
+        return PyType_Type.tp_getattro(obj, name);
+    }
+}
+#endif
+
+/// metaclass `__call__` function that is used to create all pybind11 objects.
+extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, PyObject *kwargs) {
+
+    // use the default metaclass call to create/initialize the object
+    PyObject *self = PyType_Type.tp_call(type, args, kwargs);
+    if (self == nullptr) {
+        return nullptr;
+    }
+
+    // This must be a pybind11 instance
+    auto instance = reinterpret_cast<detail::instance *>(self);
+
+    // Ensure that the base __init__ function(s) were called
+    for (const auto &vh : values_and_holders(instance)) {
+        if (!vh.holder_constructed()) {
+            PyErr_Format(PyExc_TypeError, "%.200s.__init__() must be called when overriding __init__",
+                         vh.type->type->tp_name);
+            Py_DECREF(self);
+            return nullptr;
+        }
+    }
+
+    return self;
+}
+
+/** This metaclass is assigned by default to all pybind11 types and is required in order
+    for static properties to function correctly. Users may override this using `py::metaclass`.
+    Return value: New reference. */
+inline PyTypeObject* make_default_metaclass() {
+    constexpr auto *name = "pybind11_type";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type)
+        pybind11_fail("make_default_metaclass(): error allocating metaclass!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyType_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_call = pybind11_meta_call;
+
+    type->tp_setattro = pybind11_meta_setattro;
+#if PY_MAJOR_VERSION >= 3
+    type->tp_getattro = pybind11_meta_getattro;
+#endif
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    return type;
+}
+
+/// For multiple inheritance types we need to recursively register/deregister base pointers for any
+/// base classes with pointers that are difference from the instance value pointer so that we can
+/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs.
+inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self,
+        bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
+    for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
+        if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) {
+            for (auto &c : parent_tinfo->implicit_casts) {
+                if (c.first == tinfo->cpptype) {
+                    auto *parentptr = c.second(valueptr);
+                    if (parentptr != valueptr)
+                        f(parentptr, self);
+                    traverse_offset_bases(parentptr, parent_tinfo, self, f);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+inline bool register_instance_impl(void *ptr, instance *self) {
+    get_internals().registered_instances.emplace(ptr, self);
+    return true; // unused, but gives the same signature as the deregister func
+}
+inline bool deregister_instance_impl(void *ptr, instance *self) {
+    auto &registered_instances = get_internals().registered_instances;
+    auto range = registered_instances.equal_range(ptr);
+    for (auto it = range.first; it != range.second; ++it) {
+        if (Py_TYPE(self) == Py_TYPE(it->second)) {
+            registered_instances.erase(it);
+            return true;
+        }
+    }
+    return false;
+}
+
+inline void register_instance(instance *self, void *valptr, const type_info *tinfo) {
+    register_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors)
+        traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
+}
+
+inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) {
+    bool ret = deregister_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors)
+        traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
+    return ret;
+}
+
+/// Instance creation function for all pybind11 types. It allocates the internal instance layout for
+/// holding C++ objects and holders.  Allocation is done lazily (the first time the instance is cast
+/// to a reference or pointer), and initialization is done by an `__init__` function.
+inline PyObject *make_new_instance(PyTypeObject *type) {
+#if defined(PYPY_VERSION)
+    // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
+    // object is a a plain Python type (i.e. not derived from an extension type).  Fix it.
+    ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
+    if (type->tp_basicsize < instance_size) {
+        type->tp_basicsize = instance_size;
+    }
+#endif
+    PyObject *self = type->tp_alloc(type, 0);
+    auto inst = reinterpret_cast<instance *>(self);
+    // Allocate the value/holder internals:
+    inst->allocate_layout();
+
+    inst->owned = true;
+
+    return self;
+}
+
+/// Instance creation function for all pybind11 types. It only allocates space for the
+/// C++ object, but doesn't call the constructor -- an `__init__` function must do that.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) {
+    return make_new_instance(type);
+}
+
+/// An `__init__` function constructs the C++ object. Users should provide at least one
+/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the
+/// following default function will be used which simply throws an exception.
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
+    PyTypeObject *type = Py_TYPE(self);
+    std::string msg;
+#if defined(PYPY_VERSION)
+    msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
+#endif
+    msg += type->tp_name;
+    msg += ": No constructor defined!";
+    PyErr_SetString(PyExc_TypeError, msg.c_str());
+    return -1;
+}
+
+inline void add_patient(PyObject *nurse, PyObject *patient) {
+    auto &internals = get_internals();
+    auto instance = reinterpret_cast<detail::instance *>(nurse);
+    instance->has_patients = true;
+    Py_INCREF(patient);
+    internals.patients[nurse].push_back(patient);
+}
+
+inline void clear_patients(PyObject *self) {
+    auto instance = reinterpret_cast<detail::instance *>(self);
+    auto &internals = get_internals();
+    auto pos = internals.patients.find(self);
+    assert(pos != internals.patients.end());
+    // Clearing the patients can cause more Python code to run, which
+    // can invalidate the iterator. Extract the vector of patients
+    // from the unordered_map first.
+    auto patients = std::move(pos->second);
+    internals.patients.erase(pos);
+    instance->has_patients = false;
+    for (PyObject *&patient : patients)
+        Py_CLEAR(patient);
+}
+
+/// Clears all internal data from the instance and removes it from registered instances in
+/// preparation for deallocation.
+inline void clear_instance(PyObject *self) {
+    auto instance = reinterpret_cast<detail::instance *>(self);
+
+    // Deallocate any values/holders, if present:
+    for (auto &v_h : values_and_holders(instance)) {
+        if (v_h) {
+
+            // We have to deregister before we call dealloc because, for virtual MI types, we still
+            // need to be able to get the parent pointers.
+            if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type))
+                pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
+
+            if (instance->owned || v_h.holder_constructed())
+                v_h.type->dealloc(v_h);
+        }
+    }
+    // Deallocate the value/holder layout internals:
+    instance->deallocate_layout();
+
+    if (instance->weakrefs)
+        PyObject_ClearWeakRefs(self);
+
+    PyObject **dict_ptr = _PyObject_GetDictPtr(self);
+    if (dict_ptr)
+        Py_CLEAR(*dict_ptr);
+
+    if (instance->has_patients)
+        clear_patients(self);
+}
+
+/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc`
+/// to destroy the C++ object itself, while the rest is Python bookkeeping.
+extern "C" inline void pybind11_object_dealloc(PyObject *self) {
+    clear_instance(self);
+
+    auto type = Py_TYPE(self);
+    type->tp_free(self);
+
+#if PY_VERSION_HEX < 0x03080000
+    // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
+    // as part of a derived type's dealloc, in which case we're not allowed to decref
+    // the type here. For cross-module compatibility, we shouldn't compare directly
+    // with `pybind11_object_dealloc`, but with the common one stashed in internals.
+    auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base;
+    if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
+        Py_DECREF(type);
+#else
+    // This was not needed before Python 3.8 (Python issue 35810)
+    // https://github.com/pybind/pybind11/issues/1946
+    Py_DECREF(type);
+#endif
+}
+
+/** Create the type which can be used as a common base for all classes.  This is
+    needed in order to satisfy Python's requirements for multiple inheritance.
+    Return value: New reference. */
+inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
+    constexpr auto *name = "pybind11_object";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail("make_object_base_type(): error allocating type!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyBaseObject_Type);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_new = pybind11_object_new;
+    type->tp_init = pybind11_object_init;
+    type->tp_dealloc = pybind11_object_dealloc;
+
+    /* Support weak references (needed for the keep_alive feature) */
+    type->tp_weaklistoffset = offsetof(instance, weakrefs);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string());
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+    return (PyObject *) heap_type;
+}
+
+/// dynamic_attr: Support for `d = instance.__dict__`.
+extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    if (!dict)
+        dict = PyDict_New();
+    Py_XINCREF(dict);
+    return dict;
+}
+
+/// dynamic_attr: Support for `instance.__dict__ = dict()`.
+extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
+    if (!PyDict_Check(new_dict)) {
+        PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
+                     Py_TYPE(new_dict)->tp_name);
+        return -1;
+    }
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_INCREF(new_dict);
+    Py_CLEAR(dict);
+    dict = new_dict;
+    return 0;
+}
+
+/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_VISIT(dict);
+    return 0;
+}
+
+/// dynamic_attr: Allow the GC to clear the dictionary.
+extern "C" inline int pybind11_clear(PyObject *self) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_CLEAR(dict);
+    return 0;
+}
+
+/// Give instances of this type a `__dict__` and opt into garbage collection.
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
+    auto type = &heap_type->ht_type;
+#if defined(PYPY_VERSION) && (PYPY_VERSION_NUM < 0x06000000)
+    pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
+                                               "currently not supported in "
+                                               "conjunction with PyPy!");
+#endif
+    type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+    type->tp_dictoffset = type->tp_basicsize; // place dict at the end
+    type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
+    type->tp_traverse = pybind11_traverse;
+    type->tp_clear = pybind11_clear;
+
+    static PyGetSetDef getset[] = {
+        {const_cast<char*>("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr},
+        {nullptr, nullptr, nullptr, nullptr, nullptr}
+    };
+    type->tp_getset = getset;
+}
+
+/// buffer_protocol: Fill in the view as specified by flags.
+extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
+    // Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
+    type_info *tinfo = nullptr;
+    for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
+        tinfo = get_type_info((PyTypeObject *) type.ptr());
+        if (tinfo && tinfo->get_buffer)
+            break;
+    }
+    if (view == nullptr || !tinfo || !tinfo->get_buffer) {
+        if (view)
+            view->obj = nullptr;
+        PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
+        return -1;
+    }
+    std::memset(view, 0, sizeof(Py_buffer));
+    buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+    view->obj = obj;
+    view->ndim = 1;
+    view->internal = info;
+    view->buf = info->ptr;
+    view->itemsize = info->itemsize;
+    view->len = view->itemsize;
+    for (auto s : info->shape)
+        view->len *= s;
+    view->readonly = info->readonly;
+    if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
+        if (view)
+            view->obj = nullptr;
+        PyErr_SetString(PyExc_BufferError, "Writable buffer requested for readonly storage");
+        return -1;
+    }
+    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
+        view->format = const_cast<char *>(info->format.c_str());
+    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
+        view->ndim = (int) info->ndim;
+        view->strides = &info->strides[0];
+        view->shape = &info->shape[0];
+    }
+    Py_INCREF(view->obj);
+    return 0;
+}
+
+/// buffer_protocol: Release the resources of the buffer.
+extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
+    delete (buffer_info *) view->internal;
+}
+
+/// Give this type a buffer interface.
+inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
+    heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
+#if PY_MAJOR_VERSION < 3
+    heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
+#endif
+
+    heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
+    heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
+}
+
+/** Create a brand new Python type according to the `type_record` specification.
+    Return value: New reference. */
+inline PyObject* make_new_python_type(const type_record &rec) {
+    auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
+
+    auto qualname = name;
+    if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) {
+#if PY_MAJOR_VERSION >= 3
+        qualname = reinterpret_steal<object>(
+            PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
+#else
+        qualname = str(rec.scope.attr("__qualname__").cast<std::string>() + "." + rec.name);
+#endif
+    }
+
+    object module;
+    if (rec.scope) {
+        if (hasattr(rec.scope, "__module__"))
+            module = rec.scope.attr("__module__");
+        else if (hasattr(rec.scope, "__name__"))
+            module = rec.scope.attr("__name__");
+    }
+
+    auto full_name = c_str(
+#if !defined(PYPY_VERSION)
+        module ? str(module).cast<std::string>() + "." + rec.name :
+#endif
+        rec.name);
+
+    char *tp_doc = nullptr;
+    if (rec.doc && options::show_user_defined_docstrings()) {
+        /* Allocate memory for docstring (using PyObject_MALLOC, since
+           Python will free this later on) */
+        size_t size = strlen(rec.doc) + 1;
+        tp_doc = (char *) PyObject_MALLOC(size);
+        memcpy((void *) tp_doc, rec.doc, size);
+    }
+
+    auto &internals = get_internals();
+    auto bases = tuple(rec.bases);
+    auto base = (bases.size() == 0) ? internals.instance_base
+                                    : bases[0].ptr();
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr()
+                                         : internals.default_metaclass;
+
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
+
+    heap_type->ht_name = name.release().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = qualname.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = full_name;
+    type->tp_doc = tp_doc;
+    type->tp_base = type_incref((PyTypeObject *)base);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    if (bases.size() > 0)
+        type->tp_bases = bases.release().ptr();
+
+    /* Don't inherit base __init__ */
+    type->tp_init = pybind11_object_init;
+
+    /* Supported protocols */
+    type->tp_as_number = &heap_type->as_number;
+    type->tp_as_sequence = &heap_type->as_sequence;
+    type->tp_as_mapping = &heap_type->as_mapping;
+#if PY_VERSION_HEX >= 0x03050000
+    type->tp_as_async = &heap_type->as_async;
+#endif
+
+    /* Flags */
+    type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE;
+#if PY_MAJOR_VERSION < 3
+    type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
+#endif
+    if (!rec.is_final)
+        type->tp_flags |= Py_TPFLAGS_BASETYPE;
+
+    if (rec.dynamic_attr)
+        enable_dynamic_attributes(heap_type);
+
+    if (rec.buffer_protocol)
+        enable_buffer_protocol(heap_type);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
+
+    assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
+                            : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+
+    /* Register type with the parent scope */
+    if (rec.scope)
+        setattr(rec.scope, rec.name, (PyObject *) type);
+    else
+        Py_INCREF(type); // Keep it alive forever (reference leak)
+
+    if (module) // Needed by pydoc
+        setattr((PyObject *) type, "__module__", module);
+
+    PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
+
+    return (PyObject *) type;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/common.h b/pybind11/include/pybind11/detail/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..8923faef76eb4e06613273a0c71307814085bcac
--- /dev/null
+++ b/pybind11/include/pybind11/detail/common.h
@@ -0,0 +1,837 @@
+/*
+    pybind11/detail/common.h -- Basic macros
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#define PYBIND11_VERSION_MAJOR 2
+#define PYBIND11_VERSION_MINOR 6
+#define PYBIND11_VERSION_PATCH dev0
+
+#define PYBIND11_NAMESPACE_BEGIN(name) namespace name {
+#define PYBIND11_NAMESPACE_END(name) }
+
+// Robust support for some features and loading modules compiled against different pybind versions
+// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on
+// the main `pybind11` namespace.
+#if !defined(PYBIND11_NAMESPACE)
+#  ifdef __GNUG__
+#    define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
+#  else
+#    define PYBIND11_NAMESPACE pybind11
+#  endif
+#endif
+
+#if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER)
+#  if __cplusplus >= 201402L
+#    define PYBIND11_CPP14
+#    if __cplusplus >= 201703L
+#      define PYBIND11_CPP17
+#    endif
+#  endif
+#elif defined(_MSC_VER) && __cplusplus == 199711L
+// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented)
+// Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer
+#  if _MSVC_LANG >= 201402L
+#    define PYBIND11_CPP14
+#    if _MSVC_LANG > 201402L && _MSC_VER >= 1910
+#      define PYBIND11_CPP17
+#    endif
+#  endif
+#endif
+
+// Compiler version assertions
+#if defined(__INTEL_COMPILER)
+#  if __INTEL_COMPILER < 1700
+#    error pybind11 requires Intel C++ compiler v17 or newer
+#  endif
+#elif defined(__clang__) && !defined(__apple_build_version__)
+#  if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
+#    error pybind11 requires clang 3.3 or newer
+#  endif
+#elif defined(__clang__)
+// Apple changes clang version macros to its Xcode version; the first Xcode release based on
+// (upstream) clang 3.3 was Xcode 5:
+#  if __clang_major__ < 5
+#    error pybind11 requires Xcode/clang 5.0 or newer
+#  endif
+#elif defined(__GNUG__)
+#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
+#    error pybind11 requires gcc 4.8 or newer
+#  endif
+#elif defined(_MSC_VER)
+// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features
+// (e.g. std::negation) added in 2015u3:
+#  if _MSC_FULL_VER < 190024210
+#    error pybind11 requires MSVC 2015 update 3 or newer
+#  endif
+#endif
+
+#if !defined(PYBIND11_EXPORT)
+#  if defined(WIN32) || defined(_WIN32)
+#    define PYBIND11_EXPORT __declspec(dllexport)
+#  else
+#    define PYBIND11_EXPORT __attribute__ ((visibility("default")))
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  define PYBIND11_NOINLINE __declspec(noinline)
+#else
+#  define PYBIND11_NOINLINE __attribute__ ((noinline))
+#endif
+
+#if defined(PYBIND11_CPP14)
+#  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
+#else
+#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
+#endif
+
+#if defined(PYBIND11_CPP17)
+#  define PYBIND11_MAYBE_UNUSED [[maybe_unused]]
+#elif defined(_MSC_VER) && !defined(__clang__)
+#  define PYBIND11_MAYBE_UNUSED
+#else
+#  define PYBIND11_MAYBE_UNUSED __attribute__ ((__unused__))
+#endif
+
+/* Don't let Python.h #define (v)snprintf as macro because they are implemented
+   properly in Visual Studio since 2015. */
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+#  define HAVE_SNPRINTF 1
+#endif
+
+/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
+#if defined(_MSC_VER)
+#  if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
+#    define HAVE_ROUND 1
+#  endif
+#  pragma warning(push)
+#  pragma warning(disable: 4510 4610 4512 4005)
+#  if defined(_DEBUG) && !defined(Py_DEBUG)
+#    define PYBIND11_DEBUG_MARKER
+#    undef _DEBUG
+#  endif
+#endif
+
+#include <Python.h>
+#include <frameobject.h>
+#include <pythread.h>
+
+/* Python #defines overrides on all sorts of core functions, which
+   tends to weak havok in C++ codebases that expect these to work
+   like regular functions (potentially with several overloads) */
+#if defined(isalnum)
+#  undef isalnum
+#  undef isalpha
+#  undef islower
+#  undef isspace
+#  undef isupper
+#  undef tolower
+#  undef toupper
+#endif
+
+#if defined(copysign)
+#  undef copysign
+#endif
+
+#if defined(_MSC_VER)
+#  if defined(PYBIND11_DEBUG_MARKER)
+#    define _DEBUG
+#    undef PYBIND11_DEBUG_MARKER
+#  endif
+#  pragma warning(pop)
+#endif
+
+#include <cstddef>
+#include <cstring>
+#include <forward_list>
+#include <vector>
+#include <string>
+#include <stdexcept>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+#include <typeindex>
+#include <type_traits>
+
+#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyInstanceMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyBytes_Check
+#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString
+#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
+#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
+#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
+#define PYBIND11_BYTES_SIZE PyBytes_Size
+#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
+#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
+#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o)
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o)
+#define PYBIND11_BYTES_NAME "bytes"
+#define PYBIND11_STRING_NAME "str"
+#define PYBIND11_SLICE_OBJECT PyObject
+#define PYBIND11_FROM_STRING PyUnicode_FromString
+#define PYBIND11_STR_TYPE ::pybind11::str
+#define PYBIND11_BOOL_ATTR "__bool__"
+#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
+// Providing a separate declaration to make Clang's -Wmissing-prototypes happy.
+// See comment for PYBIND11_MODULE below for why this is marked "maybe unused".
+#define PYBIND11_PLUGIN_IMPL(name) \
+    extern "C" PYBIND11_MAYBE_UNUSED PYBIND11_EXPORT PyObject *PyInit_##name(); \
+    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
+
+#else
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyString_Check
+#define PYBIND11_BYTES_FROM_STRING PyString_FromString
+#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
+#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
+#define PYBIND11_BYTES_AS_STRING PyString_AsString
+#define PYBIND11_BYTES_SIZE PyString_Size
+#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
+#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
+#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed.
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed.
+#define PYBIND11_BYTES_NAME "str"
+#define PYBIND11_STRING_NAME "unicode"
+#define PYBIND11_SLICE_OBJECT PySliceObject
+#define PYBIND11_FROM_STRING PyString_FromString
+#define PYBIND11_STR_TYPE ::pybind11::bytes
+#define PYBIND11_BOOL_ATTR "__nonzero__"
+#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero)
+// Providing a separate PyInit decl to make Clang's -Wmissing-prototypes happy.
+// See comment for PYBIND11_MODULE below for why this is marked "maybe unused".
+#define PYBIND11_PLUGIN_IMPL(name) \
+    static PyObject *pybind11_init_wrapper();                           \
+    extern "C" PYBIND11_MAYBE_UNUSED PYBIND11_EXPORT void init##name(); \
+    extern "C" PYBIND11_EXPORT void init##name() {                      \
+        (void)pybind11_init_wrapper();                                  \
+    }                                                                   \
+    PyObject *pybind11_init_wrapper()
+#endif
+
+#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200
+extern "C" {
+    struct _Py_atomic_address { void *value; };
+    PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
+}
+#endif
+
+#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code
+#define PYBIND11_STRINGIFY(x) #x
+#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x)
+#define PYBIND11_CONCAT(first, second) first##second
+#define PYBIND11_ENSURE_INTERNALS_READY \
+    pybind11::detail::get_internals();
+
+#define PYBIND11_CHECK_PYTHON_VERSION \
+    {                                                                          \
+        const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION)         \
+            "." PYBIND11_TOSTRING(PY_MINOR_VERSION);                           \
+        const char *runtime_ver = Py_GetVersion();                             \
+        size_t len = std::strlen(compiled_ver);                                \
+        if (std::strncmp(runtime_ver, compiled_ver, len) != 0                  \
+                || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) {     \
+            PyErr_Format(PyExc_ImportError,                                    \
+                "Python version mismatch: module was compiled for Python %s, " \
+                "but the interpreter version is incompatible: %s.",            \
+                compiled_ver, runtime_ver);                                    \
+            return nullptr;                                                    \
+        }                                                                      \
+    }
+
+#define PYBIND11_CATCH_INIT_EXCEPTIONS \
+        catch (pybind11::error_already_set &e) {                               \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        } catch (const std::exception &e) {                                    \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        }                                                                      \
+
+/** \rst
+    ***Deprecated in favor of PYBIND11_MODULE***
+
+    This macro creates the entry point that will be invoked when the Python interpreter
+    imports a plugin library. Please create a `module` in the function body and return
+    the pointer to its underlying Python object at the end.
+
+    .. code-block:: cpp
+
+        PYBIND11_PLUGIN(example) {
+            pybind11::module m("example", "pybind11 example plugin");
+            /// Set up bindings here
+            return m.ptr();
+        }
+\endrst */
+#define PYBIND11_PLUGIN(name)                                                  \
+    PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE")  \
+    static PyObject *pybind11_init();                                          \
+    PYBIND11_PLUGIN_IMPL(name) {                                               \
+        PYBIND11_CHECK_PYTHON_VERSION                                          \
+        PYBIND11_ENSURE_INTERNALS_READY                                        \
+        try {                                                                  \
+            return pybind11_init();                                            \
+        } PYBIND11_CATCH_INIT_EXCEPTIONS                                       \
+    }                                                                          \
+    PyObject *pybind11_init()
+
+/** \rst
+    This macro creates the entry point that will be invoked when the Python interpreter
+    imports an extension module. The module name is given as the fist argument and it
+    should not be in quotes. The second macro argument defines a variable of type
+    `py::module` which can be used to initialize the module.
+
+    The entry point is marked as "maybe unused" to aid dead-code detection analysis:
+    since the entry point is typically only looked up at runtime and not referenced
+    during translation, it would otherwise appear as unused ("dead") code.
+
+    .. code-block:: cpp
+
+        PYBIND11_MODULE(example, m) {
+            m.doc() = "pybind11 example module";
+
+            // Add bindings here
+            m.def("foo", []() {
+                return "Hello, World!";
+            });
+        }
+\endrst */
+#define PYBIND11_MODULE(name, variable)                                        \
+    PYBIND11_MAYBE_UNUSED                                                      \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);     \
+    PYBIND11_PLUGIN_IMPL(name) {                                               \
+        PYBIND11_CHECK_PYTHON_VERSION                                          \
+        PYBIND11_ENSURE_INTERNALS_READY                                        \
+        auto m = pybind11::module(PYBIND11_TOSTRING(name));                    \
+        try {                                                                  \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                          \
+            return m.ptr();                                                    \
+        } PYBIND11_CATCH_INIT_EXCEPTIONS                                       \
+    }                                                                          \
+    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+using ssize_t = Py_ssize_t;
+using size_t  = std::size_t;
+
+/// Approach used to cast a previously unknown C++ instance into a Python object
+enum class return_value_policy : uint8_t {
+    /** This is the default return value policy, which falls back to the policy
+        return_value_policy::take_ownership when the return value is a pointer.
+        Otherwise, it uses return_value::move or return_value::copy for rvalue
+        and lvalue references, respectively. See below for a description of what
+        all of these different policies do. */
+    automatic = 0,
+
+    /** As above, but use policy return_value_policy::reference when the return
+        value is a pointer. This is the default conversion policy for function
+        arguments when calling Python functions manually from C++ code (i.e. via
+        handle::operator()). You probably won't need to use this. */
+    automatic_reference,
+
+    /** Reference an existing object (i.e. do not create a new copy) and take
+        ownership. Python will call the destructor and delete operator when the
+        object’s reference count reaches zero. Undefined behavior ensues when
+        the C++ side does the same.. */
+    take_ownership,
+
+    /** Create a new copy of the returned object, which will be owned by
+        Python. This policy is comparably safe because the lifetimes of the two
+        instances are decoupled. */
+    copy,
+
+    /** Use std::move to move the return value contents into a new instance
+        that will be owned by Python. This policy is comparably safe because the
+        lifetimes of the two instances (move source and destination) are
+        decoupled. */
+    move,
+
+    /** Reference an existing object, but do not take ownership. The C++ side
+        is responsible for managing the object’s lifetime and deallocating it
+        when it is no longer used. Warning: undefined behavior will ensue when
+        the C++ side deletes an object that is still referenced and used by
+        Python. */
+    reference,
+
+    /** This policy only applies to methods and properties. It references the
+        object without taking ownership similar to the above
+        return_value_policy::reference policy. In contrast to that policy, the
+        function or property’s implicit this argument (called the parent) is
+        considered to be the the owner of the return value (the child).
+        pybind11 then couples the lifetime of the parent to the child via a
+        reference relationship that ensures that the parent cannot be garbage
+        collected while Python is still using the child. More advanced
+        variations of this scheme are also possible using combinations of
+        return_value_policy::reference and the keep_alive call policy */
+    reference_internal
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); }
+
+// Returns the size as a multiple of sizeof(void *), rounded up.
+inline static constexpr size_t size_in_ptrs(size_t s) { return 1 + ((s - 1) >> log2(sizeof(void *))); }
+
+/**
+ * The space to allocate for simple layout instance holders (see below) in multiple of the size of
+ * a pointer (e.g.  2 means 16 bytes on 64-bit architectures).  The default is the minimum required
+ * to holder either a std::unique_ptr or std::shared_ptr (which is almost always
+ * sizeof(std::shared_ptr<T>)).
+ */
+constexpr size_t instance_simple_holder_in_ptrs() {
+    static_assert(sizeof(std::shared_ptr<int>) >= sizeof(std::unique_ptr<int>),
+            "pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs");
+    return size_in_ptrs(sizeof(std::shared_ptr<int>));
+}
+
+// Forward declarations
+struct type_info;
+struct value_and_holder;
+
+struct nonsimple_values_and_holders {
+    void **values_and_holders;
+    uint8_t *status;
+};
+
+/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof')
+struct instance {
+    PyObject_HEAD
+    /// Storage for pointers and holder; see simple_layout, below, for a description
+    union {
+        void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
+        nonsimple_values_and_holders nonsimple;
+    };
+    /// Weak references
+    PyObject *weakrefs;
+    /// If true, the pointer is owned which means we're free to manage it with a holder.
+    bool owned : 1;
+    /**
+     * An instance has two possible value/holder layouts.
+     *
+     * Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer
+     * and the holder object governing that pointer, i.e. [val1*][holder].  This layout is applied
+     * whenever there is no python-side multiple inheritance of bound C++ types *and* the type's
+     * holder will fit in the default space (which is large enough to hold either a std::unique_ptr
+     * or std::shared_ptr).
+     *
+     * Non-simple layout applies when using custom holders that require more space than `shared_ptr`
+     * (which is typically the size of two pointers), or when multiple inheritance is used on the
+     * python side.  Non-simple layout allocates the required amount of memory to have multiple
+     * bound C++ classes as parents.  Under this layout, `nonsimple.values_and_holders` is set to a
+     * pointer to allocated space of the required space to hold a sequence of value pointers and
+     * holders followed `status`, a set of bit flags (1 byte each), i.e.
+     * [val1*][holder1][val2*][holder2]...[bb...]  where each [block] is rounded up to a multiple of
+     * `sizeof(void *)`.  `nonsimple.status` is, for convenience, a pointer to the
+     * beginning of the [bb...] block (but not independently allocated).
+     *
+     * Status bits indicate whether the associated holder is constructed (&
+     * status_holder_constructed) and whether the value pointer is registered (&
+     * status_instance_registered) in `registered_instances`.
+     */
+    bool simple_layout : 1;
+    /// For simple layout, tracks whether the holder has been constructed
+    bool simple_holder_constructed : 1;
+    /// For simple layout, tracks whether the instance is registered in `registered_instances`
+    bool simple_instance_registered : 1;
+    /// If true, get_internals().patients has an entry for this object
+    bool has_patients : 1;
+
+    /// Initializes all of the above type/values/holders data (but not the instance values themselves)
+    void allocate_layout();
+
+    /// Destroys/deallocates all of the above
+    void deallocate_layout();
+
+    /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type`
+    /// omitted).  Returns a default-constructed (with `.inst = nullptr`) object on failure if
+    /// `throw_if_missing` is false.
+    value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true);
+
+    /// Bit values for the non-simple status flags
+    static constexpr uint8_t status_holder_constructed  = 1;
+    static constexpr uint8_t status_instance_registered = 2;
+};
+
+static_assert(std::is_standard_layout<instance>::value, "Internal error: `pybind11::detail::instance` is not standard layout!");
+
+/// from __cpp_future__ import (convenient aliases from C++14/17)
+#if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
+using std::enable_if_t;
+using std::conditional_t;
+using std::remove_cv_t;
+using std::remove_reference_t;
+#else
+template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
+template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
+template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
+#endif
+
+/// Index sequences
+#if defined(PYBIND11_CPP14)
+using std::index_sequence;
+using std::make_index_sequence;
+#else
+template<size_t ...> struct index_sequence  { };
+template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
+template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
+template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
+#endif
+
+/// Make an index sequence of the indices of true arguments
+template <typename ISeq, size_t, bool...> struct select_indices_impl { using type = ISeq; };
+template <size_t... IPrev, size_t I, bool B, bool... Bs> struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
+    : select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>, index_sequence<IPrev...>>, I + 1, Bs...> {};
+template <bool... Bs> using select_indices = typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
+
+/// Backports of std::bool_constant and std::negation to accommodate older compilers
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T> struct negation : bool_constant<!T::value> { };
+
+template <typename...> struct void_t_impl { using type = void; };
+template <typename... Ts> using void_t = typename void_t_impl<Ts...>::type;
+
+/// Compile-time all/any/none of that check the boolean value of all template types
+#if defined(__cpp_fold_expressions) && !(defined(_MSC_VER) && (_MSC_VER < 1916))
+template <class... Ts> using all_of = bool_constant<(Ts::value && ...)>;
+template <class... Ts> using any_of = bool_constant<(Ts::value || ...)>;
+#elif !defined(_MSC_VER)
+template <bool...> struct bools {};
+template <class... Ts> using all_of = std::is_same<
+    bools<Ts::value..., true>,
+    bools<true, Ts::value...>>;
+template <class... Ts> using any_of = negation<all_of<negation<Ts>...>>;
+#else
+// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit
+// at a slight loss of compilation efficiency).
+template <class... Ts> using all_of = std::conjunction<Ts...>;
+template <class... Ts> using any_of = std::disjunction<Ts...>;
+#endif
+template <class... Ts> using none_of = negation<any_of<Ts...>>;
+
+template <class T, template<class> class... Predicates> using satisfies_all_of = all_of<Predicates<T>...>;
+template <class T, template<class> class... Predicates> using satisfies_any_of = any_of<Predicates<T>...>;
+template <class T, template<class> class... Predicates> using satisfies_none_of = none_of<Predicates<T>...>;
+
+/// Strip the class from a method type
+template <typename T> struct remove_class { };
+template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...)> { typedef R type(A...); };
+template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...) const> { typedef R type(A...); };
+
+/// Helper template to strip away type modifiers
+template <typename T> struct intrinsic_type                       { typedef T type; };
+template <typename T> struct intrinsic_type<const T>              { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T*>                   { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T&>                   { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T&&>                  { typedef typename intrinsic_type<T>::type type; };
+template <typename T, size_t N> struct intrinsic_type<const T[N]> { typedef typename intrinsic_type<T>::type type; };
+template <typename T, size_t N> struct intrinsic_type<T[N]>       { typedef typename intrinsic_type<T>::type type; };
+template <typename T> using intrinsic_t = typename intrinsic_type<T>::type;
+
+/// Helper type to replace 'void' in some expressions
+struct void_type { };
+
+/// Helper template which holds a list of types
+template <typename...> struct type_list { };
+
+/// Compile-time integer sum
+#ifdef __cpp_fold_expressions
+template <typename... Ts> constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); }
+#else
+constexpr size_t constexpr_sum() { return 0; }
+template <typename T, typename... Ts>
+constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); }
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(constexpr_impl)
+/// Implementation details for constexpr functions
+constexpr int first(int i) { return i; }
+template <typename T, typename... Ts>
+constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); }
+
+constexpr int last(int /*i*/, int result) { return result; }
+template <typename T, typename... Ts>
+constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); }
+PYBIND11_NAMESPACE_END(constexpr_impl)
+
+/// Return the index of the first type in Ts which satisfies Predicate<T>.  Returns sizeof...(Ts) if
+/// none match.
+template <template<typename> class Predicate, typename... Ts>
+constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate<Ts>::value...); }
+
+/// Return the index of the last type in Ts which satisfies Predicate<T>, or -1 if none match.
+template <template<typename> class Predicate, typename... Ts>
+constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate<Ts>::value...); }
+
+/// Return the Nth element from the parameter pack
+template <size_t N, typename T, typename... Ts>
+struct pack_element { using type = typename pack_element<N - 1, Ts...>::type; };
+template <typename T, typename... Ts>
+struct pack_element<0, T, Ts...> { using type = T; };
+
+/// Return the one and only type which matches the predicate, or Default if none match.
+/// If more than one type matches the predicate, fail at compile-time.
+template <template<typename> class Predicate, typename Default, typename... Ts>
+struct exactly_one {
+    static constexpr auto found = constexpr_sum(Predicate<Ts>::value...);
+    static_assert(found <= 1, "Found more than one type matching the predicate");
+
+    static constexpr auto index = found ? constexpr_first<Predicate, Ts...>() : 0;
+    using type = conditional_t<found, typename pack_element<index, Ts...>::type, Default>;
+};
+template <template<typename> class P, typename Default>
+struct exactly_one<P, Default> { using type = Default; };
+
+template <template<typename> class Predicate, typename Default, typename... Ts>
+using exactly_one_t = typename exactly_one<Predicate, Default, Ts...>::type;
+
+/// Defer the evaluation of type T until types Us are instantiated
+template <typename T, typename... /*Us*/> struct deferred_type { using type = T; };
+template <typename T, typename... Us> using deferred_t = typename deferred_type<T, Us...>::type;
+
+/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of<T, T>::value == false`,
+/// unlike `std::is_base_of`)
+template <typename Base, typename Derived> using is_strict_base_of = bool_constant<
+    std::is_base_of<Base, Derived>::value && !std::is_same<Base, Derived>::value>;
+
+/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer
+/// can be converted to a Base pointer)
+template <typename Base, typename Derived> using is_accessible_base_of = bool_constant<
+    std::is_base_of<Base, Derived>::value && std::is_convertible<Derived *, Base *>::value>;
+
+template <template<typename...> class Base>
+struct is_template_base_of_impl {
+    template <typename... Us> static std::true_type check(Base<Us...> *);
+    static std::false_type check(...);
+};
+
+/// Check if a template is the base of a type. For example:
+/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U can be anything
+template <template<typename...> class Base, typename T>
+#if !defined(_MSC_VER)
+using is_template_base_of = decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr));
+#else // MSVC2015 has trouble with decltype in template aliases
+struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr)) { };
+#endif
+
+/// Check if T is an instantiation of the template `Class`. For example:
+/// `is_instantiation<shared_ptr, T>` is true if `T == shared_ptr<U>` where U can be anything.
+template <template<typename...> class Class, typename T>
+struct is_instantiation : std::false_type { };
+template <template<typename...> class Class, typename... Us>
+struct is_instantiation<Class, Class<Us...>> : std::true_type { };
+
+/// Check if T is std::shared_ptr<U> where U can be anything
+template <typename T> using is_shared_ptr = is_instantiation<std::shared_ptr, T>;
+
+/// Check if T looks like an input iterator
+template <typename T, typename = void> struct is_input_iterator : std::false_type {};
+template <typename T>
+struct is_input_iterator<T, void_t<decltype(*std::declval<T &>()), decltype(++std::declval<T &>())>>
+    : std::true_type {};
+
+template <typename T> using is_function_pointer = bool_constant<
+    std::is_pointer<T>::value && std::is_function<typename std::remove_pointer<T>::type>::value>;
+
+template <typename F> struct strip_function_object {
+    using type = typename remove_class<decltype(&F::operator())>::type;
+};
+
+// Extracts the function signature from a function, function pointer or lambda.
+template <typename Function, typename F = remove_reference_t<Function>>
+using function_signature_t = conditional_t<
+    std::is_function<F>::value,
+    F,
+    typename conditional_t<
+        std::is_pointer<F>::value || std::is_member_pointer<F>::value,
+        std::remove_pointer<F>,
+        strip_function_object<F>
+    >::type
+>;
+
+/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member
+/// pointer.  Note that this can catch all sorts of other things, too; this is intended to be used
+/// in a place where passing a lambda makes sense.
+template <typename T> using is_lambda = satisfies_none_of<remove_reference_t<T>,
+        std::is_function, std::is_pointer, std::is_member_pointer>;
+
+/// Ignore that a variable is unused in compiler warnings
+inline void ignore_unused(const int *) { }
+
+/// Apply a function over each element of a parameter pack
+#ifdef __cpp_fold_expressions
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...)
+#else
+using expand_side_effects = bool[];
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (void)pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false }
+#endif
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// C++ bindings of builtin Python exceptions
+class builtin_exception : public std::runtime_error {
+public:
+    using std::runtime_error::runtime_error;
+    /// Set the error using the Python C API
+    virtual void set_error() const = 0;
+};
+
+#define PYBIND11_RUNTIME_EXCEPTION(name, type) \
+    class name : public builtin_exception { public: \
+        using builtin_exception::builtin_exception; \
+        name() : name("") { } \
+        void set_error() const override { PyErr_SetString(type, what()); } \
+    };
+
+PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration)
+PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError)
+PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError)
+PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
+PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
+PYBIND11_RUNTIME_EXCEPTION(buffer_error, PyExc_BufferError)
+PYBIND11_RUNTIME_EXCEPTION(import_error, PyExc_ImportError)
+PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
+PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
+
+[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
+[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
+
+template <typename T, typename SFINAE = void> struct format_descriptor { };
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+// Returns the index of the given type in the type char array below, and in the list in numpy.h
+// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double;
+// complex float,double,long double.  Note that the long double types only participate when long
+// double is actually longer than double (it isn't under MSVC).
+// NB: not only the string below but also complex.h and numpy.h rely on this order.
+template <typename T, typename SFINAE = void> struct is_fmt_numeric { static constexpr bool value = false; };
+template <typename T> struct is_fmt_numeric<T, enable_if_t<std::is_arithmetic<T>::value>> {
+    static constexpr bool value = true;
+    static constexpr int index = std::is_same<T, bool>::value ? 0 : 1 + (
+        std::is_integral<T>::value ? detail::log2(sizeof(T))*2 + std::is_unsigned<T>::value : 8 + (
+        std::is_same<T, double>::value ? 1 : std::is_same<T, long double>::value ? 2 : 0));
+};
+PYBIND11_NAMESPACE_END(detail)
+
+template <typename T> struct format_descriptor<T, detail::enable_if_t<std::is_arithmetic<T>::value>> {
+    static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric<T>::index];
+    static constexpr const char value[2] = { c, '\0' };
+    static std::string format() { return std::string(1, c); }
+};
+
+#if !defined(PYBIND11_CPP17)
+
+template <typename T> constexpr const char format_descriptor<
+    T, detail::enable_if_t<std::is_arithmetic<T>::value>>::value[2];
+
+#endif
+
+/// RAII wrapper that temporarily clears any Python error state
+struct error_scope {
+    PyObject *type, *value, *trace;
+    error_scope() { PyErr_Fetch(&type, &value, &trace); }
+    ~error_scope() { PyErr_Restore(type, value, trace); }
+};
+
+/// Dummy destructor wrapper that can be used to expose classes with a private destructor
+struct nodelete { template <typename T> void operator()(T*) { } };
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+template <typename... Args>
+struct overload_cast_impl {
+    constexpr overload_cast_impl() {} // MSVC 2015 needs this
+
+    template <typename Return>
+    constexpr auto operator()(Return (*pf)(Args...)) const noexcept
+                              -> decltype(pf) { return pf; }
+
+    template <typename Return, typename Class>
+    constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept
+                              -> decltype(pmf) { return pmf; }
+
+    template <typename Return, typename Class>
+    constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept
+                              -> decltype(pmf) { return pmf; }
+};
+PYBIND11_NAMESPACE_END(detail)
+
+// overload_cast requires variable templates: C++14
+#if defined(PYBIND11_CPP14)
+#define PYBIND11_OVERLOAD_CAST 1
+/// Syntax sugar for resolving overloaded function pointers:
+///  - regular: static_cast<Return (Class::*)(Arg0, Arg1, Arg2)>(&Class::func)
+///  - sweet:   overload_cast<Arg0, Arg1, Arg2>(&Class::func)
+template <typename... Args>
+static constexpr detail::overload_cast_impl<Args...> overload_cast = {};
+// MSVC 2015 only accepts this particular initialization syntax for this variable template.
+#endif
+
+/// Const member function selector for overload_cast
+///  - regular: static_cast<Return (Class::*)(Arg) const>(&Class::func)
+///  - sweet:   overload_cast<Arg>(&Class::func, const_)
+static constexpr auto const_ = std::true_type{};
+
+#if !defined(PYBIND11_CPP14) // no overload_cast: providing something that static_assert-fails:
+template <typename... Args> struct overload_cast {
+    static_assert(detail::deferred_t<std::false_type, Args...>::value,
+                  "pybind11::overload_cast<...> requires compiling in C++14 mode");
+};
+#endif // overload_cast
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from
+// any standard container (or C-style array) supporting std::begin/std::end, any singleton
+// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair.
+template <typename T>
+class any_container {
+    std::vector<T> v;
+public:
+    any_container() = default;
+
+    // Can construct from a pair of iterators
+    template <typename It, typename = enable_if_t<is_input_iterator<It>::value>>
+    any_container(It first, It last) : v(first, last) { }
+
+    // Implicit conversion constructor from any arbitrary container type with values convertible to T
+    template <typename Container, typename = enable_if_t<std::is_convertible<decltype(*std::begin(std::declval<const Container &>())), T>::value>>
+    any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { }
+
+    // initializer_list's aren't deducible, so don't get matched by the above template; we need this
+    // to explicitly allow implicit conversion from one:
+    template <typename TIn, typename = enable_if_t<std::is_convertible<TIn, T>::value>>
+    any_container(const std::initializer_list<TIn> &c) : any_container(c.begin(), c.end()) { }
+
+    // Avoid copying if given an rvalue vector of the correct type.
+    any_container(std::vector<T> &&v) : v(std::move(v)) { }
+
+    // Moves the vector out of an rvalue any_container
+    operator std::vector<T> &&() && { return std::move(v); }
+
+    // Dereferencing obtains a reference to the underlying vector
+    std::vector<T> &operator*() { return v; }
+    const std::vector<T> &operator*() const { return v; }
+
+    // -> lets you call methods on the underlying vector
+    std::vector<T> *operator->() { return &v; }
+    const std::vector<T> *operator->() const { return &v; }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/descr.h b/pybind11/include/pybind11/detail/descr.h
new file mode 100644
index 0000000000000000000000000000000000000000..92720cd56277e73a27da3bac85c3c2ae6a3589ac
--- /dev/null
+++ b/pybind11/include/pybind11/detail/descr.h
@@ -0,0 +1,100 @@
+/*
+    pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if !defined(_MSC_VER)
+#  define PYBIND11_DESCR_CONSTEXPR static constexpr
+#else
+#  define PYBIND11_DESCR_CONSTEXPR const
+#endif
+
+/* Concatenate type signatures at compile time */
+template <size_t N, typename... Ts>
+struct descr {
+    char text[N + 1];
+
+    constexpr descr() : text{'\0'} { }
+    constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence<N>()) { }
+
+    template <size_t... Is>
+    constexpr descr(char const (&s)[N+1], index_sequence<Is...>) : text{s[Is]..., '\0'} { }
+
+    template <typename... Chars>
+    constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} { }
+
+    static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1> types() {
+        return {{&typeid(Ts)..., nullptr}};
+    }
+};
+
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1, size_t... Is2>
+constexpr descr<N1 + N2, Ts1..., Ts2...> plus_impl(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b,
+                                                   index_sequence<Is1...>, index_sequence<Is2...>) {
+    return {a.text[Is1]..., b.text[Is2]...};
+}
+
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2>
+constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b) {
+    return plus_impl(a, b, make_index_sequence<N1>(), make_index_sequence<N2>());
+}
+
+template <size_t N>
+constexpr descr<N - 1> _(char const(&text)[N]) { return descr<N - 1>(text); }
+constexpr descr<0> _(char const(&)[1]) { return {}; }
+
+template <size_t Rem, size_t... Digits> struct int_to_str : int_to_str<Rem/10, Rem%10, Digits...> { };
+template <size_t...Digits> struct int_to_str<0, Digits...> {
+    static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
+};
+
+// Ternary description (like std::conditional)
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<B, descr<N1 - 1>> _(char const(&text1)[N1], char const(&)[N2]) {
+    return _(text1);
+}
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<!B, descr<N2 - 1>> _(char const(&)[N1], char const(&text2)[N2]) {
+    return _(text2);
+}
+
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<B, T1> _(const T1 &d, const T2 &) { return d; }
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<!B, T2> _(const T1 &, const T2 &d) { return d; }
+
+template <size_t Size> auto constexpr _() -> decltype(int_to_str<Size / 10, Size % 10>::digits) {
+    return int_to_str<Size / 10, Size % 10>::digits;
+}
+
+template <typename Type> constexpr descr<1, Type> _() { return {'%'}; }
+
+constexpr descr<0> concat() { return {}; }
+
+template <size_t N, typename... Ts>
+constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) { return descr; }
+
+template <size_t N, typename... Ts, typename... Args>
+constexpr auto concat(const descr<N, Ts...> &d, const Args &...args)
+    -> decltype(std::declval<descr<N + 2, Ts...>>() + concat(args...)) {
+    return d + _(", ") + concat(args...);
+}
+
+template <size_t N, typename... Ts>
+constexpr descr<N + 2, Ts...> type_descr(const descr<N, Ts...> &descr) {
+    return _("{") + descr + _("}");
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/init.h b/pybind11/include/pybind11/detail/init.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ef78c1179f5b533c3ba3f637420c8125d632a7f
--- /dev/null
+++ b/pybind11/include/pybind11/detail/init.h
@@ -0,0 +1,336 @@
+/*
+    pybind11/detail/init.h: init factory function implementation and support code.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "class.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <>
+class type_caster<value_and_holder> {
+public:
+    bool load(handle h, bool) {
+        value = reinterpret_cast<value_and_holder *>(h.ptr());
+        return true;
+    }
+
+    template <typename> using cast_op_type = value_and_holder &;
+    operator value_and_holder &() { return *value; }
+    static constexpr auto name = _<value_and_holder>();
+
+private:
+    value_and_holder *value = nullptr;
+};
+
+PYBIND11_NAMESPACE_BEGIN(initimpl)
+
+inline void no_nullptr(void *ptr) {
+    if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr");
+}
+
+// Implementing functions for all forms of py::init<...> and py::init(...)
+template <typename Class> using Cpp = typename Class::type;
+template <typename Class> using Alias = typename Class::type_alias;
+template <typename Class> using Holder = typename Class::holder_type;
+
+template <typename Class> using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
+
+// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+bool is_alias(Cpp<Class> *ptr) {
+    return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
+}
+// Failing fallback version of the above for a no-alias class (always returns false)
+template <typename /*Class*/>
+constexpr bool is_alias(void *) { return false; }
+
+// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall
+// back to brace aggregate initiailization so that for aggregate initialization can be used with
+// py::init, e.g.  `py::init<int, int>` to initialize a `struct T { int a; int b; }`.  For
+// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
+// works, but will not do the expected thing when `T` has an `initializer_list<T>` constructor).
+template <typename Class, typename... Args, detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { return new Class(std::forward<Args>(args)...); }
+template <typename Class, typename... Args, detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { return new Class{std::forward<Args>(args)...}; }
+
+// Attempts to constructs an alias using a `Alias(Cpp &&)` constructor.  This allows types with
+// an alias to provide only a single Cpp factory function as long as the Alias can be
+// constructed from an rvalue reference of the base Cpp type.  This means that Alias classes
+// can, when appropriate, simply define a `Alias(Cpp &&)` constructor rather than needing to
+// inherit all the base class constructors.
+template <typename Class>
+void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
+                              value_and_holder &v_h, Cpp<Class> &&base) {
+    v_h.value_ptr() = new Alias<Class>(std::move(base));
+}
+template <typename Class>
+[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
+                                           value_and_holder &, Cpp<Class> &&) {
+    throw type_error("pybind11::init(): unable to convert returned instance to required "
+                     "alias class: no `Alias<Class>(Class &&)` constructor available");
+}
+
+// Error-generating fallback for factories that don't match one of the below construction
+// mechanisms.
+template <typename Class>
+void construct(...) {
+    static_assert(!std::is_same<Class, Class>::value /* always false */,
+            "pybind11::init(): init function must return a compatible pointer, "
+            "holder, or value");
+}
+
+// Pointer return v1: the factory function returns a class pointer for a registered class.
+// If we don't need an alias (because this class doesn't have one, or because the final type is
+// inherited on the Python side) we can simply take over ownership.  Otherwise we need to try to
+// construct an Alias from the returned base instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
+    no_nullptr(ptr);
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        // We're going to try to construct an alias by moving the cpp type.  Whether or not
+        // that succeeds, we still need to destroy the original cpp pointer (either the
+        // moved away leftover, if the alias construction works, or the value itself if we
+        // throw an error), but we can't just call `delete ptr`: it might have a special
+        // deleter, or might be shared_from_this.  So we construct a holder around it as if
+        // it was a normal instance, then steal the holder away into a local variable; thus
+        // the holder and destruction happens when we leave the C++ scope, and the holder
+        // class gets to handle the destruction however it likes.
+        v_h.value_ptr() = ptr;
+        v_h.set_instance_registered(true); // To prevent init_instance from registering it
+        v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
+        Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
+        v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
+        v_h.set_instance_registered(false);
+
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
+    } else {
+        // Otherwise the type isn't inherited, so we don't need an Alias
+        v_h.value_ptr() = ptr;
+    }
+}
+
+// Pointer return v2: a factory that always returns an alias instance ptr.  We simply take over
+// ownership of the pointer.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
+    no_nullptr(alias_ptr);
+    v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
+}
+
+// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
+// holder.  This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
+// derived type (through those holder's implicit conversion from derived class holder constructors).
+template <typename Class>
+void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
+    auto *ptr = holder_helper<Holder<Class>>::get(holder);
+    no_nullptr(ptr);
+    // If we need an alias, check that the held pointer is actually an alias instance
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
+        throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
+                         "is not an alias instance");
+
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &holder);
+}
+
+// return-by-value version 1: returning a cpp class by value.  If the class has an alias and an
+// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
+// the alias from the base when needed (i.e. because of Python-side inheritance).  When we don't
+// need it, we simply move-construct the cpp value into a new instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
+    static_assert(std::is_move_constructible<Cpp<Class>>::value,
+        "pybind11::init() return-by-value factory function requires a movable class");
+    if (Class::has_alias && need_alias)
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
+    else
+        v_h.value_ptr() = new Cpp<Class>(std::move(result));
+}
+
+// return-by-value version 2: returning a value of the alias type itself.  We move-construct an
+// Alias instance (even if no the python-side inheritance is involved).  The is intended for
+// cases where Alias initialization is always desired.
+template <typename Class>
+void construct(value_and_holder &v_h, Alias<Class> &&result, bool) {
+    static_assert(std::is_move_constructible<Alias<Class>>::value,
+        "pybind11::init() return-by-alias-value factory function requires a movable alias class");
+    v_h.value_ptr() = new Alias<Class>(std::move(result));
+}
+
+// Implementing class for py::init<...>()
+template <typename... Args>
+struct constructor {
+    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+            else
+                v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          !std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementing class for py::init_alias<...>()
+template <typename... Args> struct alias_constructor {
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementation class for py::init(Func) and py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc = void_type (*)(),
+          typename = function_signature_t<CFunc>, typename = function_signature_t<AFunc>>
+struct factory;
+
+// Specialization for py::init(Func)
+template <typename Func, typename Return, typename... Args>
+struct factory<Func, void_type (*)(), Return(Args...)> {
+    remove_reference_t<Func> class_factory;
+
+    factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
+
+    // The given class either has no alias or has no separate alias factory;
+    // this always constructs the class itself.  If the class is registered with an alias
+    // type and an alias instance is needed (i.e. because the final type is a Python class
+    // inheriting from the C++ type) the returned value needs to either already be an alias
+    // instance, or the alias needs to be constructible from a `Class &&` argument.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [func = std::move(class_factory)]
+        #else
+        auto &func = class_factory;
+        cl.def("__init__", [func]
+        #endif
+        (value_and_holder &v_h, Args... args) {
+            construct<Class>(v_h, func(std::forward<Args>(args)...),
+                             Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Specialization for py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc,
+          typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
+struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
+    static_assert(sizeof...(CArgs) == sizeof...(AArgs),
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+
+    remove_reference_t<CFunc> class_factory;
+    remove_reference_t<AFunc> alias_factory;
+
+    factory(CFunc &&c, AFunc &&a)
+        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
+
+    // The class factory is called when the `self` type passed to `__init__` is the direct
+    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra&... extra) && {
+        static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
+                                        "only be used if the class has an alias");
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
+        #else
+        auto &class_func = class_factory;
+        auto &alias_func = alias_factory;
+        cl.def("__init__", [class_func, alias_func]
+        #endif
+        (value_and_holder &v_h, CArgs... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                // If the instance type equals the registered type we don't have inheritance, so
+                // don't need the alias and can construct using the class function:
+                construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
+            else
+                construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+/// Set just the C++ state. Same as `__init__`.
+template <typename Class, typename T>
+void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
+    construct<Class>(v_h, std::forward<T>(result), need_alias);
+}
+
+/// Set both the C++ and Python states
+template <typename Class, typename T, typename O,
+          enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
+void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
+    construct<Class>(v_h, std::move(result.first), need_alias);
+    setattr((PyObject *) v_h.inst, "__dict__", result.second);
+}
+
+/// Implementation for py::pickle(GetState, SetState)
+template <typename Get, typename Set,
+          typename = function_signature_t<Get>, typename = function_signature_t<Set>>
+struct pickle_factory;
+
+template <typename Get, typename Set,
+          typename RetState, typename Self, typename NewInstance, typename ArgState>
+struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
+    static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
+                  "The type returned by `__getstate__` must be the same "
+                  "as the argument accepted by `__setstate__`");
+
+    remove_reference_t<Get> get;
+    remove_reference_t<Set> set;
+
+    pickle_factory(Get get, Set set)
+        : get(std::forward<Get>(get)), set(std::forward<Set>(set)) { }
+
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        cl.def("__getstate__", std::move(get));
+
+#if defined(PYBIND11_CPP14)
+        cl.def("__setstate__", [func = std::move(set)]
+#else
+        auto &func = set;
+        cl.def("__setstate__", [func]
+#endif
+        (value_and_holder &v_h, ArgState state) {
+            setstate<Class>(v_h, func(std::forward<ArgState>(state)),
+                            Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+PYBIND11_NAMESPACE_END(initimpl)
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(pybind11)
diff --git a/pybind11/include/pybind11/detail/internals.h b/pybind11/include/pybind11/detail/internals.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf40e9fe995cd952e0dec8378b44b3ac8477f235
--- /dev/null
+++ b/pybind11/include/pybind11/detail/internals.h
@@ -0,0 +1,352 @@
+/*
+    pybind11/detail/internals.h: Internal data structure and related functions
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../pytypes.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+// Forward declarations
+inline PyTypeObject *make_static_property_type();
+inline PyTypeObject *make_default_metaclass();
+inline PyObject *make_object_base_type(PyTypeObject *metaclass);
+
+// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
+// Thread Specific Storage (TSS) API.
+#if PY_VERSION_HEX >= 0x03070000
+#    define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
+#    define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
+#    define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
+#    define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
+#    define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
+#else
+    // Usually an int but a long on Cygwin64 with Python 3.x
+#    define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
+#    define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
+#    if PY_MAJOR_VERSION < 3
+#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
+             PyThread_delete_key_value(key)
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
+             do {                                                            \
+                 PyThread_delete_key_value((key));                           \
+                 PyThread_set_key_value((key), (value));                     \
+             } while (false)
+#    else
+#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
+             PyThread_set_key_value((key), nullptr)
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
+             PyThread_set_key_value((key), (value))
+#    endif
+#    define PYBIND11_TLS_FREE(key) (void)key
+#endif
+
+// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
+// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module
+// even when `A` is the same, non-hidden-visibility type (e.g. from a common include).  Under
+// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name,
+// which works.  If not under a known-good stl, provide our own name-based hash and equality
+// functions that use the type name.
+#if defined(__GLIBCXX__)
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; }
+using type_hash = std::hash<std::type_index>;
+using type_equal_to = std::equal_to<std::type_index>;
+#else
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
+    return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+}
+
+struct type_hash {
+    size_t operator()(const std::type_index &t) const {
+        size_t hash = 5381;
+        const char *ptr = t.name();
+        while (auto c = static_cast<unsigned char>(*ptr++))
+            hash = (hash * 33) ^ c;
+        return hash;
+    }
+};
+
+struct type_equal_to {
+    bool operator()(const std::type_index &lhs, const std::type_index &rhs) const {
+        return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+    }
+};
+#endif
+
+template <typename value_type>
+using type_map = std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
+
+struct overload_hash {
+    inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
+        size_t value = std::hash<const void *>()(v.first);
+        value ^= std::hash<const void *>()(v.second)  + 0x9e3779b9 + (value<<6) + (value>>2);
+        return value;
+    }
+};
+
+/// Internal data structure used to track registered instances and types.
+/// Whenever binary incompatible changes are made to this structure,
+/// `PYBIND11_INTERNALS_VERSION` must be incremented.
+struct internals {
+    type_map<type_info *> registered_types_cpp; // std::type_index -> pybind11's type information
+    std::unordered_map<PyTypeObject *, std::vector<type_info *>> registered_types_py; // PyTypeObject* -> base type_info(s)
+    std::unordered_multimap<const void *, instance*> registered_instances; // void * -> instance*
+    std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
+    type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
+    std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
+    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
+    std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
+    std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
+    std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
+    PyTypeObject *static_property_type;
+    PyTypeObject *default_metaclass;
+    PyObject *instance_base;
+#if defined(WITH_THREAD)
+    PYBIND11_TLS_KEY_INIT(tstate);
+    PyInterpreterState *istate = nullptr;
+    ~internals() {
+        // This destructor is called *after* Py_Finalize() in finalize_interpreter().
+        // That *SHOULD BE* fine. The following details what happens whe PyThread_tss_free is called.
+        // PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does nothing.
+        // PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
+        // PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX). Neither
+        // of those have anything to do with CPython internals.
+        // PyMem_RawFree *requires* that the `tstate` be allocated with the CPython allocator.
+        PYBIND11_TLS_FREE(tstate);
+    }
+#endif
+};
+
+/// Additional type information which does not fit into the PyTypeObject.
+/// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`.
+struct type_info {
+    PyTypeObject *type;
+    const std::type_info *cpptype;
+    size_t type_size, type_align, holder_size_in_ptrs;
+    void *(*operator_new)(size_t);
+    void (*init_instance)(instance *, const void *);
+    void (*dealloc)(value_and_holder &v_h);
+    std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
+    std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
+    std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
+    buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
+    void *get_buffer_data = nullptr;
+    void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
+    /* A simple type never occurs as a (direct or indirect) parent
+     * of a class that makes use of multiple inheritance */
+    bool simple_type : 1;
+    /* True if there is no multiple inheritance in this type's inheritance tree */
+    bool simple_ancestors : 1;
+    /* for base vs derived holder_type checks */
+    bool default_holder : 1;
+    /* true if this is a type registered with py::module_local */
+    bool module_local : 1;
+};
+
+/// Tracks the `internals` and `type_info` ABI version independent of the main library version
+#define PYBIND11_INTERNALS_VERSION 4
+
+/// On MSVC, debug and release builds are not ABI-compatible!
+#if defined(_MSC_VER) && defined(_DEBUG)
+#   define PYBIND11_BUILD_TYPE "_debug"
+#else
+#   define PYBIND11_BUILD_TYPE ""
+#endif
+
+/// Let's assume that different compilers are ABI-incompatible.
+#if defined(_MSC_VER)
+#   define PYBIND11_COMPILER_TYPE "_msvc"
+#elif defined(__INTEL_COMPILER)
+#   define PYBIND11_COMPILER_TYPE "_icc"
+#elif defined(__clang__)
+#   define PYBIND11_COMPILER_TYPE "_clang"
+#elif defined(__PGI)
+#   define PYBIND11_COMPILER_TYPE "_pgi"
+#elif defined(__MINGW32__)
+#   define PYBIND11_COMPILER_TYPE "_mingw"
+#elif defined(__CYGWIN__)
+#   define PYBIND11_COMPILER_TYPE "_gcc_cygwin"
+#elif defined(__GNUC__)
+#   define PYBIND11_COMPILER_TYPE "_gcc"
+#else
+#   define PYBIND11_COMPILER_TYPE "_unknown"
+#endif
+
+#if defined(_LIBCPP_VERSION)
+#  define PYBIND11_STDLIB "_libcpp"
+#elif defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#  define PYBIND11_STDLIB "_libstdcpp"
+#else
+#  define PYBIND11_STDLIB ""
+#endif
+
+/// On Linux/OSX, changes in __GXX_ABI_VERSION__ indicate ABI incompatibility.
+#if defined(__GXX_ABI_VERSION)
+#  define PYBIND11_BUILD_ABI "_cxxabi" PYBIND11_TOSTRING(__GXX_ABI_VERSION)
+#else
+#  define PYBIND11_BUILD_ABI ""
+#endif
+
+#if defined(WITH_THREAD)
+#  define PYBIND11_INTERNALS_KIND ""
+#else
+#  define PYBIND11_INTERNALS_KIND "_without_thread"
+#endif
+
+#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \
+    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__"
+
+#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \
+    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__"
+
+/// Each module locally stores a pointer to the `internals` data. The data
+/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`.
+inline internals **&get_internals_pp() {
+    static internals **internals_pp = nullptr;
+    return internals_pp;
+}
+
+inline void translate_exception(std::exception_ptr p) {
+    try {
+        if (p) std::rethrow_exception(p);
+    } catch (error_already_set &e)           { e.restore();                                    return;
+    } catch (const builtin_exception &e)     { e.set_error();                                  return;
+    } catch (const std::bad_alloc &e)        { PyErr_SetString(PyExc_MemoryError,   e.what()); return;
+    } catch (const std::domain_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+    } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+    } catch (const std::length_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+    } catch (const std::out_of_range &e)     { PyErr_SetString(PyExc_IndexError,    e.what()); return;
+    } catch (const std::range_error &e)      { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+    } catch (const std::overflow_error &e)   { PyErr_SetString(PyExc_OverflowError, e.what()); return;
+    } catch (const std::exception &e)        { PyErr_SetString(PyExc_RuntimeError,  e.what()); return;
+    } catch (...) {
+        PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
+        return;
+    }
+}
+
+#if !defined(__GLIBCXX__)
+inline void translate_local_exception(std::exception_ptr p) {
+    try {
+        if (p) std::rethrow_exception(p);
+    } catch (error_already_set &e)       { e.restore();   return;
+    } catch (const builtin_exception &e) { e.set_error(); return;
+    }
+}
+#endif
+
+/// Return a reference to the current `internals` data
+PYBIND11_NOINLINE inline internals &get_internals() {
+    auto **&internals_pp = get_internals_pp();
+    if (internals_pp && *internals_pp)
+        return **internals_pp;
+
+    // Ensure that the GIL is held since we will need to make Python calls.
+    // Cannot use py::gil_scoped_acquire here since that constructor calls get_internals.
+    struct gil_scoped_acquire_local {
+        gil_scoped_acquire_local() : state (PyGILState_Ensure()) {}
+        ~gil_scoped_acquire_local() { PyGILState_Release(state); }
+        const PyGILState_STATE state;
+    } gil;
+
+    constexpr auto *id = PYBIND11_INTERNALS_ID;
+    auto builtins = handle(PyEval_GetBuiltins());
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
+        internals_pp = static_cast<internals **>(capsule(builtins[id]));
+
+        // We loaded builtins through python's builtins, which means that our `error_already_set`
+        // and `builtin_exception` may be different local classes than the ones set up in the
+        // initial exception translator, below, so add another for our local exception classes.
+        //
+        // libstdc++ doesn't require this (types there are identified only by name)
+#if !defined(__GLIBCXX__)
+        (*internals_pp)->registered_exception_translators.push_front(&translate_local_exception);
+#endif
+    } else {
+        if (!internals_pp) internals_pp = new internals*();
+        auto *&internals_ptr = *internals_pp;
+        internals_ptr = new internals();
+#if defined(WITH_THREAD)
+
+        #if PY_VERSION_HEX < 0x03090000
+                PyEval_InitThreads();
+        #endif
+        PyThreadState *tstate = PyThreadState_Get();
+        #if PY_VERSION_HEX >= 0x03070000
+            internals_ptr->tstate = PyThread_tss_alloc();
+            if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
+                pybind11_fail("get_internals: could not successfully initialize the TSS key!");
+            PyThread_tss_set(internals_ptr->tstate, tstate);
+        #else
+            internals_ptr->tstate = PyThread_create_key();
+            if (internals_ptr->tstate == -1)
+                pybind11_fail("get_internals: could not successfully initialize the TLS key!");
+            PyThread_set_key_value(internals_ptr->tstate, tstate);
+        #endif
+        internals_ptr->istate = tstate->interp;
+#endif
+        builtins[id] = capsule(internals_pp);
+        internals_ptr->registered_exception_translators.push_front(&translate_exception);
+        internals_ptr->static_property_type = make_static_property_type();
+        internals_ptr->default_metaclass = make_default_metaclass();
+        internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
+    }
+    return **internals_pp;
+}
+
+/// Works like `internals.registered_types_cpp`, but for module-local registered types:
+inline type_map<type_info *> &registered_local_types_cpp() {
+    static type_map<type_info *> locals{};
+    return locals;
+}
+
+/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
+/// `c_str()`.  Such strings objects have a long storage duration -- the internal strings are only
+/// cleared when the program exits or after interpreter shutdown (when embedding), and so are
+/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name).
+template <typename... Args>
+const char *c_str(Args &&...args) {
+    auto &strings = get_internals().static_strings;
+    strings.emplace_front(std::forward<Args>(args)...);
+    return strings.front().c_str();
+}
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// Returns a named pointer that is shared among all extension modules (using the same
+/// pybind11 version) running in the current interpreter. Names starting with underscores
+/// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
+inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
+    auto &internals = detail::get_internals();
+    auto it = internals.shared_data.find(name);
+    return it != internals.shared_data.end() ? it->second : nullptr;
+}
+
+/// Set the shared data that can be later recovered by `get_shared_data()`.
+inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
+    detail::get_internals().shared_data[name] = data;
+    return data;
+}
+
+/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if
+/// such entry exists. Otherwise, a new object of default-constructible type `T` is
+/// added to the shared data under the given name and a reference to it is returned.
+template<typename T>
+T &get_or_create_shared_data(const std::string &name) {
+    auto &internals = detail::get_internals();
+    auto it = internals.shared_data.find(name);
+    T *ptr = (T *) (it != internals.shared_data.end() ? it->second : nullptr);
+    if (!ptr) {
+        ptr = new T();
+        internals.shared_data[name] = ptr;
+    }
+    return *ptr;
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/typeid.h b/pybind11/include/pybind11/detail/typeid.h
new file mode 100644
index 0000000000000000000000000000000000000000..148889ffefdcc4de303d99f82af43aa9302c0a7c
--- /dev/null
+++ b/pybind11/include/pybind11/detail/typeid.h
@@ -0,0 +1,55 @@
+/*
+    pybind11/detail/typeid.h: Compiler-independent access to type identifiers
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+
+#if defined(__GNUG__)
+#include <cxxabi.h>
+#endif
+
+#include "common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+/// Erase all occurrences of a substring
+inline void erase_all(std::string &string, const std::string &search) {
+    for (size_t pos = 0;;) {
+        pos = string.find(search, pos);
+        if (pos == std::string::npos) break;
+        string.erase(pos, search.length());
+    }
+}
+
+PYBIND11_NOINLINE inline void clean_type_id(std::string &name) {
+#if defined(__GNUG__)
+    int status = 0;
+    std::unique_ptr<char, void (*)(void *)> res {
+        abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free };
+    if (status == 0)
+        name = res.get();
+#else
+    detail::erase_all(name, "class ");
+    detail::erase_all(name, "struct ");
+    detail::erase_all(name, "enum ");
+#endif
+    detail::erase_all(name, "pybind11::");
+}
+PYBIND11_NAMESPACE_END(detail)
+
+/// Return a string representation of a C++ type
+template <typename T> static std::string type_id() {
+    std::string name(typeid(T).name());
+    detail::clean_type_id(name);
+    return name;
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/eigen.h b/pybind11/include/pybind11/eigen.h
new file mode 100644
index 0000000000000000000000000000000000000000..22139def6013b47005df22be778bd6984e05ea1d
--- /dev/null
+++ b/pybind11/include/pybind11/eigen.h
@@ -0,0 +1,607 @@
+/*
+    pybind11/eigen.h: Transparent conversion for dense and sparse Eigen matrices
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "numpy.h"
+
+#if defined(__INTEL_COMPILER)
+#  pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
+#elif defined(__GNUG__) || defined(__clang__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wconversion"
+#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#  ifdef __clang__
+//   Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated
+//   under Clang, so disable that warning here:
+#    pragma GCC diagnostic ignored "-Wdeprecated"
+#  endif
+#  if __GNUC__ >= 7
+#    pragma GCC diagnostic ignored "-Wint-in-bool-context"
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#  pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17
+#endif
+
+#include <Eigen/Core>
+#include <Eigen/SparseCore>
+
+// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
+// move constructors that break things.  We could detect this an explicitly copy, but an extra copy
+// of matrices seems highly undesirable.
+static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7");
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides:
+using EigenDStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
+template <typename MatrixType> using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
+template <typename MatrixType> using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if EIGEN_VERSION_AT_LEAST(3,3,0)
+using EigenIndex = Eigen::Index;
+#else
+using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE;
+#endif
+
+// Matches Eigen::Map, Eigen::Ref, blocks, etc:
+template <typename T> using is_eigen_dense_map = all_of<is_template_base_of<Eigen::DenseBase, T>, std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
+template <typename T> using is_eigen_mutable_map = std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
+template <typename T> using is_eigen_dense_plain = all_of<negation<is_eigen_dense_map<T>>, is_template_base_of<Eigen::PlainObjectBase, T>>;
+template <typename T> using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
+// Test for objects inheriting from EigenBase<Derived> that aren't captured by the above.  This
+// basically covers anything that can be assigned to a dense matrix but that don't have a typical
+// matrix data layout that can be copied from their .data().  For example, DiagonalMatrix and
+// SelfAdjointView fall into this category.
+template <typename T> using is_eigen_other = all_of<
+    is_template_base_of<Eigen::EigenBase, T>,
+    negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>, is_eigen_sparse<T>>>
+>;
+
+// Captures numpy/eigen conformability status (returned by EigenProps::conformable()):
+template <bool EigenRowMajor> struct EigenConformable {
+    bool conformable = false;
+    EigenIndex rows = 0, cols = 0;
+    EigenDStride stride{0, 0};      // Only valid if negativestrides is false!
+    bool negativestrides = false;   // If true, do not use stride!
+
+    EigenConformable(bool fits = false) : conformable{fits} {}
+    // Matrix type:
+    EigenConformable(EigenIndex r, EigenIndex c,
+            EigenIndex rstride, EigenIndex cstride) :
+        conformable{true}, rows{r}, cols{c} {
+        // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
+        if (rstride < 0 || cstride < 0) {
+            negativestrides = true;
+        } else {
+            stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
+                      EigenRowMajor ? cstride : rstride /* inner stride */ };
+        }
+    }
+    // Vector type:
+    EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
+        : EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? r : r*stride) {}
+
+    template <typename props> bool stride_compatible() const {
+        // To have compatible strides, we need (on both dimensions) one of fully dynamic strides,
+        // matching strides, or a dimension size of 1 (in which case the stride value is irrelevant)
+        return
+            !negativestrides &&
+            (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() ||
+                (EigenRowMajor ? cols : rows) == 1) &&
+            (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() ||
+                (EigenRowMajor ? rows : cols) == 1);
+    }
+    operator bool() const { return conformable; }
+};
+
+template <typename Type> struct eigen_extract_stride { using type = Type; };
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct eigen_extract_stride<Eigen::Map<PlainObjectType, MapOptions, StrideType>> { using type = StrideType; };
+template <typename PlainObjectType, int Options, typename StrideType>
+struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> { using type = StrideType; };
+
+// Helper struct for extracting information from an Eigen type
+template <typename Type_> struct EigenProps {
+    using Type = Type_;
+    using Scalar = typename Type::Scalar;
+    using StrideType = typename eigen_extract_stride<Type>::type;
+    static constexpr EigenIndex
+        rows = Type::RowsAtCompileTime,
+        cols = Type::ColsAtCompileTime,
+        size = Type::SizeAtCompileTime;
+    static constexpr bool
+        row_major = Type::IsRowMajor,
+        vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1
+        fixed_rows = rows != Eigen::Dynamic,
+        fixed_cols = cols != Eigen::Dynamic,
+        fixed = size != Eigen::Dynamic, // Fully-fixed size
+        dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size
+
+    template <EigenIndex i, EigenIndex ifzero> using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
+    static constexpr EigenIndex inner_stride = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
+                                outer_stride = if_zero<StrideType::OuterStrideAtCompileTime,
+                                                       vector ? size : row_major ? cols : rows>::value;
+    static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
+    static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1;
+    static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1;
+
+    // Takes an input array and determines whether we can make it fit into the Eigen type.  If
+    // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector
+    // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type).
+    static EigenConformable<row_major> conformable(const array &a) {
+        const auto dims = a.ndim();
+        if (dims < 1 || dims > 2)
+            return false;
+
+        if (dims == 2) { // Matrix type: require exact match (or dynamic)
+
+            EigenIndex
+                np_rows = a.shape(0),
+                np_cols = a.shape(1),
+                np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
+                np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
+            if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
+                return false;
+
+            return {np_rows, np_cols, np_rstride, np_cstride};
+        }
+
+        // Otherwise we're storing an n-vector.  Only one of the strides will be used, but whichever
+        // is used, we want the (single) numpy stride value.
+        const EigenIndex n = a.shape(0),
+              stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
+
+        if (vector) { // Eigen type is a compile-time vector
+            if (fixed && size != n)
+                return false; // Vector size mismatch
+            return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
+        }
+        else if (fixed) {
+            // The type has a fixed size, but is not a vector: abort
+            return false;
+        }
+        else if (fixed_cols) {
+            // Since this isn't a vector, cols must be != 1.  We allow this only if it exactly
+            // equals the number of elements (rows is Dynamic, and so 1 row is allowed).
+            if (cols != n) return false;
+            return {1, n, stride};
+        }
+        else {
+            // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
+            if (fixed_rows && rows != n) return false;
+            return {n, 1, stride};
+        }
+    }
+
+    static constexpr bool show_writeable = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
+    static constexpr bool show_order = is_eigen_dense_map<Type>::value;
+    static constexpr bool show_c_contiguous = show_order && requires_row_major;
+    static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major;
+
+    static constexpr auto descriptor =
+        _("numpy.ndarray[") + npy_format_descriptor<Scalar>::name +
+        _("[")  + _<fixed_rows>(_<(size_t) rows>(), _("m")) +
+        _(", ") + _<fixed_cols>(_<(size_t) cols>(), _("n")) +
+        _("]") +
+        // For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to be
+        // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride
+        // options, possibly f_contiguous or c_contiguous.  We include them in the descriptor output
+        // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to
+        // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you
+        // *gave* a numpy.ndarray of the right type and dimensions.
+        _<show_writeable>(", flags.writeable", "") +
+        _<show_c_contiguous>(", flags.c_contiguous", "") +
+        _<show_f_contiguous>(", flags.f_contiguous", "") +
+        _("]");
+};
+
+// Casts an Eigen type to numpy array.  If given a base, the numpy array references the src data,
+// otherwise it'll make a copy.  writeable lets you turn off the writeable flag for the array.
+template <typename props> handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) {
+    constexpr ssize_t elem_size = sizeof(typename props::Scalar);
+    array a;
+    if (props::vector)
+        a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base);
+    else
+        a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() },
+                  src.data(), base);
+
+    if (!writeable)
+        array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+
+    return a.release();
+}
+
+// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that
+// reference the Eigen object's data with `base` as the python-registered base class (if omitted,
+// the base will be set to None, and lifetime management is up to the caller).  The numpy array is
+// non-writeable if the given type is const.
+template <typename props, typename Type>
+handle eigen_ref_array(Type &src, handle parent = none()) {
+    // none here is to get past array's should-we-copy detection, which currently always
+    // copies when there is no base.  Setting the base to None should be harmless.
+    return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
+}
+
+// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy
+// array that references the encapsulated data with a python-side reference to the capsule to tie
+// its destruction to that of any dependent python objects.  Const-ness is determined by whether or
+// not the Type of the pointer given is const.
+template <typename props, typename Type, typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
+handle eigen_encapsulate(Type *src) {
+    capsule base(src, [](void *o) { delete static_cast<Type *>(o); });
+    return eigen_ref_array<props>(*src, base);
+}
+
+// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense
+// types.
+template<typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
+    using Scalar = typename Type::Scalar;
+    using props = EigenProps<Type>;
+
+    bool load(handle src, bool convert) {
+        // If we're in no-convert mode, only load if given an array of the correct type
+        if (!convert && !isinstance<array_t<Scalar>>(src))
+            return false;
+
+        // Coerce into an array, but don't do type conversion yet; the copy below handles it.
+        auto buf = array::ensure(src);
+
+        if (!buf)
+            return false;
+
+        auto dims = buf.ndim();
+        if (dims < 1 || dims > 2)
+            return false;
+
+        auto fits = props::conformable(buf);
+        if (!fits)
+            return false;
+
+        // Allocate the new type, then build a numpy reference into it
+        value = Type(fits.rows, fits.cols);
+        auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
+        if (dims == 1) ref = ref.squeeze();
+        else if (ref.ndim() == 1) buf = buf.squeeze();
+
+        int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
+
+        if (result < 0) { // Copy failed!
+            PyErr_Clear();
+            return false;
+        }
+
+        return true;
+    }
+
+private:
+
+    // Cast implementation
+    template <typename CType>
+    static handle cast_impl(CType *src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::take_ownership:
+            case return_value_policy::automatic:
+                return eigen_encapsulate<props>(src);
+            case return_value_policy::move:
+                return eigen_encapsulate<props>(new CType(std::move(*src)));
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(*src);
+            case return_value_policy::reference:
+            case return_value_policy::automatic_reference:
+                return eigen_ref_array<props>(*src);
+            case return_value_policy::reference_internal:
+                return eigen_ref_array<props>(*src, parent);
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        };
+    }
+
+public:
+
+    // Normal returned non-reference, non-const value:
+    static handle cast(Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // If you return a non-reference const, we mark the numpy array readonly:
+    static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // lvalue reference return; default (automatic) becomes copy
+    static handle cast(Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast_impl(&src, policy, parent);
+    }
+    // const lvalue reference return; default (automatic) becomes copy
+    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast(&src, policy, parent);
+    }
+    // non-const pointer return
+    static handle cast(Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+    // const pointer return
+    static handle cast(const Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+
+    static constexpr auto name = props::descriptor;
+
+    operator Type*() { return &value; }
+    operator Type&() { return value; }
+    operator Type&&() && { return std::move(value); }
+    template <typename T> using cast_op_type = movable_cast_op_type<T>;
+
+private:
+    Type value;
+};
+
+// Base class for casting reference/map/block/etc. objects back to python.
+template <typename MapType> struct eigen_map_caster {
+private:
+    using props = EigenProps<MapType>;
+
+public:
+
+    // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has
+    // to stay around), but we'll allow it under the assumption that you know what you're doing (and
+    // have an appropriate keep_alive in place).  We return a numpy array pointing directly at the
+    // ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note
+    // that this means you need to ensure you don't destroy the object in some other way (e.g. with
+    // an appropriate keep_alive, or with a reference to a statically allocated matrix).
+    static handle cast(const MapType &src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(src);
+            case return_value_policy::reference_internal:
+                return eigen_array_cast<props>(src, parent, is_eigen_mutable_map<MapType>::value);
+            case return_value_policy::reference:
+            case return_value_policy::automatic:
+            case return_value_policy::automatic_reference:
+                return eigen_array_cast<props>(src, none(), is_eigen_mutable_map<MapType>::value);
+            default:
+                // move, take_ownership don't make any sense for a ref/map:
+                pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
+        }
+    }
+
+    static constexpr auto name = props::descriptor;
+
+    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
+    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
+    // you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator MapType() = delete;
+    template <typename> using cast_op_type = MapType;
+};
+
+// We can return any map-like object (but can only load Refs, specialized next):
+template <typename Type> struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>>
+    : eigen_map_caster<Type> {};
+
+// Loader for Ref<...> arguments.  See the documentation for info on how to make this work without
+// copying (it requires some extra effort in many cases).
+template <typename PlainObjectType, typename StrideType>
+struct type_caster<
+    Eigen::Ref<PlainObjectType, 0, StrideType>,
+    enable_if_t<is_eigen_dense_map<Eigen::Ref<PlainObjectType, 0, StrideType>>::value>
+> : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
+private:
+    using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
+    using props = EigenProps<Type>;
+    using Scalar = typename props::Scalar;
+    using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
+    using Array = array_t<Scalar, array::forcecast |
+                ((props::row_major ? props::inner_stride : props::outer_stride) == 1 ? array::c_style :
+                 (props::row_major ? props::outer_stride : props::inner_stride) == 1 ? array::f_style : 0)>;
+    static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
+    // Delay construction (these have no default constructor)
+    std::unique_ptr<MapType> map;
+    std::unique_ptr<Type> ref;
+    // Our array.  When possible, this is just a numpy array pointing to the source data, but
+    // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible
+    // layout, or is an array of a type that needs to be converted).  Using a numpy temporary
+    // (rather than an Eigen temporary) saves an extra copy when we need both type conversion and
+    // storage order conversion.  (Note that we refuse to use this temporary copy when loading an
+    // argument for a Ref<M> with M non-const, i.e. a read-write reference).
+    Array copy_or_ref;
+public:
+    bool load(handle src, bool convert) {
+        // First check whether what we have is already an array of the right type.  If not, we can't
+        // avoid a copy (because the copy is also going to do type conversion).
+        bool need_copy = !isinstance<Array>(src);
+
+        EigenConformable<props::row_major> fits;
+        if (!need_copy) {
+            // We don't need a converting copy, but we also need to check whether the strides are
+            // compatible with the Ref's stride requirements
+            Array aref = reinterpret_borrow<Array>(src);
+
+            if (aref && (!need_writeable || aref.writeable())) {
+                fits = props::conformable(aref);
+                if (!fits) return false; // Incompatible dimensions
+                if (!fits.template stride_compatible<props>())
+                    need_copy = true;
+                else
+                    copy_or_ref = std::move(aref);
+            }
+            else {
+                need_copy = true;
+            }
+        }
+
+        if (need_copy) {
+            // We need to copy: If we need a mutable reference, or we're not supposed to convert
+            // (either because we're in the no-convert overload pass, or because we're explicitly
+            // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading.
+            if (!convert || need_writeable) return false;
+
+            Array copy = Array::ensure(src);
+            if (!copy) return false;
+            fits = props::conformable(copy);
+            if (!fits || !fits.template stride_compatible<props>())
+                return false;
+            copy_or_ref = std::move(copy);
+            loader_life_support::add_patient(copy_or_ref);
+        }
+
+        ref.reset();
+        map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner())));
+        ref.reset(new Type(*map));
+
+        return true;
+    }
+
+    operator Type*() { return ref.get(); }
+    operator Type&() { return *ref; }
+    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+
+private:
+    template <typename T = Type, enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
+    Scalar *data(Array &a) { return a.mutable_data(); }
+
+    template <typename T = Type, enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
+    const Scalar *data(Array &a) { return a.data(); }
+
+    // Attempt to figure out a constructor of `Stride` that will work.
+    // If both strides are fixed, use a default constructor:
+    template <typename S> using stride_ctor_default = bool_constant<
+        S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_default_constructible<S>::value>;
+    // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like
+    // Eigen::Stride, and use it:
+    template <typename S> using stride_ctor_dual = bool_constant<
+        !stride_ctor_default<S>::value && std::is_constructible<S, EigenIndex, EigenIndex>::value>;
+    // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use
+    // it (passing whichever stride is dynamic).
+    template <typename S> using stride_ctor_outer = bool_constant<
+        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+        S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_constructible<S, EigenIndex>::value>;
+    template <typename S> using stride_ctor_inner = bool_constant<
+        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+        S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_constructible<S, EigenIndex>::value>;
+
+    template <typename S = StrideType, enable_if_t<stride_ctor_default<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex) { return S(); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_dual<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_outer<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_inner<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); }
+
+};
+
+// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not
+// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout).
+// load() is not supported, but we can cast them into the python domain by first copying to a
+// regular Eigen::Matrix, then casting that.
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
+protected:
+    using Matrix = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime, Type::ColsAtCompileTime>;
+    using props = EigenProps<Matrix>;
+public:
+    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
+        handle h = eigen_encapsulate<props>(new Matrix(src));
+        return h;
+    }
+    static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); }
+
+    static constexpr auto name = props::descriptor;
+
+    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
+    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
+    // you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator Type() = delete;
+    template <typename> using cast_op_type = Type;
+};
+
+template<typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
+    typedef typename Type::Scalar Scalar;
+    typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
+    typedef typename Type::Index Index;
+    static constexpr bool rowMajor = Type::IsRowMajor;
+
+    bool load(handle src, bool) {
+        if (!src)
+            return false;
+
+        auto obj = reinterpret_borrow<object>(src);
+        object sparse_module = module::import("scipy.sparse");
+        object matrix_type = sparse_module.attr(
+            rowMajor ? "csr_matrix" : "csc_matrix");
+
+        if (!obj.get_type().is(matrix_type)) {
+            try {
+                obj = matrix_type(obj);
+            } catch (const error_already_set &) {
+                return false;
+            }
+        }
+
+        auto values = array_t<Scalar>((object) obj.attr("data"));
+        auto innerIndices = array_t<StorageIndex>((object) obj.attr("indices"));
+        auto outerIndices = array_t<StorageIndex>((object) obj.attr("indptr"));
+        auto shape = pybind11::tuple((pybind11::object) obj.attr("shape"));
+        auto nnz = obj.attr("nnz").cast<Index>();
+
+        if (!values || !innerIndices || !outerIndices)
+            return false;
+
+        value = Eigen::MappedSparseMatrix<Scalar, Type::Flags, StorageIndex>(
+            shape[0].cast<Index>(), shape[1].cast<Index>(), nnz,
+            outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data());
+
+        return true;
+    }
+
+    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
+        const_cast<Type&>(src).makeCompressed();
+
+        object matrix_type = module::import("scipy.sparse").attr(
+            rowMajor ? "csr_matrix" : "csc_matrix");
+
+        array data(src.nonZeros(), src.valuePtr());
+        array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
+        array innerIndices(src.nonZeros(), src.innerIndexPtr());
+
+        return matrix_type(
+            std::make_tuple(data, innerIndices, outerIndices),
+            std::make_pair(src.rows(), src.cols())
+        ).release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
+            + npy_format_descriptor<Scalar>::name + _("]"));
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(__GNUG__) || defined(__clang__)
+#  pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pybind11/include/pybind11/embed.h b/pybind11/include/pybind11/embed.h
new file mode 100644
index 0000000000000000000000000000000000000000..eae86c714ca17191bb03fd4df7c9384422168858
--- /dev/null
+++ b/pybind11/include/pybind11/embed.h
@@ -0,0 +1,203 @@
+/*
+    pybind11/embed.h: Support for embedding the interpreter
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "eval.h"
+
+#if defined(PYPY_VERSION)
+#  error Embedding the interpreter is not supported with PyPy
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
+      extern "C" PyObject *pybind11_init_impl_##name();  \
+      extern "C" PyObject *pybind11_init_impl_##name() { \
+          return pybind11_init_wrapper_##name();         \
+      }
+#else
+#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
+      extern "C" void pybind11_init_impl_##name();       \
+      extern "C" void pybind11_init_impl_##name() {      \
+          pybind11_init_wrapper_##name();                \
+      }
+#endif
+
+/** \rst
+    Add a new module to the table of builtins for the interpreter. Must be
+    defined in global scope. The first macro parameter is the name of the
+    module (without quotes). The second parameter is the variable which will
+    be used as the interface to add functions and classes to the module.
+
+    .. code-block:: cpp
+
+        PYBIND11_EMBEDDED_MODULE(example, m) {
+            // ... initialize functions and classes here
+            m.def("foo", []() {
+                return "Hello, World!";
+            });
+        }
+ \endrst */
+#define PYBIND11_EMBEDDED_MODULE(name, variable)                              \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);    \
+    static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {        \
+        auto m = pybind11::module(PYBIND11_TOSTRING(name));                   \
+        try {                                                                 \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                         \
+            return m.ptr();                                                   \
+        } catch (pybind11::error_already_set &e) {                            \
+            PyErr_SetString(PyExc_ImportError, e.what());                     \
+            return nullptr;                                                   \
+        } catch (const std::exception &e) {                                   \
+            PyErr_SetString(PyExc_ImportError, e.what());                     \
+            return nullptr;                                                   \
+        }                                                                     \
+    }                                                                         \
+    PYBIND11_EMBEDDED_MODULE_IMPL(name)                                       \
+    pybind11::detail::embedded_module PYBIND11_CONCAT(pybind11_module_, name) \
+                              (PYBIND11_TOSTRING(name),             \
+                               PYBIND11_CONCAT(pybind11_init_impl_, name));   \
+    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks.
+struct embedded_module {
+#if PY_MAJOR_VERSION >= 3
+    using init_t = PyObject *(*)();
+#else
+    using init_t = void (*)();
+#endif
+    embedded_module(const char *name, init_t init) {
+        if (Py_IsInitialized())
+            pybind11_fail("Can't add new modules after the interpreter has been initialized");
+
+        auto result = PyImport_AppendInittab(name, init);
+        if (result == -1)
+            pybind11_fail("Insufficient memory to add a new module");
+    }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/** \rst
+    Initialize the Python interpreter. No other pybind11 or CPython API functions can be
+    called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The
+    optional parameter can be used to skip the registration of signal handlers (see the
+    `Python documentation`_ for details). Calling this function again after the interpreter
+    has already been initialized is a fatal error.
+
+    If initializing the Python interpreter fails, then the program is terminated.  (This
+    is controlled by the CPython runtime and is an exception to pybind11's normal behavior
+    of throwing exceptions on errors.)
+
+    .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx
+ \endrst */
+inline void initialize_interpreter(bool init_signal_handlers = true) {
+    if (Py_IsInitialized())
+        pybind11_fail("The interpreter is already running");
+
+    Py_InitializeEx(init_signal_handlers ? 1 : 0);
+
+    // Make .py files in the working directory available by default
+    module::import("sys").attr("path").cast<list>().append(".");
+}
+
+/** \rst
+    Shut down the Python interpreter. No pybind11 or CPython API functions can be called
+    after this. In addition, pybind11 objects must not outlive the interpreter:
+
+    .. code-block:: cpp
+
+        { // BAD
+            py::initialize_interpreter();
+            auto hello = py::str("Hello, World!");
+            py::finalize_interpreter();
+        } // <-- BOOM, hello's destructor is called after interpreter shutdown
+
+        { // GOOD
+            py::initialize_interpreter();
+            { // scoped
+                auto hello = py::str("Hello, World!");
+            } // <-- OK, hello is cleaned up properly
+            py::finalize_interpreter();
+        }
+
+        { // BETTER
+            py::scoped_interpreter guard{};
+            auto hello = py::str("Hello, World!");
+        }
+
+    .. warning::
+
+        The interpreter can be restarted by calling `initialize_interpreter` again.
+        Modules created using pybind11 can be safely re-initialized. However, Python
+        itself cannot completely unload binary extension modules and there are several
+        caveats with regard to interpreter restarting. All the details can be found
+        in the CPython documentation. In short, not all interpreter memory may be
+        freed, either due to reference cycles or user-created global data.
+
+ \endrst */
+inline void finalize_interpreter() {
+    handle builtins(PyEval_GetBuiltins());
+    const char *id = PYBIND11_INTERNALS_ID;
+
+    // Get the internals pointer (without creating it if it doesn't exist).  It's possible for the
+    // internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()`
+    // during destruction), so we get the pointer-pointer here and check it after Py_Finalize().
+    detail::internals **internals_ptr_ptr = detail::get_internals_pp();
+    // It could also be stashed in builtins, so look there too:
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
+        internals_ptr_ptr = capsule(builtins[id]);
+
+    Py_Finalize();
+
+    if (internals_ptr_ptr) {
+        delete *internals_ptr_ptr;
+        *internals_ptr_ptr = nullptr;
+    }
+}
+
+/** \rst
+    Scope guard version of `initialize_interpreter` and `finalize_interpreter`.
+    This a move-only guard and only a single instance can exist.
+
+    .. code-block:: cpp
+
+        #include <pybind11/embed.h>
+
+        int main() {
+            py::scoped_interpreter guard{};
+            py::print(Hello, World!);
+        } // <-- interpreter shutdown
+ \endrst */
+class scoped_interpreter {
+public:
+    scoped_interpreter(bool init_signal_handlers = true) {
+        initialize_interpreter(init_signal_handlers);
+    }
+
+    scoped_interpreter(const scoped_interpreter &) = delete;
+    scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; }
+    scoped_interpreter &operator=(const scoped_interpreter &) = delete;
+    scoped_interpreter &operator=(scoped_interpreter &&) = delete;
+
+    ~scoped_interpreter() {
+        if (is_valid)
+            finalize_interpreter();
+    }
+
+private:
+    bool is_valid = true;
+};
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/eval.h b/pybind11/include/pybind11/eval.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba82cf42ae3673a3de391eb55777ef413c43dc33
--- /dev/null
+++ b/pybind11/include/pybind11/eval.h
@@ -0,0 +1,132 @@
+/*
+    pybind11/exec.h: Support for evaluating Python expressions and statements
+    from strings and files
+
+    Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de> and
+                       Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+enum eval_mode {
+    /// Evaluate a string containing an isolated expression
+    eval_expr,
+
+    /// Evaluate a string containing a single statement. Returns \c none
+    eval_single_statement,
+
+    /// Evaluate a string containing a sequence of statement. Returns \c none
+    eval_statements
+};
+
+template <eval_mode mode = eval_expr>
+object eval(str expr, object global = globals(), object local = object()) {
+    if (!local)
+        local = global;
+
+    /* PyRun_String does not accept a PyObject / encoding specifier,
+       this seems to be the only alternative */
+    std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr;
+
+    int start;
+    switch (mode) {
+        case eval_expr:             start = Py_eval_input;   break;
+        case eval_single_statement: start = Py_single_input; break;
+        case eval_statements:       start = Py_file_input;   break;
+        default: pybind11_fail("invalid evaluation mode");
+    }
+
+    PyObject *result = PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr());
+    if (!result)
+        throw error_already_set();
+    return reinterpret_steal<object>(result);
+}
+
+template <eval_mode mode = eval_expr, size_t N>
+object eval(const char (&s)[N], object global = globals(), object local = object()) {
+    /* Support raw string literals by removing common leading whitespace */
+    auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
+                               : str(s);
+    return eval<mode>(expr, global, local);
+}
+
+inline void exec(str expr, object global = globals(), object local = object()) {
+    eval<eval_statements>(expr, global, local);
+}
+
+template <size_t N>
+void exec(const char (&s)[N], object global = globals(), object local = object()) {
+    eval<eval_statements>(s, global, local);
+}
+
+#if defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x3000000
+template <eval_mode mode = eval_statements>
+object eval_file(str, object, object) {
+    pybind11_fail("eval_file not supported in PyPy3. Use eval");
+}
+template <eval_mode mode = eval_statements>
+object eval_file(str, object) {
+    pybind11_fail("eval_file not supported in PyPy3. Use eval");
+}
+template <eval_mode mode = eval_statements>
+object eval_file(str) {
+    pybind11_fail("eval_file not supported in PyPy3. Use eval");
+}
+#else
+template <eval_mode mode = eval_statements>
+object eval_file(str fname, object global = globals(), object local = object()) {
+    if (!local)
+        local = global;
+
+    int start;
+    switch (mode) {
+        case eval_expr:             start = Py_eval_input;   break;
+        case eval_single_statement: start = Py_single_input; break;
+        case eval_statements:       start = Py_file_input;   break;
+        default: pybind11_fail("invalid evaluation mode");
+    }
+
+    int closeFile = 1;
+    std::string fname_str = (std::string) fname;
+#if PY_VERSION_HEX >= 0x03040000
+    FILE *f = _Py_fopen_obj(fname.ptr(), "r");
+#elif PY_VERSION_HEX >= 0x03000000
+    FILE *f = _Py_fopen(fname.ptr(), "r");
+#else
+    /* No unicode support in open() :( */
+    auto fobj = reinterpret_steal<object>(PyFile_FromString(
+        const_cast<char *>(fname_str.c_str()),
+        const_cast<char*>("r")));
+    FILE *f = nullptr;
+    if (fobj)
+        f = PyFile_AsFile(fobj.ptr());
+    closeFile = 0;
+#endif
+    if (!f) {
+        PyErr_Clear();
+        pybind11_fail("File \"" + fname_str + "\" could not be opened!");
+    }
+
+#if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION)
+    PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(),
+                                  local.ptr());
+    (void) closeFile;
+#else
+    PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(),
+                                    local.ptr(), closeFile);
+#endif
+
+    if (!result)
+        throw error_already_set();
+    return reinterpret_steal<object>(result);
+}
+#endif
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/functional.h b/pybind11/include/pybind11/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..57b6cd210f4b99d9d76a93c17aeed3a183fc01a0
--- /dev/null
+++ b/pybind11/include/pybind11/functional.h
@@ -0,0 +1,101 @@
+/*
+    pybind11/functional.h: std::function<> support
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <functional>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename Return, typename... Args>
+struct type_caster<std::function<Return(Args...)>> {
+    using type = std::function<Return(Args...)>;
+    using retval_type = conditional_t<std::is_same<Return, void>::value, void_type, Return>;
+    using function_type = Return (*) (Args...);
+
+public:
+    bool load(handle src, bool convert) {
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            return true;
+        }
+
+        if (!isinstance<function>(src))
+            return false;
+
+        auto func = reinterpret_borrow<function>(src);
+
+        /*
+           When passing a C++ function as an argument to another C++
+           function via Python, every function call would normally involve
+           a full C++ -> Python -> C++ roundtrip, which can be prohibitive.
+           Here, we try to at least detect the case where the function is
+           stateless (i.e. function pointer or lambda function without
+           captured variables), in which case the roundtrip can be avoided.
+         */
+        if (auto cfunc = func.cpp_function()) {
+            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
+            auto rec = (function_record *) c;
+
+            if (rec && rec->is_stateless &&
+                    same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
+                struct capture { function_type f; };
+                value = ((capture *) &rec->data)->f;
+                return true;
+            }
+        }
+
+        // ensure GIL is held during functor destruction
+        struct func_handle {
+            function f;
+            func_handle(function&& f_) : f(std::move(f_)) {}
+            func_handle(const func_handle&) = default;
+            ~func_handle() {
+                gil_scoped_acquire acq;
+                function kill_f(std::move(f));
+            }
+        };
+
+        // to emulate 'move initialization capture' in C++11
+        struct func_wrapper {
+            func_handle hfunc;
+            func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {}
+            Return operator()(Args... args) const {
+                gil_scoped_acquire acq;
+                object retval(hfunc.f(std::forward<Args>(args)...));
+                /* Visual studio 2015 parser issue: need parentheses around this expression */
+                return (retval.template cast<Return>());
+            }
+        };
+
+        value = func_wrapper(func_handle(std::move(func)));
+        return true;
+    }
+
+    template <typename Func>
+    static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) {
+        if (!f_)
+            return none().inc_ref();
+
+        auto result = f_.template target<function_type>();
+        if (result)
+            return cpp_function(*result, policy).release();
+        else
+            return cpp_function(std::forward<Func>(f_), policy).release();
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster<Args>::name...) + _("], ")
+                               + make_caster<retval_type>::name + _("]"));
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/iostream.h b/pybind11/include/pybind11/iostream.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaf92dfa49add54c298844b31898a82de3fb429d
--- /dev/null
+++ b/pybind11/include/pybind11/iostream.h
@@ -0,0 +1,209 @@
+/*
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#include <streambuf>
+#include <ostream>
+#include <string>
+#include <memory>
+#include <iostream>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Buffer that writes to Python instead of C++
+class pythonbuf : public std::streambuf {
+private:
+    using traits_type = std::streambuf::traits_type;
+
+    const size_t buf_size;
+    std::unique_ptr<char[]> d_buffer;
+    object pywrite;
+    object pyflush;
+
+    int overflow(int c) {
+        if (!traits_type::eq_int_type(c, traits_type::eof())) {
+            *pptr() = traits_type::to_char_type(c);
+            pbump(1);
+        }
+        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+    }
+
+    int sync() {
+        if (pbase() != pptr()) {
+            // This subtraction cannot be negative, so dropping the sign
+            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+
+            {
+                gil_scoped_acquire tmp;
+                pywrite(line);
+                pyflush();
+            }
+
+            setp(pbase(), epptr());
+        }
+        return 0;
+    }
+
+public:
+
+    pythonbuf(object pyostream, size_t buffer_size = 1024)
+        : buf_size(buffer_size),
+          d_buffer(new char[buf_size]),
+          pywrite(pyostream.attr("write")),
+          pyflush(pyostream.attr("flush")) {
+        setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
+    }
+
+    pythonbuf(pythonbuf&&) = default;
+
+    /// Sync before destroy
+    ~pythonbuf() {
+        sync();
+    }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+
+/** \rst
+    This a move-only guard that redirects output.
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        {
+            py::scoped_ostream_redirect output;
+            std::cout << "Hello, World!"; // Python stdout
+        } // <-- return std::cout to normal
+
+    You can explicitly pass the c++ stream and the python object,
+    for example to guard stderr instead.
+
+    .. code-block:: cpp
+
+        {
+            py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
+            std::cerr << "Hello, World!";
+        }
+ \endrst */
+class scoped_ostream_redirect {
+protected:
+    std::streambuf *old;
+    std::ostream &costream;
+    detail::pythonbuf buffer;
+
+public:
+    scoped_ostream_redirect(
+            std::ostream &costream = std::cout,
+            object pyostream = module::import("sys").attr("stdout"))
+        : costream(costream), buffer(pyostream) {
+        old = costream.rdbuf(&buffer);
+    }
+
+    ~scoped_ostream_redirect() {
+        costream.rdbuf(old);
+    }
+
+    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+};
+
+
+/** \rst
+    Like `scoped_ostream_redirect`, but redirects cerr by default. This class
+    is provided primary to make ``py::call_guard`` easier to make.
+
+    .. code-block:: cpp
+
+     m.def("noisy_func", &noisy_func,
+           py::call_guard<scoped_ostream_redirect,
+                          scoped_estream_redirect>());
+
+\endrst */
+class scoped_estream_redirect : public scoped_ostream_redirect {
+public:
+    scoped_estream_redirect(
+            std::ostream &costream = std::cerr,
+            object pyostream = module::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream,pyostream) {}
+};
+
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Class to redirect output as a context manager. C++ backend.
+class OstreamRedirect {
+    bool do_stdout_;
+    bool do_stderr_;
+    std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
+    std::unique_ptr<scoped_estream_redirect> redirect_stderr;
+
+public:
+    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+        : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
+
+    void enter() {
+        if (do_stdout_)
+            redirect_stdout.reset(new scoped_ostream_redirect());
+        if (do_stderr_)
+            redirect_stderr.reset(new scoped_estream_redirect());
+    }
+
+    void exit() {
+        redirect_stdout.reset();
+        redirect_stderr.reset();
+    }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/** \rst
+    This is a helper function to add a C++ redirect context manager to Python
+    instead of using a C++ guard. To use it, add the following to your binding code:
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        py::add_ostream_redirect(m, "ostream_redirect");
+
+    You now have a Python context manager that redirects your output:
+
+    .. code-block:: python
+
+        with m.ostream_redirect():
+            m.print_to_cout_function()
+
+    This manager can optionally be told which streams to operate on:
+
+    .. code-block:: python
+
+        with m.ostream_redirect(stdout=true, stderr=true):
+            m.noisy_function_with_error_printing()
+
+ \endrst */
+inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
+    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
+        .def(init<bool,bool>(), arg("stdout")=true, arg("stderr")=true)
+        .def("__enter__", &detail::OstreamRedirect::enter)
+        .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/numpy.h b/pybind11/include/pybind11/numpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..674450a631a49213a7fc83feed3a10e36934da61
--- /dev/null
+++ b/pybind11/include/pybind11/numpy.h
@@ -0,0 +1,1647 @@
+/*
+    pybind11/numpy.h: Basic NumPy support, vectorize() wrapper
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "complex.h"
+#include <numeric>
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <sstream>
+#include <string>
+#include <functional>
+#include <utility>
+#include <vector>
+#include <typeindex>
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+/* This will be true on all flat address space platforms and allows us to reduce the
+   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
+   and dimension types (e.g. shape, strides, indexing), instead of inflicting this
+   upon the library user. */
+static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+class array; // Forward declaration
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <> struct handle_type_name<array> { static constexpr auto name = _("numpy.ndarray"); };
+
+template <typename type, typename SFINAE = void> struct npy_format_descriptor;
+
+struct PyArrayDescr_Proxy {
+    PyObject_HEAD
+    PyObject *typeobj;
+    char kind;
+    char type;
+    char byteorder;
+    char flags;
+    int type_num;
+    int elsize;
+    int alignment;
+    char *subarray;
+    PyObject *fields;
+    PyObject *names;
+};
+
+struct PyArray_Proxy {
+    PyObject_HEAD
+    char *data;
+    int nd;
+    ssize_t *dimensions;
+    ssize_t *strides;
+    PyObject *base;
+    PyObject *descr;
+    int flags;
+};
+
+struct PyVoidScalarObject_Proxy {
+    PyObject_VAR_HEAD
+    char *obval;
+    PyArrayDescr_Proxy *descr;
+    int flags;
+    PyObject *base;
+};
+
+struct numpy_type_info {
+    PyObject* dtype_ptr;
+    std::string format_str;
+};
+
+struct numpy_internals {
+    std::unordered_map<std::type_index, numpy_type_info> registered_dtypes;
+
+    numpy_type_info *get_type_info(const std::type_info& tinfo, bool throw_if_missing = true) {
+        auto it = registered_dtypes.find(std::type_index(tinfo));
+        if (it != registered_dtypes.end())
+            return &(it->second);
+        if (throw_if_missing)
+            pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name());
+        return nullptr;
+    }
+
+    template<typename T> numpy_type_info *get_type_info(bool throw_if_missing = true) {
+        return get_type_info(typeid(typename std::remove_cv<T>::type), throw_if_missing);
+    }
+};
+
+inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) {
+    ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
+}
+
+inline numpy_internals& get_numpy_internals() {
+    static numpy_internals* ptr = nullptr;
+    if (!ptr)
+        load_numpy_internals(ptr);
+    return *ptr;
+}
+
+template <typename T> struct same_size {
+    template <typename U> using as = bool_constant<sizeof(T) == sizeof(U)>;
+};
+
+template <typename Concrete> constexpr int platform_lookup() { return -1; }
+
+// Lookup a type according to its size, and return a value corresponding to the NumPy typenum.
+template <typename Concrete, typename T, typename... Ts, typename... Ints>
+constexpr int platform_lookup(int I, Ints... Is) {
+    return sizeof(Concrete) == sizeof(T) ? I : platform_lookup<Concrete, Ts...>(Is...);
+}
+
+struct npy_api {
+    enum constants {
+        NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
+        NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
+        NPY_ARRAY_OWNDATA_ = 0x0004,
+        NPY_ARRAY_FORCECAST_ = 0x0010,
+        NPY_ARRAY_ENSUREARRAY_ = 0x0040,
+        NPY_ARRAY_ALIGNED_ = 0x0100,
+        NPY_ARRAY_WRITEABLE_ = 0x0400,
+        NPY_BOOL_ = 0,
+        NPY_BYTE_, NPY_UBYTE_,
+        NPY_SHORT_, NPY_USHORT_,
+        NPY_INT_, NPY_UINT_,
+        NPY_LONG_, NPY_ULONG_,
+        NPY_LONGLONG_, NPY_ULONGLONG_,
+        NPY_FLOAT_, NPY_DOUBLE_, NPY_LONGDOUBLE_,
+        NPY_CFLOAT_, NPY_CDOUBLE_, NPY_CLONGDOUBLE_,
+        NPY_OBJECT_ = 17,
+        NPY_STRING_, NPY_UNICODE_, NPY_VOID_,
+        // Platform-dependent normalization
+        NPY_INT8_ = NPY_BYTE_,
+        NPY_UINT8_ = NPY_UBYTE_,
+        NPY_INT16_ = NPY_SHORT_,
+        NPY_UINT16_ = NPY_USHORT_,
+        // `npy_common.h` defines the integer aliases. In order, it checks:
+        // NPY_BITSOF_LONG, NPY_BITSOF_LONGLONG, NPY_BITSOF_INT, NPY_BITSOF_SHORT, NPY_BITSOF_CHAR
+        // and assigns the alias to the first matching size, so we should check in this order.
+        NPY_INT32_ = platform_lookup<std::int32_t, long, int, short>(
+            NPY_LONG_, NPY_INT_, NPY_SHORT_),
+        NPY_UINT32_ = platform_lookup<std::uint32_t, unsigned long, unsigned int, unsigned short>(
+            NPY_ULONG_, NPY_UINT_, NPY_USHORT_),
+        NPY_INT64_ = platform_lookup<std::int64_t, long, long long, int>(
+            NPY_LONG_, NPY_LONGLONG_, NPY_INT_),
+        NPY_UINT64_ = platform_lookup<std::uint64_t, unsigned long, unsigned long long, unsigned int>(
+            NPY_ULONG_, NPY_ULONGLONG_, NPY_UINT_),
+    };
+
+    typedef struct {
+        Py_intptr_t *ptr;
+        int len;
+    } PyArray_Dims;
+
+    static npy_api& get() {
+        static npy_api api = lookup();
+        return api;
+    }
+
+    bool PyArray_Check_(PyObject *obj) const {
+        return (bool) PyObject_TypeCheck(obj, PyArray_Type_);
+    }
+    bool PyArrayDescr_Check_(PyObject *obj) const {
+        return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_);
+    }
+
+    unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
+    PyObject *(*PyArray_DescrFromType_)(int);
+    PyObject *(*PyArray_NewFromDescr_)
+        (PyTypeObject *, PyObject *, int, Py_intptr_t const *,
+         Py_intptr_t const *, void *, int, PyObject *);
+    // Unused. Not removed because that affects ABI of the class.
+    PyObject *(*PyArray_DescrNewFromType_)(int);
+    int (*PyArray_CopyInto_)(PyObject *, PyObject *);
+    PyObject *(*PyArray_NewCopy_)(PyObject *, int);
+    PyTypeObject *PyArray_Type_;
+    PyTypeObject *PyVoidArrType_Type_;
+    PyTypeObject *PyArrayDescr_Type_;
+    PyObject *(*PyArray_DescrFromScalar_)(PyObject *);
+    PyObject *(*PyArray_FromAny_) (PyObject *, PyObject *, int, int, int, PyObject *);
+    int (*PyArray_DescrConverter_) (PyObject *, PyObject **);
+    bool (*PyArray_EquivTypes_) (PyObject *, PyObject *);
+    int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, unsigned char, PyObject **, int *,
+                                             Py_intptr_t *, PyObject **, PyObject *);
+    PyObject *(*PyArray_Squeeze_)(PyObject *);
+    // Unused. Not removed because that affects ABI of the class.
+    int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
+    PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int);
+private:
+    enum functions {
+        API_PyArray_GetNDArrayCFeatureVersion = 211,
+        API_PyArray_Type = 2,
+        API_PyArrayDescr_Type = 3,
+        API_PyVoidArrType_Type = 39,
+        API_PyArray_DescrFromType = 45,
+        API_PyArray_DescrFromScalar = 57,
+        API_PyArray_FromAny = 69,
+        API_PyArray_Resize = 80,
+        API_PyArray_CopyInto = 82,
+        API_PyArray_NewCopy = 85,
+        API_PyArray_NewFromDescr = 94,
+        API_PyArray_DescrNewFromType = 96,
+        API_PyArray_DescrConverter = 174,
+        API_PyArray_EquivTypes = 182,
+        API_PyArray_GetArrayParamsFromObject = 278,
+        API_PyArray_Squeeze = 136,
+        API_PyArray_SetBaseObject = 282
+    };
+
+    static npy_api lookup() {
+        module m = module::import("numpy.core.multiarray");
+        auto c = m.attr("_ARRAY_API");
+#if PY_MAJOR_VERSION >= 3
+        void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), NULL);
+#else
+        void **api_ptr = (void **) PyCObject_AsVoidPtr(c.ptr());
+#endif
+        npy_api api;
+#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func];
+        DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion);
+        if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7)
+            pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0");
+        DECL_NPY_API(PyArray_Type);
+        DECL_NPY_API(PyVoidArrType_Type);
+        DECL_NPY_API(PyArrayDescr_Type);
+        DECL_NPY_API(PyArray_DescrFromType);
+        DECL_NPY_API(PyArray_DescrFromScalar);
+        DECL_NPY_API(PyArray_FromAny);
+        DECL_NPY_API(PyArray_Resize);
+        DECL_NPY_API(PyArray_CopyInto);
+        DECL_NPY_API(PyArray_NewCopy);
+        DECL_NPY_API(PyArray_NewFromDescr);
+        DECL_NPY_API(PyArray_DescrNewFromType);
+        DECL_NPY_API(PyArray_DescrConverter);
+        DECL_NPY_API(PyArray_EquivTypes);
+        DECL_NPY_API(PyArray_GetArrayParamsFromObject);
+        DECL_NPY_API(PyArray_Squeeze);
+        DECL_NPY_API(PyArray_SetBaseObject);
+#undef DECL_NPY_API
+        return api;
+    }
+};
+
+inline PyArray_Proxy* array_proxy(void* ptr) {
+    return reinterpret_cast<PyArray_Proxy*>(ptr);
+}
+
+inline const PyArray_Proxy* array_proxy(const void* ptr) {
+    return reinterpret_cast<const PyArray_Proxy*>(ptr);
+}
+
+inline PyArrayDescr_Proxy* array_descriptor_proxy(PyObject* ptr) {
+   return reinterpret_cast<PyArrayDescr_Proxy*>(ptr);
+}
+
+inline const PyArrayDescr_Proxy* array_descriptor_proxy(const PyObject* ptr) {
+   return reinterpret_cast<const PyArrayDescr_Proxy*>(ptr);
+}
+
+inline bool check_flags(const void* ptr, int flag) {
+    return (flag == (array_proxy(ptr)->flags & flag));
+}
+
+template <typename T> struct is_std_array : std::false_type { };
+template <typename T, size_t N> struct is_std_array<std::array<T, N>> : std::true_type { };
+template <typename T> struct is_complex : std::false_type { };
+template <typename T> struct is_complex<std::complex<T>> : std::true_type { };
+
+template <typename T> struct array_info_scalar {
+    typedef T type;
+    static constexpr bool is_array = false;
+    static constexpr bool is_empty = false;
+    static constexpr auto extents = _("");
+    static void append_extents(list& /* shape */) { }
+};
+// Computes underlying type and a comma-separated list of extents for array
+// types (any mix of std::array and built-in arrays). An array of char is
+// treated as scalar because it gets special handling.
+template <typename T> struct array_info : array_info_scalar<T> { };
+template <typename T, size_t N> struct array_info<std::array<T, N>> {
+    using type = typename array_info<T>::type;
+    static constexpr bool is_array = true;
+    static constexpr bool is_empty = (N == 0) || array_info<T>::is_empty;
+    static constexpr size_t extent = N;
+
+    // appends the extents to shape
+    static void append_extents(list& shape) {
+        shape.append(N);
+        array_info<T>::append_extents(shape);
+    }
+
+    static constexpr auto extents = _<array_info<T>::is_array>(
+        concat(_<N>(), array_info<T>::extents), _<N>()
+    );
+};
+// For numpy we have special handling for arrays of characters, so we don't include
+// the size in the array extents.
+template <size_t N> struct array_info<char[N]> : array_info_scalar<char[N]> { };
+template <size_t N> struct array_info<std::array<char, N>> : array_info_scalar<std::array<char, N>> { };
+template <typename T, size_t N> struct array_info<T[N]> : array_info<std::array<T, N>> { };
+template <typename T> using remove_all_extents_t = typename array_info<T>::type;
+
+template <typename T> using is_pod_struct = all_of<
+    std::is_standard_layout<T>,     // since we're accessing directly in memory we need a standard layout type
+#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI)
+    // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent
+    // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4).
+    std::is_trivially_copyable<T>,
+#else
+    // GCC 4 doesn't implement is_trivially_copyable, so approximate it
+    std::is_trivially_destructible<T>,
+    satisfies_any_of<T, std::has_trivial_copy_constructor, std::has_trivial_copy_assign>,
+#endif
+    satisfies_none_of<T, std::is_reference, std::is_array, is_std_array, std::is_arithmetic, is_complex, std::is_enum>
+>;
+
+template <ssize_t Dim = 0, typename Strides> ssize_t byte_offset_unsafe(const Strides &) { return 0; }
+template <ssize_t Dim = 0, typename Strides, typename... Ix>
+ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) {
+    return i * strides[Dim] + byte_offset_unsafe<Dim + 1>(strides, index...);
+}
+
+/**
+ * Proxy class providing unsafe, unchecked const access to array data.  This is constructed through
+ * the `unchecked<T, N>()` method of `array` or the `unchecked<N>()` method of `array_t<T>`.  `Dims`
+ * will be -1 for dimensions determined at runtime.
+ */
+template <typename T, ssize_t Dims>
+class unchecked_reference {
+protected:
+    static constexpr bool Dynamic = Dims < 0;
+    const unsigned char *data_;
+    // Storing the shape & strides in local variables (i.e. these arrays) allows the compiler to
+    // make large performance gains on big, nested loops, but requires compile-time dimensions
+    conditional_t<Dynamic, const ssize_t *, std::array<ssize_t, (size_t) Dims>>
+            shape_, strides_;
+    const ssize_t dims_;
+
+    friend class pybind11::array;
+    // Constructor for compile-time dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<!Dyn, ssize_t>)
+    : data_{reinterpret_cast<const unsigned char *>(data)}, dims_{Dims} {
+        for (size_t i = 0; i < (size_t) dims_; i++) {
+            shape_[i] = shape[i];
+            strides_[i] = strides[i];
+        }
+    }
+    // Constructor for runtime dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<Dyn, ssize_t> dims)
+    : data_{reinterpret_cast<const unsigned char *>(data)}, shape_{shape}, strides_{strides}, dims_{dims} {}
+
+public:
+    /**
+     * Unchecked const reference access to data at the given indices.  For a compile-time known
+     * number of dimensions, this requires the correct number of arguments; for run-time
+     * dimensionality, this is not checked (and so is up to the caller to use safely).
+     */
+    template <typename... Ix> const T &operator()(Ix... index) const {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                "Invalid number of indices for unchecked array reference");
+        return *reinterpret_cast<const T *>(data_ + byte_offset_unsafe(strides_, ssize_t(index)...));
+    }
+    /**
+     * Unchecked const reference access to data; this operator only participates if the reference
+     * is to a 1-dimensional array.  When present, this is exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    const T &operator[](ssize_t index) const { return operator()(index); }
+
+    /// Pointer access to the data at the given indices.
+    template <typename... Ix> const T *data(Ix... ix) const { return &operator()(ssize_t(ix)...); }
+
+    /// Returns the item size, i.e. sizeof(T)
+    constexpr static ssize_t itemsize() { return sizeof(T); }
+
+    /// Returns the shape (i.e. size) of dimension `dim`
+    ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; }
+
+    /// Returns the number of dimensions of the array
+    ssize_t ndim() const { return dims_; }
+
+    /// Returns the total number of elements in the referenced array, i.e. the product of the shapes
+    template <bool Dyn = Dynamic>
+    enable_if_t<!Dyn, ssize_t> size() const {
+        return std::accumulate(shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+    template <bool Dyn = Dynamic>
+    enable_if_t<Dyn, ssize_t> size() const {
+        return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+
+    /// Returns the total number of bytes used by the referenced data.  Note that the actual span in
+    /// memory may be larger if the referenced array has non-contiguous strides (e.g. for a slice).
+    ssize_t nbytes() const {
+        return size() * itemsize();
+    }
+};
+
+template <typename T, ssize_t Dims>
+class unchecked_mutable_reference : public unchecked_reference<T, Dims> {
+    friend class pybind11::array;
+    using ConstBase = unchecked_reference<T, Dims>;
+    using ConstBase::ConstBase;
+    using ConstBase::Dynamic;
+public:
+    /// Mutable, unchecked access to data at the given indices.
+    template <typename... Ix> T& operator()(Ix... index) {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                "Invalid number of indices for unchecked array reference");
+        return const_cast<T &>(ConstBase::operator()(index...));
+    }
+    /**
+     * Mutable, unchecked access data at the given index; this operator only participates if the
+     * reference is to a 1-dimensional array (or has runtime dimensions).  When present, this is
+     * exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    T &operator[](ssize_t index) { return operator()(index); }
+
+    /// Mutable pointer access to the data at the given indices.
+    template <typename... Ix> T *mutable_data(Ix... ix) { return &operator()(ssize_t(ix)...); }
+};
+
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_reference<T, Dim>> {
+    static_assert(Dim == 0 && Dim > 0 /* always fail */, "unchecked array proxy object is not castable");
+};
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_mutable_reference<T, Dim>> : type_caster<unchecked_reference<T, Dim>> {};
+
+PYBIND11_NAMESPACE_END(detail)
+
+class dtype : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_);
+
+    explicit dtype(const buffer_info &info) {
+        dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format)));
+        // If info.itemsize == 0, use the value calculated from the format string
+        m_ptr = descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize()).release().ptr();
+    }
+
+    explicit dtype(const std::string &format) {
+        m_ptr = from_args(pybind11::str(format)).release().ptr();
+    }
+
+    dtype(const char *format) : dtype(std::string(format)) { }
+
+    dtype(list names, list formats, list offsets, ssize_t itemsize) {
+        dict args;
+        args["names"] = names;
+        args["formats"] = formats;
+        args["offsets"] = offsets;
+        args["itemsize"] = pybind11::int_(itemsize);
+        m_ptr = from_args(args).release().ptr();
+    }
+
+    /// This is essentially the same as calling numpy.dtype(args) in Python.
+    static dtype from_args(object args) {
+        PyObject *ptr = nullptr;
+        if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr)
+            throw error_already_set();
+        return reinterpret_steal<dtype>(ptr);
+    }
+
+    /// Return dtype associated with a C++ type.
+    template <typename T> static dtype of() {
+        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::dtype();
+    }
+
+    /// Size of the data type in bytes.
+    ssize_t itemsize() const {
+        return detail::array_descriptor_proxy(m_ptr)->elsize;
+    }
+
+    /// Returns true for structured data types.
+    bool has_fields() const {
+        return detail::array_descriptor_proxy(m_ptr)->names != nullptr;
+    }
+
+    /// Single-character type code.
+    char kind() const {
+        return detail::array_descriptor_proxy(m_ptr)->kind;
+    }
+
+private:
+    static object _dtype_from_pep3118() {
+        static PyObject *obj = module::import("numpy.core._internal")
+            .attr("_dtype_from_pep3118").cast<object>().release().ptr();
+        return reinterpret_borrow<object>(obj);
+    }
+
+    dtype strip_padding(ssize_t itemsize) {
+        // Recursively strip all void fields with empty names that are generated for
+        // padding fields (as of NumPy v1.11).
+        if (!has_fields())
+            return *this;
+
+        struct field_descr { PYBIND11_STR_TYPE name; object format; pybind11::int_ offset; };
+        std::vector<field_descr> field_descriptors;
+
+        for (auto field : attr("fields").attr("items")()) {
+            auto spec = field.cast<tuple>();
+            auto name = spec[0].cast<pybind11::str>();
+            auto format = spec[1].cast<tuple>()[0].cast<dtype>();
+            auto offset = spec[1].cast<tuple>()[1].cast<pybind11::int_>();
+            if (!len(name) && format.kind() == 'V')
+                continue;
+            field_descriptors.push_back({(PYBIND11_STR_TYPE) name, format.strip_padding(format.itemsize()), offset});
+        }
+
+        std::sort(field_descriptors.begin(), field_descriptors.end(),
+                  [](const field_descr& a, const field_descr& b) {
+                      return a.offset.cast<int>() < b.offset.cast<int>();
+                  });
+
+        list names, formats, offsets;
+        for (auto& descr : field_descriptors) {
+            names.append(descr.name);
+            formats.append(descr.format);
+            offsets.append(descr.offset);
+        }
+        return dtype(names, formats, offsets, itemsize);
+    }
+};
+
+class array : public buffer {
+public:
+    PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array)
+
+    enum {
+        c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_,
+        f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_,
+        forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_
+    };
+
+    array() : array(0, static_cast<const double *>(nullptr)) {}
+
+    using ShapeContainer = detail::any_container<ssize_t>;
+    using StridesContainer = detail::any_container<ssize_t>;
+
+    // Constructs an array taking shape/strides from arbitrary container types
+    array(const pybind11::dtype &dt, ShapeContainer shape, StridesContainer strides,
+          const void *ptr = nullptr, handle base = handle()) {
+
+        if (strides->empty())
+            *strides = c_strides(*shape, dt.itemsize());
+
+        auto ndim = shape->size();
+        if (ndim != strides->size())
+            pybind11_fail("NumPy: shape ndim doesn't match strides ndim");
+        auto descr = dt;
+
+        int flags = 0;
+        if (base && ptr) {
+            if (isinstance<array>(base))
+                /* Copy flags from base (except ownership bit) */
+                flags = reinterpret_borrow<array>(base).flags() & ~detail::npy_api::NPY_ARRAY_OWNDATA_;
+            else
+                /* Writable by default, easy to downgrade later on if needed */
+                flags = detail::npy_api::NPY_ARRAY_WRITEABLE_;
+        }
+
+        auto &api = detail::npy_api::get();
+        auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
+            api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(),
+            const_cast<void *>(ptr), flags, nullptr));
+        if (!tmp)
+            throw error_already_set();
+        if (ptr) {
+            if (base) {
+                api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr());
+            } else {
+                tmp = reinterpret_steal<object>(api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */));
+            }
+        }
+        m_ptr = tmp.release().ptr();
+    }
+
+    array(const pybind11::dtype &dt, ShapeContainer shape, const void *ptr = nullptr, handle base = handle())
+        : array(dt, std::move(shape), {}, ptr, base) { }
+
+    template <typename T, typename = detail::enable_if_t<std::is_integral<T>::value && !std::is_same<bool, T>::value>>
+    array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle())
+        : array(dt, {{count}}, ptr, base) { }
+
+    template <typename T>
+    array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle())
+        : array(pybind11::dtype::of<T>(), std::move(shape), std::move(strides), ptr, base) { }
+
+    template <typename T>
+    array(ShapeContainer shape, const T *ptr, handle base = handle())
+        : array(std::move(shape), {}, ptr, base) { }
+
+    template <typename T>
+    explicit array(ssize_t count, const T *ptr, handle base = handle()) : array({count}, {}, ptr, base) { }
+
+    explicit array(const buffer_info &info, handle base = handle())
+    : array(pybind11::dtype(info), info.shape, info.strides, info.ptr, base) { }
+
+    /// Array descriptor (dtype)
+    pybind11::dtype dtype() const {
+        return reinterpret_borrow<pybind11::dtype>(detail::array_proxy(m_ptr)->descr);
+    }
+
+    /// Total number of elements
+    ssize_t size() const {
+        return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+
+    /// Byte size of a single element
+    ssize_t itemsize() const {
+        return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize;
+    }
+
+    /// Total number of bytes
+    ssize_t nbytes() const {
+        return size() * itemsize();
+    }
+
+    /// Number of dimensions
+    ssize_t ndim() const {
+        return detail::array_proxy(m_ptr)->nd;
+    }
+
+    /// Base object
+    object base() const {
+        return reinterpret_borrow<object>(detail::array_proxy(m_ptr)->base);
+    }
+
+    /// Dimensions of the array
+    const ssize_t* shape() const {
+        return detail::array_proxy(m_ptr)->dimensions;
+    }
+
+    /// Dimension along a given axis
+    ssize_t shape(ssize_t dim) const {
+        if (dim >= ndim())
+            fail_dim_check(dim, "invalid axis");
+        return shape()[dim];
+    }
+
+    /// Strides of the array
+    const ssize_t* strides() const {
+        return detail::array_proxy(m_ptr)->strides;
+    }
+
+    /// Stride along a given axis
+    ssize_t strides(ssize_t dim) const {
+        if (dim >= ndim())
+            fail_dim_check(dim, "invalid axis");
+        return strides()[dim];
+    }
+
+    /// Return the NumPy array flags
+    int flags() const {
+        return detail::array_proxy(m_ptr)->flags;
+    }
+
+    /// If set, the array is writeable (otherwise the buffer is read-only)
+    bool writeable() const {
+        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_);
+    }
+
+    /// If set, the array owns the data (will be freed when the array is deleted)
+    bool owndata() const {
+        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_);
+    }
+
+    /// Pointer to the contained data. If index is not provided, points to the
+    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
+    template<typename... Ix> const void* data(Ix... index) const {
+        return static_cast<const void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
+    }
+
+    /// Mutable pointer to the contained data. If index is not provided, points to the
+    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
+    /// May throw if the array is not writeable.
+    template<typename... Ix> void* mutable_data(Ix... index) {
+        check_writeable();
+        return static_cast<void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
+    }
+
+    /// Byte offset from beginning of the array to a given index (full or partial).
+    /// May throw if the index would lead to out of bounds access.
+    template<typename... Ix> ssize_t offset_at(Ix... index) const {
+        if ((ssize_t) sizeof...(index) > ndim())
+            fail_dim_check(sizeof...(index), "too many indices for an array");
+        return byte_offset(ssize_t(index)...);
+    }
+
+    ssize_t offset_at() const { return 0; }
+
+    /// Item count from beginning of the array to a given index (full or partial).
+    /// May throw if the index would lead to out of bounds access.
+    template<typename... Ix> ssize_t index_at(Ix... index) const {
+        return offset_at(index...) / itemsize();
+    }
+
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(), strides(), ndim());
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking.  Unlike `mutable_unchecked()`, this does not require that the
+     * underlying array have the `writable` flag.  Use with care: the array must not be destroyed or
+     * reshaped for the duration of the returned object, and the caller must take care not to access
+     * invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
+    }
+
+    /// Return a new view with all of the dimensions of length 1 removed
+    array squeeze() {
+        auto& api = detail::npy_api::get();
+        return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
+    }
+
+    /// Resize array to given shape
+    /// If refcheck is true and more that one reference exist to this array
+    /// then resize will succeed only if it makes a reshape, i.e. original size doesn't change
+    void resize(ShapeContainer new_shape, bool refcheck = true) {
+        detail::npy_api::PyArray_Dims d = {
+            new_shape->data(), int(new_shape->size())
+        };
+        // try to resize, set ordering param to -1 cause it's not used anyway
+        object new_array = reinterpret_steal<object>(
+            detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)
+        );
+        if (!new_array) throw error_already_set();
+        if (isinstance<array>(new_array)) { *this = std::move(new_array); }
+    }
+
+    /// Ensure that the argument is a NumPy array
+    /// In case of an error, nullptr is returned and the Python error is cleared.
+    static array ensure(handle h, int ExtraFlags = 0) {
+        auto result = reinterpret_steal<array>(raw_array(h.ptr(), ExtraFlags));
+        if (!result)
+            PyErr_Clear();
+        return result;
+    }
+
+protected:
+    template<typename, typename> friend struct detail::npy_format_descriptor;
+
+    void fail_dim_check(ssize_t dim, const std::string& msg) const {
+        throw index_error(msg + ": " + std::to_string(dim) +
+                          " (ndim = " + std::to_string(ndim()) + ")");
+    }
+
+    template<typename... Ix> ssize_t byte_offset(Ix... index) const {
+        check_dimensions(index...);
+        return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
+    }
+
+    void check_writeable() const {
+        if (!writeable())
+            throw std::domain_error("array is not writeable");
+    }
+
+    // Default, C-style strides
+    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        if (ndim > 0)
+            for (size_t i = ndim - 1; i > 0; --i)
+                strides[i - 1] = strides[i] * shape[i];
+        return strides;
+    }
+
+    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = 1; i < ndim; ++i)
+            strides[i] = strides[i - 1] * shape[i - 1];
+        return strides;
+    }
+
+    template<typename... Ix> void check_dimensions(Ix... index) const {
+        check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
+    }
+
+    void check_dimensions_impl(ssize_t, const ssize_t*) const { }
+
+    template<typename... Ix> void check_dimensions_impl(ssize_t axis, const ssize_t* shape, ssize_t i, Ix... index) const {
+        if (i >= *shape) {
+            throw index_error(std::string("index ") + std::to_string(i) +
+                              " is out of bounds for axis " + std::to_string(axis) +
+                              " with size " + std::to_string(*shape));
+        }
+        check_dimensions_impl(axis + 1, shape + 1, index...);
+    }
+
+    /// Create array from any object -- always returns a new reference
+    static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) {
+        if (ptr == nullptr) {
+            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array from a nullptr");
+            return nullptr;
+        }
+        return detail::npy_api::get().PyArray_FromAny_(
+            ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+    }
+};
+
+template <typename T, int ExtraFlags = array::forcecast> class array_t : public array {
+private:
+    struct private_ctor {};
+    // Delegating constructor needed when both moving and accessing in the same constructor
+    array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base)
+        : array(std::move(shape), std::move(strides), ptr, base) {}
+public:
+    static_assert(!detail::array_info<T>::is_array, "Array types cannot be used with array_t");
+
+    using value_type = T;
+
+    array_t() : array(0, static_cast<const T *>(nullptr)) {}
+    array_t(handle h, borrowed_t) : array(h, borrowed_t{}) { }
+    array_t(handle h, stolen_t) : array(h, stolen_t{}) { }
+
+    PYBIND11_DEPRECATED("Use array_t<T>::ensure() instead")
+    array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) {
+        if (!m_ptr) PyErr_Clear();
+        if (!is_borrowed) Py_XDECREF(h.ptr());
+    }
+
+    array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
+        if (!m_ptr) throw error_already_set();
+    }
+
+    explicit array_t(const buffer_info& info, handle base = handle()) : array(info, base) { }
+
+    array_t(ShapeContainer shape, StridesContainer strides, const T *ptr = nullptr, handle base = handle())
+        : array(std::move(shape), std::move(strides), ptr, base) { }
+
+    explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
+        : array_t(private_ctor{}, std::move(shape),
+                ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()),
+                ptr, base) { }
+
+    explicit array_t(ssize_t count, const T *ptr = nullptr, handle base = handle())
+        : array({count}, {}, ptr, base) { }
+
+    constexpr ssize_t itemsize() const {
+        return sizeof(T);
+    }
+
+    template<typename... Ix> ssize_t index_at(Ix... index) const {
+        return offset_at(index...) / itemsize();
+    }
+
+    template<typename... Ix> const T* data(Ix... index) const {
+        return static_cast<const T*>(array::data(index...));
+    }
+
+    template<typename... Ix> T* mutable_data(Ix... index) {
+        return static_cast<T*>(array::mutable_data(index...));
+    }
+
+    // Reference to element at a given index
+    template<typename... Ix> const T& at(Ix... index) const {
+        if ((ssize_t) sizeof...(index) != ndim())
+            fail_dim_check(sizeof...(index), "index dimension mismatch");
+        return *(static_cast<const T*>(array::data()) + byte_offset(ssize_t(index)...) / itemsize());
+    }
+
+    // Mutable reference to element at a given index
+    template<typename... Ix> T& mutable_at(Ix... index) {
+        if ((ssize_t) sizeof...(index) != ndim())
+            fail_dim_check(sizeof...(index), "index dimension mismatch");
+        return *(static_cast<T*>(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize());
+    }
+
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        return array::mutable_unchecked<T, Dims>();
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking.  Unlike `unchecked()`, this does not require that the underlying
+     * array have the `writable` flag.  Use with care: the array must not be destroyed or reshaped
+     * for the duration of the returned object, and the caller must take care not to access invalid
+     * dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
+        return array::unchecked<T, Dims>();
+    }
+
+    /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert
+    /// it).  In case of an error, nullptr is returned and the Python error is cleared.
+    static array_t ensure(handle h) {
+        auto result = reinterpret_steal<array_t>(raw_array_t(h.ptr()));
+        if (!result)
+            PyErr_Clear();
+        return result;
+    }
+
+    static bool check_(handle h) {
+        const auto &api = detail::npy_api::get();
+        return api.PyArray_Check_(h.ptr())
+               && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, dtype::of<T>().ptr());
+    }
+
+protected:
+    /// Create array from any object -- always returns a new reference
+    static PyObject *raw_array_t(PyObject *ptr) {
+        if (ptr == nullptr) {
+            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr");
+            return nullptr;
+        }
+        return detail::npy_api::get().PyArray_FromAny_(
+            ptr, dtype::of<T>().release().ptr(), 0, 0,
+            detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+    }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+    static std::string format() {
+        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::format();
+    }
+};
+
+template <size_t N> struct format_descriptor<char[N]> {
+    static std::string format() { return std::to_string(N) + "s"; }
+};
+template <size_t N> struct format_descriptor<std::array<char, N>> {
+    static std::string format() { return std::to_string(N) + "s"; }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<std::is_enum<T>::value>> {
+    static std::string format() {
+        return format_descriptor<
+            typename std::remove_cv<typename std::underlying_type<T>::type>::type>::format();
+    }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<detail::array_info<T>::is_array>> {
+    static std::string format() {
+        using namespace detail;
+        static constexpr auto extents = _("(") + array_info<T>::extents + _(")");
+        return extents.text + format_descriptor<remove_all_extents_t<T>>::format();
+    }
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+template <typename T, int ExtraFlags>
+struct pyobject_caster<array_t<T, ExtraFlags>> {
+    using type = array_t<T, ExtraFlags>;
+
+    bool load(handle src, bool convert) {
+        if (!convert && !type::check_(src))
+            return false;
+        value = type::ensure(src);
+        return static_cast<bool>(value);
+    }
+
+    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
+        return src.inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
+};
+
+template <typename T>
+struct compare_buffer_info<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+    static bool compare(const buffer_info& b) {
+        return npy_api::get().PyArray_EquivTypes_(dtype::of<T>().ptr(), dtype(b).ptr());
+    }
+};
+
+template <typename T, typename = void>
+struct npy_format_descriptor_name;
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<std::is_integral<T>::value>> {
+    static constexpr auto name = _<std::is_same<T, bool>::value>(
+        _("bool"), _<std::is_signed<T>::value>("numpy.int", "numpy.uint") + _<sizeof(T)*8>()
+    );
+};
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr auto name = _<std::is_same<T, float>::value || std::is_same<T, double>::value>(
+        _("numpy.float") + _<sizeof(T)*8>(), _("numpy.longdouble")
+    );
+};
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<is_complex<T>::value>> {
+    static constexpr auto name = _<std::is_same<typename T::value_type, float>::value
+                                   || std::is_same<typename T::value_type, double>::value>(
+        _("numpy.complex") + _<sizeof(typename T::value_type)*16>(), _("numpy.longcomplex")
+    );
+};
+
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<satisfies_any_of<T, std::is_arithmetic, is_complex>::value>>
+    : npy_format_descriptor_name<T> {
+private:
+    // NB: the order here must match the one in common.h
+    constexpr static const int values[15] = {
+        npy_api::NPY_BOOL_,
+        npy_api::NPY_BYTE_,   npy_api::NPY_UBYTE_,   npy_api::NPY_INT16_,    npy_api::NPY_UINT16_,
+        npy_api::NPY_INT32_,  npy_api::NPY_UINT32_,  npy_api::NPY_INT64_,    npy_api::NPY_UINT64_,
+        npy_api::NPY_FLOAT_,  npy_api::NPY_DOUBLE_,  npy_api::NPY_LONGDOUBLE_,
+        npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_, npy_api::NPY_CLONGDOUBLE_
+    };
+
+public:
+    static constexpr int value = values[detail::is_fmt_numeric<T>::index];
+
+    static pybind11::dtype dtype() {
+        if (auto ptr = npy_api::get().PyArray_DescrFromType_(value))
+            return reinterpret_steal<pybind11::dtype>(ptr);
+        pybind11_fail("Unsupported buffer format!");
+    }
+};
+
+#define PYBIND11_DECL_CHAR_FMT \
+    static constexpr auto name = _("S") + _<N>(); \
+    static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); }
+template <size_t N> struct npy_format_descriptor<char[N]> { PYBIND11_DECL_CHAR_FMT };
+template <size_t N> struct npy_format_descriptor<std::array<char, N>> { PYBIND11_DECL_CHAR_FMT };
+#undef PYBIND11_DECL_CHAR_FMT
+
+template<typename T> struct npy_format_descriptor<T, enable_if_t<array_info<T>::is_array>> {
+private:
+    using base_descr = npy_format_descriptor<typename array_info<T>::type>;
+public:
+    static_assert(!array_info<T>::is_empty, "Zero-sized arrays are not supported");
+
+    static constexpr auto name = _("(") + array_info<T>::extents + _(")") + base_descr::name;
+    static pybind11::dtype dtype() {
+        list shape;
+        array_info<T>::append_extents(shape);
+        return pybind11::dtype::from_args(pybind11::make_tuple(base_descr::dtype(), shape));
+    }
+};
+
+template<typename T> struct npy_format_descriptor<T, enable_if_t<std::is_enum<T>::value>> {
+private:
+    using base_descr = npy_format_descriptor<typename std::underlying_type<T>::type>;
+public:
+    static constexpr auto name = base_descr::name;
+    static pybind11::dtype dtype() { return base_descr::dtype(); }
+};
+
+struct field_descriptor {
+    const char *name;
+    ssize_t offset;
+    ssize_t size;
+    std::string format;
+    dtype descr;
+};
+
+inline PYBIND11_NOINLINE void register_structured_dtype(
+    any_container<field_descriptor> fields,
+    const std::type_info& tinfo, ssize_t itemsize,
+    bool (*direct_converter)(PyObject *, void *&)) {
+
+    auto& numpy_internals = get_numpy_internals();
+    if (numpy_internals.get_type_info(tinfo, false))
+        pybind11_fail("NumPy: dtype is already registered");
+
+    // Use ordered fields because order matters as of NumPy 1.14:
+    // https://docs.scipy.org/doc/numpy/release.html#multiple-field-indexing-assignment-of-structured-arrays
+    std::vector<field_descriptor> ordered_fields(std::move(fields));
+    std::sort(ordered_fields.begin(), ordered_fields.end(),
+        [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; });
+
+    list names, formats, offsets;
+    for (auto& field : ordered_fields) {
+        if (!field.descr)
+            pybind11_fail(std::string("NumPy: unsupported field dtype: `") +
+                            field.name + "` @ " + tinfo.name());
+        names.append(PYBIND11_STR_TYPE(field.name));
+        formats.append(field.descr);
+        offsets.append(pybind11::int_(field.offset));
+    }
+    auto dtype_ptr = pybind11::dtype(names, formats, offsets, itemsize).release().ptr();
+
+    // There is an existing bug in NumPy (as of v1.11): trailing bytes are
+    // not encoded explicitly into the format string. This will supposedly
+    // get fixed in v1.12; for further details, see these:
+    // - https://github.com/numpy/numpy/issues/7797
+    // - https://github.com/numpy/numpy/pull/7798
+    // Because of this, we won't use numpy's logic to generate buffer format
+    // strings and will just do it ourselves.
+    ssize_t offset = 0;
+    std::ostringstream oss;
+    // mark the structure as unaligned with '^', because numpy and C++ don't
+    // always agree about alignment (particularly for complex), and we're
+    // explicitly listing all our padding. This depends on none of the fields
+    // overriding the endianness. Putting the ^ in front of individual fields
+    // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049
+    oss << "^T{";
+    for (auto& field : ordered_fields) {
+        if (field.offset > offset)
+            oss << (field.offset - offset) << 'x';
+        oss << field.format << ':' << field.name << ':';
+        offset = field.offset + field.size;
+    }
+    if (itemsize > offset)
+        oss << (itemsize - offset) << 'x';
+    oss << '}';
+    auto format_str = oss.str();
+
+    // Sanity check: verify that NumPy properly parses our buffer format string
+    auto& api = npy_api::get();
+    auto arr =  array(buffer_info(nullptr, itemsize, format_str, 1));
+    if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr()))
+        pybind11_fail("NumPy: invalid buffer descriptor!");
+
+    auto tindex = std::type_index(tinfo);
+    numpy_internals.registered_dtypes[tindex] = { dtype_ptr, format_str };
+    get_internals().direct_conversions[tindex].push_back(direct_converter);
+}
+
+template <typename T, typename SFINAE> struct npy_format_descriptor {
+    static_assert(is_pod_struct<T>::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype");
+
+    static constexpr auto name = make_caster<T>::name;
+
+    static pybind11::dtype dtype() {
+        return reinterpret_borrow<pybind11::dtype>(dtype_ptr());
+    }
+
+    static std::string format() {
+        static auto format_str = get_numpy_internals().get_type_info<T>(true)->format_str;
+        return format_str;
+    }
+
+    static void register_dtype(any_container<field_descriptor> fields) {
+        register_structured_dtype(std::move(fields), typeid(typename std::remove_cv<T>::type),
+                                  sizeof(T), &direct_converter);
+    }
+
+private:
+    static PyObject* dtype_ptr() {
+        static PyObject* ptr = get_numpy_internals().get_type_info<T>(true)->dtype_ptr;
+        return ptr;
+    }
+
+    static bool direct_converter(PyObject *obj, void*& value) {
+        auto& api = npy_api::get();
+        if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_))
+            return false;
+        if (auto descr = reinterpret_steal<object>(api.PyArray_DescrFromScalar_(obj))) {
+            if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) {
+                value = ((PyVoidScalarObject_Proxy *) obj)->obval;
+                return true;
+            }
+        }
+        return false;
+    }
+};
+
+#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code)
+# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0)
+# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0)
+#else
+
+#define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name)                                          \
+    ::pybind11::detail::field_descriptor {                                                    \
+        Name, offsetof(T, Field), sizeof(decltype(std::declval<T>().Field)),                  \
+        ::pybind11::format_descriptor<decltype(std::declval<T>().Field)>::format(),           \
+        ::pybind11::detail::npy_format_descriptor<decltype(std::declval<T>().Field)>::dtype() \
+    }
+
+// Extract name, offset and format descriptor for a struct field
+#define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field)
+
+// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro
+// (C) William Swanson, Paul Fultz
+#define PYBIND11_EVAL0(...) __VA_ARGS__
+#define PYBIND11_EVAL1(...) PYBIND11_EVAL0 (PYBIND11_EVAL0 (PYBIND11_EVAL0 (__VA_ARGS__)))
+#define PYBIND11_EVAL2(...) PYBIND11_EVAL1 (PYBIND11_EVAL1 (PYBIND11_EVAL1 (__VA_ARGS__)))
+#define PYBIND11_EVAL3(...) PYBIND11_EVAL2 (PYBIND11_EVAL2 (PYBIND11_EVAL2 (__VA_ARGS__)))
+#define PYBIND11_EVAL4(...) PYBIND11_EVAL3 (PYBIND11_EVAL3 (PYBIND11_EVAL3 (__VA_ARGS__)))
+#define PYBIND11_EVAL(...)  PYBIND11_EVAL4 (PYBIND11_EVAL4 (PYBIND11_EVAL4 (__VA_ARGS__)))
+#define PYBIND11_MAP_END(...)
+#define PYBIND11_MAP_OUT
+#define PYBIND11_MAP_COMMA ,
+#define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END
+#define PYBIND11_MAP_NEXT0(test, next, ...) next PYBIND11_MAP_OUT
+#define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0 (test, next, 0)
+#define PYBIND11_MAP_NEXT(test, next)  PYBIND11_MAP_NEXT1 (PYBIND11_MAP_GET_END test, next)
+#if defined(_MSC_VER) && !defined(__clang__) // MSVC is not as eager to expand macros, hence this workaround
+#define PYBIND11_MAP_LIST_NEXT1(test, next) \
+    PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0))
+#else
+#define PYBIND11_MAP_LIST_NEXT1(test, next) \
+    PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)
+#endif
+#define PYBIND11_MAP_LIST_NEXT(test, next) \
+    PYBIND11_MAP_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next)
+#define PYBIND11_MAP_LIST0(f, t, x, peek, ...) \
+    f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST1) (f, t, peek, __VA_ARGS__)
+#define PYBIND11_MAP_LIST1(f, t, x, peek, ...) \
+    f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST0) (f, t, peek, __VA_ARGS__)
+// PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ...
+#define PYBIND11_MAP_LIST(f, t, ...) \
+    PYBIND11_EVAL (PYBIND11_MAP_LIST1 (f, t, __VA_ARGS__, (), 0))
+
+#define PYBIND11_NUMPY_DTYPE(Type, ...) \
+    ::pybind11::detail::npy_format_descriptor<Type>::register_dtype \
+        (::std::vector<::pybind11::detail::field_descriptor> \
+         {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)})
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define PYBIND11_MAP2_LIST_NEXT1(test, next) \
+    PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0))
+#else
+#define PYBIND11_MAP2_LIST_NEXT1(test, next) \
+    PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)
+#endif
+#define PYBIND11_MAP2_LIST_NEXT(test, next) \
+    PYBIND11_MAP2_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next)
+#define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...) \
+    f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST1) (f, t, peek, __VA_ARGS__)
+#define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...) \
+    f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST0) (f, t, peek, __VA_ARGS__)
+// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ...
+#define PYBIND11_MAP2_LIST(f, t, ...) \
+    PYBIND11_EVAL (PYBIND11_MAP2_LIST1 (f, t, __VA_ARGS__, (), 0))
+
+#define PYBIND11_NUMPY_DTYPE_EX(Type, ...) \
+    ::pybind11::detail::npy_format_descriptor<Type>::register_dtype \
+        (::std::vector<::pybind11::detail::field_descriptor> \
+         {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)})
+
+#endif // __CLION_IDE__
+
+template  <class T>
+using array_iterator = typename std::add_pointer<T>::type;
+
+template <class T>
+array_iterator<T> array_begin(const buffer_info& buffer) {
+    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr));
+}
+
+template <class T>
+array_iterator<T> array_end(const buffer_info& buffer) {
+    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr) + buffer.size);
+}
+
+class common_iterator {
+public:
+    using container_type = std::vector<ssize_t>;
+    using value_type = container_type::value_type;
+    using size_type = container_type::size_type;
+
+    common_iterator() : p_ptr(0), m_strides() {}
+
+    common_iterator(void* ptr, const container_type& strides, const container_type& shape)
+        : p_ptr(reinterpret_cast<char*>(ptr)), m_strides(strides.size()) {
+        m_strides.back() = static_cast<value_type>(strides.back());
+        for (size_type i = m_strides.size() - 1; i != 0; --i) {
+            size_type j = i - 1;
+            value_type s = static_cast<value_type>(shape[i]);
+            m_strides[j] = strides[j] + m_strides[i] - strides[i] * s;
+        }
+    }
+
+    void increment(size_type dim) {
+        p_ptr += m_strides[dim];
+    }
+
+    void* data() const {
+        return p_ptr;
+    }
+
+private:
+    char* p_ptr;
+    container_type m_strides;
+};
+
+template <size_t N> class multi_array_iterator {
+public:
+    using container_type = std::vector<ssize_t>;
+
+    multi_array_iterator(const std::array<buffer_info, N> &buffers,
+                         const container_type &shape)
+        : m_shape(shape.size()), m_index(shape.size(), 0),
+          m_common_iterator() {
+
+        // Manual copy to avoid conversion warning if using std::copy
+        for (size_t i = 0; i < shape.size(); ++i)
+            m_shape[i] = shape[i];
+
+        container_type strides(shape.size());
+        for (size_t i = 0; i < N; ++i)
+            init_common_iterator(buffers[i], shape, m_common_iterator[i], strides);
+    }
+
+    multi_array_iterator& operator++() {
+        for (size_t j = m_index.size(); j != 0; --j) {
+            size_t i = j - 1;
+            if (++m_index[i] != m_shape[i]) {
+                increment_common_iterator(i);
+                break;
+            } else {
+                m_index[i] = 0;
+            }
+        }
+        return *this;
+    }
+
+    template <size_t K, class T = void> T* data() const {
+        return reinterpret_cast<T*>(m_common_iterator[K].data());
+    }
+
+private:
+
+    using common_iter = common_iterator;
+
+    void init_common_iterator(const buffer_info &buffer,
+                              const container_type &shape,
+                              common_iter &iterator,
+                              container_type &strides) {
+        auto buffer_shape_iter = buffer.shape.rbegin();
+        auto buffer_strides_iter = buffer.strides.rbegin();
+        auto shape_iter = shape.rbegin();
+        auto strides_iter = strides.rbegin();
+
+        while (buffer_shape_iter != buffer.shape.rend()) {
+            if (*shape_iter == *buffer_shape_iter)
+                *strides_iter = *buffer_strides_iter;
+            else
+                *strides_iter = 0;
+
+            ++buffer_shape_iter;
+            ++buffer_strides_iter;
+            ++shape_iter;
+            ++strides_iter;
+        }
+
+        std::fill(strides_iter, strides.rend(), 0);
+        iterator = common_iter(buffer.ptr, strides, shape);
+    }
+
+    void increment_common_iterator(size_t dim) {
+        for (auto &iter : m_common_iterator)
+            iter.increment(dim);
+    }
+
+    container_type m_shape;
+    container_type m_index;
+    std::array<common_iter, N> m_common_iterator;
+};
+
+enum class broadcast_trivial { non_trivial, c_trivial, f_trivial };
+
+// Populates the shape and number of dimensions for the set of buffers.  Returns a broadcast_trivial
+// enum value indicating whether the broadcast is "trivial"--that is, has each buffer being either a
+// singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage
+// buffer; returns `non_trivial` otherwise.
+template <size_t N>
+broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers, ssize_t &ndim, std::vector<ssize_t> &shape) {
+    ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) {
+        return std::max(res, buf.ndim);
+    });
+
+    shape.clear();
+    shape.resize((size_t) ndim, 1);
+
+    // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1 or
+    // the full size).
+    for (size_t i = 0; i < N; ++i) {
+        auto res_iter = shape.rbegin();
+        auto end = buffers[i].shape.rend();
+        for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; ++shape_iter, ++res_iter) {
+            const auto &dim_size_in = *shape_iter;
+            auto &dim_size_out = *res_iter;
+
+            // Each input dimension can either be 1 or `n`, but `n` values must match across buffers
+            if (dim_size_out == 1)
+                dim_size_out = dim_size_in;
+            else if (dim_size_in != 1 && dim_size_in != dim_size_out)
+                pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!");
+        }
+    }
+
+    bool trivial_broadcast_c = true;
+    bool trivial_broadcast_f = true;
+    for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) {
+        if (buffers[i].size == 1)
+            continue;
+
+        // Require the same number of dimensions:
+        if (buffers[i].ndim != ndim)
+            return broadcast_trivial::non_trivial;
+
+        // Require all dimensions be full-size:
+        if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin()))
+            return broadcast_trivial::non_trivial;
+
+        // Check for C contiguity (but only if previous inputs were also C contiguous)
+        if (trivial_broadcast_c) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.crend();
+            for (auto shape_iter = buffers[i].shape.crbegin(), stride_iter = buffers[i].strides.crbegin();
+                    trivial_broadcast_c && shape_iter != end; ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter)
+                    expect_stride *= *shape_iter;
+                else
+                    trivial_broadcast_c = false;
+            }
+        }
+
+        // Check for Fortran contiguity (if previous inputs were also F contiguous)
+        if (trivial_broadcast_f) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.cend();
+            for (auto shape_iter = buffers[i].shape.cbegin(), stride_iter = buffers[i].strides.cbegin();
+                    trivial_broadcast_f && shape_iter != end; ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter)
+                    expect_stride *= *shape_iter;
+                else
+                    trivial_broadcast_f = false;
+            }
+        }
+    }
+
+    return
+        trivial_broadcast_c ? broadcast_trivial::c_trivial :
+        trivial_broadcast_f ? broadcast_trivial::f_trivial :
+        broadcast_trivial::non_trivial;
+}
+
+template <typename T>
+struct vectorize_arg {
+    static_assert(!std::is_rvalue_reference<T>::value, "Functions with rvalue reference arguments cannot be vectorized");
+    // The wrapped function gets called with this type:
+    using call_type = remove_reference_t<T>;
+    // Is this a vectorized argument?
+    static constexpr bool vectorize =
+        satisfies_any_of<call_type, std::is_arithmetic, is_complex, std::is_pod>::value &&
+        satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
+        (!std::is_reference<T>::value ||
+         (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
+    // Accept this type: an array for vectorized types, otherwise the type as-is:
+    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+};
+
+template <typename Func, typename Return, typename... Args>
+struct vectorize_helper {
+private:
+    static constexpr size_t N = sizeof...(Args);
+    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
+    static_assert(NVectorized >= 1,
+            "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
+
+public:
+    template <typename T>
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
+
+    object operator()(typename vectorize_arg<Args>::type... args) {
+        return run(args...,
+                   make_index_sequence<N>(),
+                   select_indices<vectorize_arg<Args>::vectorize...>(),
+                   make_index_sequence<NVectorized>());
+    }
+
+private:
+    remove_reference_t<Func> f;
+
+    // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag
+    // when arg_call_types is manually inlined.
+    using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
+    template <size_t Index> using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
+
+    // Runs a vectorized function given arguments tuple and three index sequences:
+    //     - Index is the full set of 0 ... (N-1) argument indices;
+    //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
+    //       vectorized arguments (anything not in this sequence is passed through)
+    //     - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that
+    //       we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
+    //       index BIndex in the array).
+    template <size_t... Index, size_t... VIndex, size_t... BIndex> object run(
+            typename vectorize_arg<Args>::type &...args,
+            index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq, index_sequence<BIndex...> bi_seq) {
+
+        // Pointers to values the function was called with; the vectorized ones set here will start
+        // out as array_t<T> pointers, but they will be changed them to T pointers before we make
+        // call the wrapped function.  Non-vectorized pointers are left as-is.
+        std::array<void *, N> params{{ &args... }};
+
+        // The array of `buffer_info`s of vectorized arguments:
+        std::array<buffer_info, NVectorized> buffers{{ reinterpret_cast<array *>(params[VIndex])->request()... }};
+
+        /* Determine dimensions parameters of output array */
+        ssize_t nd = 0;
+        std::vector<ssize_t> shape(0);
+        auto trivial = broadcast(buffers, nd, shape);
+        size_t ndim = (size_t) nd;
+
+        size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+
+        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
+        // not wrapped in an array).
+        if (size == 1 && ndim == 0) {
+            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
+        }
+
+        array_t<Return> result;
+        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
+        else result = array_t<Return>(shape);
+
+        if (size == 0) return std::move(result);
+
+        /* Call the function */
+        if (trivial == broadcast_trivial::non_trivial)
+            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+        else
+            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
+
+        return std::move(result);
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                       std::array<void *, N> &params,
+                       Return *out,
+                       size_t size,
+                       index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        // Initialize an array of mutable byte references and sizes with references set to the
+        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
+        // (except for singletons, which get an increment of 0).
+        std::array<std::pair<unsigned char *&, const size_t>, NVectorized> vecparams{{
+            std::pair<unsigned char *&, const size_t>(
+                    reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
+                    buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>)
+            )...
+        }};
+
+        for (size_t i = 0; i < size; ++i) {
+            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            for (auto &x : vecparams) x.first += x.second;
+        }
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
+                         std::array<void *, N> &params,
+                         array_t<Return> &output_array,
+                         index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        buffer_info output = output_array.request();
+        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
+
+        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
+             iter != end;
+             ++iter, ++input_iter) {
+            PYBIND11_EXPAND_SIDE_EFFECTS((
+                params[VIndex] = input_iter.template data<BIndex>()
+            ));
+            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
+        }
+    }
+};
+
+template <typename Func, typename Return, typename... Args>
+vectorize_helper<Func, Return, Args...>
+vectorize_extractor(const Func &f, Return (*) (Args ...)) {
+    return detail::vectorize_helper<Func, Return, Args...>(f);
+}
+
+template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
+    static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor<T>::name + _("]");
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+// Vanilla pointer vectorizer:
+template <typename Return, typename... Args>
+detail::vectorize_helper<Return (*)(Args...), Return, Args...>
+vectorize(Return (*f) (Args ...)) {
+    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
+}
+
+// lambda vectorizer:
+template <typename Func, detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
+auto vectorize(Func &&f) -> decltype(
+        detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr)) {
+    return detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr);
+}
+
+// Vectorize a class method (non-const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())), Return, Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...)) {
+    return Helper(std::mem_fn(f));
+}
+
+// Vectorize a class method (const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())), Return, const Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...) const) {
+    return Helper(std::mem_fn(f));
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/pybind11/include/pybind11/operators.h b/pybind11/include/pybind11/operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..086cb4cfd838797767dd3b3caa275bdb4348fc8e
--- /dev/null
+++ b/pybind11/include/pybind11/operators.h
@@ -0,0 +1,173 @@
+/*
+    pybind11/operator.h: Metatemplates for operator overloading
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+#  pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
+#elif defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// Enumeration with all supported operator types
+enum op_id : int {
+    op_add, op_sub, op_mul, op_div, op_mod, op_divmod, op_pow, op_lshift,
+    op_rshift, op_and, op_xor, op_or, op_neg, op_pos, op_abs, op_invert,
+    op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le,
+    op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift,
+    op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero,
+    op_repr, op_truediv, op_itruediv, op_hash
+};
+
+enum op_type : int {
+    op_l, /* base type on left */
+    op_r, /* base type on right */
+    op_u  /* unary operator */
+};
+
+struct self_t { };
+static const self_t self = self_t();
+
+/// Type for an unused type slot
+struct undefined_t { };
+
+/// Don't warn about an unused variable
+inline self_t __self() { return self; }
+
+/// base template of operator implementations
+template <op_id, op_type, typename B, typename L, typename R> struct op_impl { };
+
+/// Operator implementation generator
+template <op_id id, op_type ot, typename L, typename R> struct op_ {
+    template <typename Class, typename... Extra> void execute(Class &cl, const Extra&... extra) const {
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
+        cl.def(op::name(), &op::execute, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
+                    &op::execute, is_operator(), extra...);
+        #endif
+    }
+    template <typename Class, typename... Extra> void execute_cast(Class &cl, const Extra&... extra) const {
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
+        cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
+                    &op::execute, is_operator(), extra...);
+        #endif
+    }
+};
+
+#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr)                                    \
+template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
+    static char const* name() { return "__" #id "__"; }                                \
+    static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); }   \
+    static B execute_cast(const L &l, const R &r) { return B(expr); }                  \
+};                                                                                     \
+template <typename B, typename L, typename R> struct op_impl<op_##id, op_r, B, L, R> { \
+    static char const* name() { return "__" #rid "__"; }                               \
+    static auto execute(const R &r, const L &l) -> decltype(expr) { return (expr); }   \
+    static B execute_cast(const R &r, const L &l) { return B(expr); }                  \
+};                                                                                     \
+inline op_<op_##id, op_l, self_t, self_t> op(const self_t &, const self_t &) {         \
+    return op_<op_##id, op_l, self_t, self_t>();                                       \
+}                                                                                      \
+template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {    \
+    return op_<op_##id, op_l, self_t, T>();                                            \
+}                                                                                      \
+template <typename T> op_<op_##id, op_r, T, self_t> op(const T &, const self_t &) {    \
+    return op_<op_##id, op_r, T, self_t>();                                            \
+}
+
+#define PYBIND11_INPLACE_OPERATOR(id, op, expr)                                        \
+template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
+    static char const* name() { return "__" #id "__"; }                                \
+    static auto execute(L &l, const R &r) -> decltype(expr) { return expr; }           \
+    static B execute_cast(L &l, const R &r) { return B(expr); }                        \
+};                                                                                     \
+template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {    \
+    return op_<op_##id, op_l, self_t, T>();                                            \
+}
+
+#define PYBIND11_UNARY_OPERATOR(id, op, expr)                                          \
+template <typename B, typename L> struct op_impl<op_##id, op_u, B, L, undefined_t> {   \
+    static char const* name() { return "__" #id "__"; }                                \
+    static auto execute(const L &l) -> decltype(expr) { return expr; }                 \
+    static B execute_cast(const L &l) { return B(expr); }                              \
+};                                                                                     \
+inline op_<op_##id, op_u, self_t, undefined_t> op(const self_t &) {                    \
+    return op_<op_##id, op_u, self_t, undefined_t>();                                  \
+}
+
+PYBIND11_BINARY_OPERATOR(sub,       rsub,         operator-,    l - r)
+PYBIND11_BINARY_OPERATOR(add,       radd,         operator+,    l + r)
+PYBIND11_BINARY_OPERATOR(mul,       rmul,         operator*,    l * r)
+PYBIND11_BINARY_OPERATOR(truediv,   rtruediv,     operator/,    l / r)
+PYBIND11_BINARY_OPERATOR(mod,       rmod,         operator%,    l % r)
+PYBIND11_BINARY_OPERATOR(lshift,    rlshift,      operator<<,   l << r)
+PYBIND11_BINARY_OPERATOR(rshift,    rrshift,      operator>>,   l >> r)
+PYBIND11_BINARY_OPERATOR(and,       rand,         operator&,    l & r)
+PYBIND11_BINARY_OPERATOR(xor,       rxor,         operator^,    l ^ r)
+PYBIND11_BINARY_OPERATOR(eq,        eq,           operator==,   l == r)
+PYBIND11_BINARY_OPERATOR(ne,        ne,           operator!=,   l != r)
+PYBIND11_BINARY_OPERATOR(or,        ror,          operator|,    l | r)
+PYBIND11_BINARY_OPERATOR(gt,        lt,           operator>,    l > r)
+PYBIND11_BINARY_OPERATOR(ge,        le,           operator>=,   l >= r)
+PYBIND11_BINARY_OPERATOR(lt,        gt,           operator<,    l < r)
+PYBIND11_BINARY_OPERATOR(le,        ge,           operator<=,   l <= r)
+//PYBIND11_BINARY_OPERATOR(pow,       rpow,         pow,          std::pow(l,  r))
+PYBIND11_INPLACE_OPERATOR(iadd,     operator+=,   l += r)
+PYBIND11_INPLACE_OPERATOR(isub,     operator-=,   l -= r)
+PYBIND11_INPLACE_OPERATOR(imul,     operator*=,   l *= r)
+PYBIND11_INPLACE_OPERATOR(itruediv, operator/=,   l /= r)
+PYBIND11_INPLACE_OPERATOR(imod,     operator%=,   l %= r)
+PYBIND11_INPLACE_OPERATOR(ilshift,  operator<<=,  l <<= r)
+PYBIND11_INPLACE_OPERATOR(irshift,  operator>>=,  l >>= r)
+PYBIND11_INPLACE_OPERATOR(iand,     operator&=,   l &= r)
+PYBIND11_INPLACE_OPERATOR(ixor,     operator^=,   l ^= r)
+PYBIND11_INPLACE_OPERATOR(ior,      operator|=,   l |= r)
+PYBIND11_UNARY_OPERATOR(neg,        operator-,    -l)
+PYBIND11_UNARY_OPERATOR(pos,        operator+,    +l)
+// WARNING: This usage of `abs` should only be done for existing STL overloads.
+// Adding overloads directly in to the `std::` namespace is advised against:
+// https://en.cppreference.com/w/cpp/language/extending_std
+PYBIND11_UNARY_OPERATOR(abs,        abs,          std::abs(l))
+PYBIND11_UNARY_OPERATOR(hash,       hash,         std::hash<L>()(l))
+PYBIND11_UNARY_OPERATOR(invert,     operator~,    (~l))
+PYBIND11_UNARY_OPERATOR(bool,       operator!,    !!l)
+PYBIND11_UNARY_OPERATOR(int,        int_,         (int) l)
+PYBIND11_UNARY_OPERATOR(float,      float_,       (double) l)
+
+#undef PYBIND11_BINARY_OPERATOR
+#undef PYBIND11_INPLACE_OPERATOR
+#undef PYBIND11_UNARY_OPERATOR
+PYBIND11_NAMESPACE_END(detail)
+
+using detail::self;
+// Add named operators so that they are accessible via `py::`.
+using detail::hash;
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pybind11/include/pybind11/options.h b/pybind11/include/pybind11/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..d74db1c68dddb3436cc0fb2674a6ef32ac77d5fd
--- /dev/null
+++ b/pybind11/include/pybind11/options.h
@@ -0,0 +1,65 @@
+/*
+    pybind11/options.h: global settings that are configurable at runtime.
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+class options {
+public:
+
+    // Default RAII constructor, which leaves settings as they currently are.
+    options() : previous_state(global_state()) {}
+
+    // Class is non-copyable.
+    options(const options&) = delete;
+    options& operator=(const options&) = delete;
+
+    // Destructor, which restores settings that were in effect before.
+    ~options() {
+        global_state() = previous_state;
+    }
+
+    // Setter methods (affect the global state):
+
+    options& disable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = false; return *this; }
+
+    options& enable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = true; return *this; }
+
+    options& disable_function_signatures() & { global_state().show_function_signatures = false; return *this; }
+
+    options& enable_function_signatures() & { global_state().show_function_signatures = true; return *this; }
+
+    // Getter methods (return the global state):
+
+    static bool show_user_defined_docstrings() { return global_state().show_user_defined_docstrings; }
+
+    static bool show_function_signatures() { return global_state().show_function_signatures; }
+
+    // This type is not meant to be allocated on the heap.
+    void* operator new(size_t) = delete;
+
+private:
+
+    struct state {
+        bool show_user_defined_docstrings = true;  //< Include user-supplied texts in docstrings.
+        bool show_function_signatures = true;      //< Include auto-generated function signatures in docstrings.
+    };
+
+    static state &global_state() {
+        static state instance;
+        return instance;
+    }
+
+    state previous_state;
+};
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/pybind11.h b/pybind11/include/pybind11/pybind11.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a7d7b88495afddabff7f9604c94e828eb780152
--- /dev/null
+++ b/pybind11/include/pybind11/pybind11.h
@@ -0,0 +1,2235 @@
+/*
+    pybind11/pybind11.h: Main header file of the C++11 python
+    binding generator library
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#if defined(__INTEL_COMPILER)
+#  pragma warning push
+#  pragma warning disable 68    // integer conversion resulted in a change of sign
+#  pragma warning disable 186   // pointless comparison of unsigned integer with zero
+#  pragma warning disable 878   // incompatible exception specifications
+#  pragma warning disable 1334  // the "template" keyword used for syntactic disambiguation may only be used within a template
+#  pragma warning disable 1682  // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
+#  pragma warning disable 1786  // function "strdup" was declared deprecated
+#  pragma warning disable 1875  // offsetof applied to non-POD (Plain Old Data) types is nonstandard
+#  pragma warning disable 2196  // warning #2196: routine is both "inline" and "noinline"
+#elif defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#  pragma warning(disable: 4512) // warning C4512: Assignment operator was implicitly defined as deleted
+#  pragma warning(disable: 4800) // warning C4800: 'int': forcing value to bool 'true' or 'false' (performance warning)
+#  pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name
+#  pragma warning(disable: 4702) // warning C4702: unreachable code
+#  pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
+#elif defined(__GNUG__) && !defined(__clang__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
+#  pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#  pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#  pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#  pragma GCC diagnostic ignored "-Wattributes"
+#  if __GNUC__ >= 7
+#    pragma GCC diagnostic ignored "-Wnoexcept-type"
+#  endif
+#endif
+
+#include "attr.h"
+#include "options.h"
+#include "detail/class.h"
+#include "detail/init.h"
+
+#if defined(__GNUG__) && !defined(__clang__)
+#  include <cxxabi.h>
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object
+class cpp_function : public function {
+public:
+    cpp_function() { }
+    cpp_function(std::nullptr_t) { }
+
+    /// Construct a cpp_function from a vanilla function pointer
+    template <typename Return, typename... Args, typename... Extra>
+    cpp_function(Return (*f)(Args...), const Extra&... extra) {
+        initialize(f, f, extra...);
+    }
+
+    /// Construct a cpp_function from a lambda function (possibly with internal state)
+    template <typename Func, typename... Extra,
+              typename = detail::enable_if_t<detail::is_lambda<Func>::value>>
+    cpp_function(Func &&f, const Extra&... extra) {
+        initialize(std::forward<Func>(f),
+                   (detail::function_signature_t<Func> *) nullptr, extra...);
+    }
+
+    /// Construct a cpp_function from a class method (non-const, no ref-qualifier)
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) {
+        initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
+                   (Return (*) (Class *, Arg...)) nullptr, extra...);
+    }
+
+    /// Construct a cpp_function from a class method (non-const, lvalue ref-qualifier)
+    /// A copy of the overload for non-const functions without explicit ref-qualifier
+    /// but with an added `&`.
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...)&, const Extra&... extra) {
+        initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+                   (Return (*) (Class *, Arg...)) nullptr, extra...);
+    }
+
+    /// Construct a cpp_function from a class method (const, no ref-qualifier)
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) {
+        initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
+                   (Return (*)(const Class *, Arg ...)) nullptr, extra...);
+    }
+
+    /// Construct a cpp_function from a class method (const, lvalue ref-qualifier)
+    /// A copy of the overload for const functions without explicit ref-qualifier
+    /// but with an added `&`.
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...) const&, const Extra&... extra) {
+        initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+                   (Return (*)(const Class *, Arg ...)) nullptr, extra...);
+    }
+
+    /// Return the function name
+    object name() const { return attr("__name__"); }
+
+protected:
+    /// Space optimization: don't inline this frequently instantiated fragment
+    PYBIND11_NOINLINE detail::function_record *make_function_record() {
+        return new detail::function_record();
+    }
+
+    /// Special internal constructor for functors, lambda functions, etc.
+    template <typename Func, typename Return, typename... Args, typename... Extra>
+    void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) {
+        using namespace detail;
+        struct capture { remove_reference_t<Func> f; };
+
+        /* Store the function including any extra state it might have (e.g. a lambda capture object) */
+        auto rec = make_function_record();
+
+        /* Store the capture object directly in the function record if there is enough space */
+        if (sizeof(capture) <= sizeof(rec->data)) {
+            /* Without these pragmas, GCC warns that there might not be
+               enough space to use the placement new operator. However, the
+               'if' statement above ensures that this is the case. */
+#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wplacement-new"
+#endif
+            new ((capture *) &rec->data) capture { std::forward<Func>(f) };
+#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
+#  pragma GCC diagnostic pop
+#endif
+            if (!std::is_trivially_destructible<Func>::value)
+                rec->free_data = [](function_record *r) { ((capture *) &r->data)->~capture(); };
+        } else {
+            rec->data[0] = new capture { std::forward<Func>(f) };
+            rec->free_data = [](function_record *r) { delete ((capture *) r->data[0]); };
+        }
+
+        /* Type casters for the function arguments and return value */
+        using cast_in = argument_loader<Args...>;
+        using cast_out = make_caster<
+            conditional_t<std::is_void<Return>::value, void_type, Return>
+        >;
+
+        static_assert(expected_num_args<Extra...>(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs),
+                      "The number of argument annotations does not match the number of function arguments");
+
+        /* Dispatch code which converts function arguments and performs the actual function call */
+        rec->impl = [](function_call &call) -> handle {
+            cast_in args_converter;
+
+            /* Try to cast the function arguments into the C++ domain */
+            if (!args_converter.load_args(call))
+                return PYBIND11_TRY_NEXT_OVERLOAD;
+
+            /* Invoke call policy pre-call hook */
+            process_attributes<Extra...>::precall(call);
+
+            /* Get a pointer to the capture object */
+            auto data = (sizeof(capture) <= sizeof(call.func.data)
+                         ? &call.func.data : call.func.data[0]);
+            capture *cap = const_cast<capture *>(reinterpret_cast<const capture *>(data));
+
+            /* Override policy for rvalues -- usually to enforce rvp::move on an rvalue */
+            return_value_policy policy = return_value_policy_override<Return>::policy(call.func.policy);
+
+            /* Function scope guard -- defaults to the compile-to-nothing `void_type` */
+            using Guard = extract_guard_t<Extra...>;
+
+            /* Perform the function call */
+            handle result = cast_out::cast(
+                std::move(args_converter).template call<Return, Guard>(cap->f), policy, call.parent);
+
+            /* Invoke call policy post-call hook */
+            process_attributes<Extra...>::postcall(call, result);
+
+            return result;
+        };
+
+        /* Process any user-provided function attributes */
+        process_attributes<Extra...>::init(extra..., rec);
+
+        {
+            constexpr bool has_kwonly_args = any_of<std::is_same<kwonly, Extra>...>::value,
+                           has_args = any_of<std::is_same<args, Args>...>::value,
+                           has_arg_annotations = any_of<is_keyword<Extra>...>::value;
+            static_assert(has_arg_annotations || !has_kwonly_args, "py::kwonly requires the use of argument annotations");
+            static_assert(!(has_args && has_kwonly_args), "py::kwonly cannot be combined with a py::args argument");
+        }
+
+        /* Generate a readable signature describing the function's arguments and return value types */
+        static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name;
+        PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types();
+
+        /* Register the function with Python from generic (non-templated) code */
+        initialize_generic(rec, signature.text, types.data(), sizeof...(Args));
+
+        if (cast_in::has_args) rec->has_args = true;
+        if (cast_in::has_kwargs) rec->has_kwargs = true;
+
+        /* Stash some additional information used by an important optimization in 'functional.h' */
+        using FunctionType = Return (*)(Args...);
+        constexpr bool is_function_ptr =
+            std::is_convertible<Func, FunctionType>::value &&
+            sizeof(capture) == sizeof(void *);
+        if (is_function_ptr) {
+            rec->is_stateless = true;
+            rec->data[1] = const_cast<void *>(reinterpret_cast<const void *>(&typeid(FunctionType)));
+        }
+    }
+
+    /// Register a function call with Python (generic non-templated code goes here)
+    void initialize_generic(detail::function_record *rec, const char *text,
+                            const std::type_info *const *types, size_t args) {
+
+        /* Create copies of all referenced C-style strings */
+        rec->name = strdup(rec->name ? rec->name : "");
+        if (rec->doc) rec->doc = strdup(rec->doc);
+        for (auto &a: rec->args) {
+            if (a.name)
+                a.name = strdup(a.name);
+            if (a.descr)
+                a.descr = strdup(a.descr);
+            else if (a.value)
+                a.descr = strdup(repr(a.value).cast<std::string>().c_str());
+        }
+
+        rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
+
+#if !defined(NDEBUG) && !defined(PYBIND11_DISABLE_NEW_STYLE_INIT_WARNING)
+        if (rec->is_constructor && !rec->is_new_style_constructor) {
+            const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name);
+            const auto func_name = std::string(rec->name);
+            PyErr_WarnEx(
+                PyExc_FutureWarning,
+                ("pybind11-bound class '" + class_name + "' is using an old-style "
+                 "placement-new '" + func_name + "' which has been deprecated. See "
+                 "the upgrade guide in pybind11's docs. This message is only visible "
+                 "when compiled in debug mode.").c_str(), 0
+            );
+        }
+#endif
+
+        /* Generate a proper function signature */
+        std::string signature;
+        size_t type_index = 0, arg_index = 0;
+        for (auto *pc = text; *pc != '\0'; ++pc) {
+            const auto c = *pc;
+
+            if (c == '{') {
+                // Write arg name for everything except *args and **kwargs.
+                if (*(pc + 1) == '*')
+                    continue;
+
+                if (arg_index < rec->args.size() && rec->args[arg_index].name) {
+                    signature += rec->args[arg_index].name;
+                } else if (arg_index == 0 && rec->is_method) {
+                    signature += "self";
+                } else {
+                    signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0));
+                }
+                signature += ": ";
+            } else if (c == '}') {
+                // Write default value if available.
+                if (arg_index < rec->args.size() && rec->args[arg_index].descr) {
+                    signature += " = ";
+                    signature += rec->args[arg_index].descr;
+                }
+                arg_index++;
+            } else if (c == '%') {
+                const std::type_info *t = types[type_index++];
+                if (!t)
+                    pybind11_fail("Internal error while parsing type signature (1)");
+                if (auto tinfo = detail::get_type_info(*t)) {
+                    handle th((PyObject *) tinfo->type);
+                    signature +=
+                        th.attr("__module__").cast<std::string>() + "." +
+                        th.attr("__qualname__").cast<std::string>(); // Python 3.3+, but we backport it to earlier versions
+                } else if (rec->is_new_style_constructor && arg_index == 0) {
+                    // A new-style `__init__` takes `self` as `value_and_holder`.
+                    // Rewrite it to the proper class type.
+                    signature +=
+                        rec->scope.attr("__module__").cast<std::string>() + "." +
+                        rec->scope.attr("__qualname__").cast<std::string>();
+                } else {
+                    std::string tname(t->name());
+                    detail::clean_type_id(tname);
+                    signature += tname;
+                }
+            } else {
+                signature += c;
+            }
+        }
+        if (arg_index != args || types[type_index] != nullptr)
+            pybind11_fail("Internal error while parsing type signature (2)");
+
+#if PY_MAJOR_VERSION < 3
+        if (strcmp(rec->name, "__next__") == 0) {
+            std::free(rec->name);
+            rec->name = strdup("next");
+        } else if (strcmp(rec->name, "__bool__") == 0) {
+            std::free(rec->name);
+            rec->name = strdup("__nonzero__");
+        }
+#endif
+        rec->signature = strdup(signature.c_str());
+        rec->args.shrink_to_fit();
+        rec->nargs = (std::uint16_t) args;
+
+        if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr()))
+            rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr());
+
+        detail::function_record *chain = nullptr, *chain_start = rec;
+        if (rec->sibling) {
+            if (PyCFunction_Check(rec->sibling.ptr())) {
+                auto rec_capsule = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(rec->sibling.ptr()));
+                chain = (detail::function_record *) rec_capsule;
+                /* Never append a method to an overload chain of a parent class;
+                   instead, hide the parent's overloads in this case */
+                if (!chain->scope.is(rec->scope))
+                    chain = nullptr;
+            }
+            // Don't trigger for things like the default __init__, which are wrapper_descriptors that we are intentionally replacing
+            else if (!rec->sibling.is_none() && rec->name[0] != '_')
+                pybind11_fail("Cannot overload existing non-function object \"" + std::string(rec->name) +
+                        "\" with a function of the same name");
+        }
+
+        if (!chain) {
+            /* No existing overload was found, create a new function object */
+            rec->def = new PyMethodDef();
+            std::memset(rec->def, 0, sizeof(PyMethodDef));
+            rec->def->ml_name = rec->name;
+            rec->def->ml_meth = reinterpret_cast<PyCFunction>(reinterpret_cast<void (*) (void)>(*dispatcher));
+            rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS;
+
+            capsule rec_capsule(rec, [](void *ptr) {
+                destruct((detail::function_record *) ptr);
+            });
+
+            object scope_module;
+            if (rec->scope) {
+                if (hasattr(rec->scope, "__module__")) {
+                    scope_module = rec->scope.attr("__module__");
+                } else if (hasattr(rec->scope, "__name__")) {
+                    scope_module = rec->scope.attr("__name__");
+                }
+            }
+
+            m_ptr = PyCFunction_NewEx(rec->def, rec_capsule.ptr(), scope_module.ptr());
+            if (!m_ptr)
+                pybind11_fail("cpp_function::cpp_function(): Could not allocate function object");
+        } else {
+            /* Append at the end of the overload chain */
+            m_ptr = rec->sibling.ptr();
+            inc_ref();
+            chain_start = chain;
+            if (chain->is_method != rec->is_method)
+                pybind11_fail("overloading a method with both static and instance methods is not supported; "
+                    #if defined(NDEBUG)
+                        "compile in debug mode for more details"
+                    #else
+                        "error while attempting to bind " + std::string(rec->is_method ? "instance" : "static") + " method " +
+                        std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature
+                    #endif
+                );
+            while (chain->next)
+                chain = chain->next;
+            chain->next = rec;
+        }
+
+        std::string signatures;
+        int index = 0;
+        /* Create a nice pydoc rec including all signatures and
+           docstrings of the functions in the overload chain */
+        if (chain && options::show_function_signatures()) {
+            // First a generic signature
+            signatures += rec->name;
+            signatures += "(*args, **kwargs)\n";
+            signatures += "Overloaded function.\n\n";
+        }
+        // Then specific overload signatures
+        bool first_user_def = true;
+        for (auto it = chain_start; it != nullptr; it = it->next) {
+            if (options::show_function_signatures()) {
+                if (index > 0) signatures += "\n";
+                if (chain)
+                    signatures += std::to_string(++index) + ". ";
+                signatures += rec->name;
+                signatures += it->signature;
+                signatures += "\n";
+            }
+            if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) {
+                // If we're appending another docstring, and aren't printing function signatures, we
+                // need to append a newline first:
+                if (!options::show_function_signatures()) {
+                    if (first_user_def) first_user_def = false;
+                    else signatures += "\n";
+                }
+                if (options::show_function_signatures()) signatures += "\n";
+                signatures += it->doc;
+                if (options::show_function_signatures()) signatures += "\n";
+            }
+        }
+
+        /* Install docstring */
+        PyCFunctionObject *func = (PyCFunctionObject *) m_ptr;
+        if (func->m_ml->ml_doc)
+            std::free(const_cast<char *>(func->m_ml->ml_doc));
+        func->m_ml->ml_doc = strdup(signatures.c_str());
+
+        if (rec->is_method) {
+            m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr());
+            if (!m_ptr)
+                pybind11_fail("cpp_function::cpp_function(): Could not allocate instance method object");
+            Py_DECREF(func);
+        }
+    }
+
+    /// When a cpp_function is GCed, release any memory allocated by pybind11
+    static void destruct(detail::function_record *rec) {
+        while (rec) {
+            detail::function_record *next = rec->next;
+            if (rec->free_data)
+                rec->free_data(rec);
+            std::free((char *) rec->name);
+            std::free((char *) rec->doc);
+            std::free((char *) rec->signature);
+            for (auto &arg: rec->args) {
+                std::free(const_cast<char *>(arg.name));
+                std::free(const_cast<char *>(arg.descr));
+                arg.value.dec_ref();
+            }
+            if (rec->def) {
+                std::free(const_cast<char *>(rec->def->ml_doc));
+                delete rec->def;
+            }
+            delete rec;
+            rec = next;
+        }
+    }
+
+    /// Main dispatch logic for calls to functions bound using pybind11
+    static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) {
+        using namespace detail;
+
+        /* Iterator over the list of potentially admissible overloads */
+        const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr),
+                              *it = overloads;
+
+        /* Need to know how many arguments + keyword arguments there are to pick the right overload */
+        const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in);
+
+        handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr,
+               result = PYBIND11_TRY_NEXT_OVERLOAD;
+
+        auto self_value_and_holder = value_and_holder();
+        if (overloads->is_constructor) {
+            const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr());
+            const auto pi = reinterpret_cast<instance *>(parent.ptr());
+            self_value_and_holder = pi->get_value_and_holder(tinfo, false);
+
+            if (!self_value_and_holder.type || !self_value_and_holder.inst) {
+                PyErr_SetString(PyExc_TypeError, "__init__(self, ...) called with invalid `self` argument");
+                return nullptr;
+            }
+
+            // If this value is already registered it must mean __init__ is invoked multiple times;
+            // we really can't support that in C++, so just ignore the second __init__.
+            if (self_value_and_holder.instance_registered())
+                return none().release().ptr();
+        }
+
+        try {
+            // We do this in two passes: in the first pass, we load arguments with `convert=false`;
+            // in the second, we allow conversion (except for arguments with an explicit
+            // py::arg().noconvert()).  This lets us prefer calls without conversion, with
+            // conversion as a fallback.
+            std::vector<function_call> second_pass;
+
+            // However, if there are no overloads, we can just skip the no-convert pass entirely
+            const bool overloaded = it != nullptr && it->next != nullptr;
+
+            for (; it != nullptr; it = it->next) {
+
+                /* For each overload:
+                   1. Copy all positional arguments we were given, also checking to make sure that
+                      named positional arguments weren't *also* specified via kwarg.
+                   2. If we weren't given enough, try to make up the omitted ones by checking
+                      whether they were provided by a kwarg matching the `py::arg("name")` name.  If
+                      so, use it (and remove it from kwargs; if not, see if the function binding
+                      provided a default that we can use.
+                   3. Ensure that either all keyword arguments were "consumed", or that the function
+                      takes a kwargs argument to accept unconsumed kwargs.
+                   4. Any positional arguments still left get put into a tuple (for args), and any
+                      leftover kwargs get put into a dict.
+                   5. Pack everything into a vector; if we have py::args or py::kwargs, they are an
+                      extra tuple or dict at the end of the positional arguments.
+                   6. Call the function call dispatcher (function_record::impl)
+
+                   If one of these fail, move on to the next overload and keep trying until we get a
+                   result other than PYBIND11_TRY_NEXT_OVERLOAD.
+                 */
+
+                const function_record &func = *it;
+                size_t num_args = func.nargs;    // Number of positional arguments that we need
+                if (func.has_args) --num_args;   // (but don't count py::args
+                if (func.has_kwargs) --num_args; //  or py::kwargs)
+                size_t pos_args = num_args - func.nargs_kwonly;
+
+                if (!func.has_args && n_args_in > pos_args)
+                    continue; // Too many positional arguments for this overload
+
+                if (n_args_in < pos_args && func.args.size() < pos_args)
+                    continue; // Not enough positional arguments given, and not enough defaults to fill in the blanks
+
+                function_call call(func, parent);
+
+                size_t args_to_copy = (std::min)(pos_args, n_args_in); // Protect std::min with parentheses
+                size_t args_copied = 0;
+
+                // 0. Inject new-style `self` argument
+                if (func.is_new_style_constructor) {
+                    // The `value` may have been preallocated by an old-style `__init__`
+                    // if it was a preceding candidate for overload resolution.
+                    if (self_value_and_holder)
+                        self_value_and_holder.type->dealloc(self_value_and_holder);
+
+                    call.init_self = PyTuple_GET_ITEM(args_in, 0);
+                    call.args.push_back(reinterpret_cast<PyObject *>(&self_value_and_holder));
+                    call.args_convert.push_back(false);
+                    ++args_copied;
+                }
+
+                // 1. Copy any position arguments given.
+                bool bad_arg = false;
+                for (; args_copied < args_to_copy; ++args_copied) {
+                    const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr;
+                    if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) {
+                        bad_arg = true;
+                        break;
+                    }
+
+                    handle arg(PyTuple_GET_ITEM(args_in, args_copied));
+                    if (arg_rec && !arg_rec->none && arg.is_none()) {
+                        bad_arg = true;
+                        break;
+                    }
+                    call.args.push_back(arg);
+                    call.args_convert.push_back(arg_rec ? arg_rec->convert : true);
+                }
+                if (bad_arg)
+                    continue; // Maybe it was meant for another overload (issue #688)
+
+                // We'll need to copy this if we steal some kwargs for defaults
+                dict kwargs = reinterpret_borrow<dict>(kwargs_in);
+
+                // 2. Check kwargs and, failing that, defaults that may help complete the list
+                if (args_copied < num_args) {
+                    bool copied_kwargs = false;
+
+                    for (; args_copied < num_args; ++args_copied) {
+                        const auto &arg = func.args[args_copied];
+
+                        handle value;
+                        if (kwargs_in && arg.name)
+                            value = PyDict_GetItemString(kwargs.ptr(), arg.name);
+
+                        if (value) {
+                            // Consume a kwargs value
+                            if (!copied_kwargs) {
+                                kwargs = reinterpret_steal<dict>(PyDict_Copy(kwargs.ptr()));
+                                copied_kwargs = true;
+                            }
+                            PyDict_DelItemString(kwargs.ptr(), arg.name);
+                        } else if (arg.value) {
+                            value = arg.value;
+                        }
+
+                        if (value) {
+                            call.args.push_back(value);
+                            call.args_convert.push_back(arg.convert);
+                        }
+                        else
+                            break;
+                    }
+
+                    if (args_copied < num_args)
+                        continue; // Not enough arguments, defaults, or kwargs to fill the positional arguments
+                }
+
+                // 3. Check everything was consumed (unless we have a kwargs arg)
+                if (kwargs && kwargs.size() > 0 && !func.has_kwargs)
+                    continue; // Unconsumed kwargs, but no py::kwargs argument to accept them
+
+                // 4a. If we have a py::args argument, create a new tuple with leftovers
+                if (func.has_args) {
+                    tuple extra_args;
+                    if (args_to_copy == 0) {
+                        // We didn't copy out any position arguments from the args_in tuple, so we
+                        // can reuse it directly without copying:
+                        extra_args = reinterpret_borrow<tuple>(args_in);
+                    } else if (args_copied >= n_args_in) {
+                        extra_args = tuple(0);
+                    } else {
+                        size_t args_size = n_args_in - args_copied;
+                        extra_args = tuple(args_size);
+                        for (size_t i = 0; i < args_size; ++i) {
+                            extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i);
+                        }
+                    }
+                    call.args.push_back(extra_args);
+                    call.args_convert.push_back(false);
+                    call.args_ref = std::move(extra_args);
+                }
+
+                // 4b. If we have a py::kwargs, pass on any remaining kwargs
+                if (func.has_kwargs) {
+                    if (!kwargs.ptr())
+                        kwargs = dict(); // If we didn't get one, send an empty one
+                    call.args.push_back(kwargs);
+                    call.args_convert.push_back(false);
+                    call.kwargs_ref = std::move(kwargs);
+                }
+
+                // 5. Put everything in a vector.  Not technically step 5, we've been building it
+                // in `call.args` all along.
+                #if !defined(NDEBUG)
+                if (call.args.size() != func.nargs || call.args_convert.size() != func.nargs)
+                    pybind11_fail("Internal error: function call dispatcher inserted wrong number of arguments!");
+                #endif
+
+                std::vector<bool> second_pass_convert;
+                if (overloaded) {
+                    // We're in the first no-convert pass, so swap out the conversion flags for a
+                    // set of all-false flags.  If the call fails, we'll swap the flags back in for
+                    // the conversion-allowed call below.
+                    second_pass_convert.resize(func.nargs, false);
+                    call.args_convert.swap(second_pass_convert);
+                }
+
+                // 6. Call the function.
+                try {
+                    loader_life_support guard{};
+                    result = func.impl(call);
+                } catch (reference_cast_error &) {
+                    result = PYBIND11_TRY_NEXT_OVERLOAD;
+                }
+
+                if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD)
+                    break;
+
+                if (overloaded) {
+                    // The (overloaded) call failed; if the call has at least one argument that
+                    // permits conversion (i.e. it hasn't been explicitly specified `.noconvert()`)
+                    // then add this call to the list of second pass overloads to try.
+                    for (size_t i = func.is_method ? 1 : 0; i < pos_args; i++) {
+                        if (second_pass_convert[i]) {
+                            // Found one: swap the converting flags back in and store the call for
+                            // the second pass.
+                            call.args_convert.swap(second_pass_convert);
+                            second_pass.push_back(std::move(call));
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (overloaded && !second_pass.empty() && result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
+                // The no-conversion pass finished without success, try again with conversion allowed
+                for (auto &call : second_pass) {
+                    try {
+                        loader_life_support guard{};
+                        result = call.func.impl(call);
+                    } catch (reference_cast_error &) {
+                        result = PYBIND11_TRY_NEXT_OVERLOAD;
+                    }
+
+                    if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) {
+                        // The error reporting logic below expects 'it' to be valid, as it would be
+                        // if we'd encountered this failure in the first-pass loop.
+                        if (!result)
+                            it = &call.func;
+                        break;
+                    }
+                }
+            }
+        } catch (error_already_set &e) {
+            e.restore();
+            return nullptr;
+#if defined(__GNUG__) && !defined(__clang__)
+        } catch ( abi::__forced_unwind& ) {
+            throw;
+#endif
+        } catch (...) {
+            /* When an exception is caught, give each registered exception
+               translator a chance to translate it to a Python exception
+               in reverse order of registration.
+
+               A translator may choose to do one of the following:
+
+                - catch the exception and call PyErr_SetString or PyErr_SetObject
+                  to set a standard (or custom) Python exception, or
+                - do nothing and let the exception fall through to the next translator, or
+                - delegate translation to the next translator by throwing a new type of exception. */
+
+            auto last_exception = std::current_exception();
+            auto &registered_exception_translators = get_internals().registered_exception_translators;
+            for (auto& translator : registered_exception_translators) {
+                try {
+                    translator(last_exception);
+                } catch (...) {
+                    last_exception = std::current_exception();
+                    continue;
+                }
+                return nullptr;
+            }
+            PyErr_SetString(PyExc_SystemError, "Exception escaped from default exception translator!");
+            return nullptr;
+        }
+
+        auto append_note_if_missing_header_is_suspected = [](std::string &msg) {
+            if (msg.find("std::") != std::string::npos) {
+                msg += "\n\n"
+                       "Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
+                       "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
+                       "conversions are optional and require extra headers to be included\n"
+                       "when compiling your pybind11 module.";
+            }
+        };
+
+        if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
+            if (overloads->is_operator)
+                return handle(Py_NotImplemented).inc_ref().ptr();
+
+            std::string msg = std::string(overloads->name) + "(): incompatible " +
+                std::string(overloads->is_constructor ? "constructor" : "function") +
+                " arguments. The following argument types are supported:\n";
+
+            int ctr = 0;
+            for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) {
+                msg += "    "+ std::to_string(++ctr) + ". ";
+
+                bool wrote_sig = false;
+                if (overloads->is_constructor) {
+                    // For a constructor, rewrite `(self: Object, arg0, ...) -> NoneType` as `Object(arg0, ...)`
+                    std::string sig = it2->signature;
+                    size_t start = sig.find('(') + 7; // skip "(self: "
+                    if (start < sig.size()) {
+                        // End at the , for the next argument
+                        size_t end = sig.find(", "), next = end + 2;
+                        size_t ret = sig.rfind(" -> ");
+                        // Or the ), if there is no comma:
+                        if (end >= sig.size()) next = end = sig.find(')');
+                        if (start < end && next < sig.size()) {
+                            msg.append(sig, start, end - start);
+                            msg += '(';
+                            msg.append(sig, next, ret - next);
+                            wrote_sig = true;
+                        }
+                    }
+                }
+                if (!wrote_sig) msg += it2->signature;
+
+                msg += "\n";
+            }
+            msg += "\nInvoked with: ";
+            auto args_ = reinterpret_borrow<tuple>(args_in);
+            bool some_args = false;
+            for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size(); ++ti) {
+                if (!some_args) some_args = true;
+                else msg += ", ";
+                try {
+                    msg += pybind11::repr(args_[ti]);
+                } catch (const error_already_set&) {
+                    msg += "<repr raised Error>";
+                }
+            }
+            if (kwargs_in) {
+                auto kwargs = reinterpret_borrow<dict>(kwargs_in);
+                if (kwargs.size() > 0) {
+                    if (some_args) msg += "; ";
+                    msg += "kwargs: ";
+                    bool first = true;
+                    for (auto kwarg : kwargs) {
+                        if (first) first = false;
+                        else msg += ", ";
+                        msg += pybind11::str("{}=").format(kwarg.first);
+                        try {
+                            msg += pybind11::repr(kwarg.second);
+                        } catch (const error_already_set&) {
+                            msg += "<repr raised Error>";
+                        }
+                    }
+                }
+            }
+
+            append_note_if_missing_header_is_suspected(msg);
+            PyErr_SetString(PyExc_TypeError, msg.c_str());
+            return nullptr;
+        } else if (!result) {
+            std::string msg = "Unable to convert function return value to a "
+                              "Python type! The signature was\n\t";
+            msg += it->signature;
+            append_note_if_missing_header_is_suspected(msg);
+            PyErr_SetString(PyExc_TypeError, msg.c_str());
+            return nullptr;
+        } else {
+            if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) {
+                auto *pi = reinterpret_cast<instance *>(parent.ptr());
+                self_value_and_holder.type->init_instance(pi, nullptr);
+            }
+            return result.ptr();
+        }
+    }
+};
+
+/// Wrapper for Python extension modules
+class module : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check)
+
+    /// Create a new top-level Python module with the given name and docstring
+    explicit module(const char *name, const char *doc = nullptr) {
+        if (!options::show_user_defined_docstrings()) doc = nullptr;
+#if PY_MAJOR_VERSION >= 3
+        PyModuleDef *def = new PyModuleDef();
+        std::memset(def, 0, sizeof(PyModuleDef));
+        def->m_name = name;
+        def->m_doc = doc;
+        def->m_size = -1;
+        Py_INCREF(def);
+        m_ptr = PyModule_Create(def);
+#else
+        m_ptr = Py_InitModule3(name, nullptr, doc);
+#endif
+        if (m_ptr == nullptr)
+            pybind11_fail("Internal error in module::module()");
+        inc_ref();
+    }
+
+    /** \rst
+        Create Python binding for a new function within the module scope. ``Func``
+        can be a plain C++ function, a function pointer, or a lambda function. For
+        details on the ``Extra&& ... extra`` argument, see section :ref:`extras`.
+    \endrst */
+    template <typename Func, typename... Extra>
+    module &def(const char *name_, Func &&f, const Extra& ... extra) {
+        cpp_function func(std::forward<Func>(f), name(name_), scope(*this),
+                          sibling(getattr(*this, name_, none())), extra...);
+        // NB: allow overwriting here because cpp_function sets up a chain with the intention of
+        // overwriting (and has already checked internally that it isn't overwriting non-functions).
+        add_object(name_, func, true /* overwrite */);
+        return *this;
+    }
+
+    /** \rst
+        Create and return a new Python submodule with the given name and docstring.
+        This also works recursively, i.e.
+
+        .. code-block:: cpp
+
+            py::module m("example", "pybind11 example plugin");
+            py::module m2 = m.def_submodule("sub", "A submodule of 'example'");
+            py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
+    \endrst */
+    module def_submodule(const char *name, const char *doc = nullptr) {
+        std::string full_name = std::string(PyModule_GetName(m_ptr))
+            + std::string(".") + std::string(name);
+        auto result = reinterpret_borrow<module>(PyImport_AddModule(full_name.c_str()));
+        if (doc && options::show_user_defined_docstrings())
+            result.attr("__doc__") = pybind11::str(doc);
+        attr(name) = result;
+        return result;
+    }
+
+    /// Import and return a module or throws `error_already_set`.
+    static module import(const char *name) {
+        PyObject *obj = PyImport_ImportModule(name);
+        if (!obj)
+            throw error_already_set();
+        return reinterpret_steal<module>(obj);
+    }
+
+    /// Reload the module or throws `error_already_set`.
+    void reload() {
+        PyObject *obj = PyImport_ReloadModule(ptr());
+        if (!obj)
+            throw error_already_set();
+        *this = reinterpret_steal<module>(obj);
+    }
+
+    // Adds an object to the module using the given name.  Throws if an object with the given name
+    // already exists.
+    //
+    // overwrite should almost always be false: attempting to overwrite objects that pybind11 has
+    // established will, in most cases, break things.
+    PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) {
+        if (!overwrite && hasattr(*this, name))
+            pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" +
+                    std::string(name) + "\"");
+
+        PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */);
+    }
+};
+
+/// \ingroup python_builtins
+/// Return a dictionary representing the global variables in the current execution frame,
+/// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded).
+inline dict globals() {
+    PyObject *p = PyEval_GetGlobals();
+    return reinterpret_borrow<dict>(p ? p : module::import("__main__").attr("__dict__").ptr());
+}
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+/// Generic support for creating new Python heap types
+class generic_type : public object {
+    template <typename...> friend class class_;
+public:
+    PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check)
+protected:
+    void initialize(const type_record &rec) {
+        if (rec.scope && hasattr(rec.scope, rec.name))
+            pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) +
+                          "\": an object with that name is already defined");
+
+        if (rec.module_local ? get_local_type_info(*rec.type) : get_global_type_info(*rec.type))
+            pybind11_fail("generic_type: type \"" + std::string(rec.name) +
+                          "\" is already registered!");
+
+        m_ptr = make_new_python_type(rec);
+
+        /* Register supplemental type information in C++ dict */
+        auto *tinfo = new detail::type_info();
+        tinfo->type = (PyTypeObject *) m_ptr;
+        tinfo->cpptype = rec.type;
+        tinfo->type_size = rec.type_size;
+        tinfo->type_align = rec.type_align;
+        tinfo->operator_new = rec.operator_new;
+        tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size);
+        tinfo->init_instance = rec.init_instance;
+        tinfo->dealloc = rec.dealloc;
+        tinfo->simple_type = true;
+        tinfo->simple_ancestors = true;
+        tinfo->default_holder = rec.default_holder;
+        tinfo->module_local = rec.module_local;
+
+        auto &internals = get_internals();
+        auto tindex = std::type_index(*rec.type);
+        tinfo->direct_conversions = &internals.direct_conversions[tindex];
+        if (rec.module_local)
+            registered_local_types_cpp()[tindex] = tinfo;
+        else
+            internals.registered_types_cpp[tindex] = tinfo;
+        internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo };
+
+        if (rec.bases.size() > 1 || rec.multiple_inheritance) {
+            mark_parents_nonsimple(tinfo->type);
+            tinfo->simple_ancestors = false;
+        }
+        else if (rec.bases.size() == 1) {
+            auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr());
+            tinfo->simple_ancestors = parent_tinfo->simple_ancestors;
+        }
+
+        if (rec.module_local) {
+            // Stash the local typeinfo and loader so that external modules can access it.
+            tinfo->module_local_load = &type_caster_generic::local_load;
+            setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo));
+        }
+    }
+
+    /// Helper function which tags all parents of a type using mult. inheritance
+    void mark_parents_nonsimple(PyTypeObject *value) {
+        auto t = reinterpret_borrow<tuple>(value->tp_bases);
+        for (handle h : t) {
+            auto tinfo2 = get_type_info((PyTypeObject *) h.ptr());
+            if (tinfo2)
+                tinfo2->simple_type = false;
+            mark_parents_nonsimple((PyTypeObject *) h.ptr());
+        }
+    }
+
+    void install_buffer_funcs(
+            buffer_info *(*get_buffer)(PyObject *, void *),
+            void *get_buffer_data) {
+        PyHeapTypeObject *type = (PyHeapTypeObject*) m_ptr;
+        auto tinfo = detail::get_type_info(&type->ht_type);
+
+        if (!type->ht_type.tp_as_buffer)
+            pybind11_fail(
+                "To be able to register buffer protocol support for the type '" +
+                std::string(tinfo->type->tp_name) +
+                "' the associated class<>(..) invocation must "
+                "include the pybind11::buffer_protocol() annotation!");
+
+        tinfo->get_buffer = get_buffer;
+        tinfo->get_buffer_data = get_buffer_data;
+    }
+
+    // rec_func must be set for either fget or fset.
+    void def_property_static_impl(const char *name,
+                                  handle fget, handle fset,
+                                  detail::function_record *rec_func) {
+        const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope);
+        const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings();
+        auto property = handle((PyObject *) (is_static ? get_internals().static_property_type
+                                                       : &PyProperty_Type));
+        attr(name) = property(fget.ptr() ? fget : none(),
+                              fset.ptr() ? fset : none(),
+                              /*deleter*/none(),
+                              pybind11::str(has_doc ? rec_func->doc : ""));
+    }
+};
+
+/// Set the pointer to operator new if it exists. The cast is needed because it can be overloaded.
+template <typename T, typename = void_t<decltype(static_cast<void *(*)(size_t)>(T::operator new))>>
+void set_operator_new(type_record *r) { r->operator_new = &T::operator new; }
+
+template <typename> void set_operator_new(...) { }
+
+template <typename T, typename SFINAE = void> struct has_operator_delete : std::false_type { };
+template <typename T> struct has_operator_delete<T, void_t<decltype(static_cast<void (*)(void *)>(T::operator delete))>>
+    : std::true_type { };
+template <typename T, typename SFINAE = void> struct has_operator_delete_size : std::false_type { };
+template <typename T> struct has_operator_delete_size<T, void_t<decltype(static_cast<void (*)(void *, size_t)>(T::operator delete))>>
+    : std::true_type { };
+/// Call class-specific delete if it exists or global otherwise. Can also be an overload set.
+template <typename T, enable_if_t<has_operator_delete<T>::value, int> = 0>
+void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); }
+template <typename T, enable_if_t<!has_operator_delete<T>::value && has_operator_delete_size<T>::value, int> = 0>
+void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); }
+
+inline void call_operator_delete(void *p, size_t s, size_t a) {
+    (void)s; (void)a;
+    #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
+        if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__) {
+            #ifdef __cpp_sized_deallocation
+                ::operator delete(p, s, std::align_val_t(a));
+            #else
+                ::operator delete(p, std::align_val_t(a));
+            #endif
+            return;
+        }
+    #endif
+    #ifdef __cpp_sized_deallocation
+        ::operator delete(p, s);
+    #else
+        ::operator delete(p);
+    #endif
+}
+
+inline void add_class_method(object& cls, const char *name_, const cpp_function &cf) {
+    cls.attr(cf.name()) = cf;
+    if (strcmp(name_, "__eq__") == 0 && !cls.attr("__dict__").contains("__hash__")) {
+      cls.attr("__hash__") = none();
+    }
+}
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// Given a pointer to a member function, cast it to its `Derived` version.
+/// Forward everything else unchanged.
+template <typename /*Derived*/, typename F>
+auto method_adaptor(F &&f) -> decltype(std::forward<F>(f)) { return std::forward<F>(f); }
+
+template <typename Derived, typename Return, typename Class, typename... Args>
+auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) {
+    static_assert(detail::is_accessible_base_of<Class, Derived>::value,
+        "Cannot bind an inaccessible base class method; use a lambda definition instead");
+    return pmf;
+}
+
+template <typename Derived, typename Return, typename Class, typename... Args>
+auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const {
+    static_assert(detail::is_accessible_base_of<Class, Derived>::value,
+        "Cannot bind an inaccessible base class method; use a lambda definition instead");
+    return pmf;
+}
+
+template <typename type_, typename... options>
+class class_ : public detail::generic_type {
+    template <typename T> using is_holder = detail::is_holder_type<type_, T>;
+    template <typename T> using is_subtype = detail::is_strict_base_of<type_, T>;
+    template <typename T> using is_base = detail::is_strict_base_of<T, type_>;
+    // struct instead of using here to help MSVC:
+    template <typename T> struct is_valid_class_option :
+        detail::any_of<is_holder<T>, is_subtype<T>, is_base<T>> {};
+
+public:
+    using type = type_;
+    using type_alias = detail::exactly_one_t<is_subtype, void, options...>;
+    constexpr static bool has_alias = !std::is_void<type_alias>::value;
+    using holder_type = detail::exactly_one_t<is_holder, std::unique_ptr<type>, options...>;
+
+    static_assert(detail::all_of<is_valid_class_option<options>...>::value,
+            "Unknown/invalid class_ template parameters provided");
+
+    static_assert(!has_alias || std::is_polymorphic<type>::value,
+            "Cannot use an alias class with a non-polymorphic type");
+
+    PYBIND11_OBJECT(class_, generic_type, PyType_Check)
+
+    template <typename... Extra>
+    class_(handle scope, const char *name, const Extra &... extra) {
+        using namespace detail;
+
+        // MI can only be specified via class_ template options, not constructor parameters
+        static_assert(
+            none_of<is_pyobject<Extra>...>::value || // no base class arguments, or:
+            (   constexpr_sum(is_pyobject<Extra>::value...) == 1 && // Exactly one base
+                constexpr_sum(is_base<options>::value...)   == 0 && // no template option bases
+                none_of<std::is_same<multiple_inheritance, Extra>...>::value), // no multiple_inheritance attr
+            "Error: multiple inheritance bases must be specified via class_ template options");
+
+        type_record record;
+        record.scope = scope;
+        record.name = name;
+        record.type = &typeid(type);
+        record.type_size = sizeof(conditional_t<has_alias, type_alias, type>);
+        record.type_align = alignof(conditional_t<has_alias, type_alias, type>&);
+        record.holder_size = sizeof(holder_type);
+        record.init_instance = init_instance;
+        record.dealloc = dealloc;
+        record.default_holder = detail::is_instantiation<std::unique_ptr, holder_type>::value;
+
+        set_operator_new<type>(&record);
+
+        /* Register base classes specified via template arguments to class_, if any */
+        PYBIND11_EXPAND_SIDE_EFFECTS(add_base<options>(record));
+
+        /* Process optional arguments, if any */
+        process_attributes<Extra...>::init(extra..., &record);
+
+        generic_type::initialize(record);
+
+        if (has_alias) {
+            auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp;
+            instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))];
+        }
+    }
+
+    template <typename Base, detail::enable_if_t<is_base<Base>::value, int> = 0>
+    static void add_base(detail::type_record &rec) {
+        rec.add_base(typeid(Base), [](void *src) -> void * {
+            return static_cast<Base *>(reinterpret_cast<type *>(src));
+        });
+    }
+
+    template <typename Base, detail::enable_if_t<!is_base<Base>::value, int> = 0>
+    static void add_base(detail::type_record &) { }
+
+    template <typename Func, typename... Extra>
+    class_ &def(const char *name_, Func&& f, const Extra&... extra) {
+        cpp_function cf(method_adaptor<type>(std::forward<Func>(f)), name(name_), is_method(*this),
+                        sibling(getattr(*this, name_, none())), extra...);
+        add_class_method(*this, name_, cf);
+        return *this;
+    }
+
+    template <typename Func, typename... Extra> class_ &
+    def_static(const char *name_, Func &&f, const Extra&... extra) {
+        static_assert(!std::is_member_function_pointer<Func>::value,
+                "def_static(...) called with a non-static member function pointer");
+        cpp_function cf(std::forward<Func>(f), name(name_), scope(*this),
+                        sibling(getattr(*this, name_, none())), extra...);
+        attr(cf.name()) = staticmethod(cf);
+        return *this;
+    }
+
+    template <detail::op_id id, detail::op_type ot, typename L, typename R, typename... Extra>
+    class_ &def(const detail::op_<id, ot, L, R> &op, const Extra&... extra) {
+        op.execute(*this, extra...);
+        return *this;
+    }
+
+    template <detail::op_id id, detail::op_type ot, typename L, typename R, typename... Extra>
+    class_ & def_cast(const detail::op_<id, ot, L, R> &op, const Extra&... extra) {
+        op.execute_cast(*this, extra...);
+        return *this;
+    }
+
+    template <typename... Args, typename... Extra>
+    class_ &def(const detail::initimpl::constructor<Args...> &init, const Extra&... extra) {
+        init.execute(*this, extra...);
+        return *this;
+    }
+
+    template <typename... Args, typename... Extra>
+    class_ &def(const detail::initimpl::alias_constructor<Args...> &init, const Extra&... extra) {
+        init.execute(*this, extra...);
+        return *this;
+    }
+
+    template <typename... Args, typename... Extra>
+    class_ &def(detail::initimpl::factory<Args...> &&init, const Extra&... extra) {
+        std::move(init).execute(*this, extra...);
+        return *this;
+    }
+
+    template <typename... Args, typename... Extra>
+    class_ &def(detail::initimpl::pickle_factory<Args...> &&pf, const Extra &...extra) {
+        std::move(pf).execute(*this, extra...);
+        return *this;
+    }
+
+    template <typename Func> class_& def_buffer(Func &&func) {
+        struct capture { Func func; };
+        capture *ptr = new capture { std::forward<Func>(func) };
+        install_buffer_funcs([](PyObject *obj, void *ptr) -> buffer_info* {
+            detail::make_caster<type> caster;
+            if (!caster.load(obj, false))
+                return nullptr;
+            return new buffer_info(((capture *) ptr)->func(caster));
+        }, ptr);
+        return *this;
+    }
+
+    template <typename Return, typename Class, typename... Args>
+    class_ &def_buffer(Return (Class::*func)(Args...)) {
+        return def_buffer([func] (type &obj) { return (obj.*func)(); });
+    }
+
+    template <typename Return, typename Class, typename... Args>
+    class_ &def_buffer(Return (Class::*func)(Args...) const) {
+        return def_buffer([func] (const type &obj) { return (obj.*func)(); });
+    }
+
+    template <typename C, typename D, typename... Extra>
+    class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) {
+        static_assert(std::is_same<C, type>::value || std::is_base_of<C, type>::value, "def_readwrite() requires a class member (or base class member)");
+        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)),
+                     fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this));
+        def_property(name, fget, fset, return_value_policy::reference_internal, extra...);
+        return *this;
+    }
+
+    template <typename C, typename D, typename... Extra>
+    class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) {
+        static_assert(std::is_same<C, type>::value || std::is_base_of<C, type>::value, "def_readonly() requires a class member (or base class member)");
+        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this));
+        def_property_readonly(name, fget, return_value_policy::reference_internal, extra...);
+        return *this;
+    }
+
+    template <typename D, typename... Extra>
+    class_ &def_readwrite_static(const char *name, D *pm, const Extra& ...extra) {
+        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)),
+                     fset([pm](object, const D &value) { *pm = value; }, scope(*this));
+        def_property_static(name, fget, fset, return_value_policy::reference, extra...);
+        return *this;
+    }
+
+    template <typename D, typename... Extra>
+    class_ &def_readonly_static(const char *name, const D *pm, const Extra& ...extra) {
+        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this));
+        def_property_readonly_static(name, fget, return_value_policy::reference, extra...);
+        return *this;
+    }
+
+    /// Uses return_value_policy::reference_internal by default
+    template <typename Getter, typename... Extra>
+    class_ &def_property_readonly(const char *name, const Getter &fget, const Extra& ...extra) {
+        return def_property_readonly(name, cpp_function(method_adaptor<type>(fget)),
+                                     return_value_policy::reference_internal, extra...);
+    }
+
+    /// Uses cpp_function's return_value_policy by default
+    template <typename... Extra>
+    class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) {
+        return def_property(name, fget, nullptr, extra...);
+    }
+
+    /// Uses return_value_policy::reference by default
+    template <typename Getter, typename... Extra>
+    class_ &def_property_readonly_static(const char *name, const Getter &fget, const Extra& ...extra) {
+        return def_property_readonly_static(name, cpp_function(fget), return_value_policy::reference, extra...);
+    }
+
+    /// Uses cpp_function's return_value_policy by default
+    template <typename... Extra>
+    class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) {
+        return def_property_static(name, fget, nullptr, extra...);
+    }
+
+    /// Uses return_value_policy::reference_internal by default
+    template <typename Getter, typename Setter, typename... Extra>
+    class_ &def_property(const char *name, const Getter &fget, const Setter &fset, const Extra& ...extra) {
+        return def_property(name, fget, cpp_function(method_adaptor<type>(fset)), extra...);
+    }
+    template <typename Getter, typename... Extra>
+    class_ &def_property(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) {
+        return def_property(name, cpp_function(method_adaptor<type>(fget)), fset,
+                            return_value_policy::reference_internal, extra...);
+    }
+
+    /// Uses cpp_function's return_value_policy by default
+    template <typename... Extra>
+    class_ &def_property(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) {
+        return def_property_static(name, fget, fset, is_method(*this), extra...);
+    }
+
+    /// Uses return_value_policy::reference by default
+    template <typename Getter, typename... Extra>
+    class_ &def_property_static(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) {
+        return def_property_static(name, cpp_function(fget), fset, return_value_policy::reference, extra...);
+    }
+
+    /// Uses cpp_function's return_value_policy by default
+    template <typename... Extra>
+    class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) {
+        static_assert( 0 == detail::constexpr_sum(std::is_base_of<arg, Extra>::value...),
+                      "Argument annotations are not allowed for properties");
+        auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset);
+        auto *rec_active = rec_fget;
+        if (rec_fget) {
+           char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */
+           detail::process_attributes<Extra...>::init(extra..., rec_fget);
+           if (rec_fget->doc && rec_fget->doc != doc_prev) {
+              free(doc_prev);
+              rec_fget->doc = strdup(rec_fget->doc);
+           }
+        }
+        if (rec_fset) {
+            char *doc_prev = rec_fset->doc;
+            detail::process_attributes<Extra...>::init(extra..., rec_fset);
+            if (rec_fset->doc && rec_fset->doc != doc_prev) {
+                free(doc_prev);
+                rec_fset->doc = strdup(rec_fset->doc);
+            }
+            if (! rec_active) rec_active = rec_fset;
+        }
+        def_property_static_impl(name, fget, fset, rec_active);
+        return *this;
+    }
+
+private:
+    /// Initialize holder object, variant 1: object derives from enable_shared_from_this
+    template <typename T>
+    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+            const holder_type * /* unused */, const std::enable_shared_from_this<T> * /* dummy */) {
+        try {
+            auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
+                    v_h.value_ptr<type>()->shared_from_this());
+            if (sh) {
+                new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(sh));
+                v_h.set_holder_constructed();
+            }
+        } catch (const std::bad_weak_ptr &) {}
+
+        if (!v_h.holder_constructed() && inst->owned) {
+            new (std::addressof(v_h.holder<holder_type>())) holder_type(v_h.value_ptr<type>());
+            v_h.set_holder_constructed();
+        }
+    }
+
+    static void init_holder_from_existing(const detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, std::true_type /*is_copy_constructible*/) {
+        new (std::addressof(v_h.holder<holder_type>())) holder_type(*reinterpret_cast<const holder_type *>(holder_ptr));
+    }
+
+    static void init_holder_from_existing(const detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, std::false_type /*is_copy_constructible*/) {
+        new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(*const_cast<holder_type *>(holder_ptr)));
+    }
+
+    /// Initialize holder object, variant 2: try to construct from existing holder object, if possible
+    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, const void * /* dummy -- not enable_shared_from_this<T>) */) {
+        if (holder_ptr) {
+            init_holder_from_existing(v_h, holder_ptr, std::is_copy_constructible<holder_type>());
+            v_h.set_holder_constructed();
+        } else if (inst->owned || detail::always_construct_holder<holder_type>::value) {
+            new (std::addressof(v_h.holder<holder_type>())) holder_type(v_h.value_ptr<type>());
+            v_h.set_holder_constructed();
+        }
+    }
+
+    /// Performs instance initialization including constructing a holder and registering the known
+    /// instance.  Should be called as soon as the `type` value_ptr is set for an instance.  Takes an
+    /// optional pointer to an existing holder to use; if not specified and the instance is
+    /// `.owned`, a new holder will be constructed to manage the value pointer.
+    static void init_instance(detail::instance *inst, const void *holder_ptr) {
+        auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type)));
+        if (!v_h.instance_registered()) {
+            register_instance(inst, v_h.value_ptr(), v_h.type);
+            v_h.set_instance_registered();
+        }
+        init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr<type>());
+    }
+
+    /// Deallocates an instance; via holder, if constructed; otherwise via operator delete.
+    static void dealloc(detail::value_and_holder &v_h) {
+        // We could be deallocating because we are cleaning up after a Python exception.
+        // If so, the Python error indicator will be set. We need to clear that before
+        // running the destructor, in case the destructor code calls more Python.
+        // If we don't, the Python API will exit with an exception, and pybind11 will
+        // throw error_already_set from the C++ destructor which is forbidden and triggers
+        // std::terminate().
+        error_scope scope;
+        if (v_h.holder_constructed()) {
+            v_h.holder<holder_type>().~holder_type();
+            v_h.set_holder_constructed(false);
+        }
+        else {
+            detail::call_operator_delete(v_h.value_ptr<type>(),
+                v_h.type->type_size,
+                v_h.type->type_align
+            );
+        }
+        v_h.value_ptr() = nullptr;
+    }
+
+    static detail::function_record *get_function_record(handle h) {
+        h = detail::get_function(h);
+        return h ? (detail::function_record *) reinterpret_borrow<capsule>(PyCFunction_GET_SELF(h.ptr()))
+                 : nullptr;
+    }
+};
+
+/// Binds an existing constructor taking arguments Args...
+template <typename... Args> detail::initimpl::constructor<Args...> init() { return {}; }
+/// Like `init<Args...>()`, but the instance is always constructed through the alias class (even
+/// when not inheriting on the Python side).
+template <typename... Args> detail::initimpl::alias_constructor<Args...> init_alias() { return {}; }
+
+/// Binds a factory function as a constructor
+template <typename Func, typename Ret = detail::initimpl::factory<Func>>
+Ret init(Func &&f) { return {std::forward<Func>(f)}; }
+
+/// Dual-argument factory function: the first function is called when no alias is needed, the second
+/// when an alias is needed (i.e. due to python-side inheritance).  Arguments must be identical.
+template <typename CFunc, typename AFunc, typename Ret = detail::initimpl::factory<CFunc, AFunc>>
+Ret init(CFunc &&c, AFunc &&a) {
+    return {std::forward<CFunc>(c), std::forward<AFunc>(a)};
+}
+
+/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that the type
+/// returned by `__getstate__` is the same as the argument accepted by `__setstate__`.
+template <typename GetState, typename SetState>
+detail::initimpl::pickle_factory<GetState, SetState> pickle(GetState &&g, SetState &&s) {
+    return {std::forward<GetState>(g), std::forward<SetState>(s)};
+}
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+struct enum_base {
+    enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { }
+
+    PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) {
+        m_base.attr("__entries") = dict();
+        auto property = handle((PyObject *) &PyProperty_Type);
+        auto static_property = handle((PyObject *) get_internals().static_property_type);
+
+        m_base.attr("__repr__") = cpp_function(
+            [](handle arg) -> str {
+                handle type = arg.get_type();
+                object type_name = type.attr("__name__");
+                dict entries = type.attr("__entries");
+                for (const auto &kv : entries) {
+                    object other = kv.second[int_(0)];
+                    if (other.equal(arg))
+                        return pybind11::str("{}.{}").format(type_name, kv.first);
+                }
+                return pybind11::str("{}.???").format(type_name);
+            }, name("__repr__"), is_method(m_base)
+        );
+
+        m_base.attr("name") = property(cpp_function(
+            [](handle arg) -> str {
+                dict entries = arg.get_type().attr("__entries");
+                for (const auto &kv : entries) {
+                    if (handle(kv.second[int_(0)]).equal(arg))
+                        return pybind11::str(kv.first);
+                }
+                return "???";
+            }, name("name"), is_method(m_base)
+        ));
+
+        m_base.attr("__doc__") = static_property(cpp_function(
+            [](handle arg) -> std::string {
+                std::string docstring;
+                dict entries = arg.attr("__entries");
+                if (((PyTypeObject *) arg.ptr())->tp_doc)
+                    docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n";
+                docstring += "Members:";
+                for (const auto &kv : entries) {
+                    auto key = std::string(pybind11::str(kv.first));
+                    auto comment = kv.second[int_(1)];
+                    docstring += "\n\n  " + key;
+                    if (!comment.is_none())
+                        docstring += " : " + (std::string) pybind11::str(comment);
+                }
+                return docstring;
+            }, name("__doc__")
+        ), none(), none(), "");
+
+        m_base.attr("__members__") = static_property(cpp_function(
+            [](handle arg) -> dict {
+                dict entries = arg.attr("__entries"), m;
+                for (const auto &kv : entries)
+                    m[kv.first] = kv.second[int_(0)];
+                return m;
+            }, name("__members__")), none(), none(), ""
+        );
+
+        #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior)                     \
+            m_base.attr(op) = cpp_function(                                            \
+                [](object a, object b) {                                               \
+                    if (!a.get_type().is(b.get_type()))                                \
+                        strict_behavior;                                               \
+                    return expr;                                                       \
+                },                                                                     \
+                name(op), is_method(m_base))
+
+        #define PYBIND11_ENUM_OP_CONV(op, expr)                                        \
+            m_base.attr(op) = cpp_function(                                            \
+                [](object a_, object b_) {                                             \
+                    int_ a(a_), b(b_);                                                 \
+                    return expr;                                                       \
+                },                                                                     \
+                name(op), is_method(m_base))
+
+        #define PYBIND11_ENUM_OP_CONV_LHS(op, expr)                                    \
+            m_base.attr(op) = cpp_function(                                            \
+                [](object a_, object b) {                                              \
+                    int_ a(a_);                                                        \
+                    return expr;                                                       \
+                },                                                                     \
+                name(op), is_method(m_base))
+
+        if (is_convertible) {
+            PYBIND11_ENUM_OP_CONV_LHS("__eq__", !b.is_none() &&  a.equal(b));
+            PYBIND11_ENUM_OP_CONV_LHS("__ne__",  b.is_none() || !a.equal(b));
+
+            if (is_arithmetic) {
+                PYBIND11_ENUM_OP_CONV("__lt__",   a <  b);
+                PYBIND11_ENUM_OP_CONV("__gt__",   a >  b);
+                PYBIND11_ENUM_OP_CONV("__le__",   a <= b);
+                PYBIND11_ENUM_OP_CONV("__ge__",   a >= b);
+                PYBIND11_ENUM_OP_CONV("__and__",  a &  b);
+                PYBIND11_ENUM_OP_CONV("__rand__", a &  b);
+                PYBIND11_ENUM_OP_CONV("__or__",   a |  b);
+                PYBIND11_ENUM_OP_CONV("__ror__",  a |  b);
+                PYBIND11_ENUM_OP_CONV("__xor__",  a ^  b);
+                PYBIND11_ENUM_OP_CONV("__rxor__", a ^  b);
+                m_base.attr("__invert__") = cpp_function(
+                    [](object arg) { return ~(int_(arg)); }, name("__invert__"), is_method(m_base));
+            }
+        } else {
+            PYBIND11_ENUM_OP_STRICT("__eq__",  int_(a).equal(int_(b)), return false);
+            PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true);
+
+            if (is_arithmetic) {
+                #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!");
+                PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) <  int_(b), PYBIND11_THROW);
+                PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) >  int_(b), PYBIND11_THROW);
+                PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW);
+                PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW);
+                #undef PYBIND11_THROW
+            }
+        }
+
+        #undef PYBIND11_ENUM_OP_CONV_LHS
+        #undef PYBIND11_ENUM_OP_CONV
+        #undef PYBIND11_ENUM_OP_STRICT
+
+        m_base.attr("__getstate__") = cpp_function(
+            [](object arg) { return int_(arg); }, name("__getstate__"), is_method(m_base));
+
+        m_base.attr("__hash__") = cpp_function(
+            [](object arg) { return int_(arg); }, name("__hash__"), is_method(m_base));
+    }
+
+    PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) {
+        dict entries = m_base.attr("__entries");
+        str name(name_);
+        if (entries.contains(name)) {
+            std::string type_name = (std::string) str(m_base.attr("__name__"));
+            throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!");
+        }
+
+        entries[name] = std::make_pair(value, doc);
+        m_base.attr(name) = value;
+    }
+
+    PYBIND11_NOINLINE void export_values() {
+        dict entries = m_base.attr("__entries");
+        for (const auto &kv : entries)
+            m_parent.attr(kv.first) = kv.second[int_(0)];
+    }
+
+    handle m_base;
+    handle m_parent;
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// Binds C++ enumerations and enumeration classes to Python
+template <typename Type> class enum_ : public class_<Type> {
+public:
+    using Base = class_<Type>;
+    using Base::def;
+    using Base::attr;
+    using Base::def_property_readonly;
+    using Base::def_property_readonly_static;
+    using Scalar = typename std::underlying_type<Type>::type;
+
+    template <typename... Extra>
+    enum_(const handle &scope, const char *name, const Extra&... extra)
+      : class_<Type>(scope, name, extra...), m_base(*this, scope) {
+        constexpr bool is_arithmetic = detail::any_of<std::is_same<arithmetic, Extra>...>::value;
+        constexpr bool is_convertible = std::is_convertible<Type, Scalar>::value;
+        m_base.init(is_arithmetic, is_convertible);
+
+        def(init([](Scalar i) { return static_cast<Type>(i); }));
+        def("__int__", [](Type value) { return (Scalar) value; });
+        #if PY_MAJOR_VERSION < 3
+            def("__long__", [](Type value) { return (Scalar) value; });
+        #endif
+        #if PY_MAJOR_VERSION > 3 || (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 8)
+            def("__index__", [](Type value) { return (Scalar) value; });
+        #endif
+
+        attr("__setstate__") = cpp_function(
+            [](detail::value_and_holder &v_h, Scalar arg) {
+                detail::initimpl::setstate<Base>(v_h, static_cast<Type>(arg),
+                        Py_TYPE(v_h.inst) != v_h.type->type); },
+            detail::is_new_style_constructor(),
+            pybind11::name("__setstate__"), is_method(*this));
+    }
+
+    /// Export enumeration entries into the parent scope
+    enum_& export_values() {
+        m_base.export_values();
+        return *this;
+    }
+
+    /// Add an enumeration entry
+    enum_& value(char const* name, Type value, const char *doc = nullptr) {
+        m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc);
+        return *this;
+    }
+
+private:
+    detail::enum_base m_base;
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+
+inline void keep_alive_impl(handle nurse, handle patient) {
+    if (!nurse || !patient)
+        pybind11_fail("Could not activate keep_alive!");
+
+    if (patient.is_none() || nurse.is_none())
+        return; /* Nothing to keep alive or nothing to be kept alive by */
+
+    auto tinfo = all_type_info(Py_TYPE(nurse.ptr()));
+    if (!tinfo.empty()) {
+        /* It's a pybind-registered type, so we can store the patient in the
+         * internal list. */
+        add_patient(nurse.ptr(), patient.ptr());
+    }
+    else {
+        /* Fall back to clever approach based on weak references taken from
+         * Boost.Python. This is not used for pybind-registered types because
+         * the objects can be destroyed out-of-order in a GC pass. */
+        cpp_function disable_lifesupport(
+            [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); });
+
+        weakref wr(nurse, disable_lifesupport);
+
+        patient.inc_ref(); /* reference patient and leak the weak reference */
+        (void) wr.release();
+    }
+}
+
+PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
+    auto get_arg = [&](size_t n) {
+        if (n == 0)
+            return ret;
+        else if (n == 1 && call.init_self)
+            return call.init_self;
+        else if (n <= call.args.size())
+            return call.args[n - 1];
+        return handle();
+    };
+
+    keep_alive_impl(get_arg(Nurse), get_arg(Patient));
+}
+
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type) {
+    auto res = get_internals().registered_types_py
+#ifdef __cpp_lib_unordered_map_try_emplace
+        .try_emplace(type);
+#else
+        .emplace(type, std::vector<detail::type_info *>());
+#endif
+    if (res.second) {
+        // New cache entry created; set up a weak reference to automatically remove it if the type
+        // gets destroyed:
+        weakref((PyObject *) type, cpp_function([type](handle wr) {
+            get_internals().registered_types_py.erase(type);
+            wr.dec_ref();
+        })).release();
+    }
+
+    return res;
+}
+
+template <typename Iterator, typename Sentinel, bool KeyIterator, return_value_policy Policy>
+struct iterator_state {
+    Iterator it;
+    Sentinel end;
+    bool first_or_done;
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// Makes a python iterator from a first and past-the-end C++ InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename ValueType = decltype(*std::declval<Iterator>()),
+          typename... Extra>
+iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
+    typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
+
+    if (!detail::get_type_info(typeid(state), false)) {
+        class_<state>(handle(), "iterator", pybind11::module_local())
+            .def("__iter__", [](state &s) -> state& { return s; })
+            .def("__next__", [](state &s) -> ValueType {
+                if (!s.first_or_done)
+                    ++s.it;
+                else
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
+                    throw stop_iteration();
+                }
+                return *s.it;
+            }, std::forward<Extra>(extra)..., Policy);
+    }
+
+    return cast(state{first, last, true});
+}
+
+/// Makes an python iterator over the keys (`.first`) of a iterator over pairs from a
+/// first and past-the-end InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename KeyType = decltype((*std::declval<Iterator>()).first),
+          typename... Extra>
+iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) {
+    typedef detail::iterator_state<Iterator, Sentinel, true, Policy> state;
+
+    if (!detail::get_type_info(typeid(state), false)) {
+        class_<state>(handle(), "iterator", pybind11::module_local())
+            .def("__iter__", [](state &s) -> state& { return s; })
+            .def("__next__", [](state &s) -> KeyType {
+                if (!s.first_or_done)
+                    ++s.it;
+                else
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
+                    throw stop_iteration();
+                }
+                return (*s.it).first;
+            }, std::forward<Extra>(extra)..., Policy);
+    }
+
+    return cast(state{first, last, true});
+}
+
+/// Makes an iterator over values of an stl container or other container supporting
+/// `std::begin()`/`std::end()`
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Type, typename... Extra> iterator make_iterator(Type &value, Extra&&... extra) {
+    return make_iterator<Policy>(std::begin(value), std::end(value), extra...);
+}
+
+/// Makes an iterator over the keys (`.first`) of a stl map-like container supporting
+/// `std::begin()`/`std::end()`
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Type, typename... Extra> iterator make_key_iterator(Type &value, Extra&&... extra) {
+    return make_key_iterator<Policy>(std::begin(value), std::end(value), extra...);
+}
+
+template <typename InputType, typename OutputType> void implicitly_convertible() {
+    struct set_flag {
+        bool &flag;
+        set_flag(bool &flag) : flag(flag) { flag = true; }
+        ~set_flag() { flag = false; }
+    };
+    auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * {
+        static bool currently_used = false;
+        if (currently_used) // implicit conversions are non-reentrant
+            return nullptr;
+        set_flag flag_helper(currently_used);
+        if (!detail::make_caster<InputType>().load(obj, false))
+            return nullptr;
+        tuple args(1);
+        args[0] = obj;
+        PyObject *result = PyObject_Call((PyObject *) type, args.ptr(), nullptr);
+        if (result == nullptr)
+            PyErr_Clear();
+        return result;
+    };
+
+    if (auto tinfo = detail::get_type_info(typeid(OutputType)))
+        tinfo->implicit_conversions.push_back(implicit_caster);
+    else
+        pybind11_fail("implicitly_convertible: Unable to find type " + type_id<OutputType>());
+}
+
+template <typename ExceptionTranslator>
+void register_exception_translator(ExceptionTranslator&& translator) {
+    detail::get_internals().registered_exception_translators.push_front(
+        std::forward<ExceptionTranslator>(translator));
+}
+
+/**
+ * Wrapper to generate a new Python exception type.
+ *
+ * This should only be used with PyErr_SetString for now.
+ * It is not (yet) possible to use as a py::base.
+ * Template type argument is reserved for future use.
+ */
+template <typename type>
+class exception : public object {
+public:
+    exception() = default;
+    exception(handle scope, const char *name, PyObject *base = PyExc_Exception) {
+        std::string full_name = scope.attr("__name__").cast<std::string>() +
+                                std::string(".") + name;
+        m_ptr = PyErr_NewException(const_cast<char *>(full_name.c_str()), base, NULL);
+        if (hasattr(scope, name))
+            pybind11_fail("Error during initialization: multiple incompatible "
+                          "definitions with name \"" + std::string(name) + "\"");
+        scope.attr(name) = *this;
+    }
+
+    // Sets the current python exception to this exception object with the given message
+    void operator()(const char *message) {
+        PyErr_SetString(m_ptr, message);
+    }
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+// Returns a reference to a function-local static exception object used in the simple
+// register_exception approach below.  (It would be simpler to have the static local variable
+// directly in register_exception, but that makes clang <3.5 segfault - issue #1349).
+template <typename CppException>
+exception<CppException> &get_exception_object() { static exception<CppException> ex; return ex; }
+PYBIND11_NAMESPACE_END(detail)
+
+/**
+ * Registers a Python exception in `m` of the given `name` and installs an exception translator to
+ * translate the C++ exception to the created Python exception using the exceptions what() method.
+ * This is intended for simple exception translations; for more complex translation, register the
+ * exception object and translator directly.
+ */
+template <typename CppException>
+exception<CppException> &register_exception(handle scope,
+                                            const char *name,
+                                            PyObject *base = PyExc_Exception) {
+    auto &ex = detail::get_exception_object<CppException>();
+    if (!ex) ex = exception<CppException>(scope, name, base);
+
+    register_exception_translator([](std::exception_ptr p) {
+        if (!p) return;
+        try {
+            std::rethrow_exception(p);
+        } catch (const CppException &e) {
+            detail::get_exception_object<CppException>()(e.what());
+        }
+    });
+    return ex;
+}
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) {
+    auto strings = tuple(args.size());
+    for (size_t i = 0; i < args.size(); ++i) {
+        strings[i] = str(args[i]);
+    }
+    auto sep = kwargs.contains("sep") ? kwargs["sep"] : cast(" ");
+    auto line = sep.attr("join")(strings);
+
+    object file;
+    if (kwargs.contains("file")) {
+        file = kwargs["file"].cast<object>();
+    } else {
+        try {
+            file = module::import("sys").attr("stdout");
+        } catch (const error_already_set &) {
+            /* If print() is called from code that is executed as
+               part of garbage collection during interpreter shutdown,
+               importing 'sys' can fail. Give up rather than crashing the
+               interpreter in this case. */
+            return;
+        }
+    }
+
+    auto write = file.attr("write");
+    write(line);
+    write(kwargs.contains("end") ? kwargs["end"] : cast("\n"));
+
+    if (kwargs.contains("flush") && kwargs["flush"].cast<bool>())
+        file.attr("flush")();
+}
+PYBIND11_NAMESPACE_END(detail)
+
+template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
+void print(Args &&...args) {
+    auto c = detail::collect_arguments<policy>(std::forward<Args>(args)...);
+    detail::print(c.args(), c.kwargs());
+}
+
+#if defined(WITH_THREAD) && !defined(PYPY_VERSION)
+
+/* The functions below essentially reproduce the PyGILState_* API using a RAII
+ * pattern, but there are a few important differences:
+ *
+ * 1. When acquiring the GIL from an non-main thread during the finalization
+ *    phase, the GILState API blindly terminates the calling thread, which
+ *    is often not what is wanted. This API does not do this.
+ *
+ * 2. The gil_scoped_release function can optionally cut the relationship
+ *    of a PyThreadState and its associated thread, which allows moving it to
+ *    another thread (this is a fairly rare/advanced use case).
+ *
+ * 3. The reference count of an acquired thread state can be controlled. This
+ *    can be handy to prevent cases where callbacks issued from an external
+ *    thread would otherwise constantly construct and destroy thread state data
+ *    structures.
+ *
+ * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an
+ * example which uses features 2 and 3 to migrate the Python thread of
+ * execution to another thread (to run the event loop on the original thread,
+ * in this case).
+ */
+
+class gil_scoped_acquire {
+public:
+    PYBIND11_NOINLINE gil_scoped_acquire() {
+        auto const &internals = detail::get_internals();
+        tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);
+
+        if (!tstate) {
+            /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if
+               calling from a Python thread). Since we use a different key, this ensures
+               we don't create a new thread state and deadlock in PyEval_AcquireThread
+               below. Note we don't save this state with internals.tstate, since we don't
+               create it we would fail to clear it (its reference count should be > 0). */
+            tstate = PyGILState_GetThisThreadState();
+        }
+
+        if (!tstate) {
+            tstate = PyThreadState_New(internals.istate);
+            #if !defined(NDEBUG)
+                if (!tstate)
+                    pybind11_fail("scoped_acquire: could not create thread state!");
+            #endif
+            tstate->gilstate_counter = 0;
+            PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
+        } else {
+            release = detail::get_thread_state_unchecked() != tstate;
+        }
+
+        if (release) {
+            /* Work around an annoying assertion in PyThreadState_Swap */
+            #if defined(Py_DEBUG)
+                PyInterpreterState *interp = tstate->interp;
+                tstate->interp = nullptr;
+            #endif
+            PyEval_AcquireThread(tstate);
+            #if defined(Py_DEBUG)
+                tstate->interp = interp;
+            #endif
+        }
+
+        inc_ref();
+    }
+
+    void inc_ref() {
+        ++tstate->gilstate_counter;
+    }
+
+    PYBIND11_NOINLINE void dec_ref() {
+        --tstate->gilstate_counter;
+        #if !defined(NDEBUG)
+            if (detail::get_thread_state_unchecked() != tstate)
+                pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
+            if (tstate->gilstate_counter < 0)
+                pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
+        #endif
+        if (tstate->gilstate_counter == 0) {
+            #if !defined(NDEBUG)
+                if (!release)
+                    pybind11_fail("scoped_acquire::dec_ref(): internal error!");
+            #endif
+            PyThreadState_Clear(tstate);
+            PyThreadState_DeleteCurrent();
+            PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
+            release = false;
+        }
+    }
+
+    PYBIND11_NOINLINE ~gil_scoped_acquire() {
+        dec_ref();
+        if (release)
+           PyEval_SaveThread();
+    }
+private:
+    PyThreadState *tstate = nullptr;
+    bool release = true;
+};
+
+class gil_scoped_release {
+public:
+    explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+        // `get_internals()` must be called here unconditionally in order to initialize
+        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
+        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
+        const auto &internals = detail::get_internals();
+        tstate = PyEval_SaveThread();
+        if (disassoc) {
+            auto key = internals.tstate;
+            PYBIND11_TLS_DELETE_VALUE(key);
+        }
+    }
+    ~gil_scoped_release() {
+        if (!tstate)
+            return;
+        PyEval_RestoreThread(tstate);
+        if (disassoc) {
+            auto key = detail::get_internals().tstate;
+            PYBIND11_TLS_REPLACE_VALUE(key, tstate);
+        }
+    }
+private:
+    PyThreadState *tstate;
+    bool disassoc;
+};
+#elif defined(PYPY_VERSION)
+class gil_scoped_acquire {
+    PyGILState_STATE state;
+public:
+    gil_scoped_acquire() { state = PyGILState_Ensure(); }
+    ~gil_scoped_acquire() { PyGILState_Release(state); }
+};
+
+class gil_scoped_release {
+    PyThreadState *state;
+public:
+    gil_scoped_release() { state = PyEval_SaveThread(); }
+    ~gil_scoped_release() { PyEval_RestoreThread(state); }
+};
+#else
+class gil_scoped_acquire { };
+class gil_scoped_release { };
+#endif
+
+error_already_set::~error_already_set() {
+    if (m_type) {
+        gil_scoped_acquire gil;
+        error_scope scope;
+        m_type.release().dec_ref();
+        m_value.release().dec_ref();
+        m_trace.release().dec_ref();
+    }
+}
+
+inline function get_type_overload(const void *this_ptr, const detail::type_info *this_type, const char *name)  {
+    handle self = detail::get_object_handle(this_ptr, this_type);
+    if (!self)
+        return function();
+    handle type = self.get_type();
+    auto key = std::make_pair(type.ptr(), name);
+
+    /* Cache functions that aren't overloaded in Python to avoid
+       many costly Python dictionary lookups below */
+    auto &cache = detail::get_internals().inactive_overload_cache;
+    if (cache.find(key) != cache.end())
+        return function();
+
+    function overload = getattr(self, name, function());
+    if (overload.is_cpp_function()) {
+        cache.insert(key);
+        return function();
+    }
+
+    /* Don't call dispatch code if invoked from overridden function.
+       Unfortunately this doesn't work on PyPy. */
+#if !defined(PYPY_VERSION)
+    PyFrameObject *frame = PyThreadState_Get()->frame;
+    if (frame && (std::string) str(frame->f_code->co_name) == name &&
+        frame->f_code->co_argcount > 0) {
+        PyFrame_FastToLocals(frame);
+        PyObject *self_caller = PyDict_GetItem(
+            frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0));
+        if (self_caller == self.ptr())
+            return function();
+    }
+#else
+    /* PyPy currently doesn't provide a detailed cpyext emulation of
+       frame objects, so we have to emulate this using Python. This
+       is going to be slow..*/
+    dict d; d["self"] = self; d["name"] = pybind11::str(name);
+    PyObject *result = PyRun_String(
+        "import inspect\n"
+        "frame = inspect.currentframe()\n"
+        "if frame is not None:\n"
+        "    frame = frame.f_back\n"
+        "    if frame is not None and str(frame.f_code.co_name) == name and "
+        "frame.f_code.co_argcount > 0:\n"
+        "        self_caller = frame.f_locals[frame.f_code.co_varnames[0]]\n"
+        "        if self_caller == self:\n"
+        "            self = None\n",
+        Py_file_input, d.ptr(), d.ptr());
+    if (result == nullptr)
+        throw error_already_set();
+    if (d["self"].is_none())
+        return function();
+    Py_DECREF(result);
+#endif
+
+    return overload;
+}
+
+/** \rst
+  Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr.
+
+  :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first
+                   non-trampoline class encountered in the inheritance chain.
+  :name: The name of the overloaded Python method to retrieve.
+  :return: The Python method by this name from the object or an empty function wrapper.
+ \endrst */
+template <class T> function get_overload(const T *this_ptr, const char *name) {
+    auto tinfo = detail::get_type_info(typeid(T));
+    return tinfo ? get_type_overload(this_ptr, tinfo, name) : function();
+}
+
+#define PYBIND11_OVERLOAD_INT(ret_type, cname, name, ...) { \
+        pybind11::gil_scoped_acquire gil; \
+        pybind11::function overload = pybind11::get_overload(static_cast<const cname *>(this), name); \
+        if (overload) { \
+            auto o = overload(__VA_ARGS__); \
+            if (pybind11::detail::cast_is_temporary_value_reference<ret_type>::value) { \
+                static pybind11::detail::overload_caster_t<ret_type> caster; \
+                return pybind11::detail::cast_ref<ret_type>(std::move(o), caster); \
+            } \
+            else return pybind11::detail::cast_safe<ret_type>(std::move(o)); \
+        } \
+    }
+
+/** \rst
+    Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn'
+    from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return
+    the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method
+    name in C is not the same as the method name in Python. For example with `__str__`.
+
+    .. code-block:: cpp
+
+      std::string toString() override {
+        PYBIND11_OVERLOAD_NAME(
+            std::string, // Return type (ret_type)
+            Animal,      // Parent class (cname)
+            "__str__",   // Name of method in Python (name)
+            toString,    // Name of function in C++ (fn)
+        );
+      }
+\endrst */
+#define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \
+    PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \
+    return cname::fn(__VA_ARGS__)
+
+/** \rst
+    Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it
+    throws if no overload can be found.
+\endrst */
+#define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \
+    PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \
+    pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\"");
+
+/** \rst
+    Macro to populate the virtual method in the trampoline class. This macro tries to look up the method
+    from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return
+    the appropriate type. This macro should be used if the method name in C and in Python are identical.
+    See :ref:`overriding_virtuals` for more information.
+
+    .. code-block:: cpp
+
+      class PyAnimal : public Animal {
+      public:
+          // Inherit the constructors
+          using Animal::Animal;
+
+          // Trampoline (need one for each virtual function)
+          std::string go(int n_times) override {
+              PYBIND11_OVERLOAD_PURE(
+                  std::string, // Return type (ret_type)
+                  Animal,      // Parent class (cname)
+                  go,          // Name of function in C++ (must match Python name) (fn)
+                  n_times      // Argument(s) (...)
+              );
+          }
+      };
+\endrst */
+#define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \
+    PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__)
+
+/** \rst
+    Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws
+    if no overload can be found.
+\endrst */
+#define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) \
+    PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__)
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#  pragma warning(pop)
+#elif defined(__GNUG__) && !defined(__clang__)
+#  pragma GCC diagnostic pop
+#endif
diff --git a/pybind11/include/pybind11/pytypes.h b/pybind11/include/pybind11/pytypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..bea34cd9365c5191be29e986480c8434a8e0201e
--- /dev/null
+++ b/pybind11/include/pybind11/pytypes.h
@@ -0,0 +1,1608 @@
+/*
+    pybind11/pytypes.h: Convenience wrapper classes for basic Python types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+#include "buffer_info.h"
+#include <utility>
+#include <type_traits>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/* A few forward declarations */
+class handle; class object;
+class str; class iterator;
+struct arg; struct arg_v;
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+class args_proxy;
+inline bool isinstance_generic(handle obj, const std::type_info &tp);
+
+// Accessor forward declarations
+template <typename Policy> class accessor;
+namespace accessor_policies {
+    struct obj_attr;
+    struct str_attr;
+    struct generic_item;
+    struct sequence_item;
+    struct list_item;
+    struct tuple_item;
+}
+using obj_attr_accessor = accessor<accessor_policies::obj_attr>;
+using str_attr_accessor = accessor<accessor_policies::str_attr>;
+using item_accessor = accessor<accessor_policies::generic_item>;
+using sequence_accessor = accessor<accessor_policies::sequence_item>;
+using list_accessor = accessor<accessor_policies::list_item>;
+using tuple_accessor = accessor<accessor_policies::tuple_item>;
+
+/// Tag and check to identify a class which implements the Python object API
+class pyobject_tag { };
+template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, remove_reference_t<T>>;
+
+/** \rst
+    A mixin class which adds common functions to `handle`, `object` and various accessors.
+    The only requirement for `Derived` is to implement ``PyObject *Derived::ptr() const``.
+\endrst */
+template <typename Derived>
+class object_api : public pyobject_tag {
+    const Derived &derived() const { return static_cast<const Derived &>(*this); }
+
+public:
+    /** \rst
+        Return an iterator equivalent to calling ``iter()`` in Python. The object
+        must be a collection which supports the iteration protocol.
+    \endrst */
+    iterator begin() const;
+    /// Return a sentinel which ends iteration.
+    iterator end() const;
+
+    /** \rst
+        Return an internal functor to invoke the object's sequence protocol. Casting
+        the returned ``detail::item_accessor`` instance to a `handle` or `object`
+        subclass causes a corresponding call to ``__getitem__``. Assigning a `handle`
+        or `object` subclass causes a call to ``__setitem__``.
+    \endrst */
+    item_accessor operator[](handle key) const;
+    /// See above (the only difference is that they key is provided as a string literal)
+    item_accessor operator[](const char *key) const;
+
+    /** \rst
+        Return an internal functor to access the object's attributes. Casting the
+        returned ``detail::obj_attr_accessor`` instance to a `handle` or `object`
+        subclass causes a corresponding call to ``getattr``. Assigning a `handle`
+        or `object` subclass causes a call to ``setattr``.
+    \endrst */
+    obj_attr_accessor attr(handle key) const;
+    /// See above (the only difference is that they key is provided as a string literal)
+    str_attr_accessor attr(const char *key) const;
+
+    /** \rst
+        Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple``
+        or ``list`` for a function call. Applying another * to the result yields
+        ** unpacking, e.g. to unpack a dict as function keyword arguments.
+        See :ref:`calling_python_functions`.
+    \endrst */
+    args_proxy operator*() const;
+
+    /// Check if the given item is contained within this object, i.e. ``item in obj``.
+    template <typename T> bool contains(T &&item) const;
+
+    /** \rst
+        Assuming the Python object is a function or implements the ``__call__``
+        protocol, ``operator()`` invokes the underlying function, passing an
+        arbitrary set of parameters. The result is returned as a `object` and
+        may need to be converted back into a Python object using `handle::cast()`.
+
+        When some of the arguments cannot be converted to Python objects, the
+        function will throw a `cast_error` exception. When the Python function
+        call fails, a `error_already_set` exception is thrown.
+    \endrst */
+    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
+    object operator()(Args &&...args) const;
+    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
+    PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)")
+        object call(Args&&... args) const;
+
+    /// Equivalent to ``obj is other`` in Python.
+    bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); }
+    /// Equivalent to ``obj is None`` in Python.
+    bool is_none() const { return derived().ptr() == Py_None; }
+    /// Equivalent to obj == other in Python
+    bool equal(object_api const &other) const      { return rich_compare(other, Py_EQ); }
+    bool not_equal(object_api const &other) const  { return rich_compare(other, Py_NE); }
+    bool operator<(object_api const &other) const  { return rich_compare(other, Py_LT); }
+    bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); }
+    bool operator>(object_api const &other) const  { return rich_compare(other, Py_GT); }
+    bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); }
+
+    object operator-() const;
+    object operator~() const;
+    object operator+(object_api const &other) const;
+    object operator+=(object_api const &other) const;
+    object operator-(object_api const &other) const;
+    object operator-=(object_api const &other) const;
+    object operator*(object_api const &other) const;
+    object operator*=(object_api const &other) const;
+    object operator/(object_api const &other) const;
+    object operator/=(object_api const &other) const;
+    object operator|(object_api const &other) const;
+    object operator|=(object_api const &other) const;
+    object operator&(object_api const &other) const;
+    object operator&=(object_api const &other) const;
+    object operator^(object_api const &other) const;
+    object operator^=(object_api const &other) const;
+    object operator<<(object_api const &other) const;
+    object operator<<=(object_api const &other) const;
+    object operator>>(object_api const &other) const;
+    object operator>>=(object_api const &other) const;
+
+    PYBIND11_DEPRECATED("Use py::str(obj) instead")
+    pybind11::str str() const;
+
+    /// Get or set the object's docstring, i.e. ``obj.__doc__``.
+    str_attr_accessor doc() const;
+
+    /// Return the object's current reference count
+    int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
+    /// Return a handle to the Python type object underlying the instance
+    handle get_type() const;
+
+private:
+    bool rich_compare(object_api const &other, int value) const;
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/** \rst
+    Holds a reference to a Python object (no reference counting)
+
+    The `handle` class is a thin wrapper around an arbitrary Python object (i.e. a
+    ``PyObject *`` in Python's C API). It does not perform any automatic reference
+    counting and merely provides a basic C++ interface to various Python API functions.
+
+    .. seealso::
+        The `object` class inherits from `handle` and adds automatic reference
+        counting features.
+\endrst */
+class handle : public detail::object_api<handle> {
+public:
+    /// The default constructor creates a handle with a ``nullptr``-valued pointer
+    handle() = default;
+    /// Creates a ``handle`` from the given raw Python object pointer
+    handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
+
+    /// Return the underlying ``PyObject *`` pointer
+    PyObject *ptr() const { return m_ptr; }
+    PyObject *&ptr() { return m_ptr; }
+
+    /** \rst
+        Manually increase the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& inc_ref() const & { Py_XINCREF(m_ptr); return *this; }
+
+    /** \rst
+        Manually decrease the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& dec_ref() const & { Py_XDECREF(m_ptr); return *this; }
+
+    /** \rst
+        Attempt to cast the Python object into the given C++ type. A `cast_error`
+        will be throw upon failure.
+    \endrst */
+    template <typename T> T cast() const;
+    /// Return ``true`` when the `handle` wraps a valid Python object
+    explicit operator bool() const { return m_ptr != nullptr; }
+    /** \rst
+        Deprecated: Check that the underlying pointers are the same.
+        Equivalent to ``obj1 is obj2`` in Python.
+    \endrst */
+    PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
+    bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
+    PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
+    bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
+    PYBIND11_DEPRECATED("Use handle::operator bool() instead")
+    bool check() const { return m_ptr != nullptr; }
+protected:
+    PyObject *m_ptr = nullptr;
+};
+
+/** \rst
+    Holds a reference to a Python object (with reference counting)
+
+    Like `handle`, the `object` class is a thin wrapper around an arbitrary Python
+    object (i.e. a ``PyObject *`` in Python's C API). In contrast to `handle`, it
+    optionally increases the object's reference count upon construction, and it
+    *always* decreases the reference count when the `object` instance goes out of
+    scope and is destructed. When using `object` instances consistently, it is much
+    easier to get reference counting right at the first attempt.
+\endrst */
+class object : public handle {
+public:
+    object() = default;
+    PYBIND11_DEPRECATED("Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
+    object(handle h, bool is_borrowed) : handle(h) { if (is_borrowed) inc_ref(); }
+    /// Copy constructor; always increases the reference count
+    object(const object &o) : handle(o) { inc_ref(); }
+    /// Move constructor; steals the object from ``other`` and preserves its reference count
+    object(object &&other) noexcept { m_ptr = other.m_ptr; other.m_ptr = nullptr; }
+    /// Destructor; automatically calls `handle::dec_ref()`
+    ~object() { dec_ref(); }
+
+    /** \rst
+        Resets the internal pointer to ``nullptr`` without decreasing the
+        object's reference count. The function returns a raw handle to the original
+        Python object.
+    \endrst */
+    handle release() {
+      PyObject *tmp = m_ptr;
+      m_ptr = nullptr;
+      return handle(tmp);
+    }
+
+    object& operator=(const object &other) {
+        other.inc_ref();
+        dec_ref();
+        m_ptr = other.m_ptr;
+        return *this;
+    }
+
+    object& operator=(object &&other) noexcept {
+        if (this != &other) {
+            handle temp(m_ptr);
+            m_ptr = other.m_ptr;
+            other.m_ptr = nullptr;
+            temp.dec_ref();
+        }
+        return *this;
+    }
+
+    // Calling cast() on an object lvalue just copies (via handle::cast)
+    template <typename T> T cast() const &;
+    // Calling on an object rvalue does a move, if needed and/or possible
+    template <typename T> T cast() &&;
+
+protected:
+    // Tags for choosing constructors from raw PyObject *
+    struct borrowed_t { };
+    struct stolen_t { };
+
+    template <typename T> friend T reinterpret_borrow(handle);
+    template <typename T> friend T reinterpret_steal(handle);
+
+public:
+    // Only accessible from derived classes and the reinterpret_* functions
+    object(handle h, borrowed_t) : handle(h) { inc_ref(); }
+    object(handle h, stolen_t) : handle(h) { }
+};
+
+/** \rst
+    Declare that a `handle` or ``PyObject *`` is a certain type and borrow the reference.
+    The target type ``T`` must be `object` or one of its derived classes. The function
+    doesn't do any conversions or checks. It's up to the user to make sure that the
+    target type is correct.
+
+    .. code-block:: cpp
+
+        PyObject *p = PyList_GetItem(obj, index);
+        py::object o = reinterpret_borrow<py::object>(p);
+        // or
+        py::tuple t = reinterpret_borrow<py::tuple>(p); // <-- `p` must be already be a `tuple`
+\endrst */
+template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrowed_t{}}; }
+
+/** \rst
+    Like `reinterpret_borrow`, but steals the reference.
+
+     .. code-block:: cpp
+
+        PyObject *p = PyObject_Str(obj);
+        py::str s = reinterpret_steal<py::str>(p); // <-- `p` must be already be a `str`
+\endrst */
+template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; }
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+inline std::string error_string();
+PYBIND11_NAMESPACE_END(detail)
+
+/// Fetch and hold an error which was already set in Python.  An instance of this is typically
+/// thrown to propagate python-side errors back through C++ which can either be caught manually or
+/// else falls back to the function dispatcher (which then raises the captured error back to
+/// python).
+class error_already_set : public std::runtime_error {
+public:
+    /// Constructs a new exception from the current Python error indicator, if any.  The current
+    /// Python error indicator will be cleared.
+    error_already_set() : std::runtime_error(detail::error_string()) {
+        PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr());
+    }
+
+    error_already_set(const error_already_set &) = default;
+    error_already_set(error_already_set &&) = default;
+
+    inline ~error_already_set();
+
+    /// Give the currently-held error back to Python, if any.  If there is currently a Python error
+    /// already set it is cleared first.  After this call, the current object no longer stores the
+    /// error variables (but the `.what()` string is still available).
+    void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); }
+
+    /// If it is impossible to raise the currently-held error, such as in destructor, we can write
+    /// it out using Python's unraisable hook (sys.unraisablehook). The error context should be
+    /// some object whose repr() helps identify the location of the error. Python already knows the
+    /// type and value of the error, so there is no need to repeat that. For example, __func__ could
+    /// be helpful. After this call, the current object no longer stores the error variables,
+    /// and neither does Python.
+    void discard_as_unraisable(object err_context) {
+        restore();
+        PyErr_WriteUnraisable(err_context.ptr());
+    }
+    void discard_as_unraisable(const char *err_context) {
+        discard_as_unraisable(reinterpret_steal<object>(PYBIND11_FROM_STRING(err_context)));
+    }
+
+    // Does nothing; provided for backwards compatibility.
+    PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated")
+    void clear() {}
+
+    /// Check if the currently trapped error type matches the given Python exception class (or a
+    /// subclass thereof).  May also be passed a tuple to search for any exception class matches in
+    /// the given tuple.
+    bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); }
+
+    const object& type() const { return m_type; }
+    const object& value() const { return m_value; }
+    const object& trace() const { return m_trace; }
+
+private:
+    object m_type, m_value, m_trace;
+};
+
+/** \defgroup python_builtins _
+    Unless stated otherwise, the following C++ functions behave the same
+    as their Python counterparts.
+ */
+
+/** \ingroup python_builtins
+    \rst
+    Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a subclass of
+    `object` or a class which was exposed to Python as ``py::class_<T>``.
+\endrst */
+template <typename T, detail::enable_if_t<std::is_base_of<object, T>::value, int> = 0>
+bool isinstance(handle obj) { return T::check_(obj); }
+
+template <typename T, detail::enable_if_t<!std::is_base_of<object, T>::value, int> = 0>
+bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T)); }
+
+template <> inline bool isinstance<handle>(handle) = delete;
+template <> inline bool isinstance<object>(handle obj) { return obj.ptr() != nullptr; }
+
+/// \ingroup python_builtins
+/// Return true if ``obj`` is an instance of the ``type``.
+inline bool isinstance(handle obj, handle type) {
+    const auto result = PyObject_IsInstance(obj.ptr(), type.ptr());
+    if (result == -1)
+        throw error_already_set();
+    return result != 0;
+}
+
+/// \addtogroup python_builtins
+/// @{
+inline bool hasattr(handle obj, handle name) {
+    return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1;
+}
+
+inline bool hasattr(handle obj, const char *name) {
+    return PyObject_HasAttrString(obj.ptr(), name) == 1;
+}
+
+inline void delattr(handle obj, handle name) {
+    if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); }
+}
+
+inline void delattr(handle obj, const char *name) {
+    if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); }
+}
+
+inline object getattr(handle obj, handle name) {
+    PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr());
+    if (!result) { throw error_already_set(); }
+    return reinterpret_steal<object>(result);
+}
+
+inline object getattr(handle obj, const char *name) {
+    PyObject *result = PyObject_GetAttrString(obj.ptr(), name);
+    if (!result) { throw error_already_set(); }
+    return reinterpret_steal<object>(result);
+}
+
+inline object getattr(handle obj, handle name, handle default_) {
+    if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) {
+        return reinterpret_steal<object>(result);
+    } else {
+        PyErr_Clear();
+        return reinterpret_borrow<object>(default_);
+    }
+}
+
+inline object getattr(handle obj, const char *name, handle default_) {
+    if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) {
+        return reinterpret_steal<object>(result);
+    } else {
+        PyErr_Clear();
+        return reinterpret_borrow<object>(default_);
+    }
+}
+
+inline void setattr(handle obj, handle name, handle value) {
+    if (PyObject_SetAttr(obj.ptr(), name.ptr(), value.ptr()) != 0) { throw error_already_set(); }
+}
+
+inline void setattr(handle obj, const char *name, handle value) {
+    if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) { throw error_already_set(); }
+}
+
+inline ssize_t hash(handle obj) {
+    auto h = PyObject_Hash(obj.ptr());
+    if (h == -1) { throw error_already_set(); }
+    return h;
+}
+
+/// @} python_builtins
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+inline handle get_function(handle value) {
+    if (value) {
+#if PY_MAJOR_VERSION >= 3
+        if (PyInstanceMethod_Check(value.ptr()))
+            value = PyInstanceMethod_GET_FUNCTION(value.ptr());
+        else
+#endif
+        if (PyMethod_Check(value.ptr()))
+            value = PyMethod_GET_FUNCTION(value.ptr());
+    }
+    return value;
+}
+
+// Helper aliases/functions to support implicit casting of values given to python accessors/methods.
+// When given a pyobject, this simply returns the pyobject as-is; for other C++ type, the value goes
+// through pybind11::cast(obj) to convert it to an `object`.
+template <typename T, enable_if_t<is_pyobject<T>::value, int> = 0>
+auto object_or_cast(T &&o) -> decltype(std::forward<T>(o)) { return std::forward<T>(o); }
+// The following casting version is implemented in cast.h:
+template <typename T, enable_if_t<!is_pyobject<T>::value, int> = 0>
+object object_or_cast(T &&o);
+// Match a PyObject*, which we want to convert directly to handle via its converting constructor
+inline handle object_or_cast(PyObject *ptr) { return ptr; }
+
+template <typename Policy>
+class accessor : public object_api<accessor<Policy>> {
+    using key_type = typename Policy::key_type;
+
+public:
+    accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { }
+    accessor(const accessor &) = default;
+    accessor(accessor &&) = default;
+
+    // accessor overload required to override default assignment operator (templates are not allowed
+    // to replace default compiler-generated assignments).
+    void operator=(const accessor &a) && { std::move(*this).operator=(handle(a)); }
+    void operator=(const accessor &a) & { operator=(handle(a)); }
+
+    template <typename T> void operator=(T &&value) && {
+        Policy::set(obj, key, object_or_cast(std::forward<T>(value)));
+    }
+    template <typename T> void operator=(T &&value) & {
+        get_cache() = reinterpret_borrow<object>(object_or_cast(std::forward<T>(value)));
+    }
+
+    template <typename T = Policy>
+    PYBIND11_DEPRECATED("Use of obj.attr(...) as bool is deprecated in favor of pybind11::hasattr(obj, ...)")
+    explicit operator enable_if_t<std::is_same<T, accessor_policies::str_attr>::value ||
+            std::is_same<T, accessor_policies::obj_attr>::value, bool>() const {
+        return hasattr(obj, key);
+    }
+    template <typename T = Policy>
+    PYBIND11_DEPRECATED("Use of obj[key] as bool is deprecated in favor of obj.contains(key)")
+    explicit operator enable_if_t<std::is_same<T, accessor_policies::generic_item>::value, bool>() const {
+        return obj.contains(key);
+    }
+
+    operator object() const { return get_cache(); }
+    PyObject *ptr() const { return get_cache().ptr(); }
+    template <typename T> T cast() const { return get_cache().template cast<T>(); }
+
+private:
+    object &get_cache() const {
+        if (!cache) { cache = Policy::get(obj, key); }
+        return cache;
+    }
+
+private:
+    handle obj;
+    key_type key;
+    mutable object cache;
+};
+
+PYBIND11_NAMESPACE_BEGIN(accessor_policies)
+struct obj_attr {
+    using key_type = object;
+    static object get(handle obj, handle key) { return getattr(obj, key); }
+    static void set(handle obj, handle key, handle val) { setattr(obj, key, val); }
+};
+
+struct str_attr {
+    using key_type = const char *;
+    static object get(handle obj, const char *key) { return getattr(obj, key); }
+    static void set(handle obj, const char *key, handle val) { setattr(obj, key, val); }
+};
+
+struct generic_item {
+    using key_type = object;
+
+    static object get(handle obj, handle key) {
+        PyObject *result = PyObject_GetItem(obj.ptr(), key.ptr());
+        if (!result) { throw error_already_set(); }
+        return reinterpret_steal<object>(result);
+    }
+
+    static void set(handle obj, handle key, handle val) {
+        if (PyObject_SetItem(obj.ptr(), key.ptr(), val.ptr()) != 0) { throw error_already_set(); }
+    }
+};
+
+struct sequence_item {
+    using key_type = size_t;
+
+    static object get(handle obj, size_t index) {
+        PyObject *result = PySequence_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+        if (!result) { throw error_already_set(); }
+        return reinterpret_steal<object>(result);
+    }
+
+    static void set(handle obj, size_t index, handle val) {
+        // PySequence_SetItem does not steal a reference to 'val'
+        if (PySequence_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.ptr()) != 0) {
+            throw error_already_set();
+        }
+    }
+};
+
+struct list_item {
+    using key_type = size_t;
+
+    static object get(handle obj, size_t index) {
+        PyObject *result = PyList_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+        if (!result) { throw error_already_set(); }
+        return reinterpret_borrow<object>(result);
+    }
+
+    static void set(handle obj, size_t index, handle val) {
+        // PyList_SetItem steals a reference to 'val'
+        if (PyList_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
+            throw error_already_set();
+        }
+    }
+};
+
+struct tuple_item {
+    using key_type = size_t;
+
+    static object get(handle obj, size_t index) {
+        PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+        if (!result) { throw error_already_set(); }
+        return reinterpret_borrow<object>(result);
+    }
+
+    static void set(handle obj, size_t index, handle val) {
+        // PyTuple_SetItem steals a reference to 'val'
+        if (PyTuple_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
+            throw error_already_set();
+        }
+    }
+};
+PYBIND11_NAMESPACE_END(accessor_policies)
+
+/// STL iterator template used for tuple, list, sequence and dict
+template <typename Policy>
+class generic_iterator : public Policy {
+    using It = generic_iterator;
+
+public:
+    using difference_type = ssize_t;
+    using iterator_category = typename Policy::iterator_category;
+    using value_type = typename Policy::value_type;
+    using reference = typename Policy::reference;
+    using pointer = typename Policy::pointer;
+
+    generic_iterator() = default;
+    generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { }
+
+    reference operator*() const { return Policy::dereference(); }
+    reference operator[](difference_type n) const { return *(*this + n); }
+    pointer operator->() const { return **this; }
+
+    It &operator++() { Policy::increment(); return *this; }
+    It operator++(int) { auto copy = *this; Policy::increment(); return copy; }
+    It &operator--() { Policy::decrement(); return *this; }
+    It operator--(int) { auto copy = *this; Policy::decrement(); return copy; }
+    It &operator+=(difference_type n) { Policy::advance(n); return *this; }
+    It &operator-=(difference_type n) { Policy::advance(-n); return *this; }
+
+    friend It operator+(const It &a, difference_type n) { auto copy = a; return copy += n; }
+    friend It operator+(difference_type n, const It &b) { return b + n; }
+    friend It operator-(const It &a, difference_type n) { auto copy = a; return copy -= n; }
+    friend difference_type operator-(const It &a, const It &b) { return a.distance_to(b); }
+
+    friend bool operator==(const It &a, const It &b) { return a.equal(b); }
+    friend bool operator!=(const It &a, const It &b) { return !(a == b); }
+    friend bool operator< (const It &a, const It &b) { return b - a > 0; }
+    friend bool operator> (const It &a, const It &b) { return b < a; }
+    friend bool operator>=(const It &a, const It &b) { return !(a < b); }
+    friend bool operator<=(const It &a, const It &b) { return !(a > b); }
+};
+
+PYBIND11_NAMESPACE_BEGIN(iterator_policies)
+/// Quick proxy class needed to implement ``operator->`` for iterators which can't return pointers
+template <typename T>
+struct arrow_proxy {
+    T value;
+
+    arrow_proxy(T &&value) : value(std::move(value)) { }
+    T *operator->() const { return &value; }
+};
+
+/// Lightweight iterator policy using just a simple pointer: see ``PySequence_Fast_ITEMS``
+class sequence_fast_readonly {
+protected:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = handle;
+    using reference = const handle;
+    using pointer = arrow_proxy<const handle>;
+
+    sequence_fast_readonly(handle obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { }
+
+    reference dereference() const { return *ptr; }
+    void increment() { ++ptr; }
+    void decrement() { --ptr; }
+    void advance(ssize_t n) { ptr += n; }
+    bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; }
+    ssize_t distance_to(const sequence_fast_readonly &b) const { return ptr - b.ptr; }
+
+private:
+    PyObject **ptr;
+};
+
+/// Full read and write access using the sequence protocol: see ``detail::sequence_accessor``
+class sequence_slow_readwrite {
+protected:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = object;
+    using reference = sequence_accessor;
+    using pointer = arrow_proxy<const sequence_accessor>;
+
+    sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) { }
+
+    reference dereference() const { return {obj, static_cast<size_t>(index)}; }
+    void increment() { ++index; }
+    void decrement() { --index; }
+    void advance(ssize_t n) { index += n; }
+    bool equal(const sequence_slow_readwrite &b) const { return index == b.index; }
+    ssize_t distance_to(const sequence_slow_readwrite &b) const { return index - b.index; }
+
+private:
+    handle obj;
+    ssize_t index;
+};
+
+/// Python's dictionary protocol permits this to be a forward iterator
+class dict_readonly {
+protected:
+    using iterator_category = std::forward_iterator_tag;
+    using value_type = std::pair<handle, handle>;
+    using reference = const value_type;
+    using pointer = arrow_proxy<const value_type>;
+
+    dict_readonly() = default;
+    dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); }
+
+    reference dereference() const { return {key, value}; }
+    void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } }
+    bool equal(const dict_readonly &b) const { return pos == b.pos; }
+
+private:
+    handle obj;
+    PyObject *key = nullptr, *value = nullptr;
+    ssize_t pos = -1;
+};
+PYBIND11_NAMESPACE_END(iterator_policies)
+
+#if !defined(PYPY_VERSION)
+using tuple_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
+using list_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
+#else
+using tuple_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using list_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+#endif
+
+using sequence_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using dict_iterator = generic_iterator<iterator_policies::dict_readonly>;
+
+inline bool PyIterable_Check(PyObject *obj) {
+    PyObject *iter = PyObject_GetIter(obj);
+    if (iter) {
+        Py_DECREF(iter);
+        return true;
+    } else {
+        PyErr_Clear();
+        return false;
+    }
+}
+
+inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
+inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
+
+inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
+
+inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
+
+class kwargs_proxy : public handle {
+public:
+    explicit kwargs_proxy(handle h) : handle(h) { }
+};
+
+class args_proxy : public handle {
+public:
+    explicit args_proxy(handle h) : handle(h) { }
+    kwargs_proxy operator*() const { return kwargs_proxy(*this); }
+};
+
+/// Python argument categories (using PEP 448 terms)
+template <typename T> using is_keyword = std::is_base_of<arg, T>;
+template <typename T> using is_s_unpacking = std::is_same<args_proxy, T>; // * unpacking
+template <typename T> using is_ds_unpacking = std::is_same<kwargs_proxy, T>; // ** unpacking
+template <typename T> using is_positional = satisfies_none_of<T,
+    is_keyword, is_s_unpacking, is_ds_unpacking
+>;
+template <typename T> using is_keyword_or_ds = satisfies_any_of<T, is_keyword, is_ds_unpacking>;
+
+// Call argument collector forward declarations
+template <return_value_policy policy = return_value_policy::automatic_reference>
+class simple_collector;
+template <return_value_policy policy = return_value_policy::automatic_reference>
+class unpacking_collector;
+
+PYBIND11_NAMESPACE_END(detail)
+
+// TODO: After the deprecated constructors are removed, this macro can be simplified by
+//       inheriting ctors: `using Parent::Parent`. It's not an option right now because
+//       the `using` statement triggers the parent deprecation warning even if the ctor
+//       isn't even used.
+#define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    public: \
+        PYBIND11_DEPRECATED("Use reinterpret_borrow<"#Name">() or reinterpret_steal<"#Name">()") \
+        Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed_t{}) : Parent(h, stolen_t{})) { } \
+        Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \
+        Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \
+        PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead") \
+        bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \
+        static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); } \
+        template <typename Policy_> \
+        Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) { }
+
+#define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
+    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
+    Name(const object &o) \
+    : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); } \
+    Name(object &&o) \
+    : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); }
+
+#define PYBIND11_OBJECT(Name, Parent, CheckFun) \
+    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
+    Name(const object &o) : Parent(o) { } \
+    Name(object &&o) : Parent(std::move(o)) { }
+
+#define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun) \
+    PYBIND11_OBJECT(Name, Parent, CheckFun) \
+    Name() : Parent() { }
+
+/// \addtogroup pytypes
+/// @{
+
+/** \rst
+    Wraps a Python iterator so that it can also be used as a C++ input iterator
+
+    Caveat: copying an iterator does not (and cannot) clone the internal
+    state of the Python iterable. This also applies to the post-increment
+    operator. This iterator should only be used to retrieve the current
+    value using ``operator*()``.
+\endrst */
+class iterator : public object {
+public:
+    using iterator_category = std::input_iterator_tag;
+    using difference_type = ssize_t;
+    using value_type = handle;
+    using reference = const handle;
+    using pointer = const handle *;
+
+    PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
+
+    iterator& operator++() {
+        advance();
+        return *this;
+    }
+
+    iterator operator++(int) {
+        auto rv = *this;
+        advance();
+        return rv;
+    }
+
+    reference operator*() const {
+        if (m_ptr && !value.ptr()) {
+            auto& self = const_cast<iterator &>(*this);
+            self.advance();
+        }
+        return value;
+    }
+
+    pointer operator->() const { operator*(); return &value; }
+
+    /** \rst
+         The value which marks the end of the iteration. ``it == iterator::sentinel()``
+         is equivalent to catching ``StopIteration`` in Python.
+
+         .. code-block:: cpp
+
+             void foo(py::iterator it) {
+                 while (it != py::iterator::sentinel()) {
+                    // use `*it`
+                    ++it;
+                 }
+             }
+    \endrst */
+    static iterator sentinel() { return {}; }
+
+    friend bool operator==(const iterator &a, const iterator &b) { return a->ptr() == b->ptr(); }
+    friend bool operator!=(const iterator &a, const iterator &b) { return a->ptr() != b->ptr(); }
+
+private:
+    void advance() {
+        value = reinterpret_steal<object>(PyIter_Next(m_ptr));
+        if (PyErr_Occurred()) { throw error_already_set(); }
+    }
+
+private:
+    object value = {};
+};
+
+class iterable : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(iterable, object, detail::PyIterable_Check)
+};
+
+class bytes;
+
+class str : public object {
+public:
+    PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
+
+    str(const char *c, size_t n)
+        : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate string object!");
+    }
+
+    // 'explicit' is explicitly omitted from the following constructors to allow implicit conversion to py::str from C++ string-like objects
+    str(const char *c = "")
+        : object(PyUnicode_FromString(c), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate string object!");
+    }
+
+    str(const std::string &s) : str(s.data(), s.size()) { }
+
+    explicit str(const bytes &b);
+
+    /** \rst
+        Return a string representation of the object. This is analogous to
+        the ``str()`` function in Python.
+    \endrst */
+    explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) { }
+
+    operator std::string() const {
+        object temp = *this;
+        if (PyUnicode_Check(m_ptr)) {
+            temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(m_ptr));
+            if (!temp)
+                pybind11_fail("Unable to extract string contents! (encoding issue)");
+        }
+        char *buffer;
+        ssize_t length;
+        if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
+            pybind11_fail("Unable to extract string contents! (invalid type)");
+        return std::string(buffer, (size_t) length);
+    }
+
+    template <typename... Args>
+    str format(Args &&...args) const {
+        return attr("format")(std::forward<Args>(args)...);
+    }
+
+private:
+    /// Return string representation -- always returns a new reference, even if already a str
+    static PyObject *raw_str(PyObject *op) {
+        PyObject *str_value = PyObject_Str(op);
+        if (!str_value) throw error_already_set();
+#if PY_MAJOR_VERSION < 3
+        PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
+        Py_XDECREF(str_value); str_value = unicode;
+#endif
+        return str_value;
+    }
+};
+/// @} pytypes
+
+inline namespace literals {
+/** \rst
+    String literal version of `str`
+ \endrst */
+inline str operator"" _s(const char *s, size_t size) { return {s, size}; }
+}
+
+/// \addtogroup pytypes
+/// @{
+class bytes : public object {
+public:
+    PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK)
+
+    // Allow implicit conversion:
+    bytes(const char *c = "")
+        : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
+    }
+
+    bytes(const char *c, size_t n)
+        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
+    }
+
+    // Allow implicit conversion:
+    bytes(const std::string &s) : bytes(s.data(), s.size()) { }
+
+    explicit bytes(const pybind11::str &s);
+
+    operator std::string() const {
+        char *buffer;
+        ssize_t length;
+        if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length))
+            pybind11_fail("Unable to extract bytes contents!");
+        return std::string(buffer, (size_t) length);
+    }
+};
+// Note: breathe >= 4.17.0 will fail to build docs if the below two constructors
+// are included in the doxygen group; close here and reopen after as a workaround
+/// @} pytypes
+
+inline bytes::bytes(const pybind11::str &s) {
+    object temp = s;
+    if (PyUnicode_Check(s.ptr())) {
+        temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(s.ptr()));
+        if (!temp)
+            pybind11_fail("Unable to extract string contents! (encoding issue)");
+    }
+    char *buffer;
+    ssize_t length;
+    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
+        pybind11_fail("Unable to extract string contents! (invalid type)");
+    auto obj = reinterpret_steal<object>(PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length));
+    if (!obj)
+        pybind11_fail("Could not allocate bytes object!");
+    m_ptr = obj.release().ptr();
+}
+
+inline str::str(const bytes& b) {
+    char *buffer;
+    ssize_t length;
+    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length))
+        pybind11_fail("Unable to extract bytes contents!");
+    auto obj = reinterpret_steal<object>(PyUnicode_FromStringAndSize(buffer, (ssize_t) length));
+    if (!obj)
+        pybind11_fail("Could not allocate string object!");
+    m_ptr = obj.release().ptr();
+}
+
+/// \addtogroup pytypes
+/// @{
+class none : public object {
+public:
+    PYBIND11_OBJECT(none, object, detail::PyNone_Check)
+    none() : object(Py_None, borrowed_t{}) { }
+};
+
+class ellipsis : public object {
+public:
+    PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check)
+    ellipsis() : object(Py_Ellipsis, borrowed_t{}) { }
+};
+
+class bool_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
+    bool_() : object(Py_False, borrowed_t{}) { }
+    // Allow implicit conversion from and to `bool`:
+    bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) { }
+    operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
+
+private:
+    /// Return the truth value of an object -- always returns a new reference
+    static PyObject *raw_bool(PyObject *op) {
+        const auto value = PyObject_IsTrue(op);
+        if (value == -1) return nullptr;
+        return handle(value ? Py_True : Py_False).inc_ref().ptr();
+    }
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+// Converts a value to the given unsigned type.  If an error occurs, you get back (Unsigned) -1;
+// otherwise you get back the unsigned long or unsigned long long value cast to (Unsigned).
+// (The distinction is critically important when casting a returned -1 error value to some other
+// unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes).
+template <typename Unsigned>
+Unsigned as_unsigned(PyObject *o) {
+    if (sizeof(Unsigned) <= sizeof(unsigned long)
+#if PY_VERSION_HEX < 0x03000000
+            || PyInt_Check(o)
+#endif
+    ) {
+        unsigned long v = PyLong_AsUnsignedLong(o);
+        return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+    else {
+        unsigned long long v = PyLong_AsUnsignedLongLong(o);
+        return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+}
+PYBIND11_NAMESPACE_END(detail)
+
+class int_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long)
+    int_() : object(PyLong_FromLong(0), stolen_t{}) { }
+    // Allow implicit conversion from C++ integral types:
+    template <typename T,
+              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    int_(T value) {
+        if (sizeof(T) <= sizeof(long)) {
+            if (std::is_signed<T>::value)
+                m_ptr = PyLong_FromLong((long) value);
+            else
+                m_ptr = PyLong_FromUnsignedLong((unsigned long) value);
+        } else {
+            if (std::is_signed<T>::value)
+                m_ptr = PyLong_FromLongLong((long long) value);
+            else
+                m_ptr = PyLong_FromUnsignedLongLong((unsigned long long) value);
+        }
+        if (!m_ptr) pybind11_fail("Could not allocate int object!");
+    }
+
+    template <typename T,
+              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    operator T() const {
+        return std::is_unsigned<T>::value
+            ? detail::as_unsigned<T>(m_ptr)
+            : sizeof(T) <= sizeof(long)
+              ? (T) PyLong_AsLong(m_ptr)
+              : (T) PYBIND11_LONG_AS_LONGLONG(m_ptr);
+    }
+};
+
+class float_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
+    // Allow implicit conversion from float/double:
+    float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    }
+    float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    }
+    operator float() const { return (float) PyFloat_AsDouble(m_ptr); }
+    operator double() const { return (double) PyFloat_AsDouble(m_ptr); }
+};
+
+class weakref : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
+    explicit weakref(handle obj, handle callback = {})
+        : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate weak reference!");
+    }
+};
+
+class slice : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check)
+    slice(ssize_t start_, ssize_t stop_, ssize_t step_) {
+        int_ start(start_), stop(stop_), step(step_);
+        m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr());
+        if (!m_ptr) pybind11_fail("Could not allocate slice object!");
+    }
+    bool compute(size_t length, size_t *start, size_t *stop, size_t *step,
+                 size_t *slicelength) const {
+        return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
+                                    (ssize_t) length, (ssize_t *) start,
+                                    (ssize_t *) stop, (ssize_t *) step,
+                                    (ssize_t *) slicelength) == 0;
+    }
+    bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step,
+      ssize_t *slicelength) const {
+      return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
+          length, start,
+          stop, step,
+          slicelength) == 0;
+    }
+};
+
+class capsule : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact)
+    PYBIND11_DEPRECATED("Use reinterpret_borrow<capsule>() or reinterpret_steal<capsule>()")
+    capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? object(ptr, borrowed_t{}) : object(ptr, stolen_t{})) { }
+
+    explicit capsule(const void *value, const char *name = nullptr, void (*destructor)(PyObject *) = nullptr)
+        : object(PyCapsule_New(const_cast<void *>(value), name, destructor), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    PYBIND11_DEPRECATED("Please pass a destructor that takes a void pointer as input")
+    capsule(const void *value, void (*destruct)(PyObject *))
+        : object(PyCapsule_New(const_cast<void*>(value), nullptr, destruct), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    capsule(const void *value, void (*destructor)(void *)) {
+        m_ptr = PyCapsule_New(const_cast<void *>(value), nullptr, [](PyObject *o) {
+            auto destructor = reinterpret_cast<void (*)(void *)>(PyCapsule_GetContext(o));
+            void *ptr = PyCapsule_GetPointer(o, nullptr);
+            destructor(ptr);
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+
+        if (PyCapsule_SetContext(m_ptr, (void *) destructor) != 0)
+            pybind11_fail("Could not set capsule context!");
+    }
+
+    capsule(void (*destructor)()) {
+        m_ptr = PyCapsule_New(reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) {
+            auto destructor = reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr));
+            destructor();
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    template <typename T> operator T *() const {
+        auto name = this->name();
+        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
+        if (!result) pybind11_fail("Unable to extract capsule contents!");
+        return result;
+    }
+
+    const char *name() const { return PyCapsule_GetName(m_ptr); }
+};
+
+class tuple : public object {
+public:
+    PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
+    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate tuple object!");
+    }
+    size_t size() const { return (size_t) PyTuple_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::tuple_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::tuple_iterator begin() const { return {*this, 0}; }
+    detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
+};
+
+class dict : public object {
+public:
+    PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
+    dict() : object(PyDict_New(), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate dict object!");
+    }
+    template <typename... Args,
+              typename = detail::enable_if_t<detail::all_of<detail::is_keyword_or_ds<Args>...>::value>,
+              // MSVC workaround: it can't compile an out-of-line definition, so defer the collector
+              typename collector = detail::deferred_t<detail::unpacking_collector<>, Args...>>
+    explicit dict(Args &&...args) : dict(collector(std::forward<Args>(args)...).kwargs()) { }
+
+    size_t size() const { return (size_t) PyDict_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::dict_iterator begin() const { return {*this, 0}; }
+    detail::dict_iterator end() const { return {}; }
+    void clear() const { PyDict_Clear(ptr()); }
+    template <typename T> bool contains(T &&key) const {
+        return PyDict_Contains(m_ptr, detail::object_or_cast(std::forward<T>(key)).ptr()) == 1;
+    }
+
+private:
+    /// Call the `dict` Python type -- always returns a new reference
+    static PyObject *raw_dict(PyObject *op) {
+        if (PyDict_Check(op))
+            return handle(op).inc_ref().ptr();
+        return PyObject_CallFunctionObjArgs((PyObject *) &PyDict_Type, op, nullptr);
+    }
+};
+
+class sequence : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check)
+    size_t size() const {
+        ssize_t result = PySequence_Size(m_ptr);
+        if (result == -1)
+            throw error_already_set();
+        return (size_t) result;
+    }
+    bool empty() const { return size() == 0; }
+    detail::sequence_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::sequence_iterator begin() const { return {*this, 0}; }
+    detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; }
+};
+
+class list : public object {
+public:
+    PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
+    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate list object!");
+    }
+    size_t size() const { return (size_t) PyList_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::list_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::list_iterator begin() const { return {*this, 0}; }
+    detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
+    template <typename T> void append(T &&val) const {
+        PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
+    }
+    template <typename T> void insert(size_t index, T &&val) const {
+        PyList_Insert(m_ptr, static_cast<ssize_t>(index),
+            detail::object_or_cast(std::forward<T>(val)).ptr());
+    }
+};
+
+class args : public tuple { PYBIND11_OBJECT_DEFAULT(args, tuple, PyTuple_Check) };
+class kwargs : public dict { PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check)  };
+
+class set : public object {
+public:
+    PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New)
+    set() : object(PySet_New(nullptr), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate set object!");
+    }
+    size_t size() const { return (size_t) PySet_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    template <typename T> bool add(T &&val) const {
+        return PySet_Add(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 0;
+    }
+    void clear() const { PySet_Clear(m_ptr); }
+    template <typename T> bool contains(T &&val) const {
+        return PySet_Contains(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 1;
+    }
+};
+
+class function : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check)
+    handle cpp_function() const {
+        handle fun = detail::get_function(m_ptr);
+        if (fun && PyCFunction_Check(fun.ptr()))
+            return fun;
+        return handle();
+    }
+    bool is_cpp_function() const { return (bool) cpp_function(); }
+};
+
+class staticmethod : public object {
+public:
+    PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New)
+};
+
+class buffer : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer)
+
+    buffer_info request(bool writable = false) const {
+        int flags = PyBUF_STRIDES | PyBUF_FORMAT;
+        if (writable) flags |= PyBUF_WRITABLE;
+        Py_buffer *view = new Py_buffer();
+        if (PyObject_GetBuffer(m_ptr, view, flags) != 0) {
+            delete view;
+            throw error_already_set();
+        }
+        return buffer_info(view);
+    }
+};
+
+class memoryview : public object {
+public:
+    PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check, PyMemoryView_FromObject)
+
+    /** \rst
+        Creates ``memoryview`` from ``buffer_info``.
+
+        ``buffer_info`` must be created from ``buffer::request()``. Otherwise
+        throws an exception.
+
+        For creating a ``memoryview`` from objects that support buffer protocol,
+        use ``memoryview(const object& obj)`` instead of this constructor.
+     \endrst */
+    explicit memoryview(const buffer_info& info) {
+        if (!info.view())
+            pybind11_fail("Prohibited to create memoryview without Py_buffer");
+        // Note: PyMemoryView_FromBuffer never increments obj reference.
+        m_ptr = (info.view()->obj) ?
+            PyMemoryView_FromObject(info.view()->obj) :
+            PyMemoryView_FromBuffer(info.view());
+        if (!m_ptr)
+            pybind11_fail("Unable to create memoryview from buffer descriptor");
+    }
+
+    /** \rst
+        Creates ``memoryview`` from static buffer.
+
+        This method is meant for providing a ``memoryview`` for C/C++ buffer not
+        managed by Python. The caller is responsible for managing the lifetime
+        of ``ptr`` and ``format``, which MUST outlive the memoryview constructed
+        here.
+
+        See also: Python C API documentation for `PyMemoryView_FromBuffer`_.
+
+        .. _PyMemoryView_FromBuffer: https://docs.python.org/c-api/memoryview.html#c.PyMemoryView_FromBuffer
+
+        :param ptr: Pointer to the buffer.
+        :param itemsize: Byte size of an element.
+        :param format: Pointer to the null-terminated format string. For
+            homogeneous Buffers, this should be set to
+            ``format_descriptor<T>::value``.
+        :param shape: Shape of the tensor (1 entry per dimension).
+        :param strides: Number of bytes between adjacent entries (for each
+            per dimension).
+        :param readonly: Flag to indicate if the underlying storage may be
+            written to.
+     \endrst */
+    static memoryview from_buffer(
+        void *ptr, ssize_t itemsize, const char *format,
+        detail::any_container<ssize_t> shape,
+        detail::any_container<ssize_t> strides, bool readonly = false);
+
+    static memoryview from_buffer(
+        const void *ptr, ssize_t itemsize, const char *format,
+        detail::any_container<ssize_t> shape,
+        detail::any_container<ssize_t> strides) {
+        return memoryview::from_buffer(
+            const_cast<void*>(ptr), itemsize, format, shape, strides, true);
+    }
+
+    template<typename T>
+    static memoryview from_buffer(
+        T *ptr, detail::any_container<ssize_t> shape,
+        detail::any_container<ssize_t> strides, bool readonly = false) {
+        return memoryview::from_buffer(
+            reinterpret_cast<void*>(ptr), sizeof(T),
+            format_descriptor<T>::value, shape, strides, readonly);
+    }
+
+    template<typename T>
+    static memoryview from_buffer(
+        const T *ptr, detail::any_container<ssize_t> shape,
+        detail::any_container<ssize_t> strides) {
+        return memoryview::from_buffer(
+            const_cast<T*>(ptr), shape, strides, true);
+    }
+
+#if PY_MAJOR_VERSION >= 3
+    /** \rst
+        Creates ``memoryview`` from static memory.
+
+        This method is meant for providing a ``memoryview`` for C/C++ buffer not
+        managed by Python. The caller is responsible for managing the lifetime
+        of ``mem``, which MUST outlive the memoryview constructed here.
+
+        This method is not available in Python 2.
+
+        See also: Python C API documentation for `PyMemoryView_FromBuffer`_.
+
+        .. _PyMemoryView_FromMemory: https://docs.python.org/c-api/memoryview.html#c.PyMemoryView_FromMemory
+     \endrst */
+    static memoryview from_memory(void *mem, ssize_t size, bool readonly = false) {
+        PyObject* ptr = PyMemoryView_FromMemory(
+            reinterpret_cast<char*>(mem), size,
+            (readonly) ? PyBUF_READ : PyBUF_WRITE);
+        if (!ptr)
+            pybind11_fail("Could not allocate memoryview object!");
+        return memoryview(object(ptr, stolen_t{}));
+    }
+
+    static memoryview from_memory(const void *mem, ssize_t size) {
+        return memoryview::from_memory(const_cast<void*>(mem), size, true);
+    }
+#endif
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+inline memoryview memoryview::from_buffer(
+    void *ptr, ssize_t itemsize, const char* format,
+    detail::any_container<ssize_t> shape,
+    detail::any_container<ssize_t> strides, bool readonly) {
+    size_t ndim = shape->size();
+    if (ndim != strides->size())
+        pybind11_fail("memoryview: shape length doesn't match strides length");
+    ssize_t size = ndim ? 1 : 0;
+    for (size_t i = 0; i < ndim; ++i)
+        size *= (*shape)[i];
+    Py_buffer view;
+    view.buf = ptr;
+    view.obj = nullptr;
+    view.len = size * itemsize;
+    view.readonly = static_cast<int>(readonly);
+    view.itemsize = itemsize;
+    view.format = const_cast<char*>(format);
+    view.ndim = static_cast<int>(ndim);
+    view.shape = shape->data();
+    view.strides = strides->data();
+    view.suboffsets = nullptr;
+    view.internal = nullptr;
+    PyObject* obj = PyMemoryView_FromBuffer(&view);
+    if (!obj)
+        throw error_already_set();
+    return memoryview(object(obj, stolen_t{}));
+}
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+/// @} pytypes
+
+/// \addtogroup python_builtins
+/// @{
+inline size_t len(handle h) {
+    ssize_t result = PyObject_Length(h.ptr());
+    if (result < 0)
+        pybind11_fail("Unable to compute length of object");
+    return (size_t) result;
+}
+
+inline size_t len_hint(handle h) {
+#if PY_VERSION_HEX >= 0x03040000
+    ssize_t result = PyObject_LengthHint(h.ptr(), 0);
+#else
+    ssize_t result = PyObject_Length(h.ptr());
+#endif
+    if (result < 0) {
+        // Sometimes a length can't be determined at all (eg generators)
+        // In which case simply return 0
+        PyErr_Clear();
+        return 0;
+    }
+    return (size_t) result;
+}
+
+inline str repr(handle h) {
+    PyObject *str_value = PyObject_Repr(h.ptr());
+    if (!str_value) throw error_already_set();
+#if PY_MAJOR_VERSION < 3
+    PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
+    Py_XDECREF(str_value); str_value = unicode;
+    if (!str_value) throw error_already_set();
+#endif
+    return reinterpret_steal<str>(str_value);
+}
+
+inline iterator iter(handle obj) {
+    PyObject *result = PyObject_GetIter(obj.ptr());
+    if (!result) { throw error_already_set(); }
+    return reinterpret_steal<iterator>(result);
+}
+/// @} python_builtins
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+template <typename D> iterator object_api<D>::begin() const { return iter(derived()); }
+template <typename D> iterator object_api<D>::end() const { return iterator::sentinel(); }
+template <typename D> item_accessor object_api<D>::operator[](handle key) const {
+    return {derived(), reinterpret_borrow<object>(key)};
+}
+template <typename D> item_accessor object_api<D>::operator[](const char *key) const {
+    return {derived(), pybind11::str(key)};
+}
+template <typename D> obj_attr_accessor object_api<D>::attr(handle key) const {
+    return {derived(), reinterpret_borrow<object>(key)};
+}
+template <typename D> str_attr_accessor object_api<D>::attr(const char *key) const {
+    return {derived(), key};
+}
+template <typename D> args_proxy object_api<D>::operator*() const {
+    return args_proxy(derived().ptr());
+}
+template <typename D> template <typename T> bool object_api<D>::contains(T &&item) const {
+    return attr("__contains__")(std::forward<T>(item)).template cast<bool>();
+}
+
+template <typename D>
+pybind11::str object_api<D>::str() const { return pybind11::str(derived()); }
+
+template <typename D>
+str_attr_accessor object_api<D>::doc() const { return attr("__doc__"); }
+
+template <typename D>
+handle object_api<D>::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); }
+
+template <typename D>
+bool object_api<D>::rich_compare(object_api const &other, int value) const {
+    int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value);
+    if (rv == -1)
+        throw error_already_set();
+    return rv == 1;
+}
+
+#define PYBIND11_MATH_OPERATOR_UNARY(op, fn)                                   \
+    template <typename D> object object_api<D>::op() const {                   \
+        object result = reinterpret_steal<object>(fn(derived().ptr()));        \
+        if (!result.ptr())                                                     \
+            throw error_already_set();                                         \
+        return result;                                                         \
+    }
+
+#define PYBIND11_MATH_OPERATOR_BINARY(op, fn)                                  \
+    template <typename D>                                                      \
+    object object_api<D>::op(object_api const &other) const {                  \
+        object result = reinterpret_steal<object>(                             \
+            fn(derived().ptr(), other.derived().ptr()));                       \
+        if (!result.ptr())                                                     \
+            throw error_already_set();                                         \
+        return result;                                                         \
+    }
+
+PYBIND11_MATH_OPERATOR_UNARY (operator~,   PyNumber_Invert)
+PYBIND11_MATH_OPERATOR_UNARY (operator-,   PyNumber_Negative)
+PYBIND11_MATH_OPERATOR_BINARY(operator+,   PyNumber_Add)
+PYBIND11_MATH_OPERATOR_BINARY(operator+=,  PyNumber_InPlaceAdd)
+PYBIND11_MATH_OPERATOR_BINARY(operator-,   PyNumber_Subtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator-=,  PyNumber_InPlaceSubtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator*,   PyNumber_Multiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator*=,  PyNumber_InPlaceMultiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator/,   PyNumber_TrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator/=,  PyNumber_InPlaceTrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator|,   PyNumber_Or)
+PYBIND11_MATH_OPERATOR_BINARY(operator|=,  PyNumber_InPlaceOr)
+PYBIND11_MATH_OPERATOR_BINARY(operator&,   PyNumber_And)
+PYBIND11_MATH_OPERATOR_BINARY(operator&=,  PyNumber_InPlaceAnd)
+PYBIND11_MATH_OPERATOR_BINARY(operator^,   PyNumber_Xor)
+PYBIND11_MATH_OPERATOR_BINARY(operator^=,  PyNumber_InPlaceXor)
+PYBIND11_MATH_OPERATOR_BINARY(operator<<,  PyNumber_Lshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator>>,  PyNumber_Rshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift)
+
+#undef PYBIND11_MATH_OPERATOR_UNARY
+#undef PYBIND11_MATH_OPERATOR_BINARY
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/stl.h b/pybind11/include/pybind11/stl.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c2bebda87f1c703888307c5b4bac277655b52d6
--- /dev/null
+++ b/pybind11/include/pybind11/stl.h
@@ -0,0 +1,388 @@
+/*
+    pybind11/stl.h: Transparent conversion for STL data types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <set>
+#include <unordered_set>
+#include <map>
+#include <unordered_map>
+#include <iostream>
+#include <list>
+#include <deque>
+#include <valarray>
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+#ifdef __has_include
+// std::optional (but including it in c++14 mode isn't allowed)
+#  if defined(PYBIND11_CPP17) && __has_include(<optional>)
+#    include <optional>
+#    define PYBIND11_HAS_OPTIONAL 1
+#  endif
+// std::experimental::optional (but not allowed in c++11 mode)
+#  if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
+                                 !__has_include(<optional>))
+#    include <experimental/optional>
+#    define PYBIND11_HAS_EXP_OPTIONAL 1
+#  endif
+// std::variant
+#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
+#    include <variant>
+#    define PYBIND11_HAS_VARIANT 1
+#  endif
+#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
+#  include <optional>
+#  include <variant>
+#  define PYBIND11_HAS_OPTIONAL 1
+#  define PYBIND11_HAS_VARIANT 1
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// Extracts an const lvalue reference or rvalue reference for U based on the type of T (e.g. for
+/// forwarding a container element).  Typically used indirect via forwarded_type(), below.
+template <typename T, typename U>
+using forwarded_type = conditional_t<
+    std::is_lvalue_reference<T>::value, remove_reference_t<U> &, remove_reference_t<U> &&>;
+
+/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or lvalue; typically
+/// used for forwarding a container's elements.
+template <typename T, typename U>
+forwarded_type<T, U> forward_like(U &&u) {
+    return std::forward<detail::forwarded_type<T, U>>(std::forward<U>(u));
+}
+
+template <typename Type, typename Key> struct set_caster {
+    using type = Type;
+    using key_conv = make_caster<Key>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<pybind11::set>(src))
+            return false;
+        auto s = reinterpret_borrow<pybind11::set>(src);
+        value.clear();
+        for (auto entry : s) {
+            key_conv conv;
+            if (!conv.load(entry, convert))
+                return false;
+            value.insert(cast_op<Key &&>(std::move(conv)));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        if (!std::is_lvalue_reference<T>::value)
+            policy = return_value_policy_override<Key>::policy(policy);
+        pybind11::set s;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(key_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_ || !s.add(value_))
+                return handle();
+        }
+        return s.release();
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
+};
+
+template <typename Type, typename Key, typename Value> struct map_caster {
+    using key_conv   = make_caster<Key>;
+    using value_conv = make_caster<Value>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<dict>(src))
+            return false;
+        auto d = reinterpret_borrow<dict>(src);
+        value.clear();
+        for (auto it : d) {
+            key_conv kconv;
+            value_conv vconv;
+            if (!kconv.load(it.first.ptr(), convert) ||
+                !vconv.load(it.second.ptr(), convert))
+                return false;
+            value.emplace(cast_op<Key &&>(std::move(kconv)), cast_op<Value &&>(std::move(vconv)));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        dict d;
+        return_value_policy policy_key = policy;
+        return_value_policy policy_value = policy;
+        if (!std::is_lvalue_reference<T>::value) {
+            policy_key = return_value_policy_override<Key>::policy(policy_key);
+            policy_value = return_value_policy_override<Value>::policy(policy_value);
+        }
+        for (auto &&kv : src) {
+            auto key = reinterpret_steal<object>(key_conv::cast(forward_like<T>(kv.first), policy_key, parent));
+            auto value = reinterpret_steal<object>(value_conv::cast(forward_like<T>(kv.second), policy_value, parent));
+            if (!key || !value)
+                return handle();
+            d[key] = value;
+        }
+        return d.release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]"));
+};
+
+template <typename Type, typename Value> struct list_caster {
+    using value_conv = make_caster<Value>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<sequence>(src) || isinstance<str>(src))
+            return false;
+        auto s = reinterpret_borrow<sequence>(src);
+        value.clear();
+        reserve_maybe(s, &value);
+        for (auto it : s) {
+            value_conv conv;
+            if (!conv.load(it, convert))
+                return false;
+            value.push_back(cast_op<Value &&>(std::move(conv)));
+        }
+        return true;
+    }
+
+private:
+    template <typename T = Type,
+              enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
+    void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); }
+    void reserve_maybe(sequence, void *) { }
+
+public:
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        if (!std::is_lvalue_reference<T>::value)
+            policy = return_value_policy_override<Value>::policy(policy);
+        list l(src.size());
+        size_t index = 0;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_)
+                return handle();
+            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+        }
+        return l.release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
+};
+
+template <typename Type, typename Alloc> struct type_caster<std::vector<Type, Alloc>>
+ : list_caster<std::vector<Type, Alloc>, Type> { };
+
+template <typename Type, typename Alloc> struct type_caster<std::deque<Type, Alloc>>
+ : list_caster<std::deque<Type, Alloc>, Type> { };
+
+template <typename Type, typename Alloc> struct type_caster<std::list<Type, Alloc>>
+ : list_caster<std::list<Type, Alloc>, Type> { };
+
+template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0> struct array_caster {
+    using value_conv = make_caster<Value>;
+
+private:
+    template <bool R = Resizable>
+    bool require_size(enable_if_t<R, size_t> size) {
+        if (value.size() != size)
+            value.resize(size);
+        return true;
+    }
+    template <bool R = Resizable>
+    bool require_size(enable_if_t<!R, size_t> size) {
+        return size == Size;
+    }
+
+public:
+    bool load(handle src, bool convert) {
+        if (!isinstance<sequence>(src))
+            return false;
+        auto l = reinterpret_borrow<sequence>(src);
+        if (!require_size(l.size()))
+            return false;
+        size_t ctr = 0;
+        for (auto it : l) {
+            value_conv conv;
+            if (!conv.load(it, convert))
+                return false;
+            value[ctr++] = cast_op<Value &&>(std::move(conv));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        list l(src.size());
+        size_t index = 0;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_)
+                return handle();
+            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+        }
+        return l.release();
+    }
+
+    PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _<Resizable>(_(""), _("[") + _<Size>() + _("]")) + _("]"));
+};
+
+template <typename Type, size_t Size> struct type_caster<std::array<Type, Size>>
+ : array_caster<std::array<Type, Size>, Type, false, Size> { };
+
+template <typename Type> struct type_caster<std::valarray<Type>>
+ : array_caster<std::valarray<Type>, Type, true> { };
+
+template <typename Key, typename Compare, typename Alloc> struct type_caster<std::set<Key, Compare, Alloc>>
+  : set_caster<std::set<Key, Compare, Alloc>, Key> { };
+
+template <typename Key, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_set<Key, Hash, Equal, Alloc>>
+  : set_caster<std::unordered_set<Key, Hash, Equal, Alloc>, Key> { };
+
+template <typename Key, typename Value, typename Compare, typename Alloc> struct type_caster<std::map<Key, Value, Compare, Alloc>>
+  : map_caster<std::map<Key, Value, Compare, Alloc>, Key, Value> { };
+
+template <typename Key, typename Value, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>>
+  : map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key, Value> { };
+
+// This type caster is intended to be used for std::optional and std::experimental::optional
+template<typename T> struct optional_caster {
+    using value_conv = make_caster<typename T::value_type>;
+
+    template <typename T_>
+    static handle cast(T_ &&src, return_value_policy policy, handle parent) {
+        if (!src)
+            return none().inc_ref();
+        if (!std::is_lvalue_reference<T>::value) {
+            policy = return_value_policy_override<T>::policy(policy);
+        }
+        return value_conv::cast(*std::forward<T_>(src), policy, parent);
+    }
+
+    bool load(handle src, bool convert) {
+        if (!src) {
+            return false;
+        } else if (src.is_none()) {
+            return true;  // default-constructed value is already empty
+        }
+        value_conv inner_caster;
+        if (!inner_caster.load(src, convert))
+            return false;
+
+        value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
+        return true;
+    }
+
+    PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
+};
+
+#if PYBIND11_HAS_OPTIONAL
+template<typename T> struct type_caster<std::optional<T>>
+    : public optional_caster<std::optional<T>> {};
+
+template<> struct type_caster<std::nullopt_t>
+    : public void_caster<std::nullopt_t> {};
+#endif
+
+#if PYBIND11_HAS_EXP_OPTIONAL
+template<typename T> struct type_caster<std::experimental::optional<T>>
+    : public optional_caster<std::experimental::optional<T>> {};
+
+template<> struct type_caster<std::experimental::nullopt_t>
+    : public void_caster<std::experimental::nullopt_t> {};
+#endif
+
+/// Visit a variant and cast any found type to Python
+struct variant_caster_visitor {
+    return_value_policy policy;
+    handle parent;
+
+    using result_type = handle; // required by boost::variant in C++11
+
+    template <typename T>
+    result_type operator()(T &&src) const {
+        return make_caster<T>::cast(std::forward<T>(src), policy, parent);
+    }
+};
+
+/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar
+/// `namespace::variant` types which provide a `namespace::visit()` function are handled here
+/// automatically using argument-dependent lookup. Users can provide specializations for other
+/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`.
+template <template<typename...> class Variant>
+struct visit_helper {
+    template <typename... Args>
+    static auto call(Args &&...args) -> decltype(visit(std::forward<Args>(args)...)) {
+        return visit(std::forward<Args>(args)...);
+    }
+};
+
+/// Generic variant caster
+template <typename Variant> struct variant_caster;
+
+template <template<typename...> class V, typename... Ts>
+struct variant_caster<V<Ts...>> {
+    static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative.");
+
+    template <typename U, typename... Us>
+    bool load_alternative(handle src, bool convert, type_list<U, Us...>) {
+        auto caster = make_caster<U>();
+        if (caster.load(src, convert)) {
+            value = cast_op<U>(caster);
+            return true;
+        }
+        return load_alternative(src, convert, type_list<Us...>{});
+    }
+
+    bool load_alternative(handle, bool, type_list<>) { return false; }
+
+    bool load(handle src, bool convert) {
+        // Do a first pass without conversions to improve constructor resolution.
+        // E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
+        // slot of the variant. Without two-pass loading `double` would be filled
+        // because it appears first and a conversion is possible.
+        if (convert && load_alternative(src, false, type_list<Ts...>{}))
+            return true;
+        return load_alternative(src, convert, type_list<Ts...>{});
+    }
+
+    template <typename Variant>
+    static handle cast(Variant &&src, return_value_policy policy, handle parent) {
+        return visit_helper<V>::call(variant_caster_visitor{policy, parent},
+                                     std::forward<Variant>(src));
+    }
+
+    using Type = V<Ts...>;
+    PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name...) + _("]"));
+};
+
+#if PYBIND11_HAS_VARIANT
+template <typename... Ts>
+struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> { };
+#endif
+
+PYBIND11_NAMESPACE_END(detail)
+
+inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
+    os << (std::string) str(obj);
+    return os;
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/pybind11/include/pybind11/stl_bind.h b/pybind11/include/pybind11/stl_bind.h
new file mode 100644
index 0000000000000000000000000000000000000000..47368f0280154db9ab5c64ac88a1a1fa655752e6
--- /dev/null
+++ b/pybind11/include/pybind11/stl_bind.h
@@ -0,0 +1,661 @@
+/*
+    pybind11/std_bind.h: Binding generators for STL data types
+
+    Copyright (c) 2016 Sergey Lyskov and Wenzel Jakob
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+#include "operators.h"
+
+#include <algorithm>
+#include <sstream>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/* SFINAE helper class used by 'is_comparable */
+template <typename T>  struct container_traits {
+    template <typename T2> static std::true_type test_comparable(decltype(std::declval<const T2 &>() == std::declval<const T2 &>())*);
+    template <typename T2> static std::false_type test_comparable(...);
+    template <typename T2> static std::true_type test_value(typename T2::value_type *);
+    template <typename T2> static std::false_type test_value(...);
+    template <typename T2> static std::true_type test_pair(typename T2::first_type *, typename T2::second_type *);
+    template <typename T2> static std::false_type test_pair(...);
+
+    static constexpr const bool is_comparable = std::is_same<std::true_type, decltype(test_comparable<T>(nullptr))>::value;
+    static constexpr const bool is_pair = std::is_same<std::true_type, decltype(test_pair<T>(nullptr, nullptr))>::value;
+    static constexpr const bool is_vector = std::is_same<std::true_type, decltype(test_value<T>(nullptr))>::value;
+    static constexpr const bool is_element = !is_pair && !is_vector;
+};
+
+/* Default: is_comparable -> std::false_type */
+template <typename T, typename SFINAE = void>
+struct is_comparable : std::false_type { };
+
+/* For non-map data structures, check whether operator== can be instantiated */
+template <typename T>
+struct is_comparable<
+    T, enable_if_t<container_traits<T>::is_element &&
+                   container_traits<T>::is_comparable>>
+    : std::true_type { };
+
+/* For a vector/map data structure, recursively check the value type (which is std::pair for maps) */
+template <typename T>
+struct is_comparable<T, enable_if_t<container_traits<T>::is_vector>> {
+    static constexpr const bool value =
+        is_comparable<typename T::value_type>::value;
+};
+
+/* For pairs, recursively check the two data types */
+template <typename T>
+struct is_comparable<T, enable_if_t<container_traits<T>::is_pair>> {
+    static constexpr const bool value =
+        is_comparable<typename T::first_type>::value &&
+        is_comparable<typename T::second_type>::value;
+};
+
+/* Fallback functions */
+template <typename, typename, typename... Args> void vector_if_copy_constructible(const Args &...) { }
+template <typename, typename, typename... Args> void vector_if_equal_operator(const Args &...) { }
+template <typename, typename, typename... Args> void vector_if_insertion_operator(const Args &...) { }
+template <typename, typename, typename... Args> void vector_modifiers(const Args &...) { }
+
+template<typename Vector, typename Class_>
+void vector_if_copy_constructible(enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
+    cl.def(init<const Vector &>(), "Copy constructor");
+}
+
+template<typename Vector, typename Class_>
+void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+
+    cl.def(self == self);
+    cl.def(self != self);
+
+    cl.def("count",
+        [](const Vector &v, const T &x) {
+            return std::count(v.begin(), v.end(), x);
+        },
+        arg("x"),
+        "Return the number of times ``x`` appears in the list"
+    );
+
+    cl.def("remove", [](Vector &v, const T &x) {
+            auto p = std::find(v.begin(), v.end(), x);
+            if (p != v.end())
+                v.erase(p);
+            else
+                throw value_error();
+        },
+        arg("x"),
+        "Remove the first item from the list whose value is x. "
+        "It is an error if there is no such item."
+    );
+
+    cl.def("__contains__",
+        [](const Vector &v, const T &x) {
+            return std::find(v.begin(), v.end(), x) != v.end();
+        },
+        arg("x"),
+        "Return true the container contains ``x``"
+    );
+}
+
+// Vector modifiers -- requires a copyable vector_type:
+// (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems
+// silly to allow deletion but not insertion, so include them here too.)
+template <typename Vector, typename Class_>
+void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+
+    auto wrap_i = [](DiffType i, SizeType n) {
+        if (i < 0)
+            i += n;
+        if (i < 0 || (SizeType)i >= n)
+            throw index_error();
+        return i;
+    };
+
+    cl.def("append",
+           [](Vector &v, const T &value) { v.push_back(value); },
+           arg("x"),
+           "Add an item to the end of the list");
+
+    cl.def(init([](iterable it) {
+        auto v = std::unique_ptr<Vector>(new Vector());
+        v->reserve(len_hint(it));
+        for (handle h : it)
+           v->push_back(h.cast<T>());
+        return v.release();
+    }));
+
+    cl.def("clear",
+        [](Vector &v) {
+            v.clear();
+        },
+        "Clear the contents"
+    );
+
+    cl.def("extend",
+       [](Vector &v, const Vector &src) {
+           v.insert(v.end(), src.begin(), src.end());
+       },
+       arg("L"),
+       "Extend the list by appending all the items in the given list"
+    );
+
+    cl.def("extend",
+       [](Vector &v, iterable it) {
+           const size_t old_size = v.size();
+           v.reserve(old_size + len_hint(it));
+           try {
+               for (handle h : it) {
+                   v.push_back(h.cast<T>());
+               }
+           } catch (const cast_error &) {
+               v.erase(v.begin() + static_cast<typename Vector::difference_type>(old_size), v.end());
+               try {
+                   v.shrink_to_fit();
+               } catch (const std::exception &) {
+                   // Do nothing
+               }
+               throw;
+           }
+       },
+       arg("L"),
+       "Extend the list by appending all the items in the given list"
+    );
+
+    cl.def("insert",
+        [](Vector &v, DiffType i, const T &x) {
+            // Can't use wrap_i; i == v.size() is OK
+            if (i < 0)
+                i += v.size();
+            if (i < 0 || (SizeType)i > v.size())
+                throw index_error();
+            v.insert(v.begin() + i, x);
+        },
+        arg("i") , arg("x"),
+        "Insert an item at a given position."
+    );
+
+    cl.def("pop",
+        [](Vector &v) {
+            if (v.empty())
+                throw index_error();
+            T t = v.back();
+            v.pop_back();
+            return t;
+        },
+        "Remove and return the last item"
+    );
+
+    cl.def("pop",
+        [wrap_i](Vector &v, DiffType i) {
+            i = wrap_i(i, v.size());
+            T t = v[(SizeType) i];
+            v.erase(v.begin() + i);
+            return t;
+        },
+        arg("i"),
+        "Remove and return the item at index ``i``"
+    );
+
+    cl.def("__setitem__",
+        [wrap_i](Vector &v, DiffType i, const T &t) {
+            i = wrap_i(i, v.size());
+            v[(SizeType)i] = t;
+        }
+    );
+
+    /// Slicing protocol
+    cl.def("__getitem__",
+        [](const Vector &v, slice slice) -> Vector * {
+            size_t start, stop, step, slicelength;
+
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            Vector *seq = new Vector();
+            seq->reserve((size_t) slicelength);
+
+            for (size_t i=0; i<slicelength; ++i) {
+                seq->push_back(v[start]);
+                start += step;
+            }
+            return seq;
+        },
+        arg("s"),
+        "Retrieve list elements using a slice object"
+    );
+
+    cl.def("__setitem__",
+        [](Vector &v, slice slice,  const Vector &value) {
+            size_t start, stop, step, slicelength;
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            if (slicelength != value.size())
+                throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
+
+            for (size_t i=0; i<slicelength; ++i) {
+                v[start] = value[i];
+                start += step;
+            }
+        },
+        "Assign list elements using a slice object"
+    );
+
+    cl.def("__delitem__",
+        [wrap_i](Vector &v, DiffType i) {
+            i = wrap_i(i, v.size());
+            v.erase(v.begin() + i);
+        },
+        "Delete the list elements at index ``i``"
+    );
+
+    cl.def("__delitem__",
+        [](Vector &v, slice slice) {
+            size_t start, stop, step, slicelength;
+
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            if (step == 1 && false) {
+                v.erase(v.begin() + (DiffType) start, v.begin() + DiffType(start + slicelength));
+            } else {
+                for (size_t i = 0; i < slicelength; ++i) {
+                    v.erase(v.begin() + DiffType(start));
+                    start += step - 1;
+                }
+            }
+        },
+        "Delete list elements using a slice object"
+    );
+
+}
+
+// If the type has an operator[] that doesn't return a reference (most notably std::vector<bool>),
+// we have to access by copying; otherwise we return by reference.
+template <typename Vector> using vector_needs_copy = negation<
+    std::is_same<decltype(std::declval<Vector>()[typename Vector::size_type()]), typename Vector::value_type &>>;
+
+// The usual case: access and iterate by reference
+template <typename Vector, typename Class_>
+void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+    using ItType   = typename Vector::iterator;
+
+    auto wrap_i = [](DiffType i, SizeType n) {
+        if (i < 0)
+            i += n;
+        if (i < 0 || (SizeType)i >= n)
+            throw index_error();
+        return i;
+    };
+
+    cl.def("__getitem__",
+        [wrap_i](Vector &v, DiffType i) -> T & {
+            i = wrap_i(i, v.size());
+            return v[(SizeType)i];
+        },
+        return_value_policy::reference_internal // ref + keepalive
+    );
+
+    cl.def("__iter__",
+           [](Vector &v) {
+               return make_iterator<
+                   return_value_policy::reference_internal, ItType, ItType, T&>(
+                   v.begin(), v.end());
+           },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+}
+
+// The case for special objects, like std::vector<bool>, that have to be returned-by-copy:
+template <typename Vector, typename Class_>
+void vector_accessor(enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+    using ItType   = typename Vector::iterator;
+    cl.def("__getitem__",
+        [](const Vector &v, DiffType i) -> T {
+            if (i < 0 && (i += v.size()) < 0)
+                throw index_error();
+            if ((SizeType)i >= v.size())
+                throw index_error();
+            return v[(SizeType)i];
+        }
+    );
+
+    cl.def("__iter__",
+           [](Vector &v) {
+               return make_iterator<
+                   return_value_policy::copy, ItType, ItType, T>(
+                   v.begin(), v.end());
+           },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+}
+
+template <typename Vector, typename Class_> auto vector_if_insertion_operator(Class_ &cl, std::string const &name)
+    -> decltype(std::declval<std::ostream&>() << std::declval<typename Vector::value_type>(), void()) {
+    using size_type = typename Vector::size_type;
+
+    cl.def("__repr__",
+           [name](Vector &v) {
+            std::ostringstream s;
+            s << name << '[';
+            for (size_type i=0; i < v.size(); ++i) {
+                s << v[i];
+                if (i != v.size() - 1)
+                    s << ", ";
+            }
+            s << ']';
+            return s.str();
+        },
+        "Return the canonical string representation of this list."
+    );
+}
+
+// Provide the buffer interface for vectors if we have data() and we have a format for it
+// GCC seems to have "void std::vector<bool>::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer
+template <typename Vector, typename = void>
+struct vector_has_data_and_format : std::false_type {};
+template <typename Vector>
+struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
+
+// Add the buffer interface to a vector
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
+vector_buffer(Class_& cl) {
+    using T = typename Vector::value_type;
+
+    static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
+
+    // numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here
+    format_descriptor<T>::format();
+
+    cl.def_buffer([](Vector& v) -> buffer_info {
+        return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
+    });
+
+    cl.def(init([](buffer buf) {
+        auto info = buf.request();
+        if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
+            throw type_error("Only valid 1D buffers can be copied to a vector");
+        if (!detail::compare_buffer_info<T>::compare(info) || (ssize_t) sizeof(T) != info.itemsize)
+            throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor<T>::format() + ")");
+
+        T *p = static_cast<T*>(info.ptr);
+        ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
+        T *end = p + info.shape[0] * step;
+        if (step == 1) {
+            return Vector(p, end);
+        }
+        else {
+            Vector vec;
+            vec.reserve((size_t) info.shape[0]);
+            for (; p != end; p += step)
+                vec.push_back(*p);
+            return vec;
+        }
+    }));
+
+    return;
+}
+
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
+
+PYBIND11_NAMESPACE_END(detail)
+
+//
+// std::vector
+//
+template <typename Vector, typename holder_type = std::unique_ptr<Vector>, typename... Args>
+class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, Args&&... args) {
+    using Class_ = class_<Vector, holder_type>;
+
+    // If the value_type is unregistered (e.g. a converting type) or is itself registered
+    // module-local then make the vector binding module-local as well:
+    using vtype = typename Vector::value_type;
+    auto vtype_info = detail::get_type_info(typeid(vtype));
+    bool local = !vtype_info || vtype_info->module_local;
+
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+
+    // Declare the buffer interface if a buffer_protocol() is passed in
+    detail::vector_buffer<Vector, Class_, Args...>(cl);
+
+    cl.def(init<>());
+
+    // Register copy constructor (if possible)
+    detail::vector_if_copy_constructible<Vector, Class_>(cl);
+
+    // Register comparison-related operators and functions (if possible)
+    detail::vector_if_equal_operator<Vector, Class_>(cl);
+
+    // Register stream insertion operator (if possible)
+    detail::vector_if_insertion_operator<Vector, Class_>(cl, name);
+
+    // Modifiers require copyable vector value type
+    detail::vector_modifiers<Vector, Class_>(cl);
+
+    // Accessor and iterator; return by value if copyable, otherwise we return by ref + keep-alive
+    detail::vector_accessor<Vector, Class_>(cl);
+
+    cl.def("__bool__",
+        [](const Vector &v) -> bool {
+            return !v.empty();
+        },
+        "Check whether the list is nonempty"
+    );
+
+    cl.def("__len__", &Vector::size);
+
+
+
+
+#if 0
+    // C++ style functions deprecated, leaving it here as an example
+    cl.def(init<size_type>());
+
+    cl.def("resize",
+         (void (Vector::*) (size_type count)) & Vector::resize,
+         "changes the number of elements stored");
+
+    cl.def("erase",
+        [](Vector &v, SizeType i) {
+        if (i >= v.size())
+            throw index_error();
+        v.erase(v.begin() + i);
+    }, "erases element at index ``i``");
+
+    cl.def("empty",         &Vector::empty,         "checks whether the container is empty");
+    cl.def("size",          &Vector::size,          "returns the number of elements");
+    cl.def("push_back", (void (Vector::*)(const T&)) &Vector::push_back, "adds an element to the end");
+    cl.def("pop_back",                               &Vector::pop_back, "removes the last element");
+
+    cl.def("max_size",      &Vector::max_size,      "returns the maximum possible number of elements");
+    cl.def("reserve",       &Vector::reserve,       "reserves storage");
+    cl.def("capacity",      &Vector::capacity,      "returns the number of elements that can be held in currently allocated storage");
+    cl.def("shrink_to_fit", &Vector::shrink_to_fit, "reduces memory usage by freeing unused memory");
+
+    cl.def("clear", &Vector::clear, "clears the contents");
+    cl.def("swap",   &Vector::swap, "swaps the contents");
+
+    cl.def("front", [](Vector &v) {
+        if (v.size()) return v.front();
+        else throw index_error();
+    }, "access the first element");
+
+    cl.def("back", [](Vector &v) {
+        if (v.size()) return v.back();
+        else throw index_error();
+    }, "access the last element ");
+
+#endif
+
+    return cl;
+}
+
+
+
+//
+// std::map, std::unordered_map
+//
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/* Fallback functions */
+template <typename, typename, typename... Args> void map_if_insertion_operator(const Args &...) { }
+template <typename, typename, typename... Args> void map_assignment(const Args &...) { }
+
+// Map assignment when copy-assignable: just copy the value
+template <typename Map, typename Class_>
+void map_assignment(enable_if_t<is_copy_assignable<typename Map::mapped_type>::value, Class_> &cl) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+
+    cl.def("__setitem__",
+           [](Map &m, const KeyType &k, const MappedType &v) {
+               auto it = m.find(k);
+               if (it != m.end()) it->second = v;
+               else m.emplace(k, v);
+           }
+    );
+}
+
+// Not copy-assignable, but still copy-constructible: we can update the value by erasing and reinserting
+template<typename Map, typename Class_>
+void map_assignment(enable_if_t<
+        !is_copy_assignable<typename Map::mapped_type>::value &&
+        is_copy_constructible<typename Map::mapped_type>::value,
+        Class_> &cl) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+
+    cl.def("__setitem__",
+           [](Map &m, const KeyType &k, const MappedType &v) {
+               // We can't use m[k] = v; because value type might not be default constructable
+               auto r = m.emplace(k, v);
+               if (!r.second) {
+                   // value type is not copy assignable so the only way to insert it is to erase it first...
+                   m.erase(r.first);
+                   m.emplace(k, v);
+               }
+           }
+    );
+}
+
+
+template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &cl, std::string const &name)
+-> decltype(std::declval<std::ostream&>() << std::declval<typename Map::key_type>() << std::declval<typename Map::mapped_type>(), void()) {
+
+    cl.def("__repr__",
+           [name](Map &m) {
+            std::ostringstream s;
+            s << name << '{';
+            bool f = false;
+            for (auto const &kv : m) {
+                if (f)
+                    s << ", ";
+                s << kv.first << ": " << kv.second;
+                f = true;
+            }
+            s << '}';
+            return s.str();
+        },
+        "Return the canonical string representation of this map."
+    );
+}
+
+
+PYBIND11_NAMESPACE_END(detail)
+
+template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
+class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+    using Class_ = class_<Map, holder_type>;
+
+    // If either type is a non-module-local bound type then make the map binding non-local as well;
+    // otherwise (e.g. both types are either module-local or converting) the map will be
+    // module-local.
+    auto tinfo = detail::get_type_info(typeid(MappedType));
+    bool local = !tinfo || tinfo->module_local;
+    if (local) {
+        tinfo = detail::get_type_info(typeid(KeyType));
+        local = !tinfo || tinfo->module_local;
+    }
+
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+
+    cl.def(init<>());
+
+    // Register stream insertion operator (if possible)
+    detail::map_if_insertion_operator<Map, Class_>(cl, name);
+
+    cl.def("__bool__",
+        [](const Map &m) -> bool { return !m.empty(); },
+        "Check whether the map is nonempty"
+    );
+
+    cl.def("__iter__",
+           [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+
+    cl.def("items",
+           [](Map &m) { return make_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+
+    cl.def("__getitem__",
+        [](Map &m, const KeyType &k) -> MappedType & {
+            auto it = m.find(k);
+            if (it == m.end())
+              throw key_error();
+           return it->second;
+        },
+        return_value_policy::reference_internal // ref + keepalive
+    );
+
+    cl.def("__contains__",
+        [](Map &m, const KeyType &k) -> bool {
+            auto it = m.find(k);
+            if (it == m.end())
+              return false;
+           return true;
+        }
+    );
+
+    // Assignment provided only if the type is copyable
+    detail::map_assignment<Map, Class_>(cl);
+
+    cl.def("__delitem__",
+           [](Map &m, const KeyType &k) {
+               auto it = m.find(k);
+               if (it == m.end())
+                   throw key_error();
+               m.erase(it);
+           }
+    );
+
+    cl.def("__len__", &Map::size);
+
+    return cl;
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/pybind11/__init__.py b/pybind11/pybind11/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b2f83d5cd93c073ad130cc113bab25a1d03255b
--- /dev/null
+++ b/pybind11/pybind11/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from ._version import version_info, __version__  # noqa: F401 imported but unused
+
+
+def get_include(user=False):
+    import os
+    d = os.path.dirname(__file__)
+    if os.path.exists(os.path.join(d, "include")):
+        # Package is installed
+        return os.path.join(d, "include")
+    else:
+        # Package is from a source directory
+        return os.path.join(os.path.dirname(d), "include")
diff --git a/pybind11/pybind11/__main__.py b/pybind11/pybind11/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e393cc8f103dc42531d2967d5c05a1edcf2cfa1
--- /dev/null
+++ b/pybind11/pybind11/__main__.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import argparse
+import sys
+import sysconfig
+
+from . import get_include
+
+
+def print_includes():
+    dirs = [sysconfig.get_path('include'),
+            sysconfig.get_path('platinclude'),
+            get_include()]
+
+    # Make unique but preserve order
+    unique_dirs = []
+    for d in dirs:
+        if d not in unique_dirs:
+            unique_dirs.append(d)
+
+    print(' '.join('-I' + d for d in unique_dirs))
+
+
+def main():
+    parser = argparse.ArgumentParser(prog='python -m pybind11')
+    parser.add_argument('--includes', action='store_true',
+                        help='Include flags for both pybind11 and Python headers.')
+    args = parser.parse_args()
+    if not sys.argv[1:]:
+        parser.print_help()
+    if args.includes:
+        print_includes()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pybind11/pybind11/_version.py b/pybind11/pybind11/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f254ce5e262fa7fb4770e1b770935ea46ecc0
--- /dev/null
+++ b/pybind11/pybind11/_version.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+version_info = (2, 5, 'dev1')
+__version__ = '.'.join(map(str, version_info))
diff --git a/pybind11/setup.cfg b/pybind11/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..002f38d10e46472657ff8228139e0f92b0d5bc10
--- /dev/null
+++ b/pybind11/setup.cfg
@@ -0,0 +1,12 @@
+[bdist_wheel]
+universal=1
+
+[flake8]
+max-line-length = 99
+show_source = True
+exclude = .git, __pycache__, build, dist, docs, tools, venv
+ignore =
+    # required for pretty matrix formatting: multiple spaces after `,` and `[`
+    E201, E241, W504,
+    # camelcase 'cPickle' imported as lowercase 'pickle'
+    N813
diff --git a/pybind11/setup.py b/pybind11/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..577a6b6c37c9d284b0d5b7453de62aaa71c50869
--- /dev/null
+++ b/pybind11/setup.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Setup script for PyPI; use CMakeFile.txt to build extension modules
+
+from setuptools import setup
+from distutils.command.install_headers import install_headers
+from distutils.command.build_py import build_py
+from pybind11 import __version__
+import os
+
+package_data = [
+    'include/pybind11/detail/class.h',
+    'include/pybind11/detail/common.h',
+    'include/pybind11/detail/descr.h',
+    'include/pybind11/detail/init.h',
+    'include/pybind11/detail/internals.h',
+    'include/pybind11/detail/typeid.h',
+    'include/pybind11/attr.h',
+    'include/pybind11/buffer_info.h',
+    'include/pybind11/cast.h',
+    'include/pybind11/chrono.h',
+    'include/pybind11/common.h',
+    'include/pybind11/complex.h',
+    'include/pybind11/eigen.h',
+    'include/pybind11/embed.h',
+    'include/pybind11/eval.h',
+    'include/pybind11/functional.h',
+    'include/pybind11/iostream.h',
+    'include/pybind11/numpy.h',
+    'include/pybind11/operators.h',
+    'include/pybind11/options.h',
+    'include/pybind11/pybind11.h',
+    'include/pybind11/pytypes.h',
+    'include/pybind11/stl.h',
+    'include/pybind11/stl_bind.h',
+]
+
+# Prevent installation of pybind11 headers by setting
+# PYBIND11_USE_CMAKE.
+if os.environ.get('PYBIND11_USE_CMAKE'):
+    headers = []
+else:
+    headers = package_data
+
+
+class InstallHeaders(install_headers):
+    """Use custom header installer because the default one flattens subdirectories"""
+    def run(self):
+        if not self.distribution.headers:
+            return
+
+        for header in self.distribution.headers:
+            subdir = os.path.dirname(os.path.relpath(header, 'include/pybind11'))
+            install_dir = os.path.join(self.install_dir, subdir)
+            self.mkpath(install_dir)
+
+            (out, _) = self.copy_file(header, install_dir)
+            self.outfiles.append(out)
+
+
+# Install the headers inside the package as well
+class BuildPy(build_py):
+    def build_package_data(self):
+        build_py.build_package_data(self)
+        for header in package_data:
+            target = os.path.join(self.build_lib, 'pybind11', header)
+            self.mkpath(os.path.dirname(target))
+            self.copy_file(header, target, preserve_mode=False)
+
+    def get_outputs(self, include_bytecode=1):
+        outputs = build_py.get_outputs(self, include_bytecode=include_bytecode)
+        for header in package_data:
+            target = os.path.join(self.build_lib, 'pybind11', header)
+            outputs.append(target)
+        return outputs
+
+
+setup(
+    name='pybind11',
+    version=__version__,
+    description='Seamless operability between C++11 and Python',
+    author='Wenzel Jakob',
+    author_email='wenzel.jakob@epfl.ch',
+    url='https://github.com/pybind/pybind11',
+    download_url='https://github.com/pybind/pybind11/tarball/v' + __version__,
+    packages=['pybind11'],
+    license='BSD',
+    headers=headers,
+    zip_safe=False,
+    cmdclass=dict(install_headers=InstallHeaders, build_py=BuildPy),
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Developers',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Utilities',
+        'Programming Language :: C++',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'License :: OSI Approved :: BSD License'
+    ],
+    keywords='C++11, Python bindings',
+    long_description="""pybind11 is a lightweight header-only library that
+exposes C++ types in Python and vice versa, mainly to create Python bindings of
+existing C++ code. Its goals and syntax are similar to the excellent
+Boost.Python by David Abrahams: to minimize boilerplate code in traditional
+extension modules by inferring type information using compile-time
+introspection.
+
+The main issue with Boost.Python-and the reason for creating such a similar
+project-is Boost. Boost is an enormously large and complex suite of utility
+libraries that works with almost every C++ compiler in existence. This
+compatibility has its cost: arcane template tricks and workarounds are
+necessary to support the oldest and buggiest of compiler specimens. Now that
+C++11-compatible compilers are widely available, this heavy machinery has
+become an excessively large and unnecessary dependency.
+
+Think of this library as a tiny self-contained version of Boost.Python with
+everything stripped away that isn't relevant for binding generation. Without
+comments, the core header files only require ~4K lines of code and depend on
+Python (2.7 or 3.x, or PyPy2.7 >= 5.7) and the C++ standard library. This
+compact implementation was possible thanks to some of the new C++11 language
+features (specifically: tuples, lambda functions and variadic templates). Since
+its creation, this library has grown beyond Boost.Python in many ways, leading
+to dramatically simpler binding code in many common situations.""")
diff --git a/pybind11/tests/CMakeLists.txt b/pybind11/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..72de21018a85ff3d5e443628252255334073b914
--- /dev/null
+++ b/pybind11/tests/CMakeLists.txt
@@ -0,0 +1,361 @@
+# CMakeLists.txt -- Build system for the pybind11 test suite
+#
+# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+# New Python support
+if(DEFINED Python_EXECUTABLE)
+  set(PYTHON_EXECUTABLE "${Python_EXECUTABLE}")
+  set(PYTHON_VERSION "${Python_VERSION}")
+endif()
+
+# There's no harm in including a project in a project
+project(pybind11_tests CXX)
+
+# Access FindCatch and more
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../tools")
+
+option(PYBIND11_WERROR "Report all warnings as errors" OFF)
+option(DOWNLOAD_EIGEN "Download EIGEN (requires CMake 3.11+)" OFF)
+set(PYBIND11_TEST_OVERRIDE
+    ""
+    CACHE STRING "Tests from ;-separated list of *.cpp files will be built instead of all tests")
+
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  # We're being loaded directly, i.e. not via add_subdirectory, so make this
+  # work as its own project and load the pybind11Config to get the tools we need
+  find_package(pybind11 REQUIRED CONFIG)
+endif()
+
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting tests build type to MinSizeRel as none was specified")
+  set(CMAKE_BUILD_TYPE
+      MinSizeRel
+      CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel"
+                                               "RelWithDebInfo")
+endif()
+
+# Full set of test files (you can override these; see below)
+set(PYBIND11_TEST_FILES
+    test_async.cpp
+    test_buffers.cpp
+    test_builtin_casters.cpp
+    test_call_policies.cpp
+    test_callbacks.cpp
+    test_chrono.cpp
+    test_class.cpp
+    test_constants_and_functions.cpp
+    test_copy_move.cpp
+    test_custom_type_casters.cpp
+    test_docstring_options.cpp
+    test_eigen.cpp
+    test_enum.cpp
+    test_eval.cpp
+    test_exceptions.cpp
+    test_factory_constructors.cpp
+    test_gil_scoped.cpp
+    test_iostream.cpp
+    test_kwargs_and_defaults.cpp
+    test_local_bindings.cpp
+    test_methods_and_attributes.cpp
+    test_modules.cpp
+    test_multiple_inheritance.cpp
+    test_numpy_array.cpp
+    test_numpy_dtypes.cpp
+    test_numpy_vectorize.cpp
+    test_opaque_types.cpp
+    test_operator_overloading.cpp
+    test_pickling.cpp
+    test_pytypes.cpp
+    test_sequences_and_iterators.cpp
+    test_smart_ptr.cpp
+    test_stl.cpp
+    test_stl_binders.cpp
+    test_tagbased_polymorphic.cpp
+    test_union.cpp
+    test_virtual_functions.cpp)
+
+# Invoking cmake with something like:
+#     cmake -DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp" ..
+# lets you override the tests that get compiled and run.  You can restore to all tests with:
+#     cmake -DPYBIND11_TEST_OVERRIDE= ..
+if(PYBIND11_TEST_OVERRIDE)
+  set(PYBIND11_TEST_FILES ${PYBIND11_TEST_OVERRIDE})
+endif()
+
+# Skip test_async for Python < 3.5
+list(FIND PYBIND11_TEST_FILES test_async.cpp PYBIND11_TEST_FILES_ASYNC_I)
+if((PYBIND11_TEST_FILES_ASYNC_I GREATER -1) AND (PYTHON_VERSION VERSION_LESS 3.5))
+  message(STATUS "Skipping test_async because Python version ${PYTHON_VERSION} < 3.5")
+  list(REMOVE_AT PYBIND11_TEST_FILES ${PYBIND11_TEST_FILES_ASYNC_I})
+endif()
+
+string(REPLACE ".cpp" ".py" PYBIND11_PYTEST_FILES "${PYBIND11_TEST_FILES}")
+
+# Contains the set of test files that require pybind11_cross_module_tests to be
+# built; if none of these are built (i.e. because TEST_OVERRIDE is used and
+# doesn't include them) the second module doesn't get built.
+set(PYBIND11_CROSS_MODULE_TESTS test_exceptions.py test_local_bindings.py test_stl.py
+                                test_stl_binders.py)
+
+set(PYBIND11_CROSS_MODULE_GIL_TESTS test_gil_scoped.py)
+
+# Check if Eigen is available; if not, remove from PYBIND11_TEST_FILES (but
+# keep it in PYBIND11_PYTEST_FILES, so that we get the "eigen is not installed"
+# skip message).
+list(FIND PYBIND11_TEST_FILES test_eigen.cpp PYBIND11_TEST_FILES_EIGEN_I)
+if(PYBIND11_TEST_FILES_EIGEN_I GREATER -1)
+  # Try loading via newer Eigen's Eigen3Config first (bypassing tools/FindEigen3.cmake).
+  # Eigen 3.3.1+ exports a cmake 3.0+ target for handling dependency requirements, but also
+  # produces a fatal error if loaded from a pre-3.0 cmake.
+  if(DOWNLOAD_EIGEN)
+    if(CMAKE_VERSION VERSION_LESS 3.11)
+      message(FATAL_ERROR "CMake 3.11+ required when using DOWNLOAD_EIGEN")
+    endif()
+
+    set(EIGEN3_VERSION_STRING "3.3.7")
+
+    include(FetchContent)
+    FetchContent_Declare(
+      eigen
+      GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
+      GIT_TAG ${EIGEN3_VERSION_STRING})
+
+    FetchContent_GetProperties(eigen)
+    if(NOT eigen_POPULATED)
+      message(STATUS "Downloading Eigen")
+      FetchContent_Populate(eigen)
+    endif()
+
+    set(EIGEN3_INCLUDE_DIR ${eigen_SOURCE_DIR})
+    set(EIGEN3_FOUND TRUE)
+
+  else()
+    find_package(Eigen3 3.2.7 QUIET CONFIG)
+
+    if(NOT EIGEN3_FOUND)
+      # Couldn't load via target, so fall back to allowing module mode finding, which will pick up
+      # tools/FindEigen3.cmake
+      find_package(Eigen3 3.2.7 QUIET)
+    endif()
+  endif()
+
+  if(EIGEN3_FOUND)
+    if(NOT TARGET Eigen3::Eigen)
+      add_library(Eigen3::Eigen IMPORTED INTERFACE)
+      set_property(TARGET Eigen3::Eigen PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                                 "${EIGEN3_INCLUDE_DIR}")
+    endif()
+
+    # Eigen 3.3.1+ cmake sets EIGEN3_VERSION_STRING (and hard codes the version when installed
+    # rather than looking it up in the cmake script); older versions, and the
+    # tools/FindEigen3.cmake, set EIGEN3_VERSION instead.
+    if(NOT EIGEN3_VERSION AND EIGEN3_VERSION_STRING)
+      set(EIGEN3_VERSION ${EIGEN3_VERSION_STRING})
+    endif()
+    message(STATUS "Building tests with Eigen v${EIGEN3_VERSION}")
+  else()
+    list(REMOVE_AT PYBIND11_TEST_FILES ${PYBIND11_TEST_FILES_EIGEN_I})
+    message(STATUS "Building tests WITHOUT Eigen, use -DDOWNLOAD_EIGEN on CMake 3.11+ to download")
+  endif()
+endif()
+
+# Optional dependency for some tests (boost::variant is only supported with version >= 1.56)
+find_package(Boost 1.56)
+
+if(Boost_FOUND)
+  if(NOT TARGET Boost::headers)
+    if(TARGET Boost::boost)
+      # Classic FindBoost
+      add_library(Boost::headers ALIAS Boost::boost)
+    else()
+      # Very old FindBoost, or newer Boost than CMake in older CMakes
+      add_library(Boost::headers IMPORTED INTERFACE)
+      set_property(TARGET Boost::headers PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                                  ${Boost_INCLUDE_DIRS})
+    endif()
+  endif()
+endif()
+
+# Compile with compiler warnings turned on
+function(pybind11_enable_warnings target_name)
+  if(MSVC)
+    target_compile_options(${target_name} PRIVATE /W4)
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)")
+    target_compile_options(${target_name} PRIVATE -Wall -Wextra -Wconversion -Wcast-qual
+                                                  -Wdeprecated)
+  endif()
+
+  if(PYBIND11_WERROR)
+    if(MSVC)
+      target_compile_options(${target_name} PRIVATE /WX)
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)")
+      target_compile_options(${target_name} PRIVATE -Werror)
+    endif()
+  endif()
+
+  # Needs to be readded since the ordering requires these to be after the ones above
+  if(CMAKE_CXX_STANDARD
+     AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"
+     AND PYTHON_VERSION VERSION_LESS 3.0)
+    if(CMAKE_CXX_STANDARD LESS 17)
+      target_compile_options(${target_name} PUBLIC -Wno-deprecated-register)
+    else()
+      target_compile_options(${target_name} PUBLIC -Wno-register)
+    endif()
+  endif()
+endfunction()
+
+set(test_targets pybind11_tests)
+
+# Build pybind11_cross_module_tests if any test_whatever.py are being built that require it
+foreach(t ${PYBIND11_CROSS_MODULE_TESTS})
+  list(FIND PYBIND11_PYTEST_FILES ${t} i)
+  if(i GREATER -1)
+    list(APPEND test_targets pybind11_cross_module_tests)
+    break()
+  endif()
+endforeach()
+
+foreach(t ${PYBIND11_CROSS_MODULE_GIL_TESTS})
+  list(FIND PYBIND11_PYTEST_FILES ${t} i)
+  if(i GREATER -1)
+    list(APPEND test_targets cross_module_gil_utils)
+    break()
+  endif()
+endforeach()
+
+foreach(target ${test_targets})
+  set(test_files ${PYBIND11_TEST_FILES})
+  if(NOT "${target}" STREQUAL "pybind11_tests")
+    set(test_files "")
+  endif()
+
+  # Create the binding library
+  pybind11_add_module(${target} THIN_LTO ${target}.cpp ${test_files} ${PYBIND11_HEADERS})
+  pybind11_enable_warnings(${target})
+
+  if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
+    get_property(
+      suffix
+      TARGET ${target}
+      PROPERTY SUFFIX)
+    set(source_output "${CMAKE_CURRENT_SOURCE_DIR}/${target}${suffix}")
+    if(suffix AND EXISTS "${source_output}")
+      message(WARNING "Output file also in source directory; "
+                      "please remove to avoid confusion: ${source_output}")
+    endif()
+  endif()
+
+  if(MSVC)
+    target_compile_options(${target} PRIVATE /utf-8)
+  endif()
+
+  if(EIGEN3_FOUND)
+    target_link_libraries(${target} PRIVATE Eigen3::Eigen)
+    target_compile_definitions(${target} PRIVATE -DPYBIND11_TEST_EIGEN)
+  endif()
+
+  if(Boost_FOUND)
+    target_link_libraries(${target} PRIVATE Boost::headers)
+    target_compile_definitions(${target} PRIVATE -DPYBIND11_TEST_BOOST)
+  endif()
+
+  # Always write the output file directly into the 'tests' directory (even on MSVC)
+  if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
+    set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
+                                               "${CMAKE_CURRENT_BINARY_DIR}")
+    foreach(config ${CMAKE_CONFIGURATION_TYPES})
+      string(TOUPPER ${config} config)
+      set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config}
+                                                 "${CMAKE_CURRENT_BINARY_DIR}")
+    endforeach()
+  endif()
+endforeach()
+
+# Make sure pytest is found or produce a fatal error
+if(NOT PYBIND11_PYTEST_FOUND)
+  execute_process(
+    COMMAND ${PYTHON_EXECUTABLE} -c "import pytest; print(pytest.__version__)"
+    RESULT_VARIABLE pytest_not_found
+    OUTPUT_VARIABLE pytest_version
+    ERROR_QUIET)
+  if(pytest_not_found)
+    message(FATAL_ERROR "Running the tests requires pytest. Please install it manually"
+                        " (try: ${PYTHON_EXECUTABLE} -m pip install pytest)")
+  elseif(pytest_version VERSION_LESS 3.1)
+    message(FATAL_ERROR "Running the tests requires pytest >= 3.1. Found: ${pytest_version}"
+                        "Please update it (try: ${PYTHON_EXECUTABLE} -m pip install -U pytest)")
+  endif()
+  set(PYBIND11_PYTEST_FOUND
+      TRUE
+      CACHE INTERNAL "")
+endif()
+
+if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
+  # This is not used later in the build, so it's okay to regenerate each time.
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/pytest.ini" "${CMAKE_CURRENT_BINARY_DIR}/pytest.ini"
+                 COPYONLY)
+  file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/pytest.ini"
+       "\ntestpaths = \"${CMAKE_CURRENT_SOURCE_DIR}\"")
+
+endif()
+
+# cmake 3.12 added list(transform <list> prepend
+# but we can't use it yet
+string(REPLACE "test_" "${CMAKE_CURRENT_BINARY_DIR}/test_" PYBIND11_BINARY_TEST_FILES
+               "${PYBIND11_PYTEST_FILES}")
+
+# A single command to compile and run the tests
+add_custom_target(
+  pytest
+  COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PYBIND11_BINARY_PYTEST_FILES}
+  DEPENDS ${test_targets}
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  USES_TERMINAL)
+
+if(PYBIND11_TEST_OVERRIDE)
+  add_custom_command(
+    TARGET pytest
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E echo
+            "Note: not all tests run: -DPYBIND11_TEST_OVERRIDE is in effect")
+endif()
+
+# Add a check target to run all the tests, starting with pytest (we add dependencies to this below)
+add_custom_target(check DEPENDS pytest)
+
+# The remaining tests only apply when being built as part of the pybind11 project, but not if the
+# tests are being built independently.
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  return()
+endif()
+
+# Add a post-build comment to show the primary test suite .so size and, if a previous size, compare it:
+add_custom_command(
+  TARGET pybind11_tests
+  POST_BUILD
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../tools/libsize.py
+    $<TARGET_FILE:pybind11_tests>
+    ${CMAKE_CURRENT_BINARY_DIR}/sosize-$<TARGET_FILE_NAME:pybind11_tests>.txt)
+
+# Test embedding the interpreter. Provides the `cpptest` target.
+add_subdirectory(test_embed)
+
+# Test CMake build using functions and targets from subdirectory or installed location
+add_subdirectory(test_cmake_build)
diff --git a/pybind11/tests/conftest.py b/pybind11/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2350d041f5d3d57dede9ff23c3177eae2914048
--- /dev/null
+++ b/pybind11/tests/conftest.py
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""pytest configuration
+
+Extends output capture as needed by pybind11: ignore constructors, optional unordered lines.
+Adds docstring and exceptions message sanitizers: ignore Python 2 vs 3 differences.
+"""
+
+import contextlib
+import difflib
+import gc
+import re
+import textwrap
+
+import pytest
+
+import env
+
+# Early diagnostic for failed imports
+import pybind11_tests  # noqa: F401
+
+_unicode_marker = re.compile(r'u(\'[^\']*\')')
+_long_marker = re.compile(r'([0-9])L')
+_hexadecimal = re.compile(r'0x[0-9a-fA-F]+')
+
+# Avoid collecting Python3 only files
+collect_ignore = []
+if env.PY2:
+    collect_ignore.append("test_async.py")
+
+
+def _strip_and_dedent(s):
+    """For triple-quote strings"""
+    return textwrap.dedent(s.lstrip('\n').rstrip())
+
+
+def _split_and_sort(s):
+    """For output which does not require specific line order"""
+    return sorted(_strip_and_dedent(s).splitlines())
+
+
+def _make_explanation(a, b):
+    """Explanation for a failed assert -- the a and b arguments are List[str]"""
+    return ["--- actual / +++ expected"] + [line.strip('\n') for line in difflib.ndiff(a, b)]
+
+
+class Output(object):
+    """Basic output post-processing and comparison"""
+    def __init__(self, string):
+        self.string = string
+        self.explanation = []
+
+    def __str__(self):
+        return self.string
+
+    def __eq__(self, other):
+        # Ignore constructor/destructor output which is prefixed with "###"
+        a = [line for line in self.string.strip().splitlines() if not line.startswith("###")]
+        b = _strip_and_dedent(other).splitlines()
+        if a == b:
+            return True
+        else:
+            self.explanation = _make_explanation(a, b)
+            return False
+
+
+class Unordered(Output):
+    """Custom comparison for output without strict line ordering"""
+    def __eq__(self, other):
+        a = _split_and_sort(self.string)
+        b = _split_and_sort(other)
+        if a == b:
+            return True
+        else:
+            self.explanation = _make_explanation(a, b)
+            return False
+
+
+class Capture(object):
+    def __init__(self, capfd):
+        self.capfd = capfd
+        self.out = ""
+        self.err = ""
+
+    def __enter__(self):
+        self.capfd.readouterr()
+        return self
+
+    def __exit__(self, *args):
+        self.out, self.err = self.capfd.readouterr()
+
+    def __eq__(self, other):
+        a = Output(self.out)
+        b = other
+        if a == b:
+            return True
+        else:
+            self.explanation = a.explanation
+            return False
+
+    def __str__(self):
+        return self.out
+
+    def __contains__(self, item):
+        return item in self.out
+
+    @property
+    def unordered(self):
+        return Unordered(self.out)
+
+    @property
+    def stderr(self):
+        return Output(self.err)
+
+
+@pytest.fixture
+def capture(capsys):
+    """Extended `capsys` with context manager and custom equality operators"""
+    return Capture(capsys)
+
+
+class SanitizedString(object):
+    def __init__(self, sanitizer):
+        self.sanitizer = sanitizer
+        self.string = ""
+        self.explanation = []
+
+    def __call__(self, thing):
+        self.string = self.sanitizer(thing)
+        return self
+
+    def __eq__(self, other):
+        a = self.string
+        b = _strip_and_dedent(other)
+        if a == b:
+            return True
+        else:
+            self.explanation = _make_explanation(a.splitlines(), b.splitlines())
+            return False
+
+
+def _sanitize_general(s):
+    s = s.strip()
+    s = s.replace("pybind11_tests.", "m.")
+    s = s.replace("unicode", "str")
+    s = _long_marker.sub(r"\1", s)
+    s = _unicode_marker.sub(r"\1", s)
+    return s
+
+
+def _sanitize_docstring(thing):
+    s = thing.__doc__
+    s = _sanitize_general(s)
+    return s
+
+
+@pytest.fixture
+def doc():
+    """Sanitize docstrings and add custom failure explanation"""
+    return SanitizedString(_sanitize_docstring)
+
+
+def _sanitize_message(thing):
+    s = str(thing)
+    s = _sanitize_general(s)
+    s = _hexadecimal.sub("0", s)
+    return s
+
+
+@pytest.fixture
+def msg():
+    """Sanitize messages and add custom failure explanation"""
+    return SanitizedString(_sanitize_message)
+
+
+# noinspection PyUnusedLocal
+def pytest_assertrepr_compare(op, left, right):
+    """Hook to insert custom failure explanation"""
+    if hasattr(left, 'explanation'):
+        return left.explanation
+
+
+@contextlib.contextmanager
+def suppress(exception):
+    """Suppress the desired exception"""
+    try:
+        yield
+    except exception:
+        pass
+
+
+def gc_collect():
+    ''' Run the garbage collector twice (needed when running
+    reference counting tests with PyPy) '''
+    gc.collect()
+    gc.collect()
+
+
+def pytest_configure():
+    pytest.suppress = suppress
+    pytest.gc_collect = gc_collect
diff --git a/pybind11/tests/constructor_stats.h b/pybind11/tests/constructor_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..abfaf9161406798eeaa79a0d6c22e023de893495
--- /dev/null
+++ b/pybind11/tests/constructor_stats.h
@@ -0,0 +1,275 @@
+#pragma once
+/*
+    tests/constructor_stats.h -- framework for printing and tracking object
+    instance lifetimes in example/test code.
+
+    Copyright (c) 2016 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+
+This header provides a few useful tools for writing examples or tests that want to check and/or
+display object instance lifetimes.  It requires that you include this header and add the following
+function calls to constructors:
+
+    class MyClass {
+        MyClass() { ...; print_default_created(this); }
+        ~MyClass() { ...; print_destroyed(this); }
+        MyClass(const MyClass &c) { ...; print_copy_created(this); }
+        MyClass(MyClass &&c) { ...; print_move_created(this); }
+        MyClass(int a, int b) { ...; print_created(this, a, b); }
+        MyClass &operator=(const MyClass &c) { ...; print_copy_assigned(this); }
+        MyClass &operator=(MyClass &&c) { ...; print_move_assigned(this); }
+
+        ...
+    }
+
+You can find various examples of these in several of the existing testing .cpp files.  (Of course
+you don't need to add any of the above constructors/operators that you don't actually have, except
+for the destructor).
+
+Each of these will print an appropriate message such as:
+
+    ### MyClass @ 0x2801910 created via default constructor
+    ### MyClass @ 0x27fa780 created 100 200
+    ### MyClass @ 0x2801910 destroyed
+    ### MyClass @ 0x27fa780 destroyed
+
+You can also include extra arguments (such as the 100, 200 in the output above, coming from the
+value constructor) for all of the above methods which will be included in the output.
+
+For testing, each of these also keeps track the created instances and allows you to check how many
+of the various constructors have been invoked from the Python side via code such as:
+
+    from pybind11_tests import ConstructorStats
+    cstats = ConstructorStats.get(MyClass)
+    print(cstats.alive())
+    print(cstats.default_constructions)
+
+Note that `.alive()` should usually be the first thing you call as it invokes Python's garbage
+collector to actually destroy objects that aren't yet referenced.
+
+For everything except copy and move constructors and destructors, any extra values given to the
+print_...() function is stored in a class-specific values list which you can retrieve and inspect
+from the ConstructorStats instance `.values()` method.
+
+In some cases, when you need to track instances of a C++ class not registered with pybind11, you
+need to add a function returning the ConstructorStats for the C++ class; this can be done with:
+
+    m.def("get_special_cstats", &ConstructorStats::get<SpecialClass>, py::return_value_policy::reference)
+
+Finally, you can suppress the output messages, but keep the constructor tracking (for
+inspection/testing in python) by using the functions with `print_` replaced with `track_` (e.g.
+`track_copy_created(this)`).
+
+*/
+
+#include "pybind11_tests.h"
+#include <unordered_map>
+#include <list>
+#include <typeindex>
+#include <sstream>
+
+class ConstructorStats {
+protected:
+    std::unordered_map<void*, int> _instances; // Need a map rather than set because members can shared address with parents
+    std::list<std::string> _values; // Used to track values (e.g. of value constructors)
+public:
+    int default_constructions = 0;
+    int copy_constructions = 0;
+    int move_constructions = 0;
+    int copy_assignments = 0;
+    int move_assignments = 0;
+
+    void copy_created(void *inst) {
+        created(inst);
+        copy_constructions++;
+    }
+
+    void move_created(void *inst) {
+        created(inst);
+        move_constructions++;
+    }
+
+    void default_created(void *inst) {
+        created(inst);
+        default_constructions++;
+    }
+
+    void created(void *inst) {
+        ++_instances[inst];
+    }
+
+    void destroyed(void *inst) {
+        if (--_instances[inst] < 0)
+            throw std::runtime_error("cstats.destroyed() called with unknown "
+                                     "instance; potential double-destruction "
+                                     "or a missing cstats.created()");
+    }
+
+    static void gc() {
+        // Force garbage collection to ensure any pending destructors are invoked:
+#if defined(PYPY_VERSION)
+        PyObject *globals = PyEval_GetGlobals();
+        PyObject *result = PyRun_String(
+            "import gc\n"
+            "for i in range(2):"
+            "    gc.collect()\n",
+            Py_file_input, globals, globals);
+        if (result == nullptr)
+            throw py::error_already_set();
+        Py_DECREF(result);
+#else
+        py::module::import("gc").attr("collect")();
+#endif
+    }
+
+    int alive() {
+        gc();
+        int total = 0;
+        for (const auto &p : _instances)
+            if (p.second > 0)
+                total += p.second;
+        return total;
+    }
+
+    void value() {} // Recursion terminator
+    // Takes one or more values, converts them to strings, then stores them.
+    template <typename T, typename... Tmore> void value(const T &v, Tmore &&...args) {
+        std::ostringstream oss;
+        oss << v;
+        _values.push_back(oss.str());
+        value(std::forward<Tmore>(args)...);
+    }
+
+    // Move out stored values
+    py::list values() {
+        py::list l;
+        for (const auto &v : _values) l.append(py::cast(v));
+        _values.clear();
+        return l;
+    }
+
+    // Gets constructor stats from a C++ type index
+    static ConstructorStats& get(std::type_index type) {
+        static std::unordered_map<std::type_index, ConstructorStats> all_cstats;
+        return all_cstats[type];
+    }
+
+    // Gets constructor stats from a C++ type
+    template <typename T> static ConstructorStats& get() {
+#if defined(PYPY_VERSION)
+        gc();
+#endif
+        return get(typeid(T));
+    }
+
+    // Gets constructor stats from a Python class
+    static ConstructorStats& get(py::object class_) {
+        auto &internals = py::detail::get_internals();
+        const std::type_index *t1 = nullptr, *t2 = nullptr;
+        try {
+            auto *type_info = internals.registered_types_py.at((PyTypeObject *) class_.ptr()).at(0);
+            for (auto &p : internals.registered_types_cpp) {
+                if (p.second == type_info) {
+                    if (t1) {
+                        t2 = &p.first;
+                        break;
+                    }
+                    t1 = &p.first;
+                }
+            }
+        }
+        catch (const std::out_of_range&) {}
+        if (!t1) throw std::runtime_error("Unknown class passed to ConstructorStats::get()");
+        auto &cs1 = get(*t1);
+        // If we have both a t1 and t2 match, one is probably the trampoline class; return whichever
+        // has more constructions (typically one or the other will be 0)
+        if (t2) {
+            auto &cs2 = get(*t2);
+            int cs1_total = cs1.default_constructions + cs1.copy_constructions + cs1.move_constructions + (int) cs1._values.size();
+            int cs2_total = cs2.default_constructions + cs2.copy_constructions + cs2.move_constructions + (int) cs2._values.size();
+            if (cs2_total > cs1_total) return cs2;
+        }
+        return cs1;
+    }
+};
+
+// To track construction/destruction, you need to call these methods from the various
+// constructors/operators.  The ones that take extra values record the given values in the
+// constructor stats values for later inspection.
+template <class T> void track_copy_created(T *inst) { ConstructorStats::get<T>().copy_created(inst); }
+template <class T> void track_move_created(T *inst) { ConstructorStats::get<T>().move_created(inst); }
+template <class T, typename... Values> void track_copy_assigned(T *, Values &&...values) {
+    auto &cst = ConstructorStats::get<T>();
+    cst.copy_assignments++;
+    cst.value(std::forward<Values>(values)...);
+}
+template <class T, typename... Values> void track_move_assigned(T *, Values &&...values) {
+    auto &cst = ConstructorStats::get<T>();
+    cst.move_assignments++;
+    cst.value(std::forward<Values>(values)...);
+}
+template <class T, typename... Values> void track_default_created(T *inst, Values &&...values) {
+    auto &cst = ConstructorStats::get<T>();
+    cst.default_created(inst);
+    cst.value(std::forward<Values>(values)...);
+}
+template <class T, typename... Values> void track_created(T *inst, Values &&...values) {
+    auto &cst = ConstructorStats::get<T>();
+    cst.created(inst);
+    cst.value(std::forward<Values>(values)...);
+}
+template <class T, typename... Values> void track_destroyed(T *inst) {
+    ConstructorStats::get<T>().destroyed(inst);
+}
+template <class T, typename... Values> void track_values(T *, Values &&...values) {
+    ConstructorStats::get<T>().value(std::forward<Values>(values)...);
+}
+
+/// Don't cast pointers to Python, print them as strings
+inline const char *format_ptrs(const char *p) { return p; }
+template <typename T>
+py::str format_ptrs(T *p) { return "{:#x}"_s.format(reinterpret_cast<std::uintptr_t>(p)); }
+template <typename T>
+auto format_ptrs(T &&x) -> decltype(std::forward<T>(x)) { return std::forward<T>(x); }
+
+template <class T, typename... Output>
+void print_constr_details(T *inst, const std::string &action, Output &&...output) {
+    py::print("###", py::type_id<T>(), "@", format_ptrs(inst), action,
+              format_ptrs(std::forward<Output>(output))...);
+}
+
+// Verbose versions of the above:
+template <class T, typename... Values> void print_copy_created(T *inst, Values &&...values) { // NB: this prints, but doesn't store, given values
+    print_constr_details(inst, "created via copy constructor", values...);
+    track_copy_created(inst);
+}
+template <class T, typename... Values> void print_move_created(T *inst, Values &&...values) { // NB: this prints, but doesn't store, given values
+    print_constr_details(inst, "created via move constructor", values...);
+    track_move_created(inst);
+}
+template <class T, typename... Values> void print_copy_assigned(T *inst, Values &&...values) {
+    print_constr_details(inst, "assigned via copy assignment", values...);
+    track_copy_assigned(inst, values...);
+}
+template <class T, typename... Values> void print_move_assigned(T *inst, Values &&...values) {
+    print_constr_details(inst, "assigned via move assignment", values...);
+    track_move_assigned(inst, values...);
+}
+template <class T, typename... Values> void print_default_created(T *inst, Values &&...values) {
+    print_constr_details(inst, "created via default constructor", values...);
+    track_default_created(inst, values...);
+}
+template <class T, typename... Values> void print_created(T *inst, Values &&...values) {
+    print_constr_details(inst, "created", values...);
+    track_created(inst, values...);
+}
+template <class T, typename... Values> void print_destroyed(T *inst, Values &&...values) { // Prints but doesn't store given values
+    print_constr_details(inst, "destroyed", values...);
+    track_destroyed(inst);
+}
+template <class T, typename... Values> void print_values(T *inst, Values &&...values) {
+    print_constr_details(inst, ":", values...);
+    track_values(inst, values...);
+}
diff --git a/pybind11/tests/cross_module_gil_utils.cpp b/pybind11/tests/cross_module_gil_utils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07db9f6e48a10dfd2d4370c3daff6e793d6675d2
--- /dev/null
+++ b/pybind11/tests/cross_module_gil_utils.cpp
@@ -0,0 +1,73 @@
+/*
+    tests/cross_module_gil_utils.cpp -- tools for acquiring GIL from a different module
+
+    Copyright (c) 2019 Google LLC
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+#include <pybind11/pybind11.h>
+#include <cstdint>
+
+// This file mimics a DSO that makes pybind11 calls but does not define a
+// PYBIND11_MODULE. The purpose is to test that such a DSO can create a
+// py::gil_scoped_acquire when the running thread is in a GIL-released state.
+//
+// Note that we define a Python module here for convenience, but in general
+// this need not be the case. The typical scenario would be a DSO that implements
+// shared logic used internally by multiple pybind11 modules.
+
+namespace {
+
+namespace py = pybind11;
+void gil_acquire() { py::gil_scoped_acquire gil; }
+
+constexpr char kModuleName[] = "cross_module_gil_utils";
+
+#if PY_MAJOR_VERSION >= 3
+struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    kModuleName,
+    NULL,
+    0,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+#else
+PyMethodDef module_methods[] = {
+    {NULL, NULL, 0, NULL}
+};
+#endif
+
+}  // namespace
+
+extern "C" PYBIND11_EXPORT
+#if PY_MAJOR_VERSION >= 3
+PyObject* PyInit_cross_module_gil_utils()
+#else
+void initcross_module_gil_utils()
+#endif
+{
+
+    PyObject* m =
+#if PY_MAJOR_VERSION >= 3
+        PyModule_Create(&moduledef);
+#else
+        Py_InitModule(kModuleName, module_methods);
+#endif
+
+    if (m != NULL) {
+        static_assert(
+            sizeof(&gil_acquire) == sizeof(void*),
+            "Function pointer must have the same size as void*");
+        PyModule_AddObject(m, "gil_acquire_funcaddr",
+                           PyLong_FromVoidPtr(reinterpret_cast<void*>(&gil_acquire)));
+    }
+
+#if PY_MAJOR_VERSION >= 3
+    return m;
+#endif
+}
diff --git a/pybind11/tests/env.py b/pybind11/tests/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cded441271c61af72fc0be0de79332dc6279d72
--- /dev/null
+++ b/pybind11/tests/env.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+import platform
+import sys
+
+LINUX = sys.platform.startswith("linux")
+MACOS = sys.platform.startswith("darwin")
+WIN = sys.platform.startswith("win32") or sys.platform.startswith("cygwin")
+
+CPYTHON = platform.python_implementation() == "CPython"
+PYPY = platform.python_implementation() == "PyPy"
+
+PY2 = sys.version_info.major == 2
+
+PY = sys.version_info
diff --git a/pybind11/tests/local_bindings.h b/pybind11/tests/local_bindings.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6afb808664de1fdbde011a9bf7c38d3a8794127
--- /dev/null
+++ b/pybind11/tests/local_bindings.h
@@ -0,0 +1,64 @@
+#pragma once
+#include "pybind11_tests.h"
+
+/// Simple class used to test py::local:
+template <int> class LocalBase {
+public:
+    LocalBase(int i) : i(i) { }
+    int i = -1;
+};
+
+/// Registered with py::module_local in both main and secondary modules:
+using LocalType = LocalBase<0>;
+/// Registered without py::module_local in both modules:
+using NonLocalType = LocalBase<1>;
+/// A second non-local type (for stl_bind tests):
+using NonLocal2 = LocalBase<2>;
+/// Tests within-module, different-compilation-unit local definition conflict:
+using LocalExternal = LocalBase<3>;
+/// Mixed: registered local first, then global
+using MixedLocalGlobal = LocalBase<4>;
+/// Mixed: global first, then local
+using MixedGlobalLocal = LocalBase<5>;
+
+/// Registered with py::module_local only in the secondary module:
+using ExternalType1 = LocalBase<6>;
+using ExternalType2 = LocalBase<7>;
+
+using LocalVec = std::vector<LocalType>;
+using LocalVec2 = std::vector<NonLocal2>;
+using LocalMap = std::unordered_map<std::string, LocalType>;
+using NonLocalVec = std::vector<NonLocalType>;
+using NonLocalVec2 = std::vector<NonLocal2>;
+using NonLocalMap = std::unordered_map<std::string, NonLocalType>;
+using NonLocalMap2 = std::unordered_map<std::string, uint8_t>;
+
+PYBIND11_MAKE_OPAQUE(LocalVec);
+PYBIND11_MAKE_OPAQUE(LocalVec2);
+PYBIND11_MAKE_OPAQUE(LocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalVec);
+//PYBIND11_MAKE_OPAQUE(NonLocalVec2); // same type as LocalVec2
+PYBIND11_MAKE_OPAQUE(NonLocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalMap2);
+
+
+// Simple bindings (used with the above):
+template <typename T, int Adjust = 0, typename... Args>
+py::class_<T> bind_local(Args && ...args) {
+    return py::class_<T>(std::forward<Args>(args)...)
+        .def(py::init<int>())
+        .def("get", [](T &i) { return i.i + Adjust; });
+};
+
+// Simulate a foreign library base class (to match the example in the docs):
+namespace pets {
+class Pet {
+public:
+    Pet(std::string name) : name_(name) {}
+    std::string name_;
+    const std::string &name() { return name_; }
+};
+}
+
+struct MixGL { int i; MixGL(int i) : i{i} {} };
+struct MixGL2 { int i; MixGL2(int i) : i{i} {} };
diff --git a/pybind11/tests/object.h b/pybind11/tests/object.h
new file mode 100644
index 0000000000000000000000000000000000000000..9235f19c20bff3afb59c6880a84c809205eff6ea
--- /dev/null
+++ b/pybind11/tests/object.h
@@ -0,0 +1,175 @@
+#if !defined(__OBJECT_H)
+#define __OBJECT_H
+
+#include <atomic>
+#include "constructor_stats.h"
+
+/// Reference counted object base class
+class Object {
+public:
+    /// Default constructor
+    Object() { print_default_created(this); }
+
+    /// Copy constructor
+    Object(const Object &) : m_refCount(0) { print_copy_created(this); }
+
+    /// Return the current reference count
+    int getRefCount() const { return m_refCount; };
+
+    /// Increase the object's reference count by one
+    void incRef() const { ++m_refCount; }
+
+    /** \brief Decrease the reference count of
+     * the object and possibly deallocate it.
+     *
+     * The object will automatically be deallocated once
+     * the reference count reaches zero.
+     */
+    void decRef(bool dealloc = true) const {
+        --m_refCount;
+        if (m_refCount == 0 && dealloc)
+            delete this;
+        else if (m_refCount < 0)
+            throw std::runtime_error("Internal error: reference count < 0!");
+    }
+
+    virtual std::string toString() const = 0;
+protected:
+    /** \brief Virtual protected deconstructor.
+     * (Will only be called by \ref ref)
+     */
+    virtual ~Object() { print_destroyed(this); }
+private:
+    mutable std::atomic<int> m_refCount { 0 };
+};
+
+// Tag class used to track constructions of ref objects.  When we track constructors, below, we
+// track and print out the actual class (e.g. ref<MyObject>), and *also* add a fake tracker for
+// ref_tag.  This lets us check that the total number of ref<Anything> constructors/destructors is
+// correct without having to check each individual ref<Whatever> type individually.
+class ref_tag {};
+
+/**
+ * \brief Reference counting helper
+ *
+ * The \a ref refeference template is a simple wrapper to store a
+ * pointer to an object. It takes care of increasing and decreasing
+ * the reference count of the object. When the last reference goes
+ * out of scope, the associated object will be deallocated.
+ *
+ * \ingroup libcore
+ */
+template <typename T> class ref {
+public:
+    /// Create a nullptr reference
+    ref() : m_ptr(nullptr) { print_default_created(this); track_default_created((ref_tag*) this); }
+
+    /// Construct a reference from a pointer
+    ref(T *ptr) : m_ptr(ptr) {
+        if (m_ptr) ((Object *) m_ptr)->incRef();
+
+        print_created(this, "from pointer", m_ptr); track_created((ref_tag*) this, "from pointer");
+
+    }
+
+    /// Copy constructor
+    ref(const ref &r) : m_ptr(r.m_ptr) {
+        if (m_ptr)
+            ((Object *) m_ptr)->incRef();
+
+        print_copy_created(this, "with pointer", m_ptr); track_copy_created((ref_tag*) this);
+    }
+
+    /// Move constructor
+    ref(ref &&r) : m_ptr(r.m_ptr) {
+        r.m_ptr = nullptr;
+
+        print_move_created(this, "with pointer", m_ptr); track_move_created((ref_tag*) this);
+    }
+
+    /// Destroy this reference
+    ~ref() {
+        if (m_ptr)
+            ((Object *) m_ptr)->decRef();
+
+        print_destroyed(this); track_destroyed((ref_tag*) this);
+    }
+
+    /// Move another reference into the current one
+    ref& operator=(ref&& r) {
+        print_move_assigned(this, "pointer", r.m_ptr); track_move_assigned((ref_tag*) this);
+
+        if (*this == r)
+            return *this;
+        if (m_ptr)
+            ((Object *) m_ptr)->decRef();
+        m_ptr = r.m_ptr;
+        r.m_ptr = nullptr;
+        return *this;
+    }
+
+    /// Overwrite this reference with another reference
+    ref& operator=(const ref& r) {
+        print_copy_assigned(this, "pointer", r.m_ptr); track_copy_assigned((ref_tag*) this);
+
+        if (m_ptr == r.m_ptr)
+            return *this;
+        if (m_ptr)
+            ((Object *) m_ptr)->decRef();
+        m_ptr = r.m_ptr;
+        if (m_ptr)
+            ((Object *) m_ptr)->incRef();
+        return *this;
+    }
+
+    /// Overwrite this reference with a pointer to another object
+    ref& operator=(T *ptr) {
+        print_values(this, "assigned pointer"); track_values((ref_tag*) this, "assigned pointer");
+
+        if (m_ptr == ptr)
+            return *this;
+        if (m_ptr)
+            ((Object *) m_ptr)->decRef();
+        m_ptr = ptr;
+        if (m_ptr)
+            ((Object *) m_ptr)->incRef();
+        return *this;
+    }
+
+    /// Compare this reference with another reference
+    bool operator==(const ref &r) const { return m_ptr == r.m_ptr; }
+
+    /// Compare this reference with another reference
+    bool operator!=(const ref &r) const { return m_ptr != r.m_ptr; }
+
+    /// Compare this reference with a pointer
+    bool operator==(const T* ptr) const { return m_ptr == ptr; }
+
+    /// Compare this reference with a pointer
+    bool operator!=(const T* ptr) const { return m_ptr != ptr; }
+
+    /// Access the object referenced by this reference
+    T* operator->() { return m_ptr; }
+
+    /// Access the object referenced by this reference
+    const T* operator->() const { return m_ptr; }
+
+    /// Return a C++ reference to the referenced object
+    T& operator*() { return *m_ptr; }
+
+    /// Return a const C++ reference to the referenced object
+    const T& operator*() const { return *m_ptr; }
+
+    /// Return a pointer to the referenced object
+    operator T* () { return m_ptr; }
+
+    /// Return a const pointer to the referenced object
+    T* get_ptr() { return m_ptr; }
+
+    /// Return a pointer to the referenced object
+    const T* get_ptr() const { return m_ptr; }
+private:
+    T *m_ptr;
+};
+
+#endif /* __OBJECT_H */
diff --git a/pybind11/tests/pybind11_cross_module_tests.cpp b/pybind11/tests/pybind11_cross_module_tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f705e310611619dff319f9b5d53b71e6fd54aec5
--- /dev/null
+++ b/pybind11/tests/pybind11_cross_module_tests.cpp
@@ -0,0 +1,123 @@
+/*
+    tests/pybind11_cross_module_tests.cpp -- contains tests that require multiple modules
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "local_bindings.h"
+#include <pybind11/stl_bind.h>
+#include <numeric>
+
+PYBIND11_MODULE(pybind11_cross_module_tests, m) {
+    m.doc() = "pybind11 cross-module test module";
+
+    // test_local_bindings.py tests:
+    //
+    // Definitions here are tested by importing both this module and the
+    // relevant pybind11_tests submodule from a test_whatever.py
+
+    // test_load_external
+    bind_local<ExternalType1>(m, "ExternalType1", py::module_local());
+    bind_local<ExternalType2>(m, "ExternalType2", py::module_local());
+
+    // test_exceptions.py
+    m.def("raise_runtime_error", []() { PyErr_SetString(PyExc_RuntimeError, "My runtime error"); throw py::error_already_set(); });
+    m.def("raise_value_error", []() { PyErr_SetString(PyExc_ValueError, "My value error"); throw py::error_already_set(); });
+    m.def("throw_pybind_value_error", []() { throw py::value_error("pybind11 value error"); });
+    m.def("throw_pybind_type_error", []() { throw py::type_error("pybind11 type error"); });
+    m.def("throw_stop_iteration", []() { throw py::stop_iteration(); });
+
+    // test_local_bindings.py
+    // Local to both:
+    bind_local<LocalType, 1>(m, "LocalType", py::module_local())
+        .def("get2", [](LocalType &t) { return t.i + 2; })
+        ;
+
+    // Can only be called with our python type:
+    m.def("local_value", [](LocalType &l) { return l.i; });
+
+    // test_nonlocal_failure
+    // This registration will fail (global registration when LocalFail is already registered
+    // globally in the main test module):
+    m.def("register_nonlocal", [m]() {
+        bind_local<NonLocalType, 0>(m, "NonLocalType");
+    });
+
+    // test_stl_bind_local
+    // stl_bind.h binders defaults to py::module_local if the types are local or converting:
+    py::bind_vector<LocalVec>(m, "LocalVec");
+    py::bind_map<LocalMap>(m, "LocalMap");
+
+    // test_stl_bind_global
+    // and global if the type (or one of the types, for the map) is global (so these will fail,
+    // assuming pybind11_tests is already loaded):
+    m.def("register_nonlocal_vec", [m]() {
+        py::bind_vector<NonLocalVec>(m, "NonLocalVec");
+    });
+    m.def("register_nonlocal_map", [m]() {
+        py::bind_map<NonLocalMap>(m, "NonLocalMap");
+    });
+    // The default can, however, be overridden to global using `py::module_local()` or
+    // `py::module_local(false)`.
+    // Explicitly made local:
+    py::bind_vector<NonLocalVec2>(m, "NonLocalVec2", py::module_local());
+    // Explicitly made global (and so will fail to bind):
+    m.def("register_nonlocal_map2", [m]() {
+        py::bind_map<NonLocalMap2>(m, "NonLocalMap2", py::module_local(false));
+    });
+
+    // test_mixed_local_global
+    // We try this both with the global type registered first and vice versa (the order shouldn't
+    // matter).
+    m.def("register_mixed_global_local", [m]() {
+        bind_local<MixedGlobalLocal, 200>(m, "MixedGlobalLocal", py::module_local());
+    });
+    m.def("register_mixed_local_global", [m]() {
+        bind_local<MixedLocalGlobal, 2000>(m, "MixedLocalGlobal", py::module_local(false));
+    });
+    m.def("get_mixed_gl", [](int i) { return MixedGlobalLocal(i); });
+    m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
+
+    // test_internal_locals_differ
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+
+    // test_stl_caster_vs_stl_bind
+    py::bind_vector<std::vector<int>>(m, "VectorInt");
+
+    m.def("load_vector_via_binding", [](std::vector<int> &v) {
+        return std::accumulate(v.begin(), v.end(), 0);
+    });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Dog : public pets::Pet { public: Dog(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Dog, pets::Pet>(m, "Dog")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL", py::module_local()).def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 100; });
+
+    py::class_<MixGL2>(m, "MixGL2", py::module_local()).def(py::init<int>());
+
+    // test_vector_bool
+    // We can't test both stl.h and stl_bind.h conversions of `std::vector<bool>` within
+    // the same module (it would be an ODR violation). Therefore `bind_vector` of `bool`
+    // is defined here and tested in `test_stl_binders.py`.
+    py::bind_vector<std::vector<bool>>(m, "VectorBool");
+
+    // test_missing_header_message
+    // The main module already includes stl.h, but we need to test the error message
+    // which appears when this header is missing.
+    m.def("missing_header_arg", [](std::vector<float>) { });
+    m.def("missing_header_return", []() { return std::vector<float>(); });
+}
diff --git a/pybind11/tests/pybind11_tests.cpp b/pybind11/tests/pybind11_tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76e0298e83a1088a439441580bc3866c664dcaea
--- /dev/null
+++ b/pybind11/tests/pybind11_tests.cpp
@@ -0,0 +1,91 @@
+/*
+    tests/pybind11_tests.cpp -- pybind example plugin
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+#include <functional>
+#include <list>
+
+/*
+For testing purposes, we define a static global variable here in a function that each individual
+test .cpp calls with its initialization lambda.  It's convenient here because we can just not
+compile some test files to disable/ignore some of the test code.
+
+It is NOT recommended as a way to use pybind11 in practice, however: the initialization order will
+be essentially random, which is okay for our test scripts (there are no dependencies between the
+individual pybind11 test .cpp files), but most likely not what you want when using pybind11
+productively.
+
+Instead, see the "How can I reduce the build time?" question in the "Frequently asked questions"
+section of the documentation for good practice on splitting binding code over multiple files.
+*/
+std::list<std::function<void(py::module &)>> &initializers() {
+    static std::list<std::function<void(py::module &)>> inits;
+    return inits;
+}
+
+test_initializer::test_initializer(Initializer init) {
+    initializers().push_back(init);
+}
+
+test_initializer::test_initializer(const char *submodule_name, Initializer init) {
+    initializers().push_back([=](py::module &parent) {
+        auto m = parent.def_submodule(submodule_name);
+        init(m);
+    });
+}
+
+void bind_ConstructorStats(py::module &m) {
+    py::class_<ConstructorStats>(m, "ConstructorStats")
+        .def("alive", &ConstructorStats::alive)
+        .def("values", &ConstructorStats::values)
+        .def_readwrite("default_constructions", &ConstructorStats::default_constructions)
+        .def_readwrite("copy_assignments", &ConstructorStats::copy_assignments)
+        .def_readwrite("move_assignments", &ConstructorStats::move_assignments)
+        .def_readwrite("copy_constructions", &ConstructorStats::copy_constructions)
+        .def_readwrite("move_constructions", &ConstructorStats::move_constructions)
+        .def_static("get", (ConstructorStats &(*)(py::object)) &ConstructorStats::get, py::return_value_policy::reference_internal)
+
+        // Not exactly ConstructorStats, but related: expose the internal pybind number of registered instances
+        // to allow instance cleanup checks (invokes a GC first)
+        .def_static("detail_reg_inst", []() {
+            ConstructorStats::gc();
+            return py::detail::get_internals().registered_instances.size();
+        })
+        ;
+}
+
+PYBIND11_MODULE(pybind11_tests, m) {
+    m.doc() = "pybind11 test module";
+
+    bind_ConstructorStats(m);
+
+#if !defined(NDEBUG)
+    m.attr("debug_enabled") = true;
+#else
+    m.attr("debug_enabled") = false;
+#endif
+
+    py::class_<UserType>(m, "UserType", "A `py::class_` type for testing")
+        .def(py::init<>())
+        .def(py::init<int>())
+        .def("get_value", &UserType::value, "Get value using a method")
+        .def("set_value", &UserType::set, "Set value using a method")
+        .def_property("value", &UserType::value, &UserType::set, "Get/set value using a property")
+        .def("__repr__", [](const UserType& u) { return "UserType({})"_s.format(u.value()); });
+
+    py::class_<IncType, UserType>(m, "IncType")
+        .def(py::init<>())
+        .def(py::init<int>())
+        .def("__repr__", [](const IncType& u) { return "IncType({})"_s.format(u.value()); });
+
+    for (const auto &initializer : initializers())
+        initializer(m);
+}
diff --git a/pybind11/tests/pybind11_tests.h b/pybind11/tests/pybind11_tests.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e47416270ff1c81dd01f4725b896341ae4dcd0f
--- /dev/null
+++ b/pybind11/tests/pybind11_tests.h
@@ -0,0 +1,65 @@
+#pragma once
+#include <pybind11/pybind11.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1910
+// We get some really long type names here which causes MSVC 2015 to emit warnings
+#  pragma warning(disable: 4503) // warning C4503: decorated name length exceeded, name was truncated
+#endif
+
+namespace py = pybind11;
+using namespace pybind11::literals;
+
+class test_initializer {
+    using Initializer = void (*)(py::module &);
+
+public:
+    test_initializer(Initializer init);
+    test_initializer(const char *submodule_name, Initializer init);
+};
+
+#define TEST_SUBMODULE(name, variable)                   \
+    void test_submodule_##name(py::module &);            \
+    test_initializer name(#name, test_submodule_##name); \
+    void test_submodule_##name(py::module &variable)
+
+
+/// Dummy type which is not exported anywhere -- something to trigger a conversion error
+struct UnregisteredType { };
+
+/// A user-defined type which is exported and can be used by any test
+class UserType {
+public:
+    UserType() = default;
+    UserType(int i) : i(i) { }
+
+    int value() const { return i; }
+    void set(int set) { i = set; }
+
+private:
+    int i = -1;
+};
+
+/// Like UserType, but increments `value` on copy for quick reference vs. copy tests
+class IncType : public UserType {
+public:
+    using UserType::UserType;
+    IncType() = default;
+    IncType(const IncType &other) : IncType(other.value() + 1) { }
+    IncType(IncType &&) = delete;
+    IncType &operator=(const IncType &) = delete;
+    IncType &operator=(IncType &&) = delete;
+};
+
+/// Custom cast-only type that casts to a string "rvalue" or "lvalue" depending on the cast context.
+/// Used to test recursive casters (e.g. std::tuple, stl containers).
+struct RValueCaster {};
+PYBIND11_NAMESPACE_BEGIN(pybind11)
+PYBIND11_NAMESPACE_BEGIN(detail)
+template<> class type_caster<RValueCaster> {
+public:
+    PYBIND11_TYPE_CASTER(RValueCaster, _("RValueCaster"));
+    static handle cast(RValueCaster &&, return_value_policy, handle) { return py::str("rvalue").release(); }
+    static handle cast(const RValueCaster &, return_value_policy, handle) { return py::str("lvalue").release(); }
+};
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(pybind11)
diff --git a/pybind11/tests/pytest.ini b/pybind11/tests/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..6d758ea6ac8d7315804875fba8d4e33cf1752e77
--- /dev/null
+++ b/pybind11/tests/pytest.ini
@@ -0,0 +1,19 @@
+[pytest]
+minversion = 3.1
+norecursedirs = test_cmake_build test_embed
+xfail_strict = True
+addopts =
+    # show summary of skipped tests
+    -rs
+    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
+    --capture=sys
+    # enable all warnings
+    -Wa
+filterwarnings =
+    # make warnings into errors but ignore certain third-party extension issues
+    error
+    # importing scipy submodules on some version of Python
+    ignore::ImportWarning
+    # bogus numpy ABI warning (see numpy/#432)
+    ignore:.*numpy.dtype size changed.*:RuntimeWarning
+    ignore:.*numpy.ufunc size changed.*:RuntimeWarning
diff --git a/pybind11/tests/requirements.txt b/pybind11/tests/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..39bd57a1c7860bfdee5c6206ebf79426f5a49abc
--- /dev/null
+++ b/pybind11/tests/requirements.txt
@@ -0,0 +1,8 @@
+--extra-index-url https://antocuni.github.io/pypy-wheels/manylinux2010/
+numpy==1.16.6; python_version<"3.6"
+numpy==1.18.0; platform_python_implementation=="PyPy" and sys_platform=="darwin" and python_version>="3.6"
+numpy==1.19.1; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version>="3.6" and python_version<"3.9"
+pytest==4.6.9; python_version<"3.5"
+pytest==5.4.3; python_version>="3.5"
+scipy==1.2.3; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version<"3.6"
+scipy==1.5.2; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version>="3.6" and python_version<"3.9"
diff --git a/pybind11/tests/test_async.cpp b/pybind11/tests/test_async.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0ad0d535048fbb825b444e743193c743551cdd4
--- /dev/null
+++ b/pybind11/tests/test_async.cpp
@@ -0,0 +1,26 @@
+/*
+    tests/test_async.cpp -- __await__ support
+
+    Copyright (c) 2019 Google Inc.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(async_module, m) {
+    struct DoesNotSupportAsync {};
+    py::class_<DoesNotSupportAsync>(m, "DoesNotSupportAsync")
+        .def(py::init<>());
+    struct SupportsAsync {};
+    py::class_<SupportsAsync>(m, "SupportsAsync")
+        .def(py::init<>())
+        .def("__await__", [](const SupportsAsync& self) -> py::object {
+            static_cast<void>(self);
+            py::object loop = py::module::import("asyncio.events").attr("get_event_loop")();
+            py::object f = loop.attr("create_future")();
+            f.attr("set_result")(5);
+            return f.attr("__await__")();
+        });
+}
diff --git a/pybind11/tests/test_async.py b/pybind11/tests/test_async.py
new file mode 100644
index 0000000000000000000000000000000000000000..df4489c499e88f190764dd17cef44b54b4516202
--- /dev/null
+++ b/pybind11/tests/test_async.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+asyncio = pytest.importorskip("asyncio")
+m = pytest.importorskip("pybind11_tests.async_module")
+
+
+@pytest.fixture
+def event_loop():
+    loop = asyncio.new_event_loop()
+    yield loop
+    loop.close()
+
+
+async def get_await_result(x):
+    return await x
+
+
+def test_await(event_loop):
+    assert 5 == event_loop.run_until_complete(get_await_result(m.SupportsAsync()))
+
+
+def test_await_missing(event_loop):
+    with pytest.raises(TypeError):
+        event_loop.run_until_complete(get_await_result(m.DoesNotSupportAsync()))
diff --git a/pybind11/tests/test_buffers.cpp b/pybind11/tests/test_buffers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1bc67ff7b66e86d7bf94de845e5737261f2a1280
--- /dev/null
+++ b/pybind11/tests/test_buffers.cpp
@@ -0,0 +1,195 @@
+/*
+    tests/test_buffers.cpp -- supporting Pythons' buffer protocol
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+TEST_SUBMODULE(buffers, m) {
+    // test_from_python / test_to_python:
+    class Matrix {
+    public:
+        Matrix(ssize_t rows, ssize_t cols) : m_rows(rows), m_cols(cols) {
+            print_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            m_data = new float[(size_t) (rows*cols)];
+            memset(m_data, 0, sizeof(float) * (size_t) (rows * cols));
+        }
+
+        Matrix(const Matrix &s) : m_rows(s.m_rows), m_cols(s.m_cols) {
+            print_copy_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            m_data = new float[(size_t) (m_rows * m_cols)];
+            memcpy(m_data, s.m_data, sizeof(float) * (size_t) (m_rows * m_cols));
+        }
+
+        Matrix(Matrix &&s) : m_rows(s.m_rows), m_cols(s.m_cols), m_data(s.m_data) {
+            print_move_created(this);
+            s.m_rows = 0;
+            s.m_cols = 0;
+            s.m_data = nullptr;
+        }
+
+        ~Matrix() {
+            print_destroyed(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            delete[] m_data;
+        }
+
+        Matrix &operator=(const Matrix &s) {
+            print_copy_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            delete[] m_data;
+            m_rows = s.m_rows;
+            m_cols = s.m_cols;
+            m_data = new float[(size_t) (m_rows * m_cols)];
+            memcpy(m_data, s.m_data, sizeof(float) * (size_t) (m_rows * m_cols));
+            return *this;
+        }
+
+        Matrix &operator=(Matrix &&s) {
+            print_move_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            if (&s != this) {
+                delete[] m_data;
+                m_rows = s.m_rows; m_cols = s.m_cols; m_data = s.m_data;
+                s.m_rows = 0; s.m_cols = 0; s.m_data = nullptr;
+            }
+            return *this;
+        }
+
+        float operator()(ssize_t i, ssize_t j) const {
+            return m_data[(size_t) (i*m_cols + j)];
+        }
+
+        float &operator()(ssize_t i, ssize_t j) {
+            return m_data[(size_t) (i*m_cols + j)];
+        }
+
+        float *data() { return m_data; }
+
+        ssize_t rows() const { return m_rows; }
+        ssize_t cols() const { return m_cols; }
+    private:
+        ssize_t m_rows;
+        ssize_t m_cols;
+        float *m_data;
+    };
+    py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
+        .def(py::init<ssize_t, ssize_t>())
+        /// Construct from a buffer
+        .def(py::init([](py::buffer const b) {
+            py::buffer_info info = b.request();
+            if (info.format != py::format_descriptor<float>::format() || info.ndim != 2)
+                throw std::runtime_error("Incompatible buffer format!");
+
+            auto v = new Matrix(info.shape[0], info.shape[1]);
+            memcpy(v->data(), info.ptr, sizeof(float) * (size_t) (v->rows() * v->cols()));
+            return v;
+        }))
+
+       .def("rows", &Matrix::rows)
+       .def("cols", &Matrix::cols)
+
+        /// Bare bones interface
+       .def("__getitem__", [](const Matrix &m, std::pair<ssize_t, ssize_t> i) {
+            if (i.first >= m.rows() || i.second >= m.cols())
+                throw py::index_error();
+            return m(i.first, i.second);
+        })
+       .def("__setitem__", [](Matrix &m, std::pair<ssize_t, ssize_t> i, float v) {
+            if (i.first >= m.rows() || i.second >= m.cols())
+                throw py::index_error();
+            m(i.first, i.second) = v;
+        })
+       /// Provide buffer access
+       .def_buffer([](Matrix &m) -> py::buffer_info {
+            return py::buffer_info(
+                m.data(),                               /* Pointer to buffer */
+                { m.rows(), m.cols() },                 /* Buffer dimensions */
+                { sizeof(float) * size_t(m.cols()),     /* Strides (in bytes) for each index */
+                  sizeof(float) }
+            );
+        })
+        ;
+
+
+    // test_inherited_protocol
+    class SquareMatrix : public Matrix {
+    public:
+        SquareMatrix(ssize_t n) : Matrix(n, n) { }
+    };
+    // Derived classes inherit the buffer protocol and the buffer access function
+    py::class_<SquareMatrix, Matrix>(m, "SquareMatrix")
+        .def(py::init<ssize_t>());
+
+
+    // test_pointer_to_member_fn
+    // Tests that passing a pointer to member to the base class works in
+    // the derived class.
+    struct Buffer {
+        int32_t value = 0;
+
+        py::buffer_info get_buffer_info() {
+            return py::buffer_info(&value, sizeof(value),
+                                   py::format_descriptor<int32_t>::format(), 1);
+        }
+    };
+    py::class_<Buffer>(m, "Buffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_readwrite("value", &Buffer::value)
+        .def_buffer(&Buffer::get_buffer_info);
+
+
+    class ConstBuffer {
+        std::unique_ptr<int32_t> value;
+
+    public:
+        int32_t get_value() const { return *value; }
+        void set_value(int32_t v) { *value = v; }
+
+        py::buffer_info get_buffer_info() const {
+            return py::buffer_info(value.get(), sizeof(*value),
+                                   py::format_descriptor<int32_t>::format(), 1);
+        }
+
+        ConstBuffer() : value(new int32_t{0}) { };
+    };
+    py::class_<ConstBuffer>(m, "ConstBuffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_property("value", &ConstBuffer::get_value, &ConstBuffer::set_value)
+        .def_buffer(&ConstBuffer::get_buffer_info);
+
+    struct DerivedBuffer : public Buffer { };
+    py::class_<DerivedBuffer>(m, "DerivedBuffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_readwrite("value", (int32_t DerivedBuffer::*) &DerivedBuffer::value)
+        .def_buffer(&DerivedBuffer::get_buffer_info);
+
+    struct BufferReadOnly {
+        const uint8_t value = 0;
+        BufferReadOnly(uint8_t value): value(value) {}
+
+        py::buffer_info get_buffer_info() {
+            return py::buffer_info(&value, 1);
+        }
+    };
+    py::class_<BufferReadOnly>(m, "BufferReadOnly", py::buffer_protocol())
+        .def(py::init<uint8_t>())
+        .def_buffer(&BufferReadOnly::get_buffer_info);
+
+    struct BufferReadOnlySelect {
+        uint8_t value = 0;
+        bool readonly = false;
+
+        py::buffer_info get_buffer_info() {
+            return py::buffer_info(&value, 1, readonly);
+        }
+    };
+    py::class_<BufferReadOnlySelect>(m, "BufferReadOnlySelect", py::buffer_protocol())
+        .def(py::init<>())
+        .def_readwrite("value", &BufferReadOnlySelect::value)
+        .def_readwrite("readonly", &BufferReadOnlySelect::readonly)
+        .def_buffer(&BufferReadOnlySelect::get_buffer_info);
+
+}
diff --git a/pybind11/tests/test_buffers.py b/pybind11/tests/test_buffers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6adaf1f5eee00f93e2b0ba7e3838c1107297080
--- /dev/null
+++ b/pybind11/tests/test_buffers.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+import io
+import struct
+
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import buffers as m
+from pybind11_tests import ConstructorStats
+
+np = pytest.importorskip("numpy")
+
+
+def test_from_python():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.Matrix(np.array([1, 2, 3]))  # trying to assign a 1D array
+    assert str(excinfo.value) == "Incompatible buffer format!"
+
+    m3 = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
+    m4 = m.Matrix(m3)
+
+    for i in range(m4.rows()):
+        for j in range(m4.cols()):
+            assert m3[i, j] == m4[i, j]
+
+    cstats = ConstructorStats.get(m.Matrix)
+    assert cstats.alive() == 1
+    del m3, m4
+    assert cstats.alive() == 0
+    assert cstats.values() == ["2x3 matrix"]
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0  # Don't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2444
+def test_to_python():
+    mat = m.Matrix(5, 4)
+    assert memoryview(mat).shape == (5, 4)
+
+    assert mat[2, 3] == 0
+    mat[2, 3] = 4.0
+    mat[3, 2] = 7.0
+    assert mat[2, 3] == 4
+    assert mat[3, 2] == 7
+    assert struct.unpack_from('f', mat, (3 * 4 + 2) * 4) == (7, )
+    assert struct.unpack_from('f', mat, (2 * 4 + 3) * 4) == (4, )
+
+    mat2 = np.array(mat, copy=False)
+    assert mat2.shape == (5, 4)
+    assert abs(mat2).sum() == 11
+    assert mat2[2, 3] == 4 and mat2[3, 2] == 7
+    mat2[2, 3] = 5
+    assert mat2[2, 3] == 5
+
+    cstats = ConstructorStats.get(m.Matrix)
+    assert cstats.alive() == 1
+    del mat
+    pytest.gc_collect()
+    assert cstats.alive() == 1
+    del mat2  # holds a mat reference
+    pytest.gc_collect()
+    assert cstats.alive() == 0
+    assert cstats.values() == ["5x4 matrix"]
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0  # Don't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+
+def test_inherited_protocol():
+    """SquareMatrix is derived from Matrix and inherits the buffer protocol"""
+
+    matrix = m.SquareMatrix(5)
+    assert memoryview(matrix).shape == (5, 5)
+    assert np.asarray(matrix).shape == (5, 5)
+
+
+def test_pointer_to_member_fn():
+    for cls in [m.Buffer, m.ConstBuffer, m.DerivedBuffer]:
+        buf = cls()
+        buf.value = 0x12345678
+        value = struct.unpack('i', bytearray(buf))[0]
+        assert value == 0x12345678
+
+
+def test_readonly_buffer():
+    buf = m.BufferReadOnly(0x64)
+    view = memoryview(buf)
+    assert view[0] == b'd' if env.PY2 else 0x64
+    assert view.readonly
+
+
+def test_selective_readonly_buffer():
+    buf = m.BufferReadOnlySelect()
+
+    memoryview(buf)[0] = b'd' if env.PY2 else 0x64
+    assert buf.value == 0x64
+
+    io.BytesIO(b'A').readinto(buf)
+    assert buf.value == ord(b'A')
+
+    buf.readonly = True
+    with pytest.raises(TypeError):
+        memoryview(buf)[0] = b'\0' if env.PY2 else 0
+    with pytest.raises(TypeError):
+        io.BytesIO(b'1').readinto(buf)
diff --git a/pybind11/tests/test_builtin_casters.cpp b/pybind11/tests/test_builtin_casters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..acc9f8fb368899cde8b25702d4410c0d591fb5ee
--- /dev/null
+++ b/pybind11/tests/test_builtin_casters.cpp
@@ -0,0 +1,192 @@
+/*
+    tests/test_builtin_casters.cpp -- Casters available without any additional headers
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/complex.h>
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+TEST_SUBMODULE(builtin_casters, m) {
+    // test_simple_string
+    m.def("string_roundtrip", [](const char *s) { return s; });
+
+    // test_unicode_conversion
+    // Some test characters in utf16 and utf32 encodings.  The last one (the 𝐀) contains a null byte
+    char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /*‽*/, cake32 = 0x1f382 /*🎂*/,              mathbfA32 = 0x1d400 /*𝐀*/;
+    char16_t b16 = 0x62 /*b*/, z16 = 0x7a,       ib16 = 0x203d,       cake16_1 = 0xd83c, cake16_2 = 0xdf82, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00;
+    std::wstring wstr;
+    wstr.push_back(0x61); // a
+    wstr.push_back(0x2e18); // ⸘
+    if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
+    else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
+    wstr.push_back(0x7a); // z
+
+    m.def("good_utf8_string", []() { return std::string((const char*)u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
+    m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
+    m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
+    m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z
+    m.def("bad_utf8_string", []()  { return std::string("abc\xd0" "def"); });
+    m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
+    // Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError
+    if (PY_MAJOR_VERSION >= 3)
+        m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
+    if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
+        m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
+    m.def("u8_Z", []() -> char { return 'Z'; });
+    m.def("u8_eacute", []() -> char { return '\xe9'; });
+    m.def("u16_ibang", [=]() -> char16_t { return ib16; });
+    m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
+    m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
+
+    // test_single_char_arguments
+    m.attr("wchar_size") = py::cast(sizeof(wchar_t));
+    m.def("ord_char", [](char c) -> int { return static_cast<unsigned char>(c); });
+    m.def("ord_char_lv", [](char &c) -> int { return static_cast<unsigned char>(c); });
+    m.def("ord_char16", [](char16_t c) -> uint16_t { return c; });
+    m.def("ord_char16_lv", [](char16_t &c) -> uint16_t { return c; });
+    m.def("ord_char32", [](char32_t c) -> uint32_t { return c; });
+    m.def("ord_wchar", [](wchar_t c) -> int { return c; });
+
+    // test_bytes_to_string
+    m.def("strlen", [](char *s) { return strlen(s); });
+    m.def("string_length", [](std::string s) { return s.length(); });
+
+#ifdef PYBIND11_HAS_U8STRING
+    m.attr("has_u8string") = true;
+    m.def("good_utf8_u8string", []() { return std::u8string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
+    m.def("bad_utf8_u8string", []()  { return std::u8string((const char8_t*)"abc\xd0" "def"); });
+
+    m.def("u8_char8_Z", []() -> char8_t { return u8'Z'; });
+
+    // test_single_char_arguments
+    m.def("ord_char8", [](char8_t c) -> int { return static_cast<unsigned char>(c); });
+    m.def("ord_char8_lv", [](char8_t &c) -> int { return static_cast<unsigned char>(c); });
+#endif
+
+    // test_string_view
+#ifdef PYBIND11_HAS_STRING_VIEW
+    m.attr("has_string_view") = true;
+    m.def("string_view_print",   [](std::string_view s)    { py::print(s, s.size()); });
+    m.def("string_view16_print", [](std::u16string_view s) { py::print(s, s.size()); });
+    m.def("string_view32_print", [](std::u32string_view s) { py::print(s, s.size()); });
+    m.def("string_view_chars",   [](std::string_view s)    { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
+    m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
+    m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
+    m.def("string_view_return",   []() { return std::string_view((const char*)u8"utf8 secret \U0001f382"); });
+    m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
+    m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
+
+#   ifdef PYBIND11_HAS_U8STRING
+    m.def("string_view8_print",  [](std::u8string_view s) { py::print(s, s.size()); });
+    m.def("string_view8_chars",  [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
+    m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
+#   endif
+#endif
+
+    // test_integer_casting
+    m.def("i32_str", [](std::int32_t v) { return std::to_string(v); });
+    m.def("u32_str", [](std::uint32_t v) { return std::to_string(v); });
+    m.def("i64_str", [](std::int64_t v) { return std::to_string(v); });
+    m.def("u64_str", [](std::uint64_t v) { return std::to_string(v); });
+
+    // test_tuple
+    m.def("pair_passthrough", [](std::pair<bool, std::string> input) {
+        return std::make_pair(input.second, input.first);
+    }, "Return a pair in reversed order");
+    m.def("tuple_passthrough", [](std::tuple<bool, std::string, int> input) {
+        return std::make_tuple(std::get<2>(input), std::get<1>(input), std::get<0>(input));
+    }, "Return a triple in reversed order");
+    m.def("empty_tuple", []() { return std::tuple<>(); });
+    static std::pair<RValueCaster, RValueCaster> lvpair;
+    static std::tuple<RValueCaster, RValueCaster, RValueCaster> lvtuple;
+    static std::pair<RValueCaster, std::tuple<RValueCaster, std::pair<RValueCaster, RValueCaster>>> lvnested;
+    m.def("rvalue_pair", []() { return std::make_pair(RValueCaster{}, RValueCaster{}); });
+    m.def("lvalue_pair", []() -> const decltype(lvpair) & { return lvpair; });
+    m.def("rvalue_tuple", []() { return std::make_tuple(RValueCaster{}, RValueCaster{}, RValueCaster{}); });
+    m.def("lvalue_tuple", []() -> const decltype(lvtuple) & { return lvtuple; });
+    m.def("rvalue_nested", []() {
+        return std::make_pair(RValueCaster{}, std::make_tuple(RValueCaster{}, std::make_pair(RValueCaster{}, RValueCaster{}))); });
+    m.def("lvalue_nested", []() -> const decltype(lvnested) & { return lvnested; });
+
+    static std::pair<int, std::string> int_string_pair{2, "items"};
+    m.def("int_string_pair", []() { return &int_string_pair; });
+
+    // test_builtins_cast_return_none
+    m.def("return_none_string", []() -> std::string * { return nullptr; });
+    m.def("return_none_char",   []() -> const char *  { return nullptr; });
+    m.def("return_none_bool",   []() -> bool *        { return nullptr; });
+    m.def("return_none_int",    []() -> int *         { return nullptr; });
+    m.def("return_none_float",  []() -> float *       { return nullptr; });
+    m.def("return_none_pair",   []() -> std::pair<int,int> * { return nullptr; });
+
+    // test_none_deferred
+    m.def("defer_none_cstring", [](char *) { return false; });
+    m.def("defer_none_cstring", [](py::none) { return true; });
+    m.def("defer_none_custom", [](UserType *) { return false; });
+    m.def("defer_none_custom", [](py::none) { return true; });
+    m.def("nodefer_none_void", [](void *) { return true; });
+    m.def("nodefer_none_void", [](py::none) { return false; });
+
+    // test_void_caster
+    m.def("load_nullptr_t", [](std::nullptr_t) {}); // not useful, but it should still compile
+    m.def("cast_nullptr_t", []() { return std::nullptr_t{}; });
+
+    // test_bool_caster
+    m.def("bool_passthrough", [](bool arg) { return arg; });
+    m.def("bool_passthrough_noconvert", [](bool arg) { return arg; }, py::arg().noconvert());
+
+    // test_reference_wrapper
+    m.def("refwrap_builtin", [](std::reference_wrapper<int> p) { return 10 * p.get(); });
+    m.def("refwrap_usertype", [](std::reference_wrapper<UserType> p) { return p.get().value(); });
+    // Not currently supported (std::pair caster has return-by-value cast operator);
+    // triggers static_assert failure.
+    //m.def("refwrap_pair", [](std::reference_wrapper<std::pair<int, int>>) { });
+
+    m.def("refwrap_list", [](bool copy) {
+        static IncType x1(1), x2(2);
+        py::list l;
+        for (auto &f : {std::ref(x1), std::ref(x2)}) {
+            l.append(py::cast(f, copy ? py::return_value_policy::copy
+                                      : py::return_value_policy::reference));
+        }
+        return l;
+    }, "copy"_a);
+
+    m.def("refwrap_iiw", [](const IncType &w) { return w.value(); });
+    m.def("refwrap_call_iiw", [](IncType &w, py::function f) {
+        py::list l;
+        l.append(f(std::ref(w)));
+        l.append(f(std::cref(w)));
+        IncType x(w.value());
+        l.append(f(std::ref(x)));
+        IncType y(w.value());
+        auto r3 = std::ref(y);
+        l.append(f(r3));
+        return l;
+    });
+
+    // test_complex
+    m.def("complex_cast", [](float x) { return "{}"_s.format(x); });
+    m.def("complex_cast", [](std::complex<float> x) { return "({}, {})"_s.format(x.real(), x.imag()); });
+
+    // test int vs. long (Python 2)
+    m.def("int_cast", []() {return (int) 42;});
+    m.def("long_cast", []() {return (long) 42;});
+    m.def("longlong_cast", []() {return  ULLONG_MAX;});
+
+    /// test void* cast operator
+    m.def("test_void_caster", []() -> bool {
+        void *v = (void *) 0xabcd;
+        py::object o = py::cast(v);
+        return py::cast<void *>(o) == v;
+    });
+}
diff --git a/pybind11/tests/test_builtin_casters.py b/pybind11/tests/test_builtin_casters.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d38bc1546f194021f6f47360e5a544a4267437
--- /dev/null
+++ b/pybind11/tests/test_builtin_casters.py
@@ -0,0 +1,392 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import builtin_casters as m
+from pybind11_tests import UserType, IncType
+
+
+def test_simple_string():
+    assert m.string_roundtrip("const char *") == "const char *"
+
+
+def test_unicode_conversion():
+    """Tests unicode conversion and error reporting."""
+    assert m.good_utf8_string() == u"Say utf8‽ 🎂 𝐀"
+    assert m.good_utf16_string() == u"b‽🎂𝐀z"
+    assert m.good_utf32_string() == u"a𝐀🎂‽z"
+    assert m.good_wchar_string() == u"a⸘𝐀z"
+    if hasattr(m, "has_u8string"):
+        assert m.good_utf8_u8string() == u"Say utf8‽ 🎂 𝐀"
+
+    with pytest.raises(UnicodeDecodeError):
+        m.bad_utf8_string()
+
+    with pytest.raises(UnicodeDecodeError):
+        m.bad_utf16_string()
+
+    # These are provided only if they actually fail (they don't when 32-bit and under Python 2.7)
+    if hasattr(m, "bad_utf32_string"):
+        with pytest.raises(UnicodeDecodeError):
+            m.bad_utf32_string()
+    if hasattr(m, "bad_wchar_string"):
+        with pytest.raises(UnicodeDecodeError):
+            m.bad_wchar_string()
+    if hasattr(m, "has_u8string"):
+        with pytest.raises(UnicodeDecodeError):
+            m.bad_utf8_u8string()
+
+    assert m.u8_Z() == 'Z'
+    assert m.u8_eacute() == u'é'
+    assert m.u16_ibang() == u'‽'
+    assert m.u32_mathbfA() == u'𝐀'
+    assert m.wchar_heart() == u'♥'
+    if hasattr(m, "has_u8string"):
+        assert m.u8_char8_Z() == 'Z'
+
+
+def test_single_char_arguments():
+    """Tests failures for passing invalid inputs to char-accepting functions"""
+    def toobig_message(r):
+        return "Character code point not in range({0:#x})".format(r)
+    toolong_message = "Expected a character, but multi-character string found"
+
+    assert m.ord_char(u'a') == 0x61  # simple ASCII
+    assert m.ord_char_lv(u'b') == 0x62
+    assert m.ord_char(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
+    assert str(excinfo.value) == toobig_message(0x100)
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char(u'ab')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_char16(u'a') == 0x61
+    assert m.ord_char16(u'é') == 0xE9
+    assert m.ord_char16_lv(u'ê') == 0xEA
+    assert m.ord_char16(u'Ā') == 0x100
+    assert m.ord_char16(u'‽') == 0x203d
+    assert m.ord_char16(u'♥') == 0x2665
+    assert m.ord_char16_lv(u'♡') == 0x2661
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char16(u'🎂') == 0x1F382  # requires surrogate pair
+    assert str(excinfo.value) == toobig_message(0x10000)
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char16(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_char32(u'a') == 0x61
+    assert m.ord_char32(u'é') == 0xE9
+    assert m.ord_char32(u'Ā') == 0x100
+    assert m.ord_char32(u'‽') == 0x203d
+    assert m.ord_char32(u'♥') == 0x2665
+    assert m.ord_char32(u'🎂') == 0x1F382
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char32(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_wchar(u'a') == 0x61
+    assert m.ord_wchar(u'é') == 0xE9
+    assert m.ord_wchar(u'Ā') == 0x100
+    assert m.ord_wchar(u'‽') == 0x203d
+    assert m.ord_wchar(u'♥') == 0x2665
+    if m.wchar_size == 2:
+        with pytest.raises(ValueError) as excinfo:
+            assert m.ord_wchar(u'🎂') == 0x1F382  # requires surrogate pair
+        assert str(excinfo.value) == toobig_message(0x10000)
+    else:
+        assert m.ord_wchar(u'🎂') == 0x1F382
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_wchar(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+    if hasattr(m, "has_u8string"):
+        assert m.ord_char8(u'a') == 0x61  # simple ASCII
+        assert m.ord_char8_lv(u'b') == 0x62
+        assert m.ord_char8(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
+        with pytest.raises(ValueError) as excinfo:
+            assert m.ord_char8(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
+        assert str(excinfo.value) == toobig_message(0x100)
+        with pytest.raises(ValueError) as excinfo:
+            assert m.ord_char8(u'ab')
+        assert str(excinfo.value) == toolong_message
+
+
+def test_bytes_to_string():
+    """Tests the ability to pass bytes to C++ string-accepting functions.  Note that this is
+    one-way: the only way to return bytes to Python is via the pybind11::bytes class."""
+    # Issue #816
+
+    def to_bytes(s):
+        b = s if env.PY2 else s.encode("utf8")
+        assert isinstance(b, bytes)
+        return b
+
+    assert m.strlen(to_bytes("hi")) == 2
+    assert m.string_length(to_bytes("world")) == 5
+    assert m.string_length(to_bytes("a\x00b")) == 3
+    assert m.strlen(to_bytes("a\x00b")) == 1  # C-string limitation
+
+    # passing in a utf8 encoded string should work
+    assert m.string_length(u'💩'.encode("utf8")) == 4
+
+
+@pytest.mark.skipif(not hasattr(m, "has_string_view"), reason="no <string_view>")
+def test_string_view(capture):
+    """Tests support for C++17 string_view arguments and return values"""
+    assert m.string_view_chars("Hi") == [72, 105]
+    assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
+    assert m.string_view16_chars(u"Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
+    assert m.string_view32_chars(u"Hi 🎂") == [72, 105, 32, 127874]
+    if hasattr(m, "has_u8string"):
+        assert m.string_view8_chars("Hi") == [72, 105]
+        assert m.string_view8_chars(u"Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
+
+    assert m.string_view_return() == u"utf8 secret 🎂"
+    assert m.string_view16_return() == u"utf16 secret 🎂"
+    assert m.string_view32_return() == u"utf32 secret 🎂"
+    if hasattr(m, "has_u8string"):
+        assert m.string_view8_return() == u"utf8 secret 🎂"
+
+    with capture:
+        m.string_view_print("Hi")
+        m.string_view_print("utf8 🎂")
+        m.string_view16_print(u"utf16 🎂")
+        m.string_view32_print(u"utf32 🎂")
+    assert capture == u"""
+        Hi 2
+        utf8 🎂 9
+        utf16 🎂 8
+        utf32 🎂 7
+    """
+    if hasattr(m, "has_u8string"):
+        with capture:
+            m.string_view8_print("Hi")
+            m.string_view8_print(u"utf8 🎂")
+        assert capture == u"""
+            Hi 2
+            utf8 🎂 9
+        """
+
+    with capture:
+        m.string_view_print("Hi, ascii")
+        m.string_view_print("Hi, utf8 🎂")
+        m.string_view16_print(u"Hi, utf16 🎂")
+        m.string_view32_print(u"Hi, utf32 🎂")
+    assert capture == u"""
+        Hi, ascii 9
+        Hi, utf8 🎂 13
+        Hi, utf16 🎂 12
+        Hi, utf32 🎂 11
+    """
+    if hasattr(m, "has_u8string"):
+        with capture:
+            m.string_view8_print("Hi, ascii")
+            m.string_view8_print(u"Hi, utf8 🎂")
+        assert capture == u"""
+            Hi, ascii 9
+            Hi, utf8 🎂 13
+        """
+
+
+def test_integer_casting():
+    """Issue #929 - out-of-range integer values shouldn't be accepted"""
+    assert m.i32_str(-1) == "-1"
+    assert m.i64_str(-1) == "-1"
+    assert m.i32_str(2000000000) == "2000000000"
+    assert m.u32_str(2000000000) == "2000000000"
+    if env.PY2:
+        assert m.i32_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
+        assert m.i64_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
+        assert m.i64_str(long(-999999999999)) == "-999999999999"  # noqa: F821 undefined name
+        assert m.u64_str(long(999999999999)) == "999999999999"  # noqa: F821 undefined name 'long'
+    else:
+        assert m.i64_str(-999999999999) == "-999999999999"
+        assert m.u64_str(999999999999) == "999999999999"
+
+    with pytest.raises(TypeError) as excinfo:
+        m.u32_str(-1)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.u64_str(-1)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.i32_str(-3000000000)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.i32_str(3000000000)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    if env.PY2:
+        with pytest.raises(TypeError) as excinfo:
+            m.u32_str(long(-1))  # noqa: F821 undefined name 'long'
+        assert "incompatible function arguments" in str(excinfo.value)
+        with pytest.raises(TypeError) as excinfo:
+            m.u64_str(long(-1))  # noqa: F821 undefined name 'long'
+        assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_tuple(doc):
+    """std::pair <-> tuple & std::tuple <-> tuple"""
+    assert m.pair_passthrough((True, "test")) == ("test", True)
+    assert m.tuple_passthrough((True, "test", 5)) == (5, "test", True)
+    # Any sequence can be cast to a std::pair or std::tuple
+    assert m.pair_passthrough([True, "test"]) == ("test", True)
+    assert m.tuple_passthrough([True, "test", 5]) == (5, "test", True)
+    assert m.empty_tuple() == ()
+
+    assert doc(m.pair_passthrough) == """
+        pair_passthrough(arg0: Tuple[bool, str]) -> Tuple[str, bool]
+
+        Return a pair in reversed order
+    """
+    assert doc(m.tuple_passthrough) == """
+        tuple_passthrough(arg0: Tuple[bool, str, int]) -> Tuple[int, str, bool]
+
+        Return a triple in reversed order
+    """
+
+    assert m.rvalue_pair() == ("rvalue", "rvalue")
+    assert m.lvalue_pair() == ("lvalue", "lvalue")
+    assert m.rvalue_tuple() == ("rvalue", "rvalue", "rvalue")
+    assert m.lvalue_tuple() == ("lvalue", "lvalue", "lvalue")
+    assert m.rvalue_nested() == ("rvalue", ("rvalue", ("rvalue", "rvalue")))
+    assert m.lvalue_nested() == ("lvalue", ("lvalue", ("lvalue", "lvalue")))
+
+    assert m.int_string_pair() == (2, "items")
+
+
+def test_builtins_cast_return_none():
+    """Casters produced with PYBIND11_TYPE_CASTER() should convert nullptr to None"""
+    assert m.return_none_string() is None
+    assert m.return_none_char() is None
+    assert m.return_none_bool() is None
+    assert m.return_none_int() is None
+    assert m.return_none_float() is None
+    assert m.return_none_pair() is None
+
+
+def test_none_deferred():
+    """None passed as various argument types should defer to other overloads"""
+    assert not m.defer_none_cstring("abc")
+    assert m.defer_none_cstring(None)
+    assert not m.defer_none_custom(UserType())
+    assert m.defer_none_custom(None)
+    assert m.nodefer_none_void(None)
+
+
+def test_void_caster():
+    assert m.load_nullptr_t(None) is None
+    assert m.cast_nullptr_t() is None
+
+
+def test_reference_wrapper():
+    """std::reference_wrapper for builtin and user types"""
+    assert m.refwrap_builtin(42) == 420
+    assert m.refwrap_usertype(UserType(42)) == 42
+
+    with pytest.raises(TypeError) as excinfo:
+        m.refwrap_builtin(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.refwrap_usertype(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    a1 = m.refwrap_list(copy=True)
+    a2 = m.refwrap_list(copy=True)
+    assert [x.value for x in a1] == [2, 3]
+    assert [x.value for x in a2] == [2, 3]
+    assert not a1[0] is a2[0] and not a1[1] is a2[1]
+
+    b1 = m.refwrap_list(copy=False)
+    b2 = m.refwrap_list(copy=False)
+    assert [x.value for x in b1] == [1, 2]
+    assert [x.value for x in b2] == [1, 2]
+    assert b1[0] is b2[0] and b1[1] is b2[1]
+
+    assert m.refwrap_iiw(IncType(5)) == 5
+    assert m.refwrap_call_iiw(IncType(10), m.refwrap_iiw) == [10, 10, 10, 10]
+
+
+def test_complex_cast():
+    """std::complex casts"""
+    assert m.complex_cast(1) == "1.0"
+    assert m.complex_cast(2j) == "(0.0, 2.0)"
+
+
+def test_bool_caster():
+    """Test bool caster implicit conversions."""
+    convert, noconvert = m.bool_passthrough, m.bool_passthrough_noconvert
+
+    def require_implicit(v):
+        pytest.raises(TypeError, noconvert, v)
+
+    def cant_convert(v):
+        pytest.raises(TypeError, convert, v)
+
+    # straight up bool
+    assert convert(True) is True
+    assert convert(False) is False
+    assert noconvert(True) is True
+    assert noconvert(False) is False
+
+    # None requires implicit conversion
+    require_implicit(None)
+    assert convert(None) is False
+
+    class A(object):
+        def __init__(self, x):
+            self.x = x
+
+        def __nonzero__(self):
+            return self.x
+
+        def __bool__(self):
+            return self.x
+
+    class B(object):
+        pass
+
+    # Arbitrary objects are not accepted
+    cant_convert(object())
+    cant_convert(B())
+
+    # Objects with __nonzero__ / __bool__ defined can be converted
+    require_implicit(A(True))
+    assert convert(A(True)) is True
+    assert convert(A(False)) is False
+
+
+def test_numpy_bool():
+    np = pytest.importorskip("numpy")
+
+    convert, noconvert = m.bool_passthrough, m.bool_passthrough_noconvert
+
+    def cant_convert(v):
+        pytest.raises(TypeError, convert, v)
+
+    # np.bool_ is not considered implicit
+    assert convert(np.bool_(True)) is True
+    assert convert(np.bool_(False)) is False
+    assert noconvert(np.bool_(True)) is True
+    assert noconvert(np.bool_(False)) is False
+    cant_convert(np.zeros(2, dtype='int'))
+
+
+def test_int_long():
+    """In Python 2, a C++ int should return a Python int rather than long
+    if possible: longs are not always accepted where ints are used (such
+    as the argument to sys.exit()). A C++ long long is always a Python
+    long."""
+
+    import sys
+    must_be_long = type(getattr(sys, 'maxint', 1) + 1)
+    assert isinstance(m.int_cast(), int)
+    assert isinstance(m.long_cast(), int)
+    assert isinstance(m.longlong_cast(), must_be_long)
+
+
+def test_void_caster_2():
+    assert m.test_void_caster()
diff --git a/pybind11/tests/test_call_policies.cpp b/pybind11/tests/test_call_policies.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..26c83f81b0ed370365d48279a4b8f3d4d23b5487
--- /dev/null
+++ b/pybind11/tests/test_call_policies.cpp
@@ -0,0 +1,101 @@
+/*
+    tests/test_call_policies.cpp -- keep_alive and call_guard
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+struct CustomGuard {
+    static bool enabled;
+
+    CustomGuard() { enabled = true; }
+    ~CustomGuard() { enabled = false; }
+
+    static const char *report_status() { return enabled ? "guarded" : "unguarded"; }
+};
+bool CustomGuard::enabled = false;
+
+struct DependentGuard {
+    static bool enabled;
+
+    DependentGuard() { enabled = CustomGuard::enabled; }
+    ~DependentGuard() { enabled = false; }
+
+    static const char *report_status() { return enabled ? "guarded" : "unguarded"; }
+};
+bool DependentGuard::enabled = false;
+
+TEST_SUBMODULE(call_policies, m) {
+    // Parent/Child are used in:
+    // test_keep_alive_argument, test_keep_alive_return_value, test_alive_gc_derived,
+    // test_alive_gc_multi_derived, test_return_none, test_keep_alive_constructor
+    class Child {
+    public:
+        Child() { py::print("Allocating child."); }
+        Child(const Child &) = default;
+        Child(Child &&) = default;
+        ~Child() { py::print("Releasing child."); }
+    };
+    py::class_<Child>(m, "Child")
+        .def(py::init<>());
+
+    class Parent {
+    public:
+        Parent() { py::print("Allocating parent."); }
+        Parent(const Parent& parent) = default;
+        ~Parent() { py::print("Releasing parent."); }
+        void addChild(Child *) { }
+        Child *returnChild() { return new Child(); }
+        Child *returnNullChild() { return nullptr; }
+    };
+    py::class_<Parent>(m, "Parent")
+        .def(py::init<>())
+        .def(py::init([](Child *) { return new Parent(); }), py::keep_alive<1, 2>())
+        .def("addChild", &Parent::addChild)
+        .def("addChildKeepAlive", &Parent::addChild, py::keep_alive<1, 2>())
+        .def("returnChild", &Parent::returnChild)
+        .def("returnChildKeepAlive", &Parent::returnChild, py::keep_alive<1, 0>())
+        .def("returnNullChildKeepAliveChild", &Parent::returnNullChild, py::keep_alive<1, 0>())
+        .def("returnNullChildKeepAliveParent", &Parent::returnNullChild, py::keep_alive<0, 1>());
+
+#if !defined(PYPY_VERSION)
+    // test_alive_gc
+    class ParentGC : public Parent {
+    public:
+        using Parent::Parent;
+    };
+    py::class_<ParentGC, Parent>(m, "ParentGC", py::dynamic_attr())
+        .def(py::init<>());
+#endif
+
+    // test_call_guard
+    m.def("unguarded_call", &CustomGuard::report_status);
+    m.def("guarded_call", &CustomGuard::report_status, py::call_guard<CustomGuard>());
+
+    m.def("multiple_guards_correct_order", []() {
+        return CustomGuard::report_status() + std::string(" & ") + DependentGuard::report_status();
+    }, py::call_guard<CustomGuard, DependentGuard>());
+
+    m.def("multiple_guards_wrong_order", []() {
+        return DependentGuard::report_status() + std::string(" & ") + CustomGuard::report_status();
+    }, py::call_guard<DependentGuard, CustomGuard>());
+
+#if defined(WITH_THREAD) && !defined(PYPY_VERSION)
+    // `py::call_guard<py::gil_scoped_release>()` should work in PyPy as well,
+    // but it's unclear how to test it without `PyGILState_GetThisThreadState`.
+    auto report_gil_status = []() {
+        auto is_gil_held = false;
+        if (auto tstate = py::detail::get_thread_state_unchecked())
+            is_gil_held = (tstate == PyGILState_GetThisThreadState());
+
+        return is_gil_held ? "GIL held" : "GIL released";
+    };
+
+    m.def("with_gil", report_gil_status);
+    m.def("without_gil", report_gil_status, py::call_guard<py::gil_scoped_release>());
+#endif
+}
diff --git a/pybind11/tests/test_call_policies.py b/pybind11/tests/test_call_policies.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec005c132f9c172fda1570073ada46342e38a2ea
--- /dev/null
+++ b/pybind11/tests/test_call_policies.py
@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import call_policies as m
+from pybind11_tests import ConstructorStats
+
+
+@pytest.mark.xfail("env.PYPY", reason="sometimes comes out 1 off on PyPy", strict=False)
+def test_keep_alive_argument(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.addChild(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == """
+        Allocating child.
+        Releasing child.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.addChildKeepAlive(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == "Allocating child."
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_keep_alive_return_value(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnChild()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == """
+        Allocating child.
+        Releasing child.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnChildKeepAlive()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == "Allocating child."
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2447
+@pytest.mark.xfail("env.PYPY", reason="_PyObject_GetDictPtr is unimplemented")
+def test_alive_gc(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = m.ParentGC()
+    p.addChildKeepAlive(m.Child())
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_alive_gc_derived(capture):
+    class Derived(m.Parent):
+        pass
+
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = Derived()
+    p.addChildKeepAlive(m.Child())
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_alive_gc_multi_derived(capture):
+    class Derived(m.Parent, m.Child):
+        def __init__(self):
+            m.Parent.__init__(self)
+            m.Child.__init__(self)
+
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = Derived()
+    p.addChildKeepAlive(m.Child())
+    # +3 rather than +2 because Derived corresponds to two registered instances
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+        Releasing child.
+    """
+
+
+def test_return_none(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnNullChildKeepAliveChild()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == ""
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnNullChildKeepAliveParent()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == ""
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+
+def test_keep_alive_constructor(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    with capture:
+        p = m.Parent(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == """
+        Allocating child.
+        Allocating parent.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_call_guard():
+    assert m.unguarded_call() == "unguarded"
+    assert m.guarded_call() == "guarded"
+
+    assert m.multiple_guards_correct_order() == "guarded & guarded"
+    assert m.multiple_guards_wrong_order() == "unguarded & guarded"
+
+    if hasattr(m, "with_gil"):
+        assert m.with_gil() == "GIL held"
+        assert m.without_gil() == "GIL released"
diff --git a/pybind11/tests/test_callbacks.cpp b/pybind11/tests/test_callbacks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..71b88c44c7650a7e7b3f37cee19359e15bbb0270
--- /dev/null
+++ b/pybind11/tests/test_callbacks.cpp
@@ -0,0 +1,168 @@
+/*
+    tests/test_callbacks.cpp -- callbacks
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/functional.h>
+#include <thread>
+
+
+int dummy_function(int i) { return i + 1; }
+
+TEST_SUBMODULE(callbacks, m) {
+    // test_callbacks, test_function_signatures
+    m.def("test_callback1", [](py::object func) { return func(); });
+    m.def("test_callback2", [](py::object func) { return func("Hello", 'x', true, 5); });
+    m.def("test_callback3", [](const std::function<int(int)> &func) {
+        return "func(43) = " + std::to_string(func(43)); });
+    m.def("test_callback4", []() -> std::function<int(int)> { return [](int i) { return i+1; }; });
+    m.def("test_callback5", []() {
+        return py::cpp_function([](int i) { return i+1; }, py::arg("number"));
+    });
+
+    // test_keyword_args_and_generalized_unpacking
+    m.def("test_tuple_unpacking", [](py::function f) {
+        auto t1 = py::make_tuple(2, 3);
+        auto t2 = py::make_tuple(5, 6);
+        return f("positional", 1, *t1, 4, *t2);
+    });
+
+    m.def("test_dict_unpacking", [](py::function f) {
+        auto d1 = py::dict("key"_a="value", "a"_a=1);
+        auto d2 = py::dict();
+        auto d3 = py::dict("b"_a=2);
+        return f("positional", 1, **d1, **d2, **d3);
+    });
+
+    m.def("test_keyword_args", [](py::function f) {
+        return f("x"_a=10, "y"_a=20);
+    });
+
+    m.def("test_unpacking_and_keywords1", [](py::function f) {
+        auto args = py::make_tuple(2);
+        auto kwargs = py::dict("d"_a=4);
+        return f(1, *args, "c"_a=3, **kwargs);
+    });
+
+    m.def("test_unpacking_and_keywords2", [](py::function f) {
+        auto kwargs1 = py::dict("a"_a=1);
+        auto kwargs2 = py::dict("c"_a=3, "d"_a=4);
+        return f("positional", *py::make_tuple(1), 2, *py::make_tuple(3, 4), 5,
+                 "key"_a="value", **kwargs1, "b"_a=2, **kwargs2, "e"_a=5);
+    });
+
+    m.def("test_unpacking_error1", [](py::function f) {
+        auto kwargs = py::dict("x"_a=3);
+        return f("x"_a=1, "y"_a=2, **kwargs); // duplicate ** after keyword
+    });
+
+    m.def("test_unpacking_error2", [](py::function f) {
+        auto kwargs = py::dict("x"_a=3);
+        return f(**kwargs, "x"_a=1); // duplicate keyword after **
+    });
+
+    m.def("test_arg_conversion_error1", [](py::function f) {
+        f(234, UnregisteredType(), "kw"_a=567);
+    });
+
+    m.def("test_arg_conversion_error2", [](py::function f) {
+        f(234, "expected_name"_a=UnregisteredType(), "kw"_a=567);
+    });
+
+    // test_lambda_closure_cleanup
+    struct Payload {
+        Payload() { print_default_created(this); }
+        ~Payload() { print_destroyed(this); }
+        Payload(const Payload &) { print_copy_created(this); }
+        Payload(Payload &&) { print_move_created(this); }
+    };
+    // Export the payload constructor statistics for testing purposes:
+    m.def("payload_cstats", &ConstructorStats::get<Payload>);
+    /* Test cleanup of lambda closure */
+    m.def("test_cleanup", []() -> std::function<void(void)> {
+        Payload p;
+
+        return [p]() {
+            /* p should be cleaned up when the returned function is garbage collected */
+            (void) p;
+        };
+    });
+
+    // test_cpp_function_roundtrip
+    /* Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer */
+    m.def("dummy_function", &dummy_function);
+    m.def("dummy_function2", [](int i, int j) { return i + j; });
+    m.def("roundtrip", [](std::function<int(int)> f, bool expect_none = false) {
+        if (expect_none && f)
+            throw std::runtime_error("Expected None to be converted to empty std::function");
+        return f;
+    }, py::arg("f"), py::arg("expect_none")=false);
+    m.def("test_dummy_function", [](const std::function<int(int)> &f) -> std::string {
+        using fn_type = int (*)(int);
+        auto result = f.target<fn_type>();
+        if (!result) {
+            auto r = f(1);
+            return "can't convert to function pointer: eval(1) = " + std::to_string(r);
+        } else if (*result == dummy_function) {
+            auto r = (*result)(1);
+            return "matches dummy_function: eval(1) = " + std::to_string(r);
+        } else {
+            return "argument does NOT match dummy_function. This should never happen!";
+        }
+    });
+
+    class AbstractBase { public: virtual unsigned int func() = 0; };
+    m.def("func_accepting_func_accepting_base", [](std::function<double(AbstractBase&)>) { });
+
+    struct MovableObject {
+        bool valid = true;
+
+        MovableObject() = default;
+        MovableObject(const MovableObject &) = default;
+        MovableObject &operator=(const MovableObject &) = default;
+        MovableObject(MovableObject &&o) : valid(o.valid) { o.valid = false; }
+        MovableObject &operator=(MovableObject &&o) {
+            valid = o.valid;
+            o.valid = false;
+            return *this;
+        }
+    };
+    py::class_<MovableObject>(m, "MovableObject");
+
+    // test_movable_object
+    m.def("callback_with_movable", [](std::function<void(MovableObject &)> f) {
+        auto x = MovableObject();
+        f(x); // lvalue reference shouldn't move out object
+        return x.valid; // must still return `true`
+    });
+
+    // test_bound_method_callback
+    struct CppBoundMethodTest {};
+    py::class_<CppBoundMethodTest>(m, "CppBoundMethodTest")
+        .def(py::init<>())
+        .def("triple", [](CppBoundMethodTest &, int val) { return 3 * val; });
+
+    // test async Python callbacks
+    using callback_f = std::function<void(int)>;
+    m.def("test_async_callback", [](callback_f f, py::list work) {
+        // make detached thread that calls `f` with piece of work after a little delay
+        auto start_f = [f](int j) {
+            auto invoke_f = [f, j] {
+                std::this_thread::sleep_for(std::chrono::milliseconds(50));
+                f(j);
+            };
+            auto t = std::thread(std::move(invoke_f));
+            t.detach();
+        };
+
+        // spawn worker threads
+        for (auto i : work)
+            start_f(py::cast<int>(i));
+    });
+}
diff --git a/pybind11/tests/test_callbacks.py b/pybind11/tests/test_callbacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d0e045d224aab7381549bdcfb1d2102cdd0eb7
--- /dev/null
+++ b/pybind11/tests/test_callbacks.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import callbacks as m
+from threading import Thread
+
+
+def test_callbacks():
+    from functools import partial
+
+    def func1():
+        return "func1"
+
+    def func2(a, b, c, d):
+        return "func2", a, b, c, d
+
+    def func3(a):
+        return "func3({})".format(a)
+
+    assert m.test_callback1(func1) == "func1"
+    assert m.test_callback2(func2) == ("func2", "Hello", "x", True, 5)
+    assert m.test_callback1(partial(func2, 1, 2, 3, 4)) == ("func2", 1, 2, 3, 4)
+    assert m.test_callback1(partial(func3, "partial")) == "func3(partial)"
+    assert m.test_callback3(lambda i: i + 1) == "func(43) = 44"
+
+    f = m.test_callback4()
+    assert f(43) == 44
+    f = m.test_callback5()
+    assert f(number=43) == 44
+
+
+def test_bound_method_callback():
+    # Bound Python method:
+    class MyClass:
+        def double(self, val):
+            return 2 * val
+
+    z = MyClass()
+    assert m.test_callback3(z.double) == "func(43) = 86"
+
+    z = m.CppBoundMethodTest()
+    assert m.test_callback3(z.triple) == "func(43) = 129"
+
+
+def test_keyword_args_and_generalized_unpacking():
+
+    def f(*args, **kwargs):
+        return args, kwargs
+
+    assert m.test_tuple_unpacking(f) == (("positional", 1, 2, 3, 4, 5, 6), {})
+    assert m.test_dict_unpacking(f) == (("positional", 1), {"key": "value", "a": 1, "b": 2})
+    assert m.test_keyword_args(f) == ((), {"x": 10, "y": 20})
+    assert m.test_unpacking_and_keywords1(f) == ((1, 2), {"c": 3, "d": 4})
+    assert m.test_unpacking_and_keywords2(f) == (
+        ("positional", 1, 2, 3, 4, 5),
+        {"key": "value", "a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
+    )
+
+    with pytest.raises(TypeError) as excinfo:
+        m.test_unpacking_error1(f)
+    assert "Got multiple values for keyword argument" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.test_unpacking_error2(f)
+    assert "Got multiple values for keyword argument" in str(excinfo.value)
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.test_arg_conversion_error1(f)
+    assert "Unable to convert call argument" in str(excinfo.value)
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.test_arg_conversion_error2(f)
+    assert "Unable to convert call argument" in str(excinfo.value)
+
+
+def test_lambda_closure_cleanup():
+    m.test_cleanup()
+    cstats = m.payload_cstats()
+    assert cstats.alive() == 0
+    assert cstats.copy_constructions == 1
+    assert cstats.move_constructions >= 1
+
+
+def test_cpp_function_roundtrip():
+    """Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer"""
+
+    assert m.test_dummy_function(m.dummy_function) == "matches dummy_function: eval(1) = 2"
+    assert (m.test_dummy_function(m.roundtrip(m.dummy_function)) ==
+            "matches dummy_function: eval(1) = 2")
+    assert m.roundtrip(None, expect_none=True) is None
+    assert (m.test_dummy_function(lambda x: x + 2) ==
+            "can't convert to function pointer: eval(1) = 3")
+
+    with pytest.raises(TypeError) as excinfo:
+        m.test_dummy_function(m.dummy_function2)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.test_dummy_function(lambda x, y: x + y)
+    assert any(s in str(excinfo.value) for s in ("missing 1 required positional argument",
+                                                 "takes exactly 2 arguments"))
+
+
+def test_function_signatures(doc):
+    assert doc(m.test_callback3) == "test_callback3(arg0: Callable[[int], int]) -> str"
+    assert doc(m.test_callback4) == "test_callback4() -> Callable[[int], int]"
+
+
+def test_movable_object():
+    assert m.callback_with_movable(lambda _: None) is True
+
+
+def test_async_callbacks():
+    # serves as state for async callback
+    class Item:
+        def __init__(self, value):
+            self.value = value
+
+    res = []
+
+    # generate stateful lambda that will store result in `res`
+    def gen_f():
+        s = Item(3)
+        return lambda j: res.append(s.value + j)
+
+    # do some work async
+    work = [1, 2, 3, 4]
+    m.test_async_callback(gen_f(), work)
+    # wait until work is done
+    from time import sleep
+    sleep(0.5)
+    assert sum(res) == sum([x + 3 for x in work])
+
+
+def test_async_async_callbacks():
+    t = Thread(target=test_async_callbacks)
+    t.start()
+    t.join()
diff --git a/pybind11/tests/test_chrono.cpp b/pybind11/tests/test_chrono.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d79d4b6ca96ecf1c2faa8bd8002a2eb38f39124
--- /dev/null
+++ b/pybind11/tests/test_chrono.cpp
@@ -0,0 +1,56 @@
+/*
+    tests/test_chrono.cpp -- test conversions to/from std::chrono types
+
+    Copyright (c) 2016 Trent Houliston <trent@houliston.me> and
+                       Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/chrono.h>
+#include <chrono>
+
+TEST_SUBMODULE(chrono, m) {
+    using system_time = std::chrono::system_clock::time_point;
+    using steady_time = std::chrono::steady_clock::time_point;
+
+    using timespan = std::chrono::duration<int64_t, std::nano>;
+    using timestamp = std::chrono::time_point<std::chrono::system_clock, timespan>;
+
+    // test_chrono_system_clock
+    // Return the current time off the wall clock
+    m.def("test_chrono1", []() { return std::chrono::system_clock::now(); });
+
+    // test_chrono_system_clock_roundtrip
+    // Round trip the passed in system clock time
+    m.def("test_chrono2", [](system_time t) { return t; });
+
+    // test_chrono_duration_roundtrip
+    // Round trip the passed in duration
+    m.def("test_chrono3", [](std::chrono::system_clock::duration d) { return d; });
+
+    // test_chrono_duration_subtraction_equivalence
+    // Difference between two passed in time_points
+    m.def("test_chrono4", [](system_time a, system_time b) { return a - b; });
+
+    // test_chrono_steady_clock
+    // Return the current time off the steady_clock
+    m.def("test_chrono5", []() { return std::chrono::steady_clock::now(); });
+
+    // test_chrono_steady_clock_roundtrip
+    // Round trip a steady clock timepoint
+    m.def("test_chrono6", [](steady_time t) { return t; });
+
+    // test_floating_point_duration
+    // Roundtrip a duration in microseconds from a float argument
+    m.def("test_chrono7", [](std::chrono::microseconds t) { return t; });
+    // Float durations (issue #719)
+    m.def("test_chrono_float_diff", [](std::chrono::duration<float> a, std::chrono::duration<float> b) {
+        return a - b; });
+
+    m.def("test_nano_timepoint", [](timestamp start, timespan delta) -> timestamp {
+        return start + delta;
+    });
+}
diff --git a/pybind11/tests/test_chrono.py b/pybind11/tests/test_chrono.py
new file mode 100644
index 0000000000000000000000000000000000000000..76783905a3bc9b60e5b58afdbdf592e88afb4f74
--- /dev/null
+++ b/pybind11/tests/test_chrono.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import chrono as m
+import datetime
+import pytest
+
+import env  # noqa: F401
+
+
+def test_chrono_system_clock():
+
+    # Get the time from both c++ and datetime
+    date0 = datetime.datetime.today()
+    date1 = m.test_chrono1()
+    date2 = datetime.datetime.today()
+
+    # The returned value should be a datetime
+    assert isinstance(date1, datetime.datetime)
+
+    # The numbers should vary by a very small amount (time it took to execute)
+    diff_python = abs(date2 - date0)
+    diff = abs(date1 - date2)
+
+    # There should never be a days difference
+    assert diff.days == 0
+
+    # Since datetime.datetime.today() calls time.time(), and on some platforms
+    # that has 1 second accuracy, we compare this way
+    assert diff.seconds <= diff_python.seconds
+
+
+def test_chrono_system_clock_roundtrip():
+    date1 = datetime.datetime.today()
+
+    # Roundtrip the time
+    date2 = m.test_chrono2(date1)
+
+    # The returned value should be a datetime
+    assert isinstance(date2, datetime.datetime)
+
+    # They should be identical (no information lost on roundtrip)
+    diff = abs(date1 - date2)
+    assert diff.days == 0
+    assert diff.seconds == 0
+    assert diff.microseconds == 0
+
+
+def test_chrono_system_clock_roundtrip_date():
+    date1 = datetime.date.today()
+
+    # Roundtrip the time
+    datetime2 = m.test_chrono2(date1)
+    date2 = datetime2.date()
+    time2 = datetime2.time()
+
+    # The returned value should be a datetime
+    assert isinstance(datetime2, datetime.datetime)
+    assert isinstance(date2, datetime.date)
+    assert isinstance(time2, datetime.time)
+
+    # They should be identical (no information lost on roundtrip)
+    diff = abs(date1 - date2)
+    assert diff.days == 0
+    assert diff.seconds == 0
+    assert diff.microseconds == 0
+
+    # Year, Month & Day should be the same after the round trip
+    assert date1.year == date2.year
+    assert date1.month == date2.month
+    assert date1.day == date2.day
+
+    # There should be no time information
+    assert time2.hour == 0
+    assert time2.minute == 0
+    assert time2.second == 0
+    assert time2.microsecond == 0
+
+
+SKIP_TZ_ENV_ON_WIN = pytest.mark.skipif(
+    "env.WIN", reason="TZ environment variable only supported on POSIX"
+)
+
+
+@pytest.mark.parametrize("time1", [
+    datetime.datetime.today().time(),
+    datetime.time(0, 0, 0),
+    datetime.time(0, 0, 0, 1),
+    datetime.time(0, 28, 45, 109827),
+    datetime.time(0, 59, 59, 999999),
+    datetime.time(1, 0, 0),
+    datetime.time(5, 59, 59, 0),
+    datetime.time(5, 59, 59, 1),
+])
+@pytest.mark.parametrize("tz", [
+    None,
+    pytest.param("Europe/Brussels", marks=SKIP_TZ_ENV_ON_WIN),
+    pytest.param("Asia/Pyongyang", marks=SKIP_TZ_ENV_ON_WIN),
+    pytest.param("America/New_York", marks=SKIP_TZ_ENV_ON_WIN),
+])
+def test_chrono_system_clock_roundtrip_time(time1, tz, monkeypatch):
+    if tz is not None:
+        monkeypatch.setenv("TZ", "/usr/share/zoneinfo/{}".format(tz))
+
+    # Roundtrip the time
+    datetime2 = m.test_chrono2(time1)
+    date2 = datetime2.date()
+    time2 = datetime2.time()
+
+    # The returned value should be a datetime
+    assert isinstance(datetime2, datetime.datetime)
+    assert isinstance(date2, datetime.date)
+    assert isinstance(time2, datetime.time)
+
+    # Hour, Minute, Second & Microsecond should be the same after the round trip
+    assert time1.hour == time2.hour
+    assert time1.minute == time2.minute
+    assert time1.second == time2.second
+    assert time1.microsecond == time2.microsecond
+
+    # There should be no date information (i.e. date = python base date)
+    assert date2.year == 1970
+    assert date2.month == 1
+    assert date2.day == 1
+
+
+def test_chrono_duration_roundtrip():
+
+    # Get the difference between two times (a timedelta)
+    date1 = datetime.datetime.today()
+    date2 = datetime.datetime.today()
+    diff = date2 - date1
+
+    # Make sure this is a timedelta
+    assert isinstance(diff, datetime.timedelta)
+
+    cpp_diff = m.test_chrono3(diff)
+
+    assert cpp_diff.days == diff.days
+    assert cpp_diff.seconds == diff.seconds
+    assert cpp_diff.microseconds == diff.microseconds
+
+
+def test_chrono_duration_subtraction_equivalence():
+
+    date1 = datetime.datetime.today()
+    date2 = datetime.datetime.today()
+
+    diff = date2 - date1
+    cpp_diff = m.test_chrono4(date2, date1)
+
+    assert cpp_diff.days == diff.days
+    assert cpp_diff.seconds == diff.seconds
+    assert cpp_diff.microseconds == diff.microseconds
+
+
+def test_chrono_duration_subtraction_equivalence_date():
+
+    date1 = datetime.date.today()
+    date2 = datetime.date.today()
+
+    diff = date2 - date1
+    cpp_diff = m.test_chrono4(date2, date1)
+
+    assert cpp_diff.days == diff.days
+    assert cpp_diff.seconds == diff.seconds
+    assert cpp_diff.microseconds == diff.microseconds
+
+
+def test_chrono_steady_clock():
+    time1 = m.test_chrono5()
+    assert isinstance(time1, datetime.timedelta)
+
+
+def test_chrono_steady_clock_roundtrip():
+    time1 = datetime.timedelta(days=10, seconds=10, microseconds=100)
+    time2 = m.test_chrono6(time1)
+
+    assert isinstance(time2, datetime.timedelta)
+
+    # They should be identical (no information lost on roundtrip)
+    assert time1.days == time2.days
+    assert time1.seconds == time2.seconds
+    assert time1.microseconds == time2.microseconds
+
+
+def test_floating_point_duration():
+    # Test using a floating point number in seconds
+    time = m.test_chrono7(35.525123)
+
+    assert isinstance(time, datetime.timedelta)
+
+    assert time.seconds == 35
+    assert 525122 <= time.microseconds <= 525123
+
+    diff = m.test_chrono_float_diff(43.789012, 1.123456)
+    assert diff.seconds == 42
+    assert 665556 <= diff.microseconds <= 665557
+
+
+def test_nano_timepoint():
+    time = datetime.datetime.now()
+    time1 = m.test_nano_timepoint(time, datetime.timedelta(seconds=60))
+    assert(time1 == time + datetime.timedelta(seconds=60))
diff --git a/pybind11/tests/test_class.cpp b/pybind11/tests/test_class.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5369cb064cc9fee76546529398787980f9c4c76e
--- /dev/null
+++ b/pybind11/tests/test_class.cpp
@@ -0,0 +1,449 @@
+/*
+    tests/test_class.cpp -- test py::class_ definitions and basic functionality
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include "local_bindings.h"
+#include <pybind11/stl.h>
+
+#if defined(_MSC_VER)
+#  pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier
+#endif
+
+// test_brace_initialization
+struct NoBraceInitialization {
+    NoBraceInitialization(std::vector<int> v) : vec{std::move(v)} {}
+    template <typename T>
+    NoBraceInitialization(std::initializer_list<T> l) : vec(l) {}
+
+    std::vector<int> vec;
+};
+
+TEST_SUBMODULE(class_, m) {
+    // test_instance
+    struct NoConstructor {
+        NoConstructor() = default;
+        NoConstructor(const NoConstructor &) = default;
+        NoConstructor(NoConstructor &&) = default;
+        static NoConstructor *new_instance() {
+            auto *ptr = new NoConstructor();
+            print_created(ptr, "via new_instance");
+            return ptr;
+        }
+        ~NoConstructor() { print_destroyed(this); }
+    };
+
+    py::class_<NoConstructor>(m, "NoConstructor")
+        .def_static("new_instance", &NoConstructor::new_instance, "Return an instance");
+
+    // test_inheritance
+    class Pet {
+    public:
+        Pet(const std::string &name, const std::string &species)
+            : m_name(name), m_species(species) {}
+        std::string name() const { return m_name; }
+        std::string species() const { return m_species; }
+    private:
+        std::string m_name;
+        std::string m_species;
+    };
+
+    class Dog : public Pet {
+    public:
+        Dog(const std::string &name) : Pet(name, "dog") {}
+        std::string bark() const { return "Woof!"; }
+    };
+
+    class Rabbit : public Pet {
+    public:
+        Rabbit(const std::string &name) : Pet(name, "parrot") {}
+    };
+
+    class Hamster : public Pet {
+    public:
+        Hamster(const std::string &name) : Pet(name, "rodent") {}
+    };
+
+    class Chimera : public Pet {
+        Chimera() : Pet("Kimmy", "chimera") {}
+    };
+
+    py::class_<Pet> pet_class(m, "Pet");
+    pet_class
+        .def(py::init<std::string, std::string>())
+        .def("name", &Pet::name)
+        .def("species", &Pet::species);
+
+    /* One way of declaring a subclass relationship: reference parent's class_ object */
+    py::class_<Dog>(m, "Dog", pet_class)
+        .def(py::init<std::string>());
+
+    /* Another way of declaring a subclass relationship: reference parent's C++ type */
+    py::class_<Rabbit, Pet>(m, "Rabbit")
+        .def(py::init<std::string>());
+
+    /* And another: list parent in class template arguments */
+    py::class_<Hamster, Pet>(m, "Hamster")
+        .def(py::init<std::string>());
+
+    /* Constructors are not inherited by default */
+    py::class_<Chimera, Pet>(m, "Chimera");
+
+    m.def("pet_name_species", [](const Pet &pet) { return pet.name() + " is a " + pet.species(); });
+    m.def("dog_bark", [](const Dog &dog) { return dog.bark(); });
+
+    // test_automatic_upcasting
+    struct BaseClass {
+        BaseClass() = default;
+        BaseClass(const BaseClass &) = default;
+        BaseClass(BaseClass &&) = default;
+        virtual ~BaseClass() {}
+    };
+    struct DerivedClass1 : BaseClass { };
+    struct DerivedClass2 : BaseClass { };
+
+    py::class_<BaseClass>(m, "BaseClass").def(py::init<>());
+    py::class_<DerivedClass1>(m, "DerivedClass1").def(py::init<>());
+    py::class_<DerivedClass2>(m, "DerivedClass2").def(py::init<>());
+
+    m.def("return_class_1", []() -> BaseClass* { return new DerivedClass1(); });
+    m.def("return_class_2", []() -> BaseClass* { return new DerivedClass2(); });
+    m.def("return_class_n", [](int n) -> BaseClass* {
+        if (n == 1) return new DerivedClass1();
+        if (n == 2) return new DerivedClass2();
+        return new BaseClass();
+    });
+    m.def("return_none", []() -> BaseClass* { return nullptr; });
+
+    // test_isinstance
+    m.def("check_instances", [](py::list l) {
+        return py::make_tuple(
+            py::isinstance<py::tuple>(l[0]),
+            py::isinstance<py::dict>(l[1]),
+            py::isinstance<Pet>(l[2]),
+            py::isinstance<Pet>(l[3]),
+            py::isinstance<Dog>(l[4]),
+            py::isinstance<Rabbit>(l[5]),
+            py::isinstance<UnregisteredType>(l[6])
+        );
+    });
+
+    // test_mismatched_holder
+    struct MismatchBase1 { };
+    struct MismatchDerived1 : MismatchBase1 { };
+
+    struct MismatchBase2 { };
+    struct MismatchDerived2 : MismatchBase2 { };
+
+    m.def("mismatched_holder_1", []() {
+        auto mod = py::module::import("__main__");
+        py::class_<MismatchBase1, std::shared_ptr<MismatchBase1>>(mod, "MismatchBase1");
+        py::class_<MismatchDerived1, MismatchBase1>(mod, "MismatchDerived1");
+    });
+    m.def("mismatched_holder_2", []() {
+        auto mod = py::module::import("__main__");
+        py::class_<MismatchBase2>(mod, "MismatchBase2");
+        py::class_<MismatchDerived2, std::shared_ptr<MismatchDerived2>,
+                   MismatchBase2>(mod, "MismatchDerived2");
+    });
+
+    // test_override_static
+    // #511: problem with inheritance + overwritten def_static
+    struct MyBase {
+        static std::unique_ptr<MyBase> make() {
+            return std::unique_ptr<MyBase>(new MyBase());
+        }
+    };
+
+    struct MyDerived : MyBase {
+        static std::unique_ptr<MyDerived> make() {
+            return std::unique_ptr<MyDerived>(new MyDerived());
+        }
+    };
+
+    py::class_<MyBase>(m, "MyBase")
+        .def_static("make", &MyBase::make);
+
+    py::class_<MyDerived, MyBase>(m, "MyDerived")
+        .def_static("make", &MyDerived::make)
+        .def_static("make2", &MyDerived::make);
+
+    // test_implicit_conversion_life_support
+    struct ConvertibleFromUserType {
+        int i;
+
+        ConvertibleFromUserType(UserType u) : i(u.value()) { }
+    };
+
+    py::class_<ConvertibleFromUserType>(m, "AcceptsUserType")
+        .def(py::init<UserType>());
+    py::implicitly_convertible<UserType, ConvertibleFromUserType>();
+
+    m.def("implicitly_convert_argument", [](const ConvertibleFromUserType &r) { return r.i; });
+    m.def("implicitly_convert_variable", [](py::object o) {
+        // `o` is `UserType` and `r` is a reference to a temporary created by implicit
+        // conversion. This is valid when called inside a bound function because the temp
+        // object is attached to the same life support system as the arguments.
+        const auto &r = o.cast<const ConvertibleFromUserType &>();
+        return r.i;
+    });
+    m.add_object("implicitly_convert_variable_fail", [&] {
+        auto f = [](PyObject *, PyObject *args) -> PyObject * {
+            auto o = py::reinterpret_borrow<py::tuple>(args)[0];
+            try { // It should fail here because there is no life support.
+                o.cast<const ConvertibleFromUserType &>();
+            } catch (const py::cast_error &e) {
+                return py::str(e.what()).release().ptr();
+            }
+            return py::str().release().ptr();
+        };
+
+        auto def = new PyMethodDef{"f", f, METH_VARARGS, nullptr};
+        return py::reinterpret_steal<py::object>(PyCFunction_NewEx(def, nullptr, m.ptr()));
+    }());
+
+    // test_operator_new_delete
+    struct HasOpNewDel {
+        std::uint64_t i;
+        static void *operator new(size_t s) { py::print("A new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("A placement-new", s); return ptr; }
+        static void operator delete(void *p) { py::print("A delete"); return ::operator delete(p); }
+    };
+    struct HasOpNewDelSize {
+        std::uint32_t i;
+        static void *operator new(size_t s) { py::print("B new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("B placement-new", s); return ptr; }
+        static void operator delete(void *p, size_t s) { py::print("B delete", s); return ::operator delete(p); }
+    };
+    struct AliasedHasOpNewDelSize {
+        std::uint64_t i;
+        static void *operator new(size_t s) { py::print("C new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("C placement-new", s); return ptr; }
+        static void operator delete(void *p, size_t s) { py::print("C delete", s); return ::operator delete(p); }
+        virtual ~AliasedHasOpNewDelSize() = default;
+        AliasedHasOpNewDelSize() = default;
+        AliasedHasOpNewDelSize(const AliasedHasOpNewDelSize&) = delete;
+    };
+    struct PyAliasedHasOpNewDelSize : AliasedHasOpNewDelSize {
+        PyAliasedHasOpNewDelSize() = default;
+        PyAliasedHasOpNewDelSize(int) { }
+        std::uint64_t j;
+    };
+    struct HasOpNewDelBoth {
+        std::uint32_t i[8];
+        static void *operator new(size_t s) { py::print("D new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("D placement-new", s); return ptr; }
+        static void operator delete(void *p) { py::print("D delete"); return ::operator delete(p); }
+        static void operator delete(void *p, size_t s) { py::print("D wrong delete", s); return ::operator delete(p); }
+    };
+    py::class_<HasOpNewDel>(m, "HasOpNewDel").def(py::init<>());
+    py::class_<HasOpNewDelSize>(m, "HasOpNewDelSize").def(py::init<>());
+    py::class_<HasOpNewDelBoth>(m, "HasOpNewDelBoth").def(py::init<>());
+    py::class_<AliasedHasOpNewDelSize, PyAliasedHasOpNewDelSize> aliased(m, "AliasedHasOpNewDelSize");
+    aliased.def(py::init<>());
+    aliased.attr("size_noalias") = py::int_(sizeof(AliasedHasOpNewDelSize));
+    aliased.attr("size_alias") = py::int_(sizeof(PyAliasedHasOpNewDelSize));
+
+    // This test is actually part of test_local_bindings (test_duplicate_local), but we need a
+    // definition in a different compilation unit within the same module:
+    bind_local<LocalExternal, 17>(m, "LocalExternal", py::module_local());
+
+    // test_bind_protected_functions
+    class ProtectedA {
+    protected:
+        int foo() const { return value; }
+
+    private:
+        int value = 42;
+    };
+
+    class PublicistA : public ProtectedA {
+    public:
+        using ProtectedA::foo;
+    };
+
+    py::class_<ProtectedA>(m, "ProtectedA")
+        .def(py::init<>())
+#if !defined(_MSC_VER) || _MSC_VER >= 1910
+        .def("foo", &PublicistA::foo);
+#else
+        .def("foo", static_cast<int (ProtectedA::*)() const>(&PublicistA::foo));
+#endif
+
+    class ProtectedB {
+    public:
+        virtual ~ProtectedB() = default;
+        ProtectedB() = default;
+        ProtectedB(const ProtectedB &) = delete;
+
+    protected:
+        virtual int foo() const { return value; }
+
+    private:
+        int value = 42;
+    };
+
+    class TrampolineB : public ProtectedB {
+    public:
+        int foo() const override { PYBIND11_OVERLOAD(int, ProtectedB, foo, ); }
+    };
+
+    class PublicistB : public ProtectedB {
+    public:
+        using ProtectedB::foo;
+    };
+
+    py::class_<ProtectedB, TrampolineB>(m, "ProtectedB")
+        .def(py::init<>())
+#if !defined(_MSC_VER) || _MSC_VER >= 1910
+        .def("foo", &PublicistB::foo);
+#else
+        .def("foo", static_cast<int (ProtectedB::*)() const>(&PublicistB::foo));
+#endif
+
+    // test_brace_initialization
+    struct BraceInitialization {
+        int field1;
+        std::string field2;
+    };
+
+    py::class_<BraceInitialization>(m, "BraceInitialization")
+        .def(py::init<int, const std::string &>())
+        .def_readwrite("field1", &BraceInitialization::field1)
+        .def_readwrite("field2", &BraceInitialization::field2);
+    // We *don't* want to construct using braces when the given constructor argument maps to a
+    // constructor, because brace initialization could go to the wrong place (in particular when
+    // there is also an `initializer_list<T>`-accept constructor):
+    py::class_<NoBraceInitialization>(m, "NoBraceInitialization")
+        .def(py::init<std::vector<int>>())
+        .def_readonly("vec", &NoBraceInitialization::vec);
+
+    // test_reentrant_implicit_conversion_failure
+    // #1035: issue with runaway reentrant implicit conversion
+    struct BogusImplicitConversion {
+        BogusImplicitConversion(const BogusImplicitConversion &) { }
+    };
+
+    py::class_<BogusImplicitConversion>(m, "BogusImplicitConversion")
+        .def(py::init<const BogusImplicitConversion &>());
+
+    py::implicitly_convertible<int, BogusImplicitConversion>();
+
+    // test_qualname
+    // #1166: nested class docstring doesn't show nested name
+    // Also related: tests that __qualname__ is set properly
+    struct NestBase {};
+    struct Nested {};
+    py::class_<NestBase> base(m, "NestBase");
+    base.def(py::init<>());
+    py::class_<Nested>(base, "Nested")
+        .def(py::init<>())
+        .def("fn", [](Nested &, int, NestBase &, Nested &) {})
+        .def("fa", [](Nested &, int, NestBase &, Nested &) {},
+                "a"_a, "b"_a, "c"_a);
+    base.def("g", [](NestBase &, Nested &) {});
+    base.def("h", []() { return NestBase(); });
+
+    // test_error_after_conversion
+    // The second-pass path through dispatcher() previously didn't
+    // remember which overload was used, and would crash trying to
+    // generate a useful error message
+
+    struct NotRegistered {};
+    struct StringWrapper { std::string str; };
+    m.def("test_error_after_conversions", [](int) {});
+    m.def("test_error_after_conversions",
+          [](StringWrapper) -> NotRegistered { return {}; });
+    py::class_<StringWrapper>(m, "StringWrapper").def(py::init<std::string>());
+    py::implicitly_convertible<std::string, StringWrapper>();
+
+    #if defined(PYBIND11_CPP17)
+        struct alignas(1024) Aligned {
+            std::uintptr_t ptr() const { return (uintptr_t) this; }
+        };
+        py::class_<Aligned>(m, "Aligned")
+            .def(py::init<>())
+            .def("ptr", &Aligned::ptr);
+    #endif
+
+    // test_final
+    struct IsFinal final {};
+    py::class_<IsFinal>(m, "IsFinal", py::is_final());
+
+    // test_non_final_final
+    struct IsNonFinalFinal {};
+    py::class_<IsNonFinalFinal>(m, "IsNonFinalFinal", py::is_final());
+
+    struct PyPrintDestructor {
+        PyPrintDestructor() {}
+        ~PyPrintDestructor() {
+            py::print("Print from destructor");
+        }
+        void throw_something() { throw std::runtime_error("error"); }
+    };
+    py::class_<PyPrintDestructor>(m, "PyPrintDestructor")
+        .def(py::init<>())
+        .def("throw_something", &PyPrintDestructor::throw_something);
+}
+
+template <int N> class BreaksBase { public:
+    virtual ~BreaksBase() = default;
+    BreaksBase() = default;
+    BreaksBase(const BreaksBase&) = delete;
+};
+template <int N> class BreaksTramp : public BreaksBase<N> {};
+// These should all compile just fine:
+typedef py::class_<BreaksBase<1>, std::unique_ptr<BreaksBase<1>>, BreaksTramp<1>> DoesntBreak1;
+typedef py::class_<BreaksBase<2>, BreaksTramp<2>, std::unique_ptr<BreaksBase<2>>> DoesntBreak2;
+typedef py::class_<BreaksBase<3>, std::unique_ptr<BreaksBase<3>>> DoesntBreak3;
+typedef py::class_<BreaksBase<4>, BreaksTramp<4>> DoesntBreak4;
+typedef py::class_<BreaksBase<5>> DoesntBreak5;
+typedef py::class_<BreaksBase<6>, std::shared_ptr<BreaksBase<6>>, BreaksTramp<6>> DoesntBreak6;
+typedef py::class_<BreaksBase<7>, BreaksTramp<7>, std::shared_ptr<BreaksBase<7>>> DoesntBreak7;
+typedef py::class_<BreaksBase<8>, std::shared_ptr<BreaksBase<8>>> DoesntBreak8;
+#define CHECK_BASE(N) static_assert(std::is_same<typename DoesntBreak##N::type, BreaksBase<N>>::value, \
+        "DoesntBreak" #N " has wrong type!")
+CHECK_BASE(1); CHECK_BASE(2); CHECK_BASE(3); CHECK_BASE(4); CHECK_BASE(5); CHECK_BASE(6); CHECK_BASE(7); CHECK_BASE(8);
+#define CHECK_ALIAS(N) static_assert(DoesntBreak##N::has_alias && std::is_same<typename DoesntBreak##N::type_alias, BreaksTramp<N>>::value, \
+        "DoesntBreak" #N " has wrong type_alias!")
+#define CHECK_NOALIAS(N) static_assert(!DoesntBreak##N::has_alias && std::is_void<typename DoesntBreak##N::type_alias>::value, \
+        "DoesntBreak" #N " has type alias, but shouldn't!")
+CHECK_ALIAS(1); CHECK_ALIAS(2); CHECK_NOALIAS(3); CHECK_ALIAS(4); CHECK_NOALIAS(5); CHECK_ALIAS(6); CHECK_ALIAS(7); CHECK_NOALIAS(8);
+#define CHECK_HOLDER(N, TYPE) static_assert(std::is_same<typename DoesntBreak##N::holder_type, std::TYPE##_ptr<BreaksBase<N>>>::value, \
+        "DoesntBreak" #N " has wrong holder_type!")
+CHECK_HOLDER(1, unique); CHECK_HOLDER(2, unique); CHECK_HOLDER(3, unique); CHECK_HOLDER(4, unique); CHECK_HOLDER(5, unique);
+CHECK_HOLDER(6, shared); CHECK_HOLDER(7, shared); CHECK_HOLDER(8, shared);
+
+// There's no nice way to test that these fail because they fail to compile; leave them here,
+// though, so that they can be manually tested by uncommenting them (and seeing that compilation
+// failures occurs).
+
+// We have to actually look into the type: the typedef alone isn't enough to instantiate the type:
+#define CHECK_BROKEN(N) static_assert(std::is_same<typename Breaks##N::type, BreaksBase<-N>>::value, \
+        "Breaks1 has wrong type!");
+
+//// Two holder classes:
+//typedef py::class_<BreaksBase<-1>, std::unique_ptr<BreaksBase<-1>>, std::unique_ptr<BreaksBase<-1>>> Breaks1;
+//CHECK_BROKEN(1);
+//// Two aliases:
+//typedef py::class_<BreaksBase<-2>, BreaksTramp<-2>, BreaksTramp<-2>> Breaks2;
+//CHECK_BROKEN(2);
+//// Holder + 2 aliases
+//typedef py::class_<BreaksBase<-3>, std::unique_ptr<BreaksBase<-3>>, BreaksTramp<-3>, BreaksTramp<-3>> Breaks3;
+//CHECK_BROKEN(3);
+//// Alias + 2 holders
+//typedef py::class_<BreaksBase<-4>, std::unique_ptr<BreaksBase<-4>>, BreaksTramp<-4>, std::shared_ptr<BreaksBase<-4>>> Breaks4;
+//CHECK_BROKEN(4);
+//// Invalid option (not a subclass or holder)
+//typedef py::class_<BreaksBase<-5>, BreaksTramp<-4>> Breaks5;
+//CHECK_BROKEN(5);
+//// Invalid option: multiple inheritance not supported:
+//template <> struct BreaksBase<-8> : BreaksBase<-6>, BreaksBase<-7> {};
+//typedef py::class_<BreaksBase<-8>, BreaksBase<-6>, BreaksBase<-7>> Breaks8;
+//CHECK_BROKEN(8);
diff --git a/pybind11/tests/test_class.py b/pybind11/tests/test_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..4214fe79d7fbab2b38a1f15ca39d41e7cd33a171
--- /dev/null
+++ b/pybind11/tests/test_class.py
@@ -0,0 +1,333 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import class_ as m
+from pybind11_tests import UserType, ConstructorStats
+
+
+def test_repr():
+    # In Python 3.3+, repr() accesses __qualname__
+    assert "pybind11_type" in repr(type(UserType))
+    assert "UserType" in repr(UserType)
+
+
+def test_instance(msg):
+    with pytest.raises(TypeError) as excinfo:
+        m.NoConstructor()
+    assert msg(excinfo.value) == "m.class_.NoConstructor: No constructor defined!"
+
+    instance = m.NoConstructor.new_instance()
+
+    cstats = ConstructorStats.get(m.NoConstructor)
+    assert cstats.alive() == 1
+    del instance
+    assert cstats.alive() == 0
+
+
+def test_docstrings(doc):
+    assert doc(UserType) == "A `py::class_` type for testing"
+    assert UserType.__name__ == "UserType"
+    assert UserType.__module__ == "pybind11_tests"
+    assert UserType.get_value.__name__ == "get_value"
+    assert UserType.get_value.__module__ == "pybind11_tests"
+
+    assert doc(UserType.get_value) == """
+        get_value(self: m.UserType) -> int
+
+        Get value using a method
+    """
+    assert doc(UserType.value) == "Get/set value using a property"
+
+    assert doc(m.NoConstructor.new_instance) == """
+        new_instance() -> m.class_.NoConstructor
+
+        Return an instance
+    """
+
+
+def test_qualname(doc):
+    """Tests that a properly qualified name is set in __qualname__ (even in pre-3.3, where we
+    backport the attribute) and that generated docstrings properly use it and the module name"""
+    assert m.NestBase.__qualname__ == "NestBase"
+    assert m.NestBase.Nested.__qualname__ == "NestBase.Nested"
+
+    assert doc(m.NestBase.__init__) == """
+        __init__(self: m.class_.NestBase) -> None
+    """
+    assert doc(m.NestBase.g) == """
+        g(self: m.class_.NestBase, arg0: m.class_.NestBase.Nested) -> None
+    """
+    assert doc(m.NestBase.Nested.__init__) == """
+        __init__(self: m.class_.NestBase.Nested) -> None
+    """
+    assert doc(m.NestBase.Nested.fn) == """
+        fn(self: m.class_.NestBase.Nested, arg0: int, arg1: m.class_.NestBase, arg2: m.class_.NestBase.Nested) -> None
+    """  # noqa: E501 line too long
+    assert doc(m.NestBase.Nested.fa) == """
+        fa(self: m.class_.NestBase.Nested, a: int, b: m.class_.NestBase, c: m.class_.NestBase.Nested) -> None
+    """  # noqa: E501 line too long
+    assert m.NestBase.__module__ == "pybind11_tests.class_"
+    assert m.NestBase.Nested.__module__ == "pybind11_tests.class_"
+
+
+def test_inheritance(msg):
+    roger = m.Rabbit('Rabbit')
+    assert roger.name() + " is a " + roger.species() == "Rabbit is a parrot"
+    assert m.pet_name_species(roger) == "Rabbit is a parrot"
+
+    polly = m.Pet('Polly', 'parrot')
+    assert polly.name() + " is a " + polly.species() == "Polly is a parrot"
+    assert m.pet_name_species(polly) == "Polly is a parrot"
+
+    molly = m.Dog('Molly')
+    assert molly.name() + " is a " + molly.species() == "Molly is a dog"
+    assert m.pet_name_species(molly) == "Molly is a dog"
+
+    fred = m.Hamster('Fred')
+    assert fred.name() + " is a " + fred.species() == "Fred is a rodent"
+
+    assert m.dog_bark(molly) == "Woof!"
+
+    with pytest.raises(TypeError) as excinfo:
+        m.dog_bark(polly)
+    assert msg(excinfo.value) == """
+        dog_bark(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: m.class_.Dog) -> str
+
+        Invoked with: <m.class_.Pet object at 0>
+    """
+
+    with pytest.raises(TypeError) as excinfo:
+        m.Chimera("lion", "goat")
+    assert "No constructor defined!" in str(excinfo.value)
+
+
+def test_inheritance_init(msg):
+
+    # Single base
+    class Python(m.Pet):
+        def __init__(self):
+            pass
+    with pytest.raises(TypeError) as exc_info:
+        Python()
+    expected = ["m.class_.Pet.__init__() must be called when overriding __init__",
+                "Pet.__init__() must be called when overriding __init__"]  # PyPy?
+    # TODO: fix PyPy error message wrt. tp_name/__qualname__?
+    assert msg(exc_info.value) in expected
+
+    # Multiple bases
+    class RabbitHamster(m.Rabbit, m.Hamster):
+        def __init__(self):
+            m.Rabbit.__init__(self, "RabbitHamster")
+
+    with pytest.raises(TypeError) as exc_info:
+        RabbitHamster()
+    expected = ["m.class_.Hamster.__init__() must be called when overriding __init__",
+                "Hamster.__init__() must be called when overriding __init__"]  # PyPy
+    assert msg(exc_info.value) in expected
+
+
+def test_automatic_upcasting():
+    assert type(m.return_class_1()).__name__ == "DerivedClass1"
+    assert type(m.return_class_2()).__name__ == "DerivedClass2"
+    assert type(m.return_none()).__name__ == "NoneType"
+    # Repeat these a few times in a random order to ensure no invalid caching is applied
+    assert type(m.return_class_n(1)).__name__ == "DerivedClass1"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(0)).__name__ == "BaseClass"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(0)).__name__ == "BaseClass"
+    assert type(m.return_class_n(1)).__name__ == "DerivedClass1"
+
+
+def test_isinstance():
+    objects = [tuple(), dict(), m.Pet("Polly", "parrot")] + [m.Dog("Molly")] * 4
+    expected = (True, True, True, True, True, False, False)
+    assert m.check_instances(objects) == expected
+
+
+def test_mismatched_holder():
+    import re
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.mismatched_holder_1()
+    assert re.match('generic_type: type ".*MismatchDerived1" does not have a non-default '
+                    'holder type while its base ".*MismatchBase1" does', str(excinfo.value))
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.mismatched_holder_2()
+    assert re.match('generic_type: type ".*MismatchDerived2" has a non-default holder type '
+                    'while its base ".*MismatchBase2" does not', str(excinfo.value))
+
+
+def test_override_static():
+    """#511: problem with inheritance + overwritten def_static"""
+    b = m.MyBase.make()
+    d1 = m.MyDerived.make2()
+    d2 = m.MyDerived.make()
+
+    assert isinstance(b, m.MyBase)
+    assert isinstance(d1, m.MyDerived)
+    assert isinstance(d2, m.MyDerived)
+
+
+def test_implicit_conversion_life_support():
+    """Ensure the lifetime of temporary objects created for implicit conversions"""
+    assert m.implicitly_convert_argument(UserType(5)) == 5
+    assert m.implicitly_convert_variable(UserType(5)) == 5
+
+    assert "outside a bound function" in m.implicitly_convert_variable_fail(UserType(5))
+
+
+def test_operator_new_delete(capture):
+    """Tests that class-specific operator new/delete functions are invoked"""
+
+    class SubAliased(m.AliasedHasOpNewDelSize):
+        pass
+
+    with capture:
+        a = m.HasOpNewDel()
+        b = m.HasOpNewDelSize()
+        d = m.HasOpNewDelBoth()
+    assert capture == """
+        A new 8
+        B new 4
+        D new 32
+    """
+    sz_alias = str(m.AliasedHasOpNewDelSize.size_alias)
+    sz_noalias = str(m.AliasedHasOpNewDelSize.size_noalias)
+    with capture:
+        c = m.AliasedHasOpNewDelSize()
+        c2 = SubAliased()
+    assert capture == (
+        "C new " + sz_noalias + "\n" +
+        "C new " + sz_alias + "\n"
+    )
+
+    with capture:
+        del a
+        pytest.gc_collect()
+        del b
+        pytest.gc_collect()
+        del d
+        pytest.gc_collect()
+    assert capture == """
+        A delete
+        B delete 4
+        D delete
+    """
+
+    with capture:
+        del c
+        pytest.gc_collect()
+        del c2
+        pytest.gc_collect()
+    assert capture == (
+        "C delete " + sz_noalias + "\n" +
+        "C delete " + sz_alias + "\n"
+    )
+
+
+def test_bind_protected_functions():
+    """Expose protected member functions to Python using a helper class"""
+    a = m.ProtectedA()
+    assert a.foo() == 42
+
+    b = m.ProtectedB()
+    assert b.foo() == 42
+
+    class C(m.ProtectedB):
+        def __init__(self):
+            m.ProtectedB.__init__(self)
+
+        def foo(self):
+            return 0
+
+    c = C()
+    assert c.foo() == 0
+
+
+def test_brace_initialization():
+    """ Tests that simple POD classes can be constructed using C++11 brace initialization """
+    a = m.BraceInitialization(123, "test")
+    assert a.field1 == 123
+    assert a.field2 == "test"
+
+    # Tests that a non-simple class doesn't get brace initialization (if the
+    # class defines an initializer_list constructor, in particular, it would
+    # win over the expected constructor).
+    b = m.NoBraceInitialization([123, 456])
+    assert b.vec == [123, 456]
+
+
+@pytest.mark.xfail("env.PYPY")
+def test_class_refcount():
+    """Instances must correctly increase/decrease the reference count of their types (#1029)"""
+    from sys import getrefcount
+
+    class PyDog(m.Dog):
+        pass
+
+    for cls in m.Dog, PyDog:
+        refcount_1 = getrefcount(cls)
+        molly = [cls("Molly") for _ in range(10)]
+        refcount_2 = getrefcount(cls)
+
+        del molly
+        pytest.gc_collect()
+        refcount_3 = getrefcount(cls)
+
+        assert refcount_1 == refcount_3
+        assert refcount_2 > refcount_1
+
+
+def test_reentrant_implicit_conversion_failure(msg):
+    # ensure that there is no runaway reentrant implicit conversion (#1035)
+    with pytest.raises(TypeError) as excinfo:
+        m.BogusImplicitConversion(0)
+    assert msg(excinfo.value) == '''
+        __init__(): incompatible constructor arguments. The following argument types are supported:
+            1. m.class_.BogusImplicitConversion(arg0: m.class_.BogusImplicitConversion)
+
+        Invoked with: 0
+    '''
+
+
+def test_error_after_conversions():
+    with pytest.raises(TypeError) as exc_info:
+        m.test_error_after_conversions("hello")
+    assert str(exc_info.value).startswith(
+        "Unable to convert function return value to a Python type!")
+
+
+def test_aligned():
+    if hasattr(m, "Aligned"):
+        p = m.Aligned().ptr()
+        assert p % 1024 == 0
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2742
+@pytest.mark.xfail("env.PYPY")
+def test_final():
+    with pytest.raises(TypeError) as exc_info:
+        class PyFinalChild(m.IsFinal):
+            pass
+    assert str(exc_info.value).endswith("is not an acceptable base type")
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2742
+@pytest.mark.xfail("env.PYPY")
+def test_non_final_final():
+    with pytest.raises(TypeError) as exc_info:
+        class PyNonFinalFinalChild(m.IsNonFinalFinal):
+            pass
+    assert str(exc_info.value).endswith("is not an acceptable base type")
+
+
+# https://github.com/pybind/pybind11/issues/1878
+def test_exception_rvalue_abort():
+    with pytest.raises(RuntimeError):
+        m.PyPrintDestructor().throw_something()
diff --git a/pybind11/tests/test_cmake_build/CMakeLists.txt b/pybind11/tests/test_cmake_build/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c0578ad3d3cead093940452968a0b165f2a3fdc
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/CMakeLists.txt
@@ -0,0 +1,79 @@
+# Built-in in CMake 3.5+
+include(CMakeParseArguments)
+
+add_custom_target(test_cmake_build)
+
+function(pybind11_add_build_test name)
+  cmake_parse_arguments(ARG "INSTALL" "" "" ${ARGN})
+
+  set(build_options "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+
+  if(PYBIND11_FINDPYTHON)
+    list(APPEND build_options "-DPYBIND11_FINDPYTHON=${PYBIND11_FINDPYTHON}")
+
+    if(DEFINED Python_ROOT_DIR)
+      list(APPEND build_options "-DPython_ROOT_DIR=${Python_ROOT_DIR}")
+    endif()
+
+    list(APPEND build_options "-DPython_EXECUTABLE=${Python_EXECUTABLE}")
+  else()
+    list(APPEND build_options "-DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}")
+  endif()
+
+  if(DEFINED CMAKE_CXX_STANDARD)
+    list(APPEND build_options "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}")
+  endif()
+
+  if(NOT ARG_INSTALL)
+    list(APPEND build_options "-DPYBIND11_PROJECT_DIR=${pybind11_SOURCE_DIR}")
+  else()
+    list(APPEND build_options "-DCMAKE_PREFIX_PATH=${pybind11_BINARY_DIR}/mock_install")
+  endif()
+
+  add_custom_target(
+    test_build_${name}
+    ${CMAKE_CTEST_COMMAND}
+    --build-and-test
+    "${CMAKE_CURRENT_SOURCE_DIR}/${name}"
+    "${CMAKE_CURRENT_BINARY_DIR}/${name}"
+    --build-config
+    Release
+    --build-noclean
+    --build-generator
+    ${CMAKE_GENERATOR}
+    $<$<BOOL:${CMAKE_GENERATOR_PLATFORM}>:--build-generator-platform>
+    ${CMAKE_GENERATOR_PLATFORM}
+    --build-makeprogram
+    ${CMAKE_MAKE_PROGRAM}
+    --build-target
+    check_${name}
+    --build-options
+    ${build_options})
+  if(ARG_INSTALL)
+    add_dependencies(test_build_${name} mock_install)
+  endif()
+  add_dependencies(test_cmake_build test_build_${name})
+endfunction()
+
+pybind11_add_build_test(subdirectory_function)
+pybind11_add_build_test(subdirectory_target)
+if("${PYTHON_MODULE_EXTENSION}" MATCHES "pypy" OR "${Python_INTERPRETER_ID}" STREQUAL "PyPy")
+  message(STATUS "Skipping embed test on PyPy")
+else()
+  pybind11_add_build_test(subdirectory_embed)
+endif()
+
+if(PYBIND11_INSTALL)
+  add_custom_target(
+    mock_install ${CMAKE_COMMAND} "-DCMAKE_INSTALL_PREFIX=${pybind11_BINARY_DIR}/mock_install" -P
+                 "${pybind11_BINARY_DIR}/cmake_install.cmake")
+
+  pybind11_add_build_test(installed_function INSTALL)
+  pybind11_add_build_test(installed_target INSTALL)
+  if(NOT ("${PYTHON_MODULE_EXTENSION}" MATCHES "pypy" OR "${Python_INTERPRETER_ID}" STREQUAL "PyPy"
+         ))
+    pybind11_add_build_test(installed_embed INSTALL)
+  endif()
+endif()
+
+add_dependencies(check test_cmake_build)
diff --git a/pybind11/tests/test_cmake_build/embed.cpp b/pybind11/tests/test_cmake_build/embed.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9581d2fdb0a1629b9d0839acc033c20fecbe880
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/embed.cpp
@@ -0,0 +1,21 @@
+#include <pybind11/embed.h>
+namespace py = pybind11;
+
+PYBIND11_EMBEDDED_MODULE(test_cmake_build, m) {
+    m.def("add", [](int i, int j) { return i + j; });
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2)
+        throw std::runtime_error("Expected test.py file as the first argument");
+    auto test_py_file = argv[1];
+
+    py::scoped_interpreter guard{};
+
+    auto m = py::module::import("test_cmake_build");
+    if (m.attr("add")(1, 2).cast<int>() != 3)
+        throw std::runtime_error("embed.cpp failed");
+
+    py::module::import("sys").attr("argv") = py::make_tuple("test.py", "embed.cpp");
+    py::eval_file(test_py_file, py::globals());
+}
diff --git a/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt b/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..64ae5c4bff13d32c06639d310aebb682ca376d4e
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_installed_embed CXX)
+
+find_package(pybind11 CONFIG REQUIRED)
+message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}")
+
+add_executable(test_installed_embed ../embed.cpp)
+target_link_libraries(test_installed_embed PRIVATE pybind11::embed)
+set_target_properties(test_installed_embed PROPERTIES OUTPUT_NAME test_cmake_build)
+
+# Do not treat includes from IMPORTED target as SYSTEM (Python headers in pybind11::embed).
+# This may be needed to resolve header conflicts, e.g. between Python release and debug headers.
+set_target_properties(test_installed_embed PROPERTIES NO_SYSTEM_FROM_IMPORTED ON)
+
+add_custom_target(check_installed_embed $<TARGET_FILE:test_installed_embed>
+                                        ${PROJECT_SOURCE_DIR}/../test.py)
diff --git a/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt b/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a502863c0c64ad891ebe159a548f0c47dc6ce34
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt
@@ -0,0 +1,38 @@
+cmake_minimum_required(VERSION 3.4)
+project(test_installed_module CXX)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_installed_function CXX)
+
+find_package(pybind11 CONFIG REQUIRED)
+message(
+  STATUS "Found pybind11 v${pybind11_VERSION} ${pybind11_VERSION_TYPE}: ${pybind11_INCLUDE_DIRS}")
+
+pybind11_add_module(test_installed_function SHARED NO_EXTRAS ../main.cpp)
+set_target_properties(test_installed_function PROPERTIES OUTPUT_NAME test_cmake_build)
+
+if(DEFINED Python_EXECUTABLE)
+  set(_Python_EXECUTABLE "${Python_EXECUTABLE}")
+elseif(DEFINED PYTHON_EXECUTABLE)
+  set(_Python_EXECUTABLE "${PYTHON_EXECUTABLE}")
+else()
+  message(FATAL_ERROR "No Python executable defined (should not be possible at this stage)")
+endif()
+
+add_custom_target(
+  check_installed_function
+  ${CMAKE_COMMAND}
+  -E
+  env
+  PYTHONPATH=$<TARGET_FILE_DIR:test_installed_function>
+  ${_Python_EXECUTABLE}
+  ${PROJECT_SOURCE_DIR}/../test.py
+  ${PROJECT_NAME})
diff --git a/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt b/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b38eb77470e3efa45d8ceb490312b0461118ab82
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_installed_target CXX)
+
+find_package(pybind11 CONFIG REQUIRED)
+message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}")
+
+add_library(test_installed_target MODULE ../main.cpp)
+
+target_link_libraries(test_installed_target PRIVATE pybind11::module)
+set_target_properties(test_installed_target PROPERTIES OUTPUT_NAME test_cmake_build)
+
+# Make sure result is, for example, test_installed_target.so, not libtest_installed_target.dylib
+pybind11_extension(test_installed_target)
+
+# Do not treat includes from IMPORTED target as SYSTEM (Python headers in pybind11::module).
+# This may be needed to resolve header conflicts, e.g. between Python release and debug headers.
+set_target_properties(test_installed_target PROPERTIES NO_SYSTEM_FROM_IMPORTED ON)
+
+if(DEFINED Python_EXECUTABLE)
+  set(_Python_EXECUTABLE "${Python_EXECUTABLE}")
+elseif(DEFINED PYTHON_EXECUTABLE)
+  set(_Python_EXECUTABLE "${PYTHON_EXECUTABLE}")
+else()
+  message(FATAL_ERROR "No Python executable defined (should not be possible at this stage)")
+endif()
+
+add_custom_target(
+  check_installed_target
+  ${CMAKE_COMMAND}
+  -E
+  env
+  PYTHONPATH=$<TARGET_FILE_DIR:test_installed_target>
+  ${_Python_EXECUTABLE}
+  ${PROJECT_SOURCE_DIR}/../test.py
+  ${PROJECT_NAME})
diff --git a/pybind11/tests/test_cmake_build/main.cpp b/pybind11/tests/test_cmake_build/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e30f2c4b9a31205185d2b221a994dc001a30730a
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/main.cpp
@@ -0,0 +1,6 @@
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+PYBIND11_MODULE(test_cmake_build, m) {
+    m.def("add", [](int i, int j) { return i + j; });
+}
diff --git a/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7df0cf77c99b5ca39571fd00aea421bf647256d
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_subdirectory_embed CXX)
+
+set(PYBIND11_INSTALL
+    ON
+    CACHE BOOL "")
+set(PYBIND11_EXPORT_NAME test_export)
+
+add_subdirectory(${PYBIND11_PROJECT_DIR} pybind11)
+
+# Test basic target functionality
+add_executable(test_subdirectory_embed ../embed.cpp)
+target_link_libraries(test_subdirectory_embed PRIVATE pybind11::embed)
+set_target_properties(test_subdirectory_embed PROPERTIES OUTPUT_NAME test_cmake_build)
+
+add_custom_target(check_subdirectory_embed $<TARGET_FILE:test_subdirectory_embed>
+                                           ${PROJECT_SOURCE_DIR}/../test.py)
+
+# Test custom export group -- PYBIND11_EXPORT_NAME
+add_library(test_embed_lib ../embed.cpp)
+target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
+
+install(
+  TARGETS test_embed_lib
+  EXPORT test_export
+  ARCHIVE DESTINATION bin
+  LIBRARY DESTINATION lib
+  RUNTIME DESTINATION lib)
+install(EXPORT test_export DESTINATION lib/cmake/test_export/test_export-Targets.cmake)
diff --git a/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt b/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..624c600f8511bfc2950e702dfd453918375a79af
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_subdirectory_function CXX)
+
+add_subdirectory("${PYBIND11_PROJECT_DIR}" pybind11)
+pybind11_add_module(test_subdirectory_function ../main.cpp)
+set_target_properties(test_subdirectory_function PROPERTIES OUTPUT_NAME test_cmake_build)
+
+if(DEFINED Python_EXECUTABLE)
+  set(_Python_EXECUTABLE "${Python_EXECUTABLE}")
+elseif(DEFINED PYTHON_EXECUTABLE)
+  set(_Python_EXECUTABLE "${PYTHON_EXECUTABLE}")
+else()
+  message(FATAL_ERROR "No Python executable defined (should not be possible at this stage)")
+endif()
+
+add_custom_target(
+  check_subdirectory_function
+  ${CMAKE_COMMAND}
+  -E
+  env
+  PYTHONPATH=$<TARGET_FILE_DIR:test_subdirectory_function>
+  ${_Python_EXECUTABLE}
+  ${PROJECT_SOURCE_DIR}/../test.py
+  ${PROJECT_NAME})
diff --git a/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt b/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2471941fb682951dd6e5224dbeee38a7a738862b
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 3.4)
+
+# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# some versions of VS that have a patched CMake 3.11. This forces us to emulate
+# the behavior using the following workaround:
+if(${CMAKE_VERSION} VERSION_LESS 3.18)
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+else()
+  cmake_policy(VERSION 3.18)
+endif()
+
+project(test_subdirectory_target CXX)
+
+add_subdirectory(${PYBIND11_PROJECT_DIR} pybind11)
+
+add_library(test_subdirectory_target MODULE ../main.cpp)
+set_target_properties(test_subdirectory_target PROPERTIES OUTPUT_NAME test_cmake_build)
+
+target_link_libraries(test_subdirectory_target PRIVATE pybind11::module)
+
+# Make sure result is, for example, test_installed_target.so, not libtest_installed_target.dylib
+pybind11_extension(test_subdirectory_target)
+
+if(DEFINED Python_EXECUTABLE)
+  set(_Python_EXECUTABLE "${Python_EXECUTABLE}")
+elseif(DEFINED PYTHON_EXECUTABLE)
+  set(_Python_EXECUTABLE "${PYTHON_EXECUTABLE}")
+else()
+  message(FATAL_ERROR "No Python executable defined (should not be possible at this stage)")
+endif()
+
+add_custom_target(
+  check_subdirectory_target
+  ${CMAKE_COMMAND}
+  -E
+  env
+  PYTHONPATH=$<TARGET_FILE_DIR:test_subdirectory_target>
+  ${_Python_EXECUTABLE}
+  ${PROJECT_SOURCE_DIR}/../test.py
+  ${PROJECT_NAME})
diff --git a/pybind11/tests/test_cmake_build/test.py b/pybind11/tests/test_cmake_build/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..87ed5135ff415bab7a56bf4ab8dea3200fd53cca
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/test.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+import sys
+import test_cmake_build
+
+assert test_cmake_build.add(1, 2) == 3
+print("{} imports, runs, and adds: 1 + 2 = 3".format(sys.argv[1]))
diff --git a/pybind11/tests/test_constants_and_functions.cpp b/pybind11/tests/test_constants_and_functions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8ec74b7bc77c9ddf87073d40e9a8c8c9c2115f0
--- /dev/null
+++ b/pybind11/tests/test_constants_and_functions.cpp
@@ -0,0 +1,127 @@
+/*
+    tests/test_constants_and_functions.cpp -- global constants and functions, enumerations, raw byte strings
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+enum MyEnum { EFirstEntry = 1, ESecondEntry };
+
+std::string test_function1() {
+    return "test_function()";
+}
+
+std::string test_function2(MyEnum k) {
+    return "test_function(enum=" + std::to_string(k) + ")";
+}
+
+std::string test_function3(int i) {
+    return "test_function(" + std::to_string(i) + ")";
+}
+
+py::str test_function4()           { return "test_function()"; }
+py::str test_function4(char *)     { return "test_function(char *)"; }
+py::str test_function4(int, float) { return "test_function(int, float)"; }
+py::str test_function4(float, int) { return "test_function(float, int)"; }
+
+py::bytes return_bytes() {
+    const char *data = "\x01\x00\x02\x00";
+    return std::string(data, 4);
+}
+
+std::string print_bytes(py::bytes bytes) {
+    std::string ret = "bytes[";
+    const auto value = static_cast<std::string>(bytes);
+    for (size_t i = 0; i < value.length(); ++i) {
+        ret += std::to_string(static_cast<int>(value[i])) + " ";
+    }
+    ret.back() = ']';
+    return ret;
+}
+
+// Test that we properly handle C++17 exception specifiers (which are part of the function signature
+// in C++17).  These should all still work before C++17, but don't affect the function signature.
+namespace test_exc_sp {
+int f1(int x) noexcept { return x+1; }
+int f2(int x) noexcept(true) { return x+2; }
+int f3(int x) noexcept(false) { return x+3; }
+#if defined(__GNUG__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+int f4(int x) throw() { return x+4; } // Deprecated equivalent to noexcept(true)
+#if defined(__GNUG__)
+#  pragma GCC diagnostic pop
+#endif
+struct C {
+    int m1(int x) noexcept { return x-1; }
+    int m2(int x) const noexcept { return x-2; }
+    int m3(int x) noexcept(true) { return x-3; }
+    int m4(int x) const noexcept(true) { return x-4; }
+    int m5(int x) noexcept(false) { return x-5; }
+    int m6(int x) const noexcept(false) { return x-6; }
+#if defined(__GNUG__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+    int m7(int x) throw() { return x-7; }
+    int m8(int x) const throw() { return x-8; }
+#if defined(__GNUG__)
+#  pragma GCC diagnostic pop
+#endif
+};
+}
+
+
+TEST_SUBMODULE(constants_and_functions, m) {
+    // test_constants
+    m.attr("some_constant") = py::int_(14);
+
+    // test_function_overloading
+    m.def("test_function", &test_function1);
+    m.def("test_function", &test_function2);
+    m.def("test_function", &test_function3);
+
+#if defined(PYBIND11_OVERLOAD_CAST)
+    m.def("test_function", py::overload_cast<>(&test_function4));
+    m.def("test_function", py::overload_cast<char *>(&test_function4));
+    m.def("test_function", py::overload_cast<int, float>(&test_function4));
+    m.def("test_function", py::overload_cast<float, int>(&test_function4));
+#else
+    m.def("test_function", static_cast<py::str (*)()>(&test_function4));
+    m.def("test_function", static_cast<py::str (*)(char *)>(&test_function4));
+    m.def("test_function", static_cast<py::str (*)(int, float)>(&test_function4));
+    m.def("test_function", static_cast<py::str (*)(float, int)>(&test_function4));
+#endif
+
+    py::enum_<MyEnum>(m, "MyEnum")
+        .value("EFirstEntry", EFirstEntry)
+        .value("ESecondEntry", ESecondEntry)
+        .export_values();
+
+    // test_bytes
+    m.def("return_bytes", &return_bytes);
+    m.def("print_bytes", &print_bytes);
+
+    // test_exception_specifiers
+    using namespace test_exc_sp;
+    py::class_<C>(m, "C")
+        .def(py::init<>())
+        .def("m1", &C::m1)
+        .def("m2", &C::m2)
+        .def("m3", &C::m3)
+        .def("m4", &C::m4)
+        .def("m5", &C::m5)
+        .def("m6", &C::m6)
+        .def("m7", &C::m7)
+        .def("m8", &C::m8)
+        ;
+    m.def("f1", f1);
+    m.def("f2", f2);
+    m.def("f3", f3);
+    m.def("f4", f4);
+}
diff --git a/pybind11/tests/test_constants_and_functions.py b/pybind11/tests/test_constants_and_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..36b1aa64b1201eb16f4424c968c774f68b7abec2
--- /dev/null
+++ b/pybind11/tests/test_constants_and_functions.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import constants_and_functions as m
+
+
+def test_constants():
+    assert m.some_constant == 14
+
+
+def test_function_overloading():
+    assert m.test_function() == "test_function()"
+    assert m.test_function(7) == "test_function(7)"
+    assert m.test_function(m.MyEnum.EFirstEntry) == "test_function(enum=1)"
+    assert m.test_function(m.MyEnum.ESecondEntry) == "test_function(enum=2)"
+
+    assert m.test_function() == "test_function()"
+    assert m.test_function("abcd") == "test_function(char *)"
+    assert m.test_function(1, 1.0) == "test_function(int, float)"
+    assert m.test_function(1, 1.0) == "test_function(int, float)"
+    assert m.test_function(2.0, 2) == "test_function(float, int)"
+
+
+def test_bytes():
+    assert m.print_bytes(m.return_bytes()) == "bytes[1 0 2 0]"
+
+
+def test_exception_specifiers():
+    c = m.C()
+    assert c.m1(2) == 1
+    assert c.m2(3) == 1
+    assert c.m3(5) == 2
+    assert c.m4(7) == 3
+    assert c.m5(10) == 5
+    assert c.m6(14) == 8
+    assert c.m7(20) == 13
+    assert c.m8(29) == 21
+
+    assert m.f1(33) == 34
+    assert m.f2(53) == 55
+    assert m.f3(86) == 89
+    assert m.f4(140) == 144
diff --git a/pybind11/tests/test_copy_move.cpp b/pybind11/tests/test_copy_move.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f698bdf058dc53fceb21e504959fe334973bafb
--- /dev/null
+++ b/pybind11/tests/test_copy_move.cpp
@@ -0,0 +1,213 @@
+/*
+    tests/test_copy_move_policies.cpp -- 'copy' and 'move' return value policies
+                                         and related tests
+
+    Copyright (c) 2016 Ben North <ben@redfrontdoor.org>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/stl.h>
+
+template <typename derived>
+struct empty {
+    static const derived& get_one() { return instance_; }
+    static derived instance_;
+};
+
+struct lacking_copy_ctor : public empty<lacking_copy_ctor> {
+    lacking_copy_ctor() {}
+    lacking_copy_ctor(const lacking_copy_ctor& other) = delete;
+};
+
+template <> lacking_copy_ctor empty<lacking_copy_ctor>::instance_ = {};
+
+struct lacking_move_ctor : public empty<lacking_move_ctor> {
+    lacking_move_ctor() {}
+    lacking_move_ctor(const lacking_move_ctor& other) = delete;
+    lacking_move_ctor(lacking_move_ctor&& other) = delete;
+};
+
+template <> lacking_move_ctor empty<lacking_move_ctor>::instance_ = {};
+
+/* Custom type caster move/copy test classes */
+class MoveOnlyInt {
+public:
+    MoveOnlyInt() { print_default_created(this); }
+    MoveOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    MoveOnlyInt(MoveOnlyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
+    MoveOnlyInt &operator=(MoveOnlyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    MoveOnlyInt(const MoveOnlyInt &) = delete;
+    MoveOnlyInt &operator=(const MoveOnlyInt &) = delete;
+    ~MoveOnlyInt() { print_destroyed(this); }
+
+    int value;
+};
+class MoveOrCopyInt {
+public:
+    MoveOrCopyInt() { print_default_created(this); }
+    MoveOrCopyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    MoveOrCopyInt(MoveOrCopyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
+    MoveOrCopyInt &operator=(MoveOrCopyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    MoveOrCopyInt(const MoveOrCopyInt &c) { print_copy_created(this, c.value); value = c.value; }
+    MoveOrCopyInt &operator=(const MoveOrCopyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
+    ~MoveOrCopyInt() { print_destroyed(this); }
+
+    int value;
+};
+class CopyOnlyInt {
+public:
+    CopyOnlyInt() { print_default_created(this); }
+    CopyOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    CopyOnlyInt(const CopyOnlyInt &c) { print_copy_created(this, c.value); value = c.value; }
+    CopyOnlyInt &operator=(const CopyOnlyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
+    ~CopyOnlyInt() { print_destroyed(this); }
+
+    int value;
+};
+PYBIND11_NAMESPACE_BEGIN(pybind11)
+PYBIND11_NAMESPACE_BEGIN(detail)
+template <> struct type_caster<MoveOnlyInt> {
+    PYBIND11_TYPE_CASTER(MoveOnlyInt, _("MoveOnlyInt"));
+    bool load(handle src, bool) { value = MoveOnlyInt(src.cast<int>()); return true; }
+    static handle cast(const MoveOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+};
+
+template <> struct type_caster<MoveOrCopyInt> {
+    PYBIND11_TYPE_CASTER(MoveOrCopyInt, _("MoveOrCopyInt"));
+    bool load(handle src, bool) { value = MoveOrCopyInt(src.cast<int>()); return true; }
+    static handle cast(const MoveOrCopyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+};
+
+template <> struct type_caster<CopyOnlyInt> {
+protected:
+    CopyOnlyInt value;
+public:
+    static constexpr auto name = _("CopyOnlyInt");
+    bool load(handle src, bool) { value = CopyOnlyInt(src.cast<int>()); return true; }
+    static handle cast(const CopyOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+    static handle cast(const CopyOnlyInt *src, return_value_policy policy, handle parent) {
+        if (!src) return none().release();
+        return cast(*src, policy, parent);
+    }
+    operator CopyOnlyInt*() { return &value; }
+    operator CopyOnlyInt&() { return value; }
+    template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
+};
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(pybind11)
+
+TEST_SUBMODULE(copy_move_policies, m) {
+    // test_lacking_copy_ctor
+    py::class_<lacking_copy_ctor>(m, "lacking_copy_ctor")
+        .def_static("get_one", &lacking_copy_ctor::get_one,
+                    py::return_value_policy::copy);
+    // test_lacking_move_ctor
+    py::class_<lacking_move_ctor>(m, "lacking_move_ctor")
+        .def_static("get_one", &lacking_move_ctor::get_one,
+                    py::return_value_policy::move);
+
+    // test_move_and_copy_casts
+    m.def("move_and_copy_casts", [](py::object o) {
+        int r = 0;
+        r += py::cast<MoveOrCopyInt>(o).value; /* moves */
+        r += py::cast<MoveOnlyInt>(o).value; /* moves */
+        r += py::cast<CopyOnlyInt>(o).value; /* copies */
+        MoveOrCopyInt m1(py::cast<MoveOrCopyInt>(o)); /* moves */
+        MoveOnlyInt m2(py::cast<MoveOnlyInt>(o)); /* moves */
+        CopyOnlyInt m3(py::cast<CopyOnlyInt>(o)); /* copies */
+        r += m1.value + m2.value + m3.value;
+
+        return r;
+    });
+
+    // test_move_and_copy_loads
+    m.def("move_only", [](MoveOnlyInt m) { return m.value; });
+    m.def("move_or_copy", [](MoveOrCopyInt m) { return m.value; });
+    m.def("copy_only", [](CopyOnlyInt m) { return m.value; });
+    m.def("move_pair", [](std::pair<MoveOnlyInt, MoveOrCopyInt> p) {
+        return p.first.value + p.second.value;
+    });
+    m.def("move_tuple", [](std::tuple<MoveOnlyInt, MoveOrCopyInt, MoveOnlyInt> t) {
+        return std::get<0>(t).value + std::get<1>(t).value + std::get<2>(t).value;
+    });
+    m.def("copy_tuple", [](std::tuple<CopyOnlyInt, CopyOnlyInt> t) {
+        return std::get<0>(t).value + std::get<1>(t).value;
+    });
+    m.def("move_copy_nested", [](std::pair<MoveOnlyInt, std::pair<std::tuple<MoveOrCopyInt, CopyOnlyInt, std::tuple<MoveOnlyInt>>, MoveOrCopyInt>> x) {
+        return x.first.value + std::get<0>(x.second.first).value + std::get<1>(x.second.first).value +
+            std::get<0>(std::get<2>(x.second.first)).value + x.second.second.value;
+    });
+    m.def("move_and_copy_cstats", []() {
+        ConstructorStats::gc();
+        // Reset counts to 0 so that previous tests don't affect later ones:
+        auto &mc = ConstructorStats::get<MoveOrCopyInt>();
+        mc.move_assignments = mc.move_constructions = mc.copy_assignments = mc.copy_constructions = 0;
+        auto &mo = ConstructorStats::get<MoveOnlyInt>();
+        mo.move_assignments = mo.move_constructions = mo.copy_assignments = mo.copy_constructions = 0;
+        auto &co = ConstructorStats::get<CopyOnlyInt>();
+        co.move_assignments = co.move_constructions = co.copy_assignments = co.copy_constructions = 0;
+        py::dict d;
+        d["MoveOrCopyInt"] = py::cast(mc, py::return_value_policy::reference);
+        d["MoveOnlyInt"] = py::cast(mo, py::return_value_policy::reference);
+        d["CopyOnlyInt"] = py::cast(co, py::return_value_policy::reference);
+        return d;
+    });
+#ifdef PYBIND11_HAS_OPTIONAL
+    // test_move_and_copy_load_optional
+    m.attr("has_optional") = true;
+    m.def("move_optional", [](std::optional<MoveOnlyInt> o) {
+        return o->value;
+    });
+    m.def("move_or_copy_optional", [](std::optional<MoveOrCopyInt> o) {
+        return o->value;
+    });
+    m.def("copy_optional", [](std::optional<CopyOnlyInt> o) {
+        return o->value;
+    });
+    m.def("move_optional_tuple", [](std::optional<std::tuple<MoveOrCopyInt, MoveOnlyInt, CopyOnlyInt>> x) {
+        return std::get<0>(*x).value + std::get<1>(*x).value + std::get<2>(*x).value;
+    });
+#else
+    m.attr("has_optional") = false;
+#endif
+
+    // #70 compilation issue if operator new is not public
+    struct PrivateOpNew {
+        int value = 1;
+    private:
+#if defined(_MSC_VER)
+#  pragma warning(disable: 4822) // warning C4822: local class member function does not have a body
+#endif
+        void *operator new(size_t bytes);
+    };
+    py::class_<PrivateOpNew>(m, "PrivateOpNew").def_readonly("value", &PrivateOpNew::value);
+    m.def("private_op_new_value", []() { return PrivateOpNew(); });
+    m.def("private_op_new_reference", []() -> const PrivateOpNew & {
+        static PrivateOpNew x{};
+        return x;
+    }, py::return_value_policy::reference);
+
+    // test_move_fallback
+    // #389: rvp::move should fall-through to copy on non-movable objects
+    struct MoveIssue1 {
+        int v;
+        MoveIssue1(int v) : v{v} {}
+        MoveIssue1(const MoveIssue1 &c) = default;
+        MoveIssue1(MoveIssue1 &&) = delete;
+    };
+    py::class_<MoveIssue1>(m, "MoveIssue1").def(py::init<int>()).def_readwrite("value", &MoveIssue1::v);
+
+    struct MoveIssue2 {
+        int v;
+        MoveIssue2(int v) : v{v} {}
+        MoveIssue2(MoveIssue2 &&) = default;
+    };
+    py::class_<MoveIssue2>(m, "MoveIssue2").def(py::init<int>()).def_readwrite("value", &MoveIssue2::v);
+
+    m.def("get_moveissue1", [](int i) { return new MoveIssue1(i); }, py::return_value_policy::move);
+    m.def("get_moveissue2", [](int i) { return MoveIssue2(i); }, py::return_value_policy::move);
+}
diff --git a/pybind11/tests/test_copy_move.py b/pybind11/tests/test_copy_move.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b53993a91187c5518212f6fc41ac3a1792cc1df
--- /dev/null
+++ b/pybind11/tests/test_copy_move.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import copy_move_policies as m
+
+
+def test_lacking_copy_ctor():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.lacking_copy_ctor.get_one()
+    assert "is non-copyable!" in str(excinfo.value)
+
+
+def test_lacking_move_ctor():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.lacking_move_ctor.get_one()
+    assert "is neither movable nor copyable!" in str(excinfo.value)
+
+
+def test_move_and_copy_casts():
+    """Cast some values in C++ via custom type casters and count the number of moves/copies."""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    # The type move constructions/assignments below each get incremented: the move assignment comes
+    # from the type_caster load; the move construction happens when extracting that via a cast or
+    # loading into an argument.
+    assert m.move_and_copy_casts(3) == 18
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 2
+    assert c_m.move_constructions >= 2
+    assert c_mc.alive() == 0
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 2
+    assert c_mc.move_constructions >= 2
+    assert c_c.alive() == 0
+    assert c_c.copy_assignments == 2
+    assert c_c.copy_constructions >= 2
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+def test_move_and_copy_loads():
+    """Call some functions that load arguments via custom type casters and count the number of
+    moves/copies."""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    assert m.move_only(10) == 10  # 1 move, c_m
+    assert m.move_or_copy(11) == 11  # 1 move, c_mc
+    assert m.copy_only(12) == 12  # 1 copy, c_c
+    assert m.move_pair((13, 14)) == 27  # 1 c_m move, 1 c_mc move
+    assert m.move_tuple((15, 16, 17)) == 48  # 2 c_m moves, 1 c_mc move
+    assert m.copy_tuple((18, 19)) == 37  # 2 c_c copies
+    # Direct constructions: 2 c_m moves, 2 c_mc moves, 1 c_c copy
+    # Extra moves/copies when moving pairs/tuples: 3 c_m, 3 c_mc, 2 c_c
+    assert m.move_copy_nested((1, ((2, 3, (4,)), 5))) == 15
+
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 6
+    assert c_m.move_constructions == 9
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 5
+    assert c_mc.move_constructions == 8
+    assert c_c.copy_assignments == 4
+    assert c_c.copy_constructions == 6
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+@pytest.mark.skipif(not m.has_optional, reason='no <optional>')
+def test_move_and_copy_load_optional():
+    """Tests move/copy loads of std::optional arguments"""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    # The extra move/copy constructions below come from the std::optional move (which has to move
+    # its arguments):
+    assert m.move_optional(10) == 10  # c_m: 1 move assign, 2 move construct
+    assert m.move_or_copy_optional(11) == 11  # c_mc: 1 move assign, 2 move construct
+    assert m.copy_optional(12) == 12  # c_c: 1 copy assign, 2 copy construct
+    # 1 move assign + move construct moves each of c_m, c_mc, 1 c_c copy
+    # +1 move/copy construct each from moving the tuple
+    # +1 move/copy construct each from moving the optional (which moves the tuple again)
+    assert m.move_optional_tuple((3, 4, 5)) == 12
+
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 2
+    assert c_m.move_constructions == 5
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 2
+    assert c_mc.move_constructions == 5
+    assert c_c.copy_assignments == 2
+    assert c_c.copy_constructions == 5
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+def test_private_op_new():
+    """An object with a private `operator new` cannot be returned by value"""
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.private_op_new_value()
+    assert "is neither movable nor copyable" in str(excinfo.value)
+
+    assert m.private_op_new_reference().value == 1
+
+
+def test_move_fallback():
+    """#389: rvp::move should fall-through to copy on non-movable objects"""
+
+    m2 = m.get_moveissue2(2)
+    assert m2.value == 2
+    m1 = m.get_moveissue1(1)
+    assert m1.value == 1
diff --git a/pybind11/tests/test_custom_type_casters.cpp b/pybind11/tests/test_custom_type_casters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9485d3cdb207b14fd74eb1d8afe1c31d92891b7b
--- /dev/null
+++ b/pybind11/tests/test_custom_type_casters.cpp
@@ -0,0 +1,125 @@
+/*
+    tests/test_custom_type_casters.cpp -- tests type_caster<T>
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+
+// py::arg/py::arg_v testing: these arguments just record their argument when invoked
+class ArgInspector1 { public: std::string arg = "(default arg inspector 1)"; };
+class ArgInspector2 { public: std::string arg = "(default arg inspector 2)"; };
+class ArgAlwaysConverts { };
+namespace pybind11 { namespace detail {
+template <> struct type_caster<ArgInspector1> {
+public:
+    PYBIND11_TYPE_CASTER(ArgInspector1, _("ArgInspector1"));
+
+    bool load(handle src, bool convert) {
+        value.arg = "loading ArgInspector1 argument " +
+            std::string(convert ? "WITH" : "WITHOUT") + " conversion allowed.  "
+            "Argument value = " + (std::string) str(src);
+        return true;
+    }
+
+    static handle cast(const ArgInspector1 &src, return_value_policy, handle) {
+        return str(src.arg).release();
+    }
+};
+template <> struct type_caster<ArgInspector2> {
+public:
+    PYBIND11_TYPE_CASTER(ArgInspector2, _("ArgInspector2"));
+
+    bool load(handle src, bool convert) {
+        value.arg = "loading ArgInspector2 argument " +
+            std::string(convert ? "WITH" : "WITHOUT") + " conversion allowed.  "
+            "Argument value = " + (std::string) str(src);
+        return true;
+    }
+
+    static handle cast(const ArgInspector2 &src, return_value_policy, handle) {
+        return str(src.arg).release();
+    }
+};
+template <> struct type_caster<ArgAlwaysConverts> {
+public:
+    PYBIND11_TYPE_CASTER(ArgAlwaysConverts, _("ArgAlwaysConverts"));
+
+    bool load(handle, bool convert) {
+        return convert;
+    }
+
+    static handle cast(const ArgAlwaysConverts &, return_value_policy, handle) {
+        return py::none().release();
+    }
+};
+}}
+
+// test_custom_caster_destruction
+class DestructionTester {
+public:
+    DestructionTester() { print_default_created(this); }
+    ~DestructionTester() { print_destroyed(this); }
+    DestructionTester(const DestructionTester &) { print_copy_created(this); }
+    DestructionTester(DestructionTester &&) { print_move_created(this); }
+    DestructionTester &operator=(const DestructionTester &) { print_copy_assigned(this); return *this; }
+    DestructionTester &operator=(DestructionTester &&) { print_move_assigned(this); return *this; }
+};
+namespace pybind11 { namespace detail {
+template <> struct type_caster<DestructionTester> {
+    PYBIND11_TYPE_CASTER(DestructionTester, _("DestructionTester"));
+    bool load(handle, bool) { return true; }
+
+    static handle cast(const DestructionTester &, return_value_policy, handle) {
+        return py::bool_(true).release();
+    }
+};
+}}
+
+TEST_SUBMODULE(custom_type_casters, m) {
+    // test_custom_type_casters
+
+    // test_noconvert_args
+    //
+    // Test converting.  The ArgAlwaysConverts is just there to make the first no-conversion pass
+    // fail so that our call always ends up happening via the second dispatch (the one that allows
+    // some conversion).
+    class ArgInspector {
+    public:
+        ArgInspector1 f(ArgInspector1 a, ArgAlwaysConverts) { return a; }
+        std::string g(ArgInspector1 a, const ArgInspector1 &b, int c, ArgInspector2 *d, ArgAlwaysConverts) {
+            return a.arg + "\n" + b.arg + "\n" + std::to_string(c) + "\n" + d->arg;
+        }
+        static ArgInspector2 h(ArgInspector2 a, ArgAlwaysConverts) { return a; }
+    };
+    py::class_<ArgInspector>(m, "ArgInspector")
+        .def(py::init<>())
+        .def("f", &ArgInspector::f, py::arg(), py::arg() = ArgAlwaysConverts())
+        .def("g", &ArgInspector::g, "a"_a.noconvert(), "b"_a, "c"_a.noconvert()=13, "d"_a=ArgInspector2(), py::arg() = ArgAlwaysConverts())
+        .def_static("h", &ArgInspector::h, py::arg().noconvert(), py::arg() = ArgAlwaysConverts())
+        ;
+    m.def("arg_inspect_func", [](ArgInspector2 a, ArgInspector1 b, ArgAlwaysConverts) { return a.arg + "\n" + b.arg; },
+            py::arg().noconvert(false), py::arg_v(nullptr, ArgInspector1()).noconvert(true), py::arg() = ArgAlwaysConverts());
+
+    m.def("floats_preferred", [](double f) { return 0.5 * f; }, py::arg("f"));
+    m.def("floats_only", [](double f) { return 0.5 * f; }, py::arg("f").noconvert());
+    m.def("ints_preferred", [](int i) { return i / 2; }, py::arg("i"));
+    m.def("ints_only", [](int i) { return i / 2; }, py::arg("i").noconvert());
+
+    // test_custom_caster_destruction
+    // Test that `take_ownership` works on types with a custom type caster when given a pointer
+
+    // default policy: don't take ownership:
+    m.def("custom_caster_no_destroy", []() { static auto *dt = new DestructionTester(); return dt; });
+
+    m.def("custom_caster_destroy", []() { return new DestructionTester(); },
+            py::return_value_policy::take_ownership); // Takes ownership: destroy when finished
+    m.def("custom_caster_destroy_const", []() -> const DestructionTester * { return new DestructionTester(); },
+            py::return_value_policy::take_ownership); // Likewise (const doesn't inhibit destruction)
+    m.def("destruction_tester_cstats", &ConstructorStats::get<DestructionTester>, py::return_value_policy::reference);
+}
diff --git a/pybind11/tests/test_custom_type_casters.py b/pybind11/tests/test_custom_type_casters.py
new file mode 100644
index 0000000000000000000000000000000000000000..9475c4516845632da6c6c5b918ae05401d8f3f01
--- /dev/null
+++ b/pybind11/tests/test_custom_type_casters.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import custom_type_casters as m
+
+
+def test_noconvert_args(msg):
+    a = m.ArgInspector()
+    assert msg(a.f("hi")) == """
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = hi
+    """
+    assert msg(a.g("this is a", "this is b")) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        13
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
+    """  # noqa: E501 line too long
+    assert msg(a.g("this is a", "this is b", 42)) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        42
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
+    """  # noqa: E501 line too long
+    assert msg(a.g("this is a", "this is b", 42, "this is d")) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        42
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = this is d
+    """
+    assert (a.h("arg 1") ==
+            "loading ArgInspector2 argument WITHOUT conversion allowed.  Argument value = arg 1")
+    assert msg(m.arg_inspect_func("A1", "A2")) == """
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = A1
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = A2
+    """
+
+    assert m.floats_preferred(4) == 2.0
+    assert m.floats_only(4.0) == 2.0
+    with pytest.raises(TypeError) as excinfo:
+        m.floats_only(4)
+    assert msg(excinfo.value) == """
+        floats_only(): incompatible function arguments. The following argument types are supported:
+            1. (f: float) -> float
+
+        Invoked with: 4
+    """
+
+    assert m.ints_preferred(4) == 2
+    assert m.ints_preferred(True) == 0
+    with pytest.raises(TypeError) as excinfo:
+        m.ints_preferred(4.0)
+    assert msg(excinfo.value) == """
+        ints_preferred(): incompatible function arguments. The following argument types are supported:
+            1. (i: int) -> int
+
+        Invoked with: 4.0
+    """  # noqa: E501 line too long
+
+    assert m.ints_only(4) == 2
+    with pytest.raises(TypeError) as excinfo:
+        m.ints_only(4.0)
+    assert msg(excinfo.value) == """
+        ints_only(): incompatible function arguments. The following argument types are supported:
+            1. (i: int) -> int
+
+        Invoked with: 4.0
+    """
+
+
+def test_custom_caster_destruction():
+    """Tests that returning a pointer to a type that gets converted with a custom type caster gets
+    destroyed when the function has py::return_value_policy::take_ownership policy applied."""
+
+    cstats = m.destruction_tester_cstats()
+    # This one *doesn't* have take_ownership: the pointer should be used but not destroyed:
+    z = m.custom_caster_no_destroy()
+    assert cstats.alive() == 1 and cstats.default_constructions == 1
+    assert z
+
+    # take_ownership applied: this constructs a new object, casts it, then destroys it:
+    z = m.custom_caster_destroy()
+    assert z
+    assert cstats.default_constructions == 2
+
+    # Same, but with a const pointer return (which should *not* inhibit destruction):
+    z = m.custom_caster_destroy_const()
+    assert z
+    assert cstats.default_constructions == 3
+
+    # Make sure we still only have the original object (from ..._no_destroy()) alive:
+    assert cstats.alive() == 1
diff --git a/pybind11/tests/test_docstring_options.cpp b/pybind11/tests/test_docstring_options.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c8f79fd5f6308caab1ee2d22525af2a408eca07
--- /dev/null
+++ b/pybind11/tests/test_docstring_options.cpp
@@ -0,0 +1,61 @@
+/*
+    tests/test_docstring_options.cpp -- generation of docstrings and signatures
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(docstring_options, m) {
+    // test_docstring_options
+    {
+        py::options options;
+        options.disable_function_signatures();
+
+        m.def("test_function1", [](int, int) {}, py::arg("a"), py::arg("b"));
+        m.def("test_function2", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+
+        m.def("test_overloaded1", [](int) {}, py::arg("i"), "Overload docstring");
+        m.def("test_overloaded1", [](double) {}, py::arg("d"));
+
+        m.def("test_overloaded2", [](int) {}, py::arg("i"), "overload docstring 1");
+        m.def("test_overloaded2", [](double) {}, py::arg("d"), "overload docstring 2");
+
+        m.def("test_overloaded3", [](int) {}, py::arg("i"));
+        m.def("test_overloaded3", [](double) {}, py::arg("d"), "Overload docstr");
+
+        options.enable_function_signatures();
+
+        m.def("test_function3", [](int, int) {}, py::arg("a"), py::arg("b"));
+        m.def("test_function4", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+
+        options.disable_function_signatures().disable_user_defined_docstrings();
+
+        m.def("test_function5", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+
+        {
+            py::options nested_options;
+            nested_options.enable_user_defined_docstrings();
+            m.def("test_function6", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+        }
+    }
+
+    m.def("test_function7", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+
+    {
+        py::options options;
+        options.disable_user_defined_docstrings();
+
+        struct DocstringTestFoo {
+            int value;
+            void setValue(int v) { value = v; }
+            int getValue() const { return value; }
+        };
+        py::class_<DocstringTestFoo>(m, "DocstringTestFoo", "This is a class docstring")
+            .def_property("value_prop", &DocstringTestFoo::getValue, &DocstringTestFoo::setValue, "This is a property docstring")
+        ;
+    }
+}
diff --git a/pybind11/tests/test_docstring_options.py b/pybind11/tests/test_docstring_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ade0f158c3fc7b8e21cf79461a430be7c82f3a
--- /dev/null
+++ b/pybind11/tests/test_docstring_options.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import docstring_options as m
+
+
+def test_docstring_options():
+    # options.disable_function_signatures()
+    assert not m.test_function1.__doc__
+
+    assert m.test_function2.__doc__ == "A custom docstring"
+
+    # docstring specified on just the first overload definition:
+    assert m.test_overloaded1.__doc__ == "Overload docstring"
+
+    # docstring on both overloads:
+    assert m.test_overloaded2.__doc__ == "overload docstring 1\noverload docstring 2"
+
+    # docstring on only second overload:
+    assert m.test_overloaded3.__doc__ == "Overload docstr"
+
+    # options.enable_function_signatures()
+    assert m.test_function3.__doc__ .startswith("test_function3(a: int, b: int) -> None")
+
+    assert m.test_function4.__doc__ .startswith("test_function4(a: int, b: int) -> None")
+    assert m.test_function4.__doc__ .endswith("A custom docstring\n")
+
+    # options.disable_function_signatures()
+    # options.disable_user_defined_docstrings()
+    assert not m.test_function5.__doc__
+
+    # nested options.enable_user_defined_docstrings()
+    assert m.test_function6.__doc__ == "A custom docstring"
+
+    # RAII destructor
+    assert m.test_function7.__doc__ .startswith("test_function7(a: int, b: int) -> None")
+    assert m.test_function7.__doc__ .endswith("A custom docstring\n")
+
+    # Suppression of user-defined docstrings for non-function objects
+    assert not m.DocstringTestFoo.__doc__
+    assert not m.DocstringTestFoo.value_prop.__doc__
diff --git a/pybind11/tests/test_eigen.cpp b/pybind11/tests/test_eigen.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56aa1a4a6fe6b60a1d85c54cd40ee70ddde3528f
--- /dev/null
+++ b/pybind11/tests/test_eigen.cpp
@@ -0,0 +1,327 @@
+/*
+    tests/eigen.cpp -- automatic conversion of Eigen types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/eigen.h>
+#include <pybind11/stl.h>
+
+#if defined(_MSC_VER)
+#  pragma warning(disable: 4996) // C4996: std::unary_negation is deprecated
+#endif
+
+#include <Eigen/Cholesky>
+
+using MatrixXdR = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+
+
+// Sets/resets a testing reference matrix to have values of 10*r + c, where r and c are the
+// (1-based) row/column number.
+template <typename M> void reset_ref(M &x) {
+    for (int i = 0; i < x.rows(); i++) for (int j = 0; j < x.cols(); j++)
+        x(i, j) = 11 + 10*i + j;
+}
+
+// Returns a static, column-major matrix
+Eigen::MatrixXd &get_cm() {
+    static Eigen::MatrixXd *x;
+    if (!x) {
+        x = new Eigen::MatrixXd(3, 3);
+        reset_ref(*x);
+    }
+    return *x;
+}
+// Likewise, but row-major
+MatrixXdR &get_rm() {
+    static MatrixXdR *x;
+    if (!x) {
+        x = new MatrixXdR(3, 3);
+        reset_ref(*x);
+    }
+    return *x;
+}
+// Resets the values of the static matrices returned by get_cm()/get_rm()
+void reset_refs() {
+    reset_ref(get_cm());
+    reset_ref(get_rm());
+}
+
+// Returns element 2,1 from a matrix (used to test copy/nocopy)
+double get_elem(Eigen::Ref<const Eigen::MatrixXd> m) { return m(2, 1); };
+
+
+// Returns a matrix with 10*r + 100*c added to each matrix element (to help test that the matrix
+// reference is referencing rows/columns correctly).
+template <typename MatrixArgType> Eigen::MatrixXd adjust_matrix(MatrixArgType m) {
+    Eigen::MatrixXd ret(m);
+    for (int c = 0; c < m.cols(); c++) for (int r = 0; r < m.rows(); r++)
+        ret(r, c) += 10*r + 100*c;
+    return ret;
+}
+
+struct CustomOperatorNew {
+    CustomOperatorNew() = default;
+
+    Eigen::Matrix4d a = Eigen::Matrix4d::Zero();
+    Eigen::Matrix4d b = Eigen::Matrix4d::Identity();
+
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW;
+};
+
+TEST_SUBMODULE(eigen, m) {
+    using FixedMatrixR = Eigen::Matrix<float, 5, 6, Eigen::RowMajor>;
+    using FixedMatrixC = Eigen::Matrix<float, 5, 6>;
+    using DenseMatrixR = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using DenseMatrixC = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
+    using FourRowMatrixC = Eigen::Matrix<float, 4, Eigen::Dynamic>;
+    using FourColMatrixC = Eigen::Matrix<float, Eigen::Dynamic, 4>;
+    using FourRowMatrixR = Eigen::Matrix<float, 4, Eigen::Dynamic>;
+    using FourColMatrixR = Eigen::Matrix<float, Eigen::Dynamic, 4>;
+    using SparseMatrixR = Eigen::SparseMatrix<float, Eigen::RowMajor>;
+    using SparseMatrixC = Eigen::SparseMatrix<float>;
+
+    // various tests
+    m.def("double_col", [](const Eigen::VectorXf &x) -> Eigen::VectorXf { return 2.0f * x; });
+    m.def("double_row", [](const Eigen::RowVectorXf &x) -> Eigen::RowVectorXf { return 2.0f * x; });
+    m.def("double_complex", [](const Eigen::VectorXcf &x) -> Eigen::VectorXcf { return 2.0f * x; });
+    m.def("double_threec", [](py::EigenDRef<Eigen::Vector3f> x) { x *= 2; });
+    m.def("double_threer", [](py::EigenDRef<Eigen::RowVector3f> x) { x *= 2; });
+    m.def("double_mat_cm", [](Eigen::MatrixXf x) -> Eigen::MatrixXf { return 2.0f * x; });
+    m.def("double_mat_rm", [](DenseMatrixR x) -> DenseMatrixR { return 2.0f * x; });
+
+    // test_eigen_ref_to_python
+    // Different ways of passing via Eigen::Ref; the first and second are the Eigen-recommended
+    m.def("cholesky1", [](Eigen::Ref<MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky2", [](const Eigen::Ref<const MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky3", [](const Eigen::Ref<MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky4", [](Eigen::Ref<const MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+
+    // test_eigen_ref_mutators
+    // Mutators: these add some value to the given element using Eigen, but Eigen should be mapping into
+    // the numpy array data and so the result should show up there.  There are three versions: one that
+    // works on a contiguous-row matrix (numpy's default), one for a contiguous-column matrix, and one
+    // for any matrix.
+    auto add_rm = [](Eigen::Ref<MatrixXdR> x, int r, int c, double v) { x(r,c) += v; };
+    auto add_cm = [](Eigen::Ref<Eigen::MatrixXd> x, int r, int c, double v) { x(r,c) += v; };
+
+    // Mutators (Eigen maps into numpy variables):
+    m.def("add_rm", add_rm); // Only takes row-contiguous
+    m.def("add_cm", add_cm); // Only takes column-contiguous
+    // Overloaded versions that will accept either row or column contiguous:
+    m.def("add1", add_rm);
+    m.def("add1", add_cm);
+    m.def("add2", add_cm);
+    m.def("add2", add_rm);
+    // This one accepts a matrix of any stride:
+    m.def("add_any", [](py::EigenDRef<Eigen::MatrixXd> x, int r, int c, double v) { x(r,c) += v; });
+
+    // Return mutable references (numpy maps into eigen variables)
+    m.def("get_cm_ref", []() { return Eigen::Ref<Eigen::MatrixXd>(get_cm()); });
+    m.def("get_rm_ref", []() { return Eigen::Ref<MatrixXdR>(get_rm()); });
+    // The same references, but non-mutable (numpy maps into eigen variables, but is !writeable)
+    m.def("get_cm_const_ref", []() { return Eigen::Ref<const Eigen::MatrixXd>(get_cm()); });
+    m.def("get_rm_const_ref", []() { return Eigen::Ref<const MatrixXdR>(get_rm()); });
+
+    m.def("reset_refs", reset_refs); // Restores get_{cm,rm}_ref to original values
+
+    // Increments and returns ref to (same) matrix
+    m.def("incr_matrix", [](Eigen::Ref<Eigen::MatrixXd> m, double v) {
+        m += Eigen::MatrixXd::Constant(m.rows(), m.cols(), v);
+        return m;
+    }, py::return_value_policy::reference);
+
+    // Same, but accepts a matrix of any strides
+    m.def("incr_matrix_any", [](py::EigenDRef<Eigen::MatrixXd> m, double v) {
+        m += Eigen::MatrixXd::Constant(m.rows(), m.cols(), v);
+        return m;
+    }, py::return_value_policy::reference);
+
+    // Returns an eigen slice of even rows
+    m.def("even_rows", [](py::EigenDRef<Eigen::MatrixXd> m) {
+        return py::EigenDMap<Eigen::MatrixXd>(
+                m.data(), (m.rows() + 1) / 2, m.cols(),
+                py::EigenDStride(m.outerStride(), 2 * m.innerStride()));
+    }, py::return_value_policy::reference);
+
+    // Returns an eigen slice of even columns
+    m.def("even_cols", [](py::EigenDRef<Eigen::MatrixXd> m) {
+        return py::EigenDMap<Eigen::MatrixXd>(
+                m.data(), m.rows(), (m.cols() + 1) / 2,
+                py::EigenDStride(2 * m.outerStride(), m.innerStride()));
+    }, py::return_value_policy::reference);
+
+    // Returns diagonals: a vector-like object with an inner stride != 1
+    m.def("diagonal", [](const Eigen::Ref<const Eigen::MatrixXd> &x) { return x.diagonal(); });
+    m.def("diagonal_1", [](const Eigen::Ref<const Eigen::MatrixXd> &x) { return x.diagonal<1>(); });
+    m.def("diagonal_n", [](const Eigen::Ref<const Eigen::MatrixXd> &x, int index) { return x.diagonal(index); });
+
+    // Return a block of a matrix (gives non-standard strides)
+    m.def("block", [](const Eigen::Ref<const Eigen::MatrixXd> &x, int start_row, int start_col, int block_rows, int block_cols) {
+        return x.block(start_row, start_col, block_rows, block_cols);
+    });
+
+    // test_eigen_return_references, test_eigen_keepalive
+    // return value referencing/copying tests:
+    class ReturnTester {
+        Eigen::MatrixXd mat = create();
+    public:
+        ReturnTester() { print_created(this); }
+        ~ReturnTester() { print_destroyed(this); }
+        static Eigen::MatrixXd create() { return Eigen::MatrixXd::Ones(10, 10); }
+        static const Eigen::MatrixXd createConst() { return Eigen::MatrixXd::Ones(10, 10); }
+        Eigen::MatrixXd &get() { return mat; }
+        Eigen::MatrixXd *getPtr() { return &mat; }
+        const Eigen::MatrixXd &view() { return mat; }
+        const Eigen::MatrixXd *viewPtr() { return &mat; }
+        Eigen::Ref<Eigen::MatrixXd> ref() { return mat; }
+        Eigen::Ref<const Eigen::MatrixXd> refConst() { return mat; }
+        Eigen::Block<Eigen::MatrixXd> block(int r, int c, int nrow, int ncol) { return mat.block(r, c, nrow, ncol); }
+        Eigen::Block<const Eigen::MatrixXd> blockConst(int r, int c, int nrow, int ncol) const { return mat.block(r, c, nrow, ncol); }
+        py::EigenDMap<Eigen::Matrix2d> corners() { return py::EigenDMap<Eigen::Matrix2d>(mat.data(),
+                    py::EigenDStride(mat.outerStride() * (mat.outerSize()-1), mat.innerStride() * (mat.innerSize()-1))); }
+        py::EigenDMap<const Eigen::Matrix2d> cornersConst() const { return py::EigenDMap<const Eigen::Matrix2d>(mat.data(),
+                    py::EigenDStride(mat.outerStride() * (mat.outerSize()-1), mat.innerStride() * (mat.innerSize()-1))); }
+    };
+    using rvp = py::return_value_policy;
+    py::class_<ReturnTester>(m, "ReturnTester")
+        .def(py::init<>())
+        .def_static("create", &ReturnTester::create)
+        .def_static("create_const", &ReturnTester::createConst)
+        .def("get", &ReturnTester::get, rvp::reference_internal)
+        .def("get_ptr", &ReturnTester::getPtr, rvp::reference_internal)
+        .def("view", &ReturnTester::view, rvp::reference_internal)
+        .def("view_ptr", &ReturnTester::view, rvp::reference_internal)
+        .def("copy_get", &ReturnTester::get)   // Default rvp: copy
+        .def("copy_view", &ReturnTester::view) //         "
+        .def("ref", &ReturnTester::ref) // Default for Ref is to reference
+        .def("ref_const", &ReturnTester::refConst) // Likewise, but const
+        .def("ref_safe", &ReturnTester::ref, rvp::reference_internal)
+        .def("ref_const_safe", &ReturnTester::refConst, rvp::reference_internal)
+        .def("copy_ref", &ReturnTester::ref, rvp::copy)
+        .def("copy_ref_const", &ReturnTester::refConst, rvp::copy)
+        .def("block", &ReturnTester::block)
+        .def("block_safe", &ReturnTester::block, rvp::reference_internal)
+        .def("block_const", &ReturnTester::blockConst, rvp::reference_internal)
+        .def("copy_block", &ReturnTester::block, rvp::copy)
+        .def("corners", &ReturnTester::corners, rvp::reference_internal)
+        .def("corners_const", &ReturnTester::cornersConst, rvp::reference_internal)
+        ;
+
+    // test_special_matrix_objects
+    // Returns a DiagonalMatrix with diagonal (1,2,3,...)
+    m.def("incr_diag", [](int k) {
+        Eigen::DiagonalMatrix<int, Eigen::Dynamic> m(k);
+        for (int i = 0; i < k; i++) m.diagonal()[i] = i+1;
+        return m;
+    });
+
+    // Returns a SelfAdjointView referencing the lower triangle of m
+    m.def("symmetric_lower", [](const Eigen::MatrixXi &m) {
+            return m.selfadjointView<Eigen::Lower>();
+    });
+    // Returns a SelfAdjointView referencing the lower triangle of m
+    m.def("symmetric_upper", [](const Eigen::MatrixXi &m) {
+            return m.selfadjointView<Eigen::Upper>();
+    });
+
+    // Test matrix for various functions below.
+    Eigen::MatrixXf mat(5, 6);
+    mat << 0,  3,  0,  0,  0, 11,
+           22, 0,  0,  0, 17, 11,
+           7,  5,  0,  1,  0, 11,
+           0,  0,  0,  0,  0, 11,
+           0,  0, 14,  0,  8, 11;
+
+    // test_fixed, and various other tests
+    m.def("fixed_r", [mat]() -> FixedMatrixR { return FixedMatrixR(mat); });
+    m.def("fixed_r_const", [mat]() -> const FixedMatrixR { return FixedMatrixR(mat); });
+    m.def("fixed_c", [mat]() -> FixedMatrixC { return FixedMatrixC(mat); });
+    m.def("fixed_copy_r", [](const FixedMatrixR &m) -> FixedMatrixR { return m; });
+    m.def("fixed_copy_c", [](const FixedMatrixC &m) -> FixedMatrixC { return m; });
+    // test_mutator_descriptors
+    m.def("fixed_mutator_r", [](Eigen::Ref<FixedMatrixR>) {});
+    m.def("fixed_mutator_c", [](Eigen::Ref<FixedMatrixC>) {});
+    m.def("fixed_mutator_a", [](py::EigenDRef<FixedMatrixC>) {});
+    // test_dense
+    m.def("dense_r", [mat]() -> DenseMatrixR { return DenseMatrixR(mat); });
+    m.def("dense_c", [mat]() -> DenseMatrixC { return DenseMatrixC(mat); });
+    m.def("dense_copy_r", [](const DenseMatrixR &m) -> DenseMatrixR { return m; });
+    m.def("dense_copy_c", [](const DenseMatrixC &m) -> DenseMatrixC { return m; });
+    // test_sparse, test_sparse_signature
+    m.def("sparse_r", [mat]() -> SparseMatrixR { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
+    m.def("sparse_c", [mat]() -> SparseMatrixC { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
+    m.def("sparse_copy_r", [](const SparseMatrixR &m) -> SparseMatrixR { return m; });
+    m.def("sparse_copy_c", [](const SparseMatrixC &m) -> SparseMatrixC { return m; });
+    // test_partially_fixed
+    m.def("partial_copy_four_rm_r", [](const FourRowMatrixR &m) -> FourRowMatrixR { return m; });
+    m.def("partial_copy_four_rm_c", [](const FourColMatrixR &m) -> FourColMatrixR { return m; });
+    m.def("partial_copy_four_cm_r", [](const FourRowMatrixC &m) -> FourRowMatrixC { return m; });
+    m.def("partial_copy_four_cm_c", [](const FourColMatrixC &m) -> FourColMatrixC { return m; });
+
+    // test_cpp_casting
+    // Test that we can cast a numpy object to a Eigen::MatrixXd explicitly
+    m.def("cpp_copy", [](py::handle m) { return m.cast<Eigen::MatrixXd>()(1, 0); });
+    m.def("cpp_ref_c", [](py::handle m) { return m.cast<Eigen::Ref<Eigen::MatrixXd>>()(1, 0); });
+    m.def("cpp_ref_r", [](py::handle m) { return m.cast<Eigen::Ref<MatrixXdR>>()(1, 0); });
+    m.def("cpp_ref_any", [](py::handle m) { return m.cast<py::EigenDRef<Eigen::MatrixXd>>()(1, 0); });
+
+
+    // test_nocopy_wrapper
+    // Test that we can prevent copying into an argument that would normally copy: First a version
+    // that would allow copying (if types or strides don't match) for comparison:
+    m.def("get_elem", &get_elem);
+    // Now this alternative that calls the tells pybind to fail rather than copy:
+    m.def("get_elem_nocopy", [](Eigen::Ref<const Eigen::MatrixXd> m) -> double { return get_elem(m); },
+            py::arg().noconvert());
+    // Also test a row-major-only no-copy const ref:
+    m.def("get_elem_rm_nocopy", [](Eigen::Ref<const Eigen::Matrix<long, -1, -1, Eigen::RowMajor>> &m) -> long { return m(2, 1); },
+            py::arg().noconvert());
+
+    // test_issue738
+    // Issue #738: 1xN or Nx1 2D matrices were neither accepted nor properly copied with an
+    // incompatible stride value on the length-1 dimension--but that should be allowed (without
+    // requiring a copy!) because the stride value can be safely ignored on a size-1 dimension.
+    m.def("iss738_f1", &adjust_matrix<const Eigen::Ref<const Eigen::MatrixXd> &>, py::arg().noconvert());
+    m.def("iss738_f2", &adjust_matrix<const Eigen::Ref<const Eigen::Matrix<double, -1, -1, Eigen::RowMajor>> &>, py::arg().noconvert());
+
+    // test_issue1105
+    // Issue #1105: when converting from a numpy two-dimensional (Nx1) or (1xN) value into a dense
+    // eigen Vector or RowVector, the argument would fail to load because the numpy copy would fail:
+    // numpy won't broadcast a Nx1 into a 1-dimensional vector.
+    m.def("iss1105_col", [](Eigen::VectorXd) { return true; });
+    m.def("iss1105_row", [](Eigen::RowVectorXd) { return true; });
+
+    // test_named_arguments
+    // Make sure named arguments are working properly:
+    m.def("matrix_multiply", [](const py::EigenDRef<const Eigen::MatrixXd> A, const py::EigenDRef<const Eigen::MatrixXd> B)
+            -> Eigen::MatrixXd {
+        if (A.cols() != B.rows()) throw std::domain_error("Nonconformable matrices!");
+        return A * B;
+    }, py::arg("A"), py::arg("B"));
+
+    // test_custom_operator_new
+    py::class_<CustomOperatorNew>(m, "CustomOperatorNew")
+        .def(py::init<>())
+        .def_readonly("a", &CustomOperatorNew::a)
+        .def_readonly("b", &CustomOperatorNew::b);
+
+    // test_eigen_ref_life_support
+    // In case of a failure (the caster's temp array does not live long enough), creating
+    // a new array (np.ones(10)) increases the chances that the temp array will be garbage
+    // collected and/or that its memory will be overridden with different values.
+    m.def("get_elem_direct", [](Eigen::Ref<const Eigen::VectorXd> v) {
+        py::module::import("numpy").attr("ones")(10);
+        return v(5);
+    });
+    m.def("get_elem_indirect", [](std::vector<Eigen::Ref<const Eigen::VectorXd>> v) {
+        py::module::import("numpy").attr("ones")(10);
+        return v[0](5);
+    });
+}
diff --git a/pybind11/tests/test_eigen.py b/pybind11/tests/test_eigen.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac68471474a869b59f786fd35cc69a3f2f1b27d5
--- /dev/null
+++ b/pybind11/tests/test_eigen.py
@@ -0,0 +1,697 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import ConstructorStats
+
+np = pytest.importorskip("numpy")
+m = pytest.importorskip("pybind11_tests.eigen")
+
+
+ref = np.array([[ 0.,  3,  0,  0,  0, 11],
+                [22,  0,  0,  0, 17, 11],
+                [ 7,  5,  0,  1,  0, 11],
+                [ 0,  0,  0,  0,  0, 11],
+                [ 0,  0, 14,  0,  8, 11]])
+
+
+def assert_equal_ref(mat):
+    np.testing.assert_array_equal(mat, ref)
+
+
+def assert_sparse_equal_ref(sparse_mat):
+    assert_equal_ref(sparse_mat.toarray())
+
+
+def test_fixed():
+    assert_equal_ref(m.fixed_c())
+    assert_equal_ref(m.fixed_r())
+    assert_equal_ref(m.fixed_copy_r(m.fixed_r()))
+    assert_equal_ref(m.fixed_copy_c(m.fixed_c()))
+    assert_equal_ref(m.fixed_copy_r(m.fixed_c()))
+    assert_equal_ref(m.fixed_copy_c(m.fixed_r()))
+
+
+def test_dense():
+    assert_equal_ref(m.dense_r())
+    assert_equal_ref(m.dense_c())
+    assert_equal_ref(m.dense_copy_r(m.dense_r()))
+    assert_equal_ref(m.dense_copy_c(m.dense_c()))
+    assert_equal_ref(m.dense_copy_r(m.dense_c()))
+    assert_equal_ref(m.dense_copy_c(m.dense_r()))
+
+
+def test_partially_fixed():
+    ref2 = np.array([[0., 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, 1]), ref2[:, [1]])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2[0, :]), ref2[[0], :])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
+    np.testing.assert_array_equal(
+        m.partial_copy_four_rm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, 1]), ref2[:, [1]])
+    np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2[0, :]), ref2[[0], :])
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
+    np.testing.assert_array_equal(
+        m.partial_copy_four_cm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+
+    # TypeError should be raise for a shape mismatch
+    functions = [m.partial_copy_four_rm_r, m.partial_copy_four_rm_c,
+                 m.partial_copy_four_cm_r, m.partial_copy_four_cm_c]
+    matrix_with_wrong_shape = [[1, 2],
+                               [3, 4]]
+    for f in functions:
+        with pytest.raises(TypeError) as excinfo:
+            f(matrix_with_wrong_shape)
+        assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_mutator_descriptors():
+    zr = np.arange(30, dtype='float32').reshape(5, 6)  # row-major
+    zc = zr.reshape(6, 5).transpose()  # column-major
+
+    m.fixed_mutator_r(zr)
+    m.fixed_mutator_c(zc)
+    m.fixed_mutator_a(zr)
+    m.fixed_mutator_a(zc)
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_r(zc)
+    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6],'
+            ' flags.writeable, flags.c_contiguous]) -> None'
+            in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_c(zr)
+    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6],'
+            ' flags.writeable, flags.f_contiguous]) -> None'
+            in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_a(np.array([[1, 2], [3, 4]], dtype='float32'))
+    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6], flags.writeable]) -> None'
+            in str(excinfo.value))
+    zr.flags.writeable = False
+    with pytest.raises(TypeError):
+        m.fixed_mutator_r(zr)
+    with pytest.raises(TypeError):
+        m.fixed_mutator_a(zr)
+
+
+def test_cpp_casting():
+    assert m.cpp_copy(m.fixed_r()) == 22.
+    assert m.cpp_copy(m.fixed_c()) == 22.
+    z = np.array([[5., 6], [7, 8]])
+    assert m.cpp_copy(z) == 7.
+    assert m.cpp_copy(m.get_cm_ref()) == 21.
+    assert m.cpp_copy(m.get_rm_ref()) == 21.
+    assert m.cpp_ref_c(m.get_cm_ref()) == 21.
+    assert m.cpp_ref_r(m.get_rm_ref()) == 21.
+    with pytest.raises(RuntimeError) as excinfo:
+        # Can't reference m.fixed_c: it contains floats, m.cpp_ref_any wants doubles
+        m.cpp_ref_any(m.fixed_c())
+    assert 'Unable to cast Python instance' in str(excinfo.value)
+    with pytest.raises(RuntimeError) as excinfo:
+        # Can't reference m.fixed_r: it contains floats, m.cpp_ref_any wants doubles
+        m.cpp_ref_any(m.fixed_r())
+    assert 'Unable to cast Python instance' in str(excinfo.value)
+    assert m.cpp_ref_any(m.ReturnTester.create()) == 1.
+
+    assert m.cpp_ref_any(m.get_cm_ref()) == 21.
+    assert m.cpp_ref_any(m.get_cm_ref()) == 21.
+
+
+def test_pass_readonly_array():
+    z = np.full((5, 6), 42.0)
+    z.flags.writeable = False
+    np.testing.assert_array_equal(z, m.fixed_copy_r(z))
+    np.testing.assert_array_equal(m.fixed_r_const(), m.fixed_r())
+    assert not m.fixed_r_const().flags.writeable
+    np.testing.assert_array_equal(m.fixed_copy_r(m.fixed_r_const()), m.fixed_r_const())
+
+
+def test_nonunit_stride_from_python():
+    counting_mat = np.arange(9.0, dtype=np.float32).reshape((3, 3))
+    second_row = counting_mat[1, :]
+    second_col = counting_mat[:, 1]
+    np.testing.assert_array_equal(m.double_row(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_col(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_complex(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_row(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_col(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_complex(second_col), 2.0 * second_col)
+
+    counting_3d = np.arange(27.0, dtype=np.float32).reshape((3, 3, 3))
+    slices = [counting_3d[0, :, :], counting_3d[:, 0, :], counting_3d[:, :, 0]]
+    for ref_mat in slices:
+        np.testing.assert_array_equal(m.double_mat_cm(ref_mat), 2.0 * ref_mat)
+        np.testing.assert_array_equal(m.double_mat_rm(ref_mat), 2.0 * ref_mat)
+
+    # Mutator:
+    m.double_threer(second_row)
+    m.double_threec(second_col)
+    np.testing.assert_array_equal(counting_mat, [[0., 2, 2], [6, 16, 10], [6, 14, 8]])
+
+
+def test_negative_stride_from_python(msg):
+    """Eigen doesn't support (as of yet) negative strides. When a function takes an Eigen matrix by
+    copy or const reference, we can pass a numpy array that has negative strides.  Otherwise, an
+    exception will be thrown as Eigen will not be able to map the numpy array."""
+
+    counting_mat = np.arange(9.0, dtype=np.float32).reshape((3, 3))
+    counting_mat = counting_mat[::-1, ::-1]
+    second_row = counting_mat[1, :]
+    second_col = counting_mat[:, 1]
+    np.testing.assert_array_equal(m.double_row(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_col(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_complex(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_row(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_col(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_complex(second_col), 2.0 * second_col)
+
+    counting_3d = np.arange(27.0, dtype=np.float32).reshape((3, 3, 3))
+    counting_3d = counting_3d[::-1, ::-1, ::-1]
+    slices = [counting_3d[0, :, :], counting_3d[:, 0, :], counting_3d[:, :, 0]]
+    for ref_mat in slices:
+        np.testing.assert_array_equal(m.double_mat_cm(ref_mat), 2.0 * ref_mat)
+        np.testing.assert_array_equal(m.double_mat_rm(ref_mat), 2.0 * ref_mat)
+
+    # Mutator:
+    with pytest.raises(TypeError) as excinfo:
+        m.double_threer(second_row)
+    assert msg(excinfo.value) == """
+        double_threer(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[numpy.float32[1, 3], flags.writeable]) -> None
+
+        Invoked with: """ + repr(np.array([ 5.,  4.,  3.], dtype='float32'))  # noqa: E501 line too long
+
+    with pytest.raises(TypeError) as excinfo:
+        m.double_threec(second_col)
+    assert msg(excinfo.value) == """
+        double_threec(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[numpy.float32[3, 1], flags.writeable]) -> None
+
+        Invoked with: """ + repr(np.array([ 7.,  4.,  1.], dtype='float32'))  # noqa: E501 line too long
+
+
+def test_nonunit_stride_to_python():
+    assert np.all(m.diagonal(ref) == ref.diagonal())
+    assert np.all(m.diagonal_1(ref) == ref.diagonal(1))
+    for i in range(-5, 7):
+        assert np.all(m.diagonal_n(ref, i) == ref.diagonal(i)), "m.diagonal_n({})".format(i)
+
+    assert np.all(m.block(ref, 2, 1, 3, 3) == ref[2:5, 1:4])
+    assert np.all(m.block(ref, 1, 4, 4, 2) == ref[1:, 4:])
+    assert np.all(m.block(ref, 1, 4, 3, 2) == ref[1:4, 4:])
+
+
+def test_eigen_ref_to_python():
+    chols = [m.cholesky1, m.cholesky2, m.cholesky3, m.cholesky4]
+    for i, chol in enumerate(chols, start=1):
+        mymat = chol(np.array([[1., 2, 4], [2, 13, 23], [4, 23, 77]]))
+        assert np.all(mymat == np.array([[1, 0, 0], [2, 3, 0], [4, 5, 6]])), "cholesky{}".format(i)
+
+
+def assign_both(a1, a2, r, c, v):
+    a1[r, c] = v
+    a2[r, c] = v
+
+
+def array_copy_but_one(a, r, c, v):
+    z = np.array(a, copy=True)
+    z[r, c] = v
+    return z
+
+
+def test_eigen_return_references():
+    """Tests various ways of returning references and non-referencing copies"""
+
+    master = np.ones((10, 10))
+    a = m.ReturnTester()
+    a_get1 = a.get()
+    assert not a_get1.flags.owndata and a_get1.flags.writeable
+    assign_both(a_get1, master, 3, 3, 5)
+    a_get2 = a.get_ptr()
+    assert not a_get2.flags.owndata and a_get2.flags.writeable
+    assign_both(a_get1, master, 2, 3, 6)
+
+    a_view1 = a.view()
+    assert not a_view1.flags.owndata and not a_view1.flags.writeable
+    with pytest.raises(ValueError):
+        a_view1[2, 3] = 4
+    a_view2 = a.view_ptr()
+    assert not a_view2.flags.owndata and not a_view2.flags.writeable
+    with pytest.raises(ValueError):
+        a_view2[2, 3] = 4
+
+    a_copy1 = a.copy_get()
+    assert a_copy1.flags.owndata and a_copy1.flags.writeable
+    np.testing.assert_array_equal(a_copy1, master)
+    a_copy1[7, 7] = -44  # Shouldn't affect anything else
+    c1want = array_copy_but_one(master, 7, 7, -44)
+    a_copy2 = a.copy_view()
+    assert a_copy2.flags.owndata and a_copy2.flags.writeable
+    np.testing.assert_array_equal(a_copy2, master)
+    a_copy2[4, 4] = -22  # Shouldn't affect anything else
+    c2want = array_copy_but_one(master, 4, 4, -22)
+
+    a_ref1 = a.ref()
+    assert not a_ref1.flags.owndata and a_ref1.flags.writeable
+    assign_both(a_ref1, master, 1, 1, 15)
+    a_ref2 = a.ref_const()
+    assert not a_ref2.flags.owndata and not a_ref2.flags.writeable
+    with pytest.raises(ValueError):
+        a_ref2[5, 5] = 33
+    a_ref3 = a.ref_safe()
+    assert not a_ref3.flags.owndata and a_ref3.flags.writeable
+    assign_both(a_ref3, master, 0, 7, 99)
+    a_ref4 = a.ref_const_safe()
+    assert not a_ref4.flags.owndata and not a_ref4.flags.writeable
+    with pytest.raises(ValueError):
+        a_ref4[7, 0] = 987654321
+
+    a_copy3 = a.copy_ref()
+    assert a_copy3.flags.owndata and a_copy3.flags.writeable
+    np.testing.assert_array_equal(a_copy3, master)
+    a_copy3[8, 1] = 11
+    c3want = array_copy_but_one(master, 8, 1, 11)
+    a_copy4 = a.copy_ref_const()
+    assert a_copy4.flags.owndata and a_copy4.flags.writeable
+    np.testing.assert_array_equal(a_copy4, master)
+    a_copy4[8, 4] = 88
+    c4want = array_copy_but_one(master, 8, 4, 88)
+
+    a_block1 = a.block(3, 3, 2, 2)
+    assert not a_block1.flags.owndata and a_block1.flags.writeable
+    a_block1[0, 0] = 55
+    master[3, 3] = 55
+    a_block2 = a.block_safe(2, 2, 3, 2)
+    assert not a_block2.flags.owndata and a_block2.flags.writeable
+    a_block2[2, 1] = -123
+    master[4, 3] = -123
+    a_block3 = a.block_const(6, 7, 4, 3)
+    assert not a_block3.flags.owndata and not a_block3.flags.writeable
+    with pytest.raises(ValueError):
+        a_block3[2, 2] = -44444
+
+    a_copy5 = a.copy_block(2, 2, 2, 3)
+    assert a_copy5.flags.owndata and a_copy5.flags.writeable
+    np.testing.assert_array_equal(a_copy5, master[2:4, 2:5])
+    a_copy5[1, 1] = 777
+    c5want = array_copy_but_one(master[2:4, 2:5], 1, 1, 777)
+
+    a_corn1 = a.corners()
+    assert not a_corn1.flags.owndata and a_corn1.flags.writeable
+    a_corn1 *= 50
+    a_corn1[1, 1] = 999
+    master[0, 0] = 50
+    master[0, 9] = 50
+    master[9, 0] = 50
+    master[9, 9] = 999
+    a_corn2 = a.corners_const()
+    assert not a_corn2.flags.owndata and not a_corn2.flags.writeable
+    with pytest.raises(ValueError):
+        a_corn2[1, 0] = 51
+
+    # All of the changes made all the way along should be visible everywhere
+    # now (except for the copies, of course)
+    np.testing.assert_array_equal(a_get1, master)
+    np.testing.assert_array_equal(a_get2, master)
+    np.testing.assert_array_equal(a_view1, master)
+    np.testing.assert_array_equal(a_view2, master)
+    np.testing.assert_array_equal(a_ref1, master)
+    np.testing.assert_array_equal(a_ref2, master)
+    np.testing.assert_array_equal(a_ref3, master)
+    np.testing.assert_array_equal(a_ref4, master)
+    np.testing.assert_array_equal(a_block1, master[3:5, 3:5])
+    np.testing.assert_array_equal(a_block2, master[2:5, 2:4])
+    np.testing.assert_array_equal(a_block3, master[6:10, 7:10])
+    np.testing.assert_array_equal(a_corn1, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
+    np.testing.assert_array_equal(a_corn2, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
+
+    np.testing.assert_array_equal(a_copy1, c1want)
+    np.testing.assert_array_equal(a_copy2, c2want)
+    np.testing.assert_array_equal(a_copy3, c3want)
+    np.testing.assert_array_equal(a_copy4, c4want)
+    np.testing.assert_array_equal(a_copy5, c5want)
+
+
+def assert_keeps_alive(cl, method, *args):
+    cstats = ConstructorStats.get(cl)
+    start_with = cstats.alive()
+    a = cl()
+    assert cstats.alive() == start_with + 1
+    z = method(a, *args)
+    assert cstats.alive() == start_with + 1
+    del a
+    # Here's the keep alive in action:
+    assert cstats.alive() == start_with + 1
+    del z
+    # Keep alive should have expired:
+    assert cstats.alive() == start_with
+
+
+def test_eigen_keepalive():
+    a = m.ReturnTester()
+    cstats = ConstructorStats.get(m.ReturnTester)
+    assert cstats.alive() == 1
+    unsafe = [a.ref(), a.ref_const(), a.block(1, 2, 3, 4)]
+    copies = [a.copy_get(), a.copy_view(), a.copy_ref(), a.copy_ref_const(),
+              a.copy_block(4, 3, 2, 1)]
+    del a
+    assert cstats.alive() == 0
+    del unsafe
+    del copies
+
+    for meth in [m.ReturnTester.get, m.ReturnTester.get_ptr, m.ReturnTester.view,
+                 m.ReturnTester.view_ptr, m.ReturnTester.ref_safe, m.ReturnTester.ref_const_safe,
+                 m.ReturnTester.corners, m.ReturnTester.corners_const]:
+        assert_keeps_alive(m.ReturnTester, meth)
+
+    for meth in [m.ReturnTester.block_safe, m.ReturnTester.block_const]:
+        assert_keeps_alive(m.ReturnTester, meth, 4, 3, 2, 1)
+
+
+def test_eigen_ref_mutators():
+    """Tests Eigen's ability to mutate numpy values"""
+
+    orig = np.array([[1., 2, 3], [4, 5, 6], [7, 8, 9]])
+    zr = np.array(orig)
+    zc = np.array(orig, order='F')
+    m.add_rm(zr, 1, 0, 100)
+    assert np.all(zr == np.array([[1., 2, 3], [104, 5, 6], [7, 8, 9]]))
+    m.add_cm(zc, 1, 0, 200)
+    assert np.all(zc == np.array([[1., 2, 3], [204, 5, 6], [7, 8, 9]]))
+
+    m.add_any(zr, 1, 0, 20)
+    assert np.all(zr == np.array([[1., 2, 3], [124, 5, 6], [7, 8, 9]]))
+    m.add_any(zc, 1, 0, 10)
+    assert np.all(zc == np.array([[1., 2, 3], [214, 5, 6], [7, 8, 9]]))
+
+    # Can't reference a col-major array with a row-major Ref, and vice versa:
+    with pytest.raises(TypeError):
+        m.add_rm(zc, 1, 0, 1)
+    with pytest.raises(TypeError):
+        m.add_cm(zr, 1, 0, 1)
+
+    # Overloads:
+    m.add1(zr, 1, 0, -100)
+    m.add2(zr, 1, 0, -20)
+    assert np.all(zr == orig)
+    m.add1(zc, 1, 0, -200)
+    m.add2(zc, 1, 0, -10)
+    assert np.all(zc == orig)
+
+    # a non-contiguous slice (this won't work on either the row- or
+    # column-contiguous refs, but should work for the any)
+    cornersr = zr[0::2, 0::2]
+    cornersc = zc[0::2, 0::2]
+
+    assert np.all(cornersr == np.array([[1., 3], [7, 9]]))
+    assert np.all(cornersc == np.array([[1., 3], [7, 9]]))
+
+    with pytest.raises(TypeError):
+        m.add_rm(cornersr, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_cm(cornersr, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_rm(cornersc, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_cm(cornersc, 0, 1, 25)
+    m.add_any(cornersr, 0, 1, 25)
+    m.add_any(cornersc, 0, 1, 44)
+    assert np.all(zr == np.array([[1., 2, 28], [4, 5, 6], [7, 8, 9]]))
+    assert np.all(zc == np.array([[1., 2, 47], [4, 5, 6], [7, 8, 9]]))
+
+    # You shouldn't be allowed to pass a non-writeable array to a mutating Eigen method:
+    zro = zr[0:4, 0:4]
+    zro.flags.writeable = False
+    with pytest.raises(TypeError):
+        m.add_rm(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add_any(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add1(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add2(zro, 0, 0, 0)
+
+    # integer array shouldn't be passable to a double-matrix-accepting mutating func:
+    zi = np.array([[1, 2], [3, 4]])
+    with pytest.raises(TypeError):
+        m.add_rm(zi)
+
+
+def test_numpy_ref_mutators():
+    """Tests numpy mutating Eigen matrices (for returned Eigen::Ref<...>s)"""
+
+    m.reset_refs()  # In case another test already changed it
+
+    zc = m.get_cm_ref()
+    zcro = m.get_cm_const_ref()
+    zr = m.get_rm_ref()
+    zrro = m.get_rm_const_ref()
+
+    assert [zc[1, 2], zcro[1, 2], zr[1, 2], zrro[1, 2]] == [23] * 4
+
+    assert not zc.flags.owndata and zc.flags.writeable
+    assert not zr.flags.owndata and zr.flags.writeable
+    assert not zcro.flags.owndata and not zcro.flags.writeable
+    assert not zrro.flags.owndata and not zrro.flags.writeable
+
+    zc[1, 2] = 99
+    expect = np.array([[11., 12, 13], [21, 22, 99], [31, 32, 33]])
+    # We should have just changed zc, of course, but also zcro and the original eigen matrix
+    assert np.all(zc == expect)
+    assert np.all(zcro == expect)
+    assert np.all(m.get_cm_ref() == expect)
+
+    zr[1, 2] = 99
+    assert np.all(zr == expect)
+    assert np.all(zrro == expect)
+    assert np.all(m.get_rm_ref() == expect)
+
+    # Make sure the readonly ones are numpy-readonly:
+    with pytest.raises(ValueError):
+        zcro[1, 2] = 6
+    with pytest.raises(ValueError):
+        zrro[1, 2] = 6
+
+    # We should be able to explicitly copy like this (and since we're copying,
+    # the const should drop away)
+    y1 = np.array(m.get_cm_const_ref())
+
+    assert y1.flags.owndata and y1.flags.writeable
+    # We should get copies of the eigen data, which was modified above:
+    assert y1[1, 2] == 99
+    y1[1, 2] += 12
+    assert y1[1, 2] == 111
+    assert zc[1, 2] == 99  # Make sure we aren't referencing the original
+
+
+def test_both_ref_mutators():
+    """Tests a complex chain of nested eigen/numpy references"""
+
+    m.reset_refs()  # In case another test already changed it
+
+    z = m.get_cm_ref()  # numpy -> eigen
+    z[0, 2] -= 3
+    z2 = m.incr_matrix(z, 1)  # numpy -> eigen -> numpy -> eigen
+    z2[1, 1] += 6
+    z3 = m.incr_matrix(z, 2)  # (numpy -> eigen)^3
+    z3[2, 2] += -5
+    z4 = m.incr_matrix(z, 3)  # (numpy -> eigen)^4
+    z4[1, 1] -= 1
+    z5 = m.incr_matrix(z, 4)  # (numpy -> eigen)^5
+    z5[0, 0] = 0
+    assert np.all(z == z2)
+    assert np.all(z == z3)
+    assert np.all(z == z4)
+    assert np.all(z == z5)
+    expect = np.array([[0., 22, 20], [31, 37, 33], [41, 42, 38]])
+    assert np.all(z == expect)
+
+    y = np.array(range(100), dtype='float64').reshape(10, 10)
+    y2 = m.incr_matrix_any(y, 10)  # np -> eigen -> np
+    y3 = m.incr_matrix_any(y2[0::2, 0::2], -33)  # np -> eigen -> np slice -> np -> eigen -> np
+    y4 = m.even_rows(y3)  # numpy -> eigen slice -> (... y3)
+    y5 = m.even_cols(y4)  # numpy -> eigen slice -> (... y4)
+    y6 = m.incr_matrix_any(y5, 1000)  # numpy -> eigen -> (... y5)
+
+    # Apply same mutations using just numpy:
+    yexpect = np.array(range(100), dtype='float64').reshape(10, 10)
+    yexpect += 10
+    yexpect[0::2, 0::2] -= 33
+    yexpect[0::4, 0::4] += 1000
+    assert np.all(y6 == yexpect[0::4, 0::4])
+    assert np.all(y5 == yexpect[0::4, 0::4])
+    assert np.all(y4 == yexpect[0::4, 0::2])
+    assert np.all(y3 == yexpect[0::2, 0::2])
+    assert np.all(y2 == yexpect)
+    assert np.all(y == yexpect)
+
+
+def test_nocopy_wrapper():
+    # get_elem requires a column-contiguous matrix reference, but should be
+    # callable with other types of matrix (via copying):
+    int_matrix_colmajor = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order='F')
+    dbl_matrix_colmajor = np.array(int_matrix_colmajor, dtype='double', order='F', copy=True)
+    int_matrix_rowmajor = np.array(int_matrix_colmajor, order='C', copy=True)
+    dbl_matrix_rowmajor = np.array(int_matrix_rowmajor, dtype='double', order='C', copy=True)
+
+    # All should be callable via get_elem:
+    assert m.get_elem(int_matrix_colmajor) == 8
+    assert m.get_elem(dbl_matrix_colmajor) == 8
+    assert m.get_elem(int_matrix_rowmajor) == 8
+    assert m.get_elem(dbl_matrix_rowmajor) == 8
+
+    # All but the second should fail with m.get_elem_nocopy:
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(int_matrix_colmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+    assert m.get_elem_nocopy(dbl_matrix_colmajor) == 8
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(int_matrix_rowmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(dbl_matrix_rowmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+
+    # For the row-major test, we take a long matrix in row-major, so only the third is allowed:
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(int_matrix_colmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(dbl_matrix_colmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+    assert m.get_elem_rm_nocopy(int_matrix_rowmajor) == 8
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(dbl_matrix_rowmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+
+
+def test_eigen_ref_life_support():
+    """Ensure the lifetime of temporary arrays created by the `Ref` caster
+
+    The `Ref` caster sometimes creates a copy which needs to stay alive. This needs to
+    happen both for directs casts (just the array) or indirectly (e.g. list of arrays).
+    """
+
+    a = np.full(shape=10, fill_value=8, dtype=np.int8)
+    assert m.get_elem_direct(a) == 8
+
+    list_of_a = [a]
+    assert m.get_elem_indirect(list_of_a) == 8
+
+
+def test_special_matrix_objects():
+    assert np.all(m.incr_diag(7) == np.diag([1., 2, 3, 4, 5, 6, 7]))
+
+    asymm = np.array([[ 1.,  2,  3,  4],
+                      [ 5,  6,  7,  8],
+                      [ 9, 10, 11, 12],
+                      [13, 14, 15, 16]])
+    symm_lower = np.array(asymm)
+    symm_upper = np.array(asymm)
+    for i in range(4):
+        for j in range(i + 1, 4):
+            symm_lower[i, j] = symm_lower[j, i]
+            symm_upper[j, i] = symm_upper[i, j]
+
+    assert np.all(m.symmetric_lower(asymm) == symm_lower)
+    assert np.all(m.symmetric_upper(asymm) == symm_upper)
+
+
+def test_dense_signature(doc):
+    assert doc(m.double_col) == """
+        double_col(arg0: numpy.ndarray[numpy.float32[m, 1]]) -> numpy.ndarray[numpy.float32[m, 1]]
+    """
+    assert doc(m.double_row) == """
+        double_row(arg0: numpy.ndarray[numpy.float32[1, n]]) -> numpy.ndarray[numpy.float32[1, n]]
+    """
+    assert doc(m.double_complex) == ("""
+        double_complex(arg0: numpy.ndarray[numpy.complex64[m, 1]])"""
+                                     """ -> numpy.ndarray[numpy.complex64[m, 1]]
+    """)
+    assert doc(m.double_mat_rm) == ("""
+        double_mat_rm(arg0: numpy.ndarray[numpy.float32[m, n]])"""
+                                    """ -> numpy.ndarray[numpy.float32[m, n]]
+    """)
+
+
+def test_named_arguments():
+    a = np.array([[1.0, 2], [3, 4], [5, 6]])
+    b = np.ones((2, 1))
+
+    assert np.all(m.matrix_multiply(a, b) == np.array([[3.], [7], [11]]))
+    assert np.all(m.matrix_multiply(A=a, B=b) == np.array([[3.], [7], [11]]))
+    assert np.all(m.matrix_multiply(B=b, A=a) == np.array([[3.], [7], [11]]))
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(b, a)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(A=b, B=a)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(B=a, A=b)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+
+def test_sparse():
+    pytest.importorskip("scipy")
+    assert_sparse_equal_ref(m.sparse_r())
+    assert_sparse_equal_ref(m.sparse_c())
+    assert_sparse_equal_ref(m.sparse_copy_r(m.sparse_r()))
+    assert_sparse_equal_ref(m.sparse_copy_c(m.sparse_c()))
+    assert_sparse_equal_ref(m.sparse_copy_r(m.sparse_c()))
+    assert_sparse_equal_ref(m.sparse_copy_c(m.sparse_r()))
+
+
+def test_sparse_signature(doc):
+    pytest.importorskip("scipy")
+    assert doc(m.sparse_copy_r) == """
+        sparse_copy_r(arg0: scipy.sparse.csr_matrix[numpy.float32]) -> scipy.sparse.csr_matrix[numpy.float32]
+    """  # noqa: E501 line too long
+    assert doc(m.sparse_copy_c) == """
+        sparse_copy_c(arg0: scipy.sparse.csc_matrix[numpy.float32]) -> scipy.sparse.csc_matrix[numpy.float32]
+    """  # noqa: E501 line too long
+
+
+def test_issue738():
+    """Ignore strides on a length-1 dimension (even if they would be incompatible length > 1)"""
+    assert np.all(m.iss738_f1(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
+    assert np.all(m.iss738_f1(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
+
+    assert np.all(m.iss738_f2(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
+    assert np.all(m.iss738_f2(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
+
+
+def test_issue1105():
+    """Issue 1105: 1xN or Nx1 input arrays weren't accepted for eigen
+    compile-time row vectors or column vector"""
+    assert m.iss1105_row(np.ones((1, 7)))
+    assert m.iss1105_col(np.ones((7, 1)))
+
+    # These should still fail (incompatible dimensions):
+    with pytest.raises(TypeError) as excinfo:
+        m.iss1105_row(np.ones((7, 1)))
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.iss1105_col(np.ones((1, 7)))
+    assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_custom_operator_new():
+    """Using Eigen types as member variables requires a class-specific
+    operator new with proper alignment"""
+
+    o = m.CustomOperatorNew()
+    np.testing.assert_allclose(o.a, 0.0)
+    np.testing.assert_allclose(o.b.diagonal(), 1.0)
diff --git a/pybind11/tests/test_embed/CMakeLists.txt b/pybind11/tests/test_embed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2e298fa7e44684cfae77184e6218f83b62d90c1c
--- /dev/null
+++ b/pybind11/tests/test_embed/CMakeLists.txt
@@ -0,0 +1,43 @@
+if("${PYTHON_MODULE_EXTENSION}" MATCHES "pypy" OR "${Python_INTERPRETER_ID}" STREQUAL "PyPy")
+  add_custom_target(cpptest) # Dummy target on PyPy. Embedding is not supported.
+  set(_suppress_unused_variable_warning "${DOWNLOAD_CATCH}")
+  return()
+endif()
+
+find_package(Catch 2.13.0)
+
+if(CATCH_FOUND)
+  message(STATUS "Building interpreter tests using Catch v${CATCH_VERSION}")
+else()
+  message(STATUS "Catch not detected. Interpreter tests will be skipped. Install Catch headers"
+                 " manually or use `cmake -DDOWNLOAD_CATCH=ON` to fetch them automatically.")
+  return()
+endif()
+
+find_package(Threads REQUIRED)
+
+add_executable(test_embed catch.cpp test_interpreter.cpp)
+pybind11_enable_warnings(test_embed)
+
+target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
+
+if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
+  file(COPY test_interpreter.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
+endif()
+
+add_custom_target(
+  cpptest
+  COMMAND "$<TARGET_FILE:test_embed>"
+  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+
+pybind11_add_module(external_module THIN_LTO external_module.cpp)
+set_target_properties(external_module PROPERTIES LIBRARY_OUTPUT_DIRECTORY
+                                                 "${CMAKE_CURRENT_BINARY_DIR}")
+foreach(config ${CMAKE_CONFIGURATION_TYPES})
+  string(TOUPPER ${config} config)
+  set_target_properties(external_module PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config}
+                                                   "${CMAKE_CURRENT_BINARY_DIR}")
+endforeach()
+add_dependencies(cpptest external_module)
+
+add_dependencies(check cpptest)
diff --git a/pybind11/tests/test_embed/catch.cpp b/pybind11/tests/test_embed/catch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd137385cb32250b8640169934fb96aa5e80f069
--- /dev/null
+++ b/pybind11/tests/test_embed/catch.cpp
@@ -0,0 +1,22 @@
+// The Catch implementation is compiled here. This is a standalone
+// translation unit to avoid recompiling it for every test change.
+
+#include <pybind11/embed.h>
+
+#ifdef _MSC_VER
+// Silence MSVC C++17 deprecation warning from Catch regarding std::uncaught_exceptions (up to catch
+// 2.0.1; this should be fixed in the next catch release after 2.0.1).
+#  pragma warning(disable: 4996)
+#endif
+
+#define CATCH_CONFIG_RUNNER
+#include <catch.hpp>
+
+namespace py = pybind11;
+
+int main(int argc, char *argv[]) {
+    py::scoped_interpreter guard{};
+    auto result = Catch::Session().run(argc, argv);
+
+    return result < 0xff ? result : 0xff;
+}
diff --git a/pybind11/tests/test_embed/external_module.cpp b/pybind11/tests/test_embed/external_module.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9a6058b179400545479412e5549d7a54f94caeb
--- /dev/null
+++ b/pybind11/tests/test_embed/external_module.cpp
@@ -0,0 +1,23 @@
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+/* Simple test module/test class to check that the referenced internals data of external pybind11
+ * modules aren't preserved over a finalize/initialize.
+ */
+
+PYBIND11_MODULE(external_module, m) {
+    class A {
+    public:
+        A(int value) : v{value} {};
+        int v;
+    };
+
+    py::class_<A>(m, "A")
+        .def(py::init<int>())
+        .def_readwrite("value", &A::v);
+
+    m.def("internals_at", []() {
+        return reinterpret_cast<uintptr_t>(&py::detail::get_internals());
+    });
+}
diff --git a/pybind11/tests/test_embed/test_interpreter.cpp b/pybind11/tests/test_embed/test_interpreter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..222bd565fbffd6484db09876ae9cceabffcb69cd
--- /dev/null
+++ b/pybind11/tests/test_embed/test_interpreter.cpp
@@ -0,0 +1,284 @@
+#include <pybind11/embed.h>
+
+#ifdef _MSC_VER
+// Silence MSVC C++17 deprecation warning from Catch regarding std::uncaught_exceptions (up to catch
+// 2.0.1; this should be fixed in the next catch release after 2.0.1).
+#  pragma warning(disable: 4996)
+#endif
+
+#include <catch.hpp>
+
+#include <thread>
+#include <fstream>
+#include <functional>
+
+namespace py = pybind11;
+using namespace py::literals;
+
+class Widget {
+public:
+    Widget(std::string message) : message(message) { }
+    virtual ~Widget() = default;
+
+    std::string the_message() const { return message; }
+    virtual int the_answer() const = 0;
+
+private:
+    std::string message;
+};
+
+class PyWidget final : public Widget {
+    using Widget::Widget;
+
+    int the_answer() const override { PYBIND11_OVERLOAD_PURE(int, Widget, the_answer); }
+};
+
+PYBIND11_EMBEDDED_MODULE(widget_module, m) {
+    py::class_<Widget, PyWidget>(m, "Widget")
+        .def(py::init<std::string>())
+        .def_property_readonly("the_message", &Widget::the_message);
+
+    m.def("add", [](int i, int j) { return i + j; });
+}
+
+PYBIND11_EMBEDDED_MODULE(throw_exception, ) {
+    throw std::runtime_error("C++ Error");
+}
+
+PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
+    auto d = py::dict();
+    d["missing"].cast<py::object>();
+}
+
+TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
+    auto module = py::module::import("test_interpreter");
+    REQUIRE(py::hasattr(module, "DerivedWidget"));
+
+    auto locals = py::dict("hello"_a="Hello, World!", "x"_a=5, **module.attr("__dict__"));
+    py::exec(R"(
+        widget = DerivedWidget("{} - {}".format(hello, x))
+        message = widget.the_message
+    )", py::globals(), locals);
+    REQUIRE(locals["message"].cast<std::string>() == "Hello, World! - 5");
+
+    auto py_widget = module.attr("DerivedWidget")("The question");
+    auto message = py_widget.attr("the_message");
+    REQUIRE(message.cast<std::string>() == "The question");
+
+    const auto &cpp_widget = py_widget.cast<const Widget &>();
+    REQUIRE(cpp_widget.the_answer() == 42);
+}
+
+TEST_CASE("Import error handling") {
+    REQUIRE_NOTHROW(py::module::import("widget_module"));
+    REQUIRE_THROWS_WITH(py::module::import("throw_exception"),
+                        "ImportError: C++ Error");
+    REQUIRE_THROWS_WITH(py::module::import("throw_error_already_set"),
+                        Catch::Contains("ImportError: KeyError"));
+}
+
+TEST_CASE("There can be only one interpreter") {
+    static_assert(std::is_move_constructible<py::scoped_interpreter>::value, "");
+    static_assert(!std::is_move_assignable<py::scoped_interpreter>::value, "");
+    static_assert(!std::is_copy_constructible<py::scoped_interpreter>::value, "");
+    static_assert(!std::is_copy_assignable<py::scoped_interpreter>::value, "");
+
+    REQUIRE_THROWS_WITH(py::initialize_interpreter(), "The interpreter is already running");
+    REQUIRE_THROWS_WITH(py::scoped_interpreter(), "The interpreter is already running");
+
+    py::finalize_interpreter();
+    REQUIRE_NOTHROW(py::scoped_interpreter());
+    {
+        auto pyi1 = py::scoped_interpreter();
+        auto pyi2 = std::move(pyi1);
+    }
+    py::initialize_interpreter();
+}
+
+bool has_pybind11_internals_builtin() {
+    auto builtins = py::handle(PyEval_GetBuiltins());
+    return builtins.contains(PYBIND11_INTERNALS_ID);
+};
+
+bool has_pybind11_internals_static() {
+    auto **&ipp = py::detail::get_internals_pp();
+    return ipp && *ipp;
+}
+
+TEST_CASE("Restart the interpreter") {
+    // Verify pre-restart state.
+    REQUIRE(py::module::import("widget_module").attr("add")(1, 2).cast<int>() == 3);
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+    REQUIRE(py::module::import("external_module").attr("A")(123).attr("value").cast<int>() == 123);
+
+    // local and foreign module internals should point to the same internals:
+    REQUIRE(reinterpret_cast<uintptr_t>(*py::detail::get_internals_pp()) ==
+            py::module::import("external_module").attr("internals_at")().cast<uintptr_t>());
+
+    // Restart the interpreter.
+    py::finalize_interpreter();
+    REQUIRE(Py_IsInitialized() == 0);
+
+    py::initialize_interpreter();
+    REQUIRE(Py_IsInitialized() == 1);
+
+    // Internals are deleted after a restart.
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+    pybind11::detail::get_internals();
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+    REQUIRE(reinterpret_cast<uintptr_t>(*py::detail::get_internals_pp()) ==
+            py::module::import("external_module").attr("internals_at")().cast<uintptr_t>());
+
+    // Make sure that an interpreter with no get_internals() created until finalize still gets the
+    // internals destroyed
+    py::finalize_interpreter();
+    py::initialize_interpreter();
+    bool ran = false;
+    py::module::import("__main__").attr("internals_destroy_test") =
+        py::capsule(&ran, [](void *ran) { py::detail::get_internals(); *static_cast<bool *>(ran) = true; });
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+    REQUIRE_FALSE(ran);
+    py::finalize_interpreter();
+    REQUIRE(ran);
+    py::initialize_interpreter();
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+
+    // C++ modules can be reloaded.
+    auto cpp_module = py::module::import("widget_module");
+    REQUIRE(cpp_module.attr("add")(1, 2).cast<int>() == 3);
+
+    // C++ type information is reloaded and can be used in python modules.
+    auto py_module = py::module::import("test_interpreter");
+    auto py_widget = py_module.attr("DerivedWidget")("Hello after restart");
+    REQUIRE(py_widget.attr("the_message").cast<std::string>() == "Hello after restart");
+}
+
+TEST_CASE("Subinterpreter") {
+    // Add tags to the modules in the main interpreter and test the basics.
+    py::module::import("__main__").attr("main_tag") = "main interpreter";
+    {
+        auto m = py::module::import("widget_module");
+        m.attr("extension_module_tag") = "added to module in main interpreter";
+
+        REQUIRE(m.attr("add")(1, 2).cast<int>() == 3);
+    }
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    /// Create and switch to a subinterpreter.
+    auto main_tstate = PyThreadState_Get();
+    auto sub_tstate = Py_NewInterpreter();
+
+    // Subinterpreters get their own copy of builtins. detail::get_internals() still
+    // works by returning from the static variable, i.e. all interpreters share a single
+    // global pybind11::internals;
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    // Modules tags should be gone.
+    REQUIRE_FALSE(py::hasattr(py::module::import("__main__"), "tag"));
+    {
+        auto m = py::module::import("widget_module");
+        REQUIRE_FALSE(py::hasattr(m, "extension_module_tag"));
+
+        // Function bindings should still work.
+        REQUIRE(m.attr("add")(1, 2).cast<int>() == 3);
+    }
+
+    // Restore main interpreter.
+    Py_EndInterpreter(sub_tstate);
+    PyThreadState_Swap(main_tstate);
+
+    REQUIRE(py::hasattr(py::module::import("__main__"), "main_tag"));
+    REQUIRE(py::hasattr(py::module::import("widget_module"), "extension_module_tag"));
+}
+
+TEST_CASE("Execution frame") {
+    // When the interpreter is embedded, there is no execution frame, but `py::exec`
+    // should still function by using reasonable globals: `__main__.__dict__`.
+    py::exec("var = dict(number=42)");
+    REQUIRE(py::globals()["var"]["number"].cast<int>() == 42);
+}
+
+TEST_CASE("Threads") {
+    // Restart interpreter to ensure threads are not initialized
+    py::finalize_interpreter();
+    py::initialize_interpreter();
+    REQUIRE_FALSE(has_pybind11_internals_static());
+
+    constexpr auto num_threads = 10;
+    auto locals = py::dict("count"_a=0);
+
+    {
+        py::gil_scoped_release gil_release{};
+        REQUIRE(has_pybind11_internals_static());
+
+        auto threads = std::vector<std::thread>();
+        for (auto i = 0; i < num_threads; ++i) {
+            threads.emplace_back([&]() {
+                py::gil_scoped_acquire gil{};
+                locals["count"] = locals["count"].cast<int>() + 1;
+            });
+        }
+
+        for (auto &thread : threads) {
+            thread.join();
+        }
+    }
+
+    REQUIRE(locals["count"].cast<int>() == num_threads);
+}
+
+// Scope exit utility https://stackoverflow.com/a/36644501/7255855
+struct scope_exit {
+    std::function<void()> f_;
+    explicit scope_exit(std::function<void()> f) noexcept : f_(std::move(f)) {}
+    ~scope_exit() { if (f_) f_(); }
+};
+
+TEST_CASE("Reload module from file") {
+    // Disable generation of cached bytecode (.pyc files) for this test, otherwise
+    // Python might pick up an old version from the cache instead of the new versions
+    // of the .py files generated below
+    auto sys = py::module::import("sys");
+    bool dont_write_bytecode = sys.attr("dont_write_bytecode").cast<bool>();
+    sys.attr("dont_write_bytecode") = true;
+    // Reset the value at scope exit
+    scope_exit reset_dont_write_bytecode([&]() {
+        sys.attr("dont_write_bytecode") = dont_write_bytecode;
+    });
+
+    std::string module_name = "test_module_reload";
+    std::string module_file = module_name + ".py";
+
+    // Create the module .py file
+    std::ofstream test_module(module_file);
+    test_module << "def test():\n";
+    test_module << "    return 1\n";
+    test_module.close();
+    // Delete the file at scope exit
+    scope_exit delete_module_file([&]() {
+        std::remove(module_file.c_str());
+    });
+
+    // Import the module from file
+    auto module = py::module::import(module_name.c_str());
+    int result = module.attr("test")().cast<int>();
+    REQUIRE(result == 1);
+
+    // Update the module .py file with a small change
+    test_module.open(module_file);
+    test_module << "def test():\n";
+    test_module << "    return 2\n";
+    test_module.close();
+
+    // Reload the module
+    module.reload();
+    result = module.attr("test")().cast<int>();
+    REQUIRE(result == 2);
+}
diff --git a/pybind11/tests/test_embed/test_interpreter.py b/pybind11/tests/test_embed/test_interpreter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6174ede446f0356fbdf61aee4136535a78a32479
--- /dev/null
+++ b/pybind11/tests/test_embed/test_interpreter.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+from widget_module import Widget
+
+
+class DerivedWidget(Widget):
+    def __init__(self, message):
+        super(DerivedWidget, self).__init__(message)
+
+    def the_answer(self):
+        return 42
diff --git a/pybind11/tests/test_enum.cpp b/pybind11/tests/test_enum.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3153089208c964346e2fc39cafad8d0b372f1154
--- /dev/null
+++ b/pybind11/tests/test_enum.cpp
@@ -0,0 +1,87 @@
+/*
+    tests/test_enums.cpp -- enumerations
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(enums, m) {
+    // test_unscoped_enum
+    enum UnscopedEnum {
+        EOne = 1,
+        ETwo,
+        EThree
+    };
+    py::enum_<UnscopedEnum>(m, "UnscopedEnum", py::arithmetic(), "An unscoped enumeration")
+        .value("EOne", EOne, "Docstring for EOne")
+        .value("ETwo", ETwo, "Docstring for ETwo")
+        .value("EThree", EThree, "Docstring for EThree")
+        .export_values();
+
+    // test_scoped_enum
+    enum class ScopedEnum {
+        Two = 2,
+        Three
+    };
+    py::enum_<ScopedEnum>(m, "ScopedEnum", py::arithmetic())
+        .value("Two", ScopedEnum::Two)
+        .value("Three", ScopedEnum::Three);
+
+    m.def("test_scoped_enum", [](ScopedEnum z) {
+        return "ScopedEnum::" + std::string(z == ScopedEnum::Two ? "Two" : "Three");
+    });
+
+    // test_binary_operators
+    enum Flags {
+        Read = 4,
+        Write = 2,
+        Execute = 1
+    };
+    py::enum_<Flags>(m, "Flags", py::arithmetic())
+        .value("Read", Flags::Read)
+        .value("Write", Flags::Write)
+        .value("Execute", Flags::Execute)
+        .export_values();
+
+    // test_implicit_conversion
+    class ClassWithUnscopedEnum {
+    public:
+        enum EMode {
+            EFirstMode = 1,
+            ESecondMode
+        };
+
+        static EMode test_function(EMode mode) {
+            return mode;
+        }
+    };
+    py::class_<ClassWithUnscopedEnum> exenum_class(m, "ClassWithUnscopedEnum");
+    exenum_class.def_static("test_function", &ClassWithUnscopedEnum::test_function);
+    py::enum_<ClassWithUnscopedEnum::EMode>(exenum_class, "EMode")
+        .value("EFirstMode", ClassWithUnscopedEnum::EFirstMode)
+        .value("ESecondMode", ClassWithUnscopedEnum::ESecondMode)
+        .export_values();
+
+    // test_enum_to_int
+    m.def("test_enum_to_int", [](int) { });
+    m.def("test_enum_to_uint", [](uint32_t) { });
+    m.def("test_enum_to_long_long", [](long long) { });
+
+    // test_duplicate_enum_name
+    enum SimpleEnum
+    {
+        ONE, TWO, THREE
+    };
+
+    m.def("register_bad_enum", [m]() {
+        py::enum_<SimpleEnum>(m, "SimpleEnum")
+            .value("ONE", SimpleEnum::ONE)          //NOTE: all value function calls are called with the same first parameter value
+            .value("ONE", SimpleEnum::TWO)
+            .value("ONE", SimpleEnum::THREE)
+            .export_values();
+    });
+}
diff --git a/pybind11/tests/test_enum.py b/pybind11/tests/test_enum.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa193e9ba86295e249c20b96a150ce2ca0b88a
--- /dev/null
+++ b/pybind11/tests/test_enum.py
@@ -0,0 +1,207 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import enums as m
+
+
+def test_unscoped_enum():
+    assert str(m.UnscopedEnum.EOne) == "UnscopedEnum.EOne"
+    assert str(m.UnscopedEnum.ETwo) == "UnscopedEnum.ETwo"
+    assert str(m.EOne) == "UnscopedEnum.EOne"
+
+    # name property
+    assert m.UnscopedEnum.EOne.name == "EOne"
+    assert m.UnscopedEnum.ETwo.name == "ETwo"
+    assert m.EOne.name == "EOne"
+    # name readonly
+    with pytest.raises(AttributeError):
+        m.UnscopedEnum.EOne.name = ""
+    # name returns a copy
+    foo = m.UnscopedEnum.EOne.name
+    foo = "bar"
+    assert m.UnscopedEnum.EOne.name == "EOne"
+
+    # __members__ property
+    assert m.UnscopedEnum.__members__ == \
+        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo, "EThree": m.UnscopedEnum.EThree}
+    # __members__ readonly
+    with pytest.raises(AttributeError):
+        m.UnscopedEnum.__members__ = {}
+    # __members__ returns a copy
+    foo = m.UnscopedEnum.__members__
+    foo["bar"] = "baz"
+    assert m.UnscopedEnum.__members__ == \
+        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo, "EThree": m.UnscopedEnum.EThree}
+
+    for docstring_line in '''An unscoped enumeration
+
+Members:
+
+  EOne : Docstring for EOne
+
+  ETwo : Docstring for ETwo
+
+  EThree : Docstring for EThree'''.split('\n'):
+        assert docstring_line in m.UnscopedEnum.__doc__
+
+    # Unscoped enums will accept ==/!= int comparisons
+    y = m.UnscopedEnum.ETwo
+    assert y == 2
+    assert 2 == y
+    assert y != 3
+    assert 3 != y
+    # Compare with None
+    assert (y != None)  # noqa: E711
+    assert not (y == None)  # noqa: E711
+    # Compare with an object
+    assert (y != object())
+    assert not (y == object())
+    # Compare with string
+    assert y != "2"
+    assert "2" != y
+    assert not ("2" == y)
+    assert not (y == "2")
+
+    with pytest.raises(TypeError):
+        y < object()
+
+    with pytest.raises(TypeError):
+        y <= object()
+
+    with pytest.raises(TypeError):
+        y > object()
+
+    with pytest.raises(TypeError):
+        y >= object()
+
+    with pytest.raises(TypeError):
+        y | object()
+
+    with pytest.raises(TypeError):
+        y & object()
+
+    with pytest.raises(TypeError):
+        y ^ object()
+
+    assert int(m.UnscopedEnum.ETwo) == 2
+    assert str(m.UnscopedEnum(2)) == "UnscopedEnum.ETwo"
+
+    # order
+    assert m.UnscopedEnum.EOne < m.UnscopedEnum.ETwo
+    assert m.UnscopedEnum.EOne < 2
+    assert m.UnscopedEnum.ETwo > m.UnscopedEnum.EOne
+    assert m.UnscopedEnum.ETwo > 1
+    assert m.UnscopedEnum.ETwo <= 2
+    assert m.UnscopedEnum.ETwo >= 2
+    assert m.UnscopedEnum.EOne <= m.UnscopedEnum.ETwo
+    assert m.UnscopedEnum.EOne <= 2
+    assert m.UnscopedEnum.ETwo >= m.UnscopedEnum.EOne
+    assert m.UnscopedEnum.ETwo >= 1
+    assert not (m.UnscopedEnum.ETwo < m.UnscopedEnum.EOne)
+    assert not (2 < m.UnscopedEnum.EOne)
+
+    # arithmetic
+    assert m.UnscopedEnum.EOne & m.UnscopedEnum.EThree == m.UnscopedEnum.EOne
+    assert m.UnscopedEnum.EOne | m.UnscopedEnum.ETwo == m.UnscopedEnum.EThree
+    assert m.UnscopedEnum.EOne ^ m.UnscopedEnum.EThree == m.UnscopedEnum.ETwo
+
+
+def test_scoped_enum():
+    assert m.test_scoped_enum(m.ScopedEnum.Three) == "ScopedEnum::Three"
+    z = m.ScopedEnum.Two
+    assert m.test_scoped_enum(z) == "ScopedEnum::Two"
+
+    # Scoped enums will *NOT* accept ==/!= int comparisons (Will always return False)
+    assert not z == 3
+    assert not 3 == z
+    assert z != 3
+    assert 3 != z
+    # Compare with None
+    assert (z != None)  # noqa: E711
+    assert not (z == None)  # noqa: E711
+    # Compare with an object
+    assert (z != object())
+    assert not (z == object())
+    # Scoped enums will *NOT* accept >, <, >= and <= int comparisons (Will throw exceptions)
+    with pytest.raises(TypeError):
+        z > 3
+    with pytest.raises(TypeError):
+        z < 3
+    with pytest.raises(TypeError):
+        z >= 3
+    with pytest.raises(TypeError):
+        z <= 3
+
+    # order
+    assert m.ScopedEnum.Two < m.ScopedEnum.Three
+    assert m.ScopedEnum.Three > m.ScopedEnum.Two
+    assert m.ScopedEnum.Two <= m.ScopedEnum.Three
+    assert m.ScopedEnum.Two <= m.ScopedEnum.Two
+    assert m.ScopedEnum.Two >= m.ScopedEnum.Two
+    assert m.ScopedEnum.Three >= m.ScopedEnum.Two
+
+
+def test_implicit_conversion():
+    assert str(m.ClassWithUnscopedEnum.EMode.EFirstMode) == "EMode.EFirstMode"
+    assert str(m.ClassWithUnscopedEnum.EFirstMode) == "EMode.EFirstMode"
+
+    f = m.ClassWithUnscopedEnum.test_function
+    first = m.ClassWithUnscopedEnum.EFirstMode
+    second = m.ClassWithUnscopedEnum.ESecondMode
+
+    assert f(first) == 1
+
+    assert f(first) == f(first)
+    assert not f(first) != f(first)
+
+    assert f(first) != f(second)
+    assert not f(first) == f(second)
+
+    assert f(first) == int(f(first))
+    assert not f(first) != int(f(first))
+
+    assert f(first) != int(f(second))
+    assert not f(first) == int(f(second))
+
+    # noinspection PyDictCreation
+    x = {f(first): 1, f(second): 2}
+    x[f(first)] = 3
+    x[f(second)] = 4
+    # Hashing test
+    assert str(x) == "{EMode.EFirstMode: 3, EMode.ESecondMode: 4}"
+
+
+def test_binary_operators():
+    assert int(m.Flags.Read) == 4
+    assert int(m.Flags.Write) == 2
+    assert int(m.Flags.Execute) == 1
+    assert int(m.Flags.Read | m.Flags.Write | m.Flags.Execute) == 7
+    assert int(m.Flags.Read | m.Flags.Write) == 6
+    assert int(m.Flags.Read | m.Flags.Execute) == 5
+    assert int(m.Flags.Write | m.Flags.Execute) == 3
+    assert int(m.Flags.Write | 1) == 3
+    assert ~m.Flags.Write == -3
+
+    state = m.Flags.Read | m.Flags.Write
+    assert (state & m.Flags.Read) != 0
+    assert (state & m.Flags.Write) != 0
+    assert (state & m.Flags.Execute) == 0
+    assert (state & 1) == 0
+
+    state2 = ~state
+    assert state2 == -7
+    assert int(state ^ state2) == -1
+
+
+def test_enum_to_int():
+    m.test_enum_to_int(m.Flags.Read)
+    m.test_enum_to_int(m.ClassWithUnscopedEnum.EMode.EFirstMode)
+    m.test_enum_to_uint(m.Flags.Read)
+    m.test_enum_to_uint(m.ClassWithUnscopedEnum.EMode.EFirstMode)
+    m.test_enum_to_long_long(m.Flags.Read)
+    m.test_enum_to_long_long(m.ClassWithUnscopedEnum.EMode.EFirstMode)
+
+
+def test_duplicate_enum_name():
+    with pytest.raises(ValueError) as excinfo:
+        m.register_bad_enum()
+    assert str(excinfo.value) == 'SimpleEnum: element "ONE" already exists!'
diff --git a/pybind11/tests/test_eval.cpp b/pybind11/tests/test_eval.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0948219117df7d8fd64dba3130d36e1307f272b
--- /dev/null
+++ b/pybind11/tests/test_eval.cpp
@@ -0,0 +1,91 @@
+/*
+    tests/test_eval.cpp -- Usage of eval() and eval_file()
+
+    Copyright (c) 2016 Klemens D. Morgenstern
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+
+#include <pybind11/eval.h>
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(eval_, m) {
+    // test_evals
+
+    auto global = py::dict(py::module::import("__main__").attr("__dict__"));
+
+    m.def("test_eval_statements", [global]() {
+        auto local = py::dict();
+        local["call_test"] = py::cpp_function([&]() -> int {
+            return 42;
+        });
+
+        // Regular string literal
+        py::exec(
+            "message = 'Hello World!'\n"
+            "x = call_test()",
+            global, local
+        );
+
+        // Multi-line raw string literal
+        py::exec(R"(
+            if x == 42:
+                print(message)
+            else:
+                raise RuntimeError
+            )", global, local
+        );
+        auto x = local["x"].cast<int>();
+
+        return x == 42;
+    });
+
+    m.def("test_eval", [global]() {
+        auto local = py::dict();
+        local["x"] = py::int_(42);
+        auto x = py::eval("x", global, local);
+        return x.cast<int>() == 42;
+    });
+
+    m.def("test_eval_single_statement", []() {
+        auto local = py::dict();
+        local["call_test"] = py::cpp_function([&]() -> int {
+            return 42;
+        });
+
+        auto result = py::eval<py::eval_single_statement>("x = call_test()", py::dict(), local);
+        auto x = local["x"].cast<int>();
+        return result.is_none() && x == 42;
+    });
+
+    m.def("test_eval_file", [global](py::str filename) {
+        auto local = py::dict();
+        local["y"] = py::int_(43);
+
+        int val_out;
+        local["call_test2"] = py::cpp_function([&](int value) { val_out = value; });
+
+        auto result = py::eval_file(filename, global, local);
+        return val_out == 43 && result.is_none();
+    });
+
+    m.def("test_eval_failure", []() {
+        try {
+            py::eval("nonsense code ...");
+        } catch (py::error_already_set &) {
+            return true;
+        }
+        return false;
+    });
+
+    m.def("test_eval_file_failure", []() {
+        try {
+            py::eval_file("non-existing file");
+        } catch (std::exception &) {
+            return true;
+        }
+        return false;
+    });
+}
diff --git a/pybind11/tests/test_eval.py b/pybind11/tests/test_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6f9d1881db8d3154c73226414fa87f257a20bc8
--- /dev/null
+++ b/pybind11/tests/test_eval.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+import os
+
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import eval_ as m
+
+
+def test_evals(capture):
+    with capture:
+        assert m.test_eval_statements()
+    assert capture == "Hello World!"
+
+    assert m.test_eval()
+    assert m.test_eval_single_statement()
+
+    assert m.test_eval_failure()
+
+
+@pytest.mark.xfail("env.PYPY and not env.PY2", raises=RuntimeError)
+def test_eval_file():
+    filename = os.path.join(os.path.dirname(__file__), "test_eval_call.py")
+    assert m.test_eval_file(filename)
+
+    assert m.test_eval_file_failure()
diff --git a/pybind11/tests/test_eval_call.py b/pybind11/tests/test_eval_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42a0a6d3062777557e23ca40e5881f97b43f6a9
--- /dev/null
+++ b/pybind11/tests/test_eval_call.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# This file is called from 'test_eval.py'
+
+if 'call_test2' in locals():
+    call_test2(y)  # noqa: F821 undefined name
diff --git a/pybind11/tests/test_exceptions.cpp b/pybind11/tests/test_exceptions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..537819d987a46746cf65ccb812c312219fcd41ba
--- /dev/null
+++ b/pybind11/tests/test_exceptions.cpp
@@ -0,0 +1,224 @@
+/*
+    tests/test_custom-exceptions.cpp -- exception translation
+
+    Copyright (c) 2016 Pim Schellart <P.Schellart@princeton.edu>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+// A type that should be raised as an exception in Python
+class MyException : public std::exception {
+public:
+    explicit MyException(const char * m) : message{m} {}
+    virtual const char * what() const noexcept override {return message.c_str();}
+private:
+    std::string message = "";
+};
+
+// A type that should be translated to a standard Python exception
+class MyException2 : public std::exception {
+public:
+    explicit MyException2(const char * m) : message{m} {}
+    virtual const char * what() const noexcept override {return message.c_str();}
+private:
+    std::string message = "";
+};
+
+// A type that is not derived from std::exception (and is thus unknown)
+class MyException3 {
+public:
+    explicit MyException3(const char * m) : message{m} {}
+    virtual const char * what() const noexcept {return message.c_str();}
+private:
+    std::string message = "";
+};
+
+// A type that should be translated to MyException
+// and delegated to its exception translator
+class MyException4 : public std::exception {
+public:
+    explicit MyException4(const char * m) : message{m} {}
+    virtual const char * what() const noexcept override {return message.c_str();}
+private:
+    std::string message = "";
+};
+
+
+// Like the above, but declared via the helper function
+class MyException5 : public std::logic_error {
+public:
+    explicit MyException5(const std::string &what) : std::logic_error(what) {}
+};
+
+// Inherits from MyException5
+class MyException5_1 : public MyException5 {
+    using MyException5::MyException5;
+};
+
+struct PythonCallInDestructor {
+    PythonCallInDestructor(const py::dict &d) : d(d) {}
+    ~PythonCallInDestructor() { d["good"] = true; }
+
+    py::dict d;
+};
+
+
+
+struct PythonAlreadySetInDestructor {
+    PythonAlreadySetInDestructor(const py::str &s) : s(s) {}
+    ~PythonAlreadySetInDestructor() {
+        py::dict foo;
+        try {
+            // Assign to a py::object to force read access of nonexistent dict entry
+            py::object o = foo["bar"];
+        }
+        catch (py::error_already_set& ex) {
+            ex.discard_as_unraisable(s);
+        }
+    }
+
+    py::str s;
+};
+
+
+TEST_SUBMODULE(exceptions, m) {
+    m.def("throw_std_exception", []() {
+        throw std::runtime_error("This exception was intentionally thrown.");
+    });
+
+    // make a new custom exception and use it as a translation target
+    static py::exception<MyException> ex(m, "MyException");
+    py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const MyException &e) {
+            // Set MyException as the active python error
+            ex(e.what());
+        }
+    });
+
+    // register new translator for MyException2
+    // no need to store anything here because this type will
+    // never by visible from Python
+    py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const MyException2 &e) {
+            // Translate this exception to a standard RuntimeError
+            PyErr_SetString(PyExc_RuntimeError, e.what());
+        }
+    });
+
+    // register new translator for MyException4
+    // which will catch it and delegate to the previously registered
+    // translator for MyException by throwing a new exception
+    py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const MyException4 &e) {
+            throw MyException(e.what());
+        }
+    });
+
+    // A simple exception translation:
+    auto ex5 = py::register_exception<MyException5>(m, "MyException5");
+    // A slightly more complicated one that declares MyException5_1 as a subclass of MyException5
+    py::register_exception<MyException5_1>(m, "MyException5_1", ex5.ptr());
+
+    m.def("throws1", []() { throw MyException("this error should go to a custom type"); });
+    m.def("throws2", []() { throw MyException2("this error should go to a standard Python exception"); });
+    m.def("throws3", []() { throw MyException3("this error cannot be translated"); });
+    m.def("throws4", []() { throw MyException4("this error is rethrown"); });
+    m.def("throws5", []() { throw MyException5("this is a helper-defined translated exception"); });
+    m.def("throws5_1", []() { throw MyException5_1("MyException5 subclass"); });
+    m.def("throws_logic_error", []() { throw std::logic_error("this error should fall through to the standard handler"); });
+    m.def("throws_overflow_error", []() {throw std::overflow_error(""); });
+    m.def("exception_matches", []() {
+        py::dict foo;
+        try {
+            // Assign to a py::object to force read access of nonexistent dict entry
+            py::object o = foo["bar"];
+        }
+        catch (py::error_already_set& ex) {
+            if (!ex.matches(PyExc_KeyError)) throw;
+            return true;
+        }
+        return false;
+    });
+    m.def("exception_matches_base", []() {
+        py::dict foo;
+        try {
+            // Assign to a py::object to force read access of nonexistent dict entry
+            py::object o = foo["bar"];
+        }
+        catch (py::error_already_set &ex) {
+            if (!ex.matches(PyExc_Exception)) throw;
+            return true;
+        }
+        return false;
+    });
+    m.def("modulenotfound_exception_matches_base", []() {
+        try {
+            // On Python >= 3.6, this raises a ModuleNotFoundError, a subclass of ImportError
+            py::module::import("nonexistent");
+        }
+        catch (py::error_already_set &ex) {
+            if (!ex.matches(PyExc_ImportError)) throw;
+            return true;
+        }
+        return false;
+    });
+
+    m.def("throw_already_set", [](bool err) {
+        if (err)
+            PyErr_SetString(PyExc_ValueError, "foo");
+        try {
+            throw py::error_already_set();
+        } catch (const std::runtime_error& e) {
+            if ((err && e.what() != std::string("ValueError: foo")) ||
+                (!err && e.what() != std::string("Unknown internal error occurred")))
+            {
+                PyErr_Clear();
+                throw std::runtime_error("error message mismatch");
+            }
+        }
+        PyErr_Clear();
+        if (err)
+            PyErr_SetString(PyExc_ValueError, "foo");
+        throw py::error_already_set();
+    });
+
+    m.def("python_call_in_destructor", [](py::dict d) {
+        try {
+            PythonCallInDestructor set_dict_in_destructor(d);
+            PyErr_SetString(PyExc_ValueError, "foo");
+            throw py::error_already_set();
+        } catch (const py::error_already_set&) {
+            return true;
+        }
+        return false;
+    });
+
+    m.def("python_alreadyset_in_destructor", [](py::str s) {
+        PythonAlreadySetInDestructor alreadyset_in_destructor(s);
+        return true;
+    });
+
+    // test_nested_throws
+    m.def("try_catch", [m](py::object exc_type, py::function f, py::args args) {
+        try { f(*args); }
+        catch (py::error_already_set &ex) {
+            if (ex.matches(exc_type))
+                py::print(ex.what());
+            else
+                throw;
+        }
+    });
+
+    // Test repr that cannot be displayed
+    m.def("simple_bool_passthrough", [](bool x) {return x;});
+
+}
diff --git a/pybind11/tests/test_exceptions.py b/pybind11/tests/test_exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d7088d00b8fec6aeab23f02c2646e3254b53917
--- /dev/null
+++ b/pybind11/tests/test_exceptions.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+import sys
+
+import pytest
+
+from pybind11_tests import exceptions as m
+import pybind11_cross_module_tests as cm
+
+
+def test_std_exception(msg):
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throw_std_exception()
+    assert msg(excinfo.value) == "This exception was intentionally thrown."
+
+
+def test_error_already_set(msg):
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throw_already_set(False)
+    assert msg(excinfo.value) == "Unknown internal error occurred"
+
+    with pytest.raises(ValueError) as excinfo:
+        m.throw_already_set(True)
+    assert msg(excinfo.value) == "foo"
+
+
+def test_cross_module_exceptions():
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.raise_runtime_error()
+    assert str(excinfo.value) == "My runtime error"
+
+    with pytest.raises(ValueError) as excinfo:
+        cm.raise_value_error()
+    assert str(excinfo.value) == "My value error"
+
+    with pytest.raises(ValueError) as excinfo:
+        cm.throw_pybind_value_error()
+    assert str(excinfo.value) == "pybind11 value error"
+
+    with pytest.raises(TypeError) as excinfo:
+        cm.throw_pybind_type_error()
+    assert str(excinfo.value) == "pybind11 type error"
+
+    with pytest.raises(StopIteration) as excinfo:
+        cm.throw_stop_iteration()
+
+
+def test_python_call_in_catch():
+    d = {}
+    assert m.python_call_in_destructor(d) is True
+    assert d["good"] is True
+
+
+def test_python_alreadyset_in_destructor(monkeypatch, capsys):
+    hooked = False
+    triggered = [False]  # mutable, so Python 2.7 closure can modify it
+
+    if hasattr(sys, 'unraisablehook'):  # Python 3.8+
+        hooked = True
+        default_hook = sys.unraisablehook
+
+        def hook(unraisable_hook_args):
+            exc_type, exc_value, exc_tb, err_msg, obj = unraisable_hook_args
+            if obj == 'already_set demo':
+                triggered[0] = True
+            default_hook(unraisable_hook_args)
+            return
+
+        # Use monkeypatch so pytest can apply and remove the patch as appropriate
+        monkeypatch.setattr(sys, 'unraisablehook', hook)
+
+    assert m.python_alreadyset_in_destructor('already_set demo') is True
+    if hooked:
+        assert triggered[0] is True
+
+    _, captured_stderr = capsys.readouterr()
+    # Error message is different in Python 2 and 3, check for words that appear in both
+    assert 'ignored' in captured_stderr and 'already_set demo' in captured_stderr
+
+
+def test_exception_matches():
+    assert m.exception_matches()
+    assert m.exception_matches_base()
+    assert m.modulenotfound_exception_matches_base()
+
+
+def test_custom(msg):
+    # Can we catch a MyException?
+    with pytest.raises(m.MyException) as excinfo:
+        m.throws1()
+    assert msg(excinfo.value) == "this error should go to a custom type"
+
+    # Can we translate to standard Python exceptions?
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throws2()
+    assert msg(excinfo.value) == "this error should go to a standard Python exception"
+
+    # Can we handle unknown exceptions?
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throws3()
+    assert msg(excinfo.value) == "Caught an unknown exception!"
+
+    # Can we delegate to another handler by rethrowing?
+    with pytest.raises(m.MyException) as excinfo:
+        m.throws4()
+    assert msg(excinfo.value) == "this error is rethrown"
+
+    # Can we fall-through to the default handler?
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throws_logic_error()
+    assert msg(excinfo.value) == "this error should fall through to the standard handler"
+
+    # OverFlow error translation.
+    with pytest.raises(OverflowError) as excinfo:
+        m.throws_overflow_error()
+
+    # Can we handle a helper-declared exception?
+    with pytest.raises(m.MyException5) as excinfo:
+        m.throws5()
+    assert msg(excinfo.value) == "this is a helper-defined translated exception"
+
+    # Exception subclassing:
+    with pytest.raises(m.MyException5) as excinfo:
+        m.throws5_1()
+    assert msg(excinfo.value) == "MyException5 subclass"
+    assert isinstance(excinfo.value, m.MyException5_1)
+
+    with pytest.raises(m.MyException5_1) as excinfo:
+        m.throws5_1()
+    assert msg(excinfo.value) == "MyException5 subclass"
+
+    with pytest.raises(m.MyException5) as excinfo:
+        try:
+            m.throws5()
+        except m.MyException5_1:
+            raise RuntimeError("Exception error: caught child from parent")
+    assert msg(excinfo.value) == "this is a helper-defined translated exception"
+
+
+def test_nested_throws(capture):
+    """Tests nested (e.g. C++ -> Python -> C++) exception handling"""
+
+    def throw_myex():
+        raise m.MyException("nested error")
+
+    def throw_myex5():
+        raise m.MyException5("nested error 5")
+
+    # In the comments below, the exception is caught in the first step, thrown in the last step
+
+    # C++ -> Python
+    with capture:
+        m.try_catch(m.MyException5, throw_myex5)
+    assert str(capture).startswith("MyException5: nested error 5")
+
+    # Python -> C++ -> Python
+    with pytest.raises(m.MyException) as excinfo:
+        m.try_catch(m.MyException5, throw_myex)
+    assert str(excinfo.value) == "nested error"
+
+    def pycatch(exctype, f, *args):
+        try:
+            f(*args)
+        except m.MyException as e:
+            print(e)
+
+    # C++ -> Python -> C++ -> Python
+    with capture:
+        m.try_catch(
+            m.MyException5, pycatch, m.MyException, m.try_catch, m.MyException, throw_myex5)
+    assert str(capture).startswith("MyException5: nested error 5")
+
+    # C++ -> Python -> C++
+    with capture:
+        m.try_catch(m.MyException, pycatch, m.MyException5, m.throws4)
+    assert capture == "this error is rethrown"
+
+    # Python -> C++ -> Python -> C++
+    with pytest.raises(m.MyException5) as excinfo:
+        m.try_catch(m.MyException, pycatch, m.MyException, m.throws5)
+    assert str(excinfo.value) == "this is a helper-defined translated exception"
+
+
+# This can often happen if you wrap a pybind11 class in a Python wrapper
+def test_invalid_repr():
+
+    class MyRepr(object):
+        def __repr__(self):
+            raise AttributeError("Example error")
+
+    with pytest.raises(TypeError):
+        m.simple_bool_passthrough(MyRepr())
diff --git a/pybind11/tests/test_factory_constructors.cpp b/pybind11/tests/test_factory_constructors.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..61cf33d16ed404563a3da803a4c2ecea4453a3b4
--- /dev/null
+++ b/pybind11/tests/test_factory_constructors.cpp
@@ -0,0 +1,342 @@
+/*
+    tests/test_factory_constructors.cpp -- tests construction from a factory function
+                                           via py::init_factory()
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <cmath>
+
+// Classes for testing python construction via C++ factory function:
+// Not publicly constructible, copyable, or movable:
+class TestFactory1 {
+    friend class TestFactoryHelper;
+    TestFactory1() : value("(empty)") { print_default_created(this); }
+    TestFactory1(int v) : value(std::to_string(v)) { print_created(this, value); }
+    TestFactory1(std::string v) : value(std::move(v)) { print_created(this, value); }
+    TestFactory1(TestFactory1 &&) = delete;
+    TestFactory1(const TestFactory1 &) = delete;
+    TestFactory1 &operator=(TestFactory1 &&) = delete;
+    TestFactory1 &operator=(const TestFactory1 &) = delete;
+public:
+    std::string value;
+    ~TestFactory1() { print_destroyed(this); }
+};
+// Non-public construction, but moveable:
+class TestFactory2 {
+    friend class TestFactoryHelper;
+    TestFactory2() : value("(empty2)") { print_default_created(this); }
+    TestFactory2(int v) : value(std::to_string(v)) { print_created(this, value); }
+    TestFactory2(std::string v) : value(std::move(v)) { print_created(this, value); }
+public:
+    TestFactory2(TestFactory2 &&m) { value = std::move(m.value); print_move_created(this); }
+    TestFactory2 &operator=(TestFactory2 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    std::string value;
+    ~TestFactory2() { print_destroyed(this); }
+};
+// Mixed direct/factory construction:
+class TestFactory3 {
+protected:
+    friend class TestFactoryHelper;
+    TestFactory3() : value("(empty3)") { print_default_created(this); }
+    TestFactory3(int v) : value(std::to_string(v)) { print_created(this, value); }
+public:
+    TestFactory3(std::string v) : value(std::move(v)) { print_created(this, value); }
+    TestFactory3(TestFactory3 &&m) { value = std::move(m.value); print_move_created(this); }
+    TestFactory3 &operator=(TestFactory3 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    std::string value;
+    virtual ~TestFactory3() { print_destroyed(this); }
+};
+// Inheritance test
+class TestFactory4 : public TestFactory3 {
+public:
+    TestFactory4() : TestFactory3() { print_default_created(this); }
+    TestFactory4(int v) : TestFactory3(v) { print_created(this, v); }
+    virtual ~TestFactory4() { print_destroyed(this); }
+};
+// Another class for an invalid downcast test
+class TestFactory5 : public TestFactory3 {
+public:
+    TestFactory5(int i) : TestFactory3(i) { print_created(this, i); }
+    virtual ~TestFactory5() { print_destroyed(this); }
+};
+
+class TestFactory6 {
+protected:
+    int value;
+    bool alias = false;
+public:
+    TestFactory6(int i) : value{i} { print_created(this, i); }
+    TestFactory6(TestFactory6 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    TestFactory6(const TestFactory6 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
+    virtual ~TestFactory6() { print_destroyed(this); }
+    virtual int get() { return value; }
+    bool has_alias() { return alias; }
+};
+class PyTF6 : public TestFactory6 {
+public:
+    // Special constructor that allows the factory to construct a PyTF6 from a TestFactory6 only
+    // when an alias is needed:
+    PyTF6(TestFactory6 &&base) : TestFactory6(std::move(base)) { alias = true; print_created(this, "move", value); }
+    PyTF6(int i) : TestFactory6(i) { alias = true; print_created(this, i); }
+    PyTF6(PyTF6 &&f) : TestFactory6(std::move(f)) { print_move_created(this); }
+    PyTF6(const PyTF6 &f) : TestFactory6(f) { print_copy_created(this); }
+    PyTF6(std::string s) : TestFactory6((int) s.size()) { alias = true; print_created(this, s); }
+    virtual ~PyTF6() { print_destroyed(this); }
+    int get() override { PYBIND11_OVERLOAD(int, TestFactory6, get, /*no args*/); }
+};
+
+class TestFactory7 {
+protected:
+    int value;
+    bool alias = false;
+public:
+    TestFactory7(int i) : value{i} { print_created(this, i); }
+    TestFactory7(TestFactory7 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    TestFactory7(const TestFactory7 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
+    virtual ~TestFactory7() { print_destroyed(this); }
+    virtual int get() { return value; }
+    bool has_alias() { return alias; }
+};
+class PyTF7 : public TestFactory7 {
+public:
+    PyTF7(int i) : TestFactory7(i) { alias = true; print_created(this, i); }
+    PyTF7(PyTF7 &&f) : TestFactory7(std::move(f)) { print_move_created(this); }
+    PyTF7(const PyTF7 &f) : TestFactory7(f) { print_copy_created(this); }
+    virtual ~PyTF7() { print_destroyed(this); }
+    int get() override { PYBIND11_OVERLOAD(int, TestFactory7, get, /*no args*/); }
+};
+
+
+class TestFactoryHelper {
+public:
+    // Non-movable, non-copyable type:
+    // Return via pointer:
+    static TestFactory1 *construct1() { return new TestFactory1(); }
+    // Holder:
+    static std::unique_ptr<TestFactory1> construct1(int a) { return std::unique_ptr<TestFactory1>(new TestFactory1(a)); }
+    // pointer again
+    static TestFactory1 *construct1_string(std::string a) { return new TestFactory1(a); }
+
+    // Moveable type:
+    // pointer:
+    static TestFactory2 *construct2() { return new TestFactory2(); }
+    // holder:
+    static std::unique_ptr<TestFactory2> construct2(int a) { return std::unique_ptr<TestFactory2>(new TestFactory2(a)); }
+    // by value moving:
+    static TestFactory2 construct2(std::string a) { return TestFactory2(a); }
+
+    // shared_ptr holder type:
+    // pointer:
+    static TestFactory3 *construct3() { return new TestFactory3(); }
+    // holder:
+    static std::shared_ptr<TestFactory3> construct3(int a) { return std::shared_ptr<TestFactory3>(new TestFactory3(a)); }
+};
+
+TEST_SUBMODULE(factory_constructors, m) {
+
+    // Define various trivial types to allow simpler overload resolution:
+    py::module m_tag = m.def_submodule("tag");
+#define MAKE_TAG_TYPE(Name) \
+    struct Name##_tag {}; \
+    py::class_<Name##_tag>(m_tag, #Name "_tag").def(py::init<>()); \
+    m_tag.attr(#Name) = py::cast(Name##_tag{})
+    MAKE_TAG_TYPE(pointer);
+    MAKE_TAG_TYPE(unique_ptr);
+    MAKE_TAG_TYPE(move);
+    MAKE_TAG_TYPE(shared_ptr);
+    MAKE_TAG_TYPE(derived);
+    MAKE_TAG_TYPE(TF4);
+    MAKE_TAG_TYPE(TF5);
+    MAKE_TAG_TYPE(null_ptr);
+    MAKE_TAG_TYPE(null_unique_ptr);
+    MAKE_TAG_TYPE(null_shared_ptr);
+    MAKE_TAG_TYPE(base);
+    MAKE_TAG_TYPE(invalid_base);
+    MAKE_TAG_TYPE(alias);
+    MAKE_TAG_TYPE(unaliasable);
+    MAKE_TAG_TYPE(mixed);
+
+    // test_init_factory_basic, test_bad_type
+    py::class_<TestFactory1>(m, "TestFactory1")
+        .def(py::init([](unique_ptr_tag, int v) { return TestFactoryHelper::construct1(v); }))
+        .def(py::init(&TestFactoryHelper::construct1_string)) // raw function pointer
+        .def(py::init([](pointer_tag) { return TestFactoryHelper::construct1(); }))
+        .def(py::init([](py::handle, int v, py::handle) { return TestFactoryHelper::construct1(v); }))
+        .def_readwrite("value", &TestFactory1::value)
+        ;
+    py::class_<TestFactory2>(m, "TestFactory2")
+        .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct2(v); }))
+        .def(py::init([](unique_ptr_tag, std::string v) { return TestFactoryHelper::construct2(v); }))
+        .def(py::init([](move_tag) { return TestFactoryHelper::construct2(); }))
+        .def_readwrite("value", &TestFactory2::value)
+        ;
+
+    // Stateful & reused:
+    int c = 1;
+    auto c4a = [c](pointer_tag, TF4_tag, int a) { (void) c; return new TestFactory4(a);};
+
+    // test_init_factory_basic, test_init_factory_casting
+    py::class_<TestFactory3, std::shared_ptr<TestFactory3>>(m, "TestFactory3")
+        .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct3(v); }))
+        .def(py::init([](shared_ptr_tag) { return TestFactoryHelper::construct3(); }))
+        .def("__init__", [](TestFactory3 &self, std::string v) { new (&self) TestFactory3(v); }) // placement-new ctor
+
+        // factories returning a derived type:
+        .def(py::init(c4a)) // derived ptr
+        .def(py::init([](pointer_tag, TF5_tag, int a) { return new TestFactory5(a); }))
+        // derived shared ptr:
+        .def(py::init([](shared_ptr_tag, TF4_tag, int a) { return std::make_shared<TestFactory4>(a); }))
+        .def(py::init([](shared_ptr_tag, TF5_tag, int a) { return std::make_shared<TestFactory5>(a); }))
+
+        // Returns nullptr:
+        .def(py::init([](null_ptr_tag) { return (TestFactory3 *) nullptr; }))
+        .def(py::init([](null_unique_ptr_tag) { return std::unique_ptr<TestFactory3>(); }))
+        .def(py::init([](null_shared_ptr_tag) { return std::shared_ptr<TestFactory3>(); }))
+
+        .def_readwrite("value", &TestFactory3::value)
+        ;
+
+    // test_init_factory_casting
+    py::class_<TestFactory4, TestFactory3, std::shared_ptr<TestFactory4>>(m, "TestFactory4")
+        .def(py::init(c4a)) // pointer
+        ;
+
+    // Doesn't need to be registered, but registering makes getting ConstructorStats easier:
+    py::class_<TestFactory5, TestFactory3, std::shared_ptr<TestFactory5>>(m, "TestFactory5");
+
+    // test_init_factory_alias
+    // Alias testing
+    py::class_<TestFactory6, PyTF6>(m, "TestFactory6")
+        .def(py::init([](base_tag, int i) { return TestFactory6(i); }))
+        .def(py::init([](alias_tag, int i) { return PyTF6(i); }))
+        .def(py::init([](alias_tag, std::string s) { return PyTF6(s); }))
+        .def(py::init([](alias_tag, pointer_tag, int i) { return new PyTF6(i); }))
+        .def(py::init([](base_tag, pointer_tag, int i) { return new TestFactory6(i); }))
+        .def(py::init([](base_tag, alias_tag, pointer_tag, int i) { return (TestFactory6 *) new PyTF6(i); }))
+
+        .def("get", &TestFactory6::get)
+        .def("has_alias", &TestFactory6::has_alias)
+
+        .def_static("get_cstats", &ConstructorStats::get<TestFactory6>, py::return_value_policy::reference)
+        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF6>, py::return_value_policy::reference)
+        ;
+
+    // test_init_factory_dual
+    // Separate alias constructor testing
+    py::class_<TestFactory7, PyTF7, std::shared_ptr<TestFactory7>>(m, "TestFactory7")
+        .def(py::init(
+            [](int i) { return TestFactory7(i); },
+            [](int i) { return PyTF7(i); }))
+        .def(py::init(
+            [](pointer_tag, int i) { return new TestFactory7(i); },
+            [](pointer_tag, int i) { return new PyTF7(i); }))
+        .def(py::init(
+            [](mixed_tag, int i) { return new TestFactory7(i); },
+            [](mixed_tag, int i) { return PyTF7(i); }))
+        .def(py::init(
+            [](mixed_tag, std::string s) { return TestFactory7((int) s.size()); },
+            [](mixed_tag, std::string s) { return new PyTF7((int) s.size()); }))
+        .def(py::init(
+            [](base_tag, pointer_tag, int i) { return new TestFactory7(i); },
+            [](base_tag, pointer_tag, int i) { return (TestFactory7 *) new PyTF7(i); }))
+        .def(py::init(
+            [](alias_tag, pointer_tag, int i) { return new PyTF7(i); },
+            [](alias_tag, pointer_tag, int i) { return new PyTF7(10*i); }))
+        .def(py::init(
+            [](shared_ptr_tag, base_tag, int i) { return std::make_shared<TestFactory7>(i); },
+            [](shared_ptr_tag, base_tag, int i) { auto *p = new PyTF7(i); return std::shared_ptr<TestFactory7>(p); }))
+        .def(py::init(
+            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); },
+            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); })) // <-- invalid alias factory
+
+        .def("get", &TestFactory7::get)
+        .def("has_alias", &TestFactory7::has_alias)
+
+        .def_static("get_cstats", &ConstructorStats::get<TestFactory7>, py::return_value_policy::reference)
+        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF7>, py::return_value_policy::reference)
+        ;
+
+    // test_placement_new_alternative
+    // Class with a custom new operator but *without* a placement new operator (issue #948)
+    class NoPlacementNew {
+    public:
+        NoPlacementNew(int i) : i(i) { }
+        static void *operator new(std::size_t s) {
+            auto *p = ::operator new(s);
+            py::print("operator new called, returning", reinterpret_cast<uintptr_t>(p));
+            return p;
+        }
+        static void operator delete(void *p) {
+            py::print("operator delete called on", reinterpret_cast<uintptr_t>(p));
+            ::operator delete(p);
+        }
+        int i;
+    };
+    // As of 2.2, `py::init<args>` no longer requires placement new
+    py::class_<NoPlacementNew>(m, "NoPlacementNew")
+        .def(py::init<int>())
+        .def(py::init([]() { return new NoPlacementNew(100); }))
+        .def_readwrite("i", &NoPlacementNew::i)
+        ;
+
+
+    // test_reallocations
+    // Class that has verbose operator_new/operator_delete calls
+    struct NoisyAlloc {
+        NoisyAlloc(const NoisyAlloc &) = default;
+        NoisyAlloc(int i) { py::print(py::str("NoisyAlloc(int {})").format(i)); }
+        NoisyAlloc(double d) { py::print(py::str("NoisyAlloc(double {})").format(d)); }
+        ~NoisyAlloc() { py::print("~NoisyAlloc()"); }
+
+        static void *operator new(size_t s) { py::print("noisy new"); return ::operator new(s); }
+        static void *operator new(size_t, void *p) { py::print("noisy placement new"); return p; }
+        static void operator delete(void *p, size_t) { py::print("noisy delete"); ::operator delete(p); }
+        static void operator delete(void *, void *) { py::print("noisy placement delete"); }
+#if defined(_MSC_VER) && _MSC_VER < 1910
+        // MSVC 2015 bug: the above "noisy delete" isn't invoked (fixed in MSVC 2017)
+        static void operator delete(void *p) { py::print("noisy delete"); ::operator delete(p); }
+#endif
+    };
+    py::class_<NoisyAlloc>(m, "NoisyAlloc")
+        // Since these overloads have the same number of arguments, the dispatcher will try each of
+        // them until the arguments convert.  Thus we can get a pre-allocation here when passing a
+        // single non-integer:
+        .def("__init__", [](NoisyAlloc *a, int i) { new (a) NoisyAlloc(i); }) // Regular constructor, runs first, requires preallocation
+        .def(py::init([](double d) { return new NoisyAlloc(d); }))
+
+        // The two-argument version: first the factory pointer overload.
+        .def(py::init([](int i, int) { return new NoisyAlloc(i); }))
+        // Return-by-value:
+        .def(py::init([](double d, int) { return NoisyAlloc(d); }))
+        // Old-style placement new init; requires preallocation
+        .def("__init__", [](NoisyAlloc &a, double d, double) { new (&a) NoisyAlloc(d); })
+        // Requires deallocation of previous overload preallocated value:
+        .def(py::init([](int i, double) { return new NoisyAlloc(i); }))
+        // Regular again: requires yet another preallocation
+        .def("__init__", [](NoisyAlloc &a, int i, std::string) { new (&a) NoisyAlloc(i); })
+        ;
+
+
+
+
+    // static_assert testing (the following def's should all fail with appropriate compilation errors):
+#if 0
+    struct BadF1Base {};
+    struct BadF1 : BadF1Base {};
+    struct PyBadF1 : BadF1 {};
+    py::class_<BadF1, PyBadF1, std::shared_ptr<BadF1>> bf1(m, "BadF1");
+    // wrapped factory function must return a compatible pointer, holder, or value
+    bf1.def(py::init([]() { return 3; }));
+    // incompatible factory function pointer return type
+    bf1.def(py::init([]() { static int three = 3; return &three; }));
+    // incompatible factory function std::shared_ptr<T> return type: cannot convert shared_ptr<T> to holder
+    // (non-polymorphic base)
+    bf1.def(py::init([]() { return std::shared_ptr<BadF1Base>(new BadF1()); }));
+#endif
+}
diff --git a/pybind11/tests/test_factory_constructors.py b/pybind11/tests/test_factory_constructors.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c4bed165f6575950b7f0f17ec65a88397e0ff54
--- /dev/null
+++ b/pybind11/tests/test_factory_constructors.py
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+import pytest
+import re
+
+import env  # noqa: F401
+
+from pybind11_tests import factory_constructors as m
+from pybind11_tests.factory_constructors import tag
+from pybind11_tests import ConstructorStats
+
+
+def test_init_factory_basic():
+    """Tests py::init_factory() wrapper around various ways of returning the object"""
+
+    cstats = [ConstructorStats.get(c) for c in [m.TestFactory1, m.TestFactory2, m.TestFactory3]]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    x1 = m.TestFactory1(tag.unique_ptr, 3)
+    assert x1.value == "3"
+    y1 = m.TestFactory1(tag.pointer)
+    assert y1.value == "(empty)"
+    z1 = m.TestFactory1("hi!")
+    assert z1.value == "hi!"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+
+    x2 = m.TestFactory2(tag.move)
+    assert x2.value == "(empty2)"
+    y2 = m.TestFactory2(tag.pointer, 7)
+    assert y2.value == "7"
+    z2 = m.TestFactory2(tag.unique_ptr, "hi again")
+    assert z2.value == "hi again"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 6
+
+    x3 = m.TestFactory3(tag.shared_ptr)
+    assert x3.value == "(empty3)"
+    y3 = m.TestFactory3(tag.pointer, 42)
+    assert y3.value == "42"
+    z3 = m.TestFactory3("bye")
+    assert z3.value == "bye"
+
+    for null_ptr_kind in [tag.null_ptr,
+                          tag.null_unique_ptr,
+                          tag.null_shared_ptr]:
+        with pytest.raises(TypeError) as excinfo:
+            m.TestFactory3(null_ptr_kind)
+        assert str(excinfo.value) == "pybind11::init(): factory function returned nullptr"
+
+    assert [i.alive() for i in cstats] == [3, 3, 3]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 9
+
+    del x1, y2, y3, z3
+    assert [i.alive() for i in cstats] == [2, 2, 1]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 5
+    del x2, x3, y1, z1, z2
+    assert [i.alive() for i in cstats] == [0, 0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["3", "hi!"],
+        ["7", "hi again"],
+        ["42", "bye"]
+    ]
+    assert [i.default_constructions for i in cstats] == [1, 1, 1]
+
+
+def test_init_factory_signature(msg):
+    with pytest.raises(TypeError) as excinfo:
+        m.TestFactory1("invalid", "constructor", "arguments")
+    assert msg(excinfo.value) == """
+        __init__(): incompatible constructor arguments. The following argument types are supported:
+            1. m.factory_constructors.TestFactory1(arg0: m.factory_constructors.tag.unique_ptr_tag, arg1: int)
+            2. m.factory_constructors.TestFactory1(arg0: str)
+            3. m.factory_constructors.TestFactory1(arg0: m.factory_constructors.tag.pointer_tag)
+            4. m.factory_constructors.TestFactory1(arg0: handle, arg1: int, arg2: handle)
+
+        Invoked with: 'invalid', 'constructor', 'arguments'
+    """  # noqa: E501 line too long
+
+    assert msg(m.TestFactory1.__init__.__doc__) == """
+        __init__(*args, **kwargs)
+        Overloaded function.
+
+        1. __init__(self: m.factory_constructors.TestFactory1, arg0: m.factory_constructors.tag.unique_ptr_tag, arg1: int) -> None
+
+        2. __init__(self: m.factory_constructors.TestFactory1, arg0: str) -> None
+
+        3. __init__(self: m.factory_constructors.TestFactory1, arg0: m.factory_constructors.tag.pointer_tag) -> None
+
+        4. __init__(self: m.factory_constructors.TestFactory1, arg0: handle, arg1: int, arg2: handle) -> None
+    """  # noqa: E501 line too long
+
+
+def test_init_factory_casting():
+    """Tests py::init_factory() wrapper with various upcasting and downcasting returns"""
+
+    cstats = [ConstructorStats.get(c) for c in [m.TestFactory3, m.TestFactory4, m.TestFactory5]]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    # Construction from derived references:
+    a = m.TestFactory3(tag.pointer, tag.TF4, 4)
+    assert a.value == "4"
+    b = m.TestFactory3(tag.shared_ptr, tag.TF4, 5)
+    assert b.value == "5"
+    c = m.TestFactory3(tag.pointer, tag.TF5, 6)
+    assert c.value == "6"
+    d = m.TestFactory3(tag.shared_ptr, tag.TF5, 7)
+    assert d.value == "7"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+
+    # Shared a lambda with TF3:
+    e = m.TestFactory4(tag.pointer, tag.TF4, 8)
+    assert e.value == "8"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 5
+    assert [i.alive() for i in cstats] == [5, 3, 2]
+
+    del a
+    assert [i.alive() for i in cstats] == [4, 2, 2]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+
+    del b, c, e
+    assert [i.alive() for i in cstats] == [1, 0, 1]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 1
+
+    del d
+    assert [i.alive() for i in cstats] == [0, 0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["4", "5", "6", "7", "8"],
+        ["4", "5", "8"],
+        ["6", "7"]
+    ]
+
+
+def test_init_factory_alias():
+    """Tests py::init_factory() wrapper with value conversions and alias types"""
+
+    cstats = [m.TestFactory6.get_cstats(), m.TestFactory6.get_alias_cstats()]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    a = m.TestFactory6(tag.base, 1)
+    assert a.get() == 1
+    assert not a.has_alias()
+    b = m.TestFactory6(tag.alias, "hi there")
+    assert b.get() == 8
+    assert b.has_alias()
+    c = m.TestFactory6(tag.alias, 3)
+    assert c.get() == 3
+    assert c.has_alias()
+    d = m.TestFactory6(tag.alias, tag.pointer, 4)
+    assert d.get() == 4
+    assert d.has_alias()
+    e = m.TestFactory6(tag.base, tag.pointer, 5)
+    assert e.get() == 5
+    assert not e.has_alias()
+    f = m.TestFactory6(tag.base, tag.alias, tag.pointer, 6)
+    assert f.get() == 6
+    assert f.has_alias()
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 6
+    assert [i.alive() for i in cstats] == [6, 4]
+
+    del a, b, e
+    assert [i.alive() for i in cstats] == [3, 3]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    del f, c, d
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    class MyTest(m.TestFactory6):
+        def __init__(self, *args):
+            m.TestFactory6.__init__(self, *args)
+
+        def get(self):
+            return -5 + m.TestFactory6.get(self)
+
+    # Return Class by value, moved into new alias:
+    z = MyTest(tag.base, 123)
+    assert z.get() == 118
+    assert z.has_alias()
+
+    # Return alias by value, moved into new alias:
+    y = MyTest(tag.alias, "why hello!")
+    assert y.get() == 5
+    assert y.has_alias()
+
+    # Return Class by pointer, moved into new alias then original destroyed:
+    x = MyTest(tag.base, tag.pointer, 47)
+    assert x.get() == 42
+    assert x.has_alias()
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    assert [i.alive() for i in cstats] == [3, 3]
+    del x, y, z
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["1", "8", "3", "4", "5", "6", "123", "10", "47"],
+        ["hi there", "3", "4", "6", "move", "123", "why hello!", "move", "47"]
+    ]
+
+
+def test_init_factory_dual():
+    """Tests init factory functions with dual main/alias factory functions"""
+    from pybind11_tests.factory_constructors import TestFactory7
+
+    cstats = [TestFactory7.get_cstats(), TestFactory7.get_alias_cstats()]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    class PythFactory7(TestFactory7):
+        def get(self):
+            return 100 + TestFactory7.get(self)
+
+    a1 = TestFactory7(1)
+    a2 = PythFactory7(2)
+    assert a1.get() == 1
+    assert a2.get() == 102
+    assert not a1.has_alias()
+    assert a2.has_alias()
+
+    b1 = TestFactory7(tag.pointer, 3)
+    b2 = PythFactory7(tag.pointer, 4)
+    assert b1.get() == 3
+    assert b2.get() == 104
+    assert not b1.has_alias()
+    assert b2.has_alias()
+
+    c1 = TestFactory7(tag.mixed, 5)
+    c2 = PythFactory7(tag.mixed, 6)
+    assert c1.get() == 5
+    assert c2.get() == 106
+    assert not c1.has_alias()
+    assert c2.has_alias()
+
+    d1 = TestFactory7(tag.base, tag.pointer, 7)
+    d2 = PythFactory7(tag.base, tag.pointer, 8)
+    assert d1.get() == 7
+    assert d2.get() == 108
+    assert not d1.has_alias()
+    assert d2.has_alias()
+
+    # Both return an alias; the second multiplies the value by 10:
+    e1 = TestFactory7(tag.alias, tag.pointer, 9)
+    e2 = PythFactory7(tag.alias, tag.pointer, 10)
+    assert e1.get() == 9
+    assert e2.get() == 200
+    assert e1.has_alias()
+    assert e2.has_alias()
+
+    f1 = TestFactory7(tag.shared_ptr, tag.base, 11)
+    f2 = PythFactory7(tag.shared_ptr, tag.base, 12)
+    assert f1.get() == 11
+    assert f2.get() == 112
+    assert not f1.has_alias()
+    assert f2.has_alias()
+
+    g1 = TestFactory7(tag.shared_ptr, tag.invalid_base, 13)
+    assert g1.get() == 13
+    assert not g1.has_alias()
+    with pytest.raises(TypeError) as excinfo:
+        PythFactory7(tag.shared_ptr, tag.invalid_base, 14)
+    assert (str(excinfo.value) ==
+            "pybind11::init(): construction failed: returned holder-wrapped instance is not an "
+            "alias instance")
+
+    assert [i.alive() for i in cstats] == [13, 7]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 13
+
+    del a1, a2, b1, d1, e1, e2
+    assert [i.alive() for i in cstats] == [7, 4]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 7
+    del b2, c1, c2, d2, f1, f2, g1
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["1", "2", "3", "4", "5", "6", "7", "8", "9", "100", "11", "12", "13", "14"],
+        ["2", "4", "6", "8", "9", "100", "12"]
+    ]
+
+
+def test_no_placement_new(capture):
+    """Prior to 2.2, `py::init<...>` relied on the type supporting placement
+    new; this tests a class without placement new support."""
+    with capture:
+        a = m.NoPlacementNew(123)
+
+    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    assert found
+    assert a.i == 123
+    with capture:
+        del a
+        pytest.gc_collect()
+    assert capture == "operator delete called on " + found.group(1)
+
+    with capture:
+        b = m.NoPlacementNew()
+
+    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    assert found
+    assert b.i == 100
+    with capture:
+        del b
+        pytest.gc_collect()
+    assert capture == "operator delete called on " + found.group(1)
+
+
+def test_multiple_inheritance():
+    class MITest(m.TestFactory1, m.TestFactory2):
+        def __init__(self):
+            m.TestFactory1.__init__(self, tag.unique_ptr, 33)
+            m.TestFactory2.__init__(self, tag.move)
+
+    a = MITest()
+    assert m.TestFactory1.value.fget(a) == "33"
+    assert m.TestFactory2.value.fget(a) == "(empty2)"
+
+
+def create_and_destroy(*args):
+    a = m.NoisyAlloc(*args)
+    print("---")
+    del a
+    pytest.gc_collect()
+
+
+def strip_comments(s):
+    return re.sub(r'\s+#.*', '', s)
+
+
+def test_reallocations(capture, msg):
+    """When the constructor is overloaded, previous overloads can require a preallocated value.
+    This test makes sure that such preallocated values only happen when they might be necessary,
+    and that they are deallocated properly"""
+
+    pytest.gc_collect()
+
+    with capture:
+        create_and_destroy(1)
+    assert msg(capture) == """
+        noisy new
+        noisy placement new
+        NoisyAlloc(int 1)
+        ---
+        ~NoisyAlloc()
+        noisy delete
+    """
+    with capture:
+        create_and_destroy(1.5)
+    assert msg(capture) == strip_comments("""
+        noisy new               # allocation required to attempt first overload
+        noisy delete            # have to dealloc before considering factory init overload
+        noisy new               # pointer factory calling "new", part 1: allocation
+        NoisyAlloc(double 1.5)  # ... part two, invoking constructor
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(2, 3)
+    assert msg(capture) == strip_comments("""
+        noisy new          # pointer factory calling "new", allocation
+        NoisyAlloc(int 2)  # constructor
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(2.5, 3)
+    assert msg(capture) == strip_comments("""
+        NoisyAlloc(double 2.5)  # construction (local func variable: operator_new not called)
+        noisy new               # return-by-value "new" part 1: allocation
+        ~NoisyAlloc()           # moved-away local func variable destruction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(3.5, 4.5)
+    assert msg(capture) == strip_comments("""
+        noisy new               # preallocation needed before invoking placement-new overload
+        noisy placement new     # Placement new
+        NoisyAlloc(double 3.5)  # construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(4, 0.5)
+    assert msg(capture) == strip_comments("""
+        noisy new          # preallocation needed before invoking placement-new overload
+        noisy delete       # deallocation of preallocated storage
+        noisy new          # Factory pointer allocation
+        NoisyAlloc(int 4)  # factory pointer construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(5, "hi")
+    assert msg(capture) == strip_comments("""
+        noisy new            # preallocation needed before invoking first placement new
+        noisy delete         # delete before considering new-style constructor
+        noisy new            # preallocation for second placement new
+        noisy placement new  # Placement new in the second placement new overload
+        NoisyAlloc(int 5)    # construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+
+@pytest.mark.skipif("env.PY2")
+def test_invalid_self():
+    """Tests invocation of the pybind-registered base class with an invalid `self` argument.  You
+    can only actually do this on Python 3: Python 2 raises an exception itself if you try."""
+    class NotPybindDerived(object):
+        pass
+
+    # Attempts to initialize with an invalid type passed as `self`:
+    class BrokenTF1(m.TestFactory1):
+        def __init__(self, bad):
+            if bad == 1:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory1.__init__(a, tag.pointer)
+            elif bad == 2:
+                a = NotPybindDerived()
+                m.TestFactory1.__init__(a, tag.pointer)
+
+    # Same as above, but for a class with an alias:
+    class BrokenTF6(m.TestFactory6):
+        def __init__(self, bad):
+            if bad == 1:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory6.__init__(a, tag.base, 1)
+            elif bad == 2:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory6.__init__(a, tag.alias, 1)
+            elif bad == 3:
+                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.base, 1)
+            elif bad == 4:
+                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.alias, 1)
+
+    for arg in (1, 2):
+        with pytest.raises(TypeError) as excinfo:
+            BrokenTF1(arg)
+        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
+
+    for arg in (1, 2, 3, 4):
+        with pytest.raises(TypeError) as excinfo:
+            BrokenTF6(arg)
+        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
diff --git a/pybind11/tests/test_gil_scoped.cpp b/pybind11/tests/test_gil_scoped.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dc9b7ed2243a832c19d6826836ac579456232441
--- /dev/null
+++ b/pybind11/tests/test_gil_scoped.cpp
@@ -0,0 +1,54 @@
+/*
+    tests/test_gil_scoped.cpp -- acquire and release gil
+
+    Copyright (c) 2017 Borja Zarco (Google LLC) <bzarco@google.com>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/functional.h>
+
+
+class VirtClass  {
+public:
+    virtual ~VirtClass() = default;
+    VirtClass() = default;
+    VirtClass(const VirtClass&) = delete;
+    virtual void virtual_func() {}
+    virtual void pure_virtual_func() = 0;
+};
+
+class PyVirtClass : public VirtClass {
+    void virtual_func() override {
+        PYBIND11_OVERLOAD(void, VirtClass, virtual_func,);
+    }
+    void pure_virtual_func() override {
+        PYBIND11_OVERLOAD_PURE(void, VirtClass, pure_virtual_func,);
+    }
+};
+
+TEST_SUBMODULE(gil_scoped, m) {
+  py::class_<VirtClass, PyVirtClass>(m, "VirtClass")
+      .def(py::init<>())
+      .def("virtual_func", &VirtClass::virtual_func)
+      .def("pure_virtual_func", &VirtClass::pure_virtual_func);
+
+    m.def("test_callback_py_obj",
+          [](py::object func) { func(); });
+    m.def("test_callback_std_func",
+          [](const std::function<void()> &func) { func(); });
+    m.def("test_callback_virtual_func",
+          [](VirtClass &virt) { virt.virtual_func(); });
+    m.def("test_callback_pure_virtual_func",
+          [](VirtClass &virt) { virt.pure_virtual_func(); });
+    m.def("test_cross_module_gil",
+          []() {
+              auto cm = py::module::import("cross_module_gil_utils");
+              auto gil_acquire = reinterpret_cast<void (*)()>(
+                  PyLong_AsVoidPtr(cm.attr("gil_acquire_funcaddr").ptr()));
+              py::gil_scoped_release gil_release;
+              gil_acquire();
+          });
+}
diff --git a/pybind11/tests/test_gil_scoped.py b/pybind11/tests/test_gil_scoped.py
new file mode 100644
index 0000000000000000000000000000000000000000..27122cca2818a3cd0b61f051d5c8ac631ba9d8fc
--- /dev/null
+++ b/pybind11/tests/test_gil_scoped.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import multiprocessing
+import threading
+
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import gil_scoped as m
+
+
+def _run_in_process(target, *args, **kwargs):
+    """Runs target in process and returns its exitcode after 10s (None if still alive)."""
+    process = multiprocessing.Process(target=target, args=args, kwargs=kwargs)
+    process.daemon = True
+    try:
+        process.start()
+        # Do not need to wait much, 10s should be more than enough.
+        process.join(timeout=10)
+        return process.exitcode
+    finally:
+        if process.is_alive():
+            process.terminate()
+
+
+def _python_to_cpp_to_python():
+    """Calls different C++ functions that come back to Python."""
+    class ExtendedVirtClass(m.VirtClass):
+        def virtual_func(self):
+            pass
+
+        def pure_virtual_func(self):
+            pass
+
+    extended = ExtendedVirtClass()
+    m.test_callback_py_obj(lambda: None)
+    m.test_callback_std_func(lambda: None)
+    m.test_callback_virtual_func(extended)
+    m.test_callback_pure_virtual_func(extended)
+
+
+def _python_to_cpp_to_python_from_threads(num_threads, parallel=False):
+    """Calls different C++ functions that come back to Python, from Python threads."""
+    threads = []
+    for _ in range(num_threads):
+        thread = threading.Thread(target=_python_to_cpp_to_python)
+        thread.daemon = True
+        thread.start()
+        if parallel:
+            threads.append(thread)
+        else:
+            thread.join()
+    for thread in threads:
+        thread.join()
+
+
+# TODO: FIXME, sometimes returns -11 instead of 0
+@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+def test_python_to_cpp_to_python_from_thread():
+    """Makes sure there is no GIL deadlock when running in a thread.
+
+    It runs in a separate process to be able to stop and assert if it deadlocks.
+    """
+    assert _run_in_process(_python_to_cpp_to_python_from_threads, 1) == 0
+
+
+# TODO: FIXME
+@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+def test_python_to_cpp_to_python_from_thread_multiple_parallel():
+    """Makes sure there is no GIL deadlock when running in a thread multiple times in parallel.
+
+    It runs in a separate process to be able to stop and assert if it deadlocks.
+    """
+    assert _run_in_process(_python_to_cpp_to_python_from_threads, 8, parallel=True) == 0
+
+
+# TODO: FIXME
+@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+def test_python_to_cpp_to_python_from_thread_multiple_sequential():
+    """Makes sure there is no GIL deadlock when running in a thread multiple times sequentially.
+
+    It runs in a separate process to be able to stop and assert if it deadlocks.
+    """
+    assert _run_in_process(_python_to_cpp_to_python_from_threads, 8, parallel=False) == 0
+
+
+# TODO: FIXME
+@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+def test_python_to_cpp_to_python_from_process():
+    """Makes sure there is no GIL deadlock when using processes.
+
+    This test is for completion, but it was never an issue.
+    """
+    assert _run_in_process(_python_to_cpp_to_python) == 0
+
+
+def test_cross_module_gil():
+    """Makes sure that the GIL can be acquired by another module from a GIL-released state."""
+    m.test_cross_module_gil()  # Should not raise a SIGSEGV
diff --git a/pybind11/tests/test_iostream.cpp b/pybind11/tests/test_iostream.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e67f88af5fd2d377221a6fcd6c890dec5344df48
--- /dev/null
+++ b/pybind11/tests/test_iostream.cpp
@@ -0,0 +1,73 @@
+/*
+    tests/test_iostream.cpp -- Usage of scoped_output_redirect
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+
+#include <pybind11/iostream.h>
+#include "pybind11_tests.h"
+#include <iostream>
+
+
+void noisy_function(std::string msg, bool flush) {
+
+    std::cout << msg;
+    if (flush)
+        std::cout << std::flush;
+}
+
+void noisy_funct_dual(std::string msg, std::string emsg) {
+    std::cout << msg;
+    std::cerr << emsg;
+}
+
+TEST_SUBMODULE(iostream, m) {
+
+    add_ostream_redirect(m);
+
+    // test_evals
+
+    m.def("captured_output_default", [](std::string msg) {
+        py::scoped_ostream_redirect redir;
+        std::cout << msg << std::flush;
+    });
+
+    m.def("captured_output", [](std::string msg) {
+        py::scoped_ostream_redirect redir(std::cout, py::module::import("sys").attr("stdout"));
+        std::cout << msg << std::flush;
+    });
+
+    m.def("guard_output", &noisy_function,
+            py::call_guard<py::scoped_ostream_redirect>(),
+            py::arg("msg"), py::arg("flush")=true);
+
+    m.def("captured_err", [](std::string msg) {
+        py::scoped_ostream_redirect redir(std::cerr, py::module::import("sys").attr("stderr"));
+        std::cerr << msg << std::flush;
+    });
+
+    m.def("noisy_function", &noisy_function, py::arg("msg"), py::arg("flush") = true);
+
+    m.def("dual_guard", &noisy_funct_dual,
+            py::call_guard<py::scoped_ostream_redirect, py::scoped_estream_redirect>(),
+            py::arg("msg"), py::arg("emsg"));
+
+    m.def("raw_output", [](std::string msg) {
+        std::cout << msg << std::flush;
+    });
+
+    m.def("raw_err", [](std::string msg) {
+        std::cerr << msg << std::flush;
+    });
+
+    m.def("captured_dual", [](std::string msg, std::string emsg) {
+        py::scoped_ostream_redirect redirout(std::cout, py::module::import("sys").attr("stdout"));
+        py::scoped_ostream_redirect redirerr(std::cerr, py::module::import("sys").attr("stderr"));
+        std::cout << msg << std::flush;
+        std::cerr << emsg << std::flush;
+    });
+}
diff --git a/pybind11/tests/test_iostream.py b/pybind11/tests/test_iostream.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ac4fcece0b089c03e240a0ae89e54c0c33feedf
--- /dev/null
+++ b/pybind11/tests/test_iostream.py
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import iostream as m
+import sys
+
+from contextlib import contextmanager
+
+try:
+    # Python 3
+    from io import StringIO
+except ImportError:
+    # Python 2
+    try:
+        from cStringIO import StringIO
+    except ImportError:
+        from StringIO import StringIO
+
+try:
+    # Python 3.4
+    from contextlib import redirect_stdout
+except ImportError:
+    @contextmanager
+    def redirect_stdout(target):
+        original = sys.stdout
+        sys.stdout = target
+        yield
+        sys.stdout = original
+
+try:
+    # Python 3.5
+    from contextlib import redirect_stderr
+except ImportError:
+    @contextmanager
+    def redirect_stderr(target):
+        original = sys.stderr
+        sys.stderr = target
+        yield
+        sys.stderr = original
+
+
+def test_captured(capsys):
+    msg = "I've been redirected to Python, I hope!"
+    m.captured_output(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+    m.captured_err(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == ''
+    assert stderr == msg
+
+
+def test_captured_large_string(capsys):
+    # Make this bigger than the buffer used on the C++ side: 1024 chars
+    msg = "I've been redirected to Python, I hope!"
+    msg = msg * (1024 // len(msg) + 1)
+
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+
+def test_guard_capture(capsys):
+    msg = "I've been redirected to Python, I hope!"
+    m.guard_output(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+
+def test_series_captured(capture):
+    with capture:
+        m.captured_output("a")
+        m.captured_output("b")
+    assert capture == "ab"
+
+
+def test_flush(capfd):
+    msg = "(not flushed)"
+    msg2 = "(flushed)"
+
+    with m.ostream_redirect():
+        m.noisy_function(msg, flush=False)
+        stdout, stderr = capfd.readouterr()
+        assert stdout == ''
+
+        m.noisy_function(msg2, flush=True)
+        stdout, stderr = capfd.readouterr()
+        assert stdout == msg + msg2
+
+        m.noisy_function(msg, flush=False)
+
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+
+
+def test_not_captured(capfd):
+    msg = "Something that should not show up in log"
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.captured_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+
+
+def test_err(capfd):
+    msg = "Something that should not show up in log"
+    stream = StringIO()
+    with redirect_stderr(stream):
+        m.raw_err(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == msg
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stderr(stream):
+        m.captured_err(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+
+
+def test_multi_captured(capfd):
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.captured_output("a")
+        m.raw_output("b")
+        m.captured_output("c")
+        m.raw_output("d")
+    stdout, stderr = capfd.readouterr()
+    assert stdout == 'bd'
+    assert stream.getvalue() == 'ac'
+
+
+def test_dual(capsys):
+    m.captured_dual("a", "b")
+    stdout, stderr = capsys.readouterr()
+    assert stdout == "a"
+    assert stderr == "b"
+
+
+def test_redirect(capfd):
+    msg = "Should not be in log!"
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        with m.ostream_redirect():
+            m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stream.getvalue() == msg
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stream.getvalue() == ''
+
+
+def test_redirect_err(capfd):
+    msg = "StdOut"
+    msg2 = "StdErr"
+
+    stream = StringIO()
+    with redirect_stderr(stream):
+        with m.ostream_redirect(stdout=False):
+            m.raw_output(msg)
+            m.raw_err(msg2)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+    assert stream.getvalue() == msg2
+
+
+def test_redirect_both(capfd):
+    msg = "StdOut"
+    msg2 = "StdErr"
+
+    stream = StringIO()
+    stream2 = StringIO()
+    with redirect_stdout(stream):
+        with redirect_stderr(stream2):
+            with m.ostream_redirect():
+                m.raw_output(msg)
+                m.raw_err(msg2)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+    assert stream2.getvalue() == msg2
diff --git a/pybind11/tests/test_kwargs_and_defaults.cpp b/pybind11/tests/test_kwargs_and_defaults.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64bc2377b255350a5a4e0f22ce0e5a3b1e4082ea
--- /dev/null
+++ b/pybind11/tests/test_kwargs_and_defaults.cpp
@@ -0,0 +1,131 @@
+/*
+    tests/test_kwargs_and_defaults.cpp -- keyword arguments and default values
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/stl.h>
+
+TEST_SUBMODULE(kwargs_and_defaults, m) {
+    auto kw_func = [](int x, int y) { return "x=" + std::to_string(x) + ", y=" + std::to_string(y); };
+
+    // test_named_arguments
+    m.def("kw_func0", kw_func);
+    m.def("kw_func1", kw_func, py::arg("x"), py::arg("y"));
+    m.def("kw_func2", kw_func, py::arg("x") = 100, py::arg("y") = 200);
+    m.def("kw_func3", [](const char *) { }, py::arg("data") = std::string("Hello world!"));
+
+    /* A fancier default argument */
+    std::vector<int> list{{13, 17}};
+    m.def("kw_func4", [](const std::vector<int> &entries) {
+        std::string ret = "{";
+        for (int i : entries)
+            ret += std::to_string(i) + " ";
+        ret.back() = '}';
+        return ret;
+    }, py::arg("myList") = list);
+
+    m.def("kw_func_udl", kw_func, "x"_a, "y"_a=300);
+    m.def("kw_func_udl_z", kw_func, "x"_a, "y"_a=0);
+
+    // test_args_and_kwargs
+    m.def("args_function", [](py::args args) -> py::tuple {
+        return std::move(args);
+    });
+    m.def("args_kwargs_function", [](py::args args, py::kwargs kwargs) {
+        return py::make_tuple(args, kwargs);
+    });
+
+    // test_mixed_args_and_kwargs
+    m.def("mixed_plus_args", [](int i, double j, py::args args) {
+        return py::make_tuple(i, j, args);
+    });
+    m.def("mixed_plus_kwargs", [](int i, double j, py::kwargs kwargs) {
+        return py::make_tuple(i, j, kwargs);
+    });
+    auto mixed_plus_both = [](int i, double j, py::args args, py::kwargs kwargs) {
+        return py::make_tuple(i, j, args, kwargs);
+    };
+    m.def("mixed_plus_args_kwargs", mixed_plus_both);
+
+    m.def("mixed_plus_args_kwargs_defaults", mixed_plus_both,
+            py::arg("i") = 1, py::arg("j") = 3.14159);
+
+    // test_args_refcount
+    // PyPy needs a garbage collection to get the reference count values to match CPython's behaviour
+    #ifdef PYPY_VERSION
+    #define GC_IF_NEEDED ConstructorStats::gc()
+    #else
+    #define GC_IF_NEEDED
+    #endif
+    m.def("arg_refcount_h", [](py::handle h) { GC_IF_NEEDED; return h.ref_count(); });
+    m.def("arg_refcount_h", [](py::handle h, py::handle, py::handle) { GC_IF_NEEDED; return h.ref_count(); });
+    m.def("arg_refcount_o", [](py::object o) { GC_IF_NEEDED; return o.ref_count(); });
+    m.def("args_refcount", [](py::args a) {
+        GC_IF_NEEDED;
+        py::tuple t(a.size());
+        for (size_t i = 0; i < a.size(); i++)
+            // Use raw Python API here to avoid an extra, intermediate incref on the tuple item:
+            t[i] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<ssize_t>(i)));
+        return t;
+    });
+    m.def("mixed_args_refcount", [](py::object o, py::args a) {
+        GC_IF_NEEDED;
+        py::tuple t(a.size() + 1);
+        t[0] = o.ref_count();
+        for (size_t i = 0; i < a.size(); i++)
+            // Use raw Python API here to avoid an extra, intermediate incref on the tuple item:
+            t[i + 1] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<ssize_t>(i)));
+        return t;
+    });
+
+    // pybind11 won't allow these to be bound: args and kwargs, if present, must be at the end.
+    // Uncomment these to test that the static_assert is indeed working:
+//    m.def("bad_args1", [](py::args, int) {});
+//    m.def("bad_args2", [](py::kwargs, int) {});
+//    m.def("bad_args3", [](py::kwargs, py::args) {});
+//    m.def("bad_args4", [](py::args, int, py::kwargs) {});
+//    m.def("bad_args5", [](py::args, py::kwargs, int) {});
+//    m.def("bad_args6", [](py::args, py::args) {});
+//    m.def("bad_args7", [](py::kwargs, py::kwargs) {});
+
+    // test_keyword_only_args
+    m.def("kwonly_all", [](int i, int j) { return py::make_tuple(i, j); },
+            py::kwonly(), py::arg("i"), py::arg("j"));
+    m.def("kwonly_some", [](int i, int j, int k) { return py::make_tuple(i, j, k); },
+            py::arg(), py::kwonly(), py::arg("j"), py::arg("k"));
+    m.def("kwonly_with_defaults", [](int i, int j, int k, int z) { return py::make_tuple(i, j, k, z); },
+            py::arg() = 3, "j"_a = 4, py::kwonly(), "k"_a = 5, "z"_a);
+    m.def("kwonly_mixed", [](int i, int j) { return py::make_tuple(i, j); },
+            "i"_a, py::kwonly(), "j"_a);
+    m.def("kwonly_plus_more", [](int i, int j, int k, py::kwargs kwargs) {
+            return py::make_tuple(i, j, k, kwargs); },
+            py::arg() /* positional */, py::arg("j") = -1 /* both */, py::kwonly(), py::arg("k") /* kw-only */);
+
+    m.def("register_invalid_kwonly", [](py::module m) {
+        m.def("bad_kwonly", [](int i, int j) { return py::make_tuple(i, j); },
+                py::kwonly(), py::arg() /* invalid unnamed argument */, "j"_a);
+    });
+
+    // These should fail to compile:
+    // argument annotations are required when using kwonly
+//    m.def("bad_kwonly1", [](int) {}, py::kwonly());
+    // can't specify both `py::kwonly` and a `py::args` argument
+//    m.def("bad_kwonly2", [](int i, py::args) {}, py::kwonly(), "i"_a);
+
+    // test_function_signatures (along with most of the above)
+    struct KWClass { void foo(int, float) {} };
+    py::class_<KWClass>(m, "KWClass")
+        .def("foo0", &KWClass::foo)
+        .def("foo1", &KWClass::foo, "x"_a, "y"_a);
+
+    // Make sure a class (not an instance) can be used as a default argument.
+    // The return value doesn't matter, only that the module is importable.
+    m.def("class_default_argument", [](py::object a) { return py::repr(a); },
+        "a"_a = py::module::import("decimal").attr("Decimal"));
+}
diff --git a/pybind11/tests/test_kwargs_and_defaults.py b/pybind11/tests/test_kwargs_and_defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..5257e0cd3061707f0dd1b79de54a0c6cdae81cd1
--- /dev/null
+++ b/pybind11/tests/test_kwargs_and_defaults.py
@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import kwargs_and_defaults as m
+
+
+def test_function_signatures(doc):
+    assert doc(m.kw_func0) == "kw_func0(arg0: int, arg1: int) -> str"
+    assert doc(m.kw_func1) == "kw_func1(x: int, y: int) -> str"
+    assert doc(m.kw_func2) == "kw_func2(x: int = 100, y: int = 200) -> str"
+    assert doc(m.kw_func3) == "kw_func3(data: str = 'Hello world!') -> None"
+    assert doc(m.kw_func4) == "kw_func4(myList: List[int] = [13, 17]) -> str"
+    assert doc(m.kw_func_udl) == "kw_func_udl(x: int, y: int = 300) -> str"
+    assert doc(m.kw_func_udl_z) == "kw_func_udl_z(x: int, y: int = 0) -> str"
+    assert doc(m.args_function) == "args_function(*args) -> tuple"
+    assert doc(m.args_kwargs_function) == "args_kwargs_function(*args, **kwargs) -> tuple"
+    assert doc(m.KWClass.foo0) == \
+        "foo0(self: m.kwargs_and_defaults.KWClass, arg0: int, arg1: float) -> None"
+    assert doc(m.KWClass.foo1) == \
+        "foo1(self: m.kwargs_and_defaults.KWClass, x: int, y: float) -> None"
+
+
+def test_named_arguments(msg):
+    assert m.kw_func0(5, 10) == "x=5, y=10"
+
+    assert m.kw_func1(5, 10) == "x=5, y=10"
+    assert m.kw_func1(5, y=10) == "x=5, y=10"
+    assert m.kw_func1(y=10, x=5) == "x=5, y=10"
+
+    assert m.kw_func2() == "x=100, y=200"
+    assert m.kw_func2(5) == "x=5, y=200"
+    assert m.kw_func2(x=5) == "x=5, y=200"
+    assert m.kw_func2(y=10) == "x=100, y=10"
+    assert m.kw_func2(5, 10) == "x=5, y=10"
+    assert m.kw_func2(x=5, y=10) == "x=5, y=10"
+
+    with pytest.raises(TypeError) as excinfo:
+        # noinspection PyArgumentList
+        m.kw_func2(x=5, y=10, z=12)
+    assert excinfo.match(
+        r'(?s)^kw_func2\(\): incompatible.*Invoked with: kwargs: ((x=5|y=10|z=12)(, |$))' + '{3}$')
+
+    assert m.kw_func4() == "{13 17}"
+    assert m.kw_func4(myList=[1, 2, 3]) == "{1 2 3}"
+
+    assert m.kw_func_udl(x=5, y=10) == "x=5, y=10"
+    assert m.kw_func_udl_z(x=5) == "x=5, y=0"
+
+
+def test_arg_and_kwargs():
+    args = 'arg1_value', 'arg2_value', 3
+    assert m.args_function(*args) == args
+
+    args = 'a1', 'a2'
+    kwargs = dict(arg3='a3', arg4=4)
+    assert m.args_kwargs_function(*args, **kwargs) == (args, kwargs)
+
+
+def test_mixed_args_and_kwargs(msg):
+    mpa = m.mixed_plus_args
+    mpk = m.mixed_plus_kwargs
+    mpak = m.mixed_plus_args_kwargs
+    mpakd = m.mixed_plus_args_kwargs_defaults
+
+    assert mpa(1, 2.5, 4, 99.5, None) == (1, 2.5, (4, 99.5, None))
+    assert mpa(1, 2.5) == (1, 2.5, ())
+    with pytest.raises(TypeError) as excinfo:
+        assert mpa(1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: int, arg1: float, *args) -> tuple
+
+        Invoked with: 1
+    """  # noqa: E501 line too long
+    with pytest.raises(TypeError) as excinfo:
+        assert mpa()
+    assert msg(excinfo.value) == """
+        mixed_plus_args(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: int, arg1: float, *args) -> tuple
+
+        Invoked with:
+    """  # noqa: E501 line too long
+
+    assert mpk(-2, 3.5, pi=3.14159, e=2.71828) == (-2, 3.5, {'e': 2.71828, 'pi': 3.14159})
+    assert mpak(7, 7.7, 7.77, 7.777, 7.7777, minusseven=-7) == (
+        7, 7.7, (7.77, 7.777, 7.7777), {'minusseven': -7})
+    assert mpakd() == (1, 3.14159, (), {})
+    assert mpakd(3) == (3, 3.14159, (), {})
+    assert mpakd(j=2.71828) == (1, 2.71828, (), {})
+    assert mpakd(k=42) == (1, 3.14159, (), {'k': 42})
+    assert mpakd(1, 1, 2, 3, 5, 8, then=13, followedby=21) == (
+        1, 1, (2, 3, 5, 8), {'then': 13, 'followedby': 21})
+    # Arguments specified both positionally and via kwargs should fail:
+    with pytest.raises(TypeError) as excinfo:
+        assert mpakd(1, i=1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
+            1. (i: int = 1, j: float = 3.14159, *args, **kwargs) -> tuple
+
+        Invoked with: 1; kwargs: i=1
+    """  # noqa: E501 line too long
+    with pytest.raises(TypeError) as excinfo:
+        assert mpakd(1, 2, j=1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
+            1. (i: int = 1, j: float = 3.14159, *args, **kwargs) -> tuple
+
+        Invoked with: 1, 2; kwargs: j=1
+    """  # noqa: E501 line too long
+
+
+def test_keyword_only_args(msg):
+    assert m.kwonly_all(i=1, j=2) == (1, 2)
+    assert m.kwonly_all(j=1, i=2) == (2, 1)
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.kwonly_all(i=1) == (1,)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.kwonly_all(1, 2) == (1, 2)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    assert m.kwonly_some(1, k=3, j=2) == (1, 2, 3)
+
+    assert m.kwonly_with_defaults(z=8) == (3, 4, 5, 8)
+    assert m.kwonly_with_defaults(2, z=8) == (2, 4, 5, 8)
+    assert m.kwonly_with_defaults(2, j=7, k=8, z=9) == (2, 7, 8, 9)
+    assert m.kwonly_with_defaults(2, 7, z=9, k=8) == (2, 7, 8, 9)
+
+    assert m.kwonly_mixed(1, j=2) == (1, 2)
+    assert m.kwonly_mixed(j=2, i=3) == (3, 2)
+    assert m.kwonly_mixed(i=2, j=3) == (2, 3)
+
+    assert m.kwonly_plus_more(4, 5, k=6, extra=7) == (4, 5, 6, {'extra': 7})
+    assert m.kwonly_plus_more(3, k=5, j=4, extra=6) == (3, 4, 5, {'extra': 6})
+    assert m.kwonly_plus_more(2, k=3, extra=4) == (2, -1, 3, {'extra': 4})
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.kwonly_mixed(i=1) == (1,)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.register_invalid_kwonly(m)
+    assert msg(excinfo.value) == """
+        arg(): cannot specify an unnamed argument after an kwonly() annotation
+    """
+
+
+@pytest.mark.xfail("env.PYPY and env.PY2", reason="PyPy2 doesn't double count")
+def test_args_refcount():
+    """Issue/PR #1216 - py::args elements get double-inc_ref()ed when combined with regular
+    arguments"""
+    refcount = m.arg_refcount_h
+
+    myval = 54321
+    expected = refcount(myval)
+    assert m.arg_refcount_h(myval) == expected
+    assert m.arg_refcount_o(myval) == expected + 1
+    assert m.arg_refcount_h(myval) == expected
+    assert refcount(myval) == expected
+
+    assert m.mixed_plus_args(1, 2.0, "a", myval) == (1, 2.0, ("a", myval))
+    assert refcount(myval) == expected
+
+    assert m.mixed_plus_kwargs(3, 4.0, a=1, b=myval) == (3, 4.0, {"a": 1, "b": myval})
+    assert refcount(myval) == expected
+
+    assert m.args_function(-1, myval) == (-1, myval)
+    assert refcount(myval) == expected
+
+    assert m.mixed_plus_args_kwargs(5, 6.0, myval, a=myval) == (5, 6.0, (myval,), {"a": myval})
+    assert refcount(myval) == expected
+
+    assert m.args_kwargs_function(7, 8, myval, a=1, b=myval) == \
+        ((7, 8, myval), {"a": 1, "b": myval})
+    assert refcount(myval) == expected
+
+    exp3 = refcount(myval, myval, myval)
+    assert m.args_refcount(myval, myval, myval) == (exp3, exp3, exp3)
+    assert refcount(myval) == expected
+
+    # This function takes the first arg as a `py::object` and the rest as a `py::args`.  Unlike the
+    # previous case, when we have both positional and `py::args` we need to construct a new tuple
+    # for the `py::args`; in the previous case, we could simply inc_ref and pass on Python's input
+    # tuple without having to inc_ref the individual elements, but here we can't, hence the extra
+    # refs.
+    assert m.mixed_args_refcount(myval, myval, myval) == (exp3 + 3, exp3 + 3, exp3 + 3)
+
+    assert m.class_default_argument() == "<class 'decimal.Decimal'>"
diff --git a/pybind11/tests/test_local_bindings.cpp b/pybind11/tests/test_local_bindings.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97c02dbeb567c3699aa48f150bd8ec9dd3cd951f
--- /dev/null
+++ b/pybind11/tests/test_local_bindings.cpp
@@ -0,0 +1,101 @@
+/*
+    tests/test_local_bindings.cpp -- tests the py::module_local class feature which makes a class
+                                     binding local to the module in which it is defined.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "local_bindings.h"
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <numeric>
+
+TEST_SUBMODULE(local_bindings, m) {
+    // test_load_external
+    m.def("load_external1", [](ExternalType1 &e) { return e.i; });
+    m.def("load_external2", [](ExternalType2 &e) { return e.i; });
+
+    // test_local_bindings
+    // Register a class with py::module_local:
+    bind_local<LocalType, -1>(m, "LocalType", py::module_local())
+        .def("get3", [](LocalType &t) { return t.i + 3; })
+        ;
+
+    m.def("local_value", [](LocalType &l) { return l.i; });
+
+    // test_nonlocal_failure
+    // The main pybind11 test module is loaded first, so this registration will succeed (the second
+    // one, in pybind11_cross_module_tests.cpp, is designed to fail):
+    bind_local<NonLocalType, 0>(m, "NonLocalType")
+        .def(py::init<int>())
+        .def("get", [](LocalType &i) { return i.i; })
+        ;
+
+    // test_duplicate_local
+    // py::module_local declarations should be visible across compilation units that get linked together;
+    // this tries to register a duplicate local.  It depends on a definition in test_class.cpp and
+    // should raise a runtime error from the duplicate definition attempt.  If test_class isn't
+    // available it *also* throws a runtime error (with "test_class not enabled" as value).
+    m.def("register_local_external", [m]() {
+        auto main = py::module::import("pybind11_tests");
+        if (py::hasattr(main, "class_")) {
+            bind_local<LocalExternal, 7>(m, "LocalExternal", py::module_local());
+        }
+        else throw std::runtime_error("test_class not enabled");
+    });
+
+    // test_stl_bind_local
+    // stl_bind.h binders defaults to py::module_local if the types are local or converting:
+    py::bind_vector<LocalVec>(m, "LocalVec");
+    py::bind_map<LocalMap>(m, "LocalMap");
+    // and global if the type (or one of the types, for the map) is global:
+    py::bind_vector<NonLocalVec>(m, "NonLocalVec");
+    py::bind_map<NonLocalMap>(m, "NonLocalMap");
+
+    // test_stl_bind_global
+    // They can, however, be overridden to global using `py::module_local(false)`:
+    bind_local<NonLocal2, 10>(m, "NonLocal2");
+    py::bind_vector<LocalVec2>(m, "LocalVec2", py::module_local());
+    py::bind_map<NonLocalMap2>(m, "NonLocalMap2", py::module_local(false));
+
+    // test_mixed_local_global
+    // We try this both with the global type registered first and vice versa (the order shouldn't
+    // matter).
+    m.def("register_mixed_global", [m]() {
+        bind_local<MixedGlobalLocal, 100>(m, "MixedGlobalLocal", py::module_local(false));
+    });
+    m.def("register_mixed_local", [m]() {
+        bind_local<MixedLocalGlobal, 1000>(m, "MixedLocalGlobal", py::module_local());
+    });
+    m.def("get_mixed_gl", [](int i) { return MixedGlobalLocal(i); });
+    m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
+
+    // test_internal_locals_differ
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+
+    // test_stl_caster_vs_stl_bind
+    m.def("load_vector_via_caster", [](std::vector<int> v) {
+        return std::accumulate(v.begin(), v.end(), 0);
+    });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Cat : public pets::Pet { public: Cat(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("get_name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Cat, pets::Pet>(m, "Cat")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL").def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 10; });
+
+    py::class_<MixGL2>(m, "MixGL2").def(py::init<int>());
+}
diff --git a/pybind11/tests/test_local_bindings.py b/pybind11/tests/test_local_bindings.py
new file mode 100644
index 0000000000000000000000000000000000000000..5460727e1d7ad840f5f2817e9ffbb4e10920b583
--- /dev/null
+++ b/pybind11/tests/test_local_bindings.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import local_bindings as m
+
+
+def test_load_external():
+    """Load a `py::module_local` type that's only registered in an external module"""
+    import pybind11_cross_module_tests as cm
+
+    assert m.load_external1(cm.ExternalType1(11)) == 11
+    assert m.load_external2(cm.ExternalType2(22)) == 22
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.load_external2(cm.ExternalType1(21)) == 21
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.load_external1(cm.ExternalType2(12)) == 12
+    assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_local_bindings():
+    """Tests that duplicate `py::module_local` class bindings work across modules"""
+
+    # Make sure we can load the second module with the conflicting (but local) definition:
+    import pybind11_cross_module_tests as cm
+
+    i1 = m.LocalType(5)
+    assert i1.get() == 4
+    assert i1.get3() == 8
+
+    i2 = cm.LocalType(10)
+    assert i2.get() == 11
+    assert i2.get2() == 12
+
+    assert not hasattr(i1, 'get2')
+    assert not hasattr(i2, 'get3')
+
+    # Loading within the local module
+    assert m.local_value(i1) == 5
+    assert cm.local_value(i2) == 10
+
+    # Cross-module loading works as well (on failure, the type loader looks for
+    # external module-local converters):
+    assert m.local_value(i2) == 10
+    assert cm.local_value(i1) == 5
+
+
+def test_nonlocal_failure():
+    """Tests that attempting to register a non-local type in multiple modules fails"""
+    import pybind11_cross_module_tests as cm
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalType" is already registered!'
+
+
+def test_duplicate_local():
+    """Tests expected failure when registering a class twice with py::local in the same module"""
+    with pytest.raises(RuntimeError) as excinfo:
+        m.register_local_external()
+    import pybind11_tests
+    assert str(excinfo.value) == (
+        'generic_type: type "LocalExternal" is already registered!'
+        if hasattr(pybind11_tests, 'class_') else 'test_class not enabled')
+
+
+def test_stl_bind_local():
+    import pybind11_cross_module_tests as cm
+
+    v1, v2 = m.LocalVec(), cm.LocalVec()
+    v1.append(m.LocalType(1))
+    v1.append(m.LocalType(2))
+    v2.append(cm.LocalType(1))
+    v2.append(cm.LocalType(2))
+
+    # Cross module value loading:
+    v1.append(cm.LocalType(3))
+    v2.append(m.LocalType(3))
+
+    assert [i.get() for i in v1] == [0, 1, 2]
+    assert [i.get() for i in v2] == [2, 3, 4]
+
+    v3, v4 = m.NonLocalVec(), cm.NonLocalVec2()
+    v3.append(m.NonLocalType(1))
+    v3.append(m.NonLocalType(2))
+    v4.append(m.NonLocal2(3))
+    v4.append(m.NonLocal2(4))
+
+    assert [i.get() for i in v3] == [1, 2]
+    assert [i.get() for i in v4] == [13, 14]
+
+    d1, d2 = m.LocalMap(), cm.LocalMap()
+    d1["a"] = v1[0]
+    d1["b"] = v1[1]
+    d2["c"] = v2[0]
+    d2["d"] = v2[1]
+    assert {i: d1[i].get() for i in d1} == {'a': 0, 'b': 1}
+    assert {i: d2[i].get() for i in d2} == {'c': 2, 'd': 3}
+
+
+def test_stl_bind_global():
+    import pybind11_cross_module_tests as cm
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_map()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalMap" is already registered!'
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_vec()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalVec" is already registered!'
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_map2()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalMap2" is already registered!'
+
+
+def test_mixed_local_global():
+    """Local types take precedence over globally registered types: a module with a `module_local`
+    type can be registered even if the type is already registered globally.  With the module,
+    casting will go to the local type; outside the module casting goes to the global type."""
+    import pybind11_cross_module_tests as cm
+    m.register_mixed_global()
+    m.register_mixed_local()
+
+    a = []
+    a.append(m.MixedGlobalLocal(1))
+    a.append(m.MixedLocalGlobal(2))
+    a.append(m.get_mixed_gl(3))
+    a.append(m.get_mixed_lg(4))
+
+    assert [x.get() for x in a] == [101, 1002, 103, 1004]
+
+    cm.register_mixed_global_local()
+    cm.register_mixed_local_global()
+    a.append(m.MixedGlobalLocal(5))
+    a.append(m.MixedLocalGlobal(6))
+    a.append(cm.MixedGlobalLocal(7))
+    a.append(cm.MixedLocalGlobal(8))
+    a.append(m.get_mixed_gl(9))
+    a.append(m.get_mixed_lg(10))
+    a.append(cm.get_mixed_gl(11))
+    a.append(cm.get_mixed_lg(12))
+
+    assert [x.get() for x in a] == \
+        [101, 1002, 103, 1004, 105, 1006, 207, 2008, 109, 1010, 211, 2012]
+
+
+def test_internal_locals_differ():
+    """Makes sure the internal local type map differs across the two modules"""
+    import pybind11_cross_module_tests as cm
+    assert m.local_cpp_types_addr() != cm.local_cpp_types_addr()
+
+
+@pytest.mark.xfail("env.PYPY")
+def test_stl_caster_vs_stl_bind(msg):
+    """One module uses a generic vector caster from `<pybind11/stl.h>` while the other
+    exports `std::vector<int>` via `py:bind_vector` and `py::module_local`"""
+    import pybind11_cross_module_tests as cm
+
+    v1 = cm.VectorInt([1, 2, 3])
+    assert m.load_vector_via_caster(v1) == 6
+    assert cm.load_vector_via_binding(v1) == 6
+
+    v2 = [1, 2, 3]
+    assert m.load_vector_via_caster(v2) == 6
+    with pytest.raises(TypeError) as excinfo:
+        cm.load_vector_via_binding(v2) == 6
+    assert msg(excinfo.value) == """
+    load_vector_via_binding(): incompatible function arguments. The following argument types are supported:
+        1. (arg0: pybind11_cross_module_tests.VectorInt) -> int
+
+    Invoked with: [1, 2, 3]
+    """  # noqa: E501 line too long
+
+
+def test_cross_module_calls():
+    import pybind11_cross_module_tests as cm
+
+    v1 = m.LocalVec()
+    v1.append(m.LocalType(1))
+    v2 = cm.LocalVec()
+    v2.append(cm.LocalType(2))
+
+    # Returning the self pointer should get picked up as returning an existing
+    # instance (even when that instance is of a foreign, non-local type).
+    assert m.return_self(v1) is v1
+    assert cm.return_self(v2) is v2
+    assert m.return_self(v2) is v2
+    assert cm.return_self(v1) is v1
+
+    assert m.LocalVec is not cm.LocalVec
+    # Returning a copy, on the other hand, always goes to the local type,
+    # regardless of where the source type came from.
+    assert type(m.return_copy(v1)) is m.LocalVec
+    assert type(m.return_copy(v2)) is m.LocalVec
+    assert type(cm.return_copy(v1)) is cm.LocalVec
+    assert type(cm.return_copy(v2)) is cm.LocalVec
+
+    # Test the example given in the documentation (which also tests inheritance casting):
+    mycat = m.Cat("Fluffy")
+    mydog = cm.Dog("Rover")
+    assert mycat.get_name() == "Fluffy"
+    assert mydog.name() == "Rover"
+    assert m.Cat.__base__.__name__ == "Pet"
+    assert cm.Dog.__base__.__name__ == "Pet"
+    assert m.Cat.__base__ is not cm.Dog.__base__
+    assert m.pet_name(mycat) == "Fluffy"
+    assert m.pet_name(mydog) == "Rover"
+    assert cm.pet_name(mycat) == "Fluffy"
+    assert cm.pet_name(mydog) == "Rover"
+
+    assert m.MixGL is not cm.MixGL
+    a = m.MixGL(1)
+    b = cm.MixGL(2)
+    assert m.get_gl_value(a) == 11
+    assert m.get_gl_value(b) == 12
+    assert cm.get_gl_value(a) == 101
+    assert cm.get_gl_value(b) == 102
+
+    c, d = m.MixGL2(3), cm.MixGL2(4)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(c)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(d)
+    assert "incompatible function arguments" in str(excinfo.value)
diff --git a/pybind11/tests/test_methods_and_attributes.cpp b/pybind11/tests/test_methods_and_attributes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11d4e7b3501a8bb37b829af6c4aa5d4a4e094f8e
--- /dev/null
+++ b/pybind11/tests/test_methods_and_attributes.cpp
@@ -0,0 +1,372 @@
+/*
+    tests/test_methods_and_attributes.cpp -- constructors, deconstructors, attribute access,
+    __str__, argument and return value conventions
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+#if !defined(PYBIND11_OVERLOAD_CAST)
+template <typename... Args>
+using overload_cast_ = pybind11::detail::overload_cast_impl<Args...>;
+#endif
+
+class ExampleMandA {
+public:
+    ExampleMandA() { print_default_created(this); }
+    ExampleMandA(int value) : value(value) { print_created(this, value); }
+    ExampleMandA(const ExampleMandA &e) : value(e.value) { print_copy_created(this); }
+    ExampleMandA(std::string&&) {}
+    ExampleMandA(ExampleMandA &&e) : value(e.value) { print_move_created(this); }
+    ~ExampleMandA() { print_destroyed(this); }
+
+    std::string toString() {
+        return "ExampleMandA[value=" + std::to_string(value) + "]";
+    }
+
+    void operator=(const ExampleMandA &e) { print_copy_assigned(this); value = e.value; }
+    void operator=(ExampleMandA &&e) { print_move_assigned(this); value = e.value; }
+
+    void add1(ExampleMandA other) { value += other.value; }         // passing by value
+    void add2(ExampleMandA &other) { value += other.value; }        // passing by reference
+    void add3(const ExampleMandA &other) { value += other.value; }  // passing by const reference
+    void add4(ExampleMandA *other) { value += other->value; }       // passing by pointer
+    void add5(const ExampleMandA *other) { value += other->value; } // passing by const pointer
+
+    void add6(int other) { value += other; }                        // passing by value
+    void add7(int &other) { value += other; }                       // passing by reference
+    void add8(const int &other) { value += other; }                 // passing by const reference
+    void add9(int *other) { value += *other; }                      // passing by pointer
+    void add10(const int *other) { value += *other; }               // passing by const pointer
+
+    void consume_str(std::string&&) {}
+
+    ExampleMandA self1() { return *this; }                          // return by value
+    ExampleMandA &self2() { return *this; }                         // return by reference
+    const ExampleMandA &self3() { return *this; }                   // return by const reference
+    ExampleMandA *self4() { return this; }                          // return by pointer
+    const ExampleMandA *self5() { return this; }                    // return by const pointer
+
+    int internal1() { return value; }                               // return by value
+    int &internal2() { return value; }                              // return by reference
+    const int &internal3() { return value; }                        // return by const reference
+    int *internal4() { return &value; }                             // return by pointer
+    const int *internal5() { return &value; }                       // return by const pointer
+
+    py::str overloaded()             { return "()"; }
+    py::str overloaded(int)          { return "(int)"; }
+    py::str overloaded(int, float)   { return "(int, float)"; }
+    py::str overloaded(float, int)   { return "(float, int)"; }
+    py::str overloaded(int, int)     { return "(int, int)"; }
+    py::str overloaded(float, float) { return "(float, float)"; }
+    py::str overloaded(int)          const { return "(int) const"; }
+    py::str overloaded(int, float)   const { return "(int, float) const"; }
+    py::str overloaded(float, int)   const { return "(float, int) const"; }
+    py::str overloaded(int, int)     const { return "(int, int) const"; }
+    py::str overloaded(float, float) const { return "(float, float) const"; }
+
+    static py::str overloaded(float) { return "static float"; }
+
+    int value = 0;
+};
+
+struct TestProperties {
+    int value = 1;
+    static int static_value;
+
+    int get() const { return value; }
+    void set(int v) { value = v; }
+
+    static int static_get() { return static_value; }
+    static void static_set(int v) { static_value = v; }
+};
+int TestProperties::static_value = 1;
+
+struct TestPropertiesOverride : TestProperties {
+    int value = 99;
+    static int static_value;
+};
+int TestPropertiesOverride::static_value = 99;
+
+struct TestPropRVP {
+    UserType v1{1};
+    UserType v2{1};
+    static UserType sv1;
+    static UserType sv2;
+
+    const UserType &get1() const { return v1; }
+    const UserType &get2() const { return v2; }
+    UserType get_rvalue() const { return v2; }
+    void set1(int v) { v1.set(v); }
+    void set2(int v) { v2.set(v); }
+};
+UserType TestPropRVP::sv1(1);
+UserType TestPropRVP::sv2(1);
+
+// Test None-allowed py::arg argument policy
+class NoneTester { public: int answer = 42; };
+int none1(const NoneTester &obj) { return obj.answer; }
+int none2(NoneTester *obj) { return obj ? obj->answer : -1; }
+int none3(std::shared_ptr<NoneTester> &obj) { return obj ? obj->answer : -1; }
+int none4(std::shared_ptr<NoneTester> *obj) { return obj && *obj ? (*obj)->answer : -1; }
+int none5(std::shared_ptr<NoneTester> obj) { return obj ? obj->answer : -1; }
+
+struct StrIssue {
+    int val = -1;
+
+    StrIssue() = default;
+    StrIssue(int i) : val{i} {}
+};
+
+// Issues #854, #910: incompatible function args when member function/pointer is in unregistered base class
+class UnregisteredBase {
+public:
+    void do_nothing() const {}
+    void increase_value() { rw_value++; ro_value += 0.25; }
+    void set_int(int v) { rw_value = v; }
+    int get_int() const { return rw_value; }
+    double get_double() const { return ro_value; }
+    int rw_value = 42;
+    double ro_value = 1.25;
+};
+class RegisteredDerived : public UnregisteredBase {
+public:
+    using UnregisteredBase::UnregisteredBase;
+    double sum() const { return rw_value + ro_value; }
+};
+
+// Test explicit lvalue ref-qualification
+struct RefQualified {
+    int value = 0;
+
+    void refQualified(int other) & { value += other; }
+    int constRefQualified(int other) const & { return value + other; }
+};
+
+TEST_SUBMODULE(methods_and_attributes, m) {
+    // test_methods_and_attributes
+    py::class_<ExampleMandA> emna(m, "ExampleMandA");
+    emna.def(py::init<>())
+        .def(py::init<int>())
+        .def(py::init<std::string&&>())
+        .def(py::init<const ExampleMandA&>())
+        .def("add1", &ExampleMandA::add1)
+        .def("add2", &ExampleMandA::add2)
+        .def("add3", &ExampleMandA::add3)
+        .def("add4", &ExampleMandA::add4)
+        .def("add5", &ExampleMandA::add5)
+        .def("add6", &ExampleMandA::add6)
+        .def("add7", &ExampleMandA::add7)
+        .def("add8", &ExampleMandA::add8)
+        .def("add9", &ExampleMandA::add9)
+        .def("add10", &ExampleMandA::add10)
+        .def("consume_str", &ExampleMandA::consume_str)
+        .def("self1", &ExampleMandA::self1)
+        .def("self2", &ExampleMandA::self2)
+        .def("self3", &ExampleMandA::self3)
+        .def("self4", &ExampleMandA::self4)
+        .def("self5", &ExampleMandA::self5)
+        .def("internal1", &ExampleMandA::internal1)
+        .def("internal2", &ExampleMandA::internal2)
+        .def("internal3", &ExampleMandA::internal3)
+        .def("internal4", &ExampleMandA::internal4)
+        .def("internal5", &ExampleMandA::internal5)
+#if defined(PYBIND11_OVERLOAD_CAST)
+        .def("overloaded", py::overload_cast<>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int,   float>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<float,   int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int,     int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<float, float>(&ExampleMandA::overloaded))
+        .def("overloaded_float", py::overload_cast<float, float>(&ExampleMandA::overloaded))
+        .def("overloaded_const", py::overload_cast<int         >(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<int,   float>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<float,   int>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<int,     int>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<float, float>(&ExampleMandA::overloaded, py::const_))
+#else
+        // Use both the traditional static_cast method and the C++11 compatible overload_cast_
+        .def("overloaded", overload_cast_<>()(&ExampleMandA::overloaded))
+        .def("overloaded", overload_cast_<int>()(&ExampleMandA::overloaded))
+        .def("overloaded", overload_cast_<int,   float>()(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(float,   int)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(int,     int)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(float, float)>(&ExampleMandA::overloaded))
+        .def("overloaded_float", overload_cast_<float, float>()(&ExampleMandA::overloaded))
+        .def("overloaded_const", overload_cast_<int         >()(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", overload_cast_<int,   float>()(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(float,   int) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(int,     int) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(float, float) const>(&ExampleMandA::overloaded))
+#endif
+        // test_no_mixed_overloads
+        // Raise error if trying to mix static/non-static overloads on the same name:
+        .def_static("add_mixed_overloads1", []() {
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            emna.def       ("overload_mixed1", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded))
+                .def_static("overload_mixed1", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded));
+        })
+        .def_static("add_mixed_overloads2", []() {
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            emna.def_static("overload_mixed2", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded))
+                .def       ("overload_mixed2", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded));
+        })
+        .def("__str__", &ExampleMandA::toString)
+        .def_readwrite("value", &ExampleMandA::value);
+
+    // test_copy_method
+    // Issue #443: can't call copied methods in Python 3
+    emna.attr("add2b") = emna.attr("add2");
+
+    // test_properties, test_static_properties, test_static_cls
+    py::class_<TestProperties>(m, "TestProperties")
+        .def(py::init<>())
+        .def_readonly("def_readonly", &TestProperties::value)
+        .def_readwrite("def_readwrite", &TestProperties::value)
+        .def_property("def_writeonly", nullptr,
+                      [](TestProperties& s,int v) { s.value = v; } )
+        .def_property("def_property_writeonly", nullptr, &TestProperties::set)
+        .def_property_readonly("def_property_readonly", &TestProperties::get)
+        .def_property("def_property", &TestProperties::get, &TestProperties::set)
+        .def_property("def_property_impossible", nullptr, nullptr)
+        .def_readonly_static("def_readonly_static", &TestProperties::static_value)
+        .def_readwrite_static("def_readwrite_static", &TestProperties::static_value)
+        .def_property_static("def_writeonly_static", nullptr,
+                             [](py::object, int v) { TestProperties::static_value = v; })
+        .def_property_readonly_static("def_property_readonly_static",
+                                      [](py::object) { return TestProperties::static_get(); })
+        .def_property_static("def_property_writeonly_static", nullptr,
+                             [](py::object, int v) { return TestProperties::static_set(v); })
+        .def_property_static("def_property_static",
+                             [](py::object) { return TestProperties::static_get(); },
+                             [](py::object, int v) { TestProperties::static_set(v); })
+        .def_property_static("static_cls",
+                             [](py::object cls) { return cls; },
+                             [](py::object cls, py::function f) { f(cls); });
+
+    py::class_<TestPropertiesOverride, TestProperties>(m, "TestPropertiesOverride")
+        .def(py::init<>())
+        .def_readonly("def_readonly", &TestPropertiesOverride::value)
+        .def_readonly_static("def_readonly_static", &TestPropertiesOverride::static_value);
+
+    auto static_get1 = [](py::object) -> const UserType & { return TestPropRVP::sv1; };
+    auto static_get2 = [](py::object) -> const UserType & { return TestPropRVP::sv2; };
+    auto static_set1 = [](py::object, int v) { TestPropRVP::sv1.set(v); };
+    auto static_set2 = [](py::object, int v) { TestPropRVP::sv2.set(v); };
+    auto rvp_copy = py::return_value_policy::copy;
+
+    // test_property_return_value_policies
+    py::class_<TestPropRVP>(m, "TestPropRVP")
+        .def(py::init<>())
+        .def_property_readonly("ro_ref", &TestPropRVP::get1)
+        .def_property_readonly("ro_copy", &TestPropRVP::get2, rvp_copy)
+        .def_property_readonly("ro_func", py::cpp_function(&TestPropRVP::get2, rvp_copy))
+        .def_property("rw_ref", &TestPropRVP::get1, &TestPropRVP::set1)
+        .def_property("rw_copy", &TestPropRVP::get2, &TestPropRVP::set2, rvp_copy)
+        .def_property("rw_func", py::cpp_function(&TestPropRVP::get2, rvp_copy), &TestPropRVP::set2)
+        .def_property_readonly_static("static_ro_ref", static_get1)
+        .def_property_readonly_static("static_ro_copy", static_get2, rvp_copy)
+        .def_property_readonly_static("static_ro_func", py::cpp_function(static_get2, rvp_copy))
+        .def_property_static("static_rw_ref", static_get1, static_set1)
+        .def_property_static("static_rw_copy", static_get2, static_set2, rvp_copy)
+        .def_property_static("static_rw_func", py::cpp_function(static_get2, rvp_copy), static_set2)
+        // test_property_rvalue_policy
+        .def_property_readonly("rvalue", &TestPropRVP::get_rvalue)
+        .def_property_readonly_static("static_rvalue", [](py::object) { return UserType(1); });
+
+    // test_metaclass_override
+    struct MetaclassOverride { };
+    py::class_<MetaclassOverride>(m, "MetaclassOverride", py::metaclass((PyObject *) &PyType_Type))
+        .def_property_readonly_static("readonly", [](py::object) { return 1; });
+
+#if !defined(PYPY_VERSION)
+    // test_dynamic_attributes
+    class DynamicClass {
+    public:
+        DynamicClass() { print_default_created(this); }
+        DynamicClass(const DynamicClass&) = delete;
+        ~DynamicClass() { print_destroyed(this); }
+    };
+    py::class_<DynamicClass>(m, "DynamicClass", py::dynamic_attr())
+        .def(py::init());
+
+    class CppDerivedDynamicClass : public DynamicClass { };
+    py::class_<CppDerivedDynamicClass, DynamicClass>(m, "CppDerivedDynamicClass")
+        .def(py::init());
+#endif
+
+    // test_bad_arg_default
+    // Issue/PR #648: bad arg default debugging output
+#if !defined(NDEBUG)
+    m.attr("debug_enabled") = true;
+#else
+    m.attr("debug_enabled") = false;
+#endif
+    m.def("bad_arg_def_named", []{
+        auto m = py::module::import("pybind11_tests");
+        m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg("a") = UnregisteredType());
+    });
+    m.def("bad_arg_def_unnamed", []{
+        auto m = py::module::import("pybind11_tests");
+        m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg() = UnregisteredType());
+    });
+
+    // test_accepts_none
+    py::class_<NoneTester, std::shared_ptr<NoneTester>>(m, "NoneTester")
+        .def(py::init<>());
+    m.def("no_none1", &none1, py::arg().none(false));
+    m.def("no_none2", &none2, py::arg().none(false));
+    m.def("no_none3", &none3, py::arg().none(false));
+    m.def("no_none4", &none4, py::arg().none(false));
+    m.def("no_none5", &none5, py::arg().none(false));
+    m.def("ok_none1", &none1);
+    m.def("ok_none2", &none2, py::arg().none(true));
+    m.def("ok_none3", &none3);
+    m.def("ok_none4", &none4, py::arg().none(true));
+    m.def("ok_none5", &none5);
+
+    // test_str_issue
+    // Issue #283: __str__ called on uninitialized instance when constructor arguments invalid
+    py::class_<StrIssue>(m, "StrIssue")
+        .def(py::init<int>())
+        .def(py::init<>())
+        .def("__str__", [](const StrIssue &si) {
+            return "StrIssue[" + std::to_string(si.val) + "]"; }
+        );
+
+    // test_unregistered_base_implementations
+    //
+    // Issues #854/910: incompatible function args when member function/pointer is in unregistered
+    // base class The methods and member pointers below actually resolve to members/pointers in
+    // UnregisteredBase; before this test/fix they would be registered via lambda with a first
+    // argument of an unregistered type, and thus uncallable.
+    py::class_<RegisteredDerived>(m, "RegisteredDerived")
+        .def(py::init<>())
+        .def("do_nothing", &RegisteredDerived::do_nothing)
+        .def("increase_value", &RegisteredDerived::increase_value)
+        .def_readwrite("rw_value", &RegisteredDerived::rw_value)
+        .def_readonly("ro_value", &RegisteredDerived::ro_value)
+        // These should trigger a static_assert if uncommented
+        //.def_readwrite("fails", &UserType::value) // should trigger a static_assert if uncommented
+        //.def_readonly("fails", &UserType::value) // should trigger a static_assert if uncommented
+        .def_property("rw_value_prop", &RegisteredDerived::get_int, &RegisteredDerived::set_int)
+        .def_property_readonly("ro_value_prop", &RegisteredDerived::get_double)
+        // This one is in the registered class:
+        .def("sum", &RegisteredDerived::sum)
+        ;
+
+    using Adapted = decltype(py::method_adaptor<RegisteredDerived>(&RegisteredDerived::do_nothing));
+    static_assert(std::is_same<Adapted, void (RegisteredDerived::*)() const>::value, "");
+
+    // test_methods_and_attributes
+    py::class_<RefQualified>(m, "RefQualified")
+        .def(py::init<>())
+        .def_readonly("value", &RefQualified::value)
+        .def("refQualified", &RefQualified::refQualified)
+        .def("constRefQualified", &RefQualified::constRefQualified);
+}
diff --git a/pybind11/tests/test_methods_and_attributes.py b/pybind11/tests/test_methods_and_attributes.py
new file mode 100644
index 0000000000000000000000000000000000000000..c296b6868d64f75085493e6def6d319860851b44
--- /dev/null
+++ b/pybind11/tests/test_methods_and_attributes.py
@@ -0,0 +1,440 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import methods_and_attributes as m
+from pybind11_tests import ConstructorStats
+
+
+def test_methods_and_attributes():
+    instance1 = m.ExampleMandA()
+    instance2 = m.ExampleMandA(32)
+
+    instance1.add1(instance2)
+    instance1.add2(instance2)
+    instance1.add3(instance2)
+    instance1.add4(instance2)
+    instance1.add5(instance2)
+    instance1.add6(32)
+    instance1.add7(32)
+    instance1.add8(32)
+    instance1.add9(32)
+    instance1.add10(32)
+
+    assert str(instance1) == "ExampleMandA[value=320]"
+    assert str(instance2) == "ExampleMandA[value=32]"
+    assert str(instance1.self1()) == "ExampleMandA[value=320]"
+    assert str(instance1.self2()) == "ExampleMandA[value=320]"
+    assert str(instance1.self3()) == "ExampleMandA[value=320]"
+    assert str(instance1.self4()) == "ExampleMandA[value=320]"
+    assert str(instance1.self5()) == "ExampleMandA[value=320]"
+
+    assert instance1.internal1() == 320
+    assert instance1.internal2() == 320
+    assert instance1.internal3() == 320
+    assert instance1.internal4() == 320
+    assert instance1.internal5() == 320
+
+    assert instance1.overloaded() == "()"
+    assert instance1.overloaded(0) == "(int)"
+    assert instance1.overloaded(1, 1.0) == "(int, float)"
+    assert instance1.overloaded(2.0, 2) == "(float, int)"
+    assert instance1.overloaded(3,   3) == "(int, int)"
+    assert instance1.overloaded(4., 4.) == "(float, float)"
+    assert instance1.overloaded_const(-3) == "(int) const"
+    assert instance1.overloaded_const(5, 5.0) == "(int, float) const"
+    assert instance1.overloaded_const(6.0, 6) == "(float, int) const"
+    assert instance1.overloaded_const(7,   7) == "(int, int) const"
+    assert instance1.overloaded_const(8., 8.) == "(float, float) const"
+    assert instance1.overloaded_float(1, 1) == "(float, float)"
+    assert instance1.overloaded_float(1, 1.) == "(float, float)"
+    assert instance1.overloaded_float(1., 1) == "(float, float)"
+    assert instance1.overloaded_float(1., 1.) == "(float, float)"
+
+    assert instance1.value == 320
+    instance1.value = 100
+    assert str(instance1) == "ExampleMandA[value=100]"
+
+    cstats = ConstructorStats.get(m.ExampleMandA)
+    assert cstats.alive() == 2
+    del instance1, instance2
+    assert cstats.alive() == 0
+    assert cstats.values() == ["32"]
+    assert cstats.default_constructions == 1
+    assert cstats.copy_constructions == 2
+    assert cstats.move_constructions >= 2
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+
+def test_copy_method():
+    """Issue #443: calling copied methods fails in Python 3"""
+
+    m.ExampleMandA.add2c = m.ExampleMandA.add2
+    m.ExampleMandA.add2d = m.ExampleMandA.add2b
+    a = m.ExampleMandA(123)
+    assert a.value == 123
+    a.add2(m.ExampleMandA(-100))
+    assert a.value == 23
+    a.add2b(m.ExampleMandA(20))
+    assert a.value == 43
+    a.add2c(m.ExampleMandA(6))
+    assert a.value == 49
+    a.add2d(m.ExampleMandA(-7))
+    assert a.value == 42
+
+
+def test_properties():
+    instance = m.TestProperties()
+
+    assert instance.def_readonly == 1
+    with pytest.raises(AttributeError):
+        instance.def_readonly = 2
+
+    instance.def_readwrite = 2
+    assert instance.def_readwrite == 2
+
+    assert instance.def_property_readonly == 2
+    with pytest.raises(AttributeError):
+        instance.def_property_readonly = 3
+
+    instance.def_property = 3
+    assert instance.def_property == 3
+
+    with pytest.raises(AttributeError) as excinfo:
+        dummy = instance.def_property_writeonly  # noqa: F841 unused var
+    assert "unreadable attribute" in str(excinfo.value)
+
+    instance.def_property_writeonly = 4
+    assert instance.def_property_readonly == 4
+
+    with pytest.raises(AttributeError) as excinfo:
+        dummy = instance.def_property_impossible  # noqa: F841 unused var
+    assert "unreadable attribute" in str(excinfo.value)
+
+    with pytest.raises(AttributeError) as excinfo:
+        instance.def_property_impossible = 5
+    assert "can't set attribute" in str(excinfo.value)
+
+
+def test_static_properties():
+    assert m.TestProperties.def_readonly_static == 1
+    with pytest.raises(AttributeError) as excinfo:
+        m.TestProperties.def_readonly_static = 2
+    assert "can't set attribute" in str(excinfo.value)
+
+    m.TestProperties.def_readwrite_static = 2
+    assert m.TestProperties.def_readwrite_static == 2
+
+    with pytest.raises(AttributeError) as excinfo:
+        dummy = m.TestProperties.def_writeonly_static  # noqa: F841 unused var
+    assert "unreadable attribute" in str(excinfo.value)
+
+    m.TestProperties.def_writeonly_static = 3
+    assert m.TestProperties.def_readonly_static == 3
+
+    assert m.TestProperties.def_property_readonly_static == 3
+    with pytest.raises(AttributeError) as excinfo:
+        m.TestProperties.def_property_readonly_static = 99
+    assert "can't set attribute" in str(excinfo.value)
+
+    m.TestProperties.def_property_static = 4
+    assert m.TestProperties.def_property_static == 4
+
+    with pytest.raises(AttributeError) as excinfo:
+        dummy = m.TestProperties.def_property_writeonly_static
+    assert "unreadable attribute" in str(excinfo.value)
+
+    m.TestProperties.def_property_writeonly_static = 5
+    assert m.TestProperties.def_property_static == 5
+
+    # Static property read and write via instance
+    instance = m.TestProperties()
+
+    m.TestProperties.def_readwrite_static = 0
+    assert m.TestProperties.def_readwrite_static == 0
+    assert instance.def_readwrite_static == 0
+
+    instance.def_readwrite_static = 2
+    assert m.TestProperties.def_readwrite_static == 2
+    assert instance.def_readwrite_static == 2
+
+    with pytest.raises(AttributeError) as excinfo:
+        dummy = instance.def_property_writeonly_static  # noqa: F841 unused var
+    assert "unreadable attribute" in str(excinfo.value)
+
+    instance.def_property_writeonly_static = 4
+    assert instance.def_property_static == 4
+
+    # It should be possible to override properties in derived classes
+    assert m.TestPropertiesOverride().def_readonly == 99
+    assert m.TestPropertiesOverride.def_readonly_static == 99
+
+
+def test_static_cls():
+    """Static property getter and setters expect the type object as the their only argument"""
+
+    instance = m.TestProperties()
+    assert m.TestProperties.static_cls is m.TestProperties
+    assert instance.static_cls is m.TestProperties
+
+    def check_self(self):
+        assert self is m.TestProperties
+
+    m.TestProperties.static_cls = check_self
+    instance.static_cls = check_self
+
+
+def test_metaclass_override():
+    """Overriding pybind11's default metaclass changes the behavior of `static_property`"""
+
+    assert type(m.ExampleMandA).__name__ == "pybind11_type"
+    assert type(m.MetaclassOverride).__name__ == "type"
+
+    assert m.MetaclassOverride.readonly == 1
+    assert type(m.MetaclassOverride.__dict__["readonly"]).__name__ == "pybind11_static_property"
+
+    # Regular `type` replaces the property instead of calling `__set__()`
+    m.MetaclassOverride.readonly = 2
+    assert m.MetaclassOverride.readonly == 2
+    assert isinstance(m.MetaclassOverride.__dict__["readonly"], int)
+
+
+def test_no_mixed_overloads():
+    from pybind11_tests import debug_enabled
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.ExampleMandA.add_mixed_overloads1()
+    assert (str(excinfo.value) ==
+            "overloading a method with both static and instance methods is not supported; " +
+            ("compile in debug mode for more details" if not debug_enabled else
+             "error while attempting to bind static method ExampleMandA.overload_mixed1"
+             "(arg0: float) -> str")
+            )
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.ExampleMandA.add_mixed_overloads2()
+    assert (str(excinfo.value) ==
+            "overloading a method with both static and instance methods is not supported; " +
+            ("compile in debug mode for more details" if not debug_enabled else
+             "error while attempting to bind instance method ExampleMandA.overload_mixed2"
+             "(self: pybind11_tests.methods_and_attributes.ExampleMandA, arg0: int, arg1: int)"
+             " -> str")
+            )
+
+
+@pytest.mark.parametrize("access", ["ro", "rw", "static_ro", "static_rw"])
+def test_property_return_value_policies(access):
+    if not access.startswith("static"):
+        obj = m.TestPropRVP()
+    else:
+        obj = m.TestPropRVP
+
+    ref = getattr(obj, access + "_ref")
+    assert ref.value == 1
+    ref.value = 2
+    assert getattr(obj, access + "_ref").value == 2
+    ref.value = 1  # restore original value for static properties
+
+    copy = getattr(obj, access + "_copy")
+    assert copy.value == 1
+    copy.value = 2
+    assert getattr(obj, access + "_copy").value == 1
+
+    copy = getattr(obj, access + "_func")
+    assert copy.value == 1
+    copy.value = 2
+    assert getattr(obj, access + "_func").value == 1
+
+
+def test_property_rvalue_policy():
+    """When returning an rvalue, the return value policy is automatically changed from
+    `reference(_internal)` to `move`. The following would not work otherwise."""
+
+    instance = m.TestPropRVP()
+    o = instance.rvalue
+    assert o.value == 1
+
+    os = m.TestPropRVP.static_rvalue
+    assert os.value == 1
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2447
+@pytest.mark.xfail("env.PYPY")
+def test_dynamic_attributes():
+    instance = m.DynamicClass()
+    assert not hasattr(instance, "foo")
+    assert "foo" not in dir(instance)
+
+    # Dynamically add attribute
+    instance.foo = 42
+    assert hasattr(instance, "foo")
+    assert instance.foo == 42
+    assert "foo" in dir(instance)
+
+    # __dict__ should be accessible and replaceable
+    assert "foo" in instance.__dict__
+    instance.__dict__ = {"bar": True}
+    assert not hasattr(instance, "foo")
+    assert hasattr(instance, "bar")
+
+    with pytest.raises(TypeError) as excinfo:
+        instance.__dict__ = []
+    assert str(excinfo.value) == "__dict__ must be set to a dictionary, not a 'list'"
+
+    cstats = ConstructorStats.get(m.DynamicClass)
+    assert cstats.alive() == 1
+    del instance
+    assert cstats.alive() == 0
+
+    # Derived classes should work as well
+    class PythonDerivedDynamicClass(m.DynamicClass):
+        pass
+
+    for cls in m.CppDerivedDynamicClass, PythonDerivedDynamicClass:
+        derived = cls()
+        derived.foobar = 100
+        assert derived.foobar == 100
+
+        assert cstats.alive() == 1
+        del derived
+        assert cstats.alive() == 0
+
+
+# https://foss.heptapod.net/pypy/pypy/-/issues/2447
+@pytest.mark.xfail("env.PYPY")
+def test_cyclic_gc():
+    # One object references itself
+    instance = m.DynamicClass()
+    instance.circular_reference = instance
+
+    cstats = ConstructorStats.get(m.DynamicClass)
+    assert cstats.alive() == 1
+    del instance
+    assert cstats.alive() == 0
+
+    # Two object reference each other
+    i1 = m.DynamicClass()
+    i2 = m.DynamicClass()
+    i1.cycle = i2
+    i2.cycle = i1
+
+    assert cstats.alive() == 2
+    del i1, i2
+    assert cstats.alive() == 0
+
+
+def test_bad_arg_default(msg):
+    from pybind11_tests import debug_enabled
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.bad_arg_def_named()
+    assert msg(excinfo.value) == (
+        "arg(): could not convert default argument 'a: UnregisteredType' in function "
+        "'should_fail' into a Python object (type not registered yet?)"
+        if debug_enabled else
+        "arg(): could not convert default argument into a Python object (type not registered "
+        "yet?). Compile in debug mode for more information."
+    )
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.bad_arg_def_unnamed()
+    assert msg(excinfo.value) == (
+        "arg(): could not convert default argument 'UnregisteredType' in function "
+        "'should_fail' into a Python object (type not registered yet?)"
+        if debug_enabled else
+        "arg(): could not convert default argument into a Python object (type not registered "
+        "yet?). Compile in debug mode for more information."
+    )
+
+
+def test_accepts_none(msg):
+    a = m.NoneTester()
+    assert m.no_none1(a) == 42
+    assert m.no_none2(a) == 42
+    assert m.no_none3(a) == 42
+    assert m.no_none4(a) == 42
+    assert m.no_none5(a) == 42
+    assert m.ok_none1(a) == 42
+    assert m.ok_none2(a) == 42
+    assert m.ok_none3(a) == 42
+    assert m.ok_none4(a) == 42
+    assert m.ok_none5(a) == 42
+
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none1(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none2(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none3(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none4(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none5(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    # The first one still raises because you can't pass None as a lvalue reference arg:
+    with pytest.raises(TypeError) as excinfo:
+        assert m.ok_none1(None) == -1
+    assert msg(excinfo.value) == """
+        ok_none1(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: m.methods_and_attributes.NoneTester) -> int
+
+        Invoked with: None
+    """
+
+    # The rest take the argument as pointer or holder, and accept None:
+    assert m.ok_none2(None) == -1
+    assert m.ok_none3(None) == -1
+    assert m.ok_none4(None) == -1
+    assert m.ok_none5(None) == -1
+
+
+def test_str_issue(msg):
+    """#283: __str__ called on uninitialized instance when constructor arguments invalid"""
+
+    assert str(m.StrIssue(3)) == "StrIssue[3]"
+
+    with pytest.raises(TypeError) as excinfo:
+        str(m.StrIssue("no", "such", "constructor"))
+    assert msg(excinfo.value) == """
+        __init__(): incompatible constructor arguments. The following argument types are supported:
+            1. m.methods_and_attributes.StrIssue(arg0: int)
+            2. m.methods_and_attributes.StrIssue()
+
+        Invoked with: 'no', 'such', 'constructor'
+    """
+
+
+def test_unregistered_base_implementations():
+    a = m.RegisteredDerived()
+    a.do_nothing()
+    assert a.rw_value == 42
+    assert a.ro_value == 1.25
+    a.rw_value += 5
+    assert a.sum() == 48.25
+    a.increase_value()
+    assert a.rw_value == 48
+    assert a.ro_value == 1.5
+    assert a.sum() == 49.5
+    assert a.rw_value_prop == 48
+    a.rw_value_prop += 1
+    assert a.rw_value_prop == 49
+    a.increase_value()
+    assert a.ro_value_prop == 1.75
+
+
+def test_ref_qualified():
+    """Tests that explicit lvalue ref-qualified methods can be called just like their
+    non ref-qualified counterparts."""
+
+    r = m.RefQualified()
+    assert r.value == 0
+    r.refQualified(17)
+    assert r.value == 17
+    assert r.constRefQualified(23) == 40
diff --git a/pybind11/tests/test_modules.cpp b/pybind11/tests/test_modules.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1475fa62357b9b2f2b31b844b2479557665f152
--- /dev/null
+++ b/pybind11/tests/test_modules.cpp
@@ -0,0 +1,98 @@
+/*
+    tests/test_modules.cpp -- nested modules, importing modules, and
+                            internal references
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+TEST_SUBMODULE(modules, m) {
+    // test_nested_modules
+    py::module m_sub = m.def_submodule("subsubmodule");
+    m_sub.def("submodule_func", []() { return "submodule_func()"; });
+
+    // test_reference_internal
+    class A {
+    public:
+        A(int v) : v(v) { print_created(this, v); }
+        ~A() { print_destroyed(this); }
+        A(const A&) { print_copy_created(this); }
+        A& operator=(const A &copy) { print_copy_assigned(this); v = copy.v; return *this; }
+        std::string toString() { return "A[" + std::to_string(v) + "]"; }
+    private:
+        int v;
+    };
+    py::class_<A>(m_sub, "A")
+        .def(py::init<int>())
+        .def("__repr__", &A::toString);
+
+    class B {
+    public:
+        B() { print_default_created(this); }
+        ~B() { print_destroyed(this); }
+        B(const B&) { print_copy_created(this); }
+        B& operator=(const B &copy) { print_copy_assigned(this); a1 = copy.a1; a2 = copy.a2; return *this; }
+        A &get_a1() { return a1; }
+        A &get_a2() { return a2; }
+
+        A a1{1};
+        A a2{2};
+    };
+    py::class_<B>(m_sub, "B")
+        .def(py::init<>())
+        .def("get_a1", &B::get_a1, "Return the internal A 1", py::return_value_policy::reference_internal)
+        .def("get_a2", &B::get_a2, "Return the internal A 2", py::return_value_policy::reference_internal)
+        .def_readwrite("a1", &B::a1)  // def_readonly uses an internal reference return policy by default
+        .def_readwrite("a2", &B::a2);
+
+    m.attr("OD") = py::module::import("collections").attr("OrderedDict");
+
+    // test_duplicate_registration
+    // Registering two things with the same name
+    m.def("duplicate_registration", []() {
+        class Dupe1 { };
+        class Dupe2 { };
+        class Dupe3 { };
+        class DupeException { };
+
+        auto dm = py::module("dummy");
+        auto failures = py::list();
+
+        py::class_<Dupe1>(dm, "Dupe1");
+        py::class_<Dupe2>(dm, "Dupe2");
+        dm.def("dupe1_factory", []() { return Dupe1(); });
+        py::exception<DupeException>(dm, "DupeException");
+
+        try {
+            py::class_<Dupe1>(dm, "Dupe1");
+            failures.append("Dupe1 class");
+        } catch (std::runtime_error &) {}
+        try {
+            dm.def("Dupe1", []() { return Dupe1(); });
+            failures.append("Dupe1 function");
+        } catch (std::runtime_error &) {}
+        try {
+            py::class_<Dupe3>(dm, "dupe1_factory");
+            failures.append("dupe1_factory");
+        } catch (std::runtime_error &) {}
+        try {
+            py::exception<Dupe3>(dm, "Dupe2");
+            failures.append("Dupe2");
+        } catch (std::runtime_error &) {}
+        try {
+            dm.def("DupeException", []() { return 30; });
+            failures.append("DupeException1");
+        } catch (std::runtime_error &) {}
+        try {
+            py::class_<DupeException>(dm, "DupeException");
+            failures.append("DupeException2");
+        } catch (std::runtime_error &) {}
+
+        return failures;
+    });
+}
diff --git a/pybind11/tests/test_modules.py b/pybind11/tests/test_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e2100524506b13a5d3189a3fabb9dead628c2a5
--- /dev/null
+++ b/pybind11/tests/test_modules.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import modules as m
+from pybind11_tests.modules import subsubmodule as ms
+from pybind11_tests import ConstructorStats
+
+
+def test_nested_modules():
+    import pybind11_tests
+    assert pybind11_tests.__name__ == "pybind11_tests"
+    assert pybind11_tests.modules.__name__ == "pybind11_tests.modules"
+    assert pybind11_tests.modules.subsubmodule.__name__ == "pybind11_tests.modules.subsubmodule"
+    assert m.__name__ == "pybind11_tests.modules"
+    assert ms.__name__ == "pybind11_tests.modules.subsubmodule"
+
+    assert ms.submodule_func() == "submodule_func()"
+
+
+def test_reference_internal():
+    b = ms.B()
+    assert str(b.get_a1()) == "A[1]"
+    assert str(b.a1) == "A[1]"
+    assert str(b.get_a2()) == "A[2]"
+    assert str(b.a2) == "A[2]"
+
+    b.a1 = ms.A(42)
+    b.a2 = ms.A(43)
+    assert str(b.get_a1()) == "A[42]"
+    assert str(b.a1) == "A[42]"
+    assert str(b.get_a2()) == "A[43]"
+    assert str(b.a2) == "A[43]"
+
+    astats, bstats = ConstructorStats.get(ms.A), ConstructorStats.get(ms.B)
+    assert astats.alive() == 2
+    assert bstats.alive() == 1
+    del b
+    assert astats.alive() == 0
+    assert bstats.alive() == 0
+    assert astats.values() == ['1', '2', '42', '43']
+    assert bstats.values() == []
+    assert astats.default_constructions == 0
+    assert bstats.default_constructions == 1
+    assert astats.copy_constructions == 0
+    assert bstats.copy_constructions == 0
+    # assert astats.move_constructions >= 0  # Don't invoke any
+    # assert bstats.move_constructions >= 0  # Don't invoke any
+    assert astats.copy_assignments == 2
+    assert bstats.copy_assignments == 0
+    assert astats.move_assignments == 0
+    assert bstats.move_assignments == 0
+
+
+def test_importing():
+    from pybind11_tests.modules import OD
+    from collections import OrderedDict
+
+    assert OD is OrderedDict
+    assert str(OD([(1, 'a'), (2, 'b')])) == "OrderedDict([(1, 'a'), (2, 'b')])"
+
+
+def test_pydoc():
+    """Pydoc needs to be able to provide help() for everything inside a pybind11 module"""
+    import pybind11_tests
+    import pydoc
+
+    assert pybind11_tests.__name__ == "pybind11_tests"
+    assert pybind11_tests.__doc__ == "pybind11 test module"
+    assert pydoc.text.docmodule(pybind11_tests)
+
+
+def test_duplicate_registration():
+    """Registering two things with the same name"""
+
+    assert m.duplicate_registration() == []
diff --git a/pybind11/tests/test_multiple_inheritance.cpp b/pybind11/tests/test_multiple_inheritance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70e34178540d210770fc862b5520a3b3c9d91a5c
--- /dev/null
+++ b/pybind11/tests/test_multiple_inheritance.cpp
@@ -0,0 +1,220 @@
+/*
+    tests/test_multiple_inheritance.cpp -- multiple inheritance,
+    implicit MI casts
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+
+// Many bases for testing that multiple inheritance from many classes (i.e. requiring extra
+// space for holder constructed flags) works.
+template <int N> struct BaseN {
+    BaseN(int i) : i(i) { }
+    int i;
+};
+
+// test_mi_static_properties
+struct Vanilla {
+    std::string vanilla() { return "Vanilla"; };
+};
+struct WithStatic1 {
+    static std::string static_func1() { return "WithStatic1"; };
+    static int static_value1;
+};
+struct WithStatic2 {
+    static std::string static_func2() { return "WithStatic2"; };
+    static int static_value2;
+};
+struct VanillaStaticMix1 : Vanilla, WithStatic1, WithStatic2 {
+    static std::string static_func() { return "VanillaStaticMix1"; }
+    static int static_value;
+};
+struct VanillaStaticMix2 : WithStatic1, Vanilla, WithStatic2 {
+    static std::string static_func() { return "VanillaStaticMix2"; }
+    static int static_value;
+};
+int WithStatic1::static_value1 = 1;
+int WithStatic2::static_value2 = 2;
+int VanillaStaticMix1::static_value = 12;
+int VanillaStaticMix2::static_value = 12;
+
+TEST_SUBMODULE(multiple_inheritance, m) {
+
+    // test_multiple_inheritance_mix1
+    // test_multiple_inheritance_mix2
+    struct Base1 {
+        Base1(int i) : i(i) { }
+        int foo() { return i; }
+        int i;
+    };
+    py::class_<Base1> b1(m, "Base1");
+    b1.def(py::init<int>())
+      .def("foo", &Base1::foo);
+
+    struct Base2 {
+        Base2(int i) : i(i) { }
+        int bar() { return i; }
+        int i;
+    };
+    py::class_<Base2> b2(m, "Base2");
+    b2.def(py::init<int>())
+      .def("bar", &Base2::bar);
+
+
+    // test_multiple_inheritance_cpp
+    struct Base12 : Base1, Base2 {
+        Base12(int i, int j) : Base1(i), Base2(j) { }
+    };
+    struct MIType : Base12 {
+        MIType(int i, int j) : Base12(i, j) { }
+    };
+    py::class_<Base12, Base1, Base2>(m, "Base12");
+    py::class_<MIType, Base12>(m, "MIType")
+        .def(py::init<int, int>());
+
+
+    // test_multiple_inheritance_python_many_bases
+    #define PYBIND11_BASEN(N) py::class_<BaseN<N>>(m, "BaseN" #N).def(py::init<int>()).def("f" #N, [](BaseN<N> &b) { return b.i + N; })
+    PYBIND11_BASEN( 1); PYBIND11_BASEN( 2); PYBIND11_BASEN( 3); PYBIND11_BASEN( 4);
+    PYBIND11_BASEN( 5); PYBIND11_BASEN( 6); PYBIND11_BASEN( 7); PYBIND11_BASEN( 8);
+    PYBIND11_BASEN( 9); PYBIND11_BASEN(10); PYBIND11_BASEN(11); PYBIND11_BASEN(12);
+    PYBIND11_BASEN(13); PYBIND11_BASEN(14); PYBIND11_BASEN(15); PYBIND11_BASEN(16);
+    PYBIND11_BASEN(17);
+
+    // Uncommenting this should result in a compile time failure (MI can only be specified via
+    // template parameters because pybind has to know the types involved; see discussion in #742 for
+    // details).
+//    struct Base12v2 : Base1, Base2 {
+//        Base12v2(int i, int j) : Base1(i), Base2(j) { }
+//    };
+//    py::class_<Base12v2>(m, "Base12v2", b1, b2)
+//        .def(py::init<int, int>());
+
+
+    // test_multiple_inheritance_virtbase
+    // Test the case where not all base classes are specified, and where pybind11 requires the
+    // py::multiple_inheritance flag to perform proper casting between types.
+    struct Base1a {
+        Base1a(int i) : i(i) { }
+        int foo() { return i; }
+        int i;
+    };
+    py::class_<Base1a, std::shared_ptr<Base1a>>(m, "Base1a")
+        .def(py::init<int>())
+        .def("foo", &Base1a::foo);
+
+    struct Base2a {
+        Base2a(int i) : i(i) { }
+        int bar() { return i; }
+        int i;
+    };
+    py::class_<Base2a, std::shared_ptr<Base2a>>(m, "Base2a")
+        .def(py::init<int>())
+        .def("bar", &Base2a::bar);
+
+    struct Base12a : Base1a, Base2a {
+        Base12a(int i, int j) : Base1a(i), Base2a(j) { }
+    };
+    py::class_<Base12a, /* Base1 missing */ Base2a,
+               std::shared_ptr<Base12a>>(m, "Base12a", py::multiple_inheritance())
+        .def(py::init<int, int>());
+
+    m.def("bar_base2a", [](Base2a *b) { return b->bar(); });
+    m.def("bar_base2a_sharedptr", [](std::shared_ptr<Base2a> b) { return b->bar(); });
+
+    // test_mi_unaligned_base
+    // test_mi_base_return
+    // Issue #801: invalid casting to derived type with MI bases
+    struct I801B1 { int a = 1; I801B1() = default; I801B1(const I801B1 &) = default; virtual ~I801B1() = default; };
+    struct I801B2 { int b = 2; I801B2() = default; I801B2(const I801B2 &) = default; virtual ~I801B2() = default; };
+    struct I801C : I801B1, I801B2 {};
+    struct I801D : I801C {}; // Indirect MI
+    // Unregistered classes:
+    struct I801B3 { int c = 3; virtual ~I801B3() = default; };
+    struct I801E : I801B3, I801D {};
+
+    py::class_<I801B1, std::shared_ptr<I801B1>>(m, "I801B1").def(py::init<>()).def_readonly("a", &I801B1::a);
+    py::class_<I801B2, std::shared_ptr<I801B2>>(m, "I801B2").def(py::init<>()).def_readonly("b", &I801B2::b);
+    py::class_<I801C, I801B1, I801B2, std::shared_ptr<I801C>>(m, "I801C").def(py::init<>());
+    py::class_<I801D, I801C, std::shared_ptr<I801D>>(m, "I801D").def(py::init<>());
+
+    // Two separate issues here: first, we want to recognize a pointer to a base type as being a
+    // known instance even when the pointer value is unequal (i.e. due to a non-first
+    // multiple-inheritance base class):
+    m.def("i801b1_c", [](I801C *c) { return static_cast<I801B1 *>(c); });
+    m.def("i801b2_c", [](I801C *c) { return static_cast<I801B2 *>(c); });
+    m.def("i801b1_d", [](I801D *d) { return static_cast<I801B1 *>(d); });
+    m.def("i801b2_d", [](I801D *d) { return static_cast<I801B2 *>(d); });
+
+    // Second, when returned a base class pointer to a derived instance, we cannot assume that the
+    // pointer is `reinterpret_cast`able to the derived pointer because, like above, the base class
+    // pointer could be offset.
+    m.def("i801c_b1", []() -> I801B1 * { return new I801C(); });
+    m.def("i801c_b2", []() -> I801B2 * { return new I801C(); });
+    m.def("i801d_b1", []() -> I801B1 * { return new I801D(); });
+    m.def("i801d_b2", []() -> I801B2 * { return new I801D(); });
+
+    // Return a base class pointer to a pybind-registered type when the actual derived type
+    // isn't pybind-registered (and uses multiple-inheritance to offset the pybind base)
+    m.def("i801e_c", []() -> I801C * { return new I801E(); });
+    m.def("i801e_b2", []() -> I801B2 * { return new I801E(); });
+
+
+    // test_mi_static_properties
+    py::class_<Vanilla>(m, "Vanilla")
+        .def(py::init<>())
+        .def("vanilla", &Vanilla::vanilla);
+
+    py::class_<WithStatic1>(m, "WithStatic1")
+        .def(py::init<>())
+        .def_static("static_func1", &WithStatic1::static_func1)
+        .def_readwrite_static("static_value1", &WithStatic1::static_value1);
+
+    py::class_<WithStatic2>(m, "WithStatic2")
+        .def(py::init<>())
+        .def_static("static_func2", &WithStatic2::static_func2)
+        .def_readwrite_static("static_value2", &WithStatic2::static_value2);
+
+    py::class_<VanillaStaticMix1, Vanilla, WithStatic1, WithStatic2>(
+        m, "VanillaStaticMix1")
+        .def(py::init<>())
+        .def_static("static_func", &VanillaStaticMix1::static_func)
+        .def_readwrite_static("static_value", &VanillaStaticMix1::static_value);
+
+    py::class_<VanillaStaticMix2, WithStatic1, Vanilla, WithStatic2>(
+        m, "VanillaStaticMix2")
+        .def(py::init<>())
+        .def_static("static_func", &VanillaStaticMix2::static_func)
+        .def_readwrite_static("static_value", &VanillaStaticMix2::static_value);
+
+
+#if !(defined(PYPY_VERSION) && (PYPY_VERSION_NUM < 0x06000000))
+    struct WithDict { };
+    struct VanillaDictMix1 : Vanilla, WithDict { };
+    struct VanillaDictMix2 : WithDict, Vanilla { };
+    py::class_<WithDict>(m, "WithDict", py::dynamic_attr()).def(py::init<>());
+    py::class_<VanillaDictMix1, Vanilla, WithDict>(m, "VanillaDictMix1").def(py::init<>());
+    py::class_<VanillaDictMix2, WithDict, Vanilla>(m, "VanillaDictMix2").def(py::init<>());
+#endif
+
+    // test_diamond_inheritance
+    // Issue #959: segfault when constructing diamond inheritance instance
+    // All of these have int members so that there will be various unequal pointers involved.
+    struct B { int b; B() = default; B(const B&) = default; virtual ~B() = default; };
+    struct C0 : public virtual B { int c0; };
+    struct C1 : public virtual B { int c1; };
+    struct D : public C0, public C1 { int d; };
+    py::class_<B>(m, "B")
+        .def("b", [](B *self) { return self; });
+    py::class_<C0, B>(m, "C0")
+        .def("c0", [](C0 *self) { return self; });
+    py::class_<C1, B>(m, "C1")
+        .def("c1", [](C1 *self) { return self; });
+    py::class_<D, C0, C1>(m, "D")
+        .def(py::init<>());
+}
diff --git a/pybind11/tests/test_multiple_inheritance.py b/pybind11/tests/test_multiple_inheritance.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a0259d2148f14aafeac67a43d3c906a0b5719d0
--- /dev/null
+++ b/pybind11/tests/test_multiple_inheritance.py
@@ -0,0 +1,356 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import ConstructorStats
+from pybind11_tests import multiple_inheritance as m
+
+
+def test_multiple_inheritance_cpp():
+    mt = m.MIType(3, 4)
+
+    assert mt.foo() == 3
+    assert mt.bar() == 4
+
+
+@pytest.mark.skipif("env.PYPY and env.PY2")
+@pytest.mark.xfail("env.PYPY and not env.PY2")
+def test_multiple_inheritance_mix1():
+    class Base1:
+        def __init__(self, i):
+            self.i = i
+
+        def foo(self):
+            return self.i
+
+    class MITypePy(Base1, m.Base2):
+        def __init__(self, i, j):
+            Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    mt = MITypePy(3, 4)
+
+    assert mt.foo() == 3
+    assert mt.bar() == 4
+
+
+def test_multiple_inheritance_mix2():
+    class Base2:
+        def __init__(self, i):
+            self.i = i
+
+        def bar(self):
+            return self.i
+
+    class MITypePy(m.Base1, Base2):
+        def __init__(self, i, j):
+            m.Base1.__init__(self, i)
+            Base2.__init__(self, j)
+
+    mt = MITypePy(3, 4)
+
+    assert mt.foo() == 3
+    assert mt.bar() == 4
+
+
+@pytest.mark.skipif("env.PYPY and env.PY2")
+@pytest.mark.xfail("env.PYPY and not env.PY2")
+def test_multiple_inheritance_python():
+
+    class MI1(m.Base1, m.Base2):
+        def __init__(self, i, j):
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class B1(object):
+        def v(self):
+            return 1
+
+    class MI2(B1, m.Base1, m.Base2):
+        def __init__(self, i, j):
+            B1.__init__(self)
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class MI3(MI2):
+        def __init__(self, i, j):
+            MI2.__init__(self, i, j)
+
+    class MI4(MI3, m.Base2):
+        def __init__(self, i, j):
+            MI3.__init__(self, i, j)
+            # This should be ignored (Base2 is already initialized via MI2):
+            m.Base2.__init__(self, i + 100)
+
+    class MI5(m.Base2, B1, m.Base1):
+        def __init__(self, i, j):
+            B1.__init__(self)
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class MI6(m.Base2, B1):
+        def __init__(self, i):
+            m.Base2.__init__(self, i)
+            B1.__init__(self)
+
+    class B2(B1):
+        def v(self):
+            return 2
+
+    class B3(object):
+        def v(self):
+            return 3
+
+    class B4(B3, B2):
+        def v(self):
+            return 4
+
+    class MI7(B4, MI6):
+        def __init__(self, i):
+            B4.__init__(self)
+            MI6.__init__(self, i)
+
+    class MI8(MI6, B3):
+        def __init__(self, i):
+            MI6.__init__(self, i)
+            B3.__init__(self)
+
+    class MI8b(B3, MI6):
+        def __init__(self, i):
+            B3.__init__(self)
+            MI6.__init__(self, i)
+
+    mi1 = MI1(1, 2)
+    assert mi1.foo() == 1
+    assert mi1.bar() == 2
+
+    mi2 = MI2(3, 4)
+    assert mi2.v() == 1
+    assert mi2.foo() == 3
+    assert mi2.bar() == 4
+
+    mi3 = MI3(5, 6)
+    assert mi3.v() == 1
+    assert mi3.foo() == 5
+    assert mi3.bar() == 6
+
+    mi4 = MI4(7, 8)
+    assert mi4.v() == 1
+    assert mi4.foo() == 7
+    assert mi4.bar() == 8
+
+    mi5 = MI5(10, 11)
+    assert mi5.v() == 1
+    assert mi5.foo() == 10
+    assert mi5.bar() == 11
+
+    mi6 = MI6(12)
+    assert mi6.v() == 1
+    assert mi6.bar() == 12
+
+    mi7 = MI7(13)
+    assert mi7.v() == 4
+    assert mi7.bar() == 13
+
+    mi8 = MI8(14)
+    assert mi8.v() == 1
+    assert mi8.bar() == 14
+
+    mi8b = MI8b(15)
+    assert mi8b.v() == 3
+    assert mi8b.bar() == 15
+
+
+def test_multiple_inheritance_python_many_bases():
+
+    class MIMany14(m.BaseN1, m.BaseN2, m.BaseN3, m.BaseN4):
+        def __init__(self):
+            m.BaseN1.__init__(self, 1)
+            m.BaseN2.__init__(self, 2)
+            m.BaseN3.__init__(self, 3)
+            m.BaseN4.__init__(self, 4)
+
+    class MIMany58(m.BaseN5, m.BaseN6, m.BaseN7, m.BaseN8):
+        def __init__(self):
+            m.BaseN5.__init__(self, 5)
+            m.BaseN6.__init__(self, 6)
+            m.BaseN7.__init__(self, 7)
+            m.BaseN8.__init__(self, 8)
+
+    class MIMany916(m.BaseN9, m.BaseN10, m.BaseN11, m.BaseN12, m.BaseN13, m.BaseN14, m.BaseN15,
+                    m.BaseN16):
+        def __init__(self):
+            m.BaseN9.__init__(self, 9)
+            m.BaseN10.__init__(self, 10)
+            m.BaseN11.__init__(self, 11)
+            m.BaseN12.__init__(self, 12)
+            m.BaseN13.__init__(self, 13)
+            m.BaseN14.__init__(self, 14)
+            m.BaseN15.__init__(self, 15)
+            m.BaseN16.__init__(self, 16)
+
+    class MIMany19(MIMany14, MIMany58, m.BaseN9):
+        def __init__(self):
+            MIMany14.__init__(self)
+            MIMany58.__init__(self)
+            m.BaseN9.__init__(self, 9)
+
+    class MIMany117(MIMany14, MIMany58, MIMany916, m.BaseN17):
+        def __init__(self):
+            MIMany14.__init__(self)
+            MIMany58.__init__(self)
+            MIMany916.__init__(self)
+            m.BaseN17.__init__(self, 17)
+
+    # Inherits from 4 registered C++ classes: can fit in one pointer on any modern arch:
+    a = MIMany14()
+    for i in range(1, 4):
+        assert getattr(a, "f" + str(i))() == 2 * i
+
+    # Inherits from 8: requires 1/2 pointers worth of holder flags on 32/64-bit arch:
+    b = MIMany916()
+    for i in range(9, 16):
+        assert getattr(b, "f" + str(i))() == 2 * i
+
+    # Inherits from 9: requires >= 2 pointers worth of holder flags
+    c = MIMany19()
+    for i in range(1, 9):
+        assert getattr(c, "f" + str(i))() == 2 * i
+
+    # Inherits from 17: requires >= 3 pointers worth of holder flags
+    d = MIMany117()
+    for i in range(1, 17):
+        assert getattr(d, "f" + str(i))() == 2 * i
+
+
+def test_multiple_inheritance_virtbase():
+
+    class MITypePy(m.Base12a):
+        def __init__(self, i, j):
+            m.Base12a.__init__(self, i, j)
+
+    mt = MITypePy(3, 4)
+    assert mt.bar() == 4
+    assert m.bar_base2a(mt) == 4
+    assert m.bar_base2a_sharedptr(mt) == 4
+
+
+def test_mi_static_properties():
+    """Mixing bases with and without static properties should be possible
+     and the result should be independent of base definition order"""
+
+    for d in (m.VanillaStaticMix1(), m.VanillaStaticMix2()):
+        assert d.vanilla() == "Vanilla"
+        assert d.static_func1() == "WithStatic1"
+        assert d.static_func2() == "WithStatic2"
+        assert d.static_func() == d.__class__.__name__
+
+        m.WithStatic1.static_value1 = 1
+        m.WithStatic2.static_value2 = 2
+        assert d.static_value1 == 1
+        assert d.static_value2 == 2
+        assert d.static_value == 12
+
+        d.static_value1 = 0
+        assert d.static_value1 == 0
+        d.static_value2 = 0
+        assert d.static_value2 == 0
+        d.static_value = 0
+        assert d.static_value == 0
+
+
+# Requires PyPy 6+
+def test_mi_dynamic_attributes():
+    """Mixing bases with and without dynamic attribute support"""
+
+    for d in (m.VanillaDictMix1(), m.VanillaDictMix2()):
+        d.dynamic = 1
+        assert d.dynamic == 1
+
+
+def test_mi_unaligned_base():
+    """Returning an offset (non-first MI) base class pointer should recognize the instance"""
+
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    c = m.I801C()
+    d = m.I801D()
+    # + 4 below because we have the two instances, and each instance has offset base I801B2
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+    b1c = m.i801b1_c(c)
+    assert b1c is c
+    b2c = m.i801b2_c(c)
+    assert b2c is c
+    b1d = m.i801b1_d(d)
+    assert b1d is d
+    b2d = m.i801b2_d(d)
+    assert b2d is d
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4  # no extra instances
+    del c, b1c, b2c
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    del d, b1d, b2d
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+
+def test_mi_base_return():
+    """Tests returning an offset (non-first MI) base class pointer to a derived instance"""
+
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    c1 = m.i801c_b1()
+    assert type(c1) is m.I801C
+    assert c1.a == 1
+    assert c1.b == 2
+
+    d1 = m.i801d_b1()
+    assert type(d1) is m.I801D
+    assert d1.a == 1
+    assert d1.b == 2
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+
+    c2 = m.i801c_b2()
+    assert type(c2) is m.I801C
+    assert c2.a == 1
+    assert c2.b == 2
+
+    d2 = m.i801d_b2()
+    assert type(d2) is m.I801D
+    assert d2.a == 1
+    assert d2.b == 2
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 8
+
+    del c2
+    assert ConstructorStats.detail_reg_inst() == n_inst + 6
+    del c1, d1, d2
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    # Returning an unregistered derived type with a registered base; we won't
+    # pick up the derived type, obviously, but should still work (as an object
+    # of whatever type was returned).
+    e1 = m.i801e_c()
+    assert type(e1) is m.I801C
+    assert e1.a == 1
+    assert e1.b == 2
+
+    e2 = m.i801e_b2()
+    assert type(e2) is m.I801B2
+    assert e2.b == 2
+
+
+def test_diamond_inheritance():
+    """Tests that diamond inheritance works as expected (issue #959)"""
+
+    # Issue #959: this shouldn't segfault:
+    d = m.D()
+
+    # Make sure all the various distinct pointers are all recognized as registered instances:
+    assert d is d.c0()
+    assert d is d.c1()
+    assert d is d.b()
+    assert d is d.c0().b()
+    assert d is d.c1().b()
+    assert d is d.c0().c1().b().c0().b()
diff --git a/pybind11/tests/test_numpy_array.cpp b/pybind11/tests/test_numpy_array.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e37beb5a5c22661f39bd1651d41dac594d3ac2ba
--- /dev/null
+++ b/pybind11/tests/test_numpy_array.cpp
@@ -0,0 +1,388 @@
+/*
+    tests/test_numpy_array.cpp -- test core array functionality
+
+    Copyright (c) 2016 Ivan Smirnov <i.s.smirnov@gmail.com>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
+#include <cstdint>
+
+// Size / dtype checks.
+struct DtypeCheck {
+    py::dtype numpy{};
+    py::dtype pybind11{};
+};
+
+template <typename T>
+DtypeCheck get_dtype_check(const char* name) {
+    py::module np = py::module::import("numpy");
+    DtypeCheck check{};
+    check.numpy = np.attr("dtype")(np.attr(name));
+    check.pybind11 = py::dtype::of<T>();
+    return check;
+}
+
+std::vector<DtypeCheck> get_concrete_dtype_checks() {
+    return {
+        // Normalization
+        get_dtype_check<std::int8_t>("int8"),
+        get_dtype_check<std::uint8_t>("uint8"),
+        get_dtype_check<std::int16_t>("int16"),
+        get_dtype_check<std::uint16_t>("uint16"),
+        get_dtype_check<std::int32_t>("int32"),
+        get_dtype_check<std::uint32_t>("uint32"),
+        get_dtype_check<std::int64_t>("int64"),
+        get_dtype_check<std::uint64_t>("uint64")
+    };
+}
+
+struct DtypeSizeCheck {
+    std::string name{};
+    int size_cpp{};
+    int size_numpy{};
+    // For debugging.
+    py::dtype dtype{};
+};
+
+template <typename T>
+DtypeSizeCheck get_dtype_size_check() {
+    DtypeSizeCheck check{};
+    check.name = py::type_id<T>();
+    check.size_cpp = sizeof(T);
+    check.dtype = py::dtype::of<T>();
+    check.size_numpy = check.dtype.attr("itemsize").template cast<int>();
+    return check;
+}
+
+std::vector<DtypeSizeCheck> get_platform_dtype_size_checks() {
+    return {
+        get_dtype_size_check<short>(),
+        get_dtype_size_check<unsigned short>(),
+        get_dtype_size_check<int>(),
+        get_dtype_size_check<unsigned int>(),
+        get_dtype_size_check<long>(),
+        get_dtype_size_check<unsigned long>(),
+        get_dtype_size_check<long long>(),
+        get_dtype_size_check<unsigned long long>(),
+    };
+}
+
+// Arrays.
+using arr = py::array;
+using arr_t = py::array_t<uint16_t, 0>;
+static_assert(std::is_same<arr_t::value_type, uint16_t>::value, "");
+
+template<typename... Ix> arr data(const arr& a, Ix... index) {
+    return arr(a.nbytes() - a.offset_at(index...), (const uint8_t *) a.data(index...));
+}
+
+template<typename... Ix> arr data_t(const arr_t& a, Ix... index) {
+    return arr(a.size() - a.index_at(index...), a.data(index...));
+}
+
+template<typename... Ix> arr& mutate_data(arr& a, Ix... index) {
+    auto ptr = (uint8_t *) a.mutable_data(index...);
+    for (ssize_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
+        ptr[i] = (uint8_t) (ptr[i] * 2);
+    return a;
+}
+
+template<typename... Ix> arr_t& mutate_data_t(arr_t& a, Ix... index) {
+    auto ptr = a.mutable_data(index...);
+    for (ssize_t i = 0; i < a.size() - a.index_at(index...); i++)
+        ptr[i]++;
+    return a;
+}
+
+template<typename... Ix> ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
+template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(idx...)++; return a; }
+
+#define def_index_fn(name, type) \
+    sm.def(#name, [](type a) { return name(a); }); \
+    sm.def(#name, [](type a, int i) { return name(a, i); }); \
+    sm.def(#name, [](type a, int i, int j) { return name(a, i, j); }); \
+    sm.def(#name, [](type a, int i, int j, int k) { return name(a, i, j, k); });
+
+template <typename T, typename T2> py::handle auxiliaries(T &&r, T2 &&r2) {
+    if (r.ndim() != 2) throw std::domain_error("error: ndim != 2");
+    py::list l;
+    l.append(*r.data(0, 0));
+    l.append(*r2.mutable_data(0, 0));
+    l.append(r.data(0, 1) == r2.mutable_data(0, 1));
+    l.append(r.ndim());
+    l.append(r.itemsize());
+    l.append(r.shape(0));
+    l.append(r.shape(1));
+    l.append(r.size());
+    l.append(r.nbytes());
+    return l.release();
+}
+
+// note: declaration at local scope would create a dangling reference!
+static int data_i = 42;
+
+TEST_SUBMODULE(numpy_array, sm) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
+
+    // test_dtypes
+    py::class_<DtypeCheck>(sm, "DtypeCheck")
+        .def_readonly("numpy", &DtypeCheck::numpy)
+        .def_readonly("pybind11", &DtypeCheck::pybind11)
+        .def("__repr__", [](const DtypeCheck& self) {
+            return py::str("<DtypeCheck numpy={} pybind11={}>").format(
+                self.numpy, self.pybind11);
+        });
+    sm.def("get_concrete_dtype_checks", &get_concrete_dtype_checks);
+
+    py::class_<DtypeSizeCheck>(sm, "DtypeSizeCheck")
+        .def_readonly("name", &DtypeSizeCheck::name)
+        .def_readonly("size_cpp", &DtypeSizeCheck::size_cpp)
+        .def_readonly("size_numpy", &DtypeSizeCheck::size_numpy)
+        .def("__repr__", [](const DtypeSizeCheck& self) {
+            return py::str("<DtypeSizeCheck name='{}' size_cpp={} size_numpy={} dtype={}>").format(
+                self.name, self.size_cpp, self.size_numpy, self.dtype);
+        });
+    sm.def("get_platform_dtype_size_checks", &get_platform_dtype_size_checks);
+
+    // test_array_attributes
+    sm.def("ndim", [](const arr& a) { return a.ndim(); });
+    sm.def("shape", [](const arr& a) { return arr(a.ndim(), a.shape()); });
+    sm.def("shape", [](const arr& a, ssize_t dim) { return a.shape(dim); });
+    sm.def("strides", [](const arr& a) { return arr(a.ndim(), a.strides()); });
+    sm.def("strides", [](const arr& a, ssize_t dim) { return a.strides(dim); });
+    sm.def("writeable", [](const arr& a) { return a.writeable(); });
+    sm.def("size", [](const arr& a) { return a.size(); });
+    sm.def("itemsize", [](const arr& a) { return a.itemsize(); });
+    sm.def("nbytes", [](const arr& a) { return a.nbytes(); });
+    sm.def("owndata", [](const arr& a) { return a.owndata(); });
+
+    // test_index_offset
+    def_index_fn(index_at, const arr&);
+    def_index_fn(index_at_t, const arr_t&);
+    def_index_fn(offset_at, const arr&);
+    def_index_fn(offset_at_t, const arr_t&);
+    // test_data
+    def_index_fn(data, const arr&);
+    def_index_fn(data_t, const arr_t&);
+    // test_mutate_data, test_mutate_readonly
+    def_index_fn(mutate_data, arr&);
+    def_index_fn(mutate_data_t, arr_t&);
+    def_index_fn(at_t, const arr_t&);
+    def_index_fn(mutate_at_t, arr_t&);
+
+    // test_make_c_f_array
+    sm.def("make_f_array", [] { return py::array_t<float>({ 2, 2 }, { 4, 8 }); });
+    sm.def("make_c_array", [] { return py::array_t<float>({ 2, 2 }, { 8, 4 }); });
+
+    // test_empty_shaped_array
+    sm.def("make_empty_shaped_array", [] { return py::array(py::dtype("f"), {}, {}); });
+    // test numpy scalars (empty shape, ndim==0)
+    sm.def("scalar_int", []() { return py::array(py::dtype("i"), {}, {}, &data_i); });
+
+    // test_wrap
+    sm.def("wrap", [](py::array a) {
+        return py::array(
+            a.dtype(),
+            {a.shape(), a.shape() + a.ndim()},
+            {a.strides(), a.strides() + a.ndim()},
+            a.data(),
+            a
+        );
+    });
+
+    // test_numpy_view
+    struct ArrayClass {
+        int data[2] = { 1, 2 };
+        ArrayClass() { py::print("ArrayClass()"); }
+        ~ArrayClass() { py::print("~ArrayClass()"); }
+    };
+    py::class_<ArrayClass>(sm, "ArrayClass")
+        .def(py::init<>())
+        .def("numpy_view", [](py::object &obj) {
+            py::print("ArrayClass::numpy_view()");
+            ArrayClass &a = obj.cast<ArrayClass&>();
+            return py::array_t<int>({2}, {4}, a.data, obj);
+        }
+    );
+
+    // test_cast_numpy_int64_to_uint64
+    sm.def("function_taking_uint64", [](uint64_t) { });
+
+    // test_isinstance
+    sm.def("isinstance_untyped", [](py::object yes, py::object no) {
+        return py::isinstance<py::array>(yes) && !py::isinstance<py::array>(no);
+    });
+    sm.def("isinstance_typed", [](py::object o) {
+        return py::isinstance<py::array_t<double>>(o) && !py::isinstance<py::array_t<int>>(o);
+    });
+
+    // test_constructors
+    sm.def("default_constructors", []() {
+        return py::dict(
+            "array"_a=py::array(),
+            "array_t<int32>"_a=py::array_t<std::int32_t>(),
+            "array_t<double>"_a=py::array_t<double>()
+        );
+    });
+    sm.def("converting_constructors", [](py::object o) {
+        return py::dict(
+            "array"_a=py::array(o),
+            "array_t<int32>"_a=py::array_t<std::int32_t>(o),
+            "array_t<double>"_a=py::array_t<double>(o)
+        );
+    });
+
+    // test_overload_resolution
+    sm.def("overloaded", [](py::array_t<double>) { return "double"; });
+    sm.def("overloaded", [](py::array_t<float>) { return "float"; });
+    sm.def("overloaded", [](py::array_t<int>) { return "int"; });
+    sm.def("overloaded", [](py::array_t<unsigned short>) { return "unsigned short"; });
+    sm.def("overloaded", [](py::array_t<long long>) { return "long long"; });
+    sm.def("overloaded", [](py::array_t<std::complex<double>>) { return "double complex"; });
+    sm.def("overloaded", [](py::array_t<std::complex<float>>) { return "float complex"; });
+
+    sm.def("overloaded2", [](py::array_t<std::complex<double>>) { return "double complex"; });
+    sm.def("overloaded2", [](py::array_t<double>) { return "double"; });
+    sm.def("overloaded2", [](py::array_t<std::complex<float>>) { return "float complex"; });
+    sm.def("overloaded2", [](py::array_t<float>) { return "float"; });
+
+    // Only accept the exact types:
+    sm.def("overloaded3", [](py::array_t<int>) { return "int"; }, py::arg().noconvert());
+    sm.def("overloaded3", [](py::array_t<double>) { return "double"; }, py::arg().noconvert());
+
+    // Make sure we don't do unsafe coercion (e.g. float to int) when not using forcecast, but
+    // rather that float gets converted via the safe (conversion to double) overload:
+    sm.def("overloaded4", [](py::array_t<long long, 0>) { return "long long"; });
+    sm.def("overloaded4", [](py::array_t<double, 0>) { return "double"; });
+
+    // But we do allow conversion to int if forcecast is enabled (but only if no overload matches
+    // without conversion)
+    sm.def("overloaded5", [](py::array_t<unsigned int>) { return "unsigned int"; });
+    sm.def("overloaded5", [](py::array_t<double>) { return "double"; });
+
+    // test_greedy_string_overload
+    // Issue 685: ndarray shouldn't go to std::string overload
+    sm.def("issue685", [](std::string) { return "string"; });
+    sm.def("issue685", [](py::array) { return "array"; });
+    sm.def("issue685", [](py::object) { return "other"; });
+
+    // test_array_unchecked_fixed_dims
+    sm.def("proxy_add2", [](py::array_t<double> a, double v) {
+        auto r = a.mutable_unchecked<2>();
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                r(i, j) += v;
+    }, py::arg().noconvert(), py::arg());
+
+    sm.def("proxy_init3", [](double start) {
+        py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked<3>();
+        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t k = 0; k < r.shape(2); k++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_init3F", [](double start) {
+        py::array_t<double, py::array::f_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked<3>();
+        for (ssize_t k = 0; k < r.shape(2); k++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_squared_L2_norm", [](py::array_t<double> a) {
+        auto r = a.unchecked<1>();
+        double sumsq = 0;
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            sumsq += r[i] * r(i); // Either notation works for a 1D array
+        return sumsq;
+    });
+
+    sm.def("proxy_auxiliaries2", [](py::array_t<double> a) {
+        auto r = a.unchecked<2>();
+        auto r2 = a.mutable_unchecked<2>();
+        return auxiliaries(r, r2);
+    });
+
+    // test_array_unchecked_dyn_dims
+    // Same as the above, but without a compile-time dimensions specification:
+    sm.def("proxy_add2_dyn", [](py::array_t<double> a, double v) {
+        auto r = a.mutable_unchecked();
+        if (r.ndim() != 2) throw std::domain_error("error: ndim != 2");
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                r(i, j) += v;
+    }, py::arg().noconvert(), py::arg());
+    sm.def("proxy_init3_dyn", [](double start) {
+        py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked();
+        if (r.ndim() != 3) throw std::domain_error("error: ndim != 3");
+        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t k = 0; k < r.shape(2); k++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_auxiliaries2_dyn", [](py::array_t<double> a) {
+        return auxiliaries(a.unchecked(), a.mutable_unchecked());
+    });
+
+    sm.def("array_auxiliaries2", [](py::array_t<double> a) {
+        return auxiliaries(a, a);
+    });
+
+    // test_array_failures
+    // Issue #785: Uninformative "Unknown internal error" exception when constructing array from empty object:
+    sm.def("array_fail_test", []() { return py::array(py::object()); });
+    sm.def("array_t_fail_test", []() { return py::array_t<double>(py::object()); });
+    // Make sure the error from numpy is being passed through:
+    sm.def("array_fail_test_negative_size", []() { int c = 0; return py::array(-1, &c); });
+
+    // test_initializer_list
+    // Issue (unnumbered; reported in #788): regression: initializer lists can be ambiguous
+    sm.def("array_initializer_list1", []() { return py::array_t<float>(1); }); // { 1 } also works, but clang warns about it
+    sm.def("array_initializer_list2", []() { return py::array_t<float>({ 1, 2 }); });
+    sm.def("array_initializer_list3", []() { return py::array_t<float>({ 1, 2, 3 }); });
+    sm.def("array_initializer_list4", []() { return py::array_t<float>({ 1, 2, 3, 4 }); });
+
+    // test_array_resize
+    // reshape array to 2D without changing size
+    sm.def("array_reshape2", [](py::array_t<double> a) {
+        const ssize_t dim_sz = (ssize_t)std::sqrt(a.size());
+        if (dim_sz * dim_sz != a.size())
+            throw std::domain_error("array_reshape2: input array total size is not a squared integer");
+        a.resize({dim_sz, dim_sz});
+    });
+
+    // resize to 3D array with each dimension = N
+    sm.def("array_resize3", [](py::array_t<double> a, size_t N, bool refcheck) {
+        a.resize({N, N, N}, refcheck);
+    });
+
+    // test_array_create_and_resize
+    // return 2D array with Nrows = Ncols = N
+    sm.def("create_and_resize", [](size_t N) {
+        py::array_t<double> a;
+        a.resize({N, N});
+        std::fill(a.mutable_data(), a.mutable_data() + a.size(), 42.);
+        return a;
+    });
+
+    sm.def("index_using_ellipsis", [](py::array a) {
+        return a[py::make_tuple(0, py::ellipsis(), 0)];
+    });
+}
diff --git a/pybind11/tests/test_numpy_array.py b/pybind11/tests/test_numpy_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad3ca58c1af53e2b65ffed341a0512ebb5c20815
--- /dev/null
+++ b/pybind11/tests/test_numpy_array.py
@@ -0,0 +1,446 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import numpy_array as m
+
+np = pytest.importorskip("numpy")
+
+
+def test_dtypes():
+    # See issue #1328.
+    # - Platform-dependent sizes.
+    for size_check in m.get_platform_dtype_size_checks():
+        print(size_check)
+        assert size_check.size_cpp == size_check.size_numpy, size_check
+    # - Concrete sizes.
+    for check in m.get_concrete_dtype_checks():
+        print(check)
+        assert check.numpy == check.pybind11, check
+        if check.numpy.num != check.pybind11.num:
+            print("NOTE: typenum mismatch for {}: {} != {}".format(
+                check, check.numpy.num, check.pybind11.num))
+
+
+@pytest.fixture(scope='function')
+def arr():
+    return np.array([[1, 2, 3], [4, 5, 6]], '=u2')
+
+
+def test_array_attributes():
+    a = np.array(0, 'f8')
+    assert m.ndim(a) == 0
+    assert all(m.shape(a) == [])
+    assert all(m.strides(a) == [])
+    with pytest.raises(IndexError) as excinfo:
+        m.shape(a, 0)
+    assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
+    with pytest.raises(IndexError) as excinfo:
+        m.strides(a, 0)
+    assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
+    assert m.writeable(a)
+    assert m.size(a) == 1
+    assert m.itemsize(a) == 8
+    assert m.nbytes(a) == 8
+    assert m.owndata(a)
+
+    a = np.array([[1, 2, 3], [4, 5, 6]], 'u2').view()
+    a.flags.writeable = False
+    assert m.ndim(a) == 2
+    assert all(m.shape(a) == [2, 3])
+    assert m.shape(a, 0) == 2
+    assert m.shape(a, 1) == 3
+    assert all(m.strides(a) == [6, 2])
+    assert m.strides(a, 0) == 6
+    assert m.strides(a, 1) == 2
+    with pytest.raises(IndexError) as excinfo:
+        m.shape(a, 2)
+    assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
+    with pytest.raises(IndexError) as excinfo:
+        m.strides(a, 2)
+    assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
+    assert not m.writeable(a)
+    assert m.size(a) == 6
+    assert m.itemsize(a) == 2
+    assert m.nbytes(a) == 12
+    assert not m.owndata(a)
+
+
+@pytest.mark.parametrize('args, ret', [([], 0), ([0], 0), ([1], 3), ([0, 1], 1), ([1, 2], 5)])
+def test_index_offset(arr, args, ret):
+    assert m.index_at(arr, *args) == ret
+    assert m.index_at_t(arr, *args) == ret
+    assert m.offset_at(arr, *args) == ret * arr.dtype.itemsize
+    assert m.offset_at_t(arr, *args) == ret * arr.dtype.itemsize
+
+
+def test_dim_check_fail(arr):
+    for func in (m.index_at, m.index_at_t, m.offset_at, m.offset_at_t, m.data, m.data_t,
+                 m.mutate_data, m.mutate_data_t):
+        with pytest.raises(IndexError) as excinfo:
+            func(arr, 1, 2, 3)
+        assert str(excinfo.value) == 'too many indices for an array: 3 (ndim = 2)'
+
+
+@pytest.mark.parametrize('args, ret',
+                         [([], [1, 2, 3, 4, 5, 6]),
+                          ([1], [4, 5, 6]),
+                          ([0, 1], [2, 3, 4, 5, 6]),
+                          ([1, 2], [6])])
+def test_data(arr, args, ret):
+    from sys import byteorder
+    assert all(m.data_t(arr, *args) == ret)
+    assert all(m.data(arr, *args)[(0 if byteorder == 'little' else 1)::2] == ret)
+    assert all(m.data(arr, *args)[(1 if byteorder == 'little' else 0)::2] == 0)
+
+
+@pytest.mark.parametrize('dim', [0, 1, 3])
+def test_at_fail(arr, dim):
+    for func in m.at_t, m.mutate_at_t:
+        with pytest.raises(IndexError) as excinfo:
+            func(arr, *([0] * dim))
+        assert str(excinfo.value) == 'index dimension mismatch: {} (ndim = 2)'.format(dim)
+
+
+def test_at(arr):
+    assert m.at_t(arr, 0, 2) == 3
+    assert m.at_t(arr, 1, 0) == 4
+
+    assert all(m.mutate_at_t(arr, 0, 2).ravel() == [1, 2, 4, 4, 5, 6])
+    assert all(m.mutate_at_t(arr, 1, 0).ravel() == [1, 2, 4, 5, 5, 6])
+
+
+def test_mutate_readonly(arr):
+    arr.flags.writeable = False
+    for func, args in (m.mutate_data, ()), (m.mutate_data_t, ()), (m.mutate_at_t, (0, 0)):
+        with pytest.raises(ValueError) as excinfo:
+            func(arr, *args)
+        assert str(excinfo.value) == 'array is not writeable'
+
+
+def test_mutate_data(arr):
+    assert all(m.mutate_data(arr).ravel() == [2, 4, 6, 8, 10, 12])
+    assert all(m.mutate_data(arr).ravel() == [4, 8, 12, 16, 20, 24])
+    assert all(m.mutate_data(arr, 1).ravel() == [4, 8, 12, 32, 40, 48])
+    assert all(m.mutate_data(arr, 0, 1).ravel() == [4, 16, 24, 64, 80, 96])
+    assert all(m.mutate_data(arr, 1, 2).ravel() == [4, 16, 24, 64, 80, 192])
+
+    assert all(m.mutate_data_t(arr).ravel() == [5, 17, 25, 65, 81, 193])
+    assert all(m.mutate_data_t(arr).ravel() == [6, 18, 26, 66, 82, 194])
+    assert all(m.mutate_data_t(arr, 1).ravel() == [6, 18, 26, 67, 83, 195])
+    assert all(m.mutate_data_t(arr, 0, 1).ravel() == [6, 19, 27, 68, 84, 196])
+    assert all(m.mutate_data_t(arr, 1, 2).ravel() == [6, 19, 27, 68, 84, 197])
+
+
+def test_bounds_check(arr):
+    for func in (m.index_at, m.index_at_t, m.data, m.data_t,
+                 m.mutate_data, m.mutate_data_t, m.at_t, m.mutate_at_t):
+        with pytest.raises(IndexError) as excinfo:
+            func(arr, 2, 0)
+        assert str(excinfo.value) == 'index 2 is out of bounds for axis 0 with size 2'
+        with pytest.raises(IndexError) as excinfo:
+            func(arr, 0, 4)
+        assert str(excinfo.value) == 'index 4 is out of bounds for axis 1 with size 3'
+
+
+def test_make_c_f_array():
+    assert m.make_c_array().flags.c_contiguous
+    assert not m.make_c_array().flags.f_contiguous
+    assert m.make_f_array().flags.f_contiguous
+    assert not m.make_f_array().flags.c_contiguous
+
+
+def test_make_empty_shaped_array():
+    m.make_empty_shaped_array()
+
+    # empty shape means numpy scalar, PEP 3118
+    assert m.scalar_int().ndim == 0
+    assert m.scalar_int().shape == ()
+    assert m.scalar_int() == 42
+
+
+def test_wrap():
+    def assert_references(a, b, base=None):
+        from distutils.version import LooseVersion
+        if base is None:
+            base = a
+        assert a is not b
+        assert a.__array_interface__['data'][0] == b.__array_interface__['data'][0]
+        assert a.shape == b.shape
+        assert a.strides == b.strides
+        assert a.flags.c_contiguous == b.flags.c_contiguous
+        assert a.flags.f_contiguous == b.flags.f_contiguous
+        assert a.flags.writeable == b.flags.writeable
+        assert a.flags.aligned == b.flags.aligned
+        if LooseVersion(np.__version__) >= LooseVersion("1.14.0"):
+            assert a.flags.writebackifcopy == b.flags.writebackifcopy
+        else:
+            assert a.flags.updateifcopy == b.flags.updateifcopy
+        assert np.all(a == b)
+        assert not b.flags.owndata
+        assert b.base is base
+        if a.flags.writeable and a.ndim == 2:
+            a[0, 0] = 1234
+            assert b[0, 0] == 1234
+
+    a1 = np.array([1, 2], dtype=np.int16)
+    assert a1.flags.owndata and a1.base is None
+    a2 = m.wrap(a1)
+    assert_references(a1, a2)
+
+    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='F')
+    assert a1.flags.owndata and a1.base is None
+    a2 = m.wrap(a1)
+    assert_references(a1, a2)
+
+    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='C')
+    a1.flags.writeable = False
+    a2 = m.wrap(a1)
+    assert_references(a1, a2)
+
+    a1 = np.random.random((4, 4, 4))
+    a2 = m.wrap(a1)
+    assert_references(a1, a2)
+
+    a1t = a1.transpose()
+    a2 = m.wrap(a1t)
+    assert_references(a1t, a2, a1)
+
+    a1d = a1.diagonal()
+    a2 = m.wrap(a1d)
+    assert_references(a1d, a2, a1)
+
+    a1m = a1[::-1, ::-1, ::-1]
+    a2 = m.wrap(a1m)
+    assert_references(a1m, a2, a1)
+
+
+def test_numpy_view(capture):
+    with capture:
+        ac = m.ArrayClass()
+        ac_view_1 = ac.numpy_view()
+        ac_view_2 = ac.numpy_view()
+        assert np.all(ac_view_1 == np.array([1, 2], dtype=np.int32))
+        del ac
+        pytest.gc_collect()
+    assert capture == """
+        ArrayClass()
+        ArrayClass::numpy_view()
+        ArrayClass::numpy_view()
+    """
+    ac_view_1[0] = 4
+    ac_view_1[1] = 3
+    assert ac_view_2[0] == 4
+    assert ac_view_2[1] == 3
+    with capture:
+        del ac_view_1
+        del ac_view_2
+        pytest.gc_collect()
+        pytest.gc_collect()
+    assert capture == """
+        ~ArrayClass()
+    """
+
+
+def test_cast_numpy_int64_to_uint64():
+    m.function_taking_uint64(123)
+    m.function_taking_uint64(np.uint64(123))
+
+
+def test_isinstance():
+    assert m.isinstance_untyped(np.array([1, 2, 3]), "not an array")
+    assert m.isinstance_typed(np.array([1.0, 2.0, 3.0]))
+
+
+def test_constructors():
+    defaults = m.default_constructors()
+    for a in defaults.values():
+        assert a.size == 0
+    assert defaults["array"].dtype == np.array([]).dtype
+    assert defaults["array_t<int32>"].dtype == np.int32
+    assert defaults["array_t<double>"].dtype == np.float64
+
+    results = m.converting_constructors([1, 2, 3])
+    for a in results.values():
+        np.testing.assert_array_equal(a, [1, 2, 3])
+    assert results["array"].dtype == np.int_
+    assert results["array_t<int32>"].dtype == np.int32
+    assert results["array_t<double>"].dtype == np.float64
+
+
+def test_overload_resolution(msg):
+    # Exact overload matches:
+    assert m.overloaded(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded(np.array([1], dtype='float32')) == 'float'
+    assert m.overloaded(np.array([1], dtype='ushort')) == 'unsigned short'
+    assert m.overloaded(np.array([1], dtype='intc')) == 'int'
+    assert m.overloaded(np.array([1], dtype='longlong')) == 'long long'
+    assert m.overloaded(np.array([1], dtype='complex')) == 'double complex'
+    assert m.overloaded(np.array([1], dtype='csingle')) == 'float complex'
+
+    # No exact match, should call first convertible version:
+    assert m.overloaded(np.array([1], dtype='uint8')) == 'double'
+
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded("not an array")
+    assert msg(excinfo.value) == """
+        overloaded(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[numpy.float64]) -> str
+            2. (arg0: numpy.ndarray[numpy.float32]) -> str
+            3. (arg0: numpy.ndarray[numpy.int32]) -> str
+            4. (arg0: numpy.ndarray[numpy.uint16]) -> str
+            5. (arg0: numpy.ndarray[numpy.int64]) -> str
+            6. (arg0: numpy.ndarray[numpy.complex128]) -> str
+            7. (arg0: numpy.ndarray[numpy.complex64]) -> str
+
+        Invoked with: 'not an array'
+    """
+
+    assert m.overloaded2(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
+    assert m.overloaded2(np.array([1], dtype='complex64')) == 'float complex'
+    assert m.overloaded2(np.array([1], dtype='complex128')) == 'double complex'
+    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
+
+    assert m.overloaded3(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded3(np.array([1], dtype='intc')) == 'int'
+    expected_exc = """
+        overloaded3(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[numpy.int32]) -> str
+            2. (arg0: numpy.ndarray[numpy.float64]) -> str
+
+        Invoked with: """
+
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='uintc'))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1], dtype='uint32'))
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='float32'))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1.], dtype='float32'))
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='complex'))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1. + 0.j]))
+
+    # Exact matches:
+    assert m.overloaded4(np.array([1], dtype='double')) == 'double'
+    assert m.overloaded4(np.array([1], dtype='longlong')) == 'long long'
+    # Non-exact matches requiring conversion.  Since float to integer isn't a
+    # save conversion, it should go to the double overload, but short can go to
+    # either (and so should end up on the first-registered, the long long).
+    assert m.overloaded4(np.array([1], dtype='float32')) == 'double'
+    assert m.overloaded4(np.array([1], dtype='short')) == 'long long'
+
+    assert m.overloaded5(np.array([1], dtype='double')) == 'double'
+    assert m.overloaded5(np.array([1], dtype='uintc')) == 'unsigned int'
+    assert m.overloaded5(np.array([1], dtype='float32')) == 'unsigned int'
+
+
+def test_greedy_string_overload():
+    """Tests fix for #685 - ndarray shouldn't go to std::string overload"""
+
+    assert m.issue685("abc") == "string"
+    assert m.issue685(np.array([97, 98, 99], dtype='b')) == "array"
+    assert m.issue685(123) == "other"
+
+
+def test_array_unchecked_fixed_dims(msg):
+    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+    m.proxy_add2(z1, 10)
+    assert np.all(z1 == [[11, 12], [13, 14]])
+
+    with pytest.raises(ValueError) as excinfo:
+        m.proxy_add2(np.array([1., 2, 3]), 5.0)
+    assert msg(excinfo.value) == "array has incorrect number of dimensions: 1; expected 2"
+
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    assert np.all(m.proxy_init3(3.0) == expect_c)
+    expect_f = np.transpose(expect_c)
+    assert np.all(m.proxy_init3F(3.0) == expect_f)
+
+    assert m.proxy_squared_L2_norm(np.array(range(6))) == 55
+    assert m.proxy_squared_L2_norm(np.array(range(6), dtype="float64")) == 55
+
+    assert m.proxy_auxiliaries2(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
+    assert m.proxy_auxiliaries2(z1) == m.array_auxiliaries2(z1)
+
+
+def test_array_unchecked_dyn_dims(msg):
+    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+    m.proxy_add2_dyn(z1, 10)
+    assert np.all(z1 == [[11, 12], [13, 14]])
+
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    assert np.all(m.proxy_init3_dyn(3.0) == expect_c)
+
+    assert m.proxy_auxiliaries2_dyn(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
+    assert m.proxy_auxiliaries2_dyn(z1) == m.array_auxiliaries2(z1)
+
+
+def test_array_failure():
+    with pytest.raises(ValueError) as excinfo:
+        m.array_fail_test()
+    assert str(excinfo.value) == 'cannot create a pybind11::array from a nullptr'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.array_t_fail_test()
+    assert str(excinfo.value) == 'cannot create a pybind11::array_t from a nullptr'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.array_fail_test_negative_size()
+    assert str(excinfo.value) == 'negative dimensions are not allowed'
+
+
+def test_initializer_list():
+    assert m.array_initializer_list1().shape == (1,)
+    assert m.array_initializer_list2().shape == (1, 2)
+    assert m.array_initializer_list3().shape == (1, 2, 3)
+    assert m.array_initializer_list4().shape == (1, 2, 3, 4)
+
+
+def test_array_resize(msg):
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float64')
+    m.array_reshape2(a)
+    assert(a.size == 9)
+    assert(np.all(a == [[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
+
+    # total size change should succced with refcheck off
+    m.array_resize3(a, 4, False)
+    assert(a.size == 64)
+    # ... and fail with refcheck on
+    try:
+        m.array_resize3(a, 3, True)
+    except ValueError as e:
+        assert(str(e).startswith("cannot resize an array"))
+    # transposed array doesn't own data
+    b = a.transpose()
+    try:
+        m.array_resize3(b, 3, False)
+    except ValueError as e:
+        assert(str(e).startswith("cannot resize this array: it does not own its data"))
+    # ... but reshape should be fine
+    m.array_reshape2(b)
+    assert(b.shape == (8, 8))
+
+
+@pytest.mark.xfail("env.PYPY")
+def test_array_create_and_resize(msg):
+    a = m.create_and_resize(2)
+    assert(a.size == 4)
+    assert(np.all(a == 42.))
+
+
+def test_index_using_ellipsis():
+    a = m.index_using_ellipsis(np.zeros((5, 6, 7)))
+    assert a.shape == (6,)
+
+
+@pytest.mark.xfail("env.PYPY")
+def test_dtype_refcount_leak():
+    from sys import getrefcount
+    dtype = np.dtype(np.float_)
+    a = np.array([1], dtype=dtype)
+    before = getrefcount(dtype)
+    m.ndim(a)
+    after = getrefcount(dtype)
+    assert after == before
diff --git a/pybind11/tests/test_numpy_dtypes.cpp b/pybind11/tests/test_numpy_dtypes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..467e0253f7eb422da4fff3b4db7e4836fc2c11f2
--- /dev/null
+++ b/pybind11/tests/test_numpy_dtypes.cpp
@@ -0,0 +1,474 @@
+/*
+  tests/test_numpy_dtypes.cpp -- Structured and compound NumPy dtypes
+
+  Copyright (c) 2016 Ivan Smirnov
+
+  All rights reserved. Use of this source code is governed by a
+  BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/numpy.h>
+
+#ifdef __GNUC__
+#define PYBIND11_PACKED(cls) cls __attribute__((__packed__))
+#else
+#define PYBIND11_PACKED(cls) __pragma(pack(push, 1)) cls __pragma(pack(pop))
+#endif
+
+namespace py = pybind11;
+
+struct SimpleStruct {
+    bool bool_;
+    uint32_t uint_;
+    float float_;
+    long double ldbl_;
+};
+
+std::ostream& operator<<(std::ostream& os, const SimpleStruct& v) {
+    return os << "s:" << v.bool_ << "," << v.uint_ << "," << v.float_ << "," << v.ldbl_;
+}
+
+struct SimpleStructReordered {
+    bool bool_;
+    float float_;
+    uint32_t uint_;
+    long double ldbl_;
+};
+
+PYBIND11_PACKED(struct PackedStruct {
+    bool bool_;
+    uint32_t uint_;
+    float float_;
+    long double ldbl_;
+});
+
+std::ostream& operator<<(std::ostream& os, const PackedStruct& v) {
+    return os << "p:" << v.bool_ << "," << v.uint_ << "," << v.float_ << "," << v.ldbl_;
+}
+
+PYBIND11_PACKED(struct NestedStruct {
+    SimpleStruct a;
+    PackedStruct b;
+});
+
+std::ostream& operator<<(std::ostream& os, const NestedStruct& v) {
+    return os << "n:a=" << v.a << ";b=" << v.b;
+}
+
+struct PartialStruct {
+    bool bool_;
+    uint32_t uint_;
+    float float_;
+    uint64_t dummy2;
+    long double ldbl_;
+};
+
+struct PartialNestedStruct {
+    uint64_t dummy1;
+    PartialStruct a;
+    uint64_t dummy2;
+};
+
+struct UnboundStruct { };
+
+struct StringStruct {
+    char a[3];
+    std::array<char, 3> b;
+};
+
+struct ComplexStruct {
+    std::complex<float> cflt;
+    std::complex<double> cdbl;
+};
+
+std::ostream& operator<<(std::ostream& os, const ComplexStruct& v) {
+    return os << "c:" << v.cflt << "," << v.cdbl;
+}
+
+struct ArrayStruct {
+    char a[3][4];
+    int32_t b[2];
+    std::array<uint8_t, 3> c;
+    std::array<float, 2> d[4];
+};
+
+PYBIND11_PACKED(struct StructWithUglyNames {
+    int8_t __x__;
+    uint64_t __y__;
+});
+
+enum class E1 : int64_t { A = -1, B = 1 };
+enum E2 : uint8_t { X = 1, Y = 2 };
+
+PYBIND11_PACKED(struct EnumStruct {
+    E1 e1;
+    E2 e2;
+});
+
+std::ostream& operator<<(std::ostream& os, const StringStruct& v) {
+    os << "a='";
+    for (size_t i = 0; i < 3 && v.a[i]; i++) os << v.a[i];
+    os << "',b='";
+    for (size_t i = 0; i < 3 && v.b[i]; i++) os << v.b[i];
+    return os << "'";
+}
+
+std::ostream& operator<<(std::ostream& os, const ArrayStruct& v) {
+    os << "a={";
+    for (int i = 0; i < 3; i++) {
+        if (i > 0)
+            os << ',';
+        os << '{';
+        for (int j = 0; j < 3; j++)
+            os << v.a[i][j] << ',';
+        os << v.a[i][3] << '}';
+    }
+    os << "},b={" << v.b[0] << ',' << v.b[1];
+    os << "},c={" << int(v.c[0]) << ',' << int(v.c[1]) << ',' << int(v.c[2]);
+    os << "},d={";
+    for (int i = 0; i < 4; i++) {
+        if (i > 0)
+            os << ',';
+        os << '{' << v.d[i][0] << ',' << v.d[i][1] << '}';
+    }
+    return os << '}';
+}
+
+std::ostream& operator<<(std::ostream& os, const EnumStruct& v) {
+    return os << "e1=" << (v.e1 == E1::A ? "A" : "B") << ",e2=" << (v.e2 == E2::X ? "X" : "Y");
+}
+
+template <typename T>
+py::array mkarray_via_buffer(size_t n) {
+    return py::array(py::buffer_info(nullptr, sizeof(T),
+                                     py::format_descriptor<T>::format(),
+                                     1, { n }, { sizeof(T) }));
+}
+
+#define SET_TEST_VALS(s, i) do { \
+    s.bool_ = (i) % 2 != 0; \
+    s.uint_ = (uint32_t) (i); \
+    s.float_ = (float) (i) * 1.5f; \
+    s.ldbl_ = (long double) (i) * -2.5L; } while (0)
+
+template <typename S>
+py::array_t<S, 0> create_recarray(size_t n) {
+    auto arr = mkarray_via_buffer<S>(n);
+    auto req = arr.request();
+    auto ptr = static_cast<S*>(req.ptr);
+    for (size_t i = 0; i < n; i++) {
+        SET_TEST_VALS(ptr[i], i);
+    }
+    return arr;
+}
+
+template <typename S>
+py::list print_recarray(py::array_t<S, 0> arr) {
+    const auto req = arr.request();
+    const auto ptr = static_cast<S*>(req.ptr);
+    auto l = py::list();
+    for (ssize_t i = 0; i < req.size; i++) {
+        std::stringstream ss;
+        ss << ptr[i];
+        l.append(py::str(ss.str()));
+    }
+    return l;
+}
+
+py::array_t<int32_t, 0> test_array_ctors(int i) {
+    using arr_t = py::array_t<int32_t, 0>;
+
+    std::vector<int32_t> data { 1, 2, 3, 4, 5, 6 };
+    std::vector<ssize_t> shape { 3, 2 };
+    std::vector<ssize_t> strides { 8, 4 };
+
+    auto ptr = data.data();
+    auto vptr = (void *) ptr;
+    auto dtype = py::dtype("int32");
+
+    py::buffer_info buf_ndim1(vptr, 4, "i", 6);
+    py::buffer_info buf_ndim1_null(nullptr, 4, "i", 6);
+    py::buffer_info buf_ndim2(vptr, 4, "i", 2, shape, strides);
+    py::buffer_info buf_ndim2_null(nullptr, 4, "i", 2, shape, strides);
+
+    auto fill = [](py::array arr) {
+        auto req = arr.request();
+        for (int i = 0; i < 6; i++) ((int32_t *) req.ptr)[i] = i + 1;
+        return arr;
+    };
+
+    switch (i) {
+    // shape: (3, 2)
+    case 10: return arr_t(shape, strides, ptr);
+    case 11: return py::array(shape, strides, ptr);
+    case 12: return py::array(dtype, shape, strides, vptr);
+    case 13: return arr_t(shape, ptr);
+    case 14: return py::array(shape, ptr);
+    case 15: return py::array(dtype, shape, vptr);
+    case 16: return arr_t(buf_ndim2);
+    case 17: return py::array(buf_ndim2);
+    // shape: (3, 2) - post-fill
+    case 20: return fill(arr_t(shape, strides));
+    case 21: return py::array(shape, strides, ptr); // can't have nullptr due to templated ctor
+    case 22: return fill(py::array(dtype, shape, strides));
+    case 23: return fill(arr_t(shape));
+    case 24: return py::array(shape, ptr); // can't have nullptr due to templated ctor
+    case 25: return fill(py::array(dtype, shape));
+    case 26: return fill(arr_t(buf_ndim2_null));
+    case 27: return fill(py::array(buf_ndim2_null));
+    // shape: (6, )
+    case 30: return arr_t(6, ptr);
+    case 31: return py::array(6, ptr);
+    case 32: return py::array(dtype, 6, vptr);
+    case 33: return arr_t(buf_ndim1);
+    case 34: return py::array(buf_ndim1);
+    // shape: (6, )
+    case 40: return fill(arr_t(6));
+    case 41: return py::array(6, ptr);  // can't have nullptr due to templated ctor
+    case 42: return fill(py::array(dtype, 6));
+    case 43: return fill(arr_t(buf_ndim1_null));
+    case 44: return fill(py::array(buf_ndim1_null));
+    }
+    return arr_t();
+}
+
+py::list test_dtype_ctors() {
+    py::list list;
+    list.append(py::dtype("int32"));
+    list.append(py::dtype(std::string("float64")));
+    list.append(py::dtype::from_args(py::str("bool")));
+    py::list names, offsets, formats;
+    py::dict dict;
+    names.append(py::str("a")); names.append(py::str("b")); dict["names"] = names;
+    offsets.append(py::int_(1)); offsets.append(py::int_(10)); dict["offsets"] = offsets;
+    formats.append(py::dtype("int32")); formats.append(py::dtype("float64")); dict["formats"] = formats;
+    dict["itemsize"] = py::int_(20);
+    list.append(py::dtype::from_args(dict));
+    list.append(py::dtype(names, formats, offsets, 20));
+    list.append(py::dtype(py::buffer_info((void *) 0, sizeof(unsigned int), "I", 1)));
+    list.append(py::dtype(py::buffer_info((void *) 0, 0, "T{i:a:f:b:}", 1)));
+    return list;
+}
+
+struct A {};
+struct B {};
+
+TEST_SUBMODULE(numpy_dtypes, m) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
+
+    // typeinfo may be registered before the dtype descriptor for scalar casts to work...
+    py::class_<SimpleStruct>(m, "SimpleStruct");
+
+    PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_);
+    PYBIND11_NUMPY_DTYPE(SimpleStructReordered, bool_, uint_, float_, ldbl_);
+    PYBIND11_NUMPY_DTYPE(PackedStruct, bool_, uint_, float_, ldbl_);
+    PYBIND11_NUMPY_DTYPE(NestedStruct, a, b);
+    PYBIND11_NUMPY_DTYPE(PartialStruct, bool_, uint_, float_, ldbl_);
+    PYBIND11_NUMPY_DTYPE(PartialNestedStruct, a);
+    PYBIND11_NUMPY_DTYPE(StringStruct, a, b);
+    PYBIND11_NUMPY_DTYPE(ArrayStruct, a, b, c, d);
+    PYBIND11_NUMPY_DTYPE(EnumStruct, e1, e2);
+    PYBIND11_NUMPY_DTYPE(ComplexStruct, cflt, cdbl);
+
+    // ... or after
+    py::class_<PackedStruct>(m, "PackedStruct");
+
+    PYBIND11_NUMPY_DTYPE_EX(StructWithUglyNames, __x__, "x", __y__, "y");
+
+    // If uncommented, this should produce a static_assert failure telling the user that the struct
+    // is not a POD type
+//    struct NotPOD { std::string v; NotPOD() : v("hi") {}; };
+//    PYBIND11_NUMPY_DTYPE(NotPOD, v);
+
+    // Check that dtypes can be registered programmatically, both from
+    // initializer lists of field descriptors and from other containers.
+    py::detail::npy_format_descriptor<A>::register_dtype(
+        {}
+    );
+    py::detail::npy_format_descriptor<B>::register_dtype(
+        std::vector<py::detail::field_descriptor>{}
+    );
+
+    // test_recarray, test_scalar_conversion
+    m.def("create_rec_simple", &create_recarray<SimpleStruct>);
+    m.def("create_rec_packed", &create_recarray<PackedStruct>);
+    m.def("create_rec_nested", [](size_t n) { // test_signature
+        py::array_t<NestedStruct, 0> arr = mkarray_via_buffer<NestedStruct>(n);
+        auto req = arr.request();
+        auto ptr = static_cast<NestedStruct*>(req.ptr);
+        for (size_t i = 0; i < n; i++) {
+            SET_TEST_VALS(ptr[i].a, i);
+            SET_TEST_VALS(ptr[i].b, i + 1);
+        }
+        return arr;
+    });
+    m.def("create_rec_partial", &create_recarray<PartialStruct>);
+    m.def("create_rec_partial_nested", [](size_t n) {
+        py::array_t<PartialNestedStruct, 0> arr = mkarray_via_buffer<PartialNestedStruct>(n);
+        auto req = arr.request();
+        auto ptr = static_cast<PartialNestedStruct*>(req.ptr);
+        for (size_t i = 0; i < n; i++) {
+            SET_TEST_VALS(ptr[i].a, i);
+        }
+        return arr;
+    });
+    m.def("print_rec_simple", &print_recarray<SimpleStruct>);
+    m.def("print_rec_packed", &print_recarray<PackedStruct>);
+    m.def("print_rec_nested", &print_recarray<NestedStruct>);
+
+    // test_format_descriptors
+    m.def("get_format_unbound", []() { return py::format_descriptor<UnboundStruct>::format(); });
+    m.def("print_format_descriptors", []() {
+        py::list l;
+        for (const auto &fmt : {
+            py::format_descriptor<SimpleStruct>::format(),
+            py::format_descriptor<PackedStruct>::format(),
+            py::format_descriptor<NestedStruct>::format(),
+            py::format_descriptor<PartialStruct>::format(),
+            py::format_descriptor<PartialNestedStruct>::format(),
+            py::format_descriptor<StringStruct>::format(),
+            py::format_descriptor<ArrayStruct>::format(),
+            py::format_descriptor<EnumStruct>::format(),
+            py::format_descriptor<ComplexStruct>::format()
+        }) {
+            l.append(py::cast(fmt));
+        }
+        return l;
+    });
+
+    // test_dtype
+    m.def("print_dtypes", []() {
+        py::list l;
+        for (const py::handle &d : {
+            py::dtype::of<SimpleStruct>(),
+            py::dtype::of<PackedStruct>(),
+            py::dtype::of<NestedStruct>(),
+            py::dtype::of<PartialStruct>(),
+            py::dtype::of<PartialNestedStruct>(),
+            py::dtype::of<StringStruct>(),
+            py::dtype::of<ArrayStruct>(),
+            py::dtype::of<EnumStruct>(),
+            py::dtype::of<StructWithUglyNames>(),
+            py::dtype::of<ComplexStruct>()
+        })
+            l.append(py::str(d));
+        return l;
+    });
+    m.def("test_dtype_ctors", &test_dtype_ctors);
+    m.def("test_dtype_methods", []() {
+        py::list list;
+        auto dt1 = py::dtype::of<int32_t>();
+        auto dt2 = py::dtype::of<SimpleStruct>();
+        list.append(dt1); list.append(dt2);
+        list.append(py::bool_(dt1.has_fields())); list.append(py::bool_(dt2.has_fields()));
+        list.append(py::int_(dt1.itemsize())); list.append(py::int_(dt2.itemsize()));
+        return list;
+    });
+    struct TrailingPaddingStruct {
+        int32_t a;
+        char b;
+    };
+    PYBIND11_NUMPY_DTYPE(TrailingPaddingStruct, a, b);
+    m.def("trailing_padding_dtype", []() { return py::dtype::of<TrailingPaddingStruct>(); });
+
+    // test_string_array
+    m.def("create_string_array", [](bool non_empty) {
+        py::array_t<StringStruct, 0> arr = mkarray_via_buffer<StringStruct>(non_empty ? 4 : 0);
+        if (non_empty) {
+            auto req = arr.request();
+            auto ptr = static_cast<StringStruct*>(req.ptr);
+            for (ssize_t i = 0; i < req.size * req.itemsize; i++)
+                static_cast<char*>(req.ptr)[i] = 0;
+            ptr[1].a[0] = 'a'; ptr[1].b[0] = 'a';
+            ptr[2].a[0] = 'a'; ptr[2].b[0] = 'a';
+            ptr[3].a[0] = 'a'; ptr[3].b[0] = 'a';
+
+            ptr[2].a[1] = 'b'; ptr[2].b[1] = 'b';
+            ptr[3].a[1] = 'b'; ptr[3].b[1] = 'b';
+
+            ptr[3].a[2] = 'c'; ptr[3].b[2] = 'c';
+        }
+        return arr;
+    });
+    m.def("print_string_array", &print_recarray<StringStruct>);
+
+    // test_array_array
+    m.def("create_array_array", [](size_t n) {
+        py::array_t<ArrayStruct, 0> arr = mkarray_via_buffer<ArrayStruct>(n);
+        auto ptr = (ArrayStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            for (size_t j = 0; j < 3; j++)
+                for (size_t k = 0; k < 4; k++)
+                    ptr[i].a[j][k] = char('A' + (i * 100 + j * 10 + k) % 26);
+            for (size_t j = 0; j < 2; j++)
+                ptr[i].b[j] = int32_t(i * 1000 + j);
+            for (size_t j = 0; j < 3; j++)
+                ptr[i].c[j] = uint8_t(i * 10 + j);
+            for (size_t j = 0; j < 4; j++)
+                for (size_t k = 0; k < 2; k++)
+                    ptr[i].d[j][k] = float(i) * 100.0f + float(j) * 10.0f + float(k);
+        }
+        return arr;
+    });
+    m.def("print_array_array", &print_recarray<ArrayStruct>);
+
+    // test_enum_array
+    m.def("create_enum_array", [](size_t n) {
+        py::array_t<EnumStruct, 0> arr = mkarray_via_buffer<EnumStruct>(n);
+        auto ptr = (EnumStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            ptr[i].e1 = static_cast<E1>(-1 + ((int) i % 2) * 2);
+            ptr[i].e2 = static_cast<E2>(1 + (i % 2));
+        }
+        return arr;
+    });
+    m.def("print_enum_array", &print_recarray<EnumStruct>);
+
+    // test_complex_array
+    m.def("create_complex_array", [](size_t n) {
+        py::array_t<ComplexStruct, 0> arr = mkarray_via_buffer<ComplexStruct>(n);
+        auto ptr = (ComplexStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            ptr[i].cflt.real(float(i));
+            ptr[i].cflt.imag(float(i) + 0.25f);
+            ptr[i].cdbl.real(double(i) + 0.5);
+            ptr[i].cdbl.imag(double(i) + 0.75);
+        }
+        return arr;
+    });
+    m.def("print_complex_array", &print_recarray<ComplexStruct>);
+
+    // test_array_constructors
+    m.def("test_array_ctors", &test_array_ctors);
+
+    // test_compare_buffer_info
+    struct CompareStruct {
+        bool x;
+        uint32_t y;
+        float z;
+    };
+    PYBIND11_NUMPY_DTYPE(CompareStruct, x, y, z);
+    m.def("compare_buffer_info", []() {
+        py::list list;
+        list.append(py::bool_(py::detail::compare_buffer_info<float>::compare(py::buffer_info(nullptr, sizeof(float), "f", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<unsigned>::compare(py::buffer_info(nullptr, sizeof(int), "I", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<long>::compare(py::buffer_info(nullptr, sizeof(long), "l", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<long>::compare(py::buffer_info(nullptr, sizeof(long), sizeof(long) == sizeof(int) ? "i" : "q", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<CompareStruct>::compare(py::buffer_info(nullptr, sizeof(CompareStruct), "T{?:x:3xI:y:f:z:}", 1))));
+        return list;
+    });
+    m.def("buffer_to_dtype", [](py::buffer& buf) { return py::dtype(buf.request()); });
+
+    // test_scalar_conversion
+    m.def("f_simple", [](SimpleStruct s) { return s.uint_ * 10; });
+    m.def("f_packed", [](PackedStruct s) { return s.uint_ * 10; });
+    m.def("f_nested", [](NestedStruct s) { return s.a.uint_ * 10; });
+
+    // test_register_dtype
+    m.def("register_dtype", []() { PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_); });
+
+    // test_str_leak
+    m.def("dtype_wrapper", [](py::object d) { return py::dtype::from_args(std::move(d)); });
+}
diff --git a/pybind11/tests/test_numpy_dtypes.py b/pybind11/tests/test_numpy_dtypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..417d6f1cffbbd3a08857797c5c22f555d6f2dd33
--- /dev/null
+++ b/pybind11/tests/test_numpy_dtypes.py
@@ -0,0 +1,312 @@
+# -*- coding: utf-8 -*-
+import re
+
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import numpy_dtypes as m
+
+np = pytest.importorskip("numpy")
+
+
+@pytest.fixture(scope='module')
+def simple_dtype():
+    ld = np.dtype('longdouble')
+    return np.dtype({'names': ['bool_', 'uint_', 'float_', 'ldbl_'],
+                     'formats': ['?', 'u4', 'f4', 'f{}'.format(ld.itemsize)],
+                     'offsets': [0, 4, 8, (16 if ld.alignment > 4 else 12)]})
+
+
+@pytest.fixture(scope='module')
+def packed_dtype():
+    return np.dtype([('bool_', '?'), ('uint_', 'u4'), ('float_', 'f4'), ('ldbl_', 'g')])
+
+
+def dt_fmt():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+    return ("{{'names':['bool_','uint_','float_','ldbl_'],"
+            " 'formats':['?','" + e + "u4','" + e + "f4','" + e + "f{}'],"
+            " 'offsets':[0,4,8,{}], 'itemsize':{}}}")
+
+
+def simple_dtype_fmt():
+    ld = np.dtype('longdouble')
+    simple_ld_off = 12 + 4 * (ld.alignment > 4)
+    return dt_fmt().format(ld.itemsize, simple_ld_off, simple_ld_off + ld.itemsize)
+
+
+def packed_dtype_fmt():
+    from sys import byteorder
+    return "[('bool_', '?'), ('uint_', '{e}u4'), ('float_', '{e}f4'), ('ldbl_', '{e}f{}')]".format(
+        np.dtype('longdouble').itemsize, e='<' if byteorder == 'little' else '>')
+
+
+def partial_ld_offset():
+    return 12 + 4 * (np.dtype('uint64').alignment > 4) + 8 + 8 * (
+        np.dtype('longdouble').alignment > 8)
+
+
+def partial_dtype_fmt():
+    ld = np.dtype('longdouble')
+    partial_ld_off = partial_ld_offset()
+    return dt_fmt().format(ld.itemsize, partial_ld_off, partial_ld_off + ld.itemsize)
+
+
+def partial_nested_fmt():
+    ld = np.dtype('longdouble')
+    partial_nested_off = 8 + 8 * (ld.alignment > 8)
+    partial_ld_off = partial_ld_offset()
+    partial_nested_size = partial_nested_off * 2 + partial_ld_off + ld.itemsize
+    return "{{'names':['a'], 'formats':[{}], 'offsets':[{}], 'itemsize':{}}}".format(
+        partial_dtype_fmt(), partial_nested_off, partial_nested_size)
+
+
+def assert_equal(actual, expected_data, expected_dtype):
+    np.testing.assert_equal(actual, np.array(expected_data, dtype=expected_dtype))
+
+
+def test_format_descriptors():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.get_format_unbound()
+    assert re.match('^NumPy type info missing for .*UnboundStruct.*$', str(excinfo.value))
+
+    ld = np.dtype('longdouble')
+    ldbl_fmt = ('4x' if ld.alignment > 4 else '') + ld.char
+    ss_fmt = "^T{?:bool_:3xI:uint_:f:float_:" + ldbl_fmt + ":ldbl_:}"
+    dbl = np.dtype('double')
+    partial_fmt = ("^T{?:bool_:3xI:uint_:f:float_:" +
+                   str(4 * (dbl.alignment > 4) + dbl.itemsize + 8 * (ld.alignment > 8)) +
+                   "xg:ldbl_:}")
+    nested_extra = str(max(8, ld.alignment))
+    assert m.print_format_descriptors() == [
+        ss_fmt,
+        "^T{?:bool_:I:uint_:f:float_:g:ldbl_:}",
+        "^T{" + ss_fmt + ":a:^T{?:bool_:I:uint_:f:float_:g:ldbl_:}:b:}",
+        partial_fmt,
+        "^T{" + nested_extra + "x" + partial_fmt + ":a:" + nested_extra + "x}",
+        "^T{3s:a:3s:b:}",
+        "^T{(3)4s:a:(2)i:b:(3)B:c:1x(4, 2)f:d:}",
+        '^T{q:e1:B:e2:}',
+        '^T{Zf:cflt:Zd:cdbl:}'
+    ]
+
+
+def test_dtype(simple_dtype):
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+
+    assert m.print_dtypes() == [
+        simple_dtype_fmt(),
+        packed_dtype_fmt(),
+        "[('a', {}), ('b', {})]".format(simple_dtype_fmt(), packed_dtype_fmt()),
+        partial_dtype_fmt(),
+        partial_nested_fmt(),
+        "[('a', 'S3'), ('b', 'S3')]",
+        ("{{'names':['a','b','c','d'], " +
+         "'formats':[('S4', (3,)),('" + e + "i4', (2,)),('u1', (3,)),('" + e + "f4', (4, 2))], " +
+         "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e),
+        "[('e1', '" + e + "i8'), ('e2', 'u1')]",
+        "[('x', 'i1'), ('y', '" + e + "u8')]",
+        "[('cflt', '" + e + "c8'), ('cdbl', '" + e + "c16')]"
+    ]
+
+    d1 = np.dtype({'names': ['a', 'b'], 'formats': ['int32', 'float64'],
+                   'offsets': [1, 10], 'itemsize': 20})
+    d2 = np.dtype([('a', 'i4'), ('b', 'f4')])
+    assert m.test_dtype_ctors() == [np.dtype('int32'), np.dtype('float64'),
+                                    np.dtype('bool'), d1, d1, np.dtype('uint32'), d2]
+
+    assert m.test_dtype_methods() == [np.dtype('int32'), simple_dtype, False, True,
+                                      np.dtype('int32').itemsize, simple_dtype.itemsize]
+
+    assert m.trailing_padding_dtype() == m.buffer_to_dtype(np.zeros(1, m.trailing_padding_dtype()))
+
+
+def test_recarray(simple_dtype, packed_dtype):
+    elements = [(False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)]
+
+    for func, dtype in [(m.create_rec_simple, simple_dtype), (m.create_rec_packed, packed_dtype)]:
+        arr = func(0)
+        assert arr.dtype == dtype
+        assert_equal(arr, [], simple_dtype)
+        assert_equal(arr, [], packed_dtype)
+
+        arr = func(3)
+        assert arr.dtype == dtype
+        assert_equal(arr, elements, simple_dtype)
+        assert_equal(arr, elements, packed_dtype)
+
+        if dtype == simple_dtype:
+            assert m.print_rec_simple(arr) == [
+                "s:0,0,0,-0",
+                "s:1,1,1.5,-2.5",
+                "s:0,2,3,-5"
+            ]
+        else:
+            assert m.print_rec_packed(arr) == [
+                "p:0,0,0,-0",
+                "p:1,1,1.5,-2.5",
+                "p:0,2,3,-5"
+            ]
+
+    nested_dtype = np.dtype([('a', simple_dtype), ('b', packed_dtype)])
+
+    arr = m.create_rec_nested(0)
+    assert arr.dtype == nested_dtype
+    assert_equal(arr, [], nested_dtype)
+
+    arr = m.create_rec_nested(3)
+    assert arr.dtype == nested_dtype
+    assert_equal(arr, [((False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5)),
+                       ((True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)),
+                       ((False, 2, 3.0, -5.0), (True, 3, 4.5, -7.5))], nested_dtype)
+    assert m.print_rec_nested(arr) == [
+        "n:a=s:0,0,0,-0;b=p:1,1,1.5,-2.5",
+        "n:a=s:1,1,1.5,-2.5;b=p:0,2,3,-5",
+        "n:a=s:0,2,3,-5;b=p:1,3,4.5,-7.5"
+    ]
+
+    arr = m.create_rec_partial(3)
+    assert str(arr.dtype) == partial_dtype_fmt()
+    partial_dtype = arr.dtype
+    assert '' not in arr.dtype.fields
+    assert partial_dtype.itemsize > simple_dtype.itemsize
+    assert_equal(arr, elements, simple_dtype)
+    assert_equal(arr, elements, packed_dtype)
+
+    arr = m.create_rec_partial_nested(3)
+    assert str(arr.dtype) == partial_nested_fmt()
+    assert '' not in arr.dtype.fields
+    assert '' not in arr.dtype.fields['a'][0].fields
+    assert arr.dtype.itemsize > partial_dtype.itemsize
+    np.testing.assert_equal(arr['a'], m.create_rec_partial(3))
+
+
+def test_array_constructors():
+    data = np.arange(1, 7, dtype='int32')
+    for i in range(8):
+        np.testing.assert_array_equal(m.test_array_ctors(10 + i), data.reshape((3, 2)))
+        np.testing.assert_array_equal(m.test_array_ctors(20 + i), data.reshape((3, 2)))
+    for i in range(5):
+        np.testing.assert_array_equal(m.test_array_ctors(30 + i), data)
+        np.testing.assert_array_equal(m.test_array_ctors(40 + i), data)
+
+
+def test_string_array():
+    arr = m.create_string_array(True)
+    assert str(arr.dtype) == "[('a', 'S3'), ('b', 'S3')]"
+    assert m.print_string_array(arr) == [
+        "a='',b=''",
+        "a='a',b='a'",
+        "a='ab',b='ab'",
+        "a='abc',b='abc'"
+    ]
+    dtype = arr.dtype
+    assert arr['a'].tolist() == [b'', b'a', b'ab', b'abc']
+    assert arr['b'].tolist() == [b'', b'a', b'ab', b'abc']
+    arr = m.create_string_array(False)
+    assert dtype == arr.dtype
+
+
+def test_array_array():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+
+    arr = m.create_array_array(3)
+    assert str(arr.dtype) == (
+        "{{'names':['a','b','c','d'], " +
+        "'formats':[('S4', (3,)),('" + e + "i4', (2,)),('u1', (3,)),('{e}f4', (4, 2))], " +
+        "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e)
+    assert m.print_array_array(arr) == [
+        "a={{A,B,C,D},{K,L,M,N},{U,V,W,X}},b={0,1}," +
+        "c={0,1,2},d={{0,1},{10,11},{20,21},{30,31}}",
+        "a={{W,X,Y,Z},{G,H,I,J},{Q,R,S,T}},b={1000,1001}," +
+        "c={10,11,12},d={{100,101},{110,111},{120,121},{130,131}}",
+        "a={{S,T,U,V},{C,D,E,F},{M,N,O,P}},b={2000,2001}," +
+        "c={20,21,22},d={{200,201},{210,211},{220,221},{230,231}}",
+    ]
+    assert arr['a'].tolist() == [[b'ABCD', b'KLMN', b'UVWX'],
+                                 [b'WXYZ', b'GHIJ', b'QRST'],
+                                 [b'STUV', b'CDEF', b'MNOP']]
+    assert arr['b'].tolist() == [[0, 1], [1000, 1001], [2000, 2001]]
+    assert m.create_array_array(0).dtype == arr.dtype
+
+
+def test_enum_array():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+
+    arr = m.create_enum_array(3)
+    dtype = arr.dtype
+    assert dtype == np.dtype([('e1', e + 'i8'), ('e2', 'u1')])
+    assert m.print_enum_array(arr) == [
+        "e1=A,e2=X",
+        "e1=B,e2=Y",
+        "e1=A,e2=X"
+    ]
+    assert arr['e1'].tolist() == [-1, 1, -1]
+    assert arr['e2'].tolist() == [1, 2, 1]
+    assert m.create_enum_array(0).dtype == dtype
+
+
+def test_complex_array():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+
+    arr = m.create_complex_array(3)
+    dtype = arr.dtype
+    assert dtype == np.dtype([('cflt', e + 'c8'), ('cdbl', e + 'c16')])
+    assert m.print_complex_array(arr) == [
+        "c:(0,0.25),(0.5,0.75)",
+        "c:(1,1.25),(1.5,1.75)",
+        "c:(2,2.25),(2.5,2.75)"
+    ]
+    assert arr['cflt'].tolist() == [0.0 + 0.25j, 1.0 + 1.25j, 2.0 + 2.25j]
+    assert arr['cdbl'].tolist() == [0.5 + 0.75j, 1.5 + 1.75j, 2.5 + 2.75j]
+    assert m.create_complex_array(0).dtype == dtype
+
+
+def test_signature(doc):
+    assert doc(m.create_rec_nested) == \
+        "create_rec_nested(arg0: int) -> numpy.ndarray[NestedStruct]"
+
+
+def test_scalar_conversion():
+    n = 3
+    arrays = [m.create_rec_simple(n), m.create_rec_packed(n),
+              m.create_rec_nested(n), m.create_enum_array(n)]
+    funcs = [m.f_simple, m.f_packed, m.f_nested]
+
+    for i, func in enumerate(funcs):
+        for j, arr in enumerate(arrays):
+            if i == j and i < 2:
+                assert [func(arr[k]) for k in range(n)] == [k * 10 for k in range(n)]
+            else:
+                with pytest.raises(TypeError) as excinfo:
+                    func(arr[0])
+                assert 'incompatible function arguments' in str(excinfo.value)
+
+
+def test_register_dtype():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.register_dtype()
+    assert 'dtype is already registered' in str(excinfo.value)
+
+
+@pytest.mark.xfail("env.PYPY")
+def test_str_leak():
+    from sys import getrefcount
+    fmt = "f4"
+    pytest.gc_collect()
+    start = getrefcount(fmt)
+    d = m.dtype_wrapper(fmt)
+    assert d is np.dtype("f4")
+    del d
+    pytest.gc_collect()
+    assert getrefcount(fmt) == start
+
+
+def test_compare_buffer_info():
+    assert all(m.compare_buffer_info())
diff --git a/pybind11/tests/test_numpy_vectorize.cpp b/pybind11/tests/test_numpy_vectorize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a875a74b99e95285ad5733616ad3f2ff1d0b2900
--- /dev/null
+++ b/pybind11/tests/test_numpy_vectorize.cpp
@@ -0,0 +1,89 @@
+/*
+    tests/test_numpy_vectorize.cpp -- auto-vectorize functions over NumPy array
+    arguments
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/numpy.h>
+
+double my_func(int x, float y, double z) {
+    py::print("my_func(x:int={}, y:float={:.0f}, z:float={:.0f})"_s.format(x, y, z));
+    return (float) x*y*z;
+}
+
+TEST_SUBMODULE(numpy_vectorize, m) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
+
+    // test_vectorize, test_docs, test_array_collapse
+    // Vectorize all arguments of a function (though non-vector arguments are also allowed)
+    m.def("vectorized_func", py::vectorize(my_func));
+
+    // Vectorize a lambda function with a capture object (e.g. to exclude some arguments from the vectorization)
+    m.def("vectorized_func2",
+        [](py::array_t<int> x, py::array_t<float> y, float z) {
+            return py::vectorize([z](int x, float y) { return my_func(x, y, z); })(x, y);
+        }
+    );
+
+    // Vectorize a complex-valued function
+    m.def("vectorized_func3", py::vectorize(
+        [](std::complex<double> c) { return c * std::complex<double>(2.f); }
+    ));
+
+    // test_type_selection
+    // Numpy function which only accepts specific data types
+    m.def("selective_func", [](py::array_t<int, py::array::c_style>) { return "Int branch taken."; });
+    m.def("selective_func", [](py::array_t<float, py::array::c_style>) { return "Float branch taken."; });
+    m.def("selective_func", [](py::array_t<std::complex<float>, py::array::c_style>) { return "Complex float branch taken."; });
+
+
+    // test_passthrough_arguments
+    // Passthrough test: references and non-pod types should be automatically passed through (in the
+    // function definition below, only `b`, `d`, and `g` are vectorized):
+    struct NonPODClass {
+        NonPODClass(int v) : value{v} {}
+        int value;
+    };
+    py::class_<NonPODClass>(m, "NonPODClass").def(py::init<int>());
+    m.def("vec_passthrough", py::vectorize(
+        [](double *a, double b, py::array_t<double> c, const int &d, int &e, NonPODClass f, const double g) {
+            return *a + b + c.at(0) + d + e + f.value + g;
+        }
+    ));
+
+    // test_method_vectorization
+    struct VectorizeTestClass {
+        VectorizeTestClass(int v) : value{v} {};
+        float method(int x, float y) { return y + (float) (x + value); }
+        int value = 0;
+    };
+    py::class_<VectorizeTestClass> vtc(m, "VectorizeTestClass");
+    vtc .def(py::init<int>())
+        .def_readwrite("value", &VectorizeTestClass::value);
+
+    // Automatic vectorizing of methods
+    vtc.def("method", py::vectorize(&VectorizeTestClass::method));
+
+    // test_trivial_broadcasting
+    // Internal optimization test for whether the input is trivially broadcastable:
+    py::enum_<py::detail::broadcast_trivial>(m, "trivial")
+        .value("f_trivial", py::detail::broadcast_trivial::f_trivial)
+        .value("c_trivial", py::detail::broadcast_trivial::c_trivial)
+        .value("non_trivial", py::detail::broadcast_trivial::non_trivial);
+    m.def("vectorized_is_trivial", [](
+                py::array_t<int, py::array::forcecast> arg1,
+                py::array_t<float, py::array::forcecast> arg2,
+                py::array_t<double, py::array::forcecast> arg3
+                ) {
+        ssize_t ndim;
+        std::vector<ssize_t> shape;
+        std::array<py::buffer_info, 3> buffers {{ arg1.request(), arg2.request(), arg3.request() }};
+        return py::detail::broadcast(buffers, ndim, shape);
+    });
+}
diff --git a/pybind11/tests/test_numpy_vectorize.py b/pybind11/tests/test_numpy_vectorize.py
new file mode 100644
index 0000000000000000000000000000000000000000..54e44cd8d3f9630a4d76c419e449c8ce1e7cee59
--- /dev/null
+++ b/pybind11/tests/test_numpy_vectorize.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import numpy_vectorize as m
+
+np = pytest.importorskip("numpy")
+
+
+def test_vectorize(capture):
+    assert np.isclose(m.vectorized_func3(np.array(3 + 7j)), [6 + 14j])
+
+    for f in [m.vectorized_func, m.vectorized_func2]:
+        with capture:
+            assert np.isclose(f(1, 2, 3), 6)
+        assert capture == "my_func(x:int=1, y:float=2, z:float=3)"
+        with capture:
+            assert np.isclose(f(np.array(1), np.array(2), 3), 6)
+        assert capture == "my_func(x:int=1, y:float=2, z:float=3)"
+        with capture:
+            assert np.allclose(f(np.array([1, 3]), np.array([2, 4]), 3), [6, 36])
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=3)
+            my_func(x:int=3, y:float=4, z:float=3)
+        """
+        with capture:
+            a = np.array([[1, 2], [3, 4]], order='F')
+            b = np.array([[10, 20], [30, 40]], order='F')
+            c = 3
+            result = f(a, b, c)
+            assert np.allclose(result, a * b * c)
+            assert result.flags.f_contiguous
+        # All inputs are F order and full or singletons, so we the result is in col-major order:
+        assert capture == """
+            my_func(x:int=1, y:float=10, z:float=3)
+            my_func(x:int=3, y:float=30, z:float=3)
+            my_func(x:int=2, y:float=20, z:float=3)
+            my_func(x:int=4, y:float=40, z:float=3)
+        """
+        with capture:
+            a, b, c = np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=3)
+            my_func(x:int=3, y:float=4, z:float=3)
+            my_func(x:int=5, y:float=6, z:float=3)
+            my_func(x:int=7, y:float=8, z:float=3)
+            my_func(x:int=9, y:float=10, z:float=3)
+            my_func(x:int=11, y:float=12, z:float=3)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=2, y:float=3, z:float=2)
+            my_func(x:int=3, y:float=4, z:float=2)
+            my_func(x:int=4, y:float=2, z:float=2)
+            my_func(x:int=5, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=4, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=2, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=5, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F'), np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=2, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=5, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]])[::, ::2], np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F')[::, ::2], np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+
+
+def test_type_selection():
+    assert m.selective_func(np.array([1], dtype=np.int32)) == "Int branch taken."
+    assert m.selective_func(np.array([1.0], dtype=np.float32)) == "Float branch taken."
+    assert m.selective_func(np.array([1.0j], dtype=np.complex64)) == "Complex float branch taken."
+
+
+def test_docs(doc):
+    assert doc(m.vectorized_func) == """
+        vectorized_func(arg0: numpy.ndarray[numpy.int32], arg1: numpy.ndarray[numpy.float32], arg2: numpy.ndarray[numpy.float64]) -> object
+    """  # noqa: E501 line too long
+
+
+def test_trivial_broadcasting():
+    trivial, vectorized_is_trivial = m.trivial, m.vectorized_is_trivial
+
+    assert vectorized_is_trivial(1, 2, 3) == trivial.c_trivial
+    assert vectorized_is_trivial(np.array(1), np.array(2), 3) == trivial.c_trivial
+    assert vectorized_is_trivial(np.array([1, 3]), np.array([2, 4]), 3) == trivial.c_trivial
+    assert trivial.c_trivial == vectorized_is_trivial(
+        np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3)
+    assert vectorized_is_trivial(
+        np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2) == trivial.non_trivial
+    assert vectorized_is_trivial(
+        np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2) == trivial.non_trivial
+    z1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype='int32')
+    z2 = np.array(z1, dtype='float32')
+    z3 = np.array(z1, dtype='float64')
+    assert vectorized_is_trivial(z1, z2, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(1, z2, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(z1, 1, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(z1, z2, 1) == trivial.c_trivial
+    assert vectorized_is_trivial(z1[::2, ::2], 1, 1) == trivial.non_trivial
+    assert vectorized_is_trivial(1, 1, z1[::2, ::2]) == trivial.c_trivial
+    assert vectorized_is_trivial(1, 1, z3[::2, ::2]) == trivial.non_trivial
+    assert vectorized_is_trivial(z1, 1, z3[1::4, 1::4]) == trivial.c_trivial
+
+    y1 = np.array(z1, order='F')
+    y2 = np.array(y1)
+    y3 = np.array(y1)
+    assert vectorized_is_trivial(y1, y2, y3) == trivial.f_trivial
+    assert vectorized_is_trivial(y1, 1, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(1, y2, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(1, 1, y3) == trivial.f_trivial
+    assert vectorized_is_trivial(y1, z2, 1) == trivial.non_trivial
+    assert vectorized_is_trivial(z1[1::4, 1::4], y2, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(y1[1::4, 1::4], z2, 1) == trivial.c_trivial
+
+    assert m.vectorized_func(z1, z2, z3).flags.c_contiguous
+    assert m.vectorized_func(y1, y2, y3).flags.f_contiguous
+    assert m.vectorized_func(z1, 1, 1).flags.c_contiguous
+    assert m.vectorized_func(1, y2, 1).flags.f_contiguous
+    assert m.vectorized_func(z1[1::4, 1::4], y2, 1).flags.f_contiguous
+    assert m.vectorized_func(y1[1::4, 1::4], z2, 1).flags.c_contiguous
+
+
+def test_passthrough_arguments(doc):
+    assert doc(m.vec_passthrough) == (
+        "vec_passthrough(" + ", ".join([
+            "arg0: float",
+            "arg1: numpy.ndarray[numpy.float64]",
+            "arg2: numpy.ndarray[numpy.float64]",
+            "arg3: numpy.ndarray[numpy.int32]",
+            "arg4: int",
+            "arg5: m.numpy_vectorize.NonPODClass",
+            "arg6: numpy.ndarray[numpy.float64]"]) + ") -> object")
+
+    b = np.array([[10, 20, 30]], dtype='float64')
+    c = np.array([100, 200])  # NOT a vectorized argument
+    d = np.array([[1000], [2000], [3000]], dtype='int')
+    g = np.array([[1000000, 2000000, 3000000]], dtype='int')  # requires casting
+    assert np.all(
+        m.vec_passthrough(1, b, c, d, 10000, m.NonPODClass(100000), g) ==
+        np.array([[1111111, 2111121, 3111131],
+                  [1112111, 2112121, 3112131],
+                  [1113111, 2113121, 3113131]]))
+
+
+def test_method_vectorization():
+    o = m.VectorizeTestClass(3)
+    x = np.array([1, 2], dtype='int')
+    y = np.array([[10], [20]], dtype='float32')
+    assert np.all(o.method(x, y) == [[14, 15], [24, 25]])
+
+
+def test_array_collapse():
+    assert not isinstance(m.vectorized_func(1, 2, 3), np.ndarray)
+    assert not isinstance(m.vectorized_func(np.array(1), 2, 3), np.ndarray)
+    z = m.vectorized_func([1], 2, 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, )
+    z = m.vectorized_func(1, [[[2]]], 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, 1, 1)
diff --git a/pybind11/tests/test_opaque_types.cpp b/pybind11/tests/test_opaque_types.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d20d9a01c8592e844fb909b336fd5c8e969b9e0
--- /dev/null
+++ b/pybind11/tests/test_opaque_types.cpp
@@ -0,0 +1,67 @@
+/*
+    tests/test_opaque_types.cpp -- opaque types, passing void pointers
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/stl.h>
+#include <vector>
+
+// IMPORTANT: Disable internal pybind11 translation mechanisms for STL data structures
+//
+// This also deliberately doesn't use the below StringList type alias to test
+// that MAKE_OPAQUE can handle a type containing a `,`.  (The `std::allocator`
+// bit is just the default `std::vector` allocator).
+PYBIND11_MAKE_OPAQUE(std::vector<std::string, std::allocator<std::string>>);
+
+using StringList = std::vector<std::string, std::allocator<std::string>>;
+
+TEST_SUBMODULE(opaque_types, m) {
+    // test_string_list
+    py::class_<StringList>(m, "StringList")
+        .def(py::init<>())
+        .def("pop_back", &StringList::pop_back)
+        /* There are multiple versions of push_back(), etc. Select the right ones. */
+        .def("push_back", (void (StringList::*)(const std::string &)) &StringList::push_back)
+        .def("back", (std::string &(StringList::*)()) &StringList::back)
+        .def("__len__", [](const StringList &v) { return v.size(); })
+        .def("__iter__", [](StringList &v) {
+           return py::make_iterator(v.begin(), v.end());
+        }, py::keep_alive<0, 1>());
+
+    class ClassWithSTLVecProperty {
+    public:
+        StringList stringList;
+    };
+    py::class_<ClassWithSTLVecProperty>(m, "ClassWithSTLVecProperty")
+        .def(py::init<>())
+        .def_readwrite("stringList", &ClassWithSTLVecProperty::stringList);
+
+    m.def("print_opaque_list", [](const StringList &l) {
+        std::string ret = "Opaque list: [";
+        bool first = true;
+        for (auto entry : l) {
+            if (!first)
+                ret += ", ";
+            ret += entry;
+            first = false;
+        }
+        return ret + "]";
+    });
+
+    // test_pointers
+    m.def("return_void_ptr", []() { return (void *) 0x1234; });
+    m.def("get_void_ptr_value", [](void *ptr) { return reinterpret_cast<std::intptr_t>(ptr); });
+    m.def("return_null_str", []() { return (char *) nullptr; });
+    m.def("get_null_str_value", [](char *ptr) { return reinterpret_cast<std::intptr_t>(ptr); });
+
+    m.def("return_unique_ptr", []() -> std::unique_ptr<StringList> {
+        StringList *result = new StringList();
+        result->push_back("some value");
+        return std::unique_ptr<StringList>(result);
+    });
+}
diff --git a/pybind11/tests/test_opaque_types.py b/pybind11/tests/test_opaque_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f2392775d83a833457d95520648ee7e1f2aa6d5
--- /dev/null
+++ b/pybind11/tests/test_opaque_types.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import opaque_types as m
+from pybind11_tests import ConstructorStats, UserType
+
+
+def test_string_list():
+    lst = m.StringList()
+    lst.push_back("Element 1")
+    lst.push_back("Element 2")
+    assert m.print_opaque_list(lst) == "Opaque list: [Element 1, Element 2]"
+    assert lst.back() == "Element 2"
+
+    for i, k in enumerate(lst, start=1):
+        assert k == "Element {}".format(i)
+    lst.pop_back()
+    assert m.print_opaque_list(lst) == "Opaque list: [Element 1]"
+
+    cvp = m.ClassWithSTLVecProperty()
+    assert m.print_opaque_list(cvp.stringList) == "Opaque list: []"
+
+    cvp.stringList = lst
+    cvp.stringList.push_back("Element 3")
+    assert m.print_opaque_list(cvp.stringList) == "Opaque list: [Element 1, Element 3]"
+
+
+def test_pointers(msg):
+    living_before = ConstructorStats.get(UserType).alive()
+    assert m.get_void_ptr_value(m.return_void_ptr()) == 0x1234
+    assert m.get_void_ptr_value(UserType())  # Should also work for other C++ types
+    assert ConstructorStats.get(UserType).alive() == living_before
+
+    with pytest.raises(TypeError) as excinfo:
+        m.get_void_ptr_value([1, 2, 3])  # This should not work
+    assert msg(excinfo.value) == """
+        get_void_ptr_value(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: capsule) -> int
+
+        Invoked with: [1, 2, 3]
+    """  # noqa: E501 line too long
+
+    assert m.return_null_str() is None
+    assert m.get_null_str_value(m.return_null_str()) is not None
+
+    ptr = m.return_unique_ptr()
+    assert "StringList" in repr(ptr)
+    assert m.print_opaque_list(ptr) == "Opaque list: [some value]"
diff --git a/pybind11/tests/test_operator_overloading.cpp b/pybind11/tests/test_operator_overloading.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3c2eaafa9918baf38483725cd52c48aa6ecb8af
--- /dev/null
+++ b/pybind11/tests/test_operator_overloading.cpp
@@ -0,0 +1,226 @@
+/*
+    tests/test_operator_overloading.cpp -- operator overloading
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/operators.h>
+#include <functional>
+
+class Vector2 {
+public:
+    Vector2(float x, float y) : x(x), y(y) { print_created(this, toString()); }
+    Vector2(const Vector2 &v) : x(v.x), y(v.y) { print_copy_created(this); }
+    Vector2(Vector2 &&v) : x(v.x), y(v.y) { print_move_created(this); v.x = v.y = 0; }
+    Vector2 &operator=(const Vector2 &v) { x = v.x; y = v.y; print_copy_assigned(this); return *this; }
+    Vector2 &operator=(Vector2 &&v) { x = v.x; y = v.y; v.x = v.y = 0; print_move_assigned(this); return *this; }
+    ~Vector2() { print_destroyed(this); }
+
+    std::string toString() const { return "[" + std::to_string(x) + ", " + std::to_string(y) + "]"; }
+
+    Vector2 operator-() const { return Vector2(-x, -y); }
+    Vector2 operator+(const Vector2 &v) const { return Vector2(x + v.x, y + v.y); }
+    Vector2 operator-(const Vector2 &v) const { return Vector2(x - v.x, y - v.y); }
+    Vector2 operator-(float value) const { return Vector2(x - value, y - value); }
+    Vector2 operator+(float value) const { return Vector2(x + value, y + value); }
+    Vector2 operator*(float value) const { return Vector2(x * value, y * value); }
+    Vector2 operator/(float value) const { return Vector2(x / value, y / value); }
+    Vector2 operator*(const Vector2 &v) const { return Vector2(x * v.x, y * v.y); }
+    Vector2 operator/(const Vector2 &v) const { return Vector2(x / v.x, y / v.y); }
+    Vector2& operator+=(const Vector2 &v) { x += v.x; y += v.y; return *this; }
+    Vector2& operator-=(const Vector2 &v) { x -= v.x; y -= v.y; return *this; }
+    Vector2& operator*=(float v) { x *= v; y *= v; return *this; }
+    Vector2& operator/=(float v) { x /= v; y /= v; return *this; }
+    Vector2& operator*=(const Vector2 &v) { x *= v.x; y *= v.y; return *this; }
+    Vector2& operator/=(const Vector2 &v) { x /= v.x; y /= v.y; return *this; }
+
+    friend Vector2 operator+(float f, const Vector2 &v) { return Vector2(f + v.x, f + v.y); }
+    friend Vector2 operator-(float f, const Vector2 &v) { return Vector2(f - v.x, f - v.y); }
+    friend Vector2 operator*(float f, const Vector2 &v) { return Vector2(f * v.x, f * v.y); }
+    friend Vector2 operator/(float f, const Vector2 &v) { return Vector2(f / v.x, f / v.y); }
+
+    bool operator==(const Vector2 &v) const {
+        return x == v.x && y == v.y;
+    }
+    bool operator!=(const Vector2 &v) const {
+        return x != v.x || y != v.y;
+    }
+private:
+    float x, y;
+};
+
+class C1 { };
+class C2 { };
+
+int operator+(const C1 &, const C1 &) { return 11; }
+int operator+(const C2 &, const C2 &) { return 22; }
+int operator+(const C2 &, const C1 &) { return 21; }
+int operator+(const C1 &, const C2 &) { return 12; }
+
+// Note: Specializing explicit within `namespace std { ... }` is done due to a
+// bug in GCC<7. If you are supporting compilers later than this, consider
+// specializing `using template<> struct std::hash<...>` in the global
+// namespace instead, per this recommendation:
+// https://en.cppreference.com/w/cpp/language/extending_std#Adding_template_specializations
+namespace std {
+    template<>
+    struct hash<Vector2> {
+        // Not a good hash function, but easy to test
+        size_t operator()(const Vector2 &) { return 4; }
+    };
+}
+
+// Not a good abs function, but easy to test.
+std::string abs(const Vector2&) {
+    return "abs(Vector2)";
+}
+
+// MSVC warns about unknown pragmas, and warnings are errors.
+#ifndef _MSC_VER
+  #pragma GCC diagnostic push
+  // clang 7.0.0 and Apple LLVM 10.0.1 introduce `-Wself-assign-overloaded` to
+  // `-Wall`, which is used here for overloading (e.g. `py::self += py::self `).
+  // Here, we suppress the warning using `#pragma diagnostic`.
+  // Taken from: https://github.com/RobotLocomotion/drake/commit/aaf84b46
+  // TODO(eric): This could be resolved using a function / functor (e.g. `py::self()`).
+  #if (__APPLE__) && (__clang__)
+    #if (__clang_major__ >= 10) && (__clang_minor__ >= 0) && (__clang_patchlevel__ >= 1)
+      #pragma GCC diagnostic ignored "-Wself-assign-overloaded"
+    #endif
+  #elif (__clang__)
+    #if (__clang_major__ >= 7)
+      #pragma GCC diagnostic ignored "-Wself-assign-overloaded"
+    #endif
+  #endif
+#endif
+
+TEST_SUBMODULE(operators, m) {
+
+    // test_operator_overloading
+    py::class_<Vector2>(m, "Vector2")
+        .def(py::init<float, float>())
+        .def(py::self + py::self)
+        .def(py::self + float())
+        .def(py::self - py::self)
+        .def(py::self - float())
+        .def(py::self * float())
+        .def(py::self / float())
+        .def(py::self * py::self)
+        .def(py::self / py::self)
+        .def(py::self += py::self)
+        .def(py::self -= py::self)
+        .def(py::self *= float())
+        .def(py::self /= float())
+        .def(py::self *= py::self)
+        .def(py::self /= py::self)
+        .def(float() + py::self)
+        .def(float() - py::self)
+        .def(float() * py::self)
+        .def(float() / py::self)
+        .def(-py::self)
+        .def("__str__", &Vector2::toString)
+        .def("__repr__", &Vector2::toString)
+        .def(py::self == py::self)
+        .def(py::self != py::self)
+        .def(py::hash(py::self))
+        // N.B. See warning about usage of `py::detail::abs(py::self)` in
+        // `operators.h`.
+        .def("__abs__", [](const Vector2& v) { return abs(v); })
+        ;
+
+    m.attr("Vector") = m.attr("Vector2");
+
+    // test_operators_notimplemented
+    // #393: need to return NotSupported to ensure correct arithmetic operator behavior
+    py::class_<C1>(m, "C1")
+        .def(py::init<>())
+        .def(py::self + py::self);
+
+    py::class_<C2>(m, "C2")
+        .def(py::init<>())
+        .def(py::self + py::self)
+        .def("__add__", [](const C2& c2, const C1& c1) { return c2 + c1; })
+        .def("__radd__", [](const C2& c2, const C1& c1) { return c1 + c2; });
+
+    // test_nested
+    // #328: first member in a class can't be used in operators
+    struct NestABase { int value = -2; };
+    py::class_<NestABase>(m, "NestABase")
+        .def(py::init<>())
+        .def_readwrite("value", &NestABase::value);
+
+    struct NestA : NestABase {
+        int value = 3;
+        NestA& operator+=(int i) { value += i; return *this; }
+    };
+    py::class_<NestA>(m, "NestA")
+        .def(py::init<>())
+        .def(py::self += int())
+        .def("as_base", [](NestA &a) -> NestABase& {
+            return (NestABase&) a;
+        }, py::return_value_policy::reference_internal);
+    m.def("get_NestA", [](const NestA &a) { return a.value; });
+
+    struct NestB {
+        NestA a;
+        int value = 4;
+        NestB& operator-=(int i) { value -= i; return *this; }
+    };
+    py::class_<NestB>(m, "NestB")
+        .def(py::init<>())
+        .def(py::self -= int())
+        .def_readwrite("a", &NestB::a);
+    m.def("get_NestB", [](const NestB &b) { return b.value; });
+
+    struct NestC {
+        NestB b;
+        int value = 5;
+        NestC& operator*=(int i) { value *= i; return *this; }
+    };
+    py::class_<NestC>(m, "NestC")
+        .def(py::init<>())
+        .def(py::self *= int())
+        .def_readwrite("b", &NestC::b);
+    m.def("get_NestC", [](const NestC &c) { return c.value; });
+
+
+    // test_overriding_eq_reset_hash
+    // #2191 Overriding __eq__ should set __hash__ to None
+    struct Comparable {
+        int value;
+        bool operator==(const Comparable& rhs) const {return value == rhs.value;}
+    };
+
+    struct Hashable : Comparable {
+        explicit Hashable(int value): Comparable{value}{};
+        size_t hash() const { return static_cast<size_t>(value); }
+    };
+
+    struct Hashable2 : Hashable {
+        using Hashable::Hashable;
+    };
+
+    py::class_<Comparable>(m, "Comparable")
+        .def(py::init<int>())
+        .def(py::self == py::self);
+
+    py::class_<Hashable>(m, "Hashable")
+        .def(py::init<int>())
+        .def(py::self == py::self)
+        .def("__hash__", &Hashable::hash);
+
+    // define __hash__ before __eq__
+    py::class_<Hashable2>(m, "Hashable2")
+        .def("__hash__", &Hashable::hash)
+        .def(py::init<int>())
+        .def(py::self == py::self);
+}
+
+#ifndef _MSC_VER
+  #pragma GCC diagnostic pop
+#endif
diff --git a/pybind11/tests/test_operator_overloading.py b/pybind11/tests/test_operator_overloading.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e3aee271c6f94ab0d54207a02e1962fdc20a24
--- /dev/null
+++ b/pybind11/tests/test_operator_overloading.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import operators as m
+from pybind11_tests import ConstructorStats
+
+
+def test_operator_overloading():
+    v1 = m.Vector2(1, 2)
+    v2 = m.Vector(3, -1)
+    v3 = m.Vector2(1, 2)  # Same value as v1, but different instance.
+    assert v1 is not v3
+
+    assert str(v1) == "[1.000000, 2.000000]"
+    assert str(v2) == "[3.000000, -1.000000]"
+
+    assert str(-v2) == "[-3.000000, 1.000000]"
+
+    assert str(v1 + v2) == "[4.000000, 1.000000]"
+    assert str(v1 - v2) == "[-2.000000, 3.000000]"
+    assert str(v1 - 8) == "[-7.000000, -6.000000]"
+    assert str(v1 + 8) == "[9.000000, 10.000000]"
+    assert str(v1 * 8) == "[8.000000, 16.000000]"
+    assert str(v1 / 8) == "[0.125000, 0.250000]"
+    assert str(8 - v1) == "[7.000000, 6.000000]"
+    assert str(8 + v1) == "[9.000000, 10.000000]"
+    assert str(8 * v1) == "[8.000000, 16.000000]"
+    assert str(8 / v1) == "[8.000000, 4.000000]"
+    assert str(v1 * v2) == "[3.000000, -2.000000]"
+    assert str(v2 / v1) == "[3.000000, -0.500000]"
+
+    assert v1 == v3
+    assert v1 != v2
+    assert hash(v1) == 4
+    # TODO(eric.cousineau): Make this work.
+    # assert abs(v1) == "abs(Vector2)"
+
+    v1 += 2 * v2
+    assert str(v1) == "[7.000000, 0.000000]"
+    v1 -= v2
+    assert str(v1) == "[4.000000, 1.000000]"
+    v1 *= 2
+    assert str(v1) == "[8.000000, 2.000000]"
+    v1 /= 16
+    assert str(v1) == "[0.500000, 0.125000]"
+    v1 *= v2
+    assert str(v1) == "[1.500000, -0.125000]"
+    v2 /= v1
+    assert str(v2) == "[2.000000, 8.000000]"
+
+    cstats = ConstructorStats.get(m.Vector2)
+    assert cstats.alive() == 3
+    del v1
+    assert cstats.alive() == 2
+    del v2
+    assert cstats.alive() == 1
+    del v3
+    assert cstats.alive() == 0
+    assert cstats.values() == [
+        '[1.000000, 2.000000]',
+        '[3.000000, -1.000000]',
+        '[1.000000, 2.000000]',
+        '[-3.000000, 1.000000]',
+        '[4.000000, 1.000000]',
+        '[-2.000000, 3.000000]',
+        '[-7.000000, -6.000000]',
+        '[9.000000, 10.000000]',
+        '[8.000000, 16.000000]',
+        '[0.125000, 0.250000]',
+        '[7.000000, 6.000000]',
+        '[9.000000, 10.000000]',
+        '[8.000000, 16.000000]',
+        '[8.000000, 4.000000]',
+        '[3.000000, -2.000000]',
+        '[3.000000, -0.500000]',
+        '[6.000000, -2.000000]',
+    ]
+    assert cstats.default_constructions == 0
+    assert cstats.copy_constructions == 0
+    assert cstats.move_constructions >= 10
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+
+def test_operators_notimplemented():
+    """#393: need to return NotSupported to ensure correct arithmetic operator behavior"""
+
+    c1, c2 = m.C1(), m.C2()
+    assert c1 + c1 == 11
+    assert c2 + c2 == 22
+    assert c2 + c1 == 21
+    assert c1 + c2 == 12
+
+
+def test_nested():
+    """#328: first member in a class can't be used in operators"""
+
+    a = m.NestA()
+    b = m.NestB()
+    c = m.NestC()
+
+    a += 10
+    assert m.get_NestA(a) == 13
+    b.a += 100
+    assert m.get_NestA(b.a) == 103
+    c.b.a += 1000
+    assert m.get_NestA(c.b.a) == 1003
+    b -= 1
+    assert m.get_NestB(b) == 3
+    c.b -= 3
+    assert m.get_NestB(c.b) == 1
+    c *= 7
+    assert m.get_NestC(c) == 35
+
+    abase = a.as_base()
+    assert abase.value == -2
+    a.as_base().value += 44
+    assert abase.value == 42
+    assert c.b.a.as_base().value == -2
+    c.b.a.as_base().value += 44
+    assert c.b.a.as_base().value == 42
+
+    del c
+    pytest.gc_collect()
+    del a  # Shouldn't delete while abase is still alive
+    pytest.gc_collect()
+
+    assert abase.value == 42
+    del abase, b
+    pytest.gc_collect()
+
+
+def test_overriding_eq_reset_hash():
+
+    assert m.Comparable(15) is not m.Comparable(15)
+    assert m.Comparable(15) == m.Comparable(15)
+
+    with pytest.raises(TypeError):
+        hash(m.Comparable(15))  # TypeError: unhashable type: 'm.Comparable'
+
+    for hashable in (m.Hashable, m.Hashable2):
+        assert hashable(15) is not hashable(15)
+        assert hashable(15) == hashable(15)
+
+        assert hash(hashable(15)) == 15
+        assert hash(hashable(15)) == hash(hashable(15))
diff --git a/pybind11/tests/test_pickling.cpp b/pybind11/tests/test_pickling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9dc63bda3b5949032fbcd30e7aa4e7db2072dcff
--- /dev/null
+++ b/pybind11/tests/test_pickling.cpp
@@ -0,0 +1,130 @@
+/*
+    tests/test_pickling.cpp -- pickle support
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(pickling, m) {
+    // test_roundtrip
+    class Pickleable {
+    public:
+        Pickleable(const std::string &value) : m_value(value) { }
+        const std::string &value() const { return m_value; }
+
+        void setExtra1(int extra1) { m_extra1 = extra1; }
+        void setExtra2(int extra2) { m_extra2 = extra2; }
+        int extra1() const { return m_extra1; }
+        int extra2() const { return m_extra2; }
+    private:
+        std::string m_value;
+        int m_extra1 = 0;
+        int m_extra2 = 0;
+    };
+
+    class PickleableNew : public Pickleable {
+    public:
+        using Pickleable::Pickleable;
+    };
+
+    py::class_<Pickleable>(m, "Pickleable")
+        .def(py::init<std::string>())
+        .def("value", &Pickleable::value)
+        .def("extra1", &Pickleable::extra1)
+        .def("extra2", &Pickleable::extra2)
+        .def("setExtra1", &Pickleable::setExtra1)
+        .def("setExtra2", &Pickleable::setExtra2)
+        // For details on the methods below, refer to
+        // http://docs.python.org/3/library/pickle.html#pickling-class-instances
+        .def("__getstate__", [](const Pickleable &p) {
+            /* Return a tuple that fully encodes the state of the object */
+            return py::make_tuple(p.value(), p.extra1(), p.extra2());
+        })
+        .def("__setstate__", [](Pickleable &p, py::tuple t) {
+            if (t.size() != 3)
+                throw std::runtime_error("Invalid state!");
+            /* Invoke the constructor (need to use in-place version) */
+            new (&p) Pickleable(t[0].cast<std::string>());
+
+            /* Assign any additional state */
+            p.setExtra1(t[1].cast<int>());
+            p.setExtra2(t[2].cast<int>());
+        });
+
+    py::class_<PickleableNew, Pickleable>(m, "PickleableNew")
+        .def(py::init<std::string>())
+        .def(py::pickle(
+            [](const PickleableNew &p) {
+                return py::make_tuple(p.value(), p.extra1(), p.extra2());
+            },
+            [](py::tuple t) {
+                if (t.size() != 3)
+                    throw std::runtime_error("Invalid state!");
+                auto p = PickleableNew(t[0].cast<std::string>());
+
+                p.setExtra1(t[1].cast<int>());
+                p.setExtra2(t[2].cast<int>());
+                return p;
+            }
+        ));
+
+#if !defined(PYPY_VERSION)
+    // test_roundtrip_with_dict
+    class PickleableWithDict {
+    public:
+        PickleableWithDict(const std::string &value) : value(value) { }
+
+        std::string value;
+        int extra;
+    };
+
+    class PickleableWithDictNew : public PickleableWithDict {
+    public:
+        using PickleableWithDict::PickleableWithDict;
+    };
+
+    py::class_<PickleableWithDict>(m, "PickleableWithDict", py::dynamic_attr())
+        .def(py::init<std::string>())
+        .def_readwrite("value", &PickleableWithDict::value)
+        .def_readwrite("extra", &PickleableWithDict::extra)
+        .def("__getstate__", [](py::object self) {
+            /* Also include __dict__ in state */
+            return py::make_tuple(self.attr("value"), self.attr("extra"), self.attr("__dict__"));
+        })
+        .def("__setstate__", [](py::object self, py::tuple t) {
+            if (t.size() != 3)
+                throw std::runtime_error("Invalid state!");
+            /* Cast and construct */
+            auto& p = self.cast<PickleableWithDict&>();
+            new (&p) PickleableWithDict(t[0].cast<std::string>());
+
+            /* Assign C++ state */
+            p.extra = t[1].cast<int>();
+
+            /* Assign Python state */
+            self.attr("__dict__") = t[2];
+        });
+
+    py::class_<PickleableWithDictNew, PickleableWithDict>(m, "PickleableWithDictNew")
+        .def(py::init<std::string>())
+        .def(py::pickle(
+            [](py::object self) {
+                return py::make_tuple(self.attr("value"), self.attr("extra"), self.attr("__dict__"));
+            },
+            [](const py::tuple &t) {
+                if (t.size() != 3)
+                    throw std::runtime_error("Invalid state!");
+
+                auto cpp_state = PickleableWithDictNew(t[0].cast<std::string>());
+                cpp_state.extra = t[1].cast<int>();
+
+                auto py_state = t[2].cast<py::dict>();
+                return std::make_pair(cpp_state, py_state);
+            }
+        ));
+#endif
+}
diff --git a/pybind11/tests/test_pickling.py b/pybind11/tests/test_pickling.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aee70505de7acc21ee09623417d35812ae11463
--- /dev/null
+++ b/pybind11/tests/test_pickling.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import pickling as m
+
+try:
+    import cPickle as pickle  # Use cPickle on Python 2.7
+except ImportError:
+    import pickle
+
+
+@pytest.mark.parametrize("cls_name", ["Pickleable", "PickleableNew"])
+def test_roundtrip(cls_name):
+    cls = getattr(m, cls_name)
+    p = cls("test_value")
+    p.setExtra1(15)
+    p.setExtra2(48)
+
+    data = pickle.dumps(p, 2)  # Must use pickle protocol >= 2
+    p2 = pickle.loads(data)
+    assert p2.value() == p.value()
+    assert p2.extra1() == p.extra1()
+    assert p2.extra2() == p.extra2()
+
+
+@pytest.mark.xfail("env.PYPY")
+@pytest.mark.parametrize("cls_name", ["PickleableWithDict", "PickleableWithDictNew"])
+def test_roundtrip_with_dict(cls_name):
+    cls = getattr(m, cls_name)
+    p = cls("test_value")
+    p.extra = 15
+    p.dynamic = "Attribute"
+
+    data = pickle.dumps(p, pickle.HIGHEST_PROTOCOL)
+    p2 = pickle.loads(data)
+    assert p2.value == p.value
+    assert p2.extra == p.extra
+    assert p2.dynamic == p.dynamic
+
+
+def test_enum_pickle():
+    from pybind11_tests import enums as e
+    data = pickle.dumps(e.EOne, 2)
+    assert e.EOne == pickle.loads(data)
diff --git a/pybind11/tests/test_pytypes.cpp b/pybind11/tests/test_pytypes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f8d56410f0add1f2e341e06ea9560ab9e88d643
--- /dev/null
+++ b/pybind11/tests/test_pytypes.cpp
@@ -0,0 +1,375 @@
+/*
+    tests/test_pytypes.cpp -- Python type casters
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+
+TEST_SUBMODULE(pytypes, m) {
+    // test_int
+    m.def("get_int", []{return py::int_(0);});
+    // test_iterator
+    m.def("get_iterator", []{return py::iterator();});
+    // test_iterable
+    m.def("get_iterable", []{return py::iterable();});
+    // test_list
+    m.def("get_list", []() {
+        py::list list;
+        list.append("value");
+        py::print("Entry at position 0:", list[0]);
+        list[0] = py::str("overwritten");
+        list.insert(0, "inserted-0");
+        list.insert(2, "inserted-2");
+        return list;
+    });
+    m.def("print_list", [](py::list list) {
+        int index = 0;
+        for (auto item : list)
+            py::print("list item {}: {}"_s.format(index++, item));
+    });
+    // test_none
+    m.def("get_none", []{return py::none();});
+    m.def("print_none", [](py::none none) {
+        py::print("none: {}"_s.format(none));
+    });
+
+    // test_set
+    m.def("get_set", []() {
+        py::set set;
+        set.add(py::str("key1"));
+        set.add("key2");
+        set.add(std::string("key3"));
+        return set;
+    });
+    m.def("print_set", [](py::set set) {
+        for (auto item : set)
+            py::print("key:", item);
+    });
+    m.def("set_contains", [](py::set set, py::object key) {
+        return set.contains(key);
+    });
+    m.def("set_contains", [](py::set set, const char* key) {
+        return set.contains(key);
+    });
+
+    // test_dict
+    m.def("get_dict", []() { return py::dict("key"_a="value"); });
+    m.def("print_dict", [](py::dict dict) {
+        for (auto item : dict)
+            py::print("key: {}, value={}"_s.format(item.first, item.second));
+    });
+    m.def("dict_keyword_constructor", []() {
+        auto d1 = py::dict("x"_a=1, "y"_a=2);
+        auto d2 = py::dict("z"_a=3, **d1);
+        return d2;
+    });
+    m.def("dict_contains", [](py::dict dict, py::object val) {
+        return dict.contains(val);
+    });
+    m.def("dict_contains", [](py::dict dict, const char* val) {
+        return dict.contains(val);
+    });
+
+    // test_str
+    m.def("str_from_string", []() { return py::str(std::string("baz")); });
+    m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); });
+    m.def("str_from_object", [](const py::object& obj) { return py::str(obj); });
+    m.def("repr_from_object", [](const py::object& obj) { return py::repr(obj); });
+
+    m.def("str_format", []() {
+        auto s1 = "{} + {} = {}"_s.format(1, 2, 3);
+        auto s2 = "{a} + {b} = {c}"_s.format("a"_a=1, "b"_a=2, "c"_a=3);
+        return py::make_tuple(s1, s2);
+    });
+
+    // test_bytes
+    m.def("bytes_from_string", []() { return py::bytes(std::string("foo")); });
+    m.def("bytes_from_str", []() { return py::bytes(py::str("bar", 3)); });
+
+    // test_capsule
+    m.def("return_capsule_with_destructor", []() {
+        py::print("creating capsule");
+        return py::capsule([]() {
+            py::print("destructing capsule");
+        });
+    });
+
+    m.def("return_capsule_with_destructor_2", []() {
+        py::print("creating capsule");
+        return py::capsule((void *) 1234, [](void *ptr) {
+            py::print("destructing capsule: {}"_s.format((size_t) ptr));
+        });
+    });
+
+    m.def("return_capsule_with_name_and_destructor", []() {
+        auto capsule = py::capsule((void *) 1234, "pointer type description", [](PyObject *ptr) {
+            if (ptr) {
+                auto name = PyCapsule_GetName(ptr);
+                py::print("destructing capsule ({}, '{}')"_s.format(
+                    (size_t) PyCapsule_GetPointer(ptr, name), name
+                ));
+            }
+        });
+        void *contents = capsule;
+        py::print("created capsule ({}, '{}')"_s.format((size_t) contents, capsule.name()));
+        return capsule;
+    });
+
+    // test_accessors
+    m.def("accessor_api", [](py::object o) {
+        auto d = py::dict();
+
+        d["basic_attr"] = o.attr("basic_attr");
+
+        auto l = py::list();
+        for (const auto &item : o.attr("begin_end")) {
+            l.append(item);
+        }
+        d["begin_end"] = l;
+
+        d["operator[object]"] = o.attr("d")["operator[object]"_s];
+        d["operator[char *]"] = o.attr("d")["operator[char *]"];
+
+        d["attr(object)"] = o.attr("sub").attr("attr_obj");
+        d["attr(char *)"] = o.attr("sub").attr("attr_char");
+        try {
+            o.attr("sub").attr("missing").ptr();
+        } catch (const py::error_already_set &) {
+            d["missing_attr_ptr"] = "raised"_s;
+        }
+        try {
+            o.attr("missing").attr("doesn't matter");
+        } catch (const py::error_already_set &) {
+            d["missing_attr_chain"] = "raised"_s;
+        }
+
+        d["is_none"] = o.attr("basic_attr").is_none();
+
+        d["operator()"] = o.attr("func")(1);
+        d["operator*"] = o.attr("func")(*o.attr("begin_end"));
+
+        // Test implicit conversion
+        py::list implicit_list = o.attr("begin_end");
+        d["implicit_list"] = implicit_list;
+        py::dict implicit_dict = o.attr("__dict__");
+        d["implicit_dict"] = implicit_dict;
+
+        return d;
+    });
+
+    m.def("tuple_accessor", [](py::tuple existing_t) {
+        try {
+            existing_t[0] = 1;
+        } catch (const py::error_already_set &) {
+            // --> Python system error
+            // Only new tuples (refcount == 1) are mutable
+            auto new_t = py::tuple(3);
+            for (size_t i = 0; i < new_t.size(); ++i) {
+                new_t[i] = i;
+            }
+            return new_t;
+        }
+        return py::tuple();
+    });
+
+    m.def("accessor_assignment", []() {
+        auto l = py::list(1);
+        l[0] = 0;
+
+        auto d = py::dict();
+        d["get"] = l[0];
+        auto var = l[0];
+        d["deferred_get"] = var;
+        l[0] = 1;
+        d["set"] = l[0];
+        var = 99; // this assignment should not overwrite l[0]
+        d["deferred_set"] = l[0];
+        d["var"] = var;
+
+        return d;
+    });
+
+    // test_constructors
+    m.def("default_constructors", []() {
+        return py::dict(
+            "bytes"_a=py::bytes(),
+            "str"_a=py::str(),
+            "bool"_a=py::bool_(),
+            "int"_a=py::int_(),
+            "float"_a=py::float_(),
+            "tuple"_a=py::tuple(),
+            "list"_a=py::list(),
+            "dict"_a=py::dict(),
+            "set"_a=py::set()
+        );
+    });
+
+    m.def("converting_constructors", [](py::dict d) {
+        return py::dict(
+            "bytes"_a=py::bytes(d["bytes"]),
+            "str"_a=py::str(d["str"]),
+            "bool"_a=py::bool_(d["bool"]),
+            "int"_a=py::int_(d["int"]),
+            "float"_a=py::float_(d["float"]),
+            "tuple"_a=py::tuple(d["tuple"]),
+            "list"_a=py::list(d["list"]),
+            "dict"_a=py::dict(d["dict"]),
+            "set"_a=py::set(d["set"]),
+            "memoryview"_a=py::memoryview(d["memoryview"])
+        );
+    });
+
+    m.def("cast_functions", [](py::dict d) {
+        // When converting between Python types, obj.cast<T>() should be the same as T(obj)
+        return py::dict(
+            "bytes"_a=d["bytes"].cast<py::bytes>(),
+            "str"_a=d["str"].cast<py::str>(),
+            "bool"_a=d["bool"].cast<py::bool_>(),
+            "int"_a=d["int"].cast<py::int_>(),
+            "float"_a=d["float"].cast<py::float_>(),
+            "tuple"_a=d["tuple"].cast<py::tuple>(),
+            "list"_a=d["list"].cast<py::list>(),
+            "dict"_a=d["dict"].cast<py::dict>(),
+            "set"_a=d["set"].cast<py::set>(),
+            "memoryview"_a=d["memoryview"].cast<py::memoryview>()
+        );
+    });
+
+    m.def("convert_to_pybind11_str", [](py::object o) { return py::str(o); });
+
+    m.def("get_implicit_casting", []() {
+        py::dict d;
+        d["char*_i1"] = "abc";
+        const char *c2 = "abc";
+        d["char*_i2"] = c2;
+        d["char*_e"] = py::cast(c2);
+        d["char*_p"] = py::str(c2);
+
+        d["int_i1"] = 42;
+        int i = 42;
+        d["int_i2"] = i;
+        i++;
+        d["int_e"] = py::cast(i);
+        i++;
+        d["int_p"] = py::int_(i);
+
+        d["str_i1"] = std::string("str");
+        std::string s2("str1");
+        d["str_i2"] = s2;
+        s2[3] = '2';
+        d["str_e"] = py::cast(s2);
+        s2[3] = '3';
+        d["str_p"] = py::str(s2);
+
+        py::list l(2);
+        l[0] = 3;
+        l[1] = py::cast(6);
+        l.append(9);
+        l.append(py::cast(12));
+        l.append(py::int_(15));
+
+        return py::dict(
+            "d"_a=d,
+            "l"_a=l
+        );
+    });
+
+    // test_print
+    m.def("print_function", []() {
+        py::print("Hello, World!");
+        py::print(1, 2.0, "three", true, std::string("-- multiple args"));
+        auto args = py::make_tuple("and", "a", "custom", "separator");
+        py::print("*args", *args, "sep"_a="-");
+        py::print("no new line here", "end"_a=" -- ");
+        py::print("next print");
+
+        auto py_stderr = py::module::import("sys").attr("stderr");
+        py::print("this goes to stderr", "file"_a=py_stderr);
+
+        py::print("flush", "flush"_a=true);
+
+        py::print("{a} + {b} = {c}"_s.format("a"_a="py::print", "b"_a="str.format", "c"_a="this"));
+    });
+
+    m.def("print_failure", []() { py::print(42, UnregisteredType()); });
+
+    m.def("hash_function", [](py::object obj) { return py::hash(obj); });
+
+    m.def("test_number_protocol", [](py::object a, py::object b) {
+        py::list l;
+        l.append(a.equal(b));
+        l.append(a.not_equal(b));
+        l.append(a < b);
+        l.append(a <= b);
+        l.append(a > b);
+        l.append(a >= b);
+        l.append(a + b);
+        l.append(a - b);
+        l.append(a * b);
+        l.append(a / b);
+        l.append(a | b);
+        l.append(a & b);
+        l.append(a ^ b);
+        l.append(a >> b);
+        l.append(a << b);
+        return l;
+    });
+
+    m.def("test_list_slicing", [](py::list a) {
+        return a[py::slice(0, -1, 2)];
+    });
+
+    m.def("test_memoryview_object", [](py::buffer b) {
+        return py::memoryview(b);
+    });
+
+    m.def("test_memoryview_buffer_info", [](py::buffer b) {
+        return py::memoryview(b.request());
+    });
+
+    m.def("test_memoryview_from_buffer", [](bool is_unsigned) {
+        static const int16_t si16[] = { 3, 1, 4, 1, 5 };
+        static const uint16_t ui16[] = { 2, 7, 1, 8 };
+        if (is_unsigned)
+            return py::memoryview::from_buffer(
+                ui16, { 4 }, { sizeof(uint16_t) });
+        else
+            return py::memoryview::from_buffer(
+                si16, { 5 }, { sizeof(int16_t) });
+    });
+
+    m.def("test_memoryview_from_buffer_nativeformat", []() {
+        static const char* format = "@i";
+        static const int32_t arr[] = { 4, 7, 5 };
+        return py::memoryview::from_buffer(
+            arr, sizeof(int32_t), format, { 3 }, { sizeof(int32_t) });
+    });
+
+    m.def("test_memoryview_from_buffer_empty_shape", []() {
+        static const char* buf = "";
+        return py::memoryview::from_buffer(buf, 1, "B", { }, { });
+    });
+
+    m.def("test_memoryview_from_buffer_invalid_strides", []() {
+        static const char* buf = "\x02\x03\x04";
+        return py::memoryview::from_buffer(buf, 1, "B", { 3 }, { });
+    });
+
+    m.def("test_memoryview_from_buffer_nullptr", []() {
+        return py::memoryview::from_buffer(
+            static_cast<void*>(nullptr), 1, "B", { }, { });
+    });
+
+#if PY_MAJOR_VERSION >= 3
+    m.def("test_memoryview_from_memory", []() {
+        const char* buf = "\xff\xe1\xab\x37";
+        return py::memoryview::from_memory(
+            buf, static_cast<ssize_t>(strlen(buf)));
+    });
+#endif
+}
diff --git a/pybind11/tests/test_pytypes.py b/pybind11/tests/test_pytypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..95cc94af8c89517bf4a993af43041414c46d4dd5
--- /dev/null
+++ b/pybind11/tests/test_pytypes.py
@@ -0,0 +1,392 @@
+# -*- coding: utf-8 -*-
+from __future__ import division
+import pytest
+import sys
+
+import env  # noqa: F401
+
+from pybind11_tests import pytypes as m
+from pybind11_tests import debug_enabled
+
+
+def test_int(doc):
+    assert doc(m.get_int) == "get_int() -> int"
+
+
+def test_iterator(doc):
+    assert doc(m.get_iterator) == "get_iterator() -> Iterator"
+
+
+def test_iterable(doc):
+    assert doc(m.get_iterable) == "get_iterable() -> Iterable"
+
+
+def test_list(capture, doc):
+    with capture:
+        lst = m.get_list()
+        assert lst == ["inserted-0", "overwritten", "inserted-2"]
+
+        lst.append("value2")
+        m.print_list(lst)
+    assert capture.unordered == """
+        Entry at position 0: value
+        list item 0: inserted-0
+        list item 1: overwritten
+        list item 2: inserted-2
+        list item 3: value2
+    """
+
+    assert doc(m.get_list) == "get_list() -> list"
+    assert doc(m.print_list) == "print_list(arg0: list) -> None"
+
+
+def test_none(capture, doc):
+    assert doc(m.get_none) == "get_none() -> None"
+    assert doc(m.print_none) == "print_none(arg0: None) -> None"
+
+
+def test_set(capture, doc):
+    s = m.get_set()
+    assert s == {"key1", "key2", "key3"}
+
+    with capture:
+        s.add("key4")
+        m.print_set(s)
+    assert capture.unordered == """
+        key: key1
+        key: key2
+        key: key3
+        key: key4
+    """
+
+    assert not m.set_contains(set([]), 42)
+    assert m.set_contains({42}, 42)
+    assert m.set_contains({"foo"}, "foo")
+
+    assert doc(m.get_list) == "get_list() -> list"
+    assert doc(m.print_list) == "print_list(arg0: list) -> None"
+
+
+def test_dict(capture, doc):
+    d = m.get_dict()
+    assert d == {"key": "value"}
+
+    with capture:
+        d["key2"] = "value2"
+        m.print_dict(d)
+    assert capture.unordered == """
+        key: key, value=value
+        key: key2, value=value2
+    """
+
+    assert not m.dict_contains({}, 42)
+    assert m.dict_contains({42: None}, 42)
+    assert m.dict_contains({"foo": None}, "foo")
+
+    assert doc(m.get_dict) == "get_dict() -> dict"
+    assert doc(m.print_dict) == "print_dict(arg0: dict) -> None"
+
+    assert m.dict_keyword_constructor() == {"x": 1, "y": 2, "z": 3}
+
+
+def test_str(doc):
+    assert m.str_from_string().encode().decode() == "baz"
+    assert m.str_from_bytes().encode().decode() == "boo"
+
+    assert doc(m.str_from_bytes) == "str_from_bytes() -> str"
+
+    class A(object):
+        def __str__(self):
+            return "this is a str"
+
+        def __repr__(self):
+            return "this is a repr"
+
+    assert m.str_from_object(A()) == "this is a str"
+    assert m.repr_from_object(A()) == "this is a repr"
+
+    s1, s2 = m.str_format()
+    assert s1 == "1 + 2 = 3"
+    assert s1 == s2
+
+
+def test_bytes(doc):
+    assert m.bytes_from_string().decode() == "foo"
+    assert m.bytes_from_str().decode() == "bar"
+
+    assert doc(m.bytes_from_str) == "bytes_from_str() -> {}".format(
+        "str" if env.PY2 else "bytes"
+    )
+
+
+def test_capsule(capture):
+    pytest.gc_collect()
+    with capture:
+        a = m.return_capsule_with_destructor()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        creating capsule
+        destructing capsule
+    """
+
+    with capture:
+        a = m.return_capsule_with_destructor_2()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        creating capsule
+        destructing capsule: 1234
+    """
+
+    with capture:
+        a = m.return_capsule_with_name_and_destructor()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        created capsule (1234, 'pointer type description')
+        destructing capsule (1234, 'pointer type description')
+    """
+
+
+def test_accessors():
+    class SubTestObject:
+        attr_obj = 1
+        attr_char = 2
+
+    class TestObject:
+        basic_attr = 1
+        begin_end = [1, 2, 3]
+        d = {"operator[object]": 1, "operator[char *]": 2}
+        sub = SubTestObject()
+
+        def func(self, x, *args):
+            return self.basic_attr + x + sum(args)
+
+    d = m.accessor_api(TestObject())
+    assert d["basic_attr"] == 1
+    assert d["begin_end"] == [1, 2, 3]
+    assert d["operator[object]"] == 1
+    assert d["operator[char *]"] == 2
+    assert d["attr(object)"] == 1
+    assert d["attr(char *)"] == 2
+    assert d["missing_attr_ptr"] == "raised"
+    assert d["missing_attr_chain"] == "raised"
+    assert d["is_none"] is False
+    assert d["operator()"] == 2
+    assert d["operator*"] == 7
+    assert d["implicit_list"] == [1, 2, 3]
+    assert all(x in TestObject.__dict__ for x in d["implicit_dict"])
+
+    assert m.tuple_accessor(tuple()) == (0, 1, 2)
+
+    d = m.accessor_assignment()
+    assert d["get"] == 0
+    assert d["deferred_get"] == 0
+    assert d["set"] == 1
+    assert d["deferred_set"] == 1
+    assert d["var"] == 99
+
+
+def test_constructors():
+    """C++ default and converting constructors are equivalent to type calls in Python"""
+    types = [bytes, str, bool, int, float, tuple, list, dict, set]
+    expected = {t.__name__: t() for t in types}
+    if env.PY2:
+        # Note that bytes.__name__ == 'str' in Python 2.
+        # pybind11::str is unicode even under Python 2.
+        expected["bytes"] = bytes()
+        expected["str"] = unicode()  # noqa: F821
+    assert m.default_constructors() == expected
+
+    data = {
+        bytes: b'41',  # Currently no supported or working conversions.
+        str: 42,
+        bool: "Not empty",
+        int: "42",
+        float: "+1e3",
+        tuple: range(3),
+        list: range(3),
+        dict: [("two", 2), ("one", 1), ("three", 3)],
+        set: [4, 4, 5, 6, 6, 6],
+        memoryview: b'abc'
+    }
+    inputs = {k.__name__: v for k, v in data.items()}
+    expected = {k.__name__: k(v) for k, v in data.items()}
+    if env.PY2:  # Similar to the above. See comments above.
+        inputs["bytes"] = b'41'
+        inputs["str"] = 42
+        expected["bytes"] = b'41'
+        expected["str"] = u"42"
+
+    assert m.converting_constructors(inputs) == expected
+    assert m.cast_functions(inputs) == expected
+
+    # Converting constructors and cast functions should just reference rather
+    # than copy when no conversion is needed:
+    noconv1 = m.converting_constructors(expected)
+    for k in noconv1:
+        assert noconv1[k] is expected[k]
+
+    noconv2 = m.cast_functions(expected)
+    for k in noconv2:
+        assert noconv2[k] is expected[k]
+
+
+def test_pybind11_str_raw_str():
+    # specifically to exercise pybind11::str::raw_str
+    cvt = m.convert_to_pybind11_str
+    assert cvt(u"Str") == u"Str"
+    assert cvt(b'Bytes') == u"Bytes" if env.PY2 else "b'Bytes'"
+    assert cvt(None) == u"None"
+    assert cvt(False) == u"False"
+    assert cvt(True) == u"True"
+    assert cvt(42) == u"42"
+    assert cvt(2**65) == u"36893488147419103232"
+    assert cvt(-1.50) == u"-1.5"
+    assert cvt(()) == u"()"
+    assert cvt((18,)) == u"(18,)"
+    assert cvt([]) == u"[]"
+    assert cvt([28]) == u"[28]"
+    assert cvt({}) == u"{}"
+    assert cvt({3: 4}) == u"{3: 4}"
+    assert cvt(set()) == u"set([])" if env.PY2 else "set()"
+    assert cvt({3, 3}) == u"set([3])" if env.PY2 else "{3}"
+
+    valid_orig = u"Ǳ"
+    valid_utf8 = valid_orig.encode("utf-8")
+    valid_cvt = cvt(valid_utf8)
+    assert type(valid_cvt) == bytes  # Probably surprising.
+    assert valid_cvt == b'\xc7\xb1'
+
+    malformed_utf8 = b'\x80'
+    malformed_cvt = cvt(malformed_utf8)
+    assert type(malformed_cvt) == bytes  # Probably surprising.
+    assert malformed_cvt == b'\x80'
+
+
+def test_implicit_casting():
+    """Tests implicit casting when assigning or appending to dicts and lists."""
+    z = m.get_implicit_casting()
+    assert z['d'] == {
+        'char*_i1': 'abc', 'char*_i2': 'abc', 'char*_e': 'abc', 'char*_p': 'abc',
+        'str_i1': 'str', 'str_i2': 'str1', 'str_e': 'str2', 'str_p': 'str3',
+        'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44
+    }
+    assert z['l'] == [3, 6, 9, 12, 15]
+
+
+def test_print(capture):
+    with capture:
+        m.print_function()
+    assert capture == """
+        Hello, World!
+        1 2.0 three True -- multiple args
+        *args-and-a-custom-separator
+        no new line here -- next print
+        flush
+        py::print + str.format = this
+    """
+    assert capture.stderr == "this goes to stderr"
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.print_failure()
+    assert str(excinfo.value) == "make_tuple(): unable to convert " + (
+        "argument of type 'UnregisteredType' to Python object"
+        if debug_enabled else
+        "arguments to Python object (compile in debug mode for details)"
+    )
+
+
+def test_hash():
+    class Hashable(object):
+        def __init__(self, value):
+            self.value = value
+
+        def __hash__(self):
+            return self.value
+
+    class Unhashable(object):
+        __hash__ = None
+
+    assert m.hash_function(Hashable(42)) == 42
+    with pytest.raises(TypeError):
+        m.hash_function(Unhashable())
+
+
+def test_number_protocol():
+    for a, b in [(1, 1), (3, 5)]:
+        li = [a == b, a != b, a < b, a <= b, a > b, a >= b, a + b,
+              a - b, a * b, a / b, a | b, a & b, a ^ b, a >> b, a << b]
+        assert m.test_number_protocol(a, b) == li
+
+
+def test_list_slicing():
+    li = list(range(100))
+    assert li[::2] == m.test_list_slicing(li)
+
+
+@pytest.mark.parametrize('method, args, fmt, expected_view', [
+    (m.test_memoryview_object, (b'red',), 'B', b'red'),
+    (m.test_memoryview_buffer_info, (b'green',), 'B', b'green'),
+    (m.test_memoryview_from_buffer, (False,), 'h', [3, 1, 4, 1, 5]),
+    (m.test_memoryview_from_buffer, (True,), 'H', [2, 7, 1, 8]),
+    (m.test_memoryview_from_buffer_nativeformat, (), '@i', [4, 7, 5]),
+])
+def test_memoryview(method, args, fmt, expected_view):
+    view = method(*args)
+    assert isinstance(view, memoryview)
+    assert view.format == fmt
+    if isinstance(expected_view, bytes) or not env.PY2:
+        view_as_list = list(view)
+    else:
+        # Using max to pick non-zero byte (big-endian vs little-endian).
+        view_as_list = [max([ord(c) for c in s]) for s in view]
+    assert view_as_list == list(expected_view)
+
+
+@pytest.mark.xfail("env.PYPY", reason="getrefcount is not available")
+@pytest.mark.parametrize('method', [
+    m.test_memoryview_object,
+    m.test_memoryview_buffer_info,
+])
+def test_memoryview_refcount(method):
+    buf = b'\x0a\x0b\x0c\x0d'
+    ref_before = sys.getrefcount(buf)
+    view = method(buf)
+    ref_after = sys.getrefcount(buf)
+    assert ref_before < ref_after
+    assert list(view) == list(buf)
+
+
+def test_memoryview_from_buffer_empty_shape():
+    view = m.test_memoryview_from_buffer_empty_shape()
+    assert isinstance(view, memoryview)
+    assert view.format == 'B'
+    if env.PY2:
+        # Python 2 behavior is weird, but Python 3 (the future) is fine.
+        # PyPy3 has <memoryview, while CPython 2 has <memory
+        assert bytes(view).startswith(b'<memory')
+    else:
+        assert bytes(view) == b''
+
+
+def test_test_memoryview_from_buffer_invalid_strides():
+    with pytest.raises(RuntimeError):
+        m.test_memoryview_from_buffer_invalid_strides()
+
+
+def test_test_memoryview_from_buffer_nullptr():
+    if env.PY2:
+        m.test_memoryview_from_buffer_nullptr()
+    else:
+        with pytest.raises(ValueError):
+            m.test_memoryview_from_buffer_nullptr()
+
+
+@pytest.mark.skipif("env.PY2")
+def test_memoryview_from_memory():
+    view = m.test_memoryview_from_memory()
+    assert isinstance(view, memoryview)
+    assert view.format == 'B'
+    assert bytes(view) == b'\xff\xe1\xab\x37'
diff --git a/pybind11/tests/test_sequences_and_iterators.cpp b/pybind11/tests/test_sequences_and_iterators.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ce0451092b2050b4b5ef762c6a5e755237844d8
--- /dev/null
+++ b/pybind11/tests/test_sequences_and_iterators.cpp
@@ -0,0 +1,358 @@
+/*
+    tests/test_sequences_and_iterators.cpp -- supporting Pythons' sequence protocol, iterators,
+    etc.
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/operators.h>
+#include <pybind11/stl.h>
+
+#include <algorithm>
+
+template<typename T>
+class NonZeroIterator {
+    const T* ptr_;
+public:
+    NonZeroIterator(const T* ptr) : ptr_(ptr) {}
+    const T& operator*() const { return *ptr_; }
+    NonZeroIterator& operator++() { ++ptr_; return *this; }
+};
+
+class NonZeroSentinel {};
+
+template<typename A, typename B>
+bool operator==(const NonZeroIterator<std::pair<A, B>>& it, const NonZeroSentinel&) {
+    return !(*it).first || !(*it).second;
+}
+
+template <typename PythonType>
+py::list test_random_access_iterator(PythonType x) {
+    if (x.size() < 5)
+        throw py::value_error("Please provide at least 5 elements for testing.");
+
+    auto checks = py::list();
+    auto assert_equal = [&checks](py::handle a, py::handle b) {
+        auto result = PyObject_RichCompareBool(a.ptr(), b.ptr(), Py_EQ);
+        if (result == -1) { throw py::error_already_set(); }
+        checks.append(result != 0);
+    };
+
+    auto it = x.begin();
+    assert_equal(x[0], *it);
+    assert_equal(x[0], it[0]);
+    assert_equal(x[1], it[1]);
+
+    assert_equal(x[1], *(++it));
+    assert_equal(x[1], *(it++));
+    assert_equal(x[2], *it);
+    assert_equal(x[3], *(it += 1));
+    assert_equal(x[2], *(--it));
+    assert_equal(x[2], *(it--));
+    assert_equal(x[1], *it);
+    assert_equal(x[0], *(it -= 1));
+
+    assert_equal(it->attr("real"), x[0].attr("real"));
+    assert_equal((it + 1)->attr("real"), x[1].attr("real"));
+
+    assert_equal(x[1], *(it + 1));
+    assert_equal(x[1], *(1 + it));
+    it += 3;
+    assert_equal(x[1], *(it - 2));
+
+    checks.append(static_cast<std::size_t>(x.end() - x.begin()) == x.size());
+    checks.append((x.begin() + static_cast<std::ptrdiff_t>(x.size())) == x.end());
+    checks.append(x.begin() < x.end());
+
+    return checks;
+}
+
+TEST_SUBMODULE(sequences_and_iterators, m) {
+    // test_sliceable
+    class Sliceable{
+    public:
+      Sliceable(int n): size(n) {}
+      int start,stop,step;
+      int size;
+    };
+    py::class_<Sliceable>(m,"Sliceable")
+        .def(py::init<int>())
+        .def("__getitem__",[](const Sliceable &s, py::slice slice) {
+          ssize_t start, stop, step, slicelength;
+          if (!slice.compute(s.size, &start, &stop, &step, &slicelength))
+              throw py::error_already_set();
+          int istart = static_cast<int>(start);
+          int istop =  static_cast<int>(stop);
+          int istep =  static_cast<int>(step);
+          return std::make_tuple(istart,istop,istep);
+        })
+        ;
+
+    // test_sequence
+    class Sequence {
+    public:
+        Sequence(size_t size) : m_size(size) {
+            print_created(this, "of size", m_size);
+            m_data = new float[size];
+            memset(m_data, 0, sizeof(float) * size);
+        }
+        Sequence(const std::vector<float> &value) : m_size(value.size()) {
+            print_created(this, "of size", m_size, "from std::vector");
+            m_data = new float[m_size];
+            memcpy(m_data, &value[0], sizeof(float) * m_size);
+        }
+        Sequence(const Sequence &s) : m_size(s.m_size) {
+            print_copy_created(this);
+            m_data = new float[m_size];
+            memcpy(m_data, s.m_data, sizeof(float)*m_size);
+        }
+        Sequence(Sequence &&s) : m_size(s.m_size), m_data(s.m_data) {
+            print_move_created(this);
+            s.m_size = 0;
+            s.m_data = nullptr;
+        }
+
+        ~Sequence() { print_destroyed(this); delete[] m_data; }
+
+        Sequence &operator=(const Sequence &s) {
+            if (&s != this) {
+                delete[] m_data;
+                m_size = s.m_size;
+                m_data = new float[m_size];
+                memcpy(m_data, s.m_data, sizeof(float)*m_size);
+            }
+            print_copy_assigned(this);
+            return *this;
+        }
+
+        Sequence &operator=(Sequence &&s) {
+            if (&s != this) {
+                delete[] m_data;
+                m_size = s.m_size;
+                m_data = s.m_data;
+                s.m_size = 0;
+                s.m_data = nullptr;
+            }
+            print_move_assigned(this);
+            return *this;
+        }
+
+        bool operator==(const Sequence &s) const {
+            if (m_size != s.size()) return false;
+            for (size_t i = 0; i < m_size; ++i)
+                if (m_data[i] != s[i])
+                    return false;
+            return true;
+        }
+        bool operator!=(const Sequence &s) const { return !operator==(s); }
+
+        float operator[](size_t index) const { return m_data[index]; }
+        float &operator[](size_t index) { return m_data[index]; }
+
+        bool contains(float v) const {
+            for (size_t i = 0; i < m_size; ++i)
+                if (v == m_data[i])
+                    return true;
+            return false;
+        }
+
+        Sequence reversed() const {
+            Sequence result(m_size);
+            for (size_t i = 0; i < m_size; ++i)
+                result[m_size - i - 1] = m_data[i];
+            return result;
+        }
+
+        size_t size() const { return m_size; }
+
+        const float *begin() const { return m_data; }
+        const float *end() const { return m_data+m_size; }
+
+    private:
+        size_t m_size;
+        float *m_data;
+    };
+    py::class_<Sequence>(m, "Sequence")
+        .def(py::init<size_t>())
+        .def(py::init<const std::vector<float>&>())
+        /// Bare bones interface
+        .def("__getitem__", [](const Sequence &s, size_t i) {
+            if (i >= s.size()) throw py::index_error();
+            return s[i];
+        })
+        .def("__setitem__", [](Sequence &s, size_t i, float v) {
+            if (i >= s.size()) throw py::index_error();
+            s[i] = v;
+        })
+        .def("__len__", &Sequence::size)
+        /// Optional sequence protocol operations
+        .def("__iter__", [](const Sequence &s) { return py::make_iterator(s.begin(), s.end()); },
+                         py::keep_alive<0, 1>() /* Essential: keep object alive while iterator exists */)
+        .def("__contains__", [](const Sequence &s, float v) { return s.contains(v); })
+        .def("__reversed__", [](const Sequence &s) -> Sequence { return s.reversed(); })
+        /// Slicing protocol (optional)
+        .def("__getitem__", [](const Sequence &s, py::slice slice) -> Sequence* {
+            size_t start, stop, step, slicelength;
+            if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
+                throw py::error_already_set();
+            Sequence *seq = new Sequence(slicelength);
+            for (size_t i = 0; i < slicelength; ++i) {
+                (*seq)[i] = s[start]; start += step;
+            }
+            return seq;
+        })
+        .def("__setitem__", [](Sequence &s, py::slice slice, const Sequence &value) {
+            size_t start, stop, step, slicelength;
+            if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
+                throw py::error_already_set();
+            if (slicelength != value.size())
+                throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
+            for (size_t i = 0; i < slicelength; ++i) {
+                s[start] = value[i]; start += step;
+            }
+        })
+        /// Comparisons
+        .def(py::self == py::self)
+        .def(py::self != py::self)
+        // Could also define py::self + py::self for concatenation, etc.
+        ;
+
+    // test_map_iterator
+    // Interface of a map-like object that isn't (directly) an unordered_map, but provides some basic
+    // map-like functionality.
+    class StringMap {
+    public:
+        StringMap() = default;
+        StringMap(std::unordered_map<std::string, std::string> init)
+            : map(std::move(init)) {}
+
+        void set(std::string key, std::string val) { map[key] = val; }
+        std::string get(std::string key) const { return map.at(key); }
+        size_t size() const { return map.size(); }
+    private:
+        std::unordered_map<std::string, std::string> map;
+    public:
+        decltype(map.cbegin()) begin() const { return map.cbegin(); }
+        decltype(map.cend()) end() const { return map.cend(); }
+    };
+    py::class_<StringMap>(m, "StringMap")
+        .def(py::init<>())
+        .def(py::init<std::unordered_map<std::string, std::string>>())
+        .def("__getitem__", [](const StringMap &map, std::string key) {
+                try { return map.get(key); }
+                catch (const std::out_of_range&) {
+                    throw py::key_error("key '" + key + "' does not exist");
+                }
+        })
+        .def("__setitem__", &StringMap::set)
+        .def("__len__", &StringMap::size)
+        .def("__iter__", [](const StringMap &map) { return py::make_key_iterator(map.begin(), map.end()); },
+                py::keep_alive<0, 1>())
+        .def("items", [](const StringMap &map) { return py::make_iterator(map.begin(), map.end()); },
+                py::keep_alive<0, 1>())
+        ;
+
+    // test_generalized_iterators
+    class IntPairs {
+    public:
+        IntPairs(std::vector<std::pair<int, int>> data) : data_(std::move(data)) {}
+        const std::pair<int, int>* begin() const { return data_.data(); }
+    private:
+        std::vector<std::pair<int, int>> data_;
+    };
+    py::class_<IntPairs>(m, "IntPairs")
+        .def(py::init<std::vector<std::pair<int, int>>>())
+        .def("nonzero", [](const IntPairs& s) {
+                return py::make_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
+        }, py::keep_alive<0, 1>())
+        .def("nonzero_keys", [](const IntPairs& s) {
+            return py::make_key_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
+        }, py::keep_alive<0, 1>())
+        ;
+
+
+#if 0
+    // Obsolete: special data structure for exposing custom iterator types to python
+    // kept here for illustrative purposes because there might be some use cases which
+    // are not covered by the much simpler py::make_iterator
+
+    struct PySequenceIterator {
+        PySequenceIterator(const Sequence &seq, py::object ref) : seq(seq), ref(ref) { }
+
+        float next() {
+            if (index == seq.size())
+                throw py::stop_iteration();
+            return seq[index++];
+        }
+
+        const Sequence &seq;
+        py::object ref; // keep a reference
+        size_t index = 0;
+    };
+
+    py::class_<PySequenceIterator>(seq, "Iterator")
+        .def("__iter__", [](PySequenceIterator &it) -> PySequenceIterator& { return it; })
+        .def("__next__", &PySequenceIterator::next);
+
+    On the actual Sequence object, the iterator would be constructed as follows:
+    .def("__iter__", [](py::object s) { return PySequenceIterator(s.cast<const Sequence &>(), s); })
+#endif
+
+    // test_python_iterator_in_cpp
+    m.def("object_to_list", [](py::object o) {
+        auto l = py::list();
+        for (auto item : o) {
+            l.append(item);
+        }
+        return l;
+    });
+
+    m.def("iterator_to_list", [](py::iterator it) {
+        auto l = py::list();
+        while (it != py::iterator::sentinel()) {
+            l.append(*it);
+            ++it;
+        }
+        return l;
+    });
+
+    // test_sequence_length: check that Python sequences can be converted to py::sequence.
+    m.def("sequence_length", [](py::sequence seq) { return seq.size(); });
+
+    // Make sure that py::iterator works with std algorithms
+    m.def("count_none", [](py::object o) {
+        return std::count_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
+    });
+
+    m.def("find_none", [](py::object o) {
+        auto it = std::find_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
+        return it->is_none();
+    });
+
+    m.def("count_nonzeros", [](py::dict d) {
+       return std::count_if(d.begin(), d.end(), [](std::pair<py::handle, py::handle> p) {
+           return p.second.cast<int>() != 0;
+       });
+    });
+
+    m.def("tuple_iterator", &test_random_access_iterator<py::tuple>);
+    m.def("list_iterator", &test_random_access_iterator<py::list>);
+    m.def("sequence_iterator", &test_random_access_iterator<py::sequence>);
+
+    // test_iterator_passthrough
+    // #181: iterator passthrough did not compile
+    m.def("iterator_passthrough", [](py::iterator s) -> py::iterator {
+        return py::make_iterator(std::begin(s), std::end(s));
+    });
+
+    // test_iterator_rvp
+    // #388: Can't make iterators via make_iterator() with different r/v policies
+    static std::vector<int> list = { 1, 2, 3 };
+    m.def("make_iterator_1", []() { return py::make_iterator<py::return_value_policy::copy>(list); });
+    m.def("make_iterator_2", []() { return py::make_iterator<py::return_value_policy::automatic>(list); });
+}
diff --git a/pybind11/tests/test_sequences_and_iterators.py b/pybind11/tests/test_sequences_and_iterators.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f6c0c4bbdf71bb45759d83a630f910a4f117ecd
--- /dev/null
+++ b/pybind11/tests/test_sequences_and_iterators.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import sequences_and_iterators as m
+from pybind11_tests import ConstructorStats
+
+
+def isclose(a, b, rel_tol=1e-05, abs_tol=0.0):
+    """Like math.isclose() from Python 3.5"""
+    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
+
+def allclose(a_list, b_list, rel_tol=1e-05, abs_tol=0.0):
+    return all(isclose(a, b, rel_tol=rel_tol, abs_tol=abs_tol) for a, b in zip(a_list, b_list))
+
+
+def test_generalized_iterators():
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero()) == [(1, 2), (3, 4)]
+    assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero()) == [(1, 2)]
+    assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero()) == []
+
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero_keys()) == [1, 3]
+    assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero_keys()) == [1]
+    assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero_keys()) == []
+
+    # __next__ must continue to raise StopIteration
+    it = m.IntPairs([(0, 0)]).nonzero()
+    for _ in range(3):
+        with pytest.raises(StopIteration):
+            next(it)
+
+    it = m.IntPairs([(0, 0)]).nonzero_keys()
+    for _ in range(3):
+        with pytest.raises(StopIteration):
+            next(it)
+
+
+def test_sliceable():
+    sliceable = m.Sliceable(100)
+    assert sliceable[::] == (0, 100, 1)
+    assert sliceable[10::] == (10, 100, 1)
+    assert sliceable[:10:] == (0, 10, 1)
+    assert sliceable[::10] == (0, 100, 10)
+    assert sliceable[-10::] == (90, 100, 1)
+    assert sliceable[:-10:] == (0, 90, 1)
+    assert sliceable[::-10] == (99, -1, -10)
+    assert sliceable[50:60:1] == (50, 60, 1)
+    assert sliceable[50:60:-1] == (50, 60, -1)
+
+
+def test_sequence():
+    cstats = ConstructorStats.get(m.Sequence)
+
+    s = m.Sequence(5)
+    assert cstats.values() == ['of size', '5']
+
+    assert "Sequence" in repr(s)
+    assert len(s) == 5
+    assert s[0] == 0 and s[3] == 0
+    assert 12.34 not in s
+    s[0], s[3] = 12.34, 56.78
+    assert 12.34 in s
+    assert isclose(s[0], 12.34) and isclose(s[3], 56.78)
+
+    rev = reversed(s)
+    assert cstats.values() == ['of size', '5']
+
+    rev2 = s[::-1]
+    assert cstats.values() == ['of size', '5']
+
+    it = iter(m.Sequence(0))
+    for _ in range(3):  # __next__ must continue to raise StopIteration
+        with pytest.raises(StopIteration):
+            next(it)
+    assert cstats.values() == ['of size', '0']
+
+    expected = [0, 56.78, 0, 0, 12.34]
+    assert allclose(rev, expected)
+    assert allclose(rev2, expected)
+    assert rev == rev2
+
+    rev[0::2] = m.Sequence([2.0, 2.0, 2.0])
+    assert cstats.values() == ['of size', '3', 'from std::vector']
+
+    assert allclose(rev, [2, 56.78, 2, 0, 2])
+
+    assert cstats.alive() == 4
+    del it
+    assert cstats.alive() == 3
+    del s
+    assert cstats.alive() == 2
+    del rev
+    assert cstats.alive() == 1
+    del rev2
+    assert cstats.alive() == 0
+
+    assert cstats.values() == []
+    assert cstats.default_constructions == 0
+    assert cstats.copy_constructions == 0
+    assert cstats.move_constructions >= 1
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+
+def test_sequence_length():
+    """#2076: Exception raised by len(arg) should be propagated """
+    class BadLen(RuntimeError):
+        pass
+
+    class SequenceLike():
+        def __getitem__(self, i):
+            return None
+
+        def __len__(self):
+            raise BadLen()
+
+    with pytest.raises(BadLen):
+        m.sequence_length(SequenceLike())
+
+    assert m.sequence_length([1, 2, 3]) == 3
+    assert m.sequence_length("hello") == 5
+
+
+def test_map_iterator():
+    sm = m.StringMap({'hi': 'bye', 'black': 'white'})
+    assert sm['hi'] == 'bye'
+    assert len(sm) == 2
+    assert sm['black'] == 'white'
+
+    with pytest.raises(KeyError):
+        assert sm['orange']
+    sm['orange'] = 'banana'
+    assert sm['orange'] == 'banana'
+
+    expected = {'hi': 'bye', 'black': 'white', 'orange': 'banana'}
+    for k in sm:
+        assert sm[k] == expected[k]
+    for k, v in sm.items():
+        assert v == expected[k]
+
+    it = iter(m.StringMap({}))
+    for _ in range(3):  # __next__ must continue to raise StopIteration
+        with pytest.raises(StopIteration):
+            next(it)
+
+
+def test_python_iterator_in_cpp():
+    t = (1, 2, 3)
+    assert m.object_to_list(t) == [1, 2, 3]
+    assert m.object_to_list(iter(t)) == [1, 2, 3]
+    assert m.iterator_to_list(iter(t)) == [1, 2, 3]
+
+    with pytest.raises(TypeError) as excinfo:
+        m.object_to_list(1)
+    assert "object is not iterable" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.iterator_to_list(1)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    def bad_next_call():
+        raise RuntimeError("py::iterator::advance() should propagate errors")
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.iterator_to_list(iter(bad_next_call, None))
+    assert str(excinfo.value) == "py::iterator::advance() should propagate errors"
+
+    lst = [1, None, 0, None]
+    assert m.count_none(lst) == 2
+    assert m.find_none(lst) is True
+    assert m.count_nonzeros({"a": 0, "b": 1, "c": 2}) == 2
+
+    r = range(5)
+    assert all(m.tuple_iterator(tuple(r)))
+    assert all(m.list_iterator(list(r)))
+    assert all(m.sequence_iterator(r))
+
+
+def test_iterator_passthrough():
+    """#181: iterator passthrough did not compile"""
+    from pybind11_tests.sequences_and_iterators import iterator_passthrough
+
+    assert list(iterator_passthrough(iter([3, 5, 7, 9, 11, 13, 15]))) == [3, 5, 7, 9, 11, 13, 15]
+
+
+def test_iterator_rvp():
+    """#388: Can't make iterators via make_iterator() with different r/v policies """
+    import pybind11_tests.sequences_and_iterators as m
+
+    assert list(m.make_iterator_1()) == [1, 2, 3]
+    assert list(m.make_iterator_2()) == [1, 2, 3]
+    assert not isinstance(m.make_iterator_1(), type(m.make_iterator_2()))
diff --git a/pybind11/tests/test_smart_ptr.cpp b/pybind11/tests/test_smart_ptr.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bea90691d4a2703ee7a92ed3d5b975835a7f6013
--- /dev/null
+++ b/pybind11/tests/test_smart_ptr.cpp
@@ -0,0 +1,369 @@
+/*
+    tests/test_smart_ptr.cpp -- binding classes with custom reference counting,
+    implicit conversions between types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#if defined(_MSC_VER) && _MSC_VER < 1910
+#  pragma warning(disable: 4702) // unreachable code in system header
+#endif
+
+#include "pybind11_tests.h"
+#include "object.h"
+
+// Make pybind aware of the ref-counted wrapper type (s):
+
+// ref<T> is a wrapper for 'Object' which uses intrusive reference counting
+// It is always possible to construct a ref<T> from an Object* pointer without
+// possible inconsistencies, hence the 'true' argument at the end.
+PYBIND11_DECLARE_HOLDER_TYPE(T, ref<T>, true);
+// Make pybind11 aware of the non-standard getter member function
+namespace pybind11 { namespace detail {
+    template <typename T>
+    struct holder_helper<ref<T>> {
+        static const T *get(const ref<T> &p) { return p.get_ptr(); }
+    };
+}}
+
+// The following is not required anymore for std::shared_ptr, but it should compile without error:
+PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>);
+
+// This is just a wrapper around unique_ptr, but with extra fields to deliberately bloat up the
+// holder size to trigger the non-simple-layout internal instance layout for single inheritance with
+// large holder type:
+template <typename T> class huge_unique_ptr {
+    std::unique_ptr<T> ptr;
+    uint64_t padding[10];
+public:
+    huge_unique_ptr(T *p) : ptr(p) {};
+    T *get() { return ptr.get(); }
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, huge_unique_ptr<T>);
+
+// Simple custom holder that works like unique_ptr
+template <typename T>
+class custom_unique_ptr {
+    std::unique_ptr<T> impl;
+public:
+    custom_unique_ptr(T* p) : impl(p) { }
+    T* get() const { return impl.get(); }
+    T* release_ptr() { return impl.release(); }
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, custom_unique_ptr<T>);
+
+// Simple custom holder that works like shared_ptr and has operator& overload
+// To obtain address of an instance of this holder pybind should use std::addressof
+// Attempt to get address via operator& may leads to segmentation fault
+template <typename T>
+class shared_ptr_with_addressof_operator {
+    std::shared_ptr<T> impl;
+public:
+    shared_ptr_with_addressof_operator( ) = default;
+    shared_ptr_with_addressof_operator(T* p) : impl(p) { }
+    T* get() const { return impl.get(); }
+    T** operator&() { throw std::logic_error("Call of overloaded operator& is not expected"); }
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, shared_ptr_with_addressof_operator<T>);
+
+// Simple custom holder that works like unique_ptr and has operator& overload
+// To obtain address of an instance of this holder pybind should use std::addressof
+// Attempt to get address via operator& may leads to segmentation fault
+template <typename T>
+class unique_ptr_with_addressof_operator {
+    std::unique_ptr<T> impl;
+public:
+    unique_ptr_with_addressof_operator() = default;
+    unique_ptr_with_addressof_operator(T* p) : impl(p) { }
+    T* get() const { return impl.get(); }
+    T* release_ptr() { return impl.release(); }
+    T** operator&() { throw std::logic_error("Call of overloaded operator& is not expected"); }
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, unique_ptr_with_addressof_operator<T>);
+
+
+TEST_SUBMODULE(smart_ptr, m) {
+
+    // test_smart_ptr
+
+    // Object implementation in `object.h`
+    py::class_<Object, ref<Object>> obj(m, "Object");
+    obj.def("getRefCount", &Object::getRefCount);
+
+    // Custom object with builtin reference counting (see 'object.h' for the implementation)
+    class MyObject1 : public Object {
+    public:
+        MyObject1(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject1[" + std::to_string(value) + "]"; }
+    protected:
+        virtual ~MyObject1() { print_destroyed(this); }
+    private:
+        int value;
+    };
+    py::class_<MyObject1, ref<MyObject1>>(m, "MyObject1", obj)
+        .def(py::init<int>());
+    py::implicitly_convertible<py::int_, MyObject1>();
+
+    m.def("make_object_1", []() -> Object * { return new MyObject1(1); });
+    m.def("make_object_2", []() -> ref<Object> { return new MyObject1(2); });
+    m.def("make_myobject1_1", []() -> MyObject1 * { return new MyObject1(4); });
+    m.def("make_myobject1_2", []() -> ref<MyObject1> { return new MyObject1(5); });
+    m.def("print_object_1", [](const Object *obj) { py::print(obj->toString()); });
+    m.def("print_object_2", [](ref<Object> obj) { py::print(obj->toString()); });
+    m.def("print_object_3", [](const ref<Object> &obj) { py::print(obj->toString()); });
+    m.def("print_object_4", [](const ref<Object> *obj) { py::print((*obj)->toString()); });
+    m.def("print_myobject1_1", [](const MyObject1 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_2", [](ref<MyObject1> obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_3", [](const ref<MyObject1> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_4", [](const ref<MyObject1> *obj) { py::print((*obj)->toString()); });
+
+    // Expose constructor stats for the ref type
+    m.def("cstats_ref", &ConstructorStats::get<ref_tag>);
+
+
+    // Object managed by a std::shared_ptr<>
+    class MyObject2 {
+    public:
+        MyObject2(const MyObject2 &) = default;
+        MyObject2(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject2[" + std::to_string(value) + "]"; }
+        virtual ~MyObject2() { print_destroyed(this); }
+    private:
+        int value;
+    };
+    py::class_<MyObject2, std::shared_ptr<MyObject2>>(m, "MyObject2")
+        .def(py::init<int>());
+    m.def("make_myobject2_1", []() { return new MyObject2(6); });
+    m.def("make_myobject2_2", []() { return std::make_shared<MyObject2>(7); });
+    m.def("print_myobject2_1", [](const MyObject2 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_2", [](std::shared_ptr<MyObject2> obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_3", [](const std::shared_ptr<MyObject2> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_4", [](const std::shared_ptr<MyObject2> *obj) { py::print((*obj)->toString()); });
+
+    // Object managed by a std::shared_ptr<>, additionally derives from std::enable_shared_from_this<>
+    class MyObject3 : public std::enable_shared_from_this<MyObject3> {
+    public:
+        MyObject3(const MyObject3 &) = default;
+        MyObject3(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject3[" + std::to_string(value) + "]"; }
+        virtual ~MyObject3() { print_destroyed(this); }
+    private:
+        int value;
+    };
+    py::class_<MyObject3, std::shared_ptr<MyObject3>>(m, "MyObject3")
+        .def(py::init<int>());
+    m.def("make_myobject3_1", []() { return new MyObject3(8); });
+    m.def("make_myobject3_2", []() { return std::make_shared<MyObject3>(9); });
+    m.def("print_myobject3_1", [](const MyObject3 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_2", [](std::shared_ptr<MyObject3> obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_3", [](const std::shared_ptr<MyObject3> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_4", [](const std::shared_ptr<MyObject3> *obj) { py::print((*obj)->toString()); });
+
+    // test_smart_ptr_refcounting
+    m.def("test_object1_refcounting", []() {
+        ref<MyObject1> o = new MyObject1(0);
+        bool good = o->getRefCount() == 1;
+        py::object o2 = py::cast(o, py::return_value_policy::reference);
+        // always request (partial) ownership for objects with intrusive
+        // reference counting even when using the 'reference' RVP
+        good &= o->getRefCount() == 2;
+        return good;
+    });
+
+    // test_unique_nodelete
+    // Object with a private destructor
+    class MyObject4 {
+    public:
+        MyObject4(int value) : value{value} { print_created(this); }
+        int value;
+    private:
+        ~MyObject4() { print_destroyed(this); }
+    };
+    py::class_<MyObject4, std::unique_ptr<MyObject4, py::nodelete>>(m, "MyObject4")
+        .def(py::init<int>())
+        .def_readwrite("value", &MyObject4::value);
+
+    // test_unique_deleter
+    // Object with std::unique_ptr<T, D> where D is not matching the base class
+    // Object with a protected destructor
+    class MyObject4a {
+    public:
+        MyObject4a(int i) {
+            value = i;
+            print_created(this);
+        };
+        int value;
+    protected:
+        virtual ~MyObject4a() { print_destroyed(this); }
+    };
+    py::class_<MyObject4a, std::unique_ptr<MyObject4a, py::nodelete>>(m, "MyObject4a")
+        .def(py::init<int>())
+        .def_readwrite("value", &MyObject4a::value);
+
+    // Object derived but with public destructor and no Deleter in default holder
+    class MyObject4b : public MyObject4a {
+    public:
+        MyObject4b(int i) : MyObject4a(i) { print_created(this); }
+        ~MyObject4b() { print_destroyed(this); }
+    };
+    py::class_<MyObject4b, MyObject4a>(m, "MyObject4b")
+        .def(py::init<int>());
+
+    // test_large_holder
+    class MyObject5 { // managed by huge_unique_ptr
+    public:
+        MyObject5(int value) : value{value} { print_created(this); }
+        ~MyObject5() { print_destroyed(this); }
+        int value;
+    };
+    py::class_<MyObject5, huge_unique_ptr<MyObject5>>(m, "MyObject5")
+        .def(py::init<int>())
+        .def_readwrite("value", &MyObject5::value);
+
+    // test_shared_ptr_and_references
+    struct SharedPtrRef {
+        struct A {
+            A() { print_created(this); }
+            A(const A &) { print_copy_created(this); }
+            A(A &&) { print_move_created(this); }
+            ~A() { print_destroyed(this); }
+        };
+
+        A value = {};
+        std::shared_ptr<A> shared = std::make_shared<A>();
+    };
+    using A = SharedPtrRef::A;
+    py::class_<A, std::shared_ptr<A>>(m, "A");
+    py::class_<SharedPtrRef>(m, "SharedPtrRef")
+        .def(py::init<>())
+        .def_readonly("ref", &SharedPtrRef::value)
+        .def_property_readonly("copy", [](const SharedPtrRef &s) { return s.value; },
+                               py::return_value_policy::copy)
+        .def_readonly("holder_ref", &SharedPtrRef::shared)
+        .def_property_readonly("holder_copy", [](const SharedPtrRef &s) { return s.shared; },
+                               py::return_value_policy::copy)
+        .def("set_ref", [](SharedPtrRef &, const A &) { return true; })
+        .def("set_holder", [](SharedPtrRef &, std::shared_ptr<A>) { return true; });
+
+    // test_shared_ptr_from_this_and_references
+    struct SharedFromThisRef {
+        struct B : std::enable_shared_from_this<B> {
+            B() { print_created(this); }
+            B(const B &) : std::enable_shared_from_this<B>() { print_copy_created(this); }
+            B(B &&) : std::enable_shared_from_this<B>() { print_move_created(this); }
+            ~B() { print_destroyed(this); }
+        };
+
+        B value = {};
+        std::shared_ptr<B> shared = std::make_shared<B>();
+    };
+    using B = SharedFromThisRef::B;
+    py::class_<B, std::shared_ptr<B>>(m, "B");
+    py::class_<SharedFromThisRef>(m, "SharedFromThisRef")
+        .def(py::init<>())
+        .def_readonly("bad_wp", &SharedFromThisRef::value)
+        .def_property_readonly("ref", [](const SharedFromThisRef &s) -> const B & { return *s.shared; })
+        .def_property_readonly("copy", [](const SharedFromThisRef &s) { return s.value; },
+                               py::return_value_policy::copy)
+        .def_readonly("holder_ref", &SharedFromThisRef::shared)
+        .def_property_readonly("holder_copy", [](const SharedFromThisRef &s) { return s.shared; },
+                               py::return_value_policy::copy)
+        .def("set_ref", [](SharedFromThisRef &, const B &) { return true; })
+        .def("set_holder", [](SharedFromThisRef &, std::shared_ptr<B>) { return true; });
+
+    // Issue #865: shared_from_this doesn't work with virtual inheritance
+    struct SharedFromThisVBase : std::enable_shared_from_this<SharedFromThisVBase> {
+        SharedFromThisVBase() = default;
+        SharedFromThisVBase(const SharedFromThisVBase &) = default;
+        virtual ~SharedFromThisVBase() = default;
+    };
+    struct SharedFromThisVirt : virtual SharedFromThisVBase {};
+    static std::shared_ptr<SharedFromThisVirt> sft(new SharedFromThisVirt());
+    py::class_<SharedFromThisVirt, std::shared_ptr<SharedFromThisVirt>>(m, "SharedFromThisVirt")
+        .def_static("get", []() { return sft.get(); });
+
+    // test_move_only_holder
+    struct C {
+        C() { print_created(this); }
+        ~C() { print_destroyed(this); }
+    };
+    py::class_<C, custom_unique_ptr<C>>(m, "TypeWithMoveOnlyHolder")
+        .def_static("make", []() { return custom_unique_ptr<C>(new C); })
+        .def_static("make_as_object", []() { return py::cast(custom_unique_ptr<C>(new C)); });
+
+    // test_holder_with_addressof_operator
+    struct TypeForHolderWithAddressOf {
+        TypeForHolderWithAddressOf() { print_created(this); }
+        TypeForHolderWithAddressOf(const TypeForHolderWithAddressOf &) { print_copy_created(this); }
+        TypeForHolderWithAddressOf(TypeForHolderWithAddressOf &&) { print_move_created(this); }
+        ~TypeForHolderWithAddressOf() { print_destroyed(this); }
+        std::string toString() const {
+            return "TypeForHolderWithAddressOf[" + std::to_string(value) + "]";
+        }
+        int value = 42;
+    };
+    using HolderWithAddressOf = shared_ptr_with_addressof_operator<TypeForHolderWithAddressOf>;
+    py::class_<TypeForHolderWithAddressOf, HolderWithAddressOf>(m, "TypeForHolderWithAddressOf")
+        .def_static("make", []() { return HolderWithAddressOf(new TypeForHolderWithAddressOf); })
+        .def("get", [](const HolderWithAddressOf &self) { return self.get(); })
+        .def("print_object_1", [](const TypeForHolderWithAddressOf *obj) { py::print(obj->toString()); })
+        .def("print_object_2", [](HolderWithAddressOf obj) { py::print(obj.get()->toString()); })
+        .def("print_object_3", [](const HolderWithAddressOf &obj) { py::print(obj.get()->toString()); })
+        .def("print_object_4", [](const HolderWithAddressOf *obj) { py::print((*obj).get()->toString()); });
+
+    // test_move_only_holder_with_addressof_operator
+    struct TypeForMoveOnlyHolderWithAddressOf {
+        TypeForMoveOnlyHolderWithAddressOf(int value) : value{value} { print_created(this); }
+        ~TypeForMoveOnlyHolderWithAddressOf() { print_destroyed(this); }
+        std::string toString() const {
+            return "MoveOnlyHolderWithAddressOf[" + std::to_string(value) + "]";
+        }
+        int value;
+    };
+    using MoveOnlyHolderWithAddressOf = unique_ptr_with_addressof_operator<TypeForMoveOnlyHolderWithAddressOf>;
+    py::class_<TypeForMoveOnlyHolderWithAddressOf, MoveOnlyHolderWithAddressOf>(m, "TypeForMoveOnlyHolderWithAddressOf")
+        .def_static("make", []() { return MoveOnlyHolderWithAddressOf(new TypeForMoveOnlyHolderWithAddressOf(0)); })
+        .def_readwrite("value", &TypeForMoveOnlyHolderWithAddressOf::value)
+        .def("print_object", [](const TypeForMoveOnlyHolderWithAddressOf *obj) { py::print(obj->toString()); });
+
+    // test_smart_ptr_from_default
+    struct HeldByDefaultHolder { };
+    py::class_<HeldByDefaultHolder>(m, "HeldByDefaultHolder")
+        .def(py::init<>())
+        .def_static("load_shared_ptr", [](std::shared_ptr<HeldByDefaultHolder>) {});
+
+    // test_shared_ptr_gc
+    // #187: issue involving std::shared_ptr<> return value policy & garbage collection
+    struct ElementBase {
+        virtual ~ElementBase() { } /* Force creation of virtual table */
+        ElementBase() = default;
+        ElementBase(const ElementBase&) = delete;
+    };
+    py::class_<ElementBase, std::shared_ptr<ElementBase>>(m, "ElementBase");
+
+    struct ElementA : ElementBase {
+        ElementA(int v) : v(v) { }
+        int value() { return v; }
+        int v;
+    };
+    py::class_<ElementA, ElementBase, std::shared_ptr<ElementA>>(m, "ElementA")
+        .def(py::init<int>())
+        .def("value", &ElementA::value);
+
+    struct ElementList {
+        void add(std::shared_ptr<ElementBase> e) { l.push_back(e); }
+        std::vector<std::shared_ptr<ElementBase>> l;
+    };
+    py::class_<ElementList, std::shared_ptr<ElementList>>(m, "ElementList")
+        .def(py::init<>())
+        .def("add", &ElementList::add)
+        .def("get", [](ElementList &el) {
+            py::list list;
+            for (auto &e : el.l)
+                list.append(py::cast(e));
+            return list;
+        });
+}
diff --git a/pybind11/tests/test_smart_ptr.py b/pybind11/tests/test_smart_ptr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9267f6878f1c0d912017bd2a6b0d21dd673c32b
--- /dev/null
+++ b/pybind11/tests/test_smart_ptr.py
@@ -0,0 +1,290 @@
+# -*- coding: utf-8 -*-
+import pytest
+from pybind11_tests import smart_ptr as m
+from pybind11_tests import ConstructorStats
+
+
+def test_smart_ptr(capture):
+    # Object1
+    for i, o in enumerate([m.make_object_1(), m.make_object_2(), m.MyObject1(3)], start=1):
+        assert o.getRefCount() == 1
+        with capture:
+            m.print_object_1(o)
+            m.print_object_2(o)
+            m.print_object_3(o)
+            m.print_object_4(o)
+        assert capture == "MyObject1[{i}]\n".format(i=i) * 4
+
+    for i, o in enumerate([m.make_myobject1_1(), m.make_myobject1_2(), m.MyObject1(6), 7],
+                          start=4):
+        print(o)
+        with capture:
+            if not isinstance(o, int):
+                m.print_object_1(o)
+                m.print_object_2(o)
+                m.print_object_3(o)
+                m.print_object_4(o)
+            m.print_myobject1_1(o)
+            m.print_myobject1_2(o)
+            m.print_myobject1_3(o)
+            m.print_myobject1_4(o)
+        assert capture == "MyObject1[{i}]\n".format(i=i) * (4 if isinstance(o, int) else 8)
+
+    cstats = ConstructorStats.get(m.MyObject1)
+    assert cstats.alive() == 0
+    expected_values = ['MyObject1[{}]'.format(i) for i in range(1, 7)] + ['MyObject1[7]'] * 4
+    assert cstats.values() == expected_values
+    assert cstats.default_constructions == 0
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0 # Doesn't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+    # Object2
+    for i, o in zip([8, 6, 7], [m.MyObject2(8), m.make_myobject2_1(), m.make_myobject2_2()]):
+        print(o)
+        with capture:
+            m.print_myobject2_1(o)
+            m.print_myobject2_2(o)
+            m.print_myobject2_3(o)
+            m.print_myobject2_4(o)
+        assert capture == "MyObject2[{i}]\n".format(i=i) * 4
+
+    cstats = ConstructorStats.get(m.MyObject2)
+    assert cstats.alive() == 1
+    o = None
+    assert cstats.alive() == 0
+    assert cstats.values() == ['MyObject2[8]', 'MyObject2[6]', 'MyObject2[7]']
+    assert cstats.default_constructions == 0
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0 # Doesn't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+    # Object3
+    for i, o in zip([9, 8, 9], [m.MyObject3(9), m.make_myobject3_1(), m.make_myobject3_2()]):
+        print(o)
+        with capture:
+            m.print_myobject3_1(o)
+            m.print_myobject3_2(o)
+            m.print_myobject3_3(o)
+            m.print_myobject3_4(o)
+        assert capture == "MyObject3[{i}]\n".format(i=i) * 4
+
+    cstats = ConstructorStats.get(m.MyObject3)
+    assert cstats.alive() == 1
+    o = None
+    assert cstats.alive() == 0
+    assert cstats.values() == ['MyObject3[9]', 'MyObject3[8]', 'MyObject3[9]']
+    assert cstats.default_constructions == 0
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0 # Doesn't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+    # Object
+    cstats = ConstructorStats.get(m.Object)
+    assert cstats.alive() == 0
+    assert cstats.values() == []
+    assert cstats.default_constructions == 10
+    assert cstats.copy_constructions == 0
+    # assert cstats.move_constructions >= 0 # Doesn't invoke any
+    assert cstats.copy_assignments == 0
+    assert cstats.move_assignments == 0
+
+    # ref<>
+    cstats = m.cstats_ref()
+    assert cstats.alive() == 0
+    assert cstats.values() == ['from pointer'] * 10
+    assert cstats.default_constructions == 30
+    assert cstats.copy_constructions == 12
+    # assert cstats.move_constructions >= 0 # Doesn't invoke any
+    assert cstats.copy_assignments == 30
+    assert cstats.move_assignments == 0
+
+
+def test_smart_ptr_refcounting():
+    assert m.test_object1_refcounting()
+
+
+def test_unique_nodelete():
+    o = m.MyObject4(23)
+    assert o.value == 23
+    cstats = ConstructorStats.get(m.MyObject4)
+    assert cstats.alive() == 1
+    del o
+    assert cstats.alive() == 1  # Leak, but that's intentional
+
+
+def test_unique_nodelete4a():
+    o = m.MyObject4a(23)
+    assert o.value == 23
+    cstats = ConstructorStats.get(m.MyObject4a)
+    assert cstats.alive() == 1
+    del o
+    assert cstats.alive() == 1  # Leak, but that's intentional
+
+
+def test_unique_deleter():
+    o = m.MyObject4b(23)
+    assert o.value == 23
+    cstats4a = ConstructorStats.get(m.MyObject4a)
+    assert cstats4a.alive() == 2  # Two because of previous test
+    cstats4b = ConstructorStats.get(m.MyObject4b)
+    assert cstats4b.alive() == 1
+    del o
+    assert cstats4a.alive() == 1  # Should now only be one leftover from previous test
+    assert cstats4b.alive() == 0  # Should be deleted
+
+
+def test_large_holder():
+    o = m.MyObject5(5)
+    assert o.value == 5
+    cstats = ConstructorStats.get(m.MyObject5)
+    assert cstats.alive() == 1
+    del o
+    assert cstats.alive() == 0
+
+
+def test_shared_ptr_and_references():
+    s = m.SharedPtrRef()
+    stats = ConstructorStats.get(m.A)
+    assert stats.alive() == 2
+
+    ref = s.ref  # init_holder_helper(holder_ptr=false, owned=false)
+    assert stats.alive() == 2
+    assert s.set_ref(ref)
+    with pytest.raises(RuntimeError) as excinfo:
+        assert s.set_holder(ref)
+    assert "Unable to cast from non-held to held instance" in str(excinfo.value)
+
+    copy = s.copy  # init_holder_helper(holder_ptr=false, owned=true)
+    assert stats.alive() == 3
+    assert s.set_ref(copy)
+    assert s.set_holder(copy)
+
+    holder_ref = s.holder_ref  # init_holder_helper(holder_ptr=true, owned=false)
+    assert stats.alive() == 3
+    assert s.set_ref(holder_ref)
+    assert s.set_holder(holder_ref)
+
+    holder_copy = s.holder_copy  # init_holder_helper(holder_ptr=true, owned=true)
+    assert stats.alive() == 3
+    assert s.set_ref(holder_copy)
+    assert s.set_holder(holder_copy)
+
+    del ref, copy, holder_ref, holder_copy, s
+    assert stats.alive() == 0
+
+
+def test_shared_ptr_from_this_and_references():
+    s = m.SharedFromThisRef()
+    stats = ConstructorStats.get(m.B)
+    assert stats.alive() == 2
+
+    ref = s.ref  # init_holder_helper(holder_ptr=false, owned=false, bad_wp=false)
+    assert stats.alive() == 2
+    assert s.set_ref(ref)
+    assert s.set_holder(ref)  # std::enable_shared_from_this can create a holder from a reference
+
+    bad_wp = s.bad_wp  # init_holder_helper(holder_ptr=false, owned=false, bad_wp=true)
+    assert stats.alive() == 2
+    assert s.set_ref(bad_wp)
+    with pytest.raises(RuntimeError) as excinfo:
+        assert s.set_holder(bad_wp)
+    assert "Unable to cast from non-held to held instance" in str(excinfo.value)
+
+    copy = s.copy  # init_holder_helper(holder_ptr=false, owned=true, bad_wp=false)
+    assert stats.alive() == 3
+    assert s.set_ref(copy)
+    assert s.set_holder(copy)
+
+    holder_ref = s.holder_ref  # init_holder_helper(holder_ptr=true, owned=false, bad_wp=false)
+    assert stats.alive() == 3
+    assert s.set_ref(holder_ref)
+    assert s.set_holder(holder_ref)
+
+    holder_copy = s.holder_copy  # init_holder_helper(holder_ptr=true, owned=true, bad_wp=false)
+    assert stats.alive() == 3
+    assert s.set_ref(holder_copy)
+    assert s.set_holder(holder_copy)
+
+    del ref, bad_wp, copy, holder_ref, holder_copy, s
+    assert stats.alive() == 0
+
+    z = m.SharedFromThisVirt.get()
+    y = m.SharedFromThisVirt.get()
+    assert y is z
+
+
+def test_move_only_holder():
+    a = m.TypeWithMoveOnlyHolder.make()
+    b = m.TypeWithMoveOnlyHolder.make_as_object()
+    stats = ConstructorStats.get(m.TypeWithMoveOnlyHolder)
+    assert stats.alive() == 2
+    del b
+    assert stats.alive() == 1
+    del a
+    assert stats.alive() == 0
+
+
+def test_holder_with_addressof_operator():
+    # this test must not throw exception from c++
+    a = m.TypeForHolderWithAddressOf.make()
+    a.print_object_1()
+    a.print_object_2()
+    a.print_object_3()
+    a.print_object_4()
+
+    stats = ConstructorStats.get(m.TypeForHolderWithAddressOf)
+    assert stats.alive() == 1
+
+    np = m.TypeForHolderWithAddressOf.make()
+    assert stats.alive() == 2
+    del a
+    assert stats.alive() == 1
+    del np
+    assert stats.alive() == 0
+
+    b = m.TypeForHolderWithAddressOf.make()
+    c = b
+    assert b.get() is c.get()
+    assert stats.alive() == 1
+
+    del b
+    assert stats.alive() == 1
+
+    del c
+    assert stats.alive() == 0
+
+
+def test_move_only_holder_with_addressof_operator():
+    a = m.TypeForMoveOnlyHolderWithAddressOf.make()
+    a.print_object()
+
+    stats = ConstructorStats.get(m.TypeForMoveOnlyHolderWithAddressOf)
+    assert stats.alive() == 1
+
+    a.value = 42
+    assert a.value == 42
+
+    del a
+    assert stats.alive() == 0
+
+
+def test_smart_ptr_from_default():
+    instance = m.HeldByDefaultHolder()
+    with pytest.raises(RuntimeError) as excinfo:
+        m.HeldByDefaultHolder.load_shared_ptr(instance)
+    assert "Unable to load a custom holder type from a " \
+           "default-holder instance" in str(excinfo.value)
+
+
+def test_shared_ptr_gc():
+    """#187: issue involving std::shared_ptr<> return value policy & garbage collection"""
+    el = m.ElementList()
+    for i in range(10):
+        el.add(m.ElementA(i))
+    pytest.gc_collect()
+    for i, v in enumerate(el.get()):
+        assert i == v.value()
diff --git a/pybind11/tests/test_stl.cpp b/pybind11/tests/test_stl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..928635788e484d98f3cc8cf701d9221bef0a8bac
--- /dev/null
+++ b/pybind11/tests/test_stl.cpp
@@ -0,0 +1,324 @@
+/*
+    tests/test_stl.cpp -- STL type casters
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/stl.h>
+
+#include <vector>
+#include <string>
+
+// Test with `std::variant` in C++17 mode, or with `boost::variant` in C++11/14
+#if PYBIND11_HAS_VARIANT
+using std::variant;
+#elif defined(PYBIND11_TEST_BOOST) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
+#  include <boost/variant.hpp>
+#  define PYBIND11_HAS_VARIANT 1
+using boost::variant;
+
+namespace pybind11 { namespace detail {
+template <typename... Ts>
+struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> {};
+
+template <>
+struct visit_helper<boost::variant> {
+    template <typename... Args>
+    static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+        return boost::apply_visitor(args...);
+    }
+};
+}} // namespace pybind11::detail
+#endif
+
+PYBIND11_MAKE_OPAQUE(std::vector<std::string, std::allocator<std::string>>);
+
+/// Issue #528: templated constructor
+struct TplCtorClass {
+    template <typename T> TplCtorClass(const T &) { }
+    bool operator==(const TplCtorClass &) const { return true; }
+};
+
+namespace std {
+    template <>
+    struct hash<TplCtorClass> { size_t operator()(const TplCtorClass &) const { return 0; } };
+}
+
+
+template <template <typename> class OptionalImpl, typename T>
+struct OptionalHolder
+{
+    OptionalHolder() = default;
+    bool member_initialized() const {
+        return member && member->initialized;
+    }
+    OptionalImpl<T> member = T{};
+};
+
+
+TEST_SUBMODULE(stl, m) {
+    // test_vector
+    m.def("cast_vector", []() { return std::vector<int>{1}; });
+    m.def("load_vector", [](const std::vector<int> &v) { return v.at(0) == 1 && v.at(1) == 2; });
+    // `std::vector<bool>` is special because it returns proxy objects instead of references
+    m.def("cast_bool_vector", []() { return std::vector<bool>{true, false}; });
+    m.def("load_bool_vector", [](const std::vector<bool> &v) {
+        return v.at(0) == true && v.at(1) == false;
+    });
+    // Unnumbered regression (caused by #936): pointers to stl containers aren't castable
+    static std::vector<RValueCaster> lvv{2};
+    m.def("cast_ptr_vector", []() { return &lvv; });
+
+    // test_deque
+    m.def("cast_deque", []() { return std::deque<int>{1}; });
+    m.def("load_deque", [](const std::deque<int> &v) { return v.at(0) == 1 && v.at(1) == 2; });
+
+    // test_array
+    m.def("cast_array", []() { return std::array<int, 2> {{1 , 2}}; });
+    m.def("load_array", [](const std::array<int, 2> &a) { return a[0] == 1 && a[1] == 2; });
+
+    // test_valarray
+    m.def("cast_valarray", []() { return std::valarray<int>{1, 4, 9}; });
+    m.def("load_valarray", [](const std::valarray<int>& v) {
+        return v.size() == 3 && v[0] == 1 && v[1] == 4 && v[2] == 9;
+    });
+
+    // test_map
+    m.def("cast_map", []() { return std::map<std::string, std::string>{{"key", "value"}}; });
+    m.def("load_map", [](const std::map<std::string, std::string> &map) {
+        return map.at("key") == "value" && map.at("key2") == "value2";
+    });
+
+    // test_set
+    m.def("cast_set", []() { return std::set<std::string>{"key1", "key2"}; });
+    m.def("load_set", [](const std::set<std::string> &set) {
+        return set.count("key1") && set.count("key2") && set.count("key3");
+    });
+
+    // test_recursive_casting
+    m.def("cast_rv_vector", []() { return std::vector<RValueCaster>{2}; });
+    m.def("cast_rv_array", []() { return std::array<RValueCaster, 3>(); });
+    // NB: map and set keys are `const`, so while we technically do move them (as `const Type &&`),
+    // casters don't typically do anything with that, which means they fall to the `const Type &`
+    // caster.
+    m.def("cast_rv_map", []() { return std::unordered_map<std::string, RValueCaster>{{"a", RValueCaster{}}}; });
+    m.def("cast_rv_nested", []() {
+        std::vector<std::array<std::list<std::unordered_map<std::string, RValueCaster>>, 2>> v;
+        v.emplace_back(); // add an array
+        v.back()[0].emplace_back(); // add a map to the array
+        v.back()[0].back().emplace("b", RValueCaster{});
+        v.back()[0].back().emplace("c", RValueCaster{});
+        v.back()[1].emplace_back(); // add a map to the array
+        v.back()[1].back().emplace("a", RValueCaster{});
+        return v;
+    });
+    static std::array<RValueCaster, 2> lva;
+    static std::unordered_map<std::string, RValueCaster> lvm{{"a", RValueCaster{}}, {"b", RValueCaster{}}};
+    static std::unordered_map<std::string, std::vector<std::list<std::array<RValueCaster, 2>>>> lvn;
+    lvn["a"].emplace_back(); // add a list
+    lvn["a"].back().emplace_back(); // add an array
+    lvn["a"].emplace_back(); // another list
+    lvn["a"].back().emplace_back(); // add an array
+    lvn["b"].emplace_back(); // add a list
+    lvn["b"].back().emplace_back(); // add an array
+    lvn["b"].back().emplace_back(); // add another array
+    m.def("cast_lv_vector", []() -> const decltype(lvv) & { return lvv; });
+    m.def("cast_lv_array", []() -> const decltype(lva) & { return lva; });
+    m.def("cast_lv_map", []() -> const decltype(lvm) & { return lvm; });
+    m.def("cast_lv_nested", []() -> const decltype(lvn) & { return lvn; });
+    // #853:
+    m.def("cast_unique_ptr_vector", []() {
+        std::vector<std::unique_ptr<UserType>> v;
+        v.emplace_back(new UserType{7});
+        v.emplace_back(new UserType{42});
+        return v;
+    });
+
+    // test_move_out_container
+    struct MoveOutContainer {
+        struct Value { int value; };
+        std::list<Value> move_list() const { return {{0}, {1}, {2}}; }
+    };
+    py::class_<MoveOutContainer::Value>(m, "MoveOutContainerValue")
+        .def_readonly("value", &MoveOutContainer::Value::value);
+    py::class_<MoveOutContainer>(m, "MoveOutContainer")
+        .def(py::init<>())
+        .def_property_readonly("move_list", &MoveOutContainer::move_list);
+
+    // Class that can be move- and copy-constructed, but not assigned
+    struct NoAssign {
+        int value;
+
+        explicit NoAssign(int value = 0) : value(value) { }
+        NoAssign(const NoAssign &) = default;
+        NoAssign(NoAssign &&) = default;
+
+        NoAssign &operator=(const NoAssign &) = delete;
+        NoAssign &operator=(NoAssign &&) = delete;
+    };
+    py::class_<NoAssign>(m, "NoAssign", "Class with no C++ assignment operators")
+        .def(py::init<>())
+        .def(py::init<int>());
+
+
+    struct MoveOutDetector
+    {
+        MoveOutDetector() = default;
+        MoveOutDetector(const MoveOutDetector&) = default;
+        MoveOutDetector(MoveOutDetector&& other) noexcept
+         : initialized(other.initialized) {
+            // steal underlying resource
+            other.initialized = false;
+        }
+        bool initialized = true;
+    };
+    py::class_<MoveOutDetector>(m, "MoveOutDetector", "Class with move tracking")
+        .def(py::init<>())
+        .def_readonly("initialized", &MoveOutDetector::initialized);
+
+
+#ifdef PYBIND11_HAS_OPTIONAL
+    // test_optional
+    m.attr("has_optional") = true;
+
+    using opt_int = std::optional<int>;
+    using opt_no_assign = std::optional<NoAssign>;
+    m.def("double_or_zero", [](const opt_int& x) -> int {
+        return x.value_or(0) * 2;
+    });
+    m.def("half_or_none", [](int x) -> opt_int {
+        return x ? opt_int(x / 2) : opt_int();
+    });
+    m.def("test_nullopt", [](opt_int x) {
+        return x.value_or(42);
+    }, py::arg_v("x", std::nullopt, "None"));
+    m.def("test_no_assign", [](const opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", std::nullopt, "None"));
+
+    m.def("nodefer_none_optional", [](std::optional<int>) { return true; });
+    m.def("nodefer_none_optional", [](py::none) { return false; });
+
+    using opt_holder = OptionalHolder<std::optional, MoveOutDetector>;
+    py::class_<opt_holder>(m, "OptionalHolder", "Class with optional member")
+        .def(py::init<>())
+        .def_readonly("member", &opt_holder::member)
+        .def("member_initialized", &opt_holder::member_initialized);
+#endif
+
+#ifdef PYBIND11_HAS_EXP_OPTIONAL
+    // test_exp_optional
+    m.attr("has_exp_optional") = true;
+
+    using exp_opt_int = std::experimental::optional<int>;
+    using exp_opt_no_assign = std::experimental::optional<NoAssign>;
+    m.def("double_or_zero_exp", [](const exp_opt_int& x) -> int {
+        return x.value_or(0) * 2;
+    });
+    m.def("half_or_none_exp", [](int x) -> exp_opt_int {
+        return x ? exp_opt_int(x / 2) : exp_opt_int();
+    });
+    m.def("test_nullopt_exp", [](exp_opt_int x) {
+        return x.value_or(42);
+    }, py::arg_v("x", std::experimental::nullopt, "None"));
+    m.def("test_no_assign_exp", [](const exp_opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", std::experimental::nullopt, "None"));
+
+    using opt_exp_holder = OptionalHolder<std::experimental::optional, MoveOutDetector>;
+    py::class_<opt_exp_holder>(m, "OptionalExpHolder", "Class with optional member")
+        .def(py::init<>())
+        .def_readonly("member", &opt_exp_holder::member)
+        .def("member_initialized", &opt_exp_holder::member_initialized);
+#endif
+
+#ifdef PYBIND11_HAS_VARIANT
+    static_assert(std::is_same<py::detail::variant_caster_visitor::result_type, py::handle>::value,
+                  "visitor::result_type is required by boost::variant in C++11 mode");
+
+    struct visitor {
+        using result_type = const char *;
+
+        result_type operator()(int) { return "int"; }
+        result_type operator()(std::string) { return "std::string"; }
+        result_type operator()(double) { return "double"; }
+        result_type operator()(std::nullptr_t) { return "std::nullptr_t"; }
+    };
+
+    // test_variant
+    m.def("load_variant", [](variant<int, std::string, double, std::nullptr_t> v) {
+        return py::detail::visit_helper<variant>::call(visitor(), v);
+    });
+    m.def("load_variant_2pass", [](variant<double, int> v) {
+        return py::detail::visit_helper<variant>::call(visitor(), v);
+    });
+    m.def("cast_variant", []() {
+        using V = variant<int, std::string>;
+        return py::make_tuple(V(5), V("Hello"));
+    });
+#endif
+
+    // #528: templated constructor
+    // (no python tests: the test here is that this compiles)
+    m.def("tpl_ctor_vector", [](std::vector<TplCtorClass> &) {});
+    m.def("tpl_ctor_map", [](std::unordered_map<TplCtorClass, TplCtorClass> &) {});
+    m.def("tpl_ctor_set", [](std::unordered_set<TplCtorClass> &) {});
+#if defined(PYBIND11_HAS_OPTIONAL)
+    m.def("tpl_constr_optional", [](std::optional<TplCtorClass> &) {});
+#elif defined(PYBIND11_HAS_EXP_OPTIONAL)
+    m.def("tpl_constr_optional", [](std::experimental::optional<TplCtorClass> &) {});
+#endif
+
+    // test_vec_of_reference_wrapper
+    // #171: Can't return STL structures containing reference wrapper
+    m.def("return_vec_of_reference_wrapper", [](std::reference_wrapper<UserType> p4) {
+        static UserType p1{1}, p2{2}, p3{3};
+        return std::vector<std::reference_wrapper<UserType>> {
+            std::ref(p1), std::ref(p2), std::ref(p3), p4
+        };
+    });
+
+    // test_stl_pass_by_pointer
+    m.def("stl_pass_by_pointer", [](std::vector<int>* v) { return *v; }, "v"_a=nullptr);
+
+    // #1258: pybind11/stl.h converts string to vector<string>
+    m.def("func_with_string_or_vector_string_arg_overload", [](std::vector<std::string>) { return 1; });
+    m.def("func_with_string_or_vector_string_arg_overload", [](std::list<std::string>) { return 2; });
+    m.def("func_with_string_or_vector_string_arg_overload", [](std::string) { return 3; });
+
+    class Placeholder {
+    public:
+        Placeholder() { print_created(this); }
+        Placeholder(const Placeholder &) = delete;
+        ~Placeholder() { print_destroyed(this); }
+    };
+    py::class_<Placeholder>(m, "Placeholder");
+
+    /// test_stl_vector_ownership
+    m.def("test_stl_ownership",
+          []() {
+              std::vector<Placeholder *> result;
+              result.push_back(new Placeholder());
+              return result;
+          },
+          py::return_value_policy::take_ownership);
+
+    m.def("array_cast_sequence", [](std::array<int, 3> x) { return x; });
+
+    /// test_issue_1561
+    struct Issue1561Inner { std::string data; };
+    struct Issue1561Outer { std::vector<Issue1561Inner> list; };
+
+    py::class_<Issue1561Inner>(m, "Issue1561Inner")
+        .def(py::init<std::string>())
+        .def_readwrite("data", &Issue1561Inner::data);
+
+    py::class_<Issue1561Outer>(m, "Issue1561Outer")
+        .def(py::init<>())
+        .def_readwrite("list", &Issue1561Outer::list);
+}
diff --git a/pybind11/tests/test_stl.py b/pybind11/tests/test_stl.py
new file mode 100644
index 0000000000000000000000000000000000000000..141b3e8492c7400e4d0980dd9bc6347f5229f80a
--- /dev/null
+++ b/pybind11/tests/test_stl.py
@@ -0,0 +1,252 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from pybind11_tests import stl as m
+from pybind11_tests import UserType
+from pybind11_tests import ConstructorStats
+
+
+def test_vector(doc):
+    """std::vector <-> list"""
+    lst = m.cast_vector()
+    assert lst == [1]
+    lst.append(2)
+    assert m.load_vector(lst)
+    assert m.load_vector(tuple(lst))
+
+    assert m.cast_bool_vector() == [True, False]
+    assert m.load_bool_vector([True, False])
+
+    assert doc(m.cast_vector) == "cast_vector() -> List[int]"
+    assert doc(m.load_vector) == "load_vector(arg0: List[int]) -> bool"
+
+    # Test regression caused by 936: pointers to stl containers weren't castable
+    assert m.cast_ptr_vector() == ["lvalue", "lvalue"]
+
+
+def test_deque(doc):
+    """std::deque <-> list"""
+    lst = m.cast_deque()
+    assert lst == [1]
+    lst.append(2)
+    assert m.load_deque(lst)
+    assert m.load_deque(tuple(lst))
+
+
+def test_array(doc):
+    """std::array <-> list"""
+    lst = m.cast_array()
+    assert lst == [1, 2]
+    assert m.load_array(lst)
+
+    assert doc(m.cast_array) == "cast_array() -> List[int[2]]"
+    assert doc(m.load_array) == "load_array(arg0: List[int[2]]) -> bool"
+
+
+def test_valarray(doc):
+    """std::valarray <-> list"""
+    lst = m.cast_valarray()
+    assert lst == [1, 4, 9]
+    assert m.load_valarray(lst)
+
+    assert doc(m.cast_valarray) == "cast_valarray() -> List[int]"
+    assert doc(m.load_valarray) == "load_valarray(arg0: List[int]) -> bool"
+
+
+def test_map(doc):
+    """std::map <-> dict"""
+    d = m.cast_map()
+    assert d == {"key": "value"}
+    assert "key" in d
+    d["key2"] = "value2"
+    assert "key2" in d
+    assert m.load_map(d)
+
+    assert doc(m.cast_map) == "cast_map() -> Dict[str, str]"
+    assert doc(m.load_map) == "load_map(arg0: Dict[str, str]) -> bool"
+
+
+def test_set(doc):
+    """std::set <-> set"""
+    s = m.cast_set()
+    assert s == {"key1", "key2"}
+    s.add("key3")
+    assert m.load_set(s)
+
+    assert doc(m.cast_set) == "cast_set() -> Set[str]"
+    assert doc(m.load_set) == "load_set(arg0: Set[str]) -> bool"
+
+
+def test_recursive_casting():
+    """Tests that stl casters preserve lvalue/rvalue context for container values"""
+    assert m.cast_rv_vector() == ["rvalue", "rvalue"]
+    assert m.cast_lv_vector() == ["lvalue", "lvalue"]
+    assert m.cast_rv_array() == ["rvalue", "rvalue", "rvalue"]
+    assert m.cast_lv_array() == ["lvalue", "lvalue"]
+    assert m.cast_rv_map() == {"a": "rvalue"}
+    assert m.cast_lv_map() == {"a": "lvalue", "b": "lvalue"}
+    assert m.cast_rv_nested() == [[[{"b": "rvalue", "c": "rvalue"}], [{"a": "rvalue"}]]]
+    assert m.cast_lv_nested() == {
+        "a": [[["lvalue", "lvalue"]], [["lvalue", "lvalue"]]],
+        "b": [[["lvalue", "lvalue"], ["lvalue", "lvalue"]]]
+    }
+
+    # Issue #853 test case:
+    z = m.cast_unique_ptr_vector()
+    assert z[0].value == 7 and z[1].value == 42
+
+
+def test_move_out_container():
+    """Properties use the `reference_internal` policy by default. If the underlying function
+    returns an rvalue, the policy is automatically changed to `move` to avoid referencing
+    a temporary. In case the return value is a container of user-defined types, the policy
+    also needs to be applied to the elements, not just the container."""
+    c = m.MoveOutContainer()
+    moved_out_list = c.move_list
+    assert [x.value for x in moved_out_list] == [0, 1, 2]
+
+
+@pytest.mark.skipif(not hasattr(m, "has_optional"), reason='no <optional>')
+def test_optional():
+    assert m.double_or_zero(None) == 0
+    assert m.double_or_zero(42) == 84
+    pytest.raises(TypeError, m.double_or_zero, 'foo')
+
+    assert m.half_or_none(0) is None
+    assert m.half_or_none(42) == 21
+    pytest.raises(TypeError, m.half_or_none, 'foo')
+
+    assert m.test_nullopt() == 42
+    assert m.test_nullopt(None) == 42
+    assert m.test_nullopt(42) == 42
+    assert m.test_nullopt(43) == 43
+
+    assert m.test_no_assign() == 42
+    assert m.test_no_assign(None) == 42
+    assert m.test_no_assign(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign, 43)
+
+    assert m.nodefer_none_optional(None)
+
+    holder = m.OptionalHolder()
+    mvalue = holder.member
+    assert mvalue.initialized
+    assert holder.member_initialized()
+
+
+@pytest.mark.skipif(not hasattr(m, "has_exp_optional"), reason='no <experimental/optional>')
+def test_exp_optional():
+    assert m.double_or_zero_exp(None) == 0
+    assert m.double_or_zero_exp(42) == 84
+    pytest.raises(TypeError, m.double_or_zero_exp, 'foo')
+
+    assert m.half_or_none_exp(0) is None
+    assert m.half_or_none_exp(42) == 21
+    pytest.raises(TypeError, m.half_or_none_exp, 'foo')
+
+    assert m.test_nullopt_exp() == 42
+    assert m.test_nullopt_exp(None) == 42
+    assert m.test_nullopt_exp(42) == 42
+    assert m.test_nullopt_exp(43) == 43
+
+    assert m.test_no_assign_exp() == 42
+    assert m.test_no_assign_exp(None) == 42
+    assert m.test_no_assign_exp(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign_exp, 43)
+
+    holder = m.OptionalExpHolder()
+    mvalue = holder.member
+    assert mvalue.initialized
+    assert holder.member_initialized()
+
+
+@pytest.mark.skipif(not hasattr(m, "load_variant"), reason='no <variant>')
+def test_variant(doc):
+    assert m.load_variant(1) == "int"
+    assert m.load_variant("1") == "std::string"
+    assert m.load_variant(1.0) == "double"
+    assert m.load_variant(None) == "std::nullptr_t"
+
+    assert m.load_variant_2pass(1) == "int"
+    assert m.load_variant_2pass(1.0) == "double"
+
+    assert m.cast_variant() == (5, "Hello")
+
+    assert doc(m.load_variant) == "load_variant(arg0: Union[int, str, float, None]) -> str"
+
+
+def test_vec_of_reference_wrapper():
+    """#171: Can't return reference wrappers (or STL structures containing them)"""
+    assert str(m.return_vec_of_reference_wrapper(UserType(4))) == \
+        "[UserType(1), UserType(2), UserType(3), UserType(4)]"
+
+
+def test_stl_pass_by_pointer(msg):
+    """Passing nullptr or None to an STL container pointer is not expected to work"""
+    with pytest.raises(TypeError) as excinfo:
+        m.stl_pass_by_pointer()  # default value is `nullptr`
+    assert msg(excinfo.value) == """
+        stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
+            1. (v: List[int] = None) -> List[int]
+
+        Invoked with:
+    """  # noqa: E501 line too long
+
+    with pytest.raises(TypeError) as excinfo:
+        m.stl_pass_by_pointer(None)
+    assert msg(excinfo.value) == """
+        stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
+            1. (v: List[int] = None) -> List[int]
+
+        Invoked with: None
+    """  # noqa: E501 line too long
+
+    assert m.stl_pass_by_pointer([1, 2, 3]) == [1, 2, 3]
+
+
+def test_missing_header_message():
+    """Trying convert `list` to a `std::vector`, or vice versa, without including
+    <pybind11/stl.h> should result in a helpful suggestion in the error message"""
+    import pybind11_cross_module_tests as cm
+
+    expected_message = ("Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
+                        "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
+                        "conversions are optional and require extra headers to be included\n"
+                        "when compiling your pybind11 module.")
+
+    with pytest.raises(TypeError) as excinfo:
+        cm.missing_header_arg([1.0, 2.0, 3.0])
+    assert expected_message in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        cm.missing_header_return()
+    assert expected_message in str(excinfo.value)
+
+
+def test_function_with_string_and_vector_string_arg():
+    """Check if a string is NOT implicitly converted to a list, which was the
+    behavior before fix of issue #1258"""
+    assert m.func_with_string_or_vector_string_arg_overload(('A', 'B', )) == 2
+    assert m.func_with_string_or_vector_string_arg_overload(['A', 'B']) == 2
+    assert m.func_with_string_or_vector_string_arg_overload('A') == 3
+
+
+def test_stl_ownership():
+    cstats = ConstructorStats.get(m.Placeholder)
+    assert cstats.alive() == 0
+    r = m.test_stl_ownership()
+    assert len(r) == 1
+    del r
+    assert cstats.alive() == 0
+
+
+def test_array_cast_sequence():
+    assert m.array_cast_sequence((1, 2, 3)) == [1, 2, 3]
+
+
+def test_issue_1561():
+    """ check fix for issue #1561 """
+    bar = m.Issue1561Outer()
+    bar.list = [m.Issue1561Inner('bar')]
+    bar.list
+    assert bar.list[0].data == 'bar'
diff --git a/pybind11/tests/test_stl_binders.cpp b/pybind11/tests/test_stl_binders.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8688874091219f5a5035f5eb46e976e7408080b8
--- /dev/null
+++ b/pybind11/tests/test_stl_binders.cpp
@@ -0,0 +1,129 @@
+/*
+    tests/test_stl_binders.cpp -- Usage of stl_binders functions
+
+    Copyright (c) 2016 Sergey Lyskov
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+#include <pybind11/stl_bind.h>
+#include <pybind11/numpy.h>
+#include <map>
+#include <deque>
+#include <unordered_map>
+
+class El {
+public:
+    El() = delete;
+    El(int v) : a(v) { }
+
+    int a;
+};
+
+std::ostream & operator<<(std::ostream &s, El const&v) {
+    s << "El{" << v.a << '}';
+    return s;
+}
+
+/// Issue #487: binding std::vector<E> with E non-copyable
+class E_nc {
+public:
+    explicit E_nc(int i) : value{i} {}
+    E_nc(const E_nc &) = delete;
+    E_nc &operator=(const E_nc &) = delete;
+    E_nc(E_nc &&) = default;
+    E_nc &operator=(E_nc &&) = default;
+
+    int value;
+};
+
+template <class Container> Container *one_to_n(int n) {
+    auto v = new Container();
+    for (int i = 1; i <= n; i++)
+        v->emplace_back(i);
+    return v;
+}
+
+template <class Map> Map *times_ten(int n) {
+    auto m = new Map();
+    for (int i = 1; i <= n; i++)
+        m->emplace(int(i), E_nc(10*i));
+    return m;
+}
+
+template <class NestMap> NestMap *times_hundred(int n) {
+    auto m = new NestMap();
+    for (int i = 1; i <= n; i++)
+        for (int j = 1; j <= n; j++)
+            (*m)[i].emplace(int(j*10), E_nc(100*j));
+    return m;
+}
+
+TEST_SUBMODULE(stl_binders, m) {
+    // test_vector_int
+    py::bind_vector<std::vector<unsigned int>>(m, "VectorInt", py::buffer_protocol());
+
+    // test_vector_custom
+    py::class_<El>(m, "El")
+        .def(py::init<int>());
+    py::bind_vector<std::vector<El>>(m, "VectorEl");
+    py::bind_vector<std::vector<std::vector<El>>>(m, "VectorVectorEl");
+
+    // test_map_string_double
+    py::bind_map<std::map<std::string, double>>(m, "MapStringDouble");
+    py::bind_map<std::unordered_map<std::string, double>>(m, "UnorderedMapStringDouble");
+
+    // test_map_string_double_const
+    py::bind_map<std::map<std::string, double const>>(m, "MapStringDoubleConst");
+    py::bind_map<std::unordered_map<std::string, double const>>(m, "UnorderedMapStringDoubleConst");
+
+    py::class_<E_nc>(m, "ENC")
+        .def(py::init<int>())
+        .def_readwrite("value", &E_nc::value);
+
+    // test_noncopyable_containers
+    py::bind_vector<std::vector<E_nc>>(m, "VectorENC");
+    m.def("get_vnc", &one_to_n<std::vector<E_nc>>, py::return_value_policy::reference);
+    py::bind_vector<std::deque<E_nc>>(m, "DequeENC");
+    m.def("get_dnc", &one_to_n<std::deque<E_nc>>, py::return_value_policy::reference);
+    py::bind_map<std::map<int, E_nc>>(m, "MapENC");
+    m.def("get_mnc", &times_ten<std::map<int, E_nc>>, py::return_value_policy::reference);
+    py::bind_map<std::unordered_map<int, E_nc>>(m, "UmapENC");
+    m.def("get_umnc", &times_ten<std::unordered_map<int, E_nc>>, py::return_value_policy::reference);
+    // Issue #1885: binding nested std::map<X, Container<E>> with E non-copyable
+    py::bind_map<std::map<int, std::vector<E_nc>>>(m, "MapVecENC");
+    m.def("get_nvnc", [](int n)
+        {
+            auto m = new std::map<int, std::vector<E_nc>>();
+            for (int i = 1; i <= n; i++)
+                for (int j = 1; j <= n; j++)
+                    (*m)[i].emplace_back(j);
+            return m;
+        }, py::return_value_policy::reference);
+    py::bind_map<std::map<int, std::map<int, E_nc>>>(m, "MapMapENC");
+    m.def("get_nmnc", &times_hundred<std::map<int, std::map<int, E_nc>>>, py::return_value_policy::reference);
+    py::bind_map<std::unordered_map<int, std::unordered_map<int, E_nc>>>(m, "UmapUmapENC");
+    m.def("get_numnc", &times_hundred<std::unordered_map<int, std::unordered_map<int, E_nc>>>, py::return_value_policy::reference);
+
+    // test_vector_buffer
+    py::bind_vector<std::vector<unsigned char>>(m, "VectorUChar", py::buffer_protocol());
+    // no dtype declared for this version:
+    struct VUndeclStruct { bool w; uint32_t x; double y; bool z; };
+    m.def("create_undeclstruct", [m] () mutable {
+        py::bind_vector<std::vector<VUndeclStruct>>(m, "VectorUndeclStruct", py::buffer_protocol());
+    });
+
+    // The rest depends on numpy:
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
+
+    // test_vector_buffer_numpy
+    struct VStruct { bool w; uint32_t x; double y; bool z; };
+    PYBIND11_NUMPY_DTYPE(VStruct, w, x, y, z);
+    py::class_<VStruct>(m, "VStruct").def_readwrite("x", &VStruct::x);
+    py::bind_vector<std::vector<VStruct>>(m, "VectorStruct", py::buffer_protocol());
+    m.def("get_vectorstruct", [] {return std::vector<VStruct> {{0, 5, 3.0, 1}, {1, 30, -1e4, 0}};});
+}
diff --git a/pybind11/tests/test_stl_binders.py b/pybind11/tests/test_stl_binders.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b8ea4af2a3156ecb09dd9d61d3464bb85ceefb
--- /dev/null
+++ b/pybind11/tests/test_stl_binders.py
@@ -0,0 +1,285 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import stl_binders as m
+
+
+def test_vector_int():
+    v_int = m.VectorInt([0, 0])
+    assert len(v_int) == 2
+    assert bool(v_int) is True
+
+    # test construction from a generator
+    v_int1 = m.VectorInt(x for x in range(5))
+    assert v_int1 == m.VectorInt([0, 1, 2, 3, 4])
+
+    v_int2 = m.VectorInt([0, 0])
+    assert v_int == v_int2
+    v_int2[1] = 1
+    assert v_int != v_int2
+
+    v_int2.append(2)
+    v_int2.insert(0, 1)
+    v_int2.insert(0, 2)
+    v_int2.insert(0, 3)
+    v_int2.insert(6, 3)
+    assert str(v_int2) == "VectorInt[3, 2, 1, 0, 1, 2, 3]"
+    with pytest.raises(IndexError):
+        v_int2.insert(8, 4)
+
+    v_int.append(99)
+    v_int2[2:-2] = v_int
+    assert v_int2 == m.VectorInt([3, 2, 0, 0, 99, 2, 3])
+    del v_int2[1:3]
+    assert v_int2 == m.VectorInt([3, 0, 99, 2, 3])
+    del v_int2[0]
+    assert v_int2 == m.VectorInt([0, 99, 2, 3])
+
+    v_int2.extend(m.VectorInt([4, 5]))
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5])
+
+    v_int2.extend([6, 7])
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7])
+
+    # test error handling, and that the vector is unchanged
+    with pytest.raises(RuntimeError):
+        v_int2.extend([8, 'a'])
+
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7])
+
+    # test extending from a generator
+    v_int2.extend(x for x in range(5))
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4])
+
+    # test negative indexing
+    assert v_int2[-1] == 4
+
+    # insert with negative index
+    v_int2.insert(-1, 88)
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 88, 4])
+
+    # delete negative index
+    del v_int2[-1]
+    assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 88])
+
+    v_int2.clear()
+    assert len(v_int2) == 0
+
+
+# Older PyPy's failed here, related to the PyPy's buffer protocol.
+def test_vector_buffer():
+    b = bytearray([1, 2, 3, 4])
+    v = m.VectorUChar(b)
+    assert v[1] == 2
+    v[2] = 5
+    mv = memoryview(v)  # We expose the buffer interface
+    if not env.PY2:
+        assert mv[2] == 5
+        mv[2] = 6
+    else:
+        assert mv[2] == '\x05'
+        mv[2] = '\x06'
+    assert v[2] == 6
+
+    if not env.PY2:
+        mv = memoryview(b)
+        v = m.VectorUChar(mv[::2])
+        assert v[1] == 3
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.create_undeclstruct()  # Undeclared struct contents, no buffer interface
+    assert "NumPy type info missing for " in str(excinfo.value)
+
+
+def test_vector_buffer_numpy():
+    np = pytest.importorskip("numpy")
+    a = np.array([1, 2, 3, 4], dtype=np.int32)
+    with pytest.raises(TypeError):
+        m.VectorInt(a)
+
+    a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.uintc)
+    v = m.VectorInt(a[0, :])
+    assert len(v) == 4
+    assert v[2] == 3
+    ma = np.asarray(v)
+    ma[2] = 5
+    assert v[2] == 5
+
+    v = m.VectorInt(a[:, 1])
+    assert len(v) == 3
+    assert v[2] == 10
+
+    v = m.get_vectorstruct()
+    assert v[0].x == 5
+    ma = np.asarray(v)
+    ma[1]['x'] = 99
+    assert v[1].x == 99
+
+    v = m.VectorStruct(np.zeros(3, dtype=np.dtype([('w', 'bool'), ('x', 'I'),
+                                                   ('y', 'float64'), ('z', 'bool')], align=True)))
+    assert len(v) == 3
+
+    b = np.array([1, 2, 3, 4], dtype=np.uint8)
+    v = m.VectorUChar(b[::2])
+    assert v[1] == 3
+
+
+def test_vector_bool():
+    import pybind11_cross_module_tests as cm
+
+    vv_c = cm.VectorBool()
+    for i in range(10):
+        vv_c.append(i % 2 == 0)
+    for i in range(10):
+        assert vv_c[i] == (i % 2 == 0)
+    assert str(vv_c) == "VectorBool[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]"
+
+
+def test_vector_custom():
+    v_a = m.VectorEl()
+    v_a.append(m.El(1))
+    v_a.append(m.El(2))
+    assert str(v_a) == "VectorEl[El{1}, El{2}]"
+
+    vv_a = m.VectorVectorEl()
+    vv_a.append(v_a)
+    vv_b = vv_a[0]
+    assert str(vv_b) == "VectorEl[El{1}, El{2}]"
+
+
+def test_map_string_double():
+    mm = m.MapStringDouble()
+    mm['a'] = 1
+    mm['b'] = 2.5
+
+    assert list(mm) == ['a', 'b']
+    assert list(mm.items()) == [('a', 1), ('b', 2.5)]
+    assert str(mm) == "MapStringDouble{a: 1, b: 2.5}"
+
+    um = m.UnorderedMapStringDouble()
+    um['ua'] = 1.1
+    um['ub'] = 2.6
+
+    assert sorted(list(um)) == ['ua', 'ub']
+    assert sorted(list(um.items())) == [('ua', 1.1), ('ub', 2.6)]
+    assert "UnorderedMapStringDouble" in str(um)
+
+
+def test_map_string_double_const():
+    mc = m.MapStringDoubleConst()
+    mc['a'] = 10
+    mc['b'] = 20.5
+    assert str(mc) == "MapStringDoubleConst{a: 10, b: 20.5}"
+
+    umc = m.UnorderedMapStringDoubleConst()
+    umc['a'] = 11
+    umc['b'] = 21.5
+
+    str(umc)
+
+
+def test_noncopyable_containers():
+    # std::vector
+    vnc = m.get_vnc(5)
+    for i in range(0, 5):
+        assert vnc[i].value == i + 1
+
+    for i, j in enumerate(vnc, start=1):
+        assert j.value == i
+
+    # std::deque
+    dnc = m.get_dnc(5)
+    for i in range(0, 5):
+        assert dnc[i].value == i + 1
+
+    i = 1
+    for j in dnc:
+        assert(j.value == i)
+        i += 1
+
+    # std::map
+    mnc = m.get_mnc(5)
+    for i in range(1, 6):
+        assert mnc[i].value == 10 * i
+
+    vsum = 0
+    for k, v in mnc.items():
+        assert v.value == 10 * k
+        vsum += v.value
+
+    assert vsum == 150
+
+    # std::unordered_map
+    mnc = m.get_umnc(5)
+    for i in range(1, 6):
+        assert mnc[i].value == 10 * i
+
+    vsum = 0
+    for k, v in mnc.items():
+        assert v.value == 10 * k
+        vsum += v.value
+
+    assert vsum == 150
+
+    # nested std::map<std::vector>
+    nvnc = m.get_nvnc(5)
+    for i in range(1, 6):
+        for j in range(0, 5):
+            assert nvnc[i][j].value == j + 1
+
+    # Note: maps do not have .values()
+    for _, v in nvnc.items():
+        for i, j in enumerate(v, start=1):
+            assert j.value == i
+
+    # nested std::map<std::map>
+    nmnc = m.get_nmnc(5)
+    for i in range(1, 6):
+        for j in range(10, 60, 10):
+            assert nmnc[i][j].value == 10 * j
+
+    vsum = 0
+    for _, v_o in nmnc.items():
+        for k_i, v_i in v_o.items():
+            assert v_i.value == 10 * k_i
+            vsum += v_i.value
+
+    assert vsum == 7500
+
+    # nested std::unordered_map<std::unordered_map>
+    numnc = m.get_numnc(5)
+    for i in range(1, 6):
+        for j in range(10, 60, 10):
+            assert numnc[i][j].value == 10 * j
+
+    vsum = 0
+    for _, v_o in numnc.items():
+        for k_i, v_i in v_o.items():
+            assert v_i.value == 10 * k_i
+            vsum += v_i.value
+
+    assert vsum == 7500
+
+
+def test_map_delitem():
+    mm = m.MapStringDouble()
+    mm['a'] = 1
+    mm['b'] = 2.5
+
+    assert list(mm) == ['a', 'b']
+    assert list(mm.items()) == [('a', 1), ('b', 2.5)]
+    del mm['a']
+    assert list(mm) == ['b']
+    assert list(mm.items()) == [('b', 2.5)]
+
+    um = m.UnorderedMapStringDouble()
+    um['ua'] = 1.1
+    um['ub'] = 2.6
+
+    assert sorted(list(um)) == ['ua', 'ub']
+    assert sorted(list(um.items())) == [('ua', 1.1), ('ub', 2.6)]
+    del um['ua']
+    assert sorted(list(um)) == ['ub']
+    assert sorted(list(um.items())) == [('ub', 2.6)]
diff --git a/pybind11/tests/test_tagbased_polymorphic.cpp b/pybind11/tests/test_tagbased_polymorphic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dcc005126eed4ae13f69dedcb1fe04dce1a4c22f
--- /dev/null
+++ b/pybind11/tests/test_tagbased_polymorphic.cpp
@@ -0,0 +1,142 @@
+/*
+    tests/test_tagbased_polymorphic.cpp -- test of polymorphic_type_hook
+
+    Copyright (c) 2018 Hudson River Trading LLC <opensource@hudson-trading.com>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/stl.h>
+
+struct Animal
+{
+    // Make this type also a "standard" polymorphic type, to confirm that
+    // specializing polymorphic_type_hook using enable_if_t still works
+    // (https://github.com/pybind/pybind11/pull/2016/).
+    virtual ~Animal() = default;
+
+    // Enum for tag-based polymorphism.
+    enum class Kind {
+        Unknown = 0,
+        Dog = 100, Labrador, Chihuahua, LastDog = 199,
+        Cat = 200, Panther, LastCat = 299
+    };
+    static const std::type_info* type_of_kind(Kind kind);
+    static std::string name_of_kind(Kind kind);
+
+    const Kind kind;
+    const std::string name;
+
+  protected:
+    Animal(const std::string& _name, Kind _kind)
+        : kind(_kind), name(_name)
+    {}
+};
+
+struct Dog : Animal
+{
+    Dog(const std::string& _name, Kind _kind = Kind::Dog) : Animal(_name, _kind) {}
+    std::string bark() const { return name_of_kind(kind) + " " + name + " goes " + sound; }
+    std::string sound = "WOOF!";
+};
+
+struct Labrador : Dog
+{
+    Labrador(const std::string& _name, int _excitement = 9001)
+        : Dog(_name, Kind::Labrador), excitement(_excitement) {}
+    int excitement;
+};
+
+struct Chihuahua : Dog
+{
+    Chihuahua(const std::string& _name) : Dog(_name, Kind::Chihuahua) { sound = "iyiyiyiyiyi"; }
+    std::string bark() const { return Dog::bark() + " and runs in circles"; }
+};
+
+struct Cat : Animal
+{
+    Cat(const std::string& _name, Kind _kind = Kind::Cat) : Animal(_name, _kind) {}
+    std::string purr() const { return "mrowr"; }
+};
+
+struct Panther : Cat
+{
+    Panther(const std::string& _name) : Cat(_name, Kind::Panther) {}
+    std::string purr() const { return "mrrrRRRRRR"; }
+};
+
+std::vector<std::unique_ptr<Animal>> create_zoo()
+{
+    std::vector<std::unique_ptr<Animal>> ret;
+    ret.emplace_back(new Labrador("Fido", 15000));
+
+    // simulate some new type of Dog that the Python bindings
+    // haven't been updated for; it should still be considered
+    // a Dog, not just an Animal.
+    ret.emplace_back(new Dog("Ginger", Dog::Kind(150)));
+
+    ret.emplace_back(new Chihuahua("Hertzl"));
+    ret.emplace_back(new Cat("Tiger", Cat::Kind::Cat));
+    ret.emplace_back(new Panther("Leo"));
+    return ret;
+}
+
+const std::type_info* Animal::type_of_kind(Kind kind)
+{
+    switch (kind) {
+        case Kind::Unknown: break;
+
+        case Kind::Dog: break;
+        case Kind::Labrador: return &typeid(Labrador);
+        case Kind::Chihuahua: return &typeid(Chihuahua);
+        case Kind::LastDog: break;
+
+        case Kind::Cat: break;
+        case Kind::Panther: return &typeid(Panther);
+        case Kind::LastCat: break;
+    }
+
+    if (kind >= Kind::Dog && kind <= Kind::LastDog) return &typeid(Dog);
+    if (kind >= Kind::Cat && kind <= Kind::LastCat) return &typeid(Cat);
+    return nullptr;
+}
+
+std::string Animal::name_of_kind(Kind kind)
+{
+    std::string raw_name = type_of_kind(kind)->name();
+    py::detail::clean_type_id(raw_name);
+    return raw_name;
+}
+
+namespace pybind11 {
+    template <typename itype>
+    struct polymorphic_type_hook<itype, detail::enable_if_t<std::is_base_of<Animal, itype>::value>>
+    {
+        static const void *get(const itype *src, const std::type_info*& type)
+        { type = src ? Animal::type_of_kind(src->kind) : nullptr; return src; }
+    };
+}
+
+TEST_SUBMODULE(tagbased_polymorphic, m) {
+    py::class_<Animal>(m, "Animal")
+        .def_readonly("name", &Animal::name);
+    py::class_<Dog, Animal>(m, "Dog")
+        .def(py::init<std::string>())
+        .def_readwrite("sound", &Dog::sound)
+        .def("bark", &Dog::bark);
+    py::class_<Labrador, Dog>(m, "Labrador")
+        .def(py::init<std::string, int>(), "name"_a, "excitement"_a = 9001)
+        .def_readwrite("excitement", &Labrador::excitement);
+    py::class_<Chihuahua, Dog>(m, "Chihuahua")
+        .def(py::init<std::string>())
+        .def("bark", &Chihuahua::bark);
+    py::class_<Cat, Animal>(m, "Cat")
+        .def(py::init<std::string>())
+        .def("purr", &Cat::purr);
+    py::class_<Panther, Cat>(m, "Panther")
+        .def(py::init<std::string>())
+        .def("purr", &Panther::purr);
+    m.def("create_zoo", &create_zoo);
+};
diff --git a/pybind11/tests/test_tagbased_polymorphic.py b/pybind11/tests/test_tagbased_polymorphic.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f374da90f09be93f2885708fd4bcfcce54134e
--- /dev/null
+++ b/pybind11/tests/test_tagbased_polymorphic.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import tagbased_polymorphic as m
+
+
+def test_downcast():
+    zoo = m.create_zoo()
+    assert [type(animal) for animal in zoo] == [
+        m.Labrador, m.Dog, m.Chihuahua, m.Cat, m.Panther
+    ]
+    assert [animal.name for animal in zoo] == [
+        "Fido", "Ginger", "Hertzl", "Tiger", "Leo"
+    ]
+    zoo[1].sound = "woooooo"
+    assert [dog.bark() for dog in zoo[:3]] == [
+        "Labrador Fido goes WOOF!",
+        "Dog Ginger goes woooooo",
+        "Chihuahua Hertzl goes iyiyiyiyiyi and runs in circles"
+    ]
+    assert [cat.purr() for cat in zoo[3:]] == ["mrowr", "mrrrRRRRRR"]
+    zoo[0].excitement -= 1000
+    assert zoo[0].excitement == 14000
diff --git a/pybind11/tests/test_union.cpp b/pybind11/tests/test_union.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b98ea216ca0b272978134d9dd1d1eff1b804ad5
--- /dev/null
+++ b/pybind11/tests/test_union.cpp
@@ -0,0 +1,22 @@
+/*
+    tests/test_class.cpp -- test py::class_ definitions and basic functionality
+
+    Copyright (c) 2019 Roland Dreier <roland.dreier@gmail.com>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+TEST_SUBMODULE(union_, m) {
+    union TestUnion {
+        int value_int;
+        unsigned value_uint;
+    };
+
+    py::class_<TestUnion>(m, "TestUnion")
+        .def(py::init<>())
+        .def_readonly("as_int", &TestUnion::value_int)
+        .def_readwrite("as_uint", &TestUnion::value_uint);
+}
diff --git a/pybind11/tests/test_union.py b/pybind11/tests/test_union.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a2c12fb4836a0f2194e06af7781216d78ed0ba2
--- /dev/null
+++ b/pybind11/tests/test_union.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from pybind11_tests import union_ as m
+
+
+def test_union():
+    instance = m.TestUnion()
+
+    instance.as_uint = 10
+    assert instance.as_int == 10
diff --git a/pybind11/tests/test_virtual_functions.cpp b/pybind11/tests/test_virtual_functions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..583c1e647efc5701424aced4a5ba21c6e17cef9e
--- /dev/null
+++ b/pybind11/tests/test_virtual_functions.cpp
@@ -0,0 +1,498 @@
+/*
+    tests/test_virtual_functions.cpp -- overriding virtual functions from Python
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/functional.h>
+#include <thread>
+
+/* This is an example class that we'll want to be able to extend from Python */
+class ExampleVirt  {
+public:
+    ExampleVirt(int state) : state(state) { print_created(this, state); }
+    ExampleVirt(const ExampleVirt &e) : state(e.state) { print_copy_created(this); }
+    ExampleVirt(ExampleVirt &&e) : state(e.state) { print_move_created(this); e.state = 0; }
+    virtual ~ExampleVirt() { print_destroyed(this); }
+
+    virtual int run(int value) {
+        py::print("Original implementation of "
+                  "ExampleVirt::run(state={}, value={}, str1={}, str2={})"_s.format(state, value, get_string1(), *get_string2()));
+        return state + value;
+    }
+
+    virtual bool run_bool() = 0;
+    virtual void pure_virtual() = 0;
+
+    // Returning a reference/pointer to a type converted from python (numbers, strings, etc.) is a
+    // bit trickier, because the actual int& or std::string& or whatever only exists temporarily, so
+    // we have to handle it specially in the trampoline class (see below).
+    virtual const std::string &get_string1() { return str1; }
+    virtual const std::string *get_string2() { return &str2; }
+
+private:
+    int state;
+    const std::string str1{"default1"}, str2{"default2"};
+};
+
+/* This is a wrapper class that must be generated */
+class PyExampleVirt : public ExampleVirt {
+public:
+    using ExampleVirt::ExampleVirt; /* Inherit constructors */
+
+    int run(int value) override {
+        /* Generate wrapping code that enables native function overloading */
+        PYBIND11_OVERLOAD(
+            int,         /* Return type */
+            ExampleVirt, /* Parent class */
+            run,         /* Name of function */
+            value        /* Argument(s) */
+        );
+    }
+
+    bool run_bool() override {
+        PYBIND11_OVERLOAD_PURE(
+            bool,         /* Return type */
+            ExampleVirt,  /* Parent class */
+            run_bool,     /* Name of function */
+                          /* This function has no arguments. The trailing comma
+                             in the previous line is needed for some compilers */
+        );
+    }
+
+    void pure_virtual() override {
+        PYBIND11_OVERLOAD_PURE(
+            void,         /* Return type */
+            ExampleVirt,  /* Parent class */
+            pure_virtual, /* Name of function */
+                          /* This function has no arguments. The trailing comma
+                             in the previous line is needed for some compilers */
+        );
+    }
+
+    // We can return reference types for compatibility with C++ virtual interfaces that do so, but
+    // note they have some significant limitations (see the documentation).
+    const std::string &get_string1() override {
+        PYBIND11_OVERLOAD(
+            const std::string &, /* Return type */
+            ExampleVirt,         /* Parent class */
+            get_string1,         /* Name of function */
+                                 /* (no arguments) */
+        );
+    }
+
+    const std::string *get_string2() override {
+        PYBIND11_OVERLOAD(
+            const std::string *, /* Return type */
+            ExampleVirt,         /* Parent class */
+            get_string2,         /* Name of function */
+                                 /* (no arguments) */
+        );
+    }
+
+};
+
+class NonCopyable {
+public:
+    NonCopyable(int a, int b) : value{new int(a*b)} { print_created(this, a, b); }
+    NonCopyable(NonCopyable &&o) { value = std::move(o.value); print_move_created(this); }
+    NonCopyable(const NonCopyable &) = delete;
+    NonCopyable() = delete;
+    void operator=(const NonCopyable &) = delete;
+    void operator=(NonCopyable &&) = delete;
+    std::string get_value() const {
+        if (value) return std::to_string(*value); else return "(null)";
+    }
+    ~NonCopyable() { print_destroyed(this); }
+
+private:
+    std::unique_ptr<int> value;
+};
+
+// This is like the above, but is both copy and movable.  In effect this means it should get moved
+// when it is not referenced elsewhere, but copied if it is still referenced.
+class Movable {
+public:
+    Movable(int a, int b) : value{a+b} { print_created(this, a, b); }
+    Movable(const Movable &m) { value = m.value; print_copy_created(this); }
+    Movable(Movable &&m) { value = std::move(m.value); print_move_created(this); }
+    std::string get_value() const { return std::to_string(value); }
+    ~Movable() { print_destroyed(this); }
+private:
+    int value;
+};
+
+class NCVirt {
+public:
+    virtual ~NCVirt() { }
+    NCVirt() = default;
+    NCVirt(const NCVirt&) = delete;
+    virtual NonCopyable get_noncopyable(int a, int b) { return NonCopyable(a, b); }
+    virtual Movable get_movable(int a, int b) = 0;
+
+    std::string print_nc(int a, int b) { return get_noncopyable(a, b).get_value(); }
+    std::string print_movable(int a, int b) { return get_movable(a, b).get_value(); }
+};
+class NCVirtTrampoline : public NCVirt {
+#if !defined(__INTEL_COMPILER)
+    NonCopyable get_noncopyable(int a, int b) override {
+        PYBIND11_OVERLOAD(NonCopyable, NCVirt, get_noncopyable, a, b);
+    }
+#endif
+    Movable get_movable(int a, int b) override {
+        PYBIND11_OVERLOAD_PURE(Movable, NCVirt, get_movable, a, b);
+    }
+};
+
+struct Base {
+    /* for some reason MSVC2015 can't compile this if the function is pure virtual */
+    virtual std::string dispatch() const { return {}; };
+    virtual ~Base() = default;
+    Base() = default;
+    Base(const Base&) = delete;
+};
+
+struct DispatchIssue : Base {
+    virtual std::string dispatch() const {
+        PYBIND11_OVERLOAD_PURE(std::string, Base, dispatch, /* no arguments */);
+    }
+};
+
+static void test_gil() {
+    {
+        py::gil_scoped_acquire lock;
+        py::print("1st lock acquired");
+
+    }
+
+    {
+        py::gil_scoped_acquire lock;
+        py::print("2nd lock acquired");
+    }
+
+}
+
+static void test_gil_from_thread() {
+    py::gil_scoped_release release;
+
+    std::thread t(test_gil);
+    t.join();
+}
+
+
+// Forward declaration (so that we can put the main tests here; the inherited virtual approaches are
+// rather long).
+void initialize_inherited_virtuals(py::module &m);
+
+TEST_SUBMODULE(virtual_functions, m) {
+    // test_override
+    py::class_<ExampleVirt, PyExampleVirt>(m, "ExampleVirt")
+        .def(py::init<int>())
+        /* Reference original class in function definitions */
+        .def("run", &ExampleVirt::run)
+        .def("run_bool", &ExampleVirt::run_bool)
+        .def("pure_virtual", &ExampleVirt::pure_virtual);
+
+    py::class_<NonCopyable>(m, "NonCopyable")
+        .def(py::init<int, int>());
+
+    py::class_<Movable>(m, "Movable")
+        .def(py::init<int, int>());
+
+    // test_move_support
+#if !defined(__INTEL_COMPILER)
+    py::class_<NCVirt, NCVirtTrampoline>(m, "NCVirt")
+        .def(py::init<>())
+        .def("get_noncopyable", &NCVirt::get_noncopyable)
+        .def("get_movable", &NCVirt::get_movable)
+        .def("print_nc", &NCVirt::print_nc)
+        .def("print_movable", &NCVirt::print_movable);
+#endif
+
+    m.def("runExampleVirt", [](ExampleVirt *ex, int value) { return ex->run(value); });
+    m.def("runExampleVirtBool", [](ExampleVirt* ex) { return ex->run_bool(); });
+    m.def("runExampleVirtVirtual", [](ExampleVirt *ex) { ex->pure_virtual(); });
+
+    m.def("cstats_debug", &ConstructorStats::get<ExampleVirt>);
+    initialize_inherited_virtuals(m);
+
+    // test_alias_delay_initialization1
+    // don't invoke Python dispatch classes by default when instantiating C++ classes
+    // that were not extended on the Python side
+    struct A {
+        A() = default;
+        A(const A&) = delete;
+        virtual ~A() {}
+        virtual void f() { py::print("A.f()"); }
+    };
+
+    struct PyA : A {
+        PyA() { py::print("PyA.PyA()"); }
+        PyA(const PyA&) = delete;
+        ~PyA() { py::print("PyA.~PyA()"); }
+
+        void f() override {
+            py::print("PyA.f()");
+            // This convolution just gives a `void`, but tests that PYBIND11_TYPE() works to protect
+            // a type containing a ,
+            PYBIND11_OVERLOAD(PYBIND11_TYPE(typename std::enable_if<true, void>::type), A, f);
+        }
+    };
+
+    py::class_<A, PyA>(m, "A")
+        .def(py::init<>())
+        .def("f", &A::f);
+
+    m.def("call_f", [](A *a) { a->f(); });
+
+    // test_alias_delay_initialization2
+    // ... unless we explicitly request it, as in this example:
+    struct A2 {
+        A2() = default;
+        A2(const A2&) = delete;
+        virtual ~A2() {}
+        virtual void f() { py::print("A2.f()"); }
+    };
+
+    struct PyA2 : A2 {
+        PyA2() { py::print("PyA2.PyA2()"); }
+        PyA2(const PyA2&) = delete;
+        ~PyA2() { py::print("PyA2.~PyA2()"); }
+        void f() override {
+            py::print("PyA2.f()");
+            PYBIND11_OVERLOAD(void, A2, f);
+        }
+    };
+
+    py::class_<A2, PyA2>(m, "A2")
+        .def(py::init_alias<>())
+        .def(py::init([](int) { return new PyA2(); }))
+        .def("f", &A2::f);
+
+    m.def("call_f", [](A2 *a2) { a2->f(); });
+
+    // test_dispatch_issue
+    // #159: virtual function dispatch has problems with similar-named functions
+    py::class_<Base, DispatchIssue>(m, "DispatchIssue")
+        .def(py::init<>())
+        .def("dispatch", &Base::dispatch);
+
+    m.def("dispatch_issue_go", [](const Base * b) { return b->dispatch(); });
+
+    // test_override_ref
+    // #392/397: overriding reference-returning functions
+    class OverrideTest {
+    public:
+        struct A { std::string value = "hi"; };
+        std::string v;
+        A a;
+        explicit OverrideTest(const std::string &v) : v{v} {}
+        OverrideTest() = default;
+        OverrideTest(const OverrideTest&) = delete;
+        virtual std::string str_value() { return v; }
+        virtual std::string &str_ref() { return v; }
+        virtual A A_value() { return a; }
+        virtual A &A_ref() { return a; }
+        virtual ~OverrideTest() = default;
+    };
+
+    class PyOverrideTest : public OverrideTest {
+    public:
+        using OverrideTest::OverrideTest;
+        std::string str_value() override { PYBIND11_OVERLOAD(std::string, OverrideTest, str_value); }
+        // Not allowed (uncommenting should hit a static_assert failure): we can't get a reference
+        // to a python numeric value, since we only copy values in the numeric type caster:
+//      std::string &str_ref() override { PYBIND11_OVERLOAD(std::string &, OverrideTest, str_ref); }
+        // But we can work around it like this:
+    private:
+        std::string _tmp;
+        std::string str_ref_helper() { PYBIND11_OVERLOAD(std::string, OverrideTest, str_ref); }
+    public:
+        std::string &str_ref() override { return _tmp = str_ref_helper(); }
+
+        A A_value() override { PYBIND11_OVERLOAD(A, OverrideTest, A_value); }
+        A &A_ref() override { PYBIND11_OVERLOAD(A &, OverrideTest, A_ref); }
+    };
+
+    py::class_<OverrideTest::A>(m, "OverrideTest_A")
+        .def_readwrite("value", &OverrideTest::A::value);
+    py::class_<OverrideTest, PyOverrideTest>(m, "OverrideTest")
+        .def(py::init<const std::string &>())
+        .def("str_value", &OverrideTest::str_value)
+//      .def("str_ref", &OverrideTest::str_ref)
+        .def("A_value", &OverrideTest::A_value)
+        .def("A_ref", &OverrideTest::A_ref);
+}
+
+
+// Inheriting virtual methods.  We do two versions here: the repeat-everything version and the
+// templated trampoline versions mentioned in docs/advanced.rst.
+//
+// These base classes are exactly the same, but we technically need distinct
+// classes for this example code because we need to be able to bind them
+// properly (pybind11, sensibly, doesn't allow us to bind the same C++ class to
+// multiple python classes).
+class A_Repeat {
+#define A_METHODS \
+public: \
+    virtual int unlucky_number() = 0; \
+    virtual std::string say_something(unsigned times) { \
+        std::string s = ""; \
+        for (unsigned i = 0; i < times; ++i) \
+            s += "hi"; \
+        return s; \
+    } \
+    std::string say_everything() { \
+        return say_something(1) + " " + std::to_string(unlucky_number()); \
+    }
+A_METHODS
+    A_Repeat() = default;
+    A_Repeat(const A_Repeat&) = delete;
+    virtual ~A_Repeat() = default;
+};
+class B_Repeat : public A_Repeat {
+#define B_METHODS \
+public: \
+    int unlucky_number() override { return 13; } \
+    std::string say_something(unsigned times) override { \
+        return "B says hi " + std::to_string(times) + " times"; \
+    } \
+    virtual double lucky_number() { return 7.0; }
+B_METHODS
+};
+class C_Repeat : public B_Repeat {
+#define C_METHODS \
+public: \
+    int unlucky_number() override { return 4444; } \
+    double lucky_number() override { return 888; }
+C_METHODS
+};
+class D_Repeat : public C_Repeat {
+#define D_METHODS // Nothing overridden.
+D_METHODS
+};
+
+// Base classes for templated inheritance trampolines.  Identical to the repeat-everything version:
+class A_Tpl {
+    A_METHODS;
+    A_Tpl() = default;
+    A_Tpl(const A_Tpl&) = delete;
+    virtual ~A_Tpl() = default;
+};
+class B_Tpl : public A_Tpl { B_METHODS };
+class C_Tpl : public B_Tpl { C_METHODS };
+class D_Tpl : public C_Tpl { D_METHODS };
+
+
+// Inheritance approach 1: each trampoline gets every virtual method (11 in total)
+class PyA_Repeat : public A_Repeat {
+public:
+    using A_Repeat::A_Repeat;
+    int unlucky_number() override { PYBIND11_OVERLOAD_PURE(int, A_Repeat, unlucky_number, ); }
+    std::string say_something(unsigned times) override { PYBIND11_OVERLOAD(std::string, A_Repeat, say_something, times); }
+};
+class PyB_Repeat : public B_Repeat {
+public:
+    using B_Repeat::B_Repeat;
+    int unlucky_number() override { PYBIND11_OVERLOAD(int, B_Repeat, unlucky_number, ); }
+    std::string say_something(unsigned times) override { PYBIND11_OVERLOAD(std::string, B_Repeat, say_something, times); }
+    double lucky_number() override { PYBIND11_OVERLOAD(double, B_Repeat, lucky_number, ); }
+};
+class PyC_Repeat : public C_Repeat {
+public:
+    using C_Repeat::C_Repeat;
+    int unlucky_number() override { PYBIND11_OVERLOAD(int, C_Repeat, unlucky_number, ); }
+    std::string say_something(unsigned times) override { PYBIND11_OVERLOAD(std::string, C_Repeat, say_something, times); }
+    double lucky_number() override { PYBIND11_OVERLOAD(double, C_Repeat, lucky_number, ); }
+};
+class PyD_Repeat : public D_Repeat {
+public:
+    using D_Repeat::D_Repeat;
+    int unlucky_number() override { PYBIND11_OVERLOAD(int, D_Repeat, unlucky_number, ); }
+    std::string say_something(unsigned times) override { PYBIND11_OVERLOAD(std::string, D_Repeat, say_something, times); }
+    double lucky_number() override { PYBIND11_OVERLOAD(double, D_Repeat, lucky_number, ); }
+};
+
+// Inheritance approach 2: templated trampoline classes.
+//
+// Advantages:
+// - we have only 2 (template) class and 4 method declarations (one per virtual method, plus one for
+//   any override of a pure virtual method), versus 4 classes and 6 methods (MI) or 4 classes and 11
+//   methods (repeat).
+// - Compared to MI, we also don't have to change the non-trampoline inheritance to virtual, and can
+//   properly inherit constructors.
+//
+// Disadvantage:
+// - the compiler must still generate and compile 14 different methods (more, even, than the 11
+//   required for the repeat approach) instead of the 6 required for MI.  (If there was no pure
+//   method (or no pure method override), the number would drop down to the same 11 as the repeat
+//   approach).
+template <class Base = A_Tpl>
+class PyA_Tpl : public Base {
+public:
+    using Base::Base; // Inherit constructors
+    int unlucky_number() override { PYBIND11_OVERLOAD_PURE(int, Base, unlucky_number, ); }
+    std::string say_something(unsigned times) override { PYBIND11_OVERLOAD(std::string, Base, say_something, times); }
+};
+template <class Base = B_Tpl>
+class PyB_Tpl : public PyA_Tpl<Base> {
+public:
+    using PyA_Tpl<Base>::PyA_Tpl; // Inherit constructors (via PyA_Tpl's inherited constructors)
+    int unlucky_number() override { PYBIND11_OVERLOAD(int, Base, unlucky_number, ); }
+    double lucky_number() override { PYBIND11_OVERLOAD(double, Base, lucky_number, ); }
+};
+// Since C_Tpl and D_Tpl don't declare any new virtual methods, we don't actually need these (we can
+// use PyB_Tpl<C_Tpl> and PyB_Tpl<D_Tpl> for the trampoline classes instead):
+/*
+template <class Base = C_Tpl> class PyC_Tpl : public PyB_Tpl<Base> {
+public:
+    using PyB_Tpl<Base>::PyB_Tpl;
+};
+template <class Base = D_Tpl> class PyD_Tpl : public PyC_Tpl<Base> {
+public:
+    using PyC_Tpl<Base>::PyC_Tpl;
+};
+*/
+
+void initialize_inherited_virtuals(py::module &m) {
+    // test_inherited_virtuals
+
+    // Method 1: repeat
+    py::class_<A_Repeat, PyA_Repeat>(m, "A_Repeat")
+        .def(py::init<>())
+        .def("unlucky_number", &A_Repeat::unlucky_number)
+        .def("say_something", &A_Repeat::say_something)
+        .def("say_everything", &A_Repeat::say_everything);
+    py::class_<B_Repeat, A_Repeat, PyB_Repeat>(m, "B_Repeat")
+        .def(py::init<>())
+        .def("lucky_number", &B_Repeat::lucky_number);
+    py::class_<C_Repeat, B_Repeat, PyC_Repeat>(m, "C_Repeat")
+        .def(py::init<>());
+    py::class_<D_Repeat, C_Repeat, PyD_Repeat>(m, "D_Repeat")
+        .def(py::init<>());
+
+    // test_
+    // Method 2: Templated trampolines
+    py::class_<A_Tpl, PyA_Tpl<>>(m, "A_Tpl")
+        .def(py::init<>())
+        .def("unlucky_number", &A_Tpl::unlucky_number)
+        .def("say_something", &A_Tpl::say_something)
+        .def("say_everything", &A_Tpl::say_everything);
+    py::class_<B_Tpl, A_Tpl, PyB_Tpl<>>(m, "B_Tpl")
+        .def(py::init<>())
+        .def("lucky_number", &B_Tpl::lucky_number);
+    py::class_<C_Tpl, B_Tpl, PyB_Tpl<C_Tpl>>(m, "C_Tpl")
+        .def(py::init<>());
+    py::class_<D_Tpl, C_Tpl, PyB_Tpl<D_Tpl>>(m, "D_Tpl")
+        .def(py::init<>());
+
+
+    // Fix issue #1454 (crash when acquiring/releasing GIL on another thread in Python 2.7)
+    m.def("test_gil", &test_gil);
+    m.def("test_gil_from_thread", &test_gil_from_thread);
+};
diff --git a/pybind11/tests/test_virtual_functions.py b/pybind11/tests/test_virtual_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7bd5badf0223812070e1f273cf8f9c4dd18db9c
--- /dev/null
+++ b/pybind11/tests/test_virtual_functions.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import env  # noqa: F401
+
+from pybind11_tests import virtual_functions as m
+from pybind11_tests import ConstructorStats
+
+
+def test_override(capture, msg):
+    class ExtendedExampleVirt(m.ExampleVirt):
+        def __init__(self, state):
+            super(ExtendedExampleVirt, self).__init__(state + 1)
+            self.data = "Hello world"
+
+        def run(self, value):
+            print('ExtendedExampleVirt::run(%i), calling parent..' % value)
+            return super(ExtendedExampleVirt, self).run(value + 1)
+
+        def run_bool(self):
+            print('ExtendedExampleVirt::run_bool()')
+            return False
+
+        def get_string1(self):
+            return "override1"
+
+        def pure_virtual(self):
+            print('ExtendedExampleVirt::pure_virtual(): %s' % self.data)
+
+    class ExtendedExampleVirt2(ExtendedExampleVirt):
+        def __init__(self, state):
+            super(ExtendedExampleVirt2, self).__init__(state + 1)
+
+        def get_string2(self):
+            return "override2"
+
+    ex12 = m.ExampleVirt(10)
+    with capture:
+        assert m.runExampleVirt(ex12, 20) == 30
+    assert capture == """
+        Original implementation of ExampleVirt::run(state=10, value=20, str1=default1, str2=default2)
+    """  # noqa: E501 line too long
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.runExampleVirtVirtual(ex12)
+    assert msg(excinfo.value) == 'Tried to call pure virtual function "ExampleVirt::pure_virtual"'
+
+    ex12p = ExtendedExampleVirt(10)
+    with capture:
+        assert m.runExampleVirt(ex12p, 20) == 32
+    assert capture == """
+        ExtendedExampleVirt::run(20), calling parent..
+        Original implementation of ExampleVirt::run(state=11, value=21, str1=override1, str2=default2)
+    """  # noqa: E501 line too long
+    with capture:
+        assert m.runExampleVirtBool(ex12p) is False
+    assert capture == "ExtendedExampleVirt::run_bool()"
+    with capture:
+        m.runExampleVirtVirtual(ex12p)
+    assert capture == "ExtendedExampleVirt::pure_virtual(): Hello world"
+
+    ex12p2 = ExtendedExampleVirt2(15)
+    with capture:
+        assert m.runExampleVirt(ex12p2, 50) == 68
+    assert capture == """
+        ExtendedExampleVirt::run(50), calling parent..
+        Original implementation of ExampleVirt::run(state=17, value=51, str1=override1, str2=override2)
+    """  # noqa: E501 line too long
+
+    cstats = ConstructorStats.get(m.ExampleVirt)
+    assert cstats.alive() == 3
+    del ex12, ex12p, ex12p2
+    assert cstats.alive() == 0
+    assert cstats.values() == ['10', '11', '17']
+    assert cstats.copy_constructions == 0
+    assert cstats.move_constructions >= 0
+
+
+def test_alias_delay_initialization1(capture):
+    """`A` only initializes its trampoline class when we inherit from it
+
+    If we just create and use an A instance directly, the trampoline initialization is
+    bypassed and we only initialize an A() instead (for performance reasons).
+    """
+    class B(m.A):
+        def __init__(self):
+            super(B, self).__init__()
+
+        def f(self):
+            print("In python f()")
+
+    # C++ version
+    with capture:
+        a = m.A()
+        m.call_f(a)
+        del a
+        pytest.gc_collect()
+    assert capture == "A.f()"
+
+    # Python version
+    with capture:
+        b = B()
+        m.call_f(b)
+        del b
+        pytest.gc_collect()
+    assert capture == """
+        PyA.PyA()
+        PyA.f()
+        In python f()
+        PyA.~PyA()
+    """
+
+
+def test_alias_delay_initialization2(capture):
+    """`A2`, unlike the above, is configured to always initialize the alias
+
+    While the extra initialization and extra class layer has small virtual dispatch
+    performance penalty, it also allows us to do more things with the trampoline
+    class such as defining local variables and performing construction/destruction.
+    """
+    class B2(m.A2):
+        def __init__(self):
+            super(B2, self).__init__()
+
+        def f(self):
+            print("In python B2.f()")
+
+    # No python subclass version
+    with capture:
+        a2 = m.A2()
+        m.call_f(a2)
+        del a2
+        pytest.gc_collect()
+        a3 = m.A2(1)
+        m.call_f(a3)
+        del a3
+        pytest.gc_collect()
+    assert capture == """
+        PyA2.PyA2()
+        PyA2.f()
+        A2.f()
+        PyA2.~PyA2()
+        PyA2.PyA2()
+        PyA2.f()
+        A2.f()
+        PyA2.~PyA2()
+    """
+
+    # Python subclass version
+    with capture:
+        b2 = B2()
+        m.call_f(b2)
+        del b2
+        pytest.gc_collect()
+    assert capture == """
+        PyA2.PyA2()
+        PyA2.f()
+        In python B2.f()
+        PyA2.~PyA2()
+    """
+
+
+# PyPy: Reference count > 1 causes call with noncopyable instance
+# to fail in ncv1.print_nc()
+@pytest.mark.xfail("env.PYPY")
+@pytest.mark.skipif(not hasattr(m, "NCVirt"), reason="NCVirt test broken on ICPC")
+def test_move_support():
+    class NCVirtExt(m.NCVirt):
+        def get_noncopyable(self, a, b):
+            # Constructs and returns a new instance:
+            nc = m.NonCopyable(a * a, b * b)
+            return nc
+
+        def get_movable(self, a, b):
+            # Return a referenced copy
+            self.movable = m.Movable(a, b)
+            return self.movable
+
+    class NCVirtExt2(m.NCVirt):
+        def get_noncopyable(self, a, b):
+            # Keep a reference: this is going to throw an exception
+            self.nc = m.NonCopyable(a, b)
+            return self.nc
+
+        def get_movable(self, a, b):
+            # Return a new instance without storing it
+            return m.Movable(a, b)
+
+    ncv1 = NCVirtExt()
+    assert ncv1.print_nc(2, 3) == "36"
+    assert ncv1.print_movable(4, 5) == "9"
+    ncv2 = NCVirtExt2()
+    assert ncv2.print_movable(7, 7) == "14"
+    # Don't check the exception message here because it differs under debug/non-debug mode
+    with pytest.raises(RuntimeError):
+        ncv2.print_nc(9, 9)
+
+    nc_stats = ConstructorStats.get(m.NonCopyable)
+    mv_stats = ConstructorStats.get(m.Movable)
+    assert nc_stats.alive() == 1
+    assert mv_stats.alive() == 1
+    del ncv1, ncv2
+    assert nc_stats.alive() == 0
+    assert mv_stats.alive() == 0
+    assert nc_stats.values() == ['4', '9', '9', '9']
+    assert mv_stats.values() == ['4', '5', '7', '7']
+    assert nc_stats.copy_constructions == 0
+    assert mv_stats.copy_constructions == 1
+    assert nc_stats.move_constructions >= 0
+    assert mv_stats.move_constructions >= 0
+
+
+def test_dispatch_issue(msg):
+    """#159: virtual function dispatch has problems with similar-named functions"""
+    class PyClass1(m.DispatchIssue):
+        def dispatch(self):
+            return "Yay.."
+
+    class PyClass2(m.DispatchIssue):
+        def dispatch(self):
+            with pytest.raises(RuntimeError) as excinfo:
+                super(PyClass2, self).dispatch()
+            assert msg(excinfo.value) == 'Tried to call pure virtual function "Base::dispatch"'
+
+            p = PyClass1()
+            return m.dispatch_issue_go(p)
+
+    b = PyClass2()
+    assert m.dispatch_issue_go(b) == "Yay.."
+
+
+def test_override_ref():
+    """#392/397: overriding reference-returning functions"""
+    o = m.OverrideTest("asdf")
+
+    # Not allowed (see associated .cpp comment)
+    # i = o.str_ref()
+    # assert o.str_ref() == "asdf"
+    assert o.str_value() == "asdf"
+
+    assert o.A_value().value == "hi"
+    a = o.A_ref()
+    assert a.value == "hi"
+    a.value = "bye"
+    assert a.value == "bye"
+
+
+def test_inherited_virtuals():
+    class AR(m.A_Repeat):
+        def unlucky_number(self):
+            return 99
+
+    class AT(m.A_Tpl):
+        def unlucky_number(self):
+            return 999
+
+    obj = AR()
+    assert obj.say_something(3) == "hihihi"
+    assert obj.unlucky_number() == 99
+    assert obj.say_everything() == "hi 99"
+
+    obj = AT()
+    assert obj.say_something(3) == "hihihi"
+    assert obj.unlucky_number() == 999
+    assert obj.say_everything() == "hi 999"
+
+    for obj in [m.B_Repeat(), m.B_Tpl()]:
+        assert obj.say_something(3) == "B says hi 3 times"
+        assert obj.unlucky_number() == 13
+        assert obj.lucky_number() == 7.0
+        assert obj.say_everything() == "B says hi 1 times 13"
+
+    for obj in [m.C_Repeat(), m.C_Tpl()]:
+        assert obj.say_something(3) == "B says hi 3 times"
+        assert obj.unlucky_number() == 4444
+        assert obj.lucky_number() == 888.0
+        assert obj.say_everything() == "B says hi 1 times 4444"
+
+    class CR(m.C_Repeat):
+        def lucky_number(self):
+            return m.C_Repeat.lucky_number(self) + 1.25
+
+    obj = CR()
+    assert obj.say_something(3) == "B says hi 3 times"
+    assert obj.unlucky_number() == 4444
+    assert obj.lucky_number() == 889.25
+    assert obj.say_everything() == "B says hi 1 times 4444"
+
+    class CT(m.C_Tpl):
+        pass
+
+    obj = CT()
+    assert obj.say_something(3) == "B says hi 3 times"
+    assert obj.unlucky_number() == 4444
+    assert obj.lucky_number() == 888.0
+    assert obj.say_everything() == "B says hi 1 times 4444"
+
+    class CCR(CR):
+        def lucky_number(self):
+            return CR.lucky_number(self) * 10
+
+    obj = CCR()
+    assert obj.say_something(3) == "B says hi 3 times"
+    assert obj.unlucky_number() == 4444
+    assert obj.lucky_number() == 8892.5
+    assert obj.say_everything() == "B says hi 1 times 4444"
+
+    class CCT(CT):
+        def lucky_number(self):
+            return CT.lucky_number(self) * 1000
+
+    obj = CCT()
+    assert obj.say_something(3) == "B says hi 3 times"
+    assert obj.unlucky_number() == 4444
+    assert obj.lucky_number() == 888000.0
+    assert obj.say_everything() == "B says hi 1 times 4444"
+
+    class DR(m.D_Repeat):
+        def unlucky_number(self):
+            return 123
+
+        def lucky_number(self):
+            return 42.0
+
+    for obj in [m.D_Repeat(), m.D_Tpl()]:
+        assert obj.say_something(3) == "B says hi 3 times"
+        assert obj.unlucky_number() == 4444
+        assert obj.lucky_number() == 888.0
+        assert obj.say_everything() == "B says hi 1 times 4444"
+
+    obj = DR()
+    assert obj.say_something(3) == "B says hi 3 times"
+    assert obj.unlucky_number() == 123
+    assert obj.lucky_number() == 42.0
+    assert obj.say_everything() == "B says hi 1 times 123"
+
+    class DT(m.D_Tpl):
+        def say_something(self, times):
+            return "DT says:" + (' quack' * times)
+
+        def unlucky_number(self):
+            return 1234
+
+        def lucky_number(self):
+            return -4.25
+
+    obj = DT()
+    assert obj.say_something(3) == "DT says: quack quack quack"
+    assert obj.unlucky_number() == 1234
+    assert obj.lucky_number() == -4.25
+    assert obj.say_everything() == "DT says: quack 1234"
+
+    class DT2(DT):
+        def say_something(self, times):
+            return "DT2: " + ('QUACK' * times)
+
+        def unlucky_number(self):
+            return -3
+
+    class BT(m.B_Tpl):
+        def say_something(self, times):
+            return "BT" * times
+
+        def unlucky_number(self):
+            return -7
+
+        def lucky_number(self):
+            return -1.375
+
+    obj = BT()
+    assert obj.say_something(3) == "BTBTBT"
+    assert obj.unlucky_number() == -7
+    assert obj.lucky_number() == -1.375
+    assert obj.say_everything() == "BT -7"
+
+
+def test_issue_1454():
+    # Fix issue #1454 (crash when acquiring/releasing GIL on another thread in Python 2.7)
+    m.test_gil()
+    m.test_gil_from_thread()
diff --git a/pybind11/tools/FindCatch.cmake b/pybind11/tools/FindCatch.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..4d6bffcf6810c5caa2ed424392f3c45216d1356b
--- /dev/null
+++ b/pybind11/tools/FindCatch.cmake
@@ -0,0 +1,70 @@
+# - Find the Catch test framework or download it (single header)
+#
+# This is a quick module for internal use. It assumes that Catch is
+# REQUIRED and that a minimum version is provided (not EXACT). If
+# a suitable version isn't found locally, the single header file
+# will be downloaded and placed in the build dir: PROJECT_BINARY_DIR.
+#
+# This code sets the following variables:
+#  CATCH_INCLUDE_DIR      - path to catch.hpp
+#  CATCH_VERSION          - version number
+
+if(NOT Catch_FIND_VERSION)
+  message(FATAL_ERROR "A version number must be specified.")
+elseif(Catch_FIND_REQUIRED)
+  message(FATAL_ERROR "This module assumes Catch is not required.")
+elseif(Catch_FIND_VERSION_EXACT)
+  message(FATAL_ERROR "Exact version numbers are not supported, only minimum.")
+endif()
+
+# Extract the version number from catch.hpp
+function(_get_catch_version)
+  file(
+    STRINGS "${CATCH_INCLUDE_DIR}/catch.hpp" version_line
+    REGEX "Catch v.*"
+    LIMIT_COUNT 1)
+  if(version_line MATCHES "Catch v([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+    set(CATCH_VERSION
+        "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}"
+        PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Download the single-header version of Catch
+function(_download_catch version destination_dir)
+  message(STATUS "Downloading catch v${version}...")
+  set(url https://github.com/philsquared/Catch/releases/download/v${version}/catch.hpp)
+  file(DOWNLOAD ${url} "${destination_dir}/catch.hpp" STATUS status)
+  list(GET status 0 error)
+  if(error)
+    message(FATAL_ERROR "Could not download ${url}")
+  endif()
+  set(CATCH_INCLUDE_DIR
+      "${destination_dir}"
+      CACHE INTERNAL "")
+endfunction()
+
+# Look for catch locally
+find_path(
+  CATCH_INCLUDE_DIR
+  NAMES catch.hpp
+  PATH_SUFFIXES catch2)
+if(CATCH_INCLUDE_DIR)
+  _get_catch_version()
+endif()
+
+# Download the header if it wasn't found or if it's outdated
+if(NOT CATCH_VERSION OR CATCH_VERSION VERSION_LESS ${Catch_FIND_VERSION})
+  if(DOWNLOAD_CATCH)
+    _download_catch(${Catch_FIND_VERSION} "${PROJECT_BINARY_DIR}/catch/")
+    _get_catch_version()
+  else()
+    set(CATCH_FOUND FALSE)
+    return()
+  endif()
+endif()
+
+add_library(Catch2::Catch2 IMPORTED INTERFACE)
+set_property(TARGET Catch2::Catch2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CATCH_INCLUDE_DIR}")
+
+set(CATCH_FOUND TRUE)
diff --git a/pybind11/tools/FindEigen3.cmake b/pybind11/tools/FindEigen3.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..98ab43d9e62e293c0c87e44b6f325579991e8732
--- /dev/null
+++ b/pybind11/tools/FindEigen3.cmake
@@ -0,0 +1,83 @@
+# - Try to find Eigen3 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen3 3.1.2)
+# to require version 3.1.2 or newer of Eigen3.
+#
+# Once done this will define
+#
+#  EIGEN3_FOUND - system has eigen lib with correct version
+#  EIGEN3_INCLUDE_DIR - the eigen include directory
+#  EIGEN3_VERSION - eigen version
+
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+
+if(NOT Eigen3_FIND_VERSION)
+  if(NOT Eigen3_FIND_VERSION_MAJOR)
+    set(Eigen3_FIND_VERSION_MAJOR 2)
+  endif(NOT Eigen3_FIND_VERSION_MAJOR)
+  if(NOT Eigen3_FIND_VERSION_MINOR)
+    set(Eigen3_FIND_VERSION_MINOR 91)
+  endif(NOT Eigen3_FIND_VERSION_MINOR)
+  if(NOT Eigen3_FIND_VERSION_PATCH)
+    set(Eigen3_FIND_VERSION_PATCH 0)
+  endif(NOT Eigen3_FIND_VERSION_PATCH)
+
+  set(Eigen3_FIND_VERSION
+      "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
+endif(NOT Eigen3_FIND_VERSION)
+
+macro(_eigen3_check_version)
+  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match
+               "${_eigen3_version_header}")
+  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match
+               "${_eigen3_version_header}")
+  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match
+               "${_eigen3_version_header}")
+  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
+  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK FALSE)
+  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK TRUE)
+  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+
+  if(NOT EIGEN3_VERSION_OK)
+
+    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
+                   "but at least version ${Eigen3_FIND_VERSION} is required")
+  endif(NOT EIGEN3_VERSION_OK)
+endmacro(_eigen3_check_version)
+
+if(EIGEN3_INCLUDE_DIR)
+
+  # in cache already
+  _eigen3_check_version()
+  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+
+else(EIGEN3_INCLUDE_DIR)
+
+  find_path(
+    EIGEN3_INCLUDE_DIR
+    NAMES signature_of_eigen3_matrix_library
+    PATHS ${CMAKE_INSTALL_PREFIX}/include ${KDE4_INCLUDE_DIR}
+    PATH_SUFFIXES eigen3 eigen)
+
+  if(EIGEN3_INCLUDE_DIR)
+    _eigen3_check_version()
+  endif(EIGEN3_INCLUDE_DIR)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
+
+  mark_as_advanced(EIGEN3_INCLUDE_DIR)
+
+endif(EIGEN3_INCLUDE_DIR)
diff --git a/pybind11/tools/FindPythonLibsNew.cmake b/pybind11/tools/FindPythonLibsNew.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c1c72c763c6cec6f2fa517f549a553d550ba49d0
--- /dev/null
+++ b/pybind11/tools/FindPythonLibsNew.cmake
@@ -0,0 +1,255 @@
+# - Find python libraries
+# This module finds the libraries corresponding to the Python interpreter
+# FindPythonInterp provides.
+# This code sets the following variables:
+#
+#  PYTHONLIBS_FOUND           - have the Python libs been found
+#  PYTHON_PREFIX              - path to the Python installation
+#  PYTHON_LIBRARIES           - path to the python library
+#  PYTHON_INCLUDE_DIRS        - path to where Python.h is found
+#  PYTHON_MODULE_EXTENSION    - lib extension, e.g. '.so' or '.pyd'
+#  PYTHON_MODULE_PREFIX       - lib name prefix: usually an empty string
+#  PYTHON_SITE_PACKAGES       - path to installation site-packages
+#  PYTHON_IS_DEBUG            - whether the Python interpreter is a debug build
+#
+# Thanks to talljimbo for the patch adding the 'LDVERSION' config
+# variable usage.
+
+#=============================================================================
+# Copyright 2001-2009 Kitware, Inc.
+# Copyright 2012 Continuum Analytics, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# * Neither the names of Kitware, Inc., the Insight Software Consortium,
+# nor the names of their contributors may be used to endorse or promote
+# products derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+
+# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
+if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
+  return()
+endif()
+
+if(PythonLibsNew_FIND_QUIETLY)
+  set(_pythonlibs_quiet QUIET)
+endif()
+
+if(PythonLibsNew_FIND_REQUIRED)
+  set(_pythonlibs_required REQUIRED)
+endif()
+
+# Check to see if the `python` command is present and from a virtual
+# environment, conda, or GHA activation - if it is, try to use that.
+
+if(NOT DEFINED PYTHON_EXECUTABLE)
+  if(DEFINED ENV{VIRTUAL_ENV})
+    find_program(
+      PYTHON_EXECUTABLE python
+      PATHS "$ENV{VIRTUAL_ENV}" "$ENV{VIRTUAL_ENV}/bin"
+      NO_DEFAULT_PATH)
+  elseif(DEFINED ENV{CONDA_PREFIX})
+    find_program(
+      PYTHON_EXECUTABLE python
+      PATHS "$ENV{CONDA_PREFIX}" "$ENV{CONDA_PREFIX}/bin"
+      NO_DEFAULT_PATH)
+  elseif(DEFINED ENV{pythonLocation})
+    find_program(
+      PYTHON_EXECUTABLE python
+      PATHS "$ENV{pythonLocation}" "$ENV{pythonLocation}/bin"
+      NO_DEFAULT_PATH)
+  endif()
+  if(NOT PYTHON_EXECUTABLE)
+    unset(PYTHON_EXECUTABLE)
+  endif()
+endif()
+
+# Use the Python interpreter to find the libs.
+if(NOT PythonLibsNew_FIND_VERSION)
+  set(PythonLibsNew_FIND_VERSION "")
+endif()
+
+find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} ${_pythonlibs_required}
+             ${_pythonlibs_quiet})
+
+if(NOT PYTHONINTERP_FOUND)
+  set(PYTHONLIBS_FOUND FALSE)
+  set(PythonLibsNew_FOUND FALSE)
+  return()
+endif()
+
+# According to https://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
+# testing whether sys has the gettotalrefcount function is a reliable, cross-platform
+# way to detect a CPython debug interpreter.
+#
+# The library suffix is from the config var LDVERSION sometimes, otherwise
+# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
+execute_process(
+  COMMAND
+    "${PYTHON_EXECUTABLE}" "-c" "from distutils import sysconfig as s;import sys;import struct;
+print('.'.join(str(v) for v in sys.version_info));
+print(sys.prefix);
+print(s.get_python_inc(plat_specific=True));
+print(s.get_python_lib(plat_specific=True));
+print(s.get_config_var('SO'));
+print(hasattr(sys, 'gettotalrefcount')+0);
+print(struct.calcsize('@P'));
+print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
+print(s.get_config_var('LIBDIR') or '');
+print(s.get_config_var('MULTIARCH') or '');
+"
+  RESULT_VARIABLE _PYTHON_SUCCESS
+  OUTPUT_VARIABLE _PYTHON_VALUES
+  ERROR_VARIABLE _PYTHON_ERROR_VALUE)
+
+if(NOT _PYTHON_SUCCESS MATCHES 0)
+  if(PythonLibsNew_FIND_REQUIRED)
+    message(FATAL_ERROR "Python config failure:\n${_PYTHON_ERROR_VALUE}")
+  endif()
+  set(PYTHONLIBS_FOUND FALSE)
+  set(PythonLibsNew_FOUND FALSE)
+  return()
+endif()
+
+# Convert the process output into a list
+if(WIN32)
+  string(REGEX REPLACE "\\\\" "/" _PYTHON_VALUES ${_PYTHON_VALUES})
+endif()
+string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
+string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
+list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST)
+list(GET _PYTHON_VALUES 1 PYTHON_PREFIX)
+list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR)
+list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES)
+list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION)
+list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG)
+list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P)
+list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX)
+list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR)
+list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH)
+
+# Make sure the Python has the same pointer-size as the chosen compiler
+# Skip if CMAKE_SIZEOF_VOID_P is not defined
+if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}"))
+  if(PythonLibsNew_FIND_REQUIRED)
+    math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8")
+    math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8")
+    message(FATAL_ERROR "Python config failure: Python is ${_PYTHON_BITS}-bit, "
+                        "chosen compiler is  ${_CMAKE_BITS}-bit")
+  endif()
+  set(PYTHONLIBS_FOUND FALSE)
+  set(PythonLibsNew_FOUND FALSE)
+  return()
+endif()
+
+# The built-in FindPython didn't always give the version numbers
+string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST})
+list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR)
+list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR)
+list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH)
+set(PYTHON_VERSION "${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}.${PYTHON_VERSION_PATCH}")
+
+# Make sure all directory separators are '/'
+string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX "${PYTHON_PREFIX}")
+string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR "${PYTHON_INCLUDE_DIR}")
+string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES "${PYTHON_SITE_PACKAGES}")
+
+if(CMAKE_HOST_WIN32)
+  set(PYTHON_LIBRARY "${PYTHON_PREFIX}/libs/python${PYTHON_LIBRARY_SUFFIX}.lib")
+
+  # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
+  # original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
+  if(NOT EXISTS "${PYTHON_LIBRARY}")
+    get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
+    set(PYTHON_LIBRARY "${_PYTHON_ROOT}/libs/python${PYTHON_LIBRARY_SUFFIX}.lib")
+  endif()
+
+  # if we are in MSYS & MINGW, and we didn't find windows python lib, look for system python lib
+  if(DEFINED ENV{MSYSTEM}
+     AND MINGW
+     AND NOT EXISTS "${PYTHON_LIBRARY}")
+    if(PYTHON_MULTIARCH)
+      set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
+    else()
+      set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
+    endif()
+    unset(PYTHON_LIBRARY)
+    find_library(
+      PYTHON_LIBRARY
+      NAMES "python${PYTHON_LIBRARY_SUFFIX}"
+      PATHS ${_PYTHON_LIBS_SEARCH}
+      NO_DEFAULT_PATH)
+  endif()
+
+  # raise an error if the python libs are still not found.
+  if(NOT EXISTS "${PYTHON_LIBRARY}")
+    message(FATAL_ERROR "Python libraries not found")
+  endif()
+
+else()
+  if(PYTHON_MULTIARCH)
+    set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
+  else()
+    set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
+  endif()
+  #message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
+  # Probably this needs to be more involved. It would be nice if the config
+  # information the python interpreter itself gave us were more complete.
+  find_library(
+    PYTHON_LIBRARY
+    NAMES "python${PYTHON_LIBRARY_SUFFIX}"
+    PATHS ${_PYTHON_LIBS_SEARCH}
+    NO_DEFAULT_PATH)
+
+  # If all else fails, just set the name/version and let the linker figure out the path.
+  if(NOT PYTHON_LIBRARY)
+    set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+mark_as_advanced(PYTHON_LIBRARY PYTHON_INCLUDE_DIR)
+
+# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
+# cache entries because they are meant to specify the location of a single
+# library. We now set the variables listed by the documentation for this
+# module.
+set(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
+set(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
+if(NOT PYTHON_DEBUG_LIBRARY)
+  set(PYTHON_DEBUG_LIBRARY "")
+endif()
+set(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
+
+find_package_message(PYTHON "Found PythonLibs: ${PYTHON_LIBRARY}"
+                     "${PYTHON_EXECUTABLE}${PYTHON_VERSION_STRING}")
+
+set(PYTHONLIBS_FOUND TRUE)
+set(PythonLibsNew_FOUND TRUE)
+
+if(NOT PYTHON_MODULE_PREFIX)
+  set(PYTHON_MODULE_PREFIX "")
+endif()
diff --git a/pybind11/tools/check-style.sh b/pybind11/tools/check-style.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f7af2a4169744334af0b9c28823e98a502b813be
--- /dev/null
+++ b/pybind11/tools/check-style.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# Script to check include/test code for common pybind11 code style errors.
+#
+# This script currently checks for
+#
+# 1. missing space between keyword and parenthesis, e.g.: for(, if(, while(
+# 2. Missing space between right parenthesis and brace, e.g. 'for (...){'
+# 3. opening brace on its own line. It should always be on the same line as the
+#    if/while/for/do statement.
+#
+# Invoke as: tools/check-style.sh <filenames>
+#
+
+check_style_errors=0
+IFS=$'\n'
+
+
+found="$(grep '\<\(if\|for\|while\|catch\)(\|){' $@ -rn --color=always)"
+if [ -n "$found" ]; then
+    echo -e '\033[31;01mError: found the following coding style problems:\033[0m'
+    check_style_errors=1
+    echo "$found" | sed -e 's/^/    /'
+fi
+
+found="$(awk '
+function prefix(filename, lineno) {
+    return "    \033[35m" filename "\033[36m:\033[32m" lineno "\033[36m:\033[0m"
+}
+function mark(pattern, string) { sub(pattern, "\033[01;31m&\033[0m", string); return string }
+last && /^\s*{/ {
+    print prefix(FILENAME, FNR-1) mark("\\)\\s*$", last)
+    print prefix(FILENAME, FNR)   mark("^\\s*{", $0)
+    last=""
+}
+{ last = /(if|for|while|catch|switch)\s*\(.*\)\s*$/ ? $0 : "" }
+' $(find include -type f) $@)"
+if [ -n "$found" ]; then
+    check_style_errors=1
+    echo -e '\033[31;01mError: braces should occur on the same line as the if/while/.. statement. Found issues in the following files:\033[0m'
+    echo "$found"
+fi
+
+exit $check_style_errors
diff --git a/pybind11/tools/clang/.gitignore b/pybind11/tools/clang/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8819bdaf388603251872117e7821f3fc1aeb62bf
--- /dev/null
+++ b/pybind11/tools/clang/.gitignore
@@ -0,0 +1,4 @@
+*.swp
+*.swo
+*.pyc
+__pycache__
diff --git a/pybind11/tools/clang/LICENSE.TXT b/pybind11/tools/clang/LICENSE.TXT
new file mode 100644
index 0000000000000000000000000000000000000000..6c224f84c5bbea06bf93071ee30f6c59ca10de2c
--- /dev/null
+++ b/pybind11/tools/clang/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2012 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/pybind11/tools/clang/README.md b/pybind11/tools/clang/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..efb892166cf3ca689801eab8e2396c9d8d7066c1
--- /dev/null
+++ b/pybind11/tools/clang/README.md
@@ -0,0 +1,2 @@
+This is simply clang's Python bindings (clang.cindex) ported to Python 3. Please see http://llvm.org/svn/llvm-project/cfe/trunk/bindings/python/ for the original project.
+
diff --git a/pybind11/tools/clang/__init__.py b/pybind11/tools/clang/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f30812383f8ebdcf095566500b1ecc78c92710
--- /dev/null
+++ b/pybind11/tools/clang/__init__.py
@@ -0,0 +1,24 @@
+#===- __init__.py - Clang Python Bindings --------------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+r"""
+Clang Library Bindings
+======================
+
+This package provides access to the Clang compiler and libraries.
+
+The available modules are:
+
+  cindex
+
+    Bindings for the Clang indexing library.
+"""
+
+__all__ = ['cindex']
+
diff --git a/pybind11/tools/clang/cindex.py b/pybind11/tools/clang/cindex.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a083de0df70e64c07bb3c0cd4bdf69d7ddfd8c5
--- /dev/null
+++ b/pybind11/tools/clang/cindex.py
@@ -0,0 +1,3884 @@
+#===- cindex.py - Python Indexing Library Bindings -----------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+r"""
+Clang Indexing Library Bindings
+===============================
+
+This module provides an interface to the Clang indexing library. It is a
+low-level interface to the indexing library which attempts to match the Clang
+API directly while also being "pythonic". Notable differences from the C API
+are:
+
+ * string results are returned as Python strings, not CXString objects.
+
+ * null cursors are translated to None.
+
+ * access to child cursors is done via iteration, not visitation.
+
+The major indexing objects are:
+
+  Index
+
+    The top-level object which manages some global library state.
+
+  TranslationUnit
+
+    High-level object encapsulating the AST for a single translation unit. These
+    can be loaded from .ast files or parsed on the fly.
+
+  Cursor
+
+    Generic object for representing a node in the AST.
+
+  SourceRange, SourceLocation, and File
+
+    Objects representing information about the input source.
+
+Most object information is exposed using properties, when the underlying API
+call is efficient.
+"""
+
+# TODO
+# ====
+#
+# o API support for invalid translation units. Currently we can't even get the
+#   diagnostics on failure because they refer to locations in an object that
+#   will have been invalidated.
+#
+# o fix memory management issues (currently client must hold on to index and
+#   translation unit, or risk crashes).
+#
+# o expose code completion APIs.
+#
+# o cleanup ctypes wrapping, would be nice to separate the ctypes details more
+#   clearly, and hide from the external interface (i.e., help(cindex)).
+#
+# o implement additional SourceLocation, SourceRange, and File methods.
+
+from ctypes import *
+import collections
+
+import clang.enumerations
+
+# ctypes doesn't implicitly convert c_void_p to the appropriate wrapper
+# object. This is a problem, because it means that from_parameter will see an
+# integer and pass the wrong value on platforms where int != void*. Work around
+# this by marshalling object arguments as void**.
+c_object_p = POINTER(c_void_p)
+
+callbacks = {}
+
+### Exception Classes ###
+
+class TranslationUnitLoadError(Exception):
+    """Represents an error that occurred when loading a TranslationUnit.
+
+    This is raised in the case where a TranslationUnit could not be
+    instantiated due to failure in the libclang library.
+
+    FIXME: Make libclang expose additional error information in this scenario.
+    """
+    pass
+
+class TranslationUnitSaveError(Exception):
+    """Represents an error that occurred when saving a TranslationUnit.
+
+    Each error has associated with it an enumerated value, accessible under
+    e.save_error. Consumers can compare the value with one of the ERROR_
+    constants in this class.
+    """
+
+    # Indicates that an unknown error occurred. This typically indicates that
+    # I/O failed during save.
+    ERROR_UNKNOWN = 1
+
+    # Indicates that errors during translation prevented saving. The errors
+    # should be available via the TranslationUnit's diagnostics.
+    ERROR_TRANSLATION_ERRORS = 2
+
+    # Indicates that the translation unit was somehow invalid.
+    ERROR_INVALID_TU = 3
+
+    def __init__(self, enumeration, message):
+        assert isinstance(enumeration, int)
+
+        if enumeration < 1 or enumeration > 3:
+            raise Exception("Encountered undefined TranslationUnit save error "
+                            "constant: %d. Please file a bug to have this "
+                            "value supported." % enumeration)
+
+        self.save_error = enumeration
+        Exception.__init__(self, 'Error %d: %s' % (enumeration, message))
+
+### Structures and Utility Classes ###
+
+class CachedProperty(object):
+    """Decorator that lazy-loads the value of a property.
+
+    The first time the property is accessed, the original property function is
+    executed. The value it returns is set as the new value of that instance's
+    property, replacing the original method.
+    """
+
+    def __init__(self, wrapped):
+        self.wrapped = wrapped
+        try:
+            self.__doc__ = wrapped.__doc__
+        except:
+            pass
+
+    def __get__(self, instance, instance_type=None):
+        if instance is None:
+            return self
+
+        value = self.wrapped(instance)
+        setattr(instance, self.wrapped.__name__, value)
+
+        return value
+
+
+class _CXString(Structure):
+    """Helper for transforming CXString results."""
+
+    _fields_ = [("spelling", c_char_p), ("free", c_int)]
+
+    def __del__(self):
+        conf.lib.clang_disposeString(self)
+
+    @staticmethod
+    def from_result(res, fn, args):
+        assert isinstance(res, _CXString)
+        return conf.lib.clang_getCString(res)
+
+class SourceLocation(Structure):
+    """
+    A SourceLocation represents a particular location within a source file.
+    """
+    _fields_ = [("ptr_data", c_void_p * 2), ("int_data", c_uint)]
+    _data = None
+
+    def _get_instantiation(self):
+        if self._data is None:
+            f, l, c, o = c_object_p(), c_uint(), c_uint(), c_uint()
+            conf.lib.clang_getInstantiationLocation(self, byref(f), byref(l),
+                    byref(c), byref(o))
+            if f:
+                f = File(f)
+            else:
+                f = None
+            self._data = (f, int(l.value), int(c.value), int(o.value))
+        return self._data
+
+    @staticmethod
+    def from_position(tu, file, line, column):
+        """
+        Retrieve the source location associated with a given file/line/column in
+        a particular translation unit.
+        """
+        return conf.lib.clang_getLocation(tu, file, line, column)
+
+    @staticmethod
+    def from_offset(tu, file, offset):
+        """Retrieve a SourceLocation from a given character offset.
+
+        tu -- TranslationUnit file belongs to
+        file -- File instance to obtain offset from
+        offset -- Integer character offset within file
+        """
+        return conf.lib.clang_getLocationForOffset(tu, file, offset)
+
+    @property
+    def file(self):
+        """Get the file represented by this source location."""
+        return self._get_instantiation()[0]
+
+    @property
+    def line(self):
+        """Get the line represented by this source location."""
+        return self._get_instantiation()[1]
+
+    @property
+    def column(self):
+        """Get the column represented by this source location."""
+        return self._get_instantiation()[2]
+
+    @property
+    def offset(self):
+        """Get the file offset represented by this source location."""
+        return self._get_instantiation()[3]
+
+    def __eq__(self, other):
+        return conf.lib.clang_equalLocations(self, other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __repr__(self):
+        if self.file:
+            filename = self.file.name
+        else:
+            filename = None
+        return "<SourceLocation file %r, line %r, column %r>" % (
+            filename, self.line, self.column)
+
+class SourceRange(Structure):
+    """
+    A SourceRange describes a range of source locations within the source
+    code.
+    """
+    _fields_ = [
+        ("ptr_data", c_void_p * 2),
+        ("begin_int_data", c_uint),
+        ("end_int_data", c_uint)]
+
+    # FIXME: Eliminate this and make normal constructor? Requires hiding ctypes
+    # object.
+    @staticmethod
+    def from_locations(start, end):
+        return conf.lib.clang_getRange(start, end)
+
+    @property
+    def start(self):
+        """
+        Return a SourceLocation representing the first character within a
+        source range.
+        """
+        return conf.lib.clang_getRangeStart(self)
+
+    @property
+    def end(self):
+        """
+        Return a SourceLocation representing the last character within a
+        source range.
+        """
+        return conf.lib.clang_getRangeEnd(self)
+
+    def __eq__(self, other):
+        return conf.lib.clang_equalRanges(self, other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __contains__(self, other):
+        """Useful to detect the Token/Lexer bug"""
+        if not isinstance(other, SourceLocation):
+            return False
+        if other.file is None and self.start.file is None:
+            pass
+        elif ( self.start.file.name != other.file.name or
+               other.file.name != self.end.file.name):
+            # same file name
+            return False
+        # same file, in between lines
+        if self.start.line < other.line < self.end.line:
+            return True
+        elif self.start.line == other.line:
+            # same file first line
+            if self.start.column <= other.column:
+                return True
+        elif other.line == self.end.line:
+            # same file last line
+            if other.column <= self.end.column:
+                return True
+        return False
+
+    def __repr__(self):
+        return "<SourceRange start %r, end %r>" % (self.start, self.end)
+
+class Diagnostic(object):
+    """
+    A Diagnostic is a single instance of a Clang diagnostic. It includes the
+    diagnostic severity, the message, the location the diagnostic occurred, as
+    well as additional source ranges and associated fix-it hints.
+    """
+
+    Ignored = 0
+    Note    = 1
+    Warning = 2
+    Error   = 3
+    Fatal   = 4
+
+    def __init__(self, ptr):
+        self.ptr = ptr
+
+    def __del__(self):
+        conf.lib.clang_disposeDiagnostic(self)
+
+    @property
+    def severity(self):
+        return conf.lib.clang_getDiagnosticSeverity(self)
+
+    @property
+    def location(self):
+        return conf.lib.clang_getDiagnosticLocation(self)
+
+    @property
+    def spelling(self):
+        return conf.lib.clang_getDiagnosticSpelling(self)
+
+    @property
+    def ranges(self):
+        class RangeIterator:
+            def __init__(self, diag):
+                self.diag = diag
+
+            def __len__(self):
+                return int(conf.lib.clang_getDiagnosticNumRanges(self.diag))
+
+            def __getitem__(self, key):
+                if (key >= len(self)):
+                    raise IndexError
+                return conf.lib.clang_getDiagnosticRange(self.diag, key)
+
+        return RangeIterator(self)
+
+    @property
+    def fixits(self):
+        class FixItIterator:
+            def __init__(self, diag):
+                self.diag = diag
+
+            def __len__(self):
+                return int(conf.lib.clang_getDiagnosticNumFixIts(self.diag))
+
+            def __getitem__(self, key):
+                range = SourceRange()
+                value = conf.lib.clang_getDiagnosticFixIt(self.diag, key,
+                        byref(range))
+                if len(value) == 0:
+                    raise IndexError
+
+                return FixIt(range, value)
+
+        return FixItIterator(self)
+
+    @property
+    def children(self):
+        class ChildDiagnosticsIterator:
+            def __init__(self, diag):
+                self.diag_set = conf.lib.clang_getChildDiagnostics(diag)
+
+            def __len__(self):
+                return int(conf.lib.clang_getNumDiagnosticsInSet(self.diag_set))
+
+            def __getitem__(self, key):
+                diag = conf.lib.clang_getDiagnosticInSet(self.diag_set, key)
+                if not diag:
+                    raise IndexError
+                return Diagnostic(diag)
+
+        return ChildDiagnosticsIterator(self)
+
+    @property
+    def category_number(self):
+        """The category number for this diagnostic or 0 if unavailable."""
+        return conf.lib.clang_getDiagnosticCategory(self)
+
+    @property
+    def category_name(self):
+        """The string name of the category for this diagnostic."""
+        return conf.lib.clang_getDiagnosticCategoryText(self)
+
+    @property
+    def option(self):
+        """The command-line option that enables this diagnostic."""
+        return conf.lib.clang_getDiagnosticOption(self, None)
+
+    @property
+    def disable_option(self):
+        """The command-line option that disables this diagnostic."""
+        disable = _CXString()
+        conf.lib.clang_getDiagnosticOption(self, byref(disable))
+
+        return conf.lib.clang_getCString(disable)
+
+    def __repr__(self):
+        return "<Diagnostic severity %r, location %r, spelling %r>" % (
+            self.severity, self.location, self.spelling)
+
+    def from_param(self):
+      return self.ptr
+
+class FixIt(object):
+    """
+    A FixIt represents a transformation to be applied to the source to
+    "fix-it". The fix-it shouldbe applied by replacing the given source range
+    with the given value.
+    """
+
+    def __init__(self, range, value):
+        self.range = range
+        self.value = value
+
+    def __repr__(self):
+        return "<FixIt range %r, value %r>" % (self.range, self.value)
+
+class TokenGroup(object):
+    """Helper class to facilitate token management.
+
+    Tokens are allocated from libclang in chunks. They must be disposed of as a
+    collective group.
+
+    One purpose of this class is for instances to represent groups of allocated
+    tokens. Each token in a group contains a reference back to an instance of
+    this class. When all tokens from a group are garbage collected, it allows
+    this class to be garbage collected. When this class is garbage collected,
+    it calls the libclang destructor which invalidates all tokens in the group.
+
+    You should not instantiate this class outside of this module.
+    """
+    def __init__(self, tu, memory, count):
+        self._tu = tu
+        self._memory = memory
+        self._count = count
+
+    def __del__(self):
+        conf.lib.clang_disposeTokens(self._tu, self._memory, self._count)
+
+    @staticmethod
+    def get_tokens(tu, extent):
+        """Helper method to return all tokens in an extent.
+
+        This functionality is needed multiple places in this module. We define
+        it here because it seems like a logical place.
+        """
+        tokens_memory = POINTER(Token)()
+        tokens_count = c_uint()
+
+        conf.lib.clang_tokenize(tu, extent, byref(tokens_memory),
+                byref(tokens_count))
+
+        count = int(tokens_count.value)
+
+        # If we get no tokens, no memory was allocated. Be sure not to return
+        # anything and potentially call a destructor on nothing.
+        if count < 1:
+            return
+
+        tokens_array = cast(tokens_memory, POINTER(Token * count)).contents
+
+        token_group = TokenGroup(tu, tokens_memory, tokens_count)
+
+        for i in range(0, count):
+            token = Token()
+            token.int_data = tokens_array[i].int_data
+            token.ptr_data = tokens_array[i].ptr_data
+            token._tu = tu
+            token._group = token_group
+
+            yield token
+
+class TokenKind(object):
+    """Describes a specific type of a Token."""
+
+    _value_map = {} # int -> TokenKind
+
+    def __init__(self, value, name):
+        """Create a new TokenKind instance from a numeric value and a name."""
+        self.value = value
+        self.name = name
+
+    def __repr__(self):
+        return 'TokenKind.%s' % (self.name,)
+
+    @staticmethod
+    def from_value(value):
+        """Obtain a registered TokenKind instance from its value."""
+        result = TokenKind._value_map.get(value, None)
+
+        if result is None:
+            raise ValueError('Unknown TokenKind: %d' % value)
+
+        return result
+
+    @staticmethod
+    def register(value, name):
+        """Register a new TokenKind enumeration.
+
+        This should only be called at module load time by code within this
+        package.
+        """
+        if value in TokenKind._value_map:
+            raise ValueError('TokenKind already registered: %d' % value)
+
+        kind = TokenKind(value, name)
+        TokenKind._value_map[value] = kind
+        setattr(TokenKind, name, kind)
+
+### Cursor Kinds ###
+class BaseEnumeration(object):
+    """
+    Common base class for named enumerations held in sync with Index.h values.
+
+    Subclasses must define their own _kinds and _name_map members, as:
+    _kinds = []
+    _name_map = None
+    These values hold the per-subclass instances and value-to-name mappings,
+    respectively.
+
+    """
+
+    def __init__(self, value):
+        if value >= len(self.__class__._kinds):
+            self.__class__._kinds += [None] * (value - len(self.__class__._kinds) + 1)
+        if self.__class__._kinds[value] is not None:
+            raise ValueError('{0} value {1} already loaded'.format(
+                str(self.__class__), value))
+        self.value = value
+        self.__class__._kinds[value] = self
+        self.__class__._name_map = None
+
+
+    def from_param(self):
+        return self.value
+
+    @property
+    def name(self):
+        """Get the enumeration name of this cursor kind."""
+        if self._name_map is None:
+            self._name_map = {}
+            for key, value in list(self.__class__.__dict__.items()):
+                if isinstance(value, self.__class__):
+                    self._name_map[value] = key
+        return self._name_map[self]
+
+    @classmethod
+    def from_id(cls, id):
+        if id >= len(cls._kinds) or cls._kinds[id] is None:
+            raise ValueError('Unknown template argument kind %d' % id)
+        return cls._kinds[id]
+
+    def __repr__(self):
+        return '%s.%s' % (self.__class__, self.name,)
+
+
+class CursorKind(BaseEnumeration):
+    """
+    A CursorKind describes the kind of entity that a cursor points to.
+    """
+
+    # The required BaseEnumeration declarations.
+    _kinds = []
+    _name_map = None
+
+    @staticmethod
+    def get_all_kinds():
+        """Return all CursorKind enumeration instances."""
+        return [_f for _f in CursorKind._kinds if _f]
+
+    def is_declaration(self):
+        """Test if this is a declaration kind."""
+        return conf.lib.clang_isDeclaration(self)
+
+    def is_reference(self):
+        """Test if this is a reference kind."""
+        return conf.lib.clang_isReference(self)
+
+    def is_expression(self):
+        """Test if this is an expression kind."""
+        return conf.lib.clang_isExpression(self)
+
+    def is_statement(self):
+        """Test if this is a statement kind."""
+        return conf.lib.clang_isStatement(self)
+
+    def is_attribute(self):
+        """Test if this is an attribute kind."""
+        return conf.lib.clang_isAttribute(self)
+
+    def is_invalid(self):
+        """Test if this is an invalid kind."""
+        return conf.lib.clang_isInvalid(self)
+
+    def is_translation_unit(self):
+        """Test if this is a translation unit kind."""
+        return conf.lib.clang_isTranslationUnit(self)
+
+    def is_preprocessing(self):
+        """Test if this is a preprocessing kind."""
+        return conf.lib.clang_isPreprocessing(self)
+
+    def is_unexposed(self):
+        """Test if this is an unexposed kind."""
+        return conf.lib.clang_isUnexposed(self)
+
+    def __repr__(self):
+        return 'CursorKind.%s' % (self.name,)
+
+###
+# Declaration Kinds
+
+# A declaration whose specific kind is not exposed via this interface.
+#
+# Unexposed declarations have the same operations as any other kind of
+# declaration; one can extract their location information, spelling, find their
+# definitions, etc. However, the specific kind of the declaration is not
+# reported.
+CursorKind.UNEXPOSED_DECL = CursorKind(1)
+
+# A C or C++ struct.
+CursorKind.STRUCT_DECL = CursorKind(2)
+
+# A C or C++ union.
+CursorKind.UNION_DECL = CursorKind(3)
+
+# A C++ class.
+CursorKind.CLASS_DECL = CursorKind(4)
+
+# An enumeration.
+CursorKind.ENUM_DECL = CursorKind(5)
+
+# A field (in C) or non-static data member (in C++) in a struct, union, or C++
+# class.
+CursorKind.FIELD_DECL = CursorKind(6)
+
+# An enumerator constant.
+CursorKind.ENUM_CONSTANT_DECL = CursorKind(7)
+
+# A function.
+CursorKind.FUNCTION_DECL = CursorKind(8)
+
+# A variable.
+CursorKind.VAR_DECL = CursorKind(9)
+
+# A function or method parameter.
+CursorKind.PARM_DECL = CursorKind(10)
+
+# An Objective-C @interface.
+CursorKind.OBJC_INTERFACE_DECL = CursorKind(11)
+
+# An Objective-C @interface for a category.
+CursorKind.OBJC_CATEGORY_DECL = CursorKind(12)
+
+# An Objective-C @protocol declaration.
+CursorKind.OBJC_PROTOCOL_DECL = CursorKind(13)
+
+# An Objective-C @property declaration.
+CursorKind.OBJC_PROPERTY_DECL = CursorKind(14)
+
+# An Objective-C instance variable.
+CursorKind.OBJC_IVAR_DECL = CursorKind(15)
+
+# An Objective-C instance method.
+CursorKind.OBJC_INSTANCE_METHOD_DECL = CursorKind(16)
+
+# An Objective-C class method.
+CursorKind.OBJC_CLASS_METHOD_DECL = CursorKind(17)
+
+# An Objective-C @implementation.
+CursorKind.OBJC_IMPLEMENTATION_DECL = CursorKind(18)
+
+# An Objective-C @implementation for a category.
+CursorKind.OBJC_CATEGORY_IMPL_DECL = CursorKind(19)
+
+# A typedef.
+CursorKind.TYPEDEF_DECL = CursorKind(20)
+
+# A C++ class method.
+CursorKind.CXX_METHOD = CursorKind(21)
+
+# A C++ namespace.
+CursorKind.NAMESPACE = CursorKind(22)
+
+# A linkage specification, e.g. 'extern "C"'.
+CursorKind.LINKAGE_SPEC = CursorKind(23)
+
+# A C++ constructor.
+CursorKind.CONSTRUCTOR = CursorKind(24)
+
+# A C++ destructor.
+CursorKind.DESTRUCTOR = CursorKind(25)
+
+# A C++ conversion function.
+CursorKind.CONVERSION_FUNCTION = CursorKind(26)
+
+# A C++ template type parameter
+CursorKind.TEMPLATE_TYPE_PARAMETER = CursorKind(27)
+
+# A C++ non-type template paramater.
+CursorKind.TEMPLATE_NON_TYPE_PARAMETER = CursorKind(28)
+
+# A C++ template template parameter.
+CursorKind.TEMPLATE_TEMPLATE_PARAMETER = CursorKind(29)
+
+# A C++ function template.
+CursorKind.FUNCTION_TEMPLATE = CursorKind(30)
+
+# A C++ class template.
+CursorKind.CLASS_TEMPLATE = CursorKind(31)
+
+# A C++ class template partial specialization.
+CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION = CursorKind(32)
+
+# A C++ namespace alias declaration.
+CursorKind.NAMESPACE_ALIAS = CursorKind(33)
+
+# A C++ using directive
+CursorKind.USING_DIRECTIVE = CursorKind(34)
+
+# A C++ using declaration
+CursorKind.USING_DECLARATION = CursorKind(35)
+
+# A Type alias decl.
+CursorKind.TYPE_ALIAS_DECL = CursorKind(36)
+
+# A Objective-C synthesize decl
+CursorKind.OBJC_SYNTHESIZE_DECL = CursorKind(37)
+
+# A Objective-C dynamic decl
+CursorKind.OBJC_DYNAMIC_DECL = CursorKind(38)
+
+# A C++ access specifier decl.
+CursorKind.CXX_ACCESS_SPEC_DECL = CursorKind(39)
+
+
+###
+# Reference Kinds
+
+CursorKind.OBJC_SUPER_CLASS_REF = CursorKind(40)
+CursorKind.OBJC_PROTOCOL_REF = CursorKind(41)
+CursorKind.OBJC_CLASS_REF = CursorKind(42)
+
+# A reference to a type declaration.
+#
+# A type reference occurs anywhere where a type is named but not
+# declared. For example, given:
+#   typedef unsigned size_type;
+#   size_type size;
+#
+# The typedef is a declaration of size_type (CXCursor_TypedefDecl),
+# while the type of the variable "size" is referenced. The cursor
+# referenced by the type of size is the typedef for size_type.
+CursorKind.TYPE_REF = CursorKind(43)
+CursorKind.CXX_BASE_SPECIFIER = CursorKind(44)
+
+# A reference to a class template, function template, template
+# template parameter, or class template partial specialization.
+CursorKind.TEMPLATE_REF = CursorKind(45)
+
+# A reference to a namespace or namepsace alias.
+CursorKind.NAMESPACE_REF = CursorKind(46)
+
+# A reference to a member of a struct, union, or class that occurs in
+# some non-expression context, e.g., a designated initializer.
+CursorKind.MEMBER_REF = CursorKind(47)
+
+# A reference to a labeled statement.
+CursorKind.LABEL_REF = CursorKind(48)
+
+# A reference to a set of overloaded functions or function templates
+# that has not yet been resolved to a specific function or function template.
+CursorKind.OVERLOADED_DECL_REF = CursorKind(49)
+
+# A reference to a variable that occurs in some non-expression
+# context, e.g., a C++ lambda capture list.
+CursorKind.VARIABLE_REF = CursorKind(50)
+
+###
+# Invalid/Error Kinds
+
+CursorKind.INVALID_FILE = CursorKind(70)
+CursorKind.NO_DECL_FOUND = CursorKind(71)
+CursorKind.NOT_IMPLEMENTED = CursorKind(72)
+CursorKind.INVALID_CODE = CursorKind(73)
+
+###
+# Expression Kinds
+
+# An expression whose specific kind is not exposed via this interface.
+#
+# Unexposed expressions have the same operations as any other kind of
+# expression; one can extract their location information, spelling, children,
+# etc. However, the specific kind of the expression is not reported.
+CursorKind.UNEXPOSED_EXPR = CursorKind(100)
+
+# An expression that refers to some value declaration, such as a function,
+# varible, or enumerator.
+CursorKind.DECL_REF_EXPR = CursorKind(101)
+
+# An expression that refers to a member of a struct, union, class, Objective-C
+# class, etc.
+CursorKind.MEMBER_REF_EXPR = CursorKind(102)
+
+# An expression that calls a function.
+CursorKind.CALL_EXPR = CursorKind(103)
+
+# An expression that sends a message to an Objective-C object or class.
+CursorKind.OBJC_MESSAGE_EXPR = CursorKind(104)
+
+# An expression that represents a block literal.
+CursorKind.BLOCK_EXPR = CursorKind(105)
+
+# An integer literal.
+CursorKind.INTEGER_LITERAL = CursorKind(106)
+
+# A floating point number literal.
+CursorKind.FLOATING_LITERAL = CursorKind(107)
+
+# An imaginary number literal.
+CursorKind.IMAGINARY_LITERAL = CursorKind(108)
+
+# A string literal.
+CursorKind.STRING_LITERAL = CursorKind(109)
+
+# A character literal.
+CursorKind.CHARACTER_LITERAL = CursorKind(110)
+
+# A parenthesized expression, e.g. "(1)".
+#
+# This AST node is only formed if full location information is requested.
+CursorKind.PAREN_EXPR = CursorKind(111)
+
+# This represents the unary-expression's (except sizeof and
+# alignof).
+CursorKind.UNARY_OPERATOR = CursorKind(112)
+
+# [C99 6.5.2.1] Array Subscripting.
+CursorKind.ARRAY_SUBSCRIPT_EXPR = CursorKind(113)
+
+# A builtin binary operation expression such as "x + y" or
+# "x <= y".
+CursorKind.BINARY_OPERATOR = CursorKind(114)
+
+# Compound assignment such as "+=".
+CursorKind.COMPOUND_ASSIGNMENT_OPERATOR = CursorKind(115)
+
+# The ?: ternary operator.
+CursorKind.CONDITIONAL_OPERATOR = CursorKind(116)
+
+# An explicit cast in C (C99 6.5.4) or a C-style cast in C++
+# (C++ [expr.cast]), which uses the syntax (Type)expr.
+#
+# For example: (int)f.
+CursorKind.CSTYLE_CAST_EXPR = CursorKind(117)
+
+# [C99 6.5.2.5]
+CursorKind.COMPOUND_LITERAL_EXPR = CursorKind(118)
+
+# Describes an C or C++ initializer list.
+CursorKind.INIT_LIST_EXPR = CursorKind(119)
+
+# The GNU address of label extension, representing &&label.
+CursorKind.ADDR_LABEL_EXPR = CursorKind(120)
+
+# This is the GNU Statement Expression extension: ({int X=4; X;})
+CursorKind.StmtExpr = CursorKind(121)
+
+# Represents a C11 generic selection.
+CursorKind.GENERIC_SELECTION_EXPR = CursorKind(122)
+
+# Implements the GNU __null extension, which is a name for a null
+# pointer constant that has integral type (e.g., int or long) and is the same
+# size and alignment as a pointer.
+#
+# The __null extension is typically only used by system headers, which define
+# NULL as __null in C++ rather than using 0 (which is an integer that may not
+# match the size of a pointer).
+CursorKind.GNU_NULL_EXPR = CursorKind(123)
+
+# C++'s static_cast<> expression.
+CursorKind.CXX_STATIC_CAST_EXPR = CursorKind(124)
+
+# C++'s dynamic_cast<> expression.
+CursorKind.CXX_DYNAMIC_CAST_EXPR = CursorKind(125)
+
+# C++'s reinterpret_cast<> expression.
+CursorKind.CXX_REINTERPRET_CAST_EXPR = CursorKind(126)
+
+# C++'s const_cast<> expression.
+CursorKind.CXX_CONST_CAST_EXPR = CursorKind(127)
+
+# Represents an explicit C++ type conversion that uses "functional"
+# notion (C++ [expr.type.conv]).
+#
+# Example:
+# \code
+#   x = int(0.5);
+# \endcode
+CursorKind.CXX_FUNCTIONAL_CAST_EXPR = CursorKind(128)
+
+# A C++ typeid expression (C++ [expr.typeid]).
+CursorKind.CXX_TYPEID_EXPR = CursorKind(129)
+
+# [C++ 2.13.5] C++ Boolean Literal.
+CursorKind.CXX_BOOL_LITERAL_EXPR = CursorKind(130)
+
+# [C++0x 2.14.7] C++ Pointer Literal.
+CursorKind.CXX_NULL_PTR_LITERAL_EXPR = CursorKind(131)
+
+# Represents the "this" expression in C++
+CursorKind.CXX_THIS_EXPR = CursorKind(132)
+
+# [C++ 15] C++ Throw Expression.
+#
+# This handles 'throw' and 'throw' assignment-expression. When
+# assignment-expression isn't present, Op will be null.
+CursorKind.CXX_THROW_EXPR = CursorKind(133)
+
+# A new expression for memory allocation and constructor calls, e.g:
+# "new CXXNewExpr(foo)".
+CursorKind.CXX_NEW_EXPR = CursorKind(134)
+
+# A delete expression for memory deallocation and destructor calls,
+# e.g. "delete[] pArray".
+CursorKind.CXX_DELETE_EXPR = CursorKind(135)
+
+# Represents a unary expression.
+CursorKind.CXX_UNARY_EXPR = CursorKind(136)
+
+# ObjCStringLiteral, used for Objective-C string literals i.e. "foo".
+CursorKind.OBJC_STRING_LITERAL = CursorKind(137)
+
+# ObjCEncodeExpr, used for in Objective-C.
+CursorKind.OBJC_ENCODE_EXPR = CursorKind(138)
+
+# ObjCSelectorExpr used for in Objective-C.
+CursorKind.OBJC_SELECTOR_EXPR = CursorKind(139)
+
+# Objective-C's protocol expression.
+CursorKind.OBJC_PROTOCOL_EXPR = CursorKind(140)
+
+# An Objective-C "bridged" cast expression, which casts between
+# Objective-C pointers and C pointers, transferring ownership in the process.
+#
+# \code
+#   NSString *str = (__bridge_transfer NSString *)CFCreateString();
+# \endcode
+CursorKind.OBJC_BRIDGE_CAST_EXPR = CursorKind(141)
+
+# Represents a C++0x pack expansion that produces a sequence of
+# expressions.
+#
+# A pack expansion expression contains a pattern (which itself is an
+# expression) followed by an ellipsis. For example:
+CursorKind.PACK_EXPANSION_EXPR = CursorKind(142)
+
+# Represents an expression that computes the length of a parameter
+# pack.
+CursorKind.SIZE_OF_PACK_EXPR = CursorKind(143)
+
+# Represents a C++ lambda expression that produces a local function
+# object.
+#
+#  \code
+#  void abssort(float *x, unsigned N) {
+#    std::sort(x, x + N,
+#              [](float a, float b) {
+#                return std::abs(a) < std::abs(b);
+#              });
+#  }
+#  \endcode
+CursorKind.LAMBDA_EXPR = CursorKind(144)
+
+# Objective-c Boolean Literal.
+CursorKind.OBJ_BOOL_LITERAL_EXPR = CursorKind(145)
+
+# Represents the "self" expression in a ObjC method.
+CursorKind.OBJ_SELF_EXPR = CursorKind(146)
+
+
+# A statement whose specific kind is not exposed via this interface.
+#
+# Unexposed statements have the same operations as any other kind of statement;
+# one can extract their location information, spelling, children, etc. However,
+# the specific kind of the statement is not reported.
+CursorKind.UNEXPOSED_STMT = CursorKind(200)
+
+# A labelled statement in a function.
+CursorKind.LABEL_STMT = CursorKind(201)
+
+# A compound statement
+CursorKind.COMPOUND_STMT = CursorKind(202)
+
+# A case statement.
+CursorKind.CASE_STMT = CursorKind(203)
+
+# A default statement.
+CursorKind.DEFAULT_STMT = CursorKind(204)
+
+# An if statement.
+CursorKind.IF_STMT = CursorKind(205)
+
+# A switch statement.
+CursorKind.SWITCH_STMT = CursorKind(206)
+
+# A while statement.
+CursorKind.WHILE_STMT = CursorKind(207)
+
+# A do statement.
+CursorKind.DO_STMT = CursorKind(208)
+
+# A for statement.
+CursorKind.FOR_STMT = CursorKind(209)
+
+# A goto statement.
+CursorKind.GOTO_STMT = CursorKind(210)
+
+# An indirect goto statement.
+CursorKind.INDIRECT_GOTO_STMT = CursorKind(211)
+
+# A continue statement.
+CursorKind.CONTINUE_STMT = CursorKind(212)
+
+# A break statement.
+CursorKind.BREAK_STMT = CursorKind(213)
+
+# A return statement.
+CursorKind.RETURN_STMT = CursorKind(214)
+
+# A GNU-style inline assembler statement.
+CursorKind.ASM_STMT = CursorKind(215)
+
+# Objective-C's overall @try-@catch-@finally statement.
+CursorKind.OBJC_AT_TRY_STMT = CursorKind(216)
+
+# Objective-C's @catch statement.
+CursorKind.OBJC_AT_CATCH_STMT = CursorKind(217)
+
+# Objective-C's @finally statement.
+CursorKind.OBJC_AT_FINALLY_STMT = CursorKind(218)
+
+# Objective-C's @throw statement.
+CursorKind.OBJC_AT_THROW_STMT = CursorKind(219)
+
+# Objective-C's @synchronized statement.
+CursorKind.OBJC_AT_SYNCHRONIZED_STMT = CursorKind(220)
+
+# Objective-C's autorealease pool statement.
+CursorKind.OBJC_AUTORELEASE_POOL_STMT = CursorKind(221)
+
+# Objective-C's for collection statement.
+CursorKind.OBJC_FOR_COLLECTION_STMT = CursorKind(222)
+
+# C++'s catch statement.
+CursorKind.CXX_CATCH_STMT = CursorKind(223)
+
+# C++'s try statement.
+CursorKind.CXX_TRY_STMT = CursorKind(224)
+
+# C++'s for (* : *) statement.
+CursorKind.CXX_FOR_RANGE_STMT = CursorKind(225)
+
+# Windows Structured Exception Handling's try statement.
+CursorKind.SEH_TRY_STMT = CursorKind(226)
+
+# Windows Structured Exception Handling's except statement.
+CursorKind.SEH_EXCEPT_STMT = CursorKind(227)
+
+# Windows Structured Exception Handling's finally statement.
+CursorKind.SEH_FINALLY_STMT = CursorKind(228)
+
+# A MS inline assembly statement extension.
+CursorKind.MS_ASM_STMT = CursorKind(229)
+
+# The null statement.
+CursorKind.NULL_STMT = CursorKind(230)
+
+# Adaptor class for mixing declarations with statements and expressions.
+CursorKind.DECL_STMT = CursorKind(231)
+
+# OpenMP parallel directive.
+CursorKind.OMP_PARALLEL_DIRECTIVE = CursorKind(232)
+
+# OpenMP SIMD directive.
+CursorKind.OMP_SIMD_DIRECTIVE = CursorKind(233)
+
+# OpenMP for directive.
+CursorKind.OMP_FOR_DIRECTIVE = CursorKind(234)
+
+# OpenMP sections directive.
+CursorKind.OMP_SECTIONS_DIRECTIVE = CursorKind(235)
+
+# OpenMP section directive.
+CursorKind.OMP_SECTION_DIRECTIVE = CursorKind(236)
+
+# OpenMP single directive.
+CursorKind.OMP_SINGLE_DIRECTIVE = CursorKind(237)
+
+# OpenMP parallel for directive.
+CursorKind.OMP_PARALLEL_FOR_DIRECTIVE = CursorKind(238)
+
+# OpenMP parallel sections directive.
+CursorKind.OMP_PARALLEL_SECTIONS_DIRECTIVE = CursorKind(239)
+
+# OpenMP task directive.
+CursorKind.OMP_TASK_DIRECTIVE = CursorKind(240)
+
+# OpenMP master directive.
+CursorKind.OMP_MASTER_DIRECTIVE = CursorKind(241)
+
+# OpenMP critical directive.
+CursorKind.OMP_CRITICAL_DIRECTIVE = CursorKind(242)
+
+# OpenMP taskyield directive.
+CursorKind.OMP_TASKYIELD_DIRECTIVE = CursorKind(243)
+
+# OpenMP barrier directive.
+CursorKind.OMP_BARRIER_DIRECTIVE = CursorKind(244)
+
+# OpenMP taskwait directive.
+CursorKind.OMP_TASKWAIT_DIRECTIVE = CursorKind(245)
+
+# OpenMP flush directive.
+CursorKind.OMP_FLUSH_DIRECTIVE = CursorKind(246)
+
+# Windows Structured Exception Handling's leave statement.
+CursorKind.SEH_LEAVE_STMT = CursorKind(247)
+
+# OpenMP ordered directive.
+CursorKind.OMP_ORDERED_DIRECTIVE = CursorKind(248)
+
+# OpenMP atomic directive.
+CursorKind.OMP_ATOMIC_DIRECTIVE = CursorKind(249)
+
+# OpenMP for SIMD directive.
+CursorKind.OMP_FOR_SIMD_DIRECTIVE = CursorKind(250)
+
+# OpenMP parallel for SIMD directive.
+CursorKind.OMP_PARALLELFORSIMD_DIRECTIVE = CursorKind(251)
+
+# OpenMP target directive.
+CursorKind.OMP_TARGET_DIRECTIVE = CursorKind(252)
+
+# OpenMP teams directive.
+CursorKind.OMP_TEAMS_DIRECTIVE = CursorKind(253)
+
+# OpenMP taskgroup directive.
+CursorKind.OMP_TASKGROUP_DIRECTIVE = CursorKind(254)
+
+# OpenMP cancellation point directive.
+CursorKind.OMP_CANCELLATION_POINT_DIRECTIVE = CursorKind(255)
+
+# OpenMP cancel directive.
+CursorKind.OMP_CANCEL_DIRECTIVE = CursorKind(256)
+
+# OpenMP target data directive.
+CursorKind.OMP_TARGET_DATA_DIRECTIVE = CursorKind(257)
+
+# OpenMP taskloop directive.
+CursorKind.OMP_TASK_LOOP_DIRECTIVE = CursorKind(258)
+
+# OpenMP taskloop simd directive.
+CursorKind.OMP_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(259)
+
+# OpenMP distribute directive.
+CursorKind.OMP_DISTRIBUTE_DIRECTIVE = CursorKind(260)
+
+# OpenMP target enter data directive.
+CursorKind.OMP_TARGET_ENTER_DATA_DIRECTIVE = CursorKind(261)
+
+# OpenMP target exit data directive.
+CursorKind.OMP_TARGET_EXIT_DATA_DIRECTIVE = CursorKind(262)
+
+# OpenMP target parallel directive.
+CursorKind.OMP_TARGET_PARALLEL_DIRECTIVE = CursorKind(263)
+
+# OpenMP target parallel for directive.
+CursorKind.OMP_TARGET_PARALLELFOR_DIRECTIVE = CursorKind(264)
+
+# OpenMP target update directive.
+CursorKind.OMP_TARGET_UPDATE_DIRECTIVE = CursorKind(265)
+
+# OpenMP distribute parallel for directive.
+CursorKind.OMP_DISTRIBUTE_PARALLELFOR_DIRECTIVE = CursorKind(266)
+
+# OpenMP distribute parallel for simd directive.
+CursorKind.OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(267)
+
+# OpenMP distribute simd directive.
+CursorKind.OMP_DISTRIBUTE_SIMD_DIRECTIVE = CursorKind(268)
+
+# OpenMP target parallel for simd directive.
+CursorKind.OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(269)
+
+# OpenMP target simd directive.
+CursorKind.OMP_TARGET_SIMD_DIRECTIVE = CursorKind(270)
+
+# OpenMP teams distribute directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(271)
+
+###
+# Other Kinds
+
+# Cursor that represents the translation unit itself.
+#
+# The translation unit cursor exists primarily to act as the root cursor for
+# traversing the contents of a translation unit.
+CursorKind.TRANSLATION_UNIT = CursorKind(300)
+
+###
+# Attributes
+
+# An attribute whoe specific kind is note exposed via this interface
+CursorKind.UNEXPOSED_ATTR = CursorKind(400)
+
+CursorKind.IB_ACTION_ATTR = CursorKind(401)
+CursorKind.IB_OUTLET_ATTR = CursorKind(402)
+CursorKind.IB_OUTLET_COLLECTION_ATTR = CursorKind(403)
+
+CursorKind.CXX_FINAL_ATTR = CursorKind(404)
+CursorKind.CXX_OVERRIDE_ATTR = CursorKind(405)
+CursorKind.ANNOTATE_ATTR = CursorKind(406)
+CursorKind.ASM_LABEL_ATTR = CursorKind(407)
+CursorKind.PACKED_ATTR = CursorKind(408)
+CursorKind.PURE_ATTR = CursorKind(409)
+CursorKind.CONST_ATTR = CursorKind(410)
+CursorKind.NODUPLICATE_ATTR = CursorKind(411)
+CursorKind.CUDACONSTANT_ATTR = CursorKind(412)
+CursorKind.CUDADEVICE_ATTR = CursorKind(413)
+CursorKind.CUDAGLOBAL_ATTR = CursorKind(414)
+CursorKind.CUDAHOST_ATTR = CursorKind(415)
+CursorKind.CUDASHARED_ATTR = CursorKind(416)
+
+CursorKind.VISIBILITY_ATTR = CursorKind(417)
+
+CursorKind.DLLEXPORT_ATTR = CursorKind(418)
+CursorKind.DLLIMPORT_ATTR = CursorKind(419)
+
+###
+# Preprocessing
+CursorKind.PREPROCESSING_DIRECTIVE = CursorKind(500)
+CursorKind.MACRO_DEFINITION = CursorKind(501)
+CursorKind.MACRO_INSTANTIATION = CursorKind(502)
+CursorKind.INCLUSION_DIRECTIVE = CursorKind(503)
+
+###
+# Extra declaration
+
+# A module import declaration.
+CursorKind.MODULE_IMPORT_DECL = CursorKind(600)
+# A type alias template declaration
+CursorKind.TYPE_ALIAS_TEMPLATE_DECL = CursorKind(601)
+# A static_assert or _Static_assert node
+CursorKind.STATIC_ASSERT = CursorKind(602)
+# A friend declaration
+CursorKind.FRIEND_DECL = CursorKind(603)
+
+# A code completion overload candidate.
+CursorKind.OVERLOAD_CANDIDATE = CursorKind(700)
+
+### Template Argument Kinds ###
+class TemplateArgumentKind(BaseEnumeration):
+    """
+    A TemplateArgumentKind describes the kind of entity that a template argument
+    represents.
+    """
+
+    # The required BaseEnumeration declarations.
+    _kinds = []
+    _name_map = None
+
+TemplateArgumentKind.NULL = TemplateArgumentKind(0)
+TemplateArgumentKind.TYPE = TemplateArgumentKind(1)
+TemplateArgumentKind.DECLARATION = TemplateArgumentKind(2)
+TemplateArgumentKind.NULLPTR = TemplateArgumentKind(3)
+TemplateArgumentKind.INTEGRAL = TemplateArgumentKind(4)
+
+### Cursors ###
+
+class Cursor(Structure):
+    """
+    The Cursor class represents a reference to an element within the AST. It
+    acts as a kind of iterator.
+    """
+    _fields_ = [("_kind_id", c_int), ("xdata", c_int), ("data", c_void_p * 3)]
+
+    @staticmethod
+    def from_location(tu, location):
+        # We store a reference to the TU in the instance so the TU won't get
+        # collected before the cursor.
+        cursor = conf.lib.clang_getCursor(tu, location)
+        cursor._tu = tu
+
+        return cursor
+
+    def __eq__(self, other):
+        return conf.lib.clang_equalCursors(self, other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def is_definition(self):
+        """
+        Returns true if the declaration pointed at by the cursor is also a
+        definition of that entity.
+        """
+        return conf.lib.clang_isCursorDefinition(self)
+
+    def is_const_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared 'const'.
+        """
+        return conf.lib.clang_CXXMethod_isConst(self)
+
+    def is_converting_constructor(self):
+        """Returns True if the cursor refers to a C++ converting constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isConvertingConstructor(self)
+
+    def is_copy_constructor(self):
+        """Returns True if the cursor refers to a C++ copy constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isCopyConstructor(self)
+
+    def is_default_constructor(self):
+        """Returns True if the cursor refers to a C++ default constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isDefaultConstructor(self)
+
+    def is_move_constructor(self):
+        """Returns True if the cursor refers to a C++ move constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isMoveConstructor(self)
+
+    def is_default_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared '= default'.
+        """
+        return conf.lib.clang_CXXMethod_isDefaulted(self)
+
+    def is_mutable_field(self):
+        """Returns True if the cursor refers to a C++ field that is declared
+        'mutable'.
+        """
+        return conf.lib.clang_CXXField_isMutable(self)
+
+    def is_pure_virtual_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared pure virtual.
+        """
+        return conf.lib.clang_CXXMethod_isPureVirtual(self)
+
+    def is_static_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared 'static'.
+        """
+        return conf.lib.clang_CXXMethod_isStatic(self)
+
+    def is_virtual_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared 'virtual'.
+        """
+        return conf.lib.clang_CXXMethod_isVirtual(self)
+
+    def get_definition(self):
+        """
+        If the cursor is a reference to a declaration or a declaration of
+        some entity, return a cursor that points to the definition of that
+        entity.
+        """
+        # TODO: Should probably check that this is either a reference or
+        # declaration prior to issuing the lookup.
+        return conf.lib.clang_getCursorDefinition(self)
+
+    def get_usr(self):
+        """Return the Unified Symbol Resultion (USR) for the entity referenced
+        by the given cursor (or None).
+
+        A Unified Symbol Resolution (USR) is a string that identifies a
+        particular entity (function, class, variable, etc.) within a
+        program. USRs can be compared across translation units to determine,
+        e.g., when references in one translation refer to an entity defined in
+        another translation unit."""
+        return conf.lib.clang_getCursorUSR(self)
+
+    @property
+    def kind(self):
+        """Return the kind of this cursor."""
+        return CursorKind.from_id(self._kind_id)
+
+    @property
+    def spelling(self):
+        """Return the spelling of the entity pointed at by the cursor."""
+        if not hasattr(self, '_spelling'):
+            self._spelling = conf.lib.clang_getCursorSpelling(self)
+
+        return self._spelling
+
+    @property
+    def displayname(self):
+        """
+        Return the display name for the entity referenced by this cursor.
+
+        The display name contains extra information that helps identify the
+        cursor, such as the parameters of a function or template or the
+        arguments of a class template specialization.
+        """
+        if not hasattr(self, '_displayname'):
+            self._displayname = conf.lib.clang_getCursorDisplayName(self)
+
+        return self._displayname
+
+    @property
+    def mangled_name(self):
+        """Return the mangled name for the entity referenced by this cursor."""
+        if not hasattr(self, '_mangled_name'):
+            self._mangled_name = conf.lib.clang_Cursor_getMangling(self)
+
+        return self._mangled_name
+
+    @property
+    def location(self):
+        """
+        Return the source location (the starting character) of the entity
+        pointed at by the cursor.
+        """
+        if not hasattr(self, '_loc'):
+            self._loc = conf.lib.clang_getCursorLocation(self)
+
+        return self._loc
+
+    @property
+    def extent(self):
+        """
+        Return the source range (the range of text) occupied by the entity
+        pointed at by the cursor.
+        """
+        if not hasattr(self, '_extent'):
+            self._extent = conf.lib.clang_getCursorExtent(self)
+
+        return self._extent
+
+    @property
+    def storage_class(self):
+        """
+        Retrieves the storage class (if any) of the entity pointed at by the
+        cursor.
+        """
+        if not hasattr(self, '_storage_class'):
+            self._storage_class = conf.lib.clang_Cursor_getStorageClass(self)
+
+        return StorageClass.from_id(self._storage_class)
+
+    @property
+    def access_specifier(self):
+        """
+        Retrieves the access specifier (if any) of the entity pointed at by the
+        cursor.
+        """
+        if not hasattr(self, '_access_specifier'):
+            self._access_specifier = conf.lib.clang_getCXXAccessSpecifier(self)
+
+        return AccessSpecifier.from_id(self._access_specifier)
+
+    @property
+    def type(self):
+        """
+        Retrieve the Type (if any) of the entity pointed at by the cursor.
+        """
+        if not hasattr(self, '_type'):
+            self._type = conf.lib.clang_getCursorType(self)
+
+        return self._type
+
+    @property
+    def canonical(self):
+        """Return the canonical Cursor corresponding to this Cursor.
+
+        The canonical cursor is the cursor which is representative for the
+        underlying entity. For example, if you have multiple forward
+        declarations for the same class, the canonical cursor for the forward
+        declarations will be identical.
+        """
+        if not hasattr(self, '_canonical'):
+            self._canonical = conf.lib.clang_getCanonicalCursor(self)
+
+        return self._canonical
+
+    @property
+    def result_type(self):
+        """Retrieve the Type of the result for this Cursor."""
+        if not hasattr(self, '_result_type'):
+            self._result_type = conf.lib.clang_getResultType(self.type)
+
+        return self._result_type
+
+    @property
+    def underlying_typedef_type(self):
+        """Return the underlying type of a typedef declaration.
+
+        Returns a Type for the typedef this cursor is a declaration for. If
+        the current cursor is not a typedef, this raises.
+        """
+        if not hasattr(self, '_underlying_type'):
+            assert self.kind.is_declaration()
+            self._underlying_type = \
+              conf.lib.clang_getTypedefDeclUnderlyingType(self)
+
+        return self._underlying_type
+
+    @property
+    def enum_type(self):
+        """Return the integer type of an enum declaration.
+
+        Returns a Type corresponding to an integer. If the cursor is not for an
+        enum, this raises.
+        """
+        if not hasattr(self, '_enum_type'):
+            assert self.kind == CursorKind.ENUM_DECL
+            self._enum_type = conf.lib.clang_getEnumDeclIntegerType(self)
+
+        return self._enum_type
+
+    @property
+    def enum_value(self):
+        """Return the value of an enum constant."""
+        if not hasattr(self, '_enum_value'):
+            assert self.kind == CursorKind.ENUM_CONSTANT_DECL
+            # Figure out the underlying type of the enum to know if it
+            # is a signed or unsigned quantity.
+            underlying_type = self.type
+            if underlying_type.kind == TypeKind.ENUM:
+                underlying_type = underlying_type.get_declaration().enum_type
+            if underlying_type.kind in (TypeKind.CHAR_U,
+                                        TypeKind.UCHAR,
+                                        TypeKind.CHAR16,
+                                        TypeKind.CHAR32,
+                                        TypeKind.USHORT,
+                                        TypeKind.UINT,
+                                        TypeKind.ULONG,
+                                        TypeKind.ULONGLONG,
+                                        TypeKind.UINT128):
+                self._enum_value = \
+                  conf.lib.clang_getEnumConstantDeclUnsignedValue(self)
+            else:
+                self._enum_value = conf.lib.clang_getEnumConstantDeclValue(self)
+        return self._enum_value
+
+    @property
+    def objc_type_encoding(self):
+        """Return the Objective-C type encoding as a str."""
+        if not hasattr(self, '_objc_type_encoding'):
+            self._objc_type_encoding = \
+              conf.lib.clang_getDeclObjCTypeEncoding(self)
+
+        return self._objc_type_encoding
+
+    @property
+    def hash(self):
+        """Returns a hash of the cursor as an int."""
+        if not hasattr(self, '_hash'):
+            self._hash = conf.lib.clang_hashCursor(self)
+
+        return self._hash
+
+    @property
+    def semantic_parent(self):
+        """Return the semantic parent for this cursor."""
+        if not hasattr(self, '_semantic_parent'):
+            self._semantic_parent = conf.lib.clang_getCursorSemanticParent(self)
+
+        return self._semantic_parent
+
+    @property
+    def lexical_parent(self):
+        """Return the lexical parent for this cursor."""
+        if not hasattr(self, '_lexical_parent'):
+            self._lexical_parent = conf.lib.clang_getCursorLexicalParent(self)
+
+        return self._lexical_parent
+
+    @property
+    def translation_unit(self):
+        """Returns the TranslationUnit to which this Cursor belongs."""
+        # If this triggers an AttributeError, the instance was not properly
+        # created.
+        return self._tu
+
+    @property
+    def referenced(self):
+        """
+        For a cursor that is a reference, returns a cursor
+        representing the entity that it references.
+        """
+        if not hasattr(self, '_referenced'):
+            self._referenced = conf.lib.clang_getCursorReferenced(self)
+
+        return self._referenced
+
+    @property
+    def brief_comment(self):
+        """Returns the brief comment text associated with that Cursor"""
+        return conf.lib.clang_Cursor_getBriefCommentText(self)
+
+    @property
+    def raw_comment(self):
+        """Returns the raw comment text associated with that Cursor"""
+        return conf.lib.clang_Cursor_getRawCommentText(self)
+
+    def get_arguments(self):
+        """Return an iterator for accessing the arguments of this cursor."""
+        num_args = conf.lib.clang_Cursor_getNumArguments(self)
+        for i in range(0, num_args):
+            yield conf.lib.clang_Cursor_getArgument(self, i)
+
+    def get_num_template_arguments(self):
+        """Returns the number of template args associated with this cursor."""
+        return conf.lib.clang_Cursor_getNumTemplateArguments(self)
+
+    def get_template_argument_kind(self, num):
+        """Returns the TemplateArgumentKind for the indicated template
+        argument."""
+        return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num)
+
+    def get_template_argument_type(self, num):
+        """Returns the CXType for the indicated template argument."""
+        return conf.lib.clang_Cursor_getTemplateArgumentType(self, num)
+
+    def get_template_argument_value(self, num):
+        """Returns the value of the indicated arg as a signed 64b integer."""
+        return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num)
+
+    def get_template_argument_unsigned_value(self, num):
+        """Returns the value of the indicated arg as an unsigned 64b integer."""
+        return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num)
+
+    def get_children(self):
+        """Return an iterator for accessing the children of this cursor."""
+
+        # FIXME: Expose iteration from CIndex, PR6125.
+        def visitor(child, parent, children):
+            # FIXME: Document this assertion in API.
+            # FIXME: There should just be an isNull method.
+            assert child != conf.lib.clang_getNullCursor()
+
+            # Create reference to TU so it isn't GC'd before Cursor.
+            child._tu = self._tu
+            children.append(child)
+            return 1 # continue
+        children = []
+        conf.lib.clang_visitChildren(self, callbacks['cursor_visit'](visitor),
+            children)
+        return iter(children)
+
+    def walk_preorder(self):
+        """Depth-first preorder walk over the cursor and its descendants.
+
+        Yields cursors.
+        """
+        yield self
+        for child in self.get_children():
+            for descendant in child.walk_preorder():
+                yield descendant
+
+    def get_tokens(self):
+        """Obtain Token instances formulating that compose this Cursor.
+
+        This is a generator for Token instances. It returns all tokens which
+        occupy the extent this cursor occupies.
+        """
+        return TokenGroup.get_tokens(self._tu, self.extent)
+
+    def get_field_offsetof(self):
+        """Returns the offsetof the FIELD_DECL pointed by this Cursor."""
+        return conf.lib.clang_Cursor_getOffsetOfField(self)
+
+    def is_anonymous(self):
+        """
+        Check if the record is anonymous.
+        """
+        if self.kind == CursorKind.FIELD_DECL:
+            return self.type.get_declaration().is_anonymous()
+        return conf.lib.clang_Cursor_isAnonymous(self)
+
+    def is_bitfield(self):
+        """
+        Check if the field is a bitfield.
+        """
+        return conf.lib.clang_Cursor_isBitField(self)
+
+    def get_bitfield_width(self):
+        """
+        Retrieve the width of a bitfield.
+        """
+        return conf.lib.clang_getFieldDeclBitWidth(self)
+
+    @staticmethod
+    def from_result(res, fn, args):
+        assert isinstance(res, Cursor)
+        # FIXME: There should just be an isNull method.
+        if res == conf.lib.clang_getNullCursor():
+            return None
+
+        # Store a reference to the TU in the Python object so it won't get GC'd
+        # before the Cursor.
+        tu = None
+        for arg in args:
+            if isinstance(arg, TranslationUnit):
+                tu = arg
+                break
+
+            if hasattr(arg, 'translation_unit'):
+                tu = arg.translation_unit
+                break
+
+        assert tu is not None
+
+        res._tu = tu
+        return res
+
+    @staticmethod
+    def from_cursor_result(res, fn, args):
+        assert isinstance(res, Cursor)
+        if res == conf.lib.clang_getNullCursor():
+            return None
+
+        res._tu = args[0]._tu
+        return res
+
+class StorageClass(object):
+    """
+    Describes the storage class of a declaration
+    """
+
+    # The unique kind objects, index by id.
+    _kinds = []
+    _name_map = None
+
+    def __init__(self, value):
+        if value >= len(StorageClass._kinds):
+            StorageClass._kinds += [None] * (value - len(StorageClass._kinds) + 1)
+        if StorageClass._kinds[value] is not None:
+            raise ValueError('StorageClass already loaded')
+        self.value = value
+        StorageClass._kinds[value] = self
+        StorageClass._name_map = None
+
+    def from_param(self):
+        return self.value
+
+    @property
+    def name(self):
+        """Get the enumeration name of this storage class."""
+        if self._name_map is None:
+            self._name_map = {}
+            for key,value in list(StorageClass.__dict__.items()):
+                if isinstance(value,StorageClass):
+                    self._name_map[value] = key
+        return self._name_map[self]
+
+    @staticmethod
+    def from_id(id):
+        if id >= len(StorageClass._kinds) or not StorageClass._kinds[id]:
+            raise ValueError('Unknown storage class %d' % id)
+        return StorageClass._kinds[id]
+
+    def __repr__(self):
+        return 'StorageClass.%s' % (self.name,)
+
+StorageClass.INVALID = StorageClass(0)
+StorageClass.NONE = StorageClass(1)
+StorageClass.EXTERN = StorageClass(2)
+StorageClass.STATIC = StorageClass(3)
+StorageClass.PRIVATEEXTERN = StorageClass(4)
+StorageClass.OPENCLWORKGROUPLOCAL = StorageClass(5)
+StorageClass.AUTO = StorageClass(6)
+StorageClass.REGISTER = StorageClass(7)
+
+
+### C++ access specifiers ###
+
+class AccessSpecifier(BaseEnumeration):
+    """
+    Describes the access of a C++ class member
+    """
+
+    # The unique kind objects, index by id.
+    _kinds = []
+    _name_map = None
+
+    def from_param(self):
+        return self.value
+
+    def __repr__(self):
+        return 'AccessSpecifier.%s' % (self.name,)
+
+AccessSpecifier.INVALID = AccessSpecifier(0)
+AccessSpecifier.PUBLIC = AccessSpecifier(1)
+AccessSpecifier.PROTECTED = AccessSpecifier(2)
+AccessSpecifier.PRIVATE = AccessSpecifier(3)
+AccessSpecifier.NONE = AccessSpecifier(4)
+
+### Type Kinds ###
+
+class TypeKind(BaseEnumeration):
+    """
+    Describes the kind of type.
+    """
+
+    # The unique kind objects, indexed by id.
+    _kinds = []
+    _name_map = None
+
+    @property
+    def spelling(self):
+        """Retrieve the spelling of this TypeKind."""
+        return conf.lib.clang_getTypeKindSpelling(self.value)
+
+    def __repr__(self):
+        return 'TypeKind.%s' % (self.name,)
+
+TypeKind.INVALID = TypeKind(0)
+TypeKind.UNEXPOSED = TypeKind(1)
+TypeKind.VOID = TypeKind(2)
+TypeKind.BOOL = TypeKind(3)
+TypeKind.CHAR_U = TypeKind(4)
+TypeKind.UCHAR = TypeKind(5)
+TypeKind.CHAR16 = TypeKind(6)
+TypeKind.CHAR32 = TypeKind(7)
+TypeKind.USHORT = TypeKind(8)
+TypeKind.UINT = TypeKind(9)
+TypeKind.ULONG = TypeKind(10)
+TypeKind.ULONGLONG = TypeKind(11)
+TypeKind.UINT128 = TypeKind(12)
+TypeKind.CHAR_S = TypeKind(13)
+TypeKind.SCHAR = TypeKind(14)
+TypeKind.WCHAR = TypeKind(15)
+TypeKind.SHORT = TypeKind(16)
+TypeKind.INT = TypeKind(17)
+TypeKind.LONG = TypeKind(18)
+TypeKind.LONGLONG = TypeKind(19)
+TypeKind.INT128 = TypeKind(20)
+TypeKind.FLOAT = TypeKind(21)
+TypeKind.DOUBLE = TypeKind(22)
+TypeKind.LONGDOUBLE = TypeKind(23)
+TypeKind.NULLPTR = TypeKind(24)
+TypeKind.OVERLOAD = TypeKind(25)
+TypeKind.DEPENDENT = TypeKind(26)
+TypeKind.OBJCID = TypeKind(27)
+TypeKind.OBJCCLASS = TypeKind(28)
+TypeKind.OBJCSEL = TypeKind(29)
+TypeKind.FLOAT128 = TypeKind(30)
+TypeKind.HALF = TypeKind(31)
+TypeKind.COMPLEX = TypeKind(100)
+TypeKind.POINTER = TypeKind(101)
+TypeKind.BLOCKPOINTER = TypeKind(102)
+TypeKind.LVALUEREFERENCE = TypeKind(103)
+TypeKind.RVALUEREFERENCE = TypeKind(104)
+TypeKind.RECORD = TypeKind(105)
+TypeKind.ENUM = TypeKind(106)
+TypeKind.TYPEDEF = TypeKind(107)
+TypeKind.OBJCINTERFACE = TypeKind(108)
+TypeKind.OBJCOBJECTPOINTER = TypeKind(109)
+TypeKind.FUNCTIONNOPROTO = TypeKind(110)
+TypeKind.FUNCTIONPROTO = TypeKind(111)
+TypeKind.CONSTANTARRAY = TypeKind(112)
+TypeKind.VECTOR = TypeKind(113)
+TypeKind.INCOMPLETEARRAY = TypeKind(114)
+TypeKind.VARIABLEARRAY = TypeKind(115)
+TypeKind.DEPENDENTSIZEDARRAY = TypeKind(116)
+TypeKind.MEMBERPOINTER = TypeKind(117)
+TypeKind.AUTO = TypeKind(118)
+TypeKind.ELABORATED = TypeKind(119)
+
+class RefQualifierKind(BaseEnumeration):
+    """Describes a specific ref-qualifier of a type."""
+
+    # The unique kind objects, indexed by id.
+    _kinds = []
+    _name_map = None
+
+    def from_param(self):
+        return self.value
+
+    def __repr__(self):
+        return 'RefQualifierKind.%s' % (self.name,)
+
+RefQualifierKind.NONE = RefQualifierKind(0)
+RefQualifierKind.LVALUE = RefQualifierKind(1)
+RefQualifierKind.RVALUE = RefQualifierKind(2)
+
+class Type(Structure):
+    """
+    The type of an element in the abstract syntax tree.
+    """
+    _fields_ = [("_kind_id", c_int), ("data", c_void_p * 2)]
+
+    @property
+    def kind(self):
+        """Return the kind of this type."""
+        return TypeKind.from_id(self._kind_id)
+
+    def argument_types(self):
+        """Retrieve a container for the non-variadic arguments for this type.
+
+        The returned object is iterable and indexable. Each item in the
+        container is a Type instance.
+        """
+        class ArgumentsIterator(collections.Sequence):
+            def __init__(self, parent):
+                self.parent = parent
+                self.length = None
+
+            def __len__(self):
+                if self.length is None:
+                    self.length = conf.lib.clang_getNumArgTypes(self.parent)
+
+                return self.length
+
+            def __getitem__(self, key):
+                # FIXME Support slice objects.
+                if not isinstance(key, int):
+                    raise TypeError("Must supply a non-negative int.")
+
+                if key < 0:
+                    raise IndexError("Only non-negative indexes are accepted.")
+
+                if key >= len(self):
+                    raise IndexError("Index greater than container length: "
+                                     "%d > %d" % ( key, len(self) ))
+
+                result = conf.lib.clang_getArgType(self.parent, key)
+                if result.kind == TypeKind.INVALID:
+                    raise IndexError("Argument could not be retrieved.")
+
+                return result
+
+        assert self.kind == TypeKind.FUNCTIONPROTO
+        return ArgumentsIterator(self)
+
+    @property
+    def element_type(self):
+        """Retrieve the Type of elements within this Type.
+
+        If accessed on a type that is not an array, complex, or vector type, an
+        exception will be raised.
+        """
+        result = conf.lib.clang_getElementType(self)
+        if result.kind == TypeKind.INVALID:
+            raise Exception('Element type not available on this type.')
+
+        return result
+
+    @property
+    def element_count(self):
+        """Retrieve the number of elements in this type.
+
+        Returns an int.
+
+        If the Type is not an array or vector, this raises.
+        """
+        result = conf.lib.clang_getNumElements(self)
+        if result < 0:
+            raise Exception('Type does not have elements.')
+
+        return result
+
+    @property
+    def translation_unit(self):
+        """The TranslationUnit to which this Type is associated."""
+        # If this triggers an AttributeError, the instance was not properly
+        # instantiated.
+        return self._tu
+
+    @staticmethod
+    def from_result(res, fn, args):
+        assert isinstance(res, Type)
+
+        tu = None
+        for arg in args:
+            if hasattr(arg, 'translation_unit'):
+                tu = arg.translation_unit
+                break
+
+        assert tu is not None
+        res._tu = tu
+
+        return res
+
+    def get_canonical(self):
+        """
+        Return the canonical type for a Type.
+
+        Clang's type system explicitly models typedefs and all the
+        ways a specific type can be represented.  The canonical type
+        is the underlying type with all the "sugar" removed.  For
+        example, if 'T' is a typedef for 'int', the canonical type for
+        'T' would be 'int'.
+        """
+        return conf.lib.clang_getCanonicalType(self)
+
+    def is_const_qualified(self):
+        """Determine whether a Type has the "const" qualifier set.
+
+        This does not look through typedefs that may have added "const"
+        at a different level.
+        """
+        return conf.lib.clang_isConstQualifiedType(self)
+
+    def is_volatile_qualified(self):
+        """Determine whether a Type has the "volatile" qualifier set.
+
+        This does not look through typedefs that may have added "volatile"
+        at a different level.
+        """
+        return conf.lib.clang_isVolatileQualifiedType(self)
+
+    def is_restrict_qualified(self):
+        """Determine whether a Type has the "restrict" qualifier set.
+
+        This does not look through typedefs that may have added "restrict" at
+        a different level.
+        """
+        return conf.lib.clang_isRestrictQualifiedType(self)
+
+    def is_function_variadic(self):
+        """Determine whether this function Type is a variadic function type."""
+        assert self.kind == TypeKind.FUNCTIONPROTO
+
+        return conf.lib.clang_isFunctionTypeVariadic(self)
+
+    def is_pod(self):
+        """Determine whether this Type represents plain old data (POD)."""
+        return conf.lib.clang_isPODType(self)
+
+    def get_pointee(self):
+        """
+        For pointer types, returns the type of the pointee.
+        """
+        return conf.lib.clang_getPointeeType(self)
+
+    def get_declaration(self):
+        """
+        Return the cursor for the declaration of the given type.
+        """
+        return conf.lib.clang_getTypeDeclaration(self)
+
+    def get_result(self):
+        """
+        Retrieve the result type associated with a function type.
+        """
+        return conf.lib.clang_getResultType(self)
+
+    def get_array_element_type(self):
+        """
+        Retrieve the type of the elements of the array type.
+        """
+        return conf.lib.clang_getArrayElementType(self)
+
+    def get_array_size(self):
+        """
+        Retrieve the size of the constant array.
+        """
+        return conf.lib.clang_getArraySize(self)
+
+    def get_class_type(self):
+        """
+        Retrieve the class type of the member pointer type.
+        """
+        return conf.lib.clang_Type_getClassType(self)
+
+    def get_named_type(self):
+        """
+        Retrieve the type named by the qualified-id.
+        """
+        return conf.lib.clang_Type_getNamedType(self)
+    def get_align(self):
+        """
+        Retrieve the alignment of the record.
+        """
+        return conf.lib.clang_Type_getAlignOf(self)
+
+    def get_size(self):
+        """
+        Retrieve the size of the record.
+        """
+        return conf.lib.clang_Type_getSizeOf(self)
+
+    def get_offset(self, fieldname):
+        """
+        Retrieve the offset of a field in the record.
+        """
+        return conf.lib.clang_Type_getOffsetOf(self, c_char_p(fieldname))
+
+    def get_ref_qualifier(self):
+        """
+        Retrieve the ref-qualifier of the type.
+        """
+        return RefQualifierKind.from_id(
+                conf.lib.clang_Type_getCXXRefQualifier(self))
+
+    def get_fields(self):
+        """Return an iterator for accessing the fields of this type."""
+
+        def visitor(field, children):
+            assert field != conf.lib.clang_getNullCursor()
+
+            # Create reference to TU so it isn't GC'd before Cursor.
+            field._tu = self._tu
+            fields.append(field)
+            return 1 # continue
+        fields = []
+        conf.lib.clang_Type_visitFields(self,
+                            callbacks['fields_visit'](visitor), fields)
+        return iter(fields)
+
+    @property
+    def spelling(self):
+        """Retrieve the spelling of this Type."""
+        return conf.lib.clang_getTypeSpelling(self)
+
+    def __eq__(self, other):
+        if type(other) != type(self):
+            return False
+
+        return conf.lib.clang_equalTypes(self, other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+## CIndex Objects ##
+
+# CIndex objects (derived from ClangObject) are essentially lightweight
+# wrappers attached to some underlying object, which is exposed via CIndex as
+# a void*.
+
+class ClangObject(object):
+    """
+    A helper for Clang objects. This class helps act as an intermediary for
+    the ctypes library and the Clang CIndex library.
+    """
+    def __init__(self, obj):
+        assert isinstance(obj, c_object_p) and obj
+        self.obj = self._as_parameter_ = obj
+
+    def from_param(self):
+        return self._as_parameter_
+
+
+class _CXUnsavedFile(Structure):
+    """Helper for passing unsaved file arguments."""
+    _fields_ = [("name", c_char_p), ("contents", c_char_p), ('length', c_ulong)]
+
+# Functions calls through the python interface are rather slow. Fortunately,
+# for most symboles, we do not need to perform a function call. Their spelling
+# never changes and is consequently provided by this spelling cache.
+SpellingCache = {
+            # 0: CompletionChunk.Kind("Optional"),
+            # 1: CompletionChunk.Kind("TypedText"),
+            # 2: CompletionChunk.Kind("Text"),
+            # 3: CompletionChunk.Kind("Placeholder"),
+            # 4: CompletionChunk.Kind("Informative"),
+            # 5 : CompletionChunk.Kind("CurrentParameter"),
+            6: '(',   # CompletionChunk.Kind("LeftParen"),
+            7: ')',   # CompletionChunk.Kind("RightParen"),
+            8: '[',   # CompletionChunk.Kind("LeftBracket"),
+            9: ']',   # CompletionChunk.Kind("RightBracket"),
+            10: '{',  # CompletionChunk.Kind("LeftBrace"),
+            11: '}',  # CompletionChunk.Kind("RightBrace"),
+            12: '<',  # CompletionChunk.Kind("LeftAngle"),
+            13: '>',  # CompletionChunk.Kind("RightAngle"),
+            14: ', ', # CompletionChunk.Kind("Comma"),
+            # 15: CompletionChunk.Kind("ResultType"),
+            16: ':',  # CompletionChunk.Kind("Colon"),
+            17: ';',  # CompletionChunk.Kind("SemiColon"),
+            18: '=',  # CompletionChunk.Kind("Equal"),
+            19: ' ',  # CompletionChunk.Kind("HorizontalSpace"),
+            # 20: CompletionChunk.Kind("VerticalSpace")
+}
+
+class CompletionChunk:
+    class Kind:
+        def __init__(self, name):
+            self.name = name
+
+        def __str__(self):
+            return self.name
+
+        def __repr__(self):
+            return "<ChunkKind: %s>" % self
+
+    def __init__(self, completionString, key):
+        self.cs = completionString
+        self.key = key
+        self.__kindNumberCache = -1
+
+    def __repr__(self):
+        return "{'" + self.spelling + "', " + str(self.kind) + "}"
+
+    @CachedProperty
+    def spelling(self):
+        if self.__kindNumber in SpellingCache:
+                return SpellingCache[self.__kindNumber]
+        return conf.lib.clang_getCompletionChunkText(self.cs, self.key).spelling
+
+    # We do not use @CachedProperty here, as the manual implementation is
+    # apparently still significantly faster. Please profile carefully if you
+    # would like to add CachedProperty back.
+    @property
+    def __kindNumber(self):
+        if self.__kindNumberCache == -1:
+            self.__kindNumberCache = \
+                conf.lib.clang_getCompletionChunkKind(self.cs, self.key)
+        return self.__kindNumberCache
+
+    @CachedProperty
+    def kind(self):
+        return completionChunkKindMap[self.__kindNumber]
+
+    @CachedProperty
+    def string(self):
+        res = conf.lib.clang_getCompletionChunkCompletionString(self.cs,
+                                                                self.key)
+
+        if (res):
+          return CompletionString(res)
+        else:
+          None
+
+    def isKindOptional(self):
+      return self.__kindNumber == 0
+
+    def isKindTypedText(self):
+      return self.__kindNumber == 1
+
+    def isKindPlaceHolder(self):
+      return self.__kindNumber == 3
+
+    def isKindInformative(self):
+      return self.__kindNumber == 4
+
+    def isKindResultType(self):
+      return self.__kindNumber == 15
+
+completionChunkKindMap = {
+            0: CompletionChunk.Kind("Optional"),
+            1: CompletionChunk.Kind("TypedText"),
+            2: CompletionChunk.Kind("Text"),
+            3: CompletionChunk.Kind("Placeholder"),
+            4: CompletionChunk.Kind("Informative"),
+            5: CompletionChunk.Kind("CurrentParameter"),
+            6: CompletionChunk.Kind("LeftParen"),
+            7: CompletionChunk.Kind("RightParen"),
+            8: CompletionChunk.Kind("LeftBracket"),
+            9: CompletionChunk.Kind("RightBracket"),
+            10: CompletionChunk.Kind("LeftBrace"),
+            11: CompletionChunk.Kind("RightBrace"),
+            12: CompletionChunk.Kind("LeftAngle"),
+            13: CompletionChunk.Kind("RightAngle"),
+            14: CompletionChunk.Kind("Comma"),
+            15: CompletionChunk.Kind("ResultType"),
+            16: CompletionChunk.Kind("Colon"),
+            17: CompletionChunk.Kind("SemiColon"),
+            18: CompletionChunk.Kind("Equal"),
+            19: CompletionChunk.Kind("HorizontalSpace"),
+            20: CompletionChunk.Kind("VerticalSpace")}
+
+class CompletionString(ClangObject):
+    class Availability:
+        def __init__(self, name):
+            self.name = name
+
+        def __str__(self):
+            return self.name
+
+        def __repr__(self):
+            return "<Availability: %s>" % self
+
+    def __len__(self):
+        return self.num_chunks
+
+    @CachedProperty
+    def num_chunks(self):
+        return conf.lib.clang_getNumCompletionChunks(self.obj)
+
+    def __getitem__(self, key):
+        if self.num_chunks <= key:
+            raise IndexError
+        return CompletionChunk(self.obj, key)
+
+    @property
+    def priority(self):
+        return conf.lib.clang_getCompletionPriority(self.obj)
+
+    @property
+    def availability(self):
+        res = conf.lib.clang_getCompletionAvailability(self.obj)
+        return availabilityKinds[res]
+
+    @property
+    def briefComment(self):
+        if conf.function_exists("clang_getCompletionBriefComment"):
+            return conf.lib.clang_getCompletionBriefComment(self.obj)
+        return _CXString()
+
+    def __repr__(self):
+        return " | ".join([str(a) for a in self]) \
+               + " || Priority: " + str(self.priority) \
+               + " || Availability: " + str(self.availability) \
+               + " || Brief comment: " + str(self.briefComment.spelling)
+
+availabilityKinds = {
+            0: CompletionChunk.Kind("Available"),
+            1: CompletionChunk.Kind("Deprecated"),
+            2: CompletionChunk.Kind("NotAvailable"),
+            3: CompletionChunk.Kind("NotAccessible")}
+
+class CodeCompletionResult(Structure):
+    _fields_ = [('cursorKind', c_int), ('completionString', c_object_p)]
+
+    def __repr__(self):
+        return str(CompletionString(self.completionString))
+
+    @property
+    def kind(self):
+        return CursorKind.from_id(self.cursorKind)
+
+    @property
+    def string(self):
+        return CompletionString(self.completionString)
+
+class CCRStructure(Structure):
+    _fields_ = [('results', POINTER(CodeCompletionResult)),
+                ('numResults', c_int)]
+
+    def __len__(self):
+        return self.numResults
+
+    def __getitem__(self, key):
+        if len(self) <= key:
+            raise IndexError
+
+        return self.results[key]
+
+class CodeCompletionResults(ClangObject):
+    def __init__(self, ptr):
+        assert isinstance(ptr, POINTER(CCRStructure)) and ptr
+        self.ptr = self._as_parameter_ = ptr
+
+    def from_param(self):
+        return self._as_parameter_
+
+    def __del__(self):
+        conf.lib.clang_disposeCodeCompleteResults(self)
+
+    @property
+    def results(self):
+        return self.ptr.contents
+
+    @property
+    def diagnostics(self):
+        class DiagnosticsItr:
+            def __init__(self, ccr):
+                self.ccr= ccr
+
+            def __len__(self):
+                return int(\
+                  conf.lib.clang_codeCompleteGetNumDiagnostics(self.ccr))
+
+            def __getitem__(self, key):
+                return conf.lib.clang_codeCompleteGetDiagnostic(self.ccr, key)
+
+        return DiagnosticsItr(self)
+
+
+class Index(ClangObject):
+    """
+    The Index type provides the primary interface to the Clang CIndex library,
+    primarily by providing an interface for reading and parsing translation
+    units.
+    """
+
+    @staticmethod
+    def create(excludeDecls=False):
+        """
+        Create a new Index.
+        Parameters:
+        excludeDecls -- Exclude local declarations from translation units.
+        """
+        return Index(conf.lib.clang_createIndex(excludeDecls, 0))
+
+    def __del__(self):
+        conf.lib.clang_disposeIndex(self)
+
+    def read(self, path):
+        """Load a TranslationUnit from the given AST file."""
+        return TranslationUnit.from_ast_file(path, self)
+
+    def parse(self, path, args=None, unsaved_files=None, options = 0):
+        """Load the translation unit from the given source code file by running
+        clang and generating the AST before loading. Additional command line
+        parameters can be passed to clang via the args parameter.
+
+        In-memory contents for files can be provided by passing a list of pairs
+        to as unsaved_files, the first item should be the filenames to be mapped
+        and the second should be the contents to be substituted for the
+        file. The contents may be passed as strings or file objects.
+
+        If an error was encountered during parsing, a TranslationUnitLoadError
+        will be raised.
+        """
+        return TranslationUnit.from_source(path, args, unsaved_files, options,
+                                           self)
+
+class TranslationUnit(ClangObject):
+    """Represents a source code translation unit.
+
+    This is one of the main types in the API. Any time you wish to interact
+    with Clang's representation of a source file, you typically start with a
+    translation unit.
+    """
+
+    # Default parsing mode.
+    PARSE_NONE = 0
+
+    # Instruct the parser to create a detailed processing record containing
+    # metadata not normally retained.
+    PARSE_DETAILED_PROCESSING_RECORD = 1
+
+    # Indicates that the translation unit is incomplete. This is typically used
+    # when parsing headers.
+    PARSE_INCOMPLETE = 2
+
+    # Instruct the parser to create a pre-compiled preamble for the translation
+    # unit. This caches the preamble (included files at top of source file).
+    # This is useful if the translation unit will be reparsed and you don't
+    # want to incur the overhead of reparsing the preamble.
+    PARSE_PRECOMPILED_PREAMBLE = 4
+
+    # Cache code completion information on parse. This adds time to parsing but
+    # speeds up code completion.
+    PARSE_CACHE_COMPLETION_RESULTS = 8
+
+    # Flags with values 16 and 32 are deprecated and intentionally omitted.
+
+    # Do not parse function bodies. This is useful if you only care about
+    # searching for declarations/definitions.
+    PARSE_SKIP_FUNCTION_BODIES = 64
+
+    # Used to indicate that brief documentation comments should be included
+    # into the set of code completions returned from this translation unit.
+    PARSE_INCLUDE_BRIEF_COMMENTS_IN_CODE_COMPLETION = 128
+
+    @classmethod
+    def from_source(cls, filename, args=None, unsaved_files=None, options=0,
+                    index=None):
+        """Create a TranslationUnit by parsing source.
+
+        This is capable of processing source code both from files on the
+        filesystem as well as in-memory contents.
+
+        Command-line arguments that would be passed to clang are specified as
+        a list via args. These can be used to specify include paths, warnings,
+        etc. e.g. ["-Wall", "-I/path/to/include"].
+
+        In-memory file content can be provided via unsaved_files. This is an
+        iterable of 2-tuples. The first element is the str filename. The
+        second element defines the content. Content can be provided as str
+        source code or as file objects (anything with a read() method). If
+        a file object is being used, content will be read until EOF and the
+        read cursor will not be reset to its original position.
+
+        options is a bitwise or of TranslationUnit.PARSE_XXX flags which will
+        control parsing behavior.
+
+        index is an Index instance to utilize. If not provided, a new Index
+        will be created for this TranslationUnit.
+
+        To parse source from the filesystem, the filename of the file to parse
+        is specified by the filename argument. Or, filename could be None and
+        the args list would contain the filename(s) to parse.
+
+        To parse source from an in-memory buffer, set filename to the virtual
+        filename you wish to associate with this source (e.g. "test.c"). The
+        contents of that file are then provided in unsaved_files.
+
+        If an error occurs, a TranslationUnitLoadError is raised.
+
+        Please note that a TranslationUnit with parser errors may be returned.
+        It is the caller's responsibility to check tu.diagnostics for errors.
+
+        Also note that Clang infers the source language from the extension of
+        the input filename. If you pass in source code containing a C++ class
+        declaration with the filename "test.c" parsing will fail.
+        """
+        if args is None:
+            args = []
+
+        if unsaved_files is None:
+            unsaved_files = []
+
+        if index is None:
+            index = Index.create()
+
+        if isinstance(filename, str):
+            filename = filename.encode('utf8')
+
+        args_length = len(args)
+        if args_length > 0:
+            args = (arg.encode('utf8') if isinstance(arg, str) else arg
+                    for arg in args)
+            args_array = (c_char_p * args_length)(* args)
+
+        unsaved_array = None
+        if len(unsaved_files) > 0:
+            unsaved_array = (_CXUnsavedFile * len(unsaved_files))()
+            for i, (name, contents) in enumerate(unsaved_files):
+                if hasattr(contents, "read"):
+                    contents = contents.read()
+
+                unsaved_array[i].name = name
+                unsaved_array[i].contents = contents
+                unsaved_array[i].length = len(contents)
+
+        ptr = conf.lib.clang_parseTranslationUnit(index, filename, args_array,
+                                    args_length, unsaved_array,
+                                    len(unsaved_files), options)
+
+        if not ptr:
+            raise TranslationUnitLoadError("Error parsing translation unit.")
+
+        return cls(ptr, index=index)
+
+    @classmethod
+    def from_ast_file(cls, filename, index=None):
+        """Create a TranslationUnit instance from a saved AST file.
+
+        A previously-saved AST file (provided with -emit-ast or
+        TranslationUnit.save()) is loaded from the filename specified.
+
+        If the file cannot be loaded, a TranslationUnitLoadError will be
+        raised.
+
+        index is optional and is the Index instance to use. If not provided,
+        a default Index will be created.
+        """
+        if index is None:
+            index = Index.create()
+
+        ptr = conf.lib.clang_createTranslationUnit(index, filename)
+        if not ptr:
+            raise TranslationUnitLoadError(filename)
+
+        return cls(ptr=ptr, index=index)
+
+    def __init__(self, ptr, index):
+        """Create a TranslationUnit instance.
+
+        TranslationUnits should be created using one of the from_* @classmethod
+        functions above. __init__ is only called internally.
+        """
+        assert isinstance(index, Index)
+        self.index = index
+        ClangObject.__init__(self, ptr)
+
+    def __del__(self):
+        conf.lib.clang_disposeTranslationUnit(self)
+
+    @property
+    def cursor(self):
+        """Retrieve the cursor that represents the given translation unit."""
+        return conf.lib.clang_getTranslationUnitCursor(self)
+
+    @property
+    def spelling(self):
+        """Get the original translation unit source file name."""
+        return conf.lib.clang_getTranslationUnitSpelling(self)
+
+    def get_includes(self):
+        """
+        Return an iterable sequence of FileInclusion objects that describe the
+        sequence of inclusions in a translation unit. The first object in
+        this sequence is always the input file. Note that this method will not
+        recursively iterate over header files included through precompiled
+        headers.
+        """
+        def visitor(fobj, lptr, depth, includes):
+            if depth > 0:
+                loc = lptr.contents
+                includes.append(FileInclusion(loc.file, File(fobj), loc, depth))
+
+        # Automatically adapt CIndex/ctype pointers to python objects
+        includes = []
+        conf.lib.clang_getInclusions(self,
+                callbacks['translation_unit_includes'](visitor), includes)
+
+        return iter(includes)
+
+    def get_file(self, filename):
+        """Obtain a File from this translation unit."""
+
+        return File.from_name(self, filename)
+
+    def get_location(self, filename, position):
+        """Obtain a SourceLocation for a file in this translation unit.
+
+        The position can be specified by passing:
+
+          - Integer file offset. Initial file offset is 0.
+          - 2-tuple of (line number, column number). Initial file position is
+            (0, 0)
+        """
+        f = self.get_file(filename)
+
+        if isinstance(position, int):
+            return SourceLocation.from_offset(self, f, position)
+
+        return SourceLocation.from_position(self, f, position[0], position[1])
+
+    def get_extent(self, filename, locations):
+        """Obtain a SourceRange from this translation unit.
+
+        The bounds of the SourceRange must ultimately be defined by a start and
+        end SourceLocation. For the locations argument, you can pass:
+
+          - 2 SourceLocation instances in a 2-tuple or list.
+          - 2 int file offsets via a 2-tuple or list.
+          - 2 2-tuple or lists of (line, column) pairs in a 2-tuple or list.
+
+        e.g.
+
+        get_extent('foo.c', (5, 10))
+        get_extent('foo.c', ((1, 1), (1, 15)))
+        """
+        f = self.get_file(filename)
+
+        if len(locations) < 2:
+            raise Exception('Must pass object with at least 2 elements')
+
+        start_location, end_location = locations
+
+        if hasattr(start_location, '__len__'):
+            start_location = SourceLocation.from_position(self, f,
+                start_location[0], start_location[1])
+        elif isinstance(start_location, int):
+            start_location = SourceLocation.from_offset(self, f,
+                start_location)
+
+        if hasattr(end_location, '__len__'):
+            end_location = SourceLocation.from_position(self, f,
+                end_location[0], end_location[1])
+        elif isinstance(end_location, int):
+            end_location = SourceLocation.from_offset(self, f, end_location)
+
+        assert isinstance(start_location, SourceLocation)
+        assert isinstance(end_location, SourceLocation)
+
+        return SourceRange.from_locations(start_location, end_location)
+
+    @property
+    def diagnostics(self):
+        """
+        Return an iterable (and indexable) object containing the diagnostics.
+        """
+        class DiagIterator:
+            def __init__(self, tu):
+                self.tu = tu
+
+            def __len__(self):
+                return int(conf.lib.clang_getNumDiagnostics(self.tu))
+
+            def __getitem__(self, key):
+                diag = conf.lib.clang_getDiagnostic(self.tu, key)
+                if not diag:
+                    raise IndexError
+                return Diagnostic(diag)
+
+        return DiagIterator(self)
+
+    def reparse(self, unsaved_files=None, options=0):
+        """
+        Reparse an already parsed translation unit.
+
+        In-memory contents for files can be provided by passing a list of pairs
+        as unsaved_files, the first items should be the filenames to be mapped
+        and the second should be the contents to be substituted for the
+        file. The contents may be passed as strings or file objects.
+        """
+        if unsaved_files is None:
+            unsaved_files = []
+
+        unsaved_files_array = 0
+        if len(unsaved_files):
+            unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
+            for i,(name,value) in enumerate(unsaved_files):
+                if not isinstance(value, str):
+                    # FIXME: It would be great to support an efficient version
+                    # of this, one day.
+                    value = value.read()
+                    print(value)
+                if not isinstance(value, str):
+                    raise TypeError('Unexpected unsaved file contents.')
+                unsaved_files_array[i].name = name
+                unsaved_files_array[i].contents = value
+                unsaved_files_array[i].length = len(value)
+        ptr = conf.lib.clang_reparseTranslationUnit(self, len(unsaved_files),
+                unsaved_files_array, options)
+
+    def save(self, filename):
+        """Saves the TranslationUnit to a file.
+
+        This is equivalent to passing -emit-ast to the clang frontend. The
+        saved file can be loaded back into a TranslationUnit. Or, if it
+        corresponds to a header, it can be used as a pre-compiled header file.
+
+        If an error occurs while saving, a TranslationUnitSaveError is raised.
+        If the error was TranslationUnitSaveError.ERROR_INVALID_TU, this means
+        the constructed TranslationUnit was not valid at time of save. In this
+        case, the reason(s) why should be available via
+        TranslationUnit.diagnostics().
+
+        filename -- The path to save the translation unit to.
+        """
+        options = conf.lib.clang_defaultSaveOptions(self)
+        result = int(conf.lib.clang_saveTranslationUnit(self, filename,
+                                                        options))
+        if result != 0:
+            raise TranslationUnitSaveError(result,
+                'Error saving TranslationUnit.')
+
+    def codeComplete(self, path, line, column, unsaved_files=None,
+                     include_macros=False, include_code_patterns=False,
+                     include_brief_comments=False):
+        """
+        Code complete in this translation unit.
+
+        In-memory contents for files can be provided by passing a list of pairs
+        as unsaved_files, the first items should be the filenames to be mapped
+        and the second should be the contents to be substituted for the
+        file. The contents may be passed as strings or file objects.
+        """
+        options = 0
+
+        if include_macros:
+            options += 1
+
+        if include_code_patterns:
+            options += 2
+
+        if include_brief_comments:
+            options += 4
+
+        if unsaved_files is None:
+            unsaved_files = []
+
+        unsaved_files_array = 0
+        if len(unsaved_files):
+            unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
+            for i,(name,value) in enumerate(unsaved_files):
+                if not isinstance(value, str):
+                    # FIXME: It would be great to support an efficient version
+                    # of this, one day.
+                    value = value.read()
+                    print(value)
+                if not isinstance(value, str):
+                    raise TypeError('Unexpected unsaved file contents.')
+                unsaved_files_array[i].name = name
+                unsaved_files_array[i].contents = value
+                unsaved_files_array[i].length = len(value)
+        ptr = conf.lib.clang_codeCompleteAt(self, path, line, column,
+                unsaved_files_array, len(unsaved_files), options)
+        if ptr:
+            return CodeCompletionResults(ptr)
+        return None
+
+    def get_tokens(self, locations=None, extent=None):
+        """Obtain tokens in this translation unit.
+
+        This is a generator for Token instances. The caller specifies a range
+        of source code to obtain tokens for. The range can be specified as a
+        2-tuple of SourceLocation or as a SourceRange. If both are defined,
+        behavior is undefined.
+        """
+        if locations is not None:
+            extent = SourceRange(start=locations[0], end=locations[1])
+
+        return TokenGroup.get_tokens(self, extent)
+
+class File(ClangObject):
+    """
+    The File class represents a particular source file that is part of a
+    translation unit.
+    """
+
+    @staticmethod
+    def from_name(translation_unit, file_name):
+        """Retrieve a file handle within the given translation unit."""
+        return File(conf.lib.clang_getFile(translation_unit, file_name))
+
+    @property
+    def name(self):
+        """Return the complete file and path name of the file."""
+        return conf.lib.clang_getCString(conf.lib.clang_getFileName(self))
+
+    @property
+    def time(self):
+        """Return the last modification time of the file."""
+        return conf.lib.clang_getFileTime(self)
+
+    def __bytes__(self):
+        return self.name
+
+    def __repr__(self):
+        return "<File: %s>" % (self.name)
+
+    @staticmethod
+    def from_cursor_result(res, fn, args):
+        assert isinstance(res, File)
+
+        # Copy a reference to the TranslationUnit to prevent premature GC.
+        res._tu = args[0]._tu
+        return res
+
+class FileInclusion(object):
+    """
+    The FileInclusion class represents the inclusion of one source file by
+    another via a '#include' directive or as the input file for the translation
+    unit. This class provides information about the included file, the including
+    file, the location of the '#include' directive and the depth of the included
+    file in the stack. Note that the input file has depth 0.
+    """
+
+    def __init__(self, src, tgt, loc, depth):
+        self.source = src
+        self.include = tgt
+        self.location = loc
+        self.depth = depth
+
+    @property
+    def is_input_file(self):
+        """True if the included file is the input file."""
+        return self.depth == 0
+
+class CompilationDatabaseError(Exception):
+    """Represents an error that occurred when working with a CompilationDatabase
+
+    Each error is associated to an enumerated value, accessible under
+    e.cdb_error. Consumers can compare the value with one of the ERROR_
+    constants in this class.
+    """
+
+    # An unknown error occurred
+    ERROR_UNKNOWN = 0
+
+    # The database could not be loaded
+    ERROR_CANNOTLOADDATABASE = 1
+
+    def __init__(self, enumeration, message):
+        assert isinstance(enumeration, int)
+
+        if enumeration > 1:
+            raise Exception("Encountered undefined CompilationDatabase error "
+                            "constant: %d. Please file a bug to have this "
+                            "value supported." % enumeration)
+
+        self.cdb_error = enumeration
+        Exception.__init__(self, 'Error %d: %s' % (enumeration, message))
+
+class CompileCommand(object):
+    """Represents the compile command used to build a file"""
+    def __init__(self, cmd, ccmds):
+        self.cmd = cmd
+        # Keep a reference to the originating CompileCommands
+        # to prevent garbage collection
+        self.ccmds = ccmds
+
+    @property
+    def directory(self):
+        """Get the working directory for this CompileCommand"""
+        return conf.lib.clang_CompileCommand_getDirectory(self.cmd)
+
+    @property
+    def filename(self):
+        """Get the working filename for this CompileCommand"""
+        return conf.lib.clang_CompileCommand_getFilename(self.cmd)
+
+    @property
+    def arguments(self):
+        """
+        Get an iterable object providing each argument in the
+        command line for the compiler invocation as a _CXString.
+
+        Invariant : the first argument is the compiler executable
+        """
+        length = conf.lib.clang_CompileCommand_getNumArgs(self.cmd)
+        for i in range(length):
+            yield conf.lib.clang_CompileCommand_getArg(self.cmd, i)
+
+class CompileCommands(object):
+    """
+    CompileCommands is an iterable object containing all CompileCommand
+    that can be used for building a specific file.
+    """
+    def __init__(self, ccmds):
+        self.ccmds = ccmds
+
+    def __del__(self):
+        conf.lib.clang_CompileCommands_dispose(self.ccmds)
+
+    def __len__(self):
+        return int(conf.lib.clang_CompileCommands_getSize(self.ccmds))
+
+    def __getitem__(self, i):
+        cc = conf.lib.clang_CompileCommands_getCommand(self.ccmds, i)
+        if not cc:
+            raise IndexError
+        return CompileCommand(cc, self)
+
+    @staticmethod
+    def from_result(res, fn, args):
+        if not res:
+            return None
+        return CompileCommands(res)
+
+class CompilationDatabase(ClangObject):
+    """
+    The CompilationDatabase is a wrapper class around
+    clang::tooling::CompilationDatabase
+
+    It enables querying how a specific source file can be built.
+    """
+
+    def __del__(self):
+        conf.lib.clang_CompilationDatabase_dispose(self)
+
+    @staticmethod
+    def from_result(res, fn, args):
+        if not res:
+            raise CompilationDatabaseError(0,
+                                           "CompilationDatabase loading failed")
+        return CompilationDatabase(res)
+
+    @staticmethod
+    def fromDirectory(buildDir):
+        """Builds a CompilationDatabase from the database found in buildDir"""
+        errorCode = c_uint()
+        try:
+            cdb = conf.lib.clang_CompilationDatabase_fromDirectory(buildDir,
+                byref(errorCode))
+        except CompilationDatabaseError as e:
+            raise CompilationDatabaseError(int(errorCode.value),
+                                           "CompilationDatabase loading failed")
+        return cdb
+
+    def getCompileCommands(self, filename):
+        """
+        Get an iterable object providing all the CompileCommands available to
+        build filename. Returns None if filename is not found in the database.
+        """
+        return conf.lib.clang_CompilationDatabase_getCompileCommands(self,
+                                                                     filename)
+
+    def getAllCompileCommands(self):
+        """
+        Get an iterable object providing all the CompileCommands available from
+        the database.
+        """
+        return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self)
+
+
+class Token(Structure):
+    """Represents a single token from the preprocessor.
+
+    Tokens are effectively segments of source code. Source code is first parsed
+    into tokens before being converted into the AST and Cursors.
+
+    Tokens are obtained from parsed TranslationUnit instances. You currently
+    can't create tokens manually.
+    """
+    _fields_ = [
+        ('int_data', c_uint * 4),
+        ('ptr_data', c_void_p)
+    ]
+
+    @property
+    def spelling(self):
+        """The spelling of this token.
+
+        This is the textual representation of the token in source.
+        """
+        return conf.lib.clang_getTokenSpelling(self._tu, self)
+
+    @property
+    def kind(self):
+        """Obtain the TokenKind of the current token."""
+        return TokenKind.from_value(conf.lib.clang_getTokenKind(self))
+
+    @property
+    def location(self):
+        """The SourceLocation this Token occurs at."""
+        return conf.lib.clang_getTokenLocation(self._tu, self)
+
+    @property
+    def extent(self):
+        """The SourceRange this Token occupies."""
+        return conf.lib.clang_getTokenExtent(self._tu, self)
+
+    @property
+    def cursor(self):
+        """The Cursor this Token corresponds to."""
+        cursor = Cursor()
+
+        conf.lib.clang_annotateTokens(self._tu, byref(self), 1, byref(cursor))
+
+        return cursor
+
+# Now comes the plumbing to hook up the C library.
+
+# Register callback types in common container.
+callbacks['translation_unit_includes'] = CFUNCTYPE(None, c_object_p,
+        POINTER(SourceLocation), c_uint, py_object)
+callbacks['cursor_visit'] = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
+callbacks['fields_visit'] = CFUNCTYPE(c_int, Cursor, py_object)
+
+# Functions strictly alphabetical order.
+functionList = [
+  ("clang_annotateTokens",
+   [TranslationUnit, POINTER(Token), c_uint, POINTER(Cursor)]),
+
+  ("clang_CompilationDatabase_dispose",
+   [c_object_p]),
+
+  ("clang_CompilationDatabase_fromDirectory",
+   [c_char_p, POINTER(c_uint)],
+   c_object_p,
+   CompilationDatabase.from_result),
+
+  ("clang_CompilationDatabase_getAllCompileCommands",
+   [c_object_p],
+   c_object_p,
+   CompileCommands.from_result),
+
+  ("clang_CompilationDatabase_getCompileCommands",
+   [c_object_p, c_char_p],
+   c_object_p,
+   CompileCommands.from_result),
+
+  ("clang_CompileCommands_dispose",
+   [c_object_p]),
+
+  ("clang_CompileCommands_getCommand",
+   [c_object_p, c_uint],
+   c_object_p),
+
+  ("clang_CompileCommands_getSize",
+   [c_object_p],
+   c_uint),
+
+  ("clang_CompileCommand_getArg",
+   [c_object_p, c_uint],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_CompileCommand_getDirectory",
+   [c_object_p],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_CompileCommand_getFilename",
+   [c_object_p],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_CompileCommand_getNumArgs",
+   [c_object_p],
+   c_uint),
+
+  ("clang_codeCompleteAt",
+   [TranslationUnit, c_char_p, c_int, c_int, c_void_p, c_int, c_int],
+   POINTER(CCRStructure)),
+
+  ("clang_codeCompleteGetDiagnostic",
+   [CodeCompletionResults, c_int],
+   Diagnostic),
+
+  ("clang_codeCompleteGetNumDiagnostics",
+   [CodeCompletionResults],
+   c_int),
+
+  ("clang_createIndex",
+   [c_int, c_int],
+   c_object_p),
+
+  ("clang_createTranslationUnit",
+   [Index, c_char_p],
+   c_object_p),
+
+  ("clang_CXXConstructor_isConvertingConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isCopyConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isDefaultConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isMoveConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXField_isMutable",
+   [Cursor],
+   bool),
+
+  ("clang_CXXMethod_isConst",
+   [Cursor],
+   bool),
+
+  ("clang_CXXMethod_isDefaulted",
+   [Cursor],
+   bool),
+
+  ("clang_CXXMethod_isPureVirtual",
+   [Cursor],
+   bool),
+
+  ("clang_CXXMethod_isStatic",
+   [Cursor],
+   bool),
+
+  ("clang_CXXMethod_isVirtual",
+   [Cursor],
+   bool),
+
+  ("clang_defaultDiagnosticDisplayOptions",
+   [],
+   c_uint),
+
+  ("clang_defaultSaveOptions",
+   [TranslationUnit],
+   c_uint),
+
+  ("clang_disposeCodeCompleteResults",
+   [CodeCompletionResults]),
+
+# ("clang_disposeCXTUResourceUsage",
+#  [CXTUResourceUsage]),
+
+  ("clang_disposeDiagnostic",
+   [Diagnostic]),
+
+  ("clang_disposeIndex",
+   [Index]),
+
+  ("clang_disposeString",
+   [_CXString]),
+
+  ("clang_disposeTokens",
+   [TranslationUnit, POINTER(Token), c_uint]),
+
+  ("clang_disposeTranslationUnit",
+   [TranslationUnit]),
+
+  ("clang_equalCursors",
+   [Cursor, Cursor],
+   bool),
+
+  ("clang_equalLocations",
+   [SourceLocation, SourceLocation],
+   bool),
+
+  ("clang_equalRanges",
+   [SourceRange, SourceRange],
+   bool),
+
+  ("clang_equalTypes",
+   [Type, Type],
+   bool),
+
+  ("clang_formatDiagnostic",
+   [Diagnostic, c_uint],
+   _CXString),
+
+  ("clang_getArgType",
+   [Type, c_uint],
+   Type,
+   Type.from_result),
+
+  ("clang_getArrayElementType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_getArraySize",
+   [Type],
+   c_longlong),
+
+  ("clang_getFieldDeclBitWidth",
+   [Cursor],
+   c_int),
+
+  ("clang_getCanonicalCursor",
+   [Cursor],
+   Cursor,
+   Cursor.from_cursor_result),
+
+  ("clang_getCanonicalType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_getChildDiagnostics",
+   [Diagnostic],
+   c_object_p),
+
+  ("clang_getCompletionAvailability",
+   [c_void_p],
+   c_int),
+
+  ("clang_getCompletionBriefComment",
+   [c_void_p],
+   _CXString),
+
+  ("clang_getCompletionChunkCompletionString",
+   [c_void_p, c_int],
+   c_object_p),
+
+  ("clang_getCompletionChunkKind",
+   [c_void_p, c_int],
+   c_int),
+
+  ("clang_getCompletionChunkText",
+   [c_void_p, c_int],
+   _CXString),
+
+  ("clang_getCompletionPriority",
+   [c_void_p],
+   c_int),
+
+  ("clang_getCString",
+   [_CXString],
+   c_char_p),
+
+  ("clang_getCursor",
+   [TranslationUnit, SourceLocation],
+   Cursor),
+
+  ("clang_getCursorDefinition",
+   [Cursor],
+   Cursor,
+   Cursor.from_result),
+
+  ("clang_getCursorDisplayName",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getCursorExtent",
+   [Cursor],
+   SourceRange),
+
+  ("clang_getCursorLexicalParent",
+   [Cursor],
+   Cursor,
+   Cursor.from_cursor_result),
+
+  ("clang_getCursorLocation",
+   [Cursor],
+   SourceLocation),
+
+  ("clang_getCursorReferenced",
+   [Cursor],
+   Cursor,
+   Cursor.from_result),
+
+  ("clang_getCursorReferenceNameRange",
+   [Cursor, c_uint, c_uint],
+   SourceRange),
+
+  ("clang_getCursorSemanticParent",
+   [Cursor],
+   Cursor,
+   Cursor.from_cursor_result),
+
+  ("clang_getCursorSpelling",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getCursorType",
+   [Cursor],
+   Type,
+   Type.from_result),
+
+  ("clang_getCursorUSR",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_Cursor_getMangling",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+# ("clang_getCXTUResourceUsage",
+#  [TranslationUnit],
+#  CXTUResourceUsage),
+
+  ("clang_getCXXAccessSpecifier",
+   [Cursor],
+   c_uint),
+
+  ("clang_getDeclObjCTypeEncoding",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getDiagnostic",
+   [c_object_p, c_uint],
+   c_object_p),
+
+  ("clang_getDiagnosticCategory",
+   [Diagnostic],
+   c_uint),
+
+  ("clang_getDiagnosticCategoryText",
+   [Diagnostic],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getDiagnosticFixIt",
+   [Diagnostic, c_uint, POINTER(SourceRange)],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getDiagnosticInSet",
+   [c_object_p, c_uint],
+   c_object_p),
+
+  ("clang_getDiagnosticLocation",
+   [Diagnostic],
+   SourceLocation),
+
+  ("clang_getDiagnosticNumFixIts",
+   [Diagnostic],
+   c_uint),
+
+  ("clang_getDiagnosticNumRanges",
+   [Diagnostic],
+   c_uint),
+
+  ("clang_getDiagnosticOption",
+   [Diagnostic, POINTER(_CXString)],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getDiagnosticRange",
+   [Diagnostic, c_uint],
+   SourceRange),
+
+  ("clang_getDiagnosticSeverity",
+   [Diagnostic],
+   c_int),
+
+  ("clang_getDiagnosticSpelling",
+   [Diagnostic],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getElementType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_getEnumConstantDeclUnsignedValue",
+   [Cursor],
+   c_ulonglong),
+
+  ("clang_getEnumConstantDeclValue",
+   [Cursor],
+   c_longlong),
+
+  ("clang_getEnumDeclIntegerType",
+   [Cursor],
+   Type,
+   Type.from_result),
+
+  ("clang_getFile",
+   [TranslationUnit, c_char_p],
+   c_object_p),
+
+  ("clang_getFileName",
+   [File],
+   _CXString), # TODO go through _CXString.from_result?
+
+  ("clang_getFileTime",
+   [File],
+   c_uint),
+
+  ("clang_getIBOutletCollectionType",
+   [Cursor],
+   Type,
+   Type.from_result),
+
+  ("clang_getIncludedFile",
+   [Cursor],
+   File,
+   File.from_cursor_result),
+
+  ("clang_getInclusions",
+   [TranslationUnit, callbacks['translation_unit_includes'], py_object]),
+
+  ("clang_getInstantiationLocation",
+   [SourceLocation, POINTER(c_object_p), POINTER(c_uint), POINTER(c_uint),
+    POINTER(c_uint)]),
+
+  ("clang_getLocation",
+   [TranslationUnit, File, c_uint, c_uint],
+   SourceLocation),
+
+  ("clang_getLocationForOffset",
+   [TranslationUnit, File, c_uint],
+   SourceLocation),
+
+  ("clang_getNullCursor",
+   None,
+   Cursor),
+
+  ("clang_getNumArgTypes",
+   [Type],
+   c_uint),
+
+  ("clang_getNumCompletionChunks",
+   [c_void_p],
+   c_int),
+
+  ("clang_getNumDiagnostics",
+   [c_object_p],
+   c_uint),
+
+  ("clang_getNumDiagnosticsInSet",
+   [c_object_p],
+   c_uint),
+
+  ("clang_getNumElements",
+   [Type],
+   c_longlong),
+
+  ("clang_getNumOverloadedDecls",
+   [Cursor],
+   c_uint),
+
+  ("clang_getOverloadedDecl",
+   [Cursor, c_uint],
+   Cursor,
+   Cursor.from_cursor_result),
+
+  ("clang_getPointeeType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_getRange",
+   [SourceLocation, SourceLocation],
+   SourceRange),
+
+  ("clang_getRangeEnd",
+   [SourceRange],
+   SourceLocation),
+
+  ("clang_getRangeStart",
+   [SourceRange],
+   SourceLocation),
+
+  ("clang_getResultType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_getSpecializedCursorTemplate",
+   [Cursor],
+   Cursor,
+   Cursor.from_cursor_result),
+
+  ("clang_getTemplateCursorKind",
+   [Cursor],
+   c_uint),
+
+  ("clang_getTokenExtent",
+   [TranslationUnit, Token],
+   SourceRange),
+
+  ("clang_getTokenKind",
+   [Token],
+   c_uint),
+
+  ("clang_getTokenLocation",
+   [TranslationUnit, Token],
+   SourceLocation),
+
+  ("clang_getTokenSpelling",
+   [TranslationUnit, Token],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getTranslationUnitCursor",
+   [TranslationUnit],
+   Cursor,
+   Cursor.from_result),
+
+  ("clang_getTranslationUnitSpelling",
+   [TranslationUnit],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getTUResourceUsageName",
+   [c_uint],
+   c_char_p),
+
+  ("clang_getTypeDeclaration",
+   [Type],
+   Cursor,
+   Cursor.from_result),
+
+  ("clang_getTypedefDeclUnderlyingType",
+   [Cursor],
+   Type,
+   Type.from_result),
+
+  ("clang_getTypeKindSpelling",
+   [c_uint],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_getTypeSpelling",
+   [Type],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_hashCursor",
+   [Cursor],
+   c_uint),
+
+  ("clang_isAttribute",
+   [CursorKind],
+   bool),
+
+  ("clang_isConstQualifiedType",
+   [Type],
+   bool),
+
+  ("clang_isCursorDefinition",
+   [Cursor],
+   bool),
+
+  ("clang_isDeclaration",
+   [CursorKind],
+   bool),
+
+  ("clang_isExpression",
+   [CursorKind],
+   bool),
+
+  ("clang_isFileMultipleIncludeGuarded",
+   [TranslationUnit, File],
+   bool),
+
+  ("clang_isFunctionTypeVariadic",
+   [Type],
+   bool),
+
+  ("clang_isInvalid",
+   [CursorKind],
+   bool),
+
+  ("clang_isPODType",
+   [Type],
+   bool),
+
+  ("clang_isPreprocessing",
+   [CursorKind],
+   bool),
+
+  ("clang_isReference",
+   [CursorKind],
+   bool),
+
+  ("clang_isRestrictQualifiedType",
+   [Type],
+   bool),
+
+  ("clang_isStatement",
+   [CursorKind],
+   bool),
+
+  ("clang_isTranslationUnit",
+   [CursorKind],
+   bool),
+
+  ("clang_isUnexposed",
+   [CursorKind],
+   bool),
+
+  ("clang_isVirtualBase",
+   [Cursor],
+   bool),
+
+  ("clang_isVolatileQualifiedType",
+   [Type],
+   bool),
+
+  ("clang_parseTranslationUnit",
+   [Index, c_char_p, c_void_p, c_int, c_void_p, c_int, c_int],
+   c_object_p),
+
+  ("clang_reparseTranslationUnit",
+   [TranslationUnit, c_int, c_void_p, c_int],
+   c_int),
+
+  ("clang_saveTranslationUnit",
+   [TranslationUnit, c_char_p, c_uint],
+   c_int),
+
+  ("clang_tokenize",
+   [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)]),
+
+  ("clang_visitChildren",
+   [Cursor, callbacks['cursor_visit'], py_object],
+   c_uint),
+
+  ("clang_Cursor_getNumArguments",
+   [Cursor],
+   c_int),
+
+  ("clang_Cursor_getArgument",
+   [Cursor, c_uint],
+   Cursor,
+   Cursor.from_result),
+
+  ("clang_Cursor_getNumTemplateArguments",
+   [Cursor],
+   c_int),
+
+  ("clang_Cursor_getTemplateArgumentKind",
+   [Cursor, c_uint],
+   TemplateArgumentKind.from_id),
+
+  ("clang_Cursor_getTemplateArgumentType",
+   [Cursor, c_uint],
+   Type,
+   Type.from_result),
+
+  ("clang_Cursor_getTemplateArgumentValue",
+   [Cursor, c_uint],
+   c_longlong),
+
+  ("clang_Cursor_getTemplateArgumentUnsignedValue",
+   [Cursor, c_uint],
+   c_ulonglong),
+
+  ("clang_Cursor_isAnonymous",
+   [Cursor],
+   bool),
+
+  ("clang_Cursor_isBitField",
+   [Cursor],
+   bool),
+
+  ("clang_Cursor_getBriefCommentText",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_Cursor_getRawCommentText",
+   [Cursor],
+   _CXString,
+   _CXString.from_result),
+
+  ("clang_Cursor_getOffsetOfField",
+   [Cursor],
+   c_longlong),
+
+  ("clang_Type_getAlignOf",
+   [Type],
+   c_longlong),
+
+  ("clang_Type_getClassType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_Type_getOffsetOf",
+   [Type, c_char_p],
+   c_longlong),
+
+  ("clang_Type_getSizeOf",
+   [Type],
+   c_longlong),
+
+  ("clang_Type_getCXXRefQualifier",
+   [Type],
+   c_uint),
+
+  ("clang_Type_getNamedType",
+   [Type],
+   Type,
+   Type.from_result),
+
+  ("clang_Type_visitFields",
+   [Type, callbacks['fields_visit'], py_object],
+   c_uint),
+]
+
+class LibclangError(Exception):
+    def __init__(self, message):
+        self.m = message
+
+    def __str__(self):
+        return self.m
+
+def register_function(lib, item, ignore_errors):
+    # A function may not exist, if these bindings are used with an older or
+    # incompatible version of libclang.so.
+    try:
+        func = getattr(lib, item[0])
+    except AttributeError as e:
+        msg = str(e) + ". Please ensure that your python bindings are "\
+                       "compatible with your libclang.so version."
+        if ignore_errors:
+            return
+        raise LibclangError(msg)
+
+    if len(item) >= 2:
+        func.argtypes = item[1]
+
+    if len(item) >= 3:
+        func.restype = item[2]
+
+    if len(item) == 4:
+        func.errcheck = item[3]
+
+def register_functions(lib, ignore_errors):
+    """Register function prototypes with a libclang library instance.
+
+    This must be called as part of library instantiation so Python knows how
+    to call out to the shared library.
+    """
+
+    def register(item):
+        return register_function(lib, item, ignore_errors)
+
+    for f in functionList:
+        register(f)
+
+class Config:
+    library_path = None
+    library_file = None
+    compatibility_check = False
+    loaded = False
+
+    @staticmethod
+    def set_library_path(path):
+        """Set the path in which to search for libclang"""
+        if Config.loaded:
+            raise Exception("library path must be set before before using " \
+                            "any other functionalities in libclang.")
+
+        Config.library_path = path
+
+    @staticmethod
+    def set_library_file(filename):
+        """Set the exact location of libclang"""
+        if Config.loaded:
+            raise Exception("library file must be set before before using " \
+                            "any other functionalities in libclang.")
+
+        Config.library_file = filename
+
+    @staticmethod
+    def set_compatibility_check(check_status):
+        """ Perform compatibility check when loading libclang
+
+        The python bindings are only tested and evaluated with the version of
+        libclang they are provided with. To ensure correct behavior a (limited)
+        compatibility check is performed when loading the bindings. This check
+        will throw an exception, as soon as it fails.
+
+        In case these bindings are used with an older version of libclang, parts
+        that have been stable between releases may still work. Users of the
+        python bindings can disable the compatibility check. This will cause
+        the python bindings to load, even though they are written for a newer
+        version of libclang. Failures now arise if unsupported or incompatible
+        features are accessed. The user is required to test themselves if the
+        features they are using are available and compatible between different
+        libclang versions.
+        """
+        if Config.loaded:
+            raise Exception("compatibility_check must be set before before " \
+                            "using any other functionalities in libclang.")
+
+        Config.compatibility_check = check_status
+
+    @CachedProperty
+    def lib(self):
+        lib = self.get_cindex_library()
+        register_functions(lib, not Config.compatibility_check)
+        Config.loaded = True
+        return lib
+
+    def get_filename(self):
+        if Config.library_file:
+            return Config.library_file
+
+        import platform
+        name = platform.system()
+
+        if name == 'Darwin':
+            file = 'libclang.dylib'
+        elif name == 'Windows':
+            file = 'libclang.dll'
+        else:
+            file = 'libclang.so'
+
+        if Config.library_path:
+            file = Config.library_path + '/' + file
+
+        return file
+
+    def get_cindex_library(self):
+        try:
+            library = cdll.LoadLibrary(self.get_filename())
+        except OSError as e:
+            msg = str(e) + ". To provide a path to libclang use " \
+                           "Config.set_library_path() or " \
+                           "Config.set_library_file()."
+            raise LibclangError(msg)
+
+        return library
+
+    def function_exists(self, name):
+        try:
+            getattr(self.lib, name)
+        except AttributeError:
+            return False
+
+        return True
+
+def register_enumerations():
+    for name, value in clang.enumerations.TokenKinds:
+        TokenKind.register(value, name)
+
+conf = Config()
+register_enumerations()
+
+__all__ = [
+    'Config',
+    'CodeCompletionResults',
+    'CompilationDatabase',
+    'CompileCommands',
+    'CompileCommand',
+    'CursorKind',
+    'Cursor',
+    'Diagnostic',
+    'File',
+    'FixIt',
+    'Index',
+    'SourceLocation',
+    'SourceRange',
+    'TokenKind',
+    'Token',
+    'TranslationUnitLoadError',
+    'TranslationUnit',
+    'TypeKind',
+    'Type',
+]
diff --git a/pybind11/tools/clang/enumerations.py b/pybind11/tools/clang/enumerations.py
new file mode 100644
index 0000000000000000000000000000000000000000..a86a48ade3bd7ad00e455bebb3b94ecf25ddf8e4
--- /dev/null
+++ b/pybind11/tools/clang/enumerations.py
@@ -0,0 +1,34 @@
+#===- enumerations.py - Python Enumerations ------------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+"""
+Clang Enumerations
+==================
+
+This module provides static definitions of enumerations that exist in libclang.
+
+Enumerations are typically defined as a list of tuples. The exported values are
+typically munged into other types or classes at module load time.
+
+All enumerations are centrally defined in this file so they are all grouped
+together and easier to audit. And, maybe even one day this file will be
+automatically generated by scanning the libclang headers!
+"""
+
+# Maps to CXTokenKind. Note that libclang maintains a separate set of token
+# enumerations from the C++ API.
+TokenKinds = [
+    ('PUNCTUATION', 0),
+    ('KEYWORD', 1),
+    ('IDENTIFIER', 2),
+    ('LITERAL', 3),
+    ('COMMENT', 4),
+]
+
+__all__ = ['TokenKinds']
diff --git a/pybind11/tools/cmake_uninstall.cmake.in b/pybind11/tools/cmake_uninstall.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..1e5d2bb876d17f24b88b19ddb525d4146869944b
--- /dev/null
+++ b/pybind11/tools/cmake_uninstall.cmake.in
@@ -0,0 +1,23 @@
+# Source: https://gitlab.kitware.com/cmake/community/-/wikis/FAQ#can-i-do-make-uninstall-with-cmake
+
+if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt")
+endif()
+
+file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files)
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    exec_program(
+      "@CMAKE_COMMAND@" ARGS
+      "-E remove \"$ENV{DESTDIR}${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval)
+    if(NOT "${rm_retval}" STREQUAL 0)
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif()
+  else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif()
+endforeach()
diff --git a/pybind11/tools/libsize.py b/pybind11/tools/libsize.py
new file mode 100644
index 0000000000000000000000000000000000000000..50f88bdb3dc57ac6df11a249d789ac5018a7fc86
--- /dev/null
+++ b/pybind11/tools/libsize.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function, division
+import os
+import sys
+
+# Internal build script for generating debugging test .so size.
+# Usage:
+#     python libsize.py file.so save.txt -- displays the size of file.so and, if save.txt exists, compares it to the
+#                                           size in it, then overwrites save.txt with the new size for future runs.
+
+if len(sys.argv) != 3:
+    sys.exit("Invalid arguments: usage: python libsize.py file.so save.txt")
+
+lib = sys.argv[1]
+save = sys.argv[2]
+
+if not os.path.exists(lib):
+    sys.exit("Error: requested file ({}) does not exist".format(lib))
+
+libsize = os.path.getsize(lib)
+
+print("------", os.path.basename(lib), "file size:", libsize, end='')
+
+if os.path.exists(save):
+    with open(save) as sf:
+        oldsize = int(sf.readline())
+
+    if oldsize > 0:
+        change = libsize - oldsize
+        if change == 0:
+            print(" (no change)")
+        else:
+            print(" (change of {:+} bytes = {:+.2%})".format(change, change / oldsize))
+else:
+    print()
+
+with open(save, 'w') as sf:
+    sf.write(str(libsize))
diff --git a/pybind11/tools/mkdoc.py b/pybind11/tools/mkdoc.py
new file mode 100755
index 0000000000000000000000000000000000000000..a22aacdefd0171078874bd77bf0175229646656f
--- /dev/null
+++ b/pybind11/tools/mkdoc.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+#  Syntax: mkdoc.py [-I<path> ..] [.. a list of header files ..]
+#
+#  Extract documentation from C++ header files to use it in Python bindings
+#
+
+import os
+import sys
+import platform
+import re
+import textwrap
+
+from clang import cindex
+from clang.cindex import CursorKind
+from collections import OrderedDict
+from glob import glob
+from threading import Thread, Semaphore
+from multiprocessing import cpu_count
+
+RECURSE_LIST = [
+    CursorKind.TRANSLATION_UNIT,
+    CursorKind.NAMESPACE,
+    CursorKind.CLASS_DECL,
+    CursorKind.STRUCT_DECL,
+    CursorKind.ENUM_DECL,
+    CursorKind.CLASS_TEMPLATE
+]
+
+PRINT_LIST = [
+    CursorKind.CLASS_DECL,
+    CursorKind.STRUCT_DECL,
+    CursorKind.ENUM_DECL,
+    CursorKind.ENUM_CONSTANT_DECL,
+    CursorKind.CLASS_TEMPLATE,
+    CursorKind.FUNCTION_DECL,
+    CursorKind.FUNCTION_TEMPLATE,
+    CursorKind.CONVERSION_FUNCTION,
+    CursorKind.CXX_METHOD,
+    CursorKind.CONSTRUCTOR,
+    CursorKind.FIELD_DECL
+]
+
+PREFIX_BLACKLIST = [
+    CursorKind.TRANSLATION_UNIT
+]
+
+CPP_OPERATORS = {
+    '<=': 'le', '>=': 'ge', '==': 'eq', '!=': 'ne', '[]': 'array',
+    '+=': 'iadd', '-=': 'isub', '*=': 'imul', '/=': 'idiv', '%=':
+    'imod', '&=': 'iand', '|=': 'ior', '^=': 'ixor', '<<=': 'ilshift',
+    '>>=': 'irshift', '++': 'inc', '--': 'dec', '<<': 'lshift', '>>':
+    'rshift', '&&': 'land', '||': 'lor', '!': 'lnot', '~': 'bnot',
+    '&': 'band', '|': 'bor', '+': 'add', '-': 'sub', '*': 'mul', '/':
+    'div', '%': 'mod', '<': 'lt', '>': 'gt', '=': 'assign', '()': 'call'
+}
+
+CPP_OPERATORS = OrderedDict(
+    sorted(CPP_OPERATORS.items(), key=lambda t: -len(t[0])))
+
+job_count = cpu_count()
+job_semaphore = Semaphore(job_count)
+
+
+class NoFilenamesError(ValueError):
+    pass
+
+
+def d(s):
+    return s if isinstance(s, str) else s.decode('utf8')
+
+
+def sanitize_name(name):
+    name = re.sub(r'type-parameter-0-([0-9]+)', r'T\1', name)
+    for k, v in CPP_OPERATORS.items():
+        name = name.replace('operator%s' % k, 'operator_%s' % v)
+    name = re.sub('<.*>', '', name)
+    name = ''.join([ch if ch.isalnum() else '_' for ch in name])
+    name = re.sub('_$', '', re.sub('_+', '_', name))
+    return '__doc_' + name
+
+
+def process_comment(comment):
+    result = ''
+
+    # Remove C++ comment syntax
+    leading_spaces = float('inf')
+    for s in comment.expandtabs(tabsize=4).splitlines():
+        s = s.strip()
+        if s.startswith('/*'):
+            s = s[2:].lstrip('*')
+        elif s.endswith('*/'):
+            s = s[:-2].rstrip('*')
+        elif s.startswith('///'):
+            s = s[3:]
+        if s.startswith('*'):
+            s = s[1:]
+        if len(s) > 0:
+            leading_spaces = min(leading_spaces, len(s) - len(s.lstrip()))
+        result += s + '\n'
+
+    if leading_spaces != float('inf'):
+        result2 = ""
+        for s in result.splitlines():
+            result2 += s[leading_spaces:] + '\n'
+        result = result2
+
+    # Doxygen tags
+    cpp_group = r'([\w:]+)'
+    param_group = r'([\[\w:\]]+)'
+
+    s = result
+    s = re.sub(r'\\c\s+%s' % cpp_group, r'``\1``', s)
+    s = re.sub(r'\\a\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\e\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\em\s+%s' % cpp_group, r'*\1*', s)
+    s = re.sub(r'\\b\s+%s' % cpp_group, r'**\1**', s)
+    s = re.sub(r'\\ingroup\s+%s' % cpp_group, r'', s)
+    s = re.sub(r'\\param%s?\s+%s' % (param_group, cpp_group),
+               r'\n\n$Parameter ``\2``:\n\n', s)
+    s = re.sub(r'\\tparam%s?\s+%s' % (param_group, cpp_group),
+               r'\n\n$Template parameter ``\2``:\n\n', s)
+
+    for in_, out_ in {
+        'return': 'Returns',
+        'author': 'Author',
+        'authors': 'Authors',
+        'copyright': 'Copyright',
+        'date': 'Date',
+        'remark': 'Remark',
+        'sa': 'See also',
+        'see': 'See also',
+        'extends': 'Extends',
+        'throw': 'Throws',
+        'throws': 'Throws'
+    }.items():
+        s = re.sub(r'\\%s\s*' % in_, r'\n\n$%s:\n\n' % out_, s)
+
+    s = re.sub(r'\\details\s*', r'\n\n', s)
+    s = re.sub(r'\\brief\s*', r'', s)
+    s = re.sub(r'\\short\s*', r'', s)
+    s = re.sub(r'\\ref\s*', r'', s)
+
+    s = re.sub(r'\\code\s?(.*?)\s?\\endcode',
+               r"```\n\1\n```\n", s, flags=re.DOTALL)
+
+    # HTML/TeX tags
+    s = re.sub(r'<tt>(.*?)</tt>', r'``\1``', s, flags=re.DOTALL)
+    s = re.sub(r'<pre>(.*?)</pre>', r"```\n\1\n```\n", s, flags=re.DOTALL)
+    s = re.sub(r'<em>(.*?)</em>', r'*\1*', s, flags=re.DOTALL)
+    s = re.sub(r'<b>(.*?)</b>', r'**\1**', s, flags=re.DOTALL)
+    s = re.sub(r'\\f\$(.*?)\\f\$', r'$\1$', s, flags=re.DOTALL)
+    s = re.sub(r'<li>', r'\n\n* ', s)
+    s = re.sub(r'</?ul>', r'', s)
+    s = re.sub(r'</li>', r'\n\n', s)
+
+    s = s.replace('``true``', '``True``')
+    s = s.replace('``false``', '``False``')
+
+    # Re-flow text
+    wrapper = textwrap.TextWrapper()
+    wrapper.expand_tabs = True
+    wrapper.replace_whitespace = True
+    wrapper.drop_whitespace = True
+    wrapper.width = 70
+    wrapper.initial_indent = wrapper.subsequent_indent = ''
+
+    result = ''
+    in_code_segment = False
+    for x in re.split(r'(```)', s):
+        if x == '```':
+            if not in_code_segment:
+                result += '```\n'
+            else:
+                result += '\n```\n\n'
+            in_code_segment = not in_code_segment
+        elif in_code_segment:
+            result += x.strip()
+        else:
+            for y in re.split(r'(?: *\n *){2,}', x):
+                wrapped = wrapper.fill(re.sub(r'\s+', ' ', y).strip())
+                if len(wrapped) > 0 and wrapped[0] == '$':
+                    result += wrapped[1:] + '\n'
+                    wrapper.initial_indent = \
+                        wrapper.subsequent_indent = ' ' * 4
+                else:
+                    if len(wrapped) > 0:
+                        result += wrapped + '\n\n'
+                    wrapper.initial_indent = wrapper.subsequent_indent = ''
+    return result.rstrip().lstrip('\n')
+
+
+def extract(filename, node, prefix, output):
+    if not (node.location.file is None or
+            os.path.samefile(d(node.location.file.name), filename)):
+        return 0
+    if node.kind in RECURSE_LIST:
+        sub_prefix = prefix
+        if node.kind not in PREFIX_BLACKLIST:
+            if len(sub_prefix) > 0:
+                sub_prefix += '_'
+            sub_prefix += d(node.spelling)
+        for i in node.get_children():
+            extract(filename, i, sub_prefix, output)
+    if node.kind in PRINT_LIST:
+        comment = d(node.raw_comment) if node.raw_comment is not None else ''
+        comment = process_comment(comment)
+        sub_prefix = prefix
+        if len(sub_prefix) > 0:
+            sub_prefix += '_'
+        if len(node.spelling) > 0:
+            name = sanitize_name(sub_prefix + d(node.spelling))
+            output.append((name, filename, comment))
+
+
+class ExtractionThread(Thread):
+    def __init__(self, filename, parameters, output):
+        Thread.__init__(self)
+        self.filename = filename
+        self.parameters = parameters
+        self.output = output
+        job_semaphore.acquire()
+
+    def run(self):
+        print('Processing "%s" ..' % self.filename, file=sys.stderr)
+        try:
+            index = cindex.Index(
+                cindex.conf.lib.clang_createIndex(False, True))
+            tu = index.parse(self.filename, self.parameters)
+            extract(self.filename, tu.cursor, '', self.output)
+        finally:
+            job_semaphore.release()
+
+
+def read_args(args):
+    parameters = []
+    filenames = []
+    if "-x" not in args:
+        parameters.extend(['-x', 'c++'])
+    if not any(it.startswith("-std=") for it in args):
+        parameters.append('-std=c++11')
+
+    if platform.system() == 'Darwin':
+        dev_path = '/Applications/Xcode.app/Contents/Developer/'
+        lib_dir = dev_path + 'Toolchains/XcodeDefault.xctoolchain/usr/lib/'
+        sdk_dir = dev_path + 'Platforms/MacOSX.platform/Developer/SDKs'
+        libclang = lib_dir + 'libclang.dylib'
+
+        if os.path.exists(libclang):
+            cindex.Config.set_library_path(os.path.dirname(libclang))
+
+        if os.path.exists(sdk_dir):
+            sysroot_dir = os.path.join(sdk_dir, next(os.walk(sdk_dir))[1][0])
+            parameters.append('-isysroot')
+            parameters.append(sysroot_dir)
+    elif platform.system() == 'Linux':
+        # cython.util.find_library does not find `libclang` for all clang
+        # versions and distributions. LLVM switched to a monolithical setup
+        # that includes everything under /usr/lib/llvm{version_number}/
+        # We therefore glob for the library and select the highest version
+        library_file = sorted(glob("/usr/lib/llvm-*/lib/libclang.so"), reverse=True)[0]
+        cindex.Config.set_library_file(library_file)
+
+        # clang doesn't find its own base includes by default on Linux,
+        # but different distros install them in different paths.
+        # Try to autodetect, preferring the highest numbered version.
+        def clang_folder_version(d):
+            return [int(ver) for ver in re.findall(r'(?<!lib)(?<!\d)\d+', d)]
+        clang_include_dir = max((
+            path
+            for libdir in ['lib64', 'lib', 'lib32']
+            for path in glob('/usr/%s/clang/*/include' % libdir)
+            if os.path.isdir(path)
+        ), default=None, key=clang_folder_version)
+        if clang_include_dir:
+            parameters.extend(['-isystem', clang_include_dir])
+
+    for item in args:
+        if item.startswith('-'):
+            parameters.append(item)
+        else:
+            filenames.append(item)
+
+    if len(filenames) == 0:
+        raise NoFilenamesError("args parameter did not contain any filenames")
+
+    return parameters, filenames
+
+
+def extract_all(args):
+    parameters, filenames = read_args(args)
+    output = []
+    for filename in filenames:
+        thr = ExtractionThread(filename, parameters, output)
+        thr.start()
+
+    print('Waiting for jobs to finish ..', file=sys.stderr)
+    for i in range(job_count):
+        job_semaphore.acquire()
+
+    return output
+
+
+def write_header(comments, out_file=sys.stdout):
+    print('''/*
+  This file contains docstrings for the Python bindings.
+  Do not edit! These were automatically extracted by mkdoc.py
+ */
+
+#define __EXPAND(x)                                      x
+#define __COUNT(_1, _2, _3, _4, _5, _6, _7, COUNT, ...)  COUNT
+#define __VA_SIZE(...)                                   __EXPAND(__COUNT(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1))
+#define __CAT1(a, b)                                     a ## b
+#define __CAT2(a, b)                                     __CAT1(a, b)
+#define __DOC1(n1)                                       __doc_##n1
+#define __DOC2(n1, n2)                                   __doc_##n1##_##n2
+#define __DOC3(n1, n2, n3)                               __doc_##n1##_##n2##_##n3
+#define __DOC4(n1, n2, n3, n4)                           __doc_##n1##_##n2##_##n3##_##n4
+#define __DOC5(n1, n2, n3, n4, n5)                       __doc_##n1##_##n2##_##n3##_##n4##_##n5
+#define __DOC6(n1, n2, n3, n4, n5, n6)                   __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6
+#define __DOC7(n1, n2, n3, n4, n5, n6, n7)               __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6##_##n7
+#define DOC(...)                                         __EXPAND(__EXPAND(__CAT2(__DOC, __VA_SIZE(__VA_ARGS__)))(__VA_ARGS__))
+
+#if defined(__GNUG__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+''', file=out_file)
+
+
+    name_ctr = 1
+    name_prev = None
+    for name, _, comment in list(sorted(comments, key=lambda x: (x[0], x[1]))):
+        if name == name_prev:
+            name_ctr += 1
+            name = name + "_%i" % name_ctr
+        else:
+            name_prev = name
+            name_ctr = 1
+        print('\nstatic const char *%s =%sR"doc(%s)doc";' %
+              (name, '\n' if '\n' in comment else ' ', comment), file=out_file)
+
+    print('''
+#if defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+''', file=out_file)
+
+
+def mkdoc(args):
+    args = list(args)
+    out_path = None
+    for idx, arg in enumerate(args):
+        if arg.startswith("-o"):
+            args.remove(arg)
+            try:
+                out_path = arg[2:] or args.pop(idx)
+            except IndexError:
+                print("-o flag requires an argument")
+                exit(-1)
+            break
+
+    comments = extract_all(args)
+
+    if out_path:
+        try:
+            with open(out_path, 'w') as out_file:
+                write_header(comments, out_file)
+        except:
+            # In the event of an error, don't leave a partially-written
+            # output file.
+            try:
+                os.unlink(out_path)
+            except:
+                pass
+            raise
+    else:
+        write_header(comments)
+
+
+if __name__ == '__main__':
+    try:
+        mkdoc(sys.argv[1:])
+    except NoFilenamesError:
+        print('Syntax: %s [.. a list of header files ..]' % sys.argv[0])
+        exit(-1)
diff --git a/pybind11/tools/pybind11Common.cmake b/pybind11/tools/pybind11Common.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8f7f57b5171e12b55a7752d19d7cabdaf9085961
--- /dev/null
+++ b/pybind11/tools/pybind11Common.cmake
@@ -0,0 +1,296 @@
+#[======================================================[.rst
+
+Adds the following targets::
+
+    pybind11::pybind11 - link to headers and pybind11
+    pybind11::module - Adds module links
+    pybind11::embed - Adds embed links
+    pybind11::lto - Link time optimizations (manual selection)
+    pybind11::thin_lto - Link time optimizations (manual selection)
+    pybind11::python_link_helper - Adds link to Python libraries
+    pybind11::python2_no_register - Avoid warning/error with Python 2 + C++14/7
+    pybind11::windows_extras - MSVC bigobj and mp for building multithreaded
+
+Adds the following functions::
+
+    pybind11_strip(target) - strip target after building on linux/macOS
+
+
+#]======================================================]
+
+# CMake 3.10 has an include_guard command, but we can't use that yet
+if(TARGET pybind11::lto)
+  return()
+endif()
+
+# If we are in subdirectory mode, all IMPORTED targets must be GLOBAL. If we
+# are in CONFIG mode, they should be "normal" targets instead.
+# In CMake 3.11+ you can promote a target to global after you create it,
+# which might be simpler than this check.
+get_property(
+  is_config
+  TARGET pybind11::headers
+  PROPERTY IMPORTED)
+if(NOT is_config)
+  set(optional_global GLOBAL)
+endif()
+
+# --------------------- Shared targets ----------------------------
+
+# Build an interface library target:
+add_library(pybind11::pybind11 IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::pybind11
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::headers)
+
+# Build a module target:
+add_library(pybind11::module IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::module
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11)
+
+# Build an embed library target:
+add_library(pybind11::embed IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::embed
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11)
+
+# ----------------------- no register ----------------------
+
+# Workaround for Python 2.7 and C++17 (C++14 as a warning) incompatibility
+# This adds the flags -Wno-register and -Wno-deprecated-register if the compiler
+# is Clang 3.9+ or AppleClang and the compile language is CXX, or /wd5033 for MSVC (all languages,
+# since MSVC didn't recognize COMPILE_LANGUAGE until CMake 3.11+).
+
+add_library(pybind11::python2_no_register INTERFACE IMPORTED ${optional_global})
+set(clang_4plus
+    "$<AND:$<CXX_COMPILER_ID:Clang>,$<NOT:$<VERSION_LESS:$<CXX_COMPILER_VERSION>,3.9>>>")
+set(no_register "$<OR:${clang_4plus},$<CXX_COMPILER_ID:AppleClang>>")
+
+if(MSVC AND CMAKE_VERSION VERSION_LESS 3.11)
+  set(cxx_no_register "${no_register}")
+else()
+  set(cxx_no_register "$<AND:$<COMPILE_LANGUAGE:CXX>,${no_register}>")
+endif()
+
+set(msvc "$<CXX_COMPILER_ID:MSVC>")
+
+set_property(
+  TARGET pybind11::python2_no_register
+  PROPERTY INTERFACE_COMPILE_OPTIONS
+           "$<${cxx_no_register}:-Wno-register;-Wno-deprecated-register>" "$<${msvc}:/wd5033>")
+
+# --------------------------- link helper ---------------------------
+
+add_library(pybind11::python_link_helper IMPORTED INTERFACE ${optional_global})
+
+if(CMAKE_VERSION VERSION_LESS 3.13)
+  # In CMake 3.11+, you can set INTERFACE properties via the normal methods, and
+  # this would be simpler.
+  set_property(
+    TARGET pybind11::python_link_helper
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES "$<$<PLATFORM_ID:Darwin>:-undefined dynamic_lookup>")
+else()
+  # link_options was added in 3.13+
+  # This is safer, because you are ensured the deduplication pass in CMake will not consider
+  # these separate and remove one but not the other.
+  set_property(
+    TARGET pybind11::python_link_helper
+    APPEND
+    PROPERTY INTERFACE_LINK_OPTIONS "$<$<PLATFORM_ID:Darwin>:LINKER:-undefined,dynamic_lookup>")
+endif()
+
+# ------------------------ Windows extras -------------------------
+
+add_library(pybind11::windows_extras IMPORTED INTERFACE ${optional_global})
+
+if(MSVC)
+  # /MP enables multithreaded builds (relevant when there are many files), /bigobj is
+  # needed for bigger binding projects due to the limit to 64k addressable sections
+  set_property(
+    TARGET pybind11::windows_extras
+    APPEND
+    PROPERTY INTERFACE_COMPILE_OPTIONS /bigobj)
+
+  if(CMAKE_VERSION VERSION_LESS 3.11)
+    set_property(
+      TARGET pybind11::windows_extras
+      APPEND
+      PROPERTY INTERFACE_COMPILE_OPTIONS $<$<NOT:$<CONFIG:Debug>>:/MP>)
+  else()
+    # Only set these options for C++ files.  This is important so that, for
+    # instance, projects that include other types of source files like CUDA
+    # .cu files don't get these options propagated to nvcc since that would
+    # cause the build to fail.
+    set_property(
+      TARGET pybind11::windows_extras
+      APPEND
+      PROPERTY INTERFACE_COMPILE_OPTIONS $<$<NOT:$<CONFIG:Debug>>:$<$<COMPILE_LANGUAGE:CXX>:/MP>>)
+  endif()
+endif()
+
+# ----------------------- Legacy option --------------------------
+
+# Warn or error if old variable name used
+if(PYBIND11_CPP_STANDARD)
+  string(REGEX MATCH [[..$]] VAL "${PYBIND11_CPP_STANDARD}")
+  if(CMAKE_CXX_STANDARD)
+    if(NOT CMAKE_CXX_STANDARD STREQUAL VAL)
+      message(WARNING "CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} does not match "
+                      "PYBIND11_CPP_STANDARD=${PYBIND11_CPP_STANDARD}, "
+                      "please remove PYBIND11_CPP_STANDARD from your cache")
+    endif()
+  else()
+    set(supported_standards 11 14 17 20)
+    if("${VAL}" IN_LIST supported_standards)
+      message(WARNING "USE -DCMAKE_CXX_STANDARD=${VAL} instead of PYBIND11_CPP_STANDARD")
+      set(CMAKE_CXX_STANDARD
+          ${VAL}
+          CACHE STRING "From PYBIND11_CPP_STANDARD")
+    else()
+      message(FATAL_ERROR "PYBIND11_CPP_STANDARD should be replaced with CMAKE_CXX_STANDARD "
+                          "(last two chars: ${VAL} not understood as a valid CXX std)")
+    endif()
+  endif()
+endif()
+
+# --------------------- Python specifics -------------------------
+
+# Check to see which Python mode we are in, new, old, or no python
+if(PYBIND11_NOPYTHON)
+  set(_pybind11_nopython ON)
+elseif(
+  PYBIND11_FINDPYTHON
+  OR Python_FOUND
+  OR Python2_FOUND
+  OR Python3_FOUND)
+  # New mode
+  include("${CMAKE_CURRENT_LIST_DIR}/pybind11NewTools.cmake")
+
+else()
+
+  # Classic mode
+  include("${CMAKE_CURRENT_LIST_DIR}/pybind11Tools.cmake")
+
+endif()
+
+# --------------------- LTO -------------------------------
+
+include(CheckCXXCompilerFlag)
+
+# Checks whether the given CXX/linker flags can compile and link a cxx file.
+# cxxflags and linkerflags are lists of flags to use.  The result variable is a
+# unique variable name for each set of flags: the compilation result will be
+# cached base on the result variable.  If the flags work, sets them in
+# cxxflags_out/linkerflags_out internal cache variables (in addition to
+# ${result}).
+function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out
+         linkerflags_out)
+  set(CMAKE_REQUIRED_LIBRARIES ${linkerflags})
+  check_cxx_compiler_flag("${cxxflags}" ${result})
+  if(${result})
+    set(${cxxflags_out}
+        "${cxxflags}"
+        PARENT_SCOPE)
+    set(${linkerflags_out}
+        "${linkerflags}"
+        PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_pybind11_generate_lto target prefer_thin_lto)
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    set(cxx_append "")
+    set(linker_append "")
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
+      # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
+      set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+      set(cxx_append ";-fno-fat-lto-objects")
+    endif()
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND prefer_thin_lto)
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_FLTO_THIN "-flto=thin${cxx_append}" "-flto=thin${linker_append}"
+        PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+    endif()
+
+    if(NOT HAS_FLTO_THIN)
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_FLTO "-flto${cxx_append}" "-flto${linker_append}" PYBIND11_LTO_CXX_FLAGS
+        PYBIND11_LTO_LINKER_FLAGS)
+    endif()
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+    # Intel equivalent to LTO is called IPO
+    _pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO "-ipo" "-ipo"
+                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+  elseif(MSVC)
+    # cmake only interprets libraries as linker flags when they start with a - (otherwise it
+    # converts /LTCG to \LTCG as if it was a Windows path).  Luckily MSVC supports passing flags
+    # with - instead of /, even if it is a bit non-standard:
+    _pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG "/GL" "-LTCG"
+                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+  endif()
+
+  # Enable LTO flags if found, except for Debug builds
+  if(PYBIND11_LTO_CXX_FLAGS)
+    set(not_debug "$<NOT:$<CONFIG:Debug>>")
+    set(cxx_lang "$<COMPILE_LANGUAGE:CXX>")
+    if(MSVC AND CMAKE_VERSION VERSION_LESS 3.11)
+      set(genex "${not_debug}")
+    else()
+      set(genex "$<AND:${not_debug},${cxx_lang}>")
+    endif()
+    set_property(
+      TARGET ${target}
+      APPEND
+      PROPERTY INTERFACE_COMPILE_OPTIONS "$<${genex}:${PYBIND11_LTO_CXX_FLAGS}>")
+    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
+      message(STATUS "${target} enabled")
+    endif()
+  else()
+    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
+      message(STATUS "${target} disabled (not supported by the compiler and/or linker)")
+    endif()
+  endif()
+
+  if(PYBIND11_LTO_LINKER_FLAGS)
+    if(CMAKE_VERSION VERSION_LESS 3.11)
+      set_property(
+        TARGET ${target}
+        APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
+    else()
+      set_property(
+        TARGET ${target}
+        APPEND
+        PROPERTY INTERFACE_LINK_OPTIONS "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
+    endif()
+  endif()
+endfunction()
+
+add_library(pybind11::lto IMPORTED INTERFACE ${optional_global})
+_pybind11_generate_lto(pybind11::lto FALSE)
+
+add_library(pybind11::thin_lto IMPORTED INTERFACE ${optional_global})
+_pybind11_generate_lto(pybind11::thin_lto TRUE)
+
+# ---------------------- pybind11_strip -----------------------------
+
+function(pybind11_strip target_name)
+  # Strip unnecessary sections of the binary on Linux/Mac OS
+  if(CMAKE_STRIP)
+    if(APPLE)
+      set(x_opt -x)
+    endif()
+
+    add_custom_command(
+      TARGET ${target_name}
+      POST_BUILD
+      COMMAND ${CMAKE_STRIP} ${x_opt} $<TARGET_FILE:${target_name}>)
+  endif()
+endfunction()
diff --git a/pybind11/tools/pybind11Config.cmake.in b/pybind11/tools/pybind11Config.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..3f11172963fadd973a7124778f7ff501fa580a9a
--- /dev/null
+++ b/pybind11/tools/pybind11Config.cmake.in
@@ -0,0 +1,145 @@
+#[=============================================================================[.rst
+
+pybind11Config.cmake
+--------------------
+
+PYBIND11 cmake module.
+This module sets the following variables in your project::
+
+  pybind11_FOUND - true if pybind11 and all required components found on the system
+  pybind11_VERSION - pybind11 version in format Major.Minor.Release
+  pybind11_VERSION_TYPE - pybind11 version type (dev, release)
+  pybind11_INCLUDE_DIRS - Directories where pybind11 and python headers are located.
+  pybind11_INCLUDE_DIR - Directory where pybind11 headers are located.
+  pybind11_DEFINITIONS - Definitions necessary to use pybind11, namely USING_pybind11.
+  pybind11_LIBRARIES - compile flags and python libraries (as needed) to link against.
+  pybind11_LIBRARY - empty.
+
+
+Available components: None
+
+
+Exported targets::
+
+If pybind11 is found, this module defines the following :prop_tgt:`IMPORTED`
+interface library targets::
+
+  pybind11::module - for extension modules
+  pybind11::embed - for embedding the Python interpreter
+
+Python headers, libraries (as needed by platform), and the C++ standard
+are attached to the target.
+
+Advanced targets are also supplied - these are primary for users building
+complex applications, and they are available in all modes::
+
+  pybind11::headers - Just the pybind11 headers and minimum compile requirements
+  pybind11::pybind11 - Python headers too
+  pybind11::python_link_helper - Just the "linking" part of pybind11:module, for CMake < 3.15
+  pybind11::python2_no_register - Quiets the warning/error when mixing C++14+ and Python 2, also included in pybind11::module
+  pybind11::thin_lto - An alternative to INTERPROCEDURAL_OPTIMIZATION
+  pybind11::lto - An alternative to INTERPROCEDURAL_OPTIMIZATION (also avoids thin LTO on clang)
+  pybind11::windows_extras - Adds bigobj and mp for MSVC
+
+Modes::
+
+There are two modes provided; classic, which is built on the old Python
+discovery packages in CMake, or the new FindPython mode, which uses FindPython
+from 3.12+ forward (3.15+ _highly_ recommended).
+
+New FindPython mode::
+
+To activate this mode, either call ``find_package(Python COMPONENTS Interpreter Development)``
+before finding this package, or set the ``PYBIND11_FINDPYTHON`` variable to ON. In this mode,
+you can either use the basic targets, or use the FindPython tools::
+
+  find_package(Python COMPONENTS Interpreter Development)
+  find_package(pybind11 CONFIG)
+
+  # pybind11 method:
+  pybind11_add_module(MyModule1 src1.cpp)
+
+  # Python method:
+  Python_add_library(MyModule2 src2.cpp)
+  target_link_libraries(MyModule2 pybind11::headers)
+  set_target_properties(MyModule2 PROPERTIES
+                                  INTERPROCEDURAL_OPTIMIZATION ON
+                                  CXX__VISIBILITY_PRESET ON
+                                  VISIBLITY_INLINES_HIDDEN ON)
+
+If you build targets yourself, you may be interested in stripping the output
+for reduced size; this is the one other feature that the helper function gives you.
+
+Classic mode::
+
+Set PythonLibsNew variables to influence python detection and
+CMAKE_CXX_STANDARD to influence standard setting. ::
+
+  find_package(pybind11 CONFIG REQUIRED)
+
+  # Create an extension module
+  add_library(mylib MODULE main.cpp)
+  target_link_libraries(mylib PUBLIC pybind11::module)
+
+  # Or embed the Python interpreter into an executable
+  add_executable(myexe main.cpp)
+  target_link_libraries(myexe PUBLIC pybind11::embed)
+
+Suggested usage::
+
+find_package with version info is not recommended except for release versions. ::
+
+  find_package(pybind11 CONFIG)
+  find_package(pybind11 2.0 EXACT CONFIG REQUIRED)
+
+
+The following variables can be set to guide the search for this package::
+
+  pybind11_DIR - CMake variable, set to directory containing this Config file
+  CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
+  PATH - environment variable, set to bin directory of this package
+  CMAKE_DISABLE_FIND_PACKAGE_pybind11 - CMake variable, disables
+    find_package(pybind11) when not REQUIRED, perhaps to force internal build
+
+Helper functions::
+
+  pybind11_add_module(...) - Add a library and setup all helpers
+  pybind11_strip(target) - Strip a target after building it (linux/macOS)
+  pybind11_extension(target) - Injects the Python extension name
+
+See ``pybind11Tools.cmake`` or ``pybind11NewTools.cmake`` for details on
+``pybind11_add_module``.
+
+#]=============================================================================]
+@PACKAGE_INIT@
+
+# Location of pybind11/pybind11.h
+set(pybind11_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@")
+
+set(pybind11_LIBRARY "")
+set(pybind11_DEFINITIONS USING_pybind11)
+set(pybind11_VERSION_TYPE "@pybind11_VERSION_TYPE@")
+
+check_required_components(pybind11)
+
+if(TARGET pybind11::python_link_helper)
+  # This has already been setup elsewhere, such as with a previous call or
+  # add_subdirectory
+  return()
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/pybind11Targets.cmake")
+
+# Easier to use / remember
+add_library(pybind11::headers IMPORTED INTERFACE)
+set_target_properties(pybind11::headers PROPERTIES INTERFACE_LINK_LIBRARIES
+                                                   pybind11::pybind11_headers)
+
+include("${CMAKE_CURRENT_LIST_DIR}/pybind11Common.cmake")
+
+if(NOT pybind11_FIND_QUIETLY)
+  message(
+    STATUS
+      "Found pybind11: ${pybind11_INCLUDE_DIR} (found version \"${pybind11_VERSION}\" ${pybind11_VERSION_TYPE})"
+  )
+endif()
diff --git a/pybind11/tools/pybind11NewTools.cmake b/pybind11/tools/pybind11NewTools.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8f771acd243a3a1ff5338a8aac88b3aae274bc06
--- /dev/null
+++ b/pybind11/tools/pybind11NewTools.cmake
@@ -0,0 +1,203 @@
+# tools/pybind11NewTools.cmake -- Build system for the pybind11 modules
+#
+# Copyright (c) 2020 Wenzel Jakob <wenzel@inf.ethz.ch> and Henry Schreiner
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+get_property(
+  is_config
+  TARGET pybind11::headers
+  PROPERTY IMPORTED)
+
+if(pybind11_FIND_QUIETLY)
+  set(_pybind11_quiet QUIET)
+endif()
+
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  message(FATAL_ERROR "You cannot use the new FindPython module with CMake < 3.12")
+endif()
+
+if(NOT Python_FOUND
+   AND NOT Python3_FOUND
+   AND NOT Python2_FOUND)
+  if(NOT DEFINED Python_FIND_IMPLEMENTATIONS)
+    set(Python_FIND_IMPLEMENTATIONS CPython PyPy)
+  endif()
+
+  # GitHub Actions like activation
+  if(NOT DEFINED Python_ROOT_DIR AND DEFINED ENV{pythonLocation})
+    set(Python_ROOT_DIR "$ENV{pythonLocation}")
+  endif()
+
+  find_package(Python REQUIRED COMPONENTS Interpreter Development ${_pybind11_quiet})
+
+  # If we are in submodule mode, export the Python targets to global targets.
+  # If this behavior is not desired, FindPython _before_ pybind11.
+  if(NOT is_config)
+    set_property(TARGET Python::Python PROPERTY IMPORTED_GLOBAL TRUE)
+    set_property(TARGET Python::Interpreter PROPERTY IMPORTED_GLOBAL TRUE)
+    if(TARGET Python::Module)
+      set_property(TARGET Python::Module PROPERTY IMPORTED_GLOBAL TRUE)
+    endif()
+  endif()
+endif()
+
+if(Python_FOUND)
+  set(_Python
+      Python
+      CACHE INTERNAL "" FORCE)
+elseif(Python3_FOUND AND NOT Python2_FOUND)
+  set(_Python
+      Python3
+      CACHE INTERNAL "" FORCE)
+elseif(Python2_FOUND AND NOT Python3_FOUND)
+  set(_Python
+      Python2
+      CACHE INTERNAL "" FORCE)
+else()
+  message(AUTHOR_WARNING "Python2 and Python3 both present, pybind11 in "
+                         "PYBIND11_NOPYTHON mode (manually activate to silence warning)")
+  set(_pybind11_nopython ON)
+  return()
+endif()
+
+if(PYBIND11_MASTER_PROJECT)
+  if(${_Python}_INTERPRETER_ID MATCHES "PyPy")
+    message(STATUS "PyPy ${${_Python}_PyPy_VERSION} (Py ${${_Python}_VERSION})")
+  else()
+    message(STATUS "${_Python} ${${_Python}_VERSION}")
+  endif()
+endif()
+
+# Debug check - see https://stackoverflow.com/questions/646518/python-how-to-detect-debug-Interpreter
+execute_process(COMMAND ${_Python}::Python -c "import sys; print(hasattr(sys, 'gettotalrefcount'))"
+                OUTPUT_VARIABLE PYTHON_IS_DEBUG)
+
+# Python debug libraries expose slightly different objects before 3.8
+# https://docs.python.org/3.6/c-api/intro.html#debugging-builds
+# https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib
+if(PYTHON_IS_DEBUG)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_COMPILE_DEFINITIONS Py_DEBUG)
+endif()
+
+# Check on every access - since Python2 and Python3 could have been used - do nothing in that case.
+
+if(DEFINED ${_Python}_INCLUDE_DIRS)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${${_Python}_INCLUDE_DIRS}>)
+endif()
+
+if(DEFINED ${_Python}_VERSION AND ${_Python}_VERSION VERSION_LESS 3)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python2_no_register)
+endif()
+
+# In CMake 3.18+, you can find these separately, so include an if
+if(TARGET ${_Python}::${_Python})
+  set_property(
+    TARGET pybind11::embed
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES ${_Python}::${_Python})
+endif()
+
+# CMake 3.15+ has this
+if(TARGET ${_Python}::Module)
+  set_property(
+    TARGET pybind11::module
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES ${_Python}::Module)
+else()
+  set_property(
+    TARGET pybind11::module
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python_link_helper)
+endif()
+
+function(pybind11_add_module target_name)
+  cmake_parse_arguments(PARSE_ARGV 1 ARG "STATIC;SHARED;MODULE;THIN_LTO;NO_EXTRAS" "" "")
+
+  if(ARG_ADD_LIBRARY_STATIC)
+    set(type STATIC)
+  elseif(ARG_ADD_LIBRARY_SHARED)
+    set(type SHARED)
+  else()
+    set(type MODULE)
+  endif()
+
+  if("${_Python}" STREQUAL "Python")
+    python_add_library(${target_name} ${type} WITH_SOABI ${ARG_UNPARSED_ARGUMENTS})
+  elseif("${_Python}" STREQUAL "Python3")
+    python3_add_library(${target_name} ${type} WITH_SOABI ${ARG_UNPARSED_ARGUMENTS})
+  elseif("${_Python}" STREQUAL "Python2")
+    python2_add_library(${target_name} ${type} WITH_SOABI ${ARG_UNPARSED_ARGUMENTS})
+  else()
+    message(FATAL_ERROR "Cannot detect FindPython version: ${_Python}")
+  endif()
+
+  target_link_libraries(${target_name} PRIVATE pybind11::headers)
+
+  if(type STREQUAL "MODULE")
+    target_link_libraries(${target_name} PRIVATE pybind11::module)
+  else()
+    target_link_libraries(${target_name} PRIVATE pybind11::embed)
+  endif()
+
+  if(MSVC)
+    target_link_libraries(${target_name} PRIVATE pybind11::windows_extras)
+  endif()
+
+  if(DEFINED ${_Python}_VERSION AND ${_Python}_VERSION VERSION_LESS 3)
+    target_link_libraries(${target_name} PRIVATE pybind11::python2_no_register)
+  endif()
+
+  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden"
+                                                  CUDA_VISIBILITY_PRESET "hidden")
+
+  if(ARG_NO_EXTRAS)
+    return()
+  endif()
+
+  if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    if(ARG_THIN_LTO)
+      target_link_libraries(${target_name} PRIVATE pybind11::thin_lto)
+    else()
+      target_link_libraries(${target_name} PRIVATE pybind11::lto)
+    endif()
+  endif()
+
+  if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo)
+    # Strip unnecessary sections of the binary on Linux/Mac OS
+    pybind11_strip(${target_name})
+  endif()
+
+  if(MSVC)
+    target_link_libraries(${target_name} PRIVATE pybind11::windows_extras)
+  endif()
+endfunction()
+
+function(pybind11_extension name)
+  set_property(TARGET ${name} PROPERTY PREFIX "")
+
+  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    set_property(TARGET ${name} PROPERTY SUFFIX ".pyd")
+  endif()
+
+  if(${_Python}_SOABI)
+    get_property(
+      suffix
+      TARGET ${name}
+      PROPERTY SUFFIX)
+    if(NOT suffix)
+      set(suffix "${CMAKE_SHARED_MODULE_SUFFIX}")
+    endif()
+    set_property(TARGET ${name} PROPERTY SUFFIX ".${${_Python}_SOABI}${suffix}")
+  endif()
+endfunction()
diff --git a/pybind11/tools/pybind11Tools.cmake b/pybind11/tools/pybind11Tools.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..10f15a30917056f8d69cff833e2c905aede08e50
--- /dev/null
+++ b/pybind11/tools/pybind11Tools.cmake
@@ -0,0 +1,188 @@
+# tools/pybind11Tools.cmake -- Build system for the pybind11 modules
+#
+# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+# Built-in in CMake 3.5+
+include(CMakeParseArguments)
+
+if(pybind11_FIND_QUIETLY)
+  set(_pybind11_quiet QUIET)
+endif()
+
+# If this is the first run, PYTHON_VERSION can stand in for PYBIND11_PYTHON_VERSION
+if(NOT DEFINED PYBIND11_PYTHON_VERSION AND DEFINED PYTHON_VERSION)
+  message(WARNING "Set PYBIND11_PYTHON_VERSION to search for a specific version, not "
+                  "PYTHON_VERSION (which is an output). Assuming that is what you "
+                  "meant to do and continuing anyway.")
+  set(PYBIND11_PYTHON_VERSION
+      "${PYTHON_VERSION}"
+      CACHE STRING "Python version to use for compiling modules")
+  unset(PYTHON_VERSION)
+  unset(PYTHON_VERSION CACHE)
+else()
+  # If this is set as a normal variable, promote it, otherwise, make an empty cache variable.
+  set(PYBIND11_PYTHON_VERSION
+      "${PYBIND11_PYTHON_VERSION}"
+      CACHE STRING "Python version to use for compiling modules")
+endif()
+
+# A user can set versions manually too
+set(Python_ADDITIONAL_VERSIONS
+    "3.9;3.8;3.7;3.6;3.5;3.4"
+    CACHE INTERNAL "")
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED ${_pybind11_quiet})
+list(REMOVE_AT CMAKE_MODULE_PATH -1)
+
+# Cache variables so pybind11_add_module can be used in parent projects
+set(PYTHON_INCLUDE_DIRS
+    ${PYTHON_INCLUDE_DIRS}
+    CACHE INTERNAL "")
+set(PYTHON_LIBRARIES
+    ${PYTHON_LIBRARIES}
+    CACHE INTERNAL "")
+set(PYTHON_MODULE_PREFIX
+    ${PYTHON_MODULE_PREFIX}
+    CACHE INTERNAL "")
+set(PYTHON_MODULE_EXTENSION
+    ${PYTHON_MODULE_EXTENSION}
+    CACHE INTERNAL "")
+set(PYTHON_VERSION_MAJOR
+    ${PYTHON_VERSION_MAJOR}
+    CACHE INTERNAL "")
+set(PYTHON_VERSION_MINOR
+    ${PYTHON_VERSION_MINOR}
+    CACHE INTERNAL "")
+set(PYTHON_VERSION
+    ${PYTHON_VERSION}
+    CACHE INTERNAL "")
+set(PYTHON_IS_DEBUG
+    "${PYTHON_IS_DEBUG}"
+    CACHE INTERNAL "")
+
+if(PYBIND11_MASTER_PROJECT)
+  if(PYTHON_MODULE_EXTENSION MATCHES "pypy")
+    if(NOT DEFINED PYPY_VERSION)
+      execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} -c
+                [=[import sys; print(".".join(map(str, sys.pypy_version_info[:3])))]=]
+        OUTPUT_VARIABLE pypy_version)
+      set(PYPY_VERSION
+          ${pypy_version}
+          CACHE INTERNAL "")
+    endif()
+    message(STATUS "PYPY ${PYPY_VERSION} (Py ${PYTHON_VERSION})")
+  else()
+    message(STATUS "PYTHON ${PYTHON_VERSION}")
+  endif()
+endif()
+
+# Only add Python for build - must be added during the import for config since it has to be re-discovered.
+set_property(
+  TARGET pybind11::pybind11
+  APPEND
+  PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>)
+
+# Python debug libraries expose slightly different objects before 3.8
+# https://docs.python.org/3.6/c-api/intro.html#debugging-builds
+# https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib
+if(PYTHON_IS_DEBUG)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_COMPILE_DEFINITIONS Py_DEBUG)
+endif()
+
+set_property(
+  TARGET pybind11::module
+  APPEND
+  PROPERTY
+    INTERFACE_LINK_LIBRARIES pybind11::python_link_helper
+    "$<$<OR:$<PLATFORM_ID:Windows>,$<PLATFORM_ID:Cygwin>>:$<BUILD_INTERFACE:${PYTHON_LIBRARIES}>>")
+
+if(PYTHON_VERSION VERSION_LESS 3)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python2_no_register)
+endif()
+
+set_property(
+  TARGET pybind11::embed
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11 $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
+
+function(pybind11_extension name)
+  # The prefix and extension are provided by FindPythonLibsNew.cmake
+  set_target_properties(${name} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
+                                           SUFFIX "${PYTHON_MODULE_EXTENSION}")
+endfunction()
+
+# Build a Python extension module:
+# pybind11_add_module(<name> [MODULE | SHARED] [EXCLUDE_FROM_ALL]
+#                     [NO_EXTRAS] [THIN_LTO] source1 [source2 ...])
+#
+function(pybind11_add_module target_name)
+  set(options MODULE SHARED EXCLUDE_FROM_ALL NO_EXTRAS SYSTEM THIN_LTO)
+  cmake_parse_arguments(ARG "${options}" "" "" ${ARGN})
+
+  if(ARG_MODULE AND ARG_SHARED)
+    message(FATAL_ERROR "Can't be both MODULE and SHARED")
+  elseif(ARG_SHARED)
+    set(lib_type SHARED)
+  else()
+    set(lib_type MODULE)
+  endif()
+
+  if(ARG_EXCLUDE_FROM_ALL)
+    set(exclude_from_all EXCLUDE_FROM_ALL)
+  else()
+    set(exclude_from_all "")
+  endif()
+
+  add_library(${target_name} ${lib_type} ${exclude_from_all} ${ARG_UNPARSED_ARGUMENTS})
+
+  target_link_libraries(${target_name} PRIVATE pybind11::module)
+
+  if(ARG_SYSTEM)
+    message(
+      STATUS
+        "Warning: this does not have an effect - use NO_SYSTEM_FROM_IMPORTED if using imported targets"
+    )
+  endif()
+
+  pybind11_extension(${target_name})
+
+  # -fvisibility=hidden is required to allow multiple modules compiled against
+  # different pybind versions to work properly, and for some features (e.g.
+  # py::module_local).  We force it on everything inside the `pybind11`
+  # namespace; also turning it on for a pybind module compilation here avoids
+  # potential warnings or issues from having mixed hidden/non-hidden types.
+  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden"
+                                                  CUDA_VISIBILITY_PRESET "hidden")
+
+  if(ARG_NO_EXTRAS)
+    return()
+  endif()
+
+  if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    if(ARG_THIN_LTO)
+      target_link_libraries(${target_name} PRIVATE pybind11::thin_lto)
+    else()
+      target_link_libraries(${target_name} PRIVATE pybind11::lto)
+    endif()
+  endif()
+
+  if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo)
+    pybind11_strip(${target_name})
+  endif()
+
+  if(MSVC)
+    target_link_libraries(${target_name} PRIVATE pybind11::windows_extras)
+  endif()
+
+endfunction()
diff --git a/pydiffvg/__init__.py b/pydiffvg/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f3dd14e9a7f2da7b6fa7e731b6fd698fb45821
--- /dev/null
+++ b/pydiffvg/__init__.py
@@ -0,0 +1,9 @@
+from .device import *
+from .shape import *
+from .pixel_filter import *
+from .render_pytorch import *
+from .image import *
+from .parse_svg import *
+from .color import *
+from .optimize_svg import *
+from .save_svg import *
\ No newline at end of file
diff --git a/pydiffvg/color.py b/pydiffvg/color.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c360f1ce1601e87b34a6fd36c70274e24dad94
--- /dev/null
+++ b/pydiffvg/color.py
@@ -0,0 +1,24 @@
+import pydiffvg
+import torch
+
+class LinearGradient:
+    def __init__(self,
+                 begin = torch.tensor([0.0, 0.0]),
+                 end = torch.tensor([0.0, 0.0]),
+                 offsets = torch.tensor([0.0]),
+                 stop_colors = torch.tensor([0.0, 0.0, 0.0, 0.0])):
+        self.begin = begin
+        self.end = end
+        self.offsets = offsets
+        self.stop_colors = stop_colors
+
+class RadialGradient:
+    def __init__(self,
+                 center = torch.tensor([0.0, 0.0]),
+                 radius = torch.tensor([0.0, 0.0]),
+                 offsets = torch.tensor([0.0]),
+                 stop_colors = torch.tensor([0.0, 0.0, 0.0, 0.0])):
+        self.center = center
+        self.radius = radius
+        self.offsets = offsets
+        self.stop_colors = stop_colors
diff --git a/pydiffvg/device.py b/pydiffvg/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..420883d60130a8f21e96bae19ba6025ffd0ed55e
--- /dev/null
+++ b/pydiffvg/device.py
@@ -0,0 +1,25 @@
+import torch
+
+use_gpu = torch.cuda.is_available()
+device = torch.device('cuda') if use_gpu else torch.device('cpu')
+
+def set_use_gpu(v):
+    global use_gpu
+    global device
+    use_gpu = v
+    if not use_gpu:
+        device = torch.device('cpu')
+
+def get_use_gpu():
+    global use_gpu
+    return use_gpu
+
+def set_device(d):
+    global device
+    global use_gpu
+    device = d
+    use_gpu = device.type == 'cuda'
+
+def get_device():
+    global device
+    return device
diff --git a/pydiffvg/image.py b/pydiffvg/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..f83fea259fa25df503e67d9793f1b939a0f16177
--- /dev/null
+++ b/pydiffvg/image.py
@@ -0,0 +1,22 @@
+import numpy as np
+import skimage
+import skimage.io
+import os
+
+def imwrite(img, filename, gamma = 2.2, normalize = False):
+    directory = os.path.dirname(filename)
+    if directory != '' and not os.path.exists(directory):
+        os.makedirs(directory)
+
+    if not isinstance(img, np.ndarray):
+        img = img.data.numpy()
+    if normalize:
+        img_rng = np.max(img) - np.min(img)
+        if img_rng > 0:
+            img = (img - np.min(img)) / img_rng
+    img = np.clip(img, 0.0, 1.0)
+    if img.ndim==2:
+        #repeat along the third dimension
+        img=np.expand_dims(img,2)
+    img[:, :, :3] = np.power(img[:, :, :3], 1.0/gamma)
+    skimage.io.imsave(filename, (img * 255).astype(np.uint8))
\ No newline at end of file
diff --git a/pydiffvg/optimize_svg.py b/pydiffvg/optimize_svg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0097f51afca413cfd6a2dcf7ef257a443002ec
--- /dev/null
+++ b/pydiffvg/optimize_svg.py
@@ -0,0 +1,1607 @@
+import json
+import copy
+import xml.etree.ElementTree as etree
+from xml.dom import minidom
+import warnings
+import torch
+import numpy as np
+import re
+import sys
+import pydiffvg
+import math
+from collections import namedtuple
+import cssutils
+
+class SvgOptimizationSettings:
+
+    default_params = {
+        "optimize_color": True,
+        "color_lr": 2e-3,
+        "optimize_alpha": False,
+        "alpha_lr": 2e-3,
+        "optimizer": "Adam",
+        "transforms": {
+            "optimize_transforms":True,
+            "transform_mode":"rigid",
+            "translation_mult":1e-3,
+            "transform_lr":2e-3
+        },
+        "circles": {
+            "optimize_center": True,
+            "optimize_radius": True,
+            "shape_lr": 2e-1
+        },
+        "paths": {
+            "optimize_points": True,
+            "shape_lr": 2e-1
+        },
+        "gradients": {
+            "optimize_stops": True,
+            "stop_lr": 2e-3,
+            "optimize_color": True,
+            "color_lr": 2e-3,
+            "optimize_alpha": False,
+            "alpha_lr": 2e-3,
+            "optimize_location": True,
+            "location_lr": 2e-1
+        }
+    }
+
+    optims = {
+        "Adam": torch.optim.Adam,
+        "SGD": torch.optim.SGD,
+        "ASGD": torch.optim.ASGD,
+    }
+
+    #region methods
+    def __init__(self, f=None):
+        self.store = {}
+        if f is None:
+            self.store["default"] = copy.deepcopy(SvgOptimizationSettings.default_params)
+        else:
+            self.store = json.load(f)
+
+    # create default alias for root
+    def default_name(self, dname):
+        self.dname = dname
+        if dname not in self.store:
+            self.store[dname] = self.store["default"]
+
+    def retrieve(self, node_id):
+        if node_id not in self.store:
+            return (self.store["default"], False)
+        else:
+            return (self.store[node_id], True)
+
+    def reset_to_defaults(self, node_id):
+        if node_id in self.store:
+            del self.store[node_id]
+
+        return self.store["default"]
+
+    def undefault(self, node_id):
+        if node_id not in self.store:
+            self.store[node_id] = copy.deepcopy(self.store["default"])
+
+        return self.store[node_id]
+
+    def override_optimizer(self, optimizer):
+        if optimizer is not None:
+            for v in self.store.values():
+                v["optimizer"] = optimizer
+
+    def global_override(self, path, value):
+        for store in self.store.values():
+            d = store
+            for key in path[:-1]:
+                d = d[key]
+
+            d[path[-1]] = value
+
+    def save(self, file):
+        self.store["default"] = self.store[self.dname]
+        json.dump(self.store, file, indent="\t")
+    #endregion
+
+class OptimizableSvg:
+
+    class TransformTools:
+        @staticmethod
+        def parse_matrix(vals):
+            assert(len(vals)==6)
+            return np.array([[vals[0],vals[2],vals[4]],[vals[1], vals[3], vals[5]],[0,0,1]])
+
+        @staticmethod
+        def parse_translate(vals):
+            assert(len(vals)>=1 and len(vals)<=2)
+            mat=np.eye(3)
+            mat[0,2]=vals[0]
+            if len(vals)>1:
+                mat[1,2]=vals[1]
+            return mat
+
+        @staticmethod
+        def parse_rotate(vals):
+            assert (len(vals) == 1 or len(vals) == 3)
+            mat = np.eye(3)
+            rads=math.radians(vals[0])
+            sint=math.sin(rads)
+            cost=math.cos(rads)
+            mat[0:2, 0:2] = np.array([[cost,-sint],[sint,cost]])
+            if len(vals) > 1:
+                tr1=parse_translate(vals[1:3])
+                tr2=parse_translate([-vals[1],-vals[2]])
+                mat=tr1 @ mat @ tr2
+            return mat
+
+        @staticmethod
+        def parse_scale(vals):
+            assert (len(vals) >= 1 and len(vals) <= 2)
+            d=np.array([vals[0], vals[1] if len(vals)>1 else vals[0],1])
+            return np.diag(d)
+
+        @staticmethod
+        def parse_skewx(vals):
+            assert(len(vals)==1)
+            m=np.eye(3)
+            m[0,1]=vals[0]
+            return m
+
+        @staticmethod
+        def parse_skewy(vals):
+            assert (len(vals) == 1)
+            m = np.eye(3)
+            m[1, 0] = vals[0]
+            return m
+
+        @staticmethod
+        def transformPoints(pointsTensor, transform):
+            assert(transform is not None)
+            one=torch.ones((pointsTensor.shape[0],1),device=pointsTensor.device)
+            homo_points = torch.cat([pointsTensor, one], dim=1)
+            mult = transform.mm(homo_points.permute(1,0)).permute(1,0)
+            tfpoints=mult[:, 0:2].contiguous()
+            #print(torch.norm(mult[:,2]-one))
+            assert(pointsTensor.shape == tfpoints.shape)
+            return tfpoints
+
+        @staticmethod
+        def promote_numpy(M):
+            ret = np.eye(3)
+            ret[0:2, 0:2] = M
+            return ret
+
+        @staticmethod
+        def recompose_numpy(Theta,ScaleXY,ShearX,TXY):
+            cost=math.cos(Theta)
+            sint=math.sin(Theta)
+            Rot=np.array([[cost, -sint],[sint, cost]])
+            Scale=np.diag(ScaleXY)
+            Shear=np.eye(2)
+            Shear[0,1]=ShearX
+
+            Translate=np.eye(3)
+            Translate[0:2,2]=TXY
+            
+            M=OptimizableSvg.TransformTools.promote_numpy(Rot @ Scale @ Shear) @ Translate
+            return M
+
+        @staticmethod
+        def promote(m):
+            M=torch.eye(3).to(m.device)
+            M[0:2,0:2]=m
+            return M
+
+        @staticmethod
+        def make_rot(Theta):
+            sint=Theta.sin().squeeze()
+            cost=Theta.cos().squeeze()
+            #m=torch.tensor([[cost, -sint],[sint, cost]])
+            Rot=torch.stack((torch.stack((cost,-sint)),torch.stack((sint,cost))))
+            return Rot
+
+        @staticmethod
+        def make_scale(ScaleXY):
+            if ScaleXY.squeeze().dim()==0:
+                ScaleXY=ScaleXY.squeeze()
+                #uniform scale
+                return torch.diag(torch.stack([ScaleXY,ScaleXY])).to(ScaleXY.device)
+            else:
+                return torch.diag(ScaleXY).to(ScaleXY.device)
+
+        @staticmethod
+        def make_shear(ShearX):
+            m=torch.eye(2).to(ShearX.device)
+            m[0,1]=ShearX
+            return m
+
+        @staticmethod
+        def make_translate(TXY):
+            m=torch.eye(3).to(TXY.device)
+            m[0:2,2]=TXY
+            return m
+
+        @staticmethod
+        def recompose(Theta,ScaleXY,ShearX,TXY):
+            Rot=OptimizableSvg.TransformTools.make_rot(Theta)
+            Scale=OptimizableSvg.TransformTools.make_scale(ScaleXY)
+            Shear=OptimizableSvg.TransformTools.make_shear(ShearX)
+            Translate=OptimizableSvg.TransformTools.make_translate(TXY)
+
+            return OptimizableSvg.TransformTools.promote(Rot.mm(Scale).mm(Shear)).mm(Translate)
+
+        TransformDecomposition=namedtuple("TransformDecomposition","theta scale shear translate")
+        TransformProperties=namedtuple("TransformProperties", "has_rotation has_scale has_mirror scale_uniform has_shear has_translation")
+
+        @staticmethod
+        def make_named(decomp):
+            if not isinstance(decomp,OptimizableSvg.TransformTools.TransformDecomposition):
+                decomp=OptimizableSvg.TransformTools.TransformDecomposition(theta=decomp[0],scale=decomp[1],shear=decomp[2],translate=decomp[3])
+            return decomp
+
+        @staticmethod
+        def analyze_transform(decomp):
+            decomp=OptimizableSvg.TransformTools.make_named(decomp)
+            epsilon=1e-3
+            has_rotation=abs(decomp.theta)>epsilon
+            has_scale=abs((abs(decomp.scale)-1)).max()>epsilon
+            scale_len=decomp.scale.squeeze().ndim>0 if isinstance(decomp.scale,np.ndarray) else decomp.scale.squeeze().dim() > 0
+            has_mirror=scale_len and decomp.scale[0]*decomp.scale[1] < 0
+            scale_uniform=not scale_len or abs(abs(decomp.scale[0])-abs(decomp.scale[1]))<epsilon
+            has_shear=abs(decomp.shear)>epsilon
+            has_translate=max(abs(decomp.translate[0]),abs(decomp.translate[1]))>epsilon
+
+            return OptimizableSvg.TransformTools.TransformProperties(has_rotation=has_rotation,has_scale=has_scale,has_mirror=has_mirror,scale_uniform=scale_uniform,has_shear=has_shear,has_translation=has_translate)
+
+        @staticmethod
+        def check_and_decomp(M):
+            decomp=OptimizableSvg.TransformTools.decompose(M) if M is not None else OptimizableSvg.TransformTools.TransformDecomposition(theta=0,scale=(1,1),shear=0,translate=(0,0))
+            props=OptimizableSvg.TransformTools.analyze_transform(decomp)
+            return (decomp, props)
+
+        @staticmethod
+        def tf_to_string(M):
+            tfstring = "matrix({} {} {} {} {} {})".format(M[0, 0], M[1, 0], M[0, 1], M[1, 1], M[0, 2], M[1, 2])
+            return tfstring
+
+        @staticmethod
+        def decomp_to_string(decomp):
+            decomp = OptimizableSvg.TransformTools.make_named(decomp)
+            ret=""
+            props=OptimizableSvg.TransformTools.analyze_transform(decomp)
+            if props.has_rotation:
+                ret+="rotate({}) ".format(math.degrees(decomp.theta.item()))
+            if props.has_scale:
+                if decomp.scale.dim()==0:
+                    ret += "scale({}) ".format(decomp.scale.item())
+                else:
+                    ret+="scale({} {}) ".format(decomp.scale[0], decomp.scale[1])
+            if props.has_shear:
+                ret+="skewX({}) ".format(decomp.shear.item())
+            if props.has_translation:
+                ret+="translate({} {}) ".format(decomp.translate[0],decomp.translate[1])
+
+            return ret
+
+        @staticmethod
+        def decompose(M):
+            m = M[0:2, 0:2]
+            t0=M[0:2, 2]
+            #get translation so that we can post-multiply with it
+            TXY=np.linalg.solve(m,t0)
+
+            T=np.eye(3)
+            T[0:2,2]=TXY
+
+            q, r = np.linalg.qr(m)
+
+            ref = np.array([[1, 0], [0, np.sign(np.linalg.det(q))]])
+
+            Rot = np.dot(q, ref)
+
+            ref2 = np.array([[1, 0], [0, np.sign(np.linalg.det(r))]])
+
+            r2 = np.dot(ref2, r)
+
+            Ref = np.dot(ref, ref2)
+            
+            sc = np.diag(r2)
+            Scale = np.diagflat(sc)
+
+            Shear = np.eye(2)
+            Shear[0, 1] = r2[0, 1] / sc[0]
+            #the actual shear coefficient
+            ShearX=r2[0, 1] / sc[0]
+
+            if np.sum(sc) < 0:
+                # both scales are negative, flip this and add a 180 rotation
+                Rot = np.dot(Rot, -np.eye(2))
+                Scale = -Scale
+
+            Theta = math.atan2(Rot[1, 0], Rot[0, 0])
+            ScaleXY = np.array([Scale[0,0],Scale[1,1]*Ref[1,1]])
+
+            return OptimizableSvg.TransformTools.TransformDecomposition(theta=Theta, scale=ScaleXY, shear=ShearX, translate=TXY)
+
+    #region suboptimizers
+
+    #optimizes color, but really any tensor that needs to stay between 0 and 1 per-entry
+    class ColorOptimizer:
+        def __init__(self,tensor,optim_type,lr):
+            self.tensor=tensor
+            self.optim=optim_type([tensor],lr=lr)
+
+        def zero_grad(self):
+            self.optim.zero_grad()
+
+        def step(self):
+            self.optim.step()
+            self.tensor.data.clamp_(min=1e-4,max=1.)
+
+    #optimizes gradient stop positions
+    class StopOptimizer:
+        def __init__(self,stops,optim_type,lr):
+            self.stops=stops
+            self.optim=optim_type([stops],lr=lr)
+
+        def zero_grad(self):
+            self.optim.zero_grad()
+
+        def step(self):
+            self.optim.step()
+            self.stops.data.clamp_(min=0., max=1.)
+            self.stops.data, _ = self.stops.sort()
+            self.stops.data[0] = 0.
+            self.stops.data[-1]=1.
+
+    #optimizes gradient: stop, positions, colors+opacities, locations
+    class GradientOptimizer:
+        def __init__(self, begin, end, offsets, stops, optim_params):
+            self.begin=begin.clone().detach() if begin is not None else None
+            self.end=end.clone().detach() if end is not None else None
+            self.offsets=offsets.clone().detach() if offsets is not None else None
+            self.stop_colors=stops[:,0:3].clone().detach() if stops is not None else None
+            self.stop_alphas=stops[:,3].clone().detach() if stops is not None else None
+            self.optimizers=[]
+
+            if optim_params["gradients"]["optimize_stops"] and self.offsets is not None:
+                self.offsets.requires_grad_(True)
+                self.optimizers.append(OptimizableSvg.StopOptimizer(self.offsets,SvgOptimizationSettings.optims[optim_params["optimizer"]],optim_params["gradients"]["stop_lr"]))
+            if optim_params["gradients"]["optimize_color"] and self.stop_colors is not None:
+                self.stop_colors.requires_grad_(True)
+                self.optimizers.append(OptimizableSvg.ColorOptimizer(self.stop_colors,SvgOptimizationSettings.optims[optim_params["optimizer"]],optim_params["gradients"]["color_lr"]))
+            if optim_params["gradients"]["optimize_alpha"] and self.stop_alphas is not None:
+                self.stop_alphas.requires_grad_(True)
+                self.optimizers.append(OptimizableSvg.ColorOptimizer(self.stop_alphas,SvgOptimizationSettings.optims[optim_params["optimizer"]],optim_params["gradients"]["alpha_lr"]))
+            if optim_params["gradients"]["optimize_location"] and self.begin is not None and self.end is not None:
+                self.begin.requires_grad_(True)
+                self.end.requires_grad_(True)
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]]([self.begin,self.end],lr=optim_params["gradients"]["location_lr"]))
+
+
+        def get_vals(self):
+            return self.begin, self.end, self.offsets, torch.cat((self.stop_colors,self.stop_alphas.unsqueeze(1)),1) if self.stop_colors is not None and self.stop_alphas is not None else None
+
+        def zero_grad(self):
+            for optim in self.optimizers:
+                optim.zero_grad()
+
+        def step(self):
+            for optim in self.optimizers:
+                optim.step()
+
+    class TransformOptimizer:
+        def __init__(self,transform,optim_params):
+            self.transform=transform
+            self.optimizes=optim_params["transforms"]["optimize_transforms"] and transform is not None
+            self.params=copy.deepcopy(optim_params)
+            self.transform_mode=optim_params["transforms"]["transform_mode"]
+
+            if self.optimizes:
+                optimvars=[]
+                self.residual=None
+                lr=optim_params["transforms"]["transform_lr"]
+                tmult=optim_params["transforms"]["translation_mult"]
+                decomp,props=OptimizableSvg.TransformTools.check_and_decomp(transform.cpu().numpy())
+                if self.transform_mode=="move":
+                    #only translation and rotation should be set
+                    if props.has_scale or props.has_shear or props.has_mirror:
+                        print("Warning: set to optimize move only, but input transform has residual scale or shear")
+                        self.residual=self.transform.clone().detach().requires_grad_(False)
+                        self.Theta=torch.tensor(0,dtype=torch.float32,requires_grad=True,device=transform.device)
+                        self.translation=torch.tensor([0, 0],dtype=torch.float32,requires_grad=True,device=transform.device)
+                    else:
+                        self.residual=None
+                        self.Theta=torch.tensor(decomp.theta,dtype=torch.float32,requires_grad=True,device=transform.device)
+                        self.translation=torch.tensor(decomp.translate,dtype=torch.float32,requires_grad=True,device=transform.device)
+                    optimvars+=[{'params':x,'lr':lr} for x in [self.Theta]]+[{'params':self.translation,'lr':lr*tmult}]
+                elif self.transform_mode=="rigid":
+                    #only translation, rotation, and uniform scale should be set
+                    if props.has_shear or props.has_mirror or not props.scale_uniform:
+                        print("Warning: set to optimize rigid transform only, but input transform has residual shear, mirror or non-uniform scale")
+                        self.residual = self.transform.clone().detach().requires_grad_(False)
+                        self.Theta = torch.tensor(0, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.translation = torch.tensor([0, 0], dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale=torch.tensor(1, dtype=torch.float32, requires_grad=True,device=transform.device)
+                    else:
+                        self.residual = None
+                        self.Theta = torch.tensor(decomp.theta, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.translation = torch.tensor(decomp.translate, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale = torch.tensor(decomp.scale[0], dtype=torch.float32, requires_grad=True,device=transform.device)
+                    optimvars += [{'params':x,'lr':lr} for x in [self.Theta, self.scale]]+[{'params':self.translation,'lr':lr*tmult}]
+                elif self.transform_mode=="similarity":
+                    if props.has_shear or not props.scale_uniform:
+                        print("Warning: set to optimize rigid transform only, but input transform has residual shear or non-uniform scale")
+                        self.residual = self.transform.clone().detach().requires_grad_(False)
+                        self.Theta = torch.tensor(0, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.translation = torch.tensor([0, 0], dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale=torch.tensor(1, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale_sign=torch.tensor(1,dtype=torch.float32,requires_grad=False,device=transform.device)
+                    else:
+                        self.residual = None
+                        self.Theta = torch.tensor(decomp.theta, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.translation = torch.tensor(decomp.translate, dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale = torch.tensor(decomp.scale[0], dtype=torch.float32, requires_grad=True,device=transform.device)
+                        self.scale_sign = torch.tensor(np.sign(decomp.scale[0]*decomp.scale[1]), dtype=torch.float32, requires_grad=False,device=transform.device)
+                    optimvars += [{'params':x,'lr':lr} for x in [self.Theta, self.scale]]+[{'params':self.translation,'lr':lr*tmult}]
+                elif self.transform_mode=="affine":
+                    self.Theta = torch.tensor(decomp.theta, dtype=torch.float32, requires_grad=True,device=transform.device)
+                    self.translation = torch.tensor(decomp.translate, dtype=torch.float32, requires_grad=True,device=transform.device)
+                    self.scale = torch.tensor(decomp.scale, dtype=torch.float32, requires_grad=True,device=transform.device)
+                    self.shear = torch.tensor(decomp.shear, dtype=torch.float32, requires_grad=True,device=transform.device)
+                    optimvars += [{'params':x,'lr':lr} for x in [self.Theta, self.scale, self.shear]]+[{'params':self.translation,'lr':lr*tmult}]
+                else:
+                    raise ValueError("Unrecognized transform mode '{}'".format(self.transform_mode))
+                self.optimizer=SvgOptimizationSettings.optims[optim_params["optimizer"]](optimvars)
+
+        def get_transform(self):
+            if not self.optimizes:
+                return self.transform
+            else:
+                if self.transform_mode == "move":
+                    composed=OptimizableSvg.TransformTools.recompose(self.Theta,torch.tensor([1.],device=self.Theta.device),torch.tensor(0.,device=self.Theta.device),self.translation)
+                    return self.residual.mm(composed) if self.residual is not None else composed
+                elif self.transform_mode == "rigid":
+                    composed = OptimizableSvg.TransformTools.recompose(self.Theta, self.scale, torch.tensor(0.,device=self.Theta.device),
+                                                                       self.translation)
+                    return self.residual.mm(composed) if self.residual is not None else composed
+                elif self.transform_mode == "similarity":
+                    composed=OptimizableSvg.TransformTools.recompose(self.Theta, torch.cat((self.scale,self.scale*self.scale_sign)),torch.tensor(0.,device=self.Theta.device),self.translation)
+                    return self.residual.mm(composed) if self.residual is not None else composed
+                elif self.transform_mode == "affine":
+                    composed = OptimizableSvg.TransformTools.recompose(self.Theta, self.scale, self.shear, self.translation)
+                    return composed
+                else:
+                    raise ValueError("Unrecognized transform mode '{}'".format(self.transform_mode))
+
+        def tfToString(self):
+            if self.transform is None:
+                return None
+            elif not self.optimizes:
+                return OptimizableSvg.TransformTools.tf_to_string(self.transform)
+            else:
+                if self.transform_mode == "move":
+                    str=OptimizableSvg.TransformTools.decomp_to_string((self.Theta,torch.tensor([1.]),torch.tensor(0.),self.translation))
+                    return (OptimizableSvg.TransformTools.tf_to_string(self.residual) if self.residual is not None else "")+" "+str
+                elif self.transform_mode == "rigid":
+                    str = OptimizableSvg.TransformTools.decomp_to_string((self.Theta, self.scale, torch.tensor(0.),
+                                                                       self.translation))
+                    return (OptimizableSvg.TransformTools.tf_to_string(self.residual) if self.residual is not None else "")+" "+str
+                elif self.transform_mode == "similarity":
+                    str=OptimizableSvg.TransformTools.decomp_to_string((self.Theta, torch.cat((self.scale,self.scale*self.scale_sign)),torch.tensor(0.),self.translation))
+                    return (OptimizableSvg.TransformTools.tf_to_string(self.residual) if self.residual is not None else "")+" "+str
+                elif self.transform_mode == "affine":
+                    str = OptimizableSvg.TransformTools.decomp_to_string((self.Theta, self.scale, self.shear, self.translation))
+                    return composed
+
+        def zero_grad(self):
+            if self.optimizes:
+                self.optimizer.zero_grad()
+
+        def step(self):
+            if self.optimizes:
+                self.optimizer.step()
+
+    #endregion
+
+    #region Nodes
+    class SvgNode:
+        def __init__(self,id,transform,appearance,settings):
+            self.id=id
+            self.children=[]
+            self.optimizers=[]
+            self.device = settings.device
+            self.transform=torch.tensor(transform,dtype=torch.float32,device=self.device) if transform is not None else None
+            self.transform_optim=OptimizableSvg.TransformOptimizer(self.transform,settings.retrieve(self.id)[0])
+            self.optimizers.append(self.transform_optim)
+            self.proc_appearance(appearance,settings.retrieve(self.id)[0])
+
+        def tftostring(self):
+            return self.transform_optim.tfToString()
+
+        def appearanceToString(self):
+            appstring=""
+            for key,value in self.appearance.items():
+                if key in ["fill", "stroke"]:
+                    #a paint-type value
+                    if value[0] == "none":
+                        appstring+="{}:none;".format(key)
+                    elif value[0] == "solid":
+                        appstring += "{}:{};".format(key,OptimizableSvg.rgb_to_string(value[1]))
+                    elif value[0] == "url":
+                        appstring += "{}:url(#{});".format(key,value[1].id)
+                        #appstring += "{}:{};".format(key,"#ff00ff")
+                elif key in ["opacity", "fill-opacity", "stroke-opacity", "stroke-width", "fill-rule"]:
+                    appstring+="{}:{};".format(key,value)
+                else:
+                    raise ValueError("Don't know how to write appearance parameter '{}'".format(key))
+            return appstring
+
+
+        def write_xml_common_attrib(self,node,tfname="transform"):
+            if self.transform is not None:
+                node.set(tfname,self.tftostring())
+            if len(self.appearance)>0:
+                node.set('style',self.appearanceToString())
+            if self.id is not None:
+                node.set('id',self.id)
+
+
+        def proc_appearance(self,appearance,optim_params):
+            self.appearance=appearance
+            for key, value in appearance.items():
+                if key == "fill" or key == "stroke":
+                    if optim_params["optimize_color"] and value[0]=="solid":
+                        value[1].requires_grad_(True)
+                        self.optimizers.append(OptimizableSvg.ColorOptimizer(value[1],SvgOptimizationSettings.optims[optim_params["optimizer"]],optim_params["color_lr"]))
+                elif key == "fill-opacity" or key == "stroke-opacity" or key == "opacity":
+                    if optim_params["optimize_alpha"]:
+                        value[1].requires_grad_(True)
+                        self.optimizers.append(OptimizableSvg.ColorOptimizer(value[1], optim_params["optimizer"],
+                                                                             optim_params["alpha_lr"]))
+                elif key == "fill-rule" or key == "stroke-width":
+                    pass
+                else:
+                    raise RuntimeError("Unrecognized appearance key '{}'".format(key))
+
+        def prop_transform(self,intform):
+            return intform.matmul(self.transform_optim.get_transform()) if self.transform is not None else intform
+
+        def prop_appearance(self,inappearance):
+            outappearance=copy.copy(inappearance)
+            for key,value in self.appearance.items():
+                if key == "fill":
+                    #gets replaced
+                    outappearance[key]=value
+                elif key == "fill-opacity":
+                    #gets multiplied
+                    outappearance[key] = outappearance[key]*value
+                elif key == "fill-rule":
+                    #gets replaced
+                    outappearance[key] = value
+                elif key =="opacity":
+                    # gets multiplied
+                    outappearance[key] = outappearance[key]*value
+                elif key == "stroke":
+                    # gets replaced
+                    outappearance[key] = value
+                elif key == "stroke-opacity":
+                    # gets multiplied
+                    outappearance[key] = outappearance[key]*value
+                elif key =="stroke-width":
+                    # gets replaced
+                    outappearance[key] = value
+                else:
+                    raise RuntimeError("Unrecognized appearance key '{}'".format(key))
+            return outappearance
+
+        def zero_grad(self):
+            for optim in self.optimizers:
+                optim.zero_grad()
+            for child in self.children:
+                child.zero_grad()
+
+        def step(self):
+            for optim in self.optimizers:
+                optim.step()
+            for child in self.children:
+                child.step()
+
+        def get_type(self):
+            return "Generic node"
+
+        def is_shape(self):
+            return False
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            raise NotImplementedError("Abstract SvgNode cannot recurse")
+
+    class GroupNode(SvgNode):
+        def __init__(self, id, transform, appearance,settings):
+            super().__init__(id, transform, appearance,settings)
+
+        def get_type(self):
+            return "Group node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            outtf=self.prop_transform(transform)
+            outapp=self.prop_appearance(appearance)
+            for child in self.children:
+                child.build_scene(shapes,shape_groups,outtf,outapp)
+
+        def write_xml(self, parent):
+            elm=etree.SubElement(parent,"g")
+            self.write_xml_common_attrib(elm)
+
+            for child in self.children:
+                child.write_xml(elm)
+
+    class RootNode(SvgNode):
+        def __init__(self, id, transform, appearance,settings):
+            super().__init__(id, transform, appearance,settings)
+
+        def write_xml(self,document):
+            elm=etree.Element('svg')
+            self.write_xml_common_attrib(elm)
+            elm.set("version","2.0")
+            elm.set("width",str(document.canvas[0]))
+            elm.set("height", str(document.canvas[1]))
+            elm.set("xmlns","http://www.w3.org/2000/svg")
+            elm.set("xmlns:xlink","http://www.w3.org/1999/xlink")
+            #write definitions before we write any children
+            document.write_defs(elm)
+
+            #write the children
+            for child in self.children:
+                child.write_xml(elm)
+
+            return elm
+
+        def get_type(self):
+            return "Root node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            outtf = self.prop_transform(transform).to(self.device)
+            for child in self.children:
+                child.build_scene(shapes,shape_groups,outtf,appearance)
+
+        @staticmethod
+        def get_default_appearance(device):
+            default_appearance = {"fill": ("solid", torch.tensor([0., 0., 0.],device=device)),
+                                  "fill-opacity": torch.tensor([1.],device=device),
+                                  "fill-rule": "nonzero",
+                                  "opacity": torch.tensor([1.],device=device),
+                                  "stroke": ("none", None),
+                                  "stroke-opacity": torch.tensor([1.],device=device),
+                                  "stroke-width": torch.tensor([0.],device=device)}
+            return default_appearance
+
+        @staticmethod
+        def get_default_transform():
+            return torch.eye(3)
+
+
+
+    class ShapeNode(SvgNode):
+        def __init__(self, id, transform, appearance,settings):
+            super().__init__(id, transform, appearance,settings)
+
+        def get_type(self):
+            return "Generic shape node"
+
+        def is_shape(self):
+            return True
+
+        def construct_paint(self,value,combined_opacity,transform):
+            if value[0]   == "none":
+                return None
+            elif value[0] == "solid":
+                return torch.cat([value[1],combined_opacity]).to(self.device)
+            elif value[0] == "url":
+                #get the gradient object from this node
+                return value[1].getGrad(combined_opacity,transform)
+            else:
+                raise ValueError("Unknown paint value type '{}'".format(value[0]))
+
+        def make_shape_group(self,appearance,transform,num_shapes,num_subobjects):
+            fill=self.construct_paint(appearance["fill"],appearance["opacity"]*appearance["fill-opacity"],transform)
+            stroke=self.construct_paint(appearance["stroke"],appearance["opacity"]*appearance["stroke-opacity"],transform)
+            sg = pydiffvg.ShapeGroup(shape_ids=torch.tensor(range(num_shapes, num_shapes + num_subobjects)),
+                                     fill_color=fill,
+                                     use_even_odd_rule=appearance["fill-rule"]=="evenodd",
+                                     stroke_color=stroke,
+                                     shape_to_canvas=transform,
+                                     id=self.id)
+            return sg
+
+    class PathNode(ShapeNode):
+        def __init__(self, id, transform, appearance,settings, paths):
+            super().__init__(id, transform, appearance,settings)
+            self.proc_paths(paths,settings.retrieve(self.id)[0])
+
+        def proc_paths(self,paths,optim_params):
+            self.paths=paths
+            if optim_params["paths"]["optimize_points"]:
+                ptlist=[]
+                for path in paths:
+                    ptlist.append(path.points.requires_grad_(True))
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]](ptlist,lr=optim_params["paths"]["shape_lr"]))
+
+        def get_type(self):
+            return "Path node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            applytf=self.prop_transform(transform)
+            applyapp = self.prop_appearance(appearance)
+            sg=self.make_shape_group(applyapp,applytf,len(shapes),len(self.paths))
+            for path in self.paths:
+                disp_path=pydiffvg.Path(path.num_control_points,path.points,path.is_closed,applyapp["stroke-width"],path.id)
+                shapes.append(disp_path)
+            shape_groups.append(sg)
+
+        def path_to_string(self,path):
+            path_string = "M {},{} ".format(path.points[0][0].item(), path.points[0][1].item())
+            idx = 1
+            numpoints = path.points.shape[0]
+            for type in path.num_control_points:
+                toproc = type + 1
+                if type == 0:
+                    # add line
+                    path_string += "L "
+                elif type == 1:
+                    # add quadric
+                    path_string += "Q "
+                elif type == 2:
+                    # add cubic
+                    path_string += "C "
+                while toproc > 0:
+                    path_string += "{},{} ".format(path.points[idx % numpoints][0].item(),
+                                                   path.points[idx % numpoints][1].item())
+                    idx += 1
+                    toproc -= 1
+            if path.is_closed:
+                path_string += "Z "
+
+            return path_string
+
+        def paths_string(self):
+            pstr=""
+            for path in self.paths:
+                pstr+=self.path_to_string(path)
+            return pstr
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "path")
+            self.write_xml_common_attrib(elm)
+            elm.set("d",self.paths_string())
+
+            for child in self.children:
+                child.write_xml(elm)
+
+    class RectNode(ShapeNode):
+        def __init__(self, id, transform, appearance,settings, rect):
+            super().__init__(id, transform, appearance,settings)
+            self.rect=torch.tensor(rect,dtype=torch.float,device=settings.device)
+            optim_params=settings.retrieve(self.id)[0]
+            #borrowing path settings for this
+            if optim_params["paths"]["optimize_points"]:
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]]([self.rect],lr=optim_params["paths"]["shape_lr"]))
+
+        def get_type(self):
+            return "Rect node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            applytf=self.prop_transform(transform)
+            applyapp = self.prop_appearance(appearance)
+            sg=self.make_shape_group(applyapp,applytf,len(shapes),1)
+            shapes.append(pydiffvg.Rect(self.rect[0:2],self.rect[0:2]+self.rect[2:4],applyapp["stroke-width"],self.id))
+            shape_groups.append(sg)
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "rect")
+            self.write_xml_common_attrib(elm)
+            elm.set("x",str(self.rect[0]))
+            elm.set("y", str(self.rect[1]))
+            elm.set("width", str(self.rect[2]))
+            elm.set("height", str(self.rect[3]))
+
+            for child in self.children:
+                child.write_xml(elm)
+
+    class CircleNode(ShapeNode):
+        def __init__(self, id, transform, appearance,settings, rect):
+            super().__init__(id, transform, appearance,settings)
+            self.circle=torch.tensor(rect,dtype=torch.float,device=settings.device)
+            optim_params=settings.retrieve(self.id)[0]
+            #borrowing path settings for this
+            if optim_params["paths"]["optimize_points"]:
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]]([self.circle],lr=optim_params["paths"]["shape_lr"]))
+
+        def get_type(self):
+            return "Circle node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            applytf=self.prop_transform(transform)
+            applyapp = self.prop_appearance(appearance)
+            sg=self.make_shape_group(applyapp,applytf,len(shapes),1)
+            shapes.append(pydiffvg.Circle(self.circle[2],self.circle[0:2],applyapp["stroke-width"],self.id))
+            shape_groups.append(sg)
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "circle")
+            self.write_xml_common_attrib(elm)
+            elm.set("cx",str(self.circle[0]))
+            elm.set("cy", str(self.circle[1]))
+            elm.set("r", str(self.circle[2]))
+
+            for child in self.children:
+                child.write_xml(elm)
+
+
+    class EllipseNode(ShapeNode):
+        def __init__(self, id, transform, appearance,settings, ellipse):
+            super().__init__(id, transform, appearance,settings)
+            self.ellipse=torch.tensor(ellipse,dtype=torch.float,device=settings.device)
+            optim_params=settings.retrieve(self.id)[0]
+            #borrowing path settings for this
+            if optim_params["paths"]["optimize_points"]:
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]]([self.ellipse],lr=optim_params["paths"]["shape_lr"]))
+
+        def get_type(self):
+            return "Ellipse node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            applytf=self.prop_transform(transform)
+            applyapp = self.prop_appearance(appearance)
+            sg=self.make_shape_group(applyapp,applytf,len(shapes),1)
+            shapes.append(pydiffvg.Ellipse(self.ellipse[2:4],self.ellipse[0:2],applyapp["stroke-width"],self.id))
+            shape_groups.append(sg)
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "ellipse")
+            self.write_xml_common_attrib(elm)
+            elm.set("cx", str(self.ellipse[0]))
+            elm.set("cy", str(self.ellipse[1]))
+            elm.set("rx", str(self.ellipse[2]))
+            elm.set("ry", str(self.ellipse[3]))
+
+            for child in self.children:
+                child.write_xml(elm)
+
+    class PolygonNode(ShapeNode):
+        def __init__(self, id, transform, appearance,settings, points):
+            super().__init__(id, transform, appearance,settings)
+            self.points=points
+            optim_params=settings.retrieve(self.id)[0]
+            #borrowing path settings for this
+            if optim_params["paths"]["optimize_points"]:
+                self.optimizers.append(SvgOptimizationSettings.optims[optim_params["optimizer"]]([self.points],lr=optim_params["paths"]["shape_lr"]))
+
+        def get_type(self):
+            return "Polygon node"
+
+        def build_scene(self,shapes,shape_groups,transform,appearance):
+            applytf=self.prop_transform(transform)
+            applyapp = self.prop_appearance(appearance)
+            sg=self.make_shape_group(applyapp,applytf,len(shapes),1)
+            shapes.append(pydiffvg.Polygon(self.points,True,applyapp["stroke-width"],self.id))
+            shape_groups.append(sg)
+
+        def point_string(self):
+            ret=""
+            for i in range(self.points.shape[0]):
+                pt=self.points[i,:]
+                #assert pt.shape == (1,2)
+                ret+= str(pt[0])+","+str(pt[1])+" "
+            return ret
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "polygon")
+            self.write_xml_common_attrib(elm)
+            elm.set("points",self.point_string())
+
+            for child in self.children:
+                child.write_xml(elm)
+
+    class GradientNode(SvgNode):
+        def __init__(self, id, transform,settings,begin,end,offsets,stops,href):
+            super().__init__(id, transform, {},settings)
+            self.optim=OptimizableSvg.GradientOptimizer(begin, end, offsets, stops, settings.retrieve(id)[0])
+            self.optimizers.append(self.optim)
+            self.href=href
+
+        def is_ref(self):
+            return self.href is not None
+
+        def get_type(self):
+            return "Gradient node"
+
+        def get_stops(self):
+            _, _, offsets, stops=self.optim.get_vals()
+            return offsets, stops
+
+        def get_points(self):
+            begin, end, _, _ =self.optim.get_vals()
+            return begin, end
+
+        def write_xml(self, parent):
+            elm = etree.SubElement(parent, "linearGradient")
+            self.write_xml_common_attrib(elm,tfname="gradientTransform")
+
+            begin, end, offsets, stops = self.optim.get_vals()
+
+            if self.href is None:
+                #we have stops
+                for idx, offset in enumerate(offsets):
+                    stop=etree.SubElement(elm,"stop")
+                    stop.set("offset",str(offset.item()))
+                    stop.set("stop-color",OptimizableSvg.rgb_to_string(stops[idx,0:3]))
+                    stop.set("stop-opacity",str(stops[idx,3].item()))
+            else:
+                elm.set('xlink:href', "#{}".format(self.href.id))
+
+            if begin is not None and end is not None:
+                #no stops
+                elm.set('x1', str(begin[0].item()))
+                elm.set('y1', str(begin[1].item()))
+                elm.set('x2', str(end[0].item()))
+                elm.set('y2', str(end[1].item()))
+
+                # magic value to make this work
+                elm.set("gradientUnits", "userSpaceOnUse")
+
+            for child in self.children:
+                child.write_xml(elm)
+
+        def getGrad(self,combined_opacity,transform):
+            if self.is_ref():
+                offsets, stops=self.href.get_stops()
+            else:
+                offsets, stops=self.get_stops()
+
+            stops=stops.clone()
+            stops[:,3]*=combined_opacity
+
+            begin,end = self.get_points()
+
+            applytf=self.prop_transform(transform)
+            begin=OptimizableSvg.TransformTools.transformPoints(begin.unsqueeze(0),applytf).squeeze()
+            end = OptimizableSvg.TransformTools.transformPoints(end.unsqueeze(0), applytf).squeeze()
+
+            return pydiffvg.LinearGradient(begin, end, offsets, stops)
+    #endregion
+
+    def __init__(self, filename, settings=SvgOptimizationSettings(),optimize_background=False, verbose=False, device=torch.device("cpu")):
+        self.settings=settings
+        self.verbose=verbose
+        self.device=device
+        self.settings.device=device
+
+        tree = etree.parse(filename)
+        root = tree.getroot()
+
+        #in case we need global optimization
+        self.optimizers=[]
+        self.background=torch.tensor([1.,1.,1.],dtype=torch.float32,requires_grad=optimize_background,device=self.device)
+
+        if optimize_background:
+            p=settings.retrieve("default")[0]
+            self.optimizers.append(OptimizableSvg.ColorOptimizer(self.background,SvgOptimizationSettings.optims[p["optimizer"]],p["color_lr"]))
+
+        self.defs={}
+
+        self.depth=0
+
+        self.dirty=True
+        self.scene=None
+
+        self.parseRoot(root)
+
+    recognised_shapes=["path","circle","rect","ellipse","polygon"]
+
+    #region core functionality
+    def build_scene(self):
+        if self.dirty:
+            shape_groups=[]
+            shapes=[]
+            self.root.build_scene(shapes,shape_groups,OptimizableSvg.RootNode.get_default_transform().to(self.device),OptimizableSvg.RootNode.get_default_appearance(self.device))
+            self.scene=(self.canvas[0],self.canvas[1],shapes,shape_groups)
+            self.dirty=False
+        return self.scene
+
+    def zero_grad(self):
+        self.root.zero_grad()
+        for optim in self.optimizers:
+            optim.zero_grad()
+        for item in self.defs.values():
+            if issubclass(item.__class__,OptimizableSvg.SvgNode):
+                item.zero_grad()
+
+    def render(self,scale=None,seed=0):
+        #render at native resolution
+        scene = self.build_scene()
+        scene_args = pydiffvg.RenderFunction.serialize_scene(*scene)
+        render = pydiffvg.RenderFunction.apply
+        out_size=(scene[0],scene[1]) if scale is None else (int(scene[0]*scale),int(scene[1]*scale))
+        img = render(out_size[0],  # width
+                     out_size[1],  # height
+                     2,  # num_samples_x
+                     2,  # num_samples_y
+                     seed,  # seed
+                     None, # background_image
+                     *scene_args)
+        return img
+
+    def step(self):
+        self.dirty=True
+        self.root.step()
+        for optim in self.optimizers:
+            optim.step()
+        for item in self.defs.values():
+            if issubclass(item.__class__, OptimizableSvg.SvgNode):
+                item.step()
+    #endregion
+
+    #region reporting
+
+    def offset_str(self,s):
+        return ("\t"*self.depth)+s
+
+    def reportSkippedAttribs(self, node, non_skipped=[]):
+        skipped=set([k for k in node.attrib.keys() if not OptimizableSvg.is_namespace(k)])-set(non_skipped)
+        if len(skipped)>0:
+            tag=OptimizableSvg.remove_namespace(node.tag) if "id" not in node.attrib else "{}#{}".format(OptimizableSvg.remove_namespace(node.tag),node.attrib["id"])
+            print(self.offset_str("Warning: Skipping the following attributes of node '{}': {}".format(tag,", ".join(["'{}'".format(atr) for atr in skipped]))))
+
+    def reportSkippedChildren(self,node,skipped):
+        skipped_names=["{}#{}".format(elm.tag,elm.attrib["id"]) if "id" in elm.attrib else elm.tag for elm in skipped]
+        if len(skipped)>0:
+            tag = OptimizableSvg.remove_namespace(node.tag) if "id" not in node.attrib else "{}#{}".format(OptimizableSvg.remove_namespace(node.tag),
+                                                                                            node.attrib["id"])
+            print(self.offset_str("Warning: Skipping the following children of node '{}': {}".format(tag,", ".join(["'{}'".format(name) for name in skipped_names]))))
+
+    #endregion
+
+    #region parsing
+    @staticmethod
+    def remove_namespace(s):
+        """
+            {...} ... -> ...
+        """
+        return re.sub('{.*}', '', s)
+
+    @staticmethod
+    def is_namespace(s):
+        return re.match('{.*}', s) is not None
+
+    @staticmethod
+    def parseTransform(node):
+        if "transform" not in node.attrib and "gradientTransform" not in node.attrib:
+            return None
+
+        tf_string=node.attrib["transform"] if "transform" in node.attrib else node.attrib["gradientTransform"]
+        tforms=tf_string.split(")")[:-1]
+        mat=np.eye(3)
+        for tform in tforms:
+            type = tform.split("(")[0]
+            args = [float(val) for val in re.split("[, ]+",tform.split("(")[1])]
+            if type == "matrix":
+                mat=mat @ OptimizableSvg.TransformTools.parse_matrix(args)
+            elif type == "translate":
+                mat = mat @ OptimizableSvg.TransformTools.parse_translate(args)
+            elif type == "rotate":
+                mat = mat @ OptimizableSvg.TransformTools.parse_rotate(args)
+            elif type == "scale":
+                mat = mat @ OptimizableSvg.TransformTools.parse_scale(args)
+            elif type == "skewX":
+                mat = mat @ OptimizableSvg.TransformTools.parse_skewx(args)
+            elif type == "skewY":
+                mat = mat @ OptimizableSvg.TransformTools.parse_skewy(args)
+            else:
+                raise ValueError("Unknown transform type '{}'".format(type))
+        return mat
+
+    #dictionary that defines what constant do we need to multiply different units to get the value in pixels
+    #gleaned from the CSS definition
+    unit_dict = {"px":1,
+                 "mm":4,
+                 "cm":40,
+                 "in":25.4*4,
+                 "pt":25.4*4/72,
+                 "pc":25.4*4/6
+                 }
+
+    @staticmethod
+    def parseLength(s):
+        #length is a number followed possibly by a unit definition
+        #we assume that default unit is the pixel (px) equal to 0.25mm
+        #last two characters might be unit
+        val=None
+        for i in range(len(s)):
+            try:
+                val=float(s[:len(s)-i])
+                unit=s[len(s)-i:]
+                break
+            except ValueError:
+                continue
+        if len(unit)>0 and unit not in OptimizableSvg.unit_dict:
+            raise ValueError("Unknown or unsupported unit '{}' encountered while parsing".format(unit))
+        if unit != "":
+            val*=OptimizableSvg.unit_dict[unit]
+        return val
+
+    @staticmethod
+    def parseOpacity(s):
+        is_percent=s.endswith("%")
+        s=s.rstrip("%")
+        val=float(s)
+        if is_percent:
+            val=val/100
+        return np.clip(val,0.,1.)
+
+    @staticmethod
+    def parse_color(s):
+        """
+            Hex to tuple
+        """
+        if s[0] != '#':
+            raise ValueError("Color argument `{}` not supported".format(s))
+        s = s.lstrip('#')
+        if len(s)==6:
+            rgb = tuple(int(s[i:i + 2], 16) for i in (0, 2, 4))
+            return torch.tensor([rgb[0] / 255.0, rgb[1] / 255.0, rgb[2] / 255.0])
+        elif len(s)==3:
+            rgb = tuple((int(s[i:i + 1], 16)) for i in (0, 1, 2))
+            return torch.tensor([rgb[0] / 15.0, rgb[1] / 15.0, rgb[2] / 15.0])
+        else:
+            raise ValueError("Color argument `{}` not supported".format(s))
+        # sRGB to RGB
+        # return torch.pow(torch.tensor([rgb[0] / 255.0, rgb[1] / 255.0, rgb[2] / 255.0]), 2.2)
+
+
+    @staticmethod
+    def rgb_to_string(val):
+        byte_rgb=(val.clone().detach()*255).type(torch.int)
+        byte_rgb.clamp_(min=0,max=255)
+        s="#{:02x}{:02x}{:02x}".format(*byte_rgb)
+        return s
+
+    #parses a "paint" string for use in fill and stroke definitions
+    @staticmethod
+    def parsePaint(paintStr,defs,device):
+        paintStr=paintStr.strip()
+        if paintStr=="none":
+            return ("none", None)
+        elif paintStr[0]=="#":
+            return ("solid",OptimizableSvg.parse_color(paintStr).to(device))
+        elif paintStr.startswith("url"):
+            url=paintStr.lstrip("url(").rstrip(")").strip("\'\"").lstrip("#")
+            if url not in defs:
+                raise ValueError("Paint-type attribute referencing an unknown object with ID '#{}'".format(url))
+            return ("url",defs[url])
+        else:
+            raise ValueError("Unrecognized paint string: '{}'".format(paintStr))
+
+    appearance_keys=["fill","fill-opacity","fill-rule","opacity","stroke","stroke-opacity","stroke-width"]
+
+    @staticmethod
+    def parseAppearance(node, defs, device):
+        ret={}
+        parse_keys = OptimizableSvg.appearance_keys
+        local_dict={key:value for key,value in node.attrib.items() if key in parse_keys}
+        css_dict={}
+        style_dict={}
+        appearance_dict={}
+        if "class" in node.attrib:
+            cls=node.attrib["class"]
+            if "."+cls in defs:
+                css_string=defs["."+cls]
+                css_dict={item.split(":")[0]:item.split(":")[1] for item in css_string.split(";") if len(item)>0 and item.split(":")[0] in parse_keys}
+        if "style" in node.attrib:
+            style_string=node.attrib["style"]
+            style_dict={item.split(":")[0]:item.split(":")[1] for item in style_string.split(";") if len(item)>0 and item.split(":")[0] in parse_keys}
+        appearance_dict.update(css_dict)
+        appearance_dict.update(style_dict)
+        appearance_dict.update(local_dict)
+        for key,value in appearance_dict.items():
+            if key=="fill":
+                ret[key]=OptimizableSvg.parsePaint(value,defs,device)
+            elif key == "fill-opacity":
+                ret[key]=torch.tensor(OptimizableSvg.parseOpacity(value),device=device)
+            elif key == "fill-rule":
+                ret[key]=value
+            elif key == "opacity":
+                ret[key]=torch.tensor(OptimizableSvg.parseOpacity(value),device=device)
+            elif key == "stroke":
+                ret[key]=OptimizableSvg.parsePaint(value,defs,device)
+            elif key == "stroke-opacity":
+                ret[key]=torch.tensor(OptimizableSvg.parseOpacity(value),device=device)
+            elif key == "stroke-width":
+                ret[key]=torch.tensor(OptimizableSvg.parseLength(value),device=device)
+            else:
+                raise ValueError("Error while parsing appearance attributes: key '{}' should not be here".format(key))
+
+        return ret
+
+    def parseRoot(self,root):
+        if self.verbose:
+            print(self.offset_str("Parsing root"))
+        self.depth += 1
+
+        # get document canvas dimensions
+        self.parseViewport(root)
+        canvmax=np.max(self.canvas)
+        self.settings.global_override(["transforms","translation_mult"],canvmax)
+        id=root.attrib["id"] if "id" in root.attrib else None
+
+        transform=OptimizableSvg.parseTransform(root)
+        appearance=OptimizableSvg.parseAppearance(root,self.defs,self.device)
+
+        version=root.attrib["version"] if "version" in root.attrib else "<unknown version>"
+        if version != "2.0":
+            print(self.offset_str("Warning: Version {} is not 2.0, strange things may happen".format(version)))
+
+        self.root=OptimizableSvg.RootNode(id,transform,appearance,self.settings)
+
+        if self.verbose:
+            self.reportSkippedAttribs(root, ["width", "height", "id", "transform","version", "style"]+OptimizableSvg.appearance_keys)
+
+        #go through the root children and parse them appropriately
+        skipped=[]
+        for child in root:
+            if OptimizableSvg.remove_namespace(child.tag) in OptimizableSvg.recognised_shapes:
+                self.parseShape(child,self.root)
+            elif OptimizableSvg.remove_namespace(child.tag) == "defs":
+                self.parseDefs(child)
+            elif OptimizableSvg.remove_namespace(child.tag) == "style":
+                self.parseStyle(child)
+            elif OptimizableSvg.remove_namespace(child.tag) == "g":
+                self.parseGroup(child,self.root)
+            else:
+                skipped.append(child)
+
+        if self.verbose:
+            self.reportSkippedChildren(root,skipped)
+
+        self.depth-=1
+
+    def parseShape(self,shape,parent):
+        tag=OptimizableSvg.remove_namespace(shape.tag)
+        if self.verbose:
+            print(self.offset_str("Parsing {}#{}".format(tag,shape.attrib["id"] if "id" in shape.attrib else "<No ID>")))
+
+        self.depth+=1
+        if tag == "path":
+            self.parsePath(shape,parent)
+        elif tag == "circle":
+            self.parseCircle(shape,parent)
+        elif tag == "rect":
+            self.parseRect(shape,parent)
+        elif tag == "ellipse":
+            self.parseEllipse(shape,parent)
+        elif tag == "polygon":
+            self.parsePolygon(shape,parent)
+        else:
+            raise ValueError("Encountered unknown shape type '{}'".format(tag))
+        self.depth -= 1
+
+    def parsePath(self,shape,parent):
+        path_string=shape.attrib['d']
+        name = ''
+        if 'id' in shape.attrib:
+            name = shape.attrib['id']
+        paths = pydiffvg.from_svg_path(path_string)
+        for idx, path in enumerate(paths):
+            path.stroke_width = torch.tensor([0.],device=self.device)
+            path.num_control_points=path.num_control_points.to(self.device)
+            path.points=path.points.to(self.device)
+            path.source_id = name
+            path.id = "{}-{}".format(name,idx) if len(paths)>1 else name
+        transform = OptimizableSvg.parseTransform(shape)
+        appearance = OptimizableSvg.parseAppearance(shape,self.defs,self.device)
+        node=OptimizableSvg.PathNode(name,transform,appearance,self.settings,paths)
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(shape, ["id","d","transform","style"]+OptimizableSvg.appearance_keys)
+            self.reportSkippedChildren(shape,list(shape))
+
+    def parseEllipse(self, shape, parent):
+        cx = float(shape.attrib["cx"]) if "cx" in shape.attrib else 0.
+        cy = float(shape.attrib["cy"]) if "cy" in shape.attrib else 0.
+        rx = float(shape.attrib["rx"])
+        ry = float(shape.attrib["ry"])
+        name = ''
+        if 'id' in shape.attrib:
+            name = shape.attrib['id']
+        transform = OptimizableSvg.parseTransform(shape)
+        appearance = OptimizableSvg.parseAppearance(shape, self.defs, self.device)
+        node = OptimizableSvg.EllipseNode(name, transform, appearance, self.settings, (cx, cy, rx, ry))
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(shape, ["id", "x", "y", "r", "transform",
+                                              "style"] + OptimizableSvg.appearance_keys)
+            self.reportSkippedChildren(shape, list(shape))
+
+    def parsePolygon(self, shape, parent):
+        points_string = shape.attrib['points']
+        name = ''
+        points=[]
+        for point_string in points_string.split(" "):
+            if len(point_string) == 0:
+                continue
+            coord_strings=point_string.split(",")
+            assert len(coord_strings)==2
+            points.append([float(coord_strings[0]),float(coord_strings[1])])
+        points=torch.tensor(points,dtype=torch.float,device=self.device)
+        if 'id' in shape.attrib:
+            name = shape.attrib['id']
+        transform = OptimizableSvg.parseTransform(shape)
+        appearance = OptimizableSvg.parseAppearance(shape, self.defs, self.device)
+        node = OptimizableSvg.PolygonNode(name, transform, appearance, self.settings, points)
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(shape, ["id", "points", "transform", "style"] + OptimizableSvg.appearance_keys)
+            self.reportSkippedChildren(shape, list(shape))
+
+    def parseCircle(self,shape,parent):
+        cx = float(shape.attrib["cx"]) if "cx" in shape.attrib else 0.
+        cy = float(shape.attrib["cy"]) if "cy" in shape.attrib else 0.
+        r = float(shape.attrib["r"])
+        name = ''
+        if 'id' in shape.attrib:
+            name = shape.attrib['id']
+        transform = OptimizableSvg.parseTransform(shape)
+        appearance = OptimizableSvg.parseAppearance(shape, self.defs, self.device)
+        node = OptimizableSvg.CircleNode(name, transform, appearance, self.settings, (cx, cy, r))
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(shape, ["id", "x", "y", "r", "transform",
+                                              "style"] + OptimizableSvg.appearance_keys)
+            self.reportSkippedChildren(shape, list(shape))
+
+    def parseRect(self,shape,parent):
+        x =      float(shape.attrib["x"]) if "x" in shape.attrib else 0.
+        y =      float(shape.attrib["y"]) if "y" in shape.attrib else 0.
+        width =  float(shape.attrib["width"])
+        height = float(shape.attrib["height"])
+        name = ''
+        if 'id' in shape.attrib:
+            name = shape.attrib['id']
+        transform = OptimizableSvg.parseTransform(shape)
+        appearance = OptimizableSvg.parseAppearance(shape, self.defs, self.device)
+        node = OptimizableSvg.RectNode(name, transform, appearance, self.settings, (x,y,width,height))
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(shape, ["id", "x", "y", "width", "height", "transform", "style"] + OptimizableSvg.appearance_keys)
+            self.reportSkippedChildren(shape, list(shape))
+
+    def parseGroup(self,group,parent):
+        tag = OptimizableSvg.remove_namespace(group.tag)
+        id = group.attrib["id"] if "id" in group.attrib else "<No ID>"
+        if self.verbose:
+            print(self.offset_str("Parsing {}#{}".format(tag, id)))
+
+        self.depth+=1
+
+        transform=self.parseTransform(group)
+
+        #todo process more attributes
+        appearance=OptimizableSvg.parseAppearance(group,self.defs,self.device)
+        node=OptimizableSvg.GroupNode(id,transform,appearance,self.settings)
+        parent.children.append(node)
+
+        if self.verbose:
+            self.reportSkippedAttribs(group,["id","transform","style"]+OptimizableSvg.appearance_keys)
+
+        skipped_children=[]
+        for child in group:
+            if OptimizableSvg.remove_namespace(child.tag) in OptimizableSvg.recognised_shapes:
+                self.parseShape(child,node)
+            elif OptimizableSvg.remove_namespace(child.tag) == "defs":
+                self.parseDefs(child)
+            elif OptimizableSvg.remove_namespace(child.tag) == "style":
+                self.parseStyle(child)
+            elif OptimizableSvg.remove_namespace(child.tag) == "g":
+                self.parseGroup(child,node)
+            else:
+                skipped_children.append(child)
+
+        if self.verbose:
+            self.reportSkippedChildren(group,skipped_children)
+
+        self.depth-=1
+
+    def parseStyle(self,style_node):
+        tag = OptimizableSvg.remove_namespace(style_node.tag)
+        id = style_node.attrib["id"] if "id" in style_node.attrib else "<No ID>"
+        if self.verbose:
+            print(self.offset_str("Parsing {}#{}".format(tag, id)))
+
+        if style_node.attrib["type"] != "text/css":
+            raise ValueError("Only text/css style recognized, got {}".format(style_node.attrib["type"]))
+
+        self.depth += 1
+
+        # creating only a dummy node
+        node = OptimizableSvg.SvgNode(id, None, {}, self.settings)
+
+        if self.verbose:
+            self.reportSkippedAttribs(def_node, ["id"])
+
+        if len(style_node)>0:
+            raise ValueError("Style node should not have children (has {})".format(len(style_node)))
+
+        # collect CSS classes
+        sheet = cssutils.parseString(style_node.text)
+        for rule in sheet:
+            if hasattr(rule, 'selectorText') and hasattr(rule, 'style'):
+                name = rule.selectorText
+                if len(name) >= 2 and name[0] == '.':
+                    self.defs[name] = rule.style.getCssText().replace("\n","")
+                else:
+                    raise ValueError("Unrecognized CSS selector {}".format(name))
+            else:
+                raise ValueError("No style or selector text in CSS rule")
+
+        if self.verbose:
+            self.reportSkippedChildren(def_node, skipped_children)
+
+        self.depth -= 1
+
+    def parseDefs(self,def_node):
+        #only linear gradients are currently supported
+        tag = OptimizableSvg.remove_namespace(def_node.tag)
+        id = def_node.attrib["id"] if "id" in def_node.attrib else "<No ID>"
+        if self.verbose:
+            print(self.offset_str("Parsing {}#{}".format(tag, id)))
+
+        self.depth += 1
+
+
+        # creating only a dummy node
+        node = OptimizableSvg.SvgNode(id, None, {},self.settings)
+
+        if self.verbose:
+            self.reportSkippedAttribs(def_node, ["id"])
+
+        skipped_children = []
+        for child in def_node:
+            if OptimizableSvg.remove_namespace(child.tag) == "linearGradient":
+                self.parseGradient(child,node)
+            elif OptimizableSvg.remove_namespace(child.tag) in OptimizableSvg.recognised_shapes:
+                raise NotImplementedError("Definition/instantiation of shapes not supported")
+            elif OptimizableSvg.remove_namespace(child.tag) == "defs":
+                raise NotImplementedError("Definition within definition not supported")
+            elif OptimizableSvg.remove_namespace(child.tag) == "g":
+                raise NotImplementedError("Groups within definition not supported")
+            else:
+                skipped_children.append(child)
+
+            if len(node.children)>0:
+                #take this node out and enter it into defs
+                self.defs[node.children[0].id]=node.children[0]
+                node.children.pop()
+
+
+        if self.verbose:
+            self.reportSkippedChildren(def_node, skipped_children)
+
+        self.depth -= 1
+
+    def parseGradientStop(self,stop):
+        param_dict={key:value for key,value in stop.attrib.items() if key in ["id","offset","stop-color","stop-opacity"]}
+        style_dict={}
+        if "style" in stop.attrib:
+            style_dict={item.split(":")[0]:item.split(":")[1] for item in stop.attrib["style"].split(";") if len(item)>0}
+        param_dict.update(style_dict)
+
+        offset=OptimizableSvg.parseOpacity(param_dict["offset"])
+        color=OptimizableSvg.parse_color(param_dict["stop-color"])
+        opacity=OptimizableSvg.parseOpacity(param_dict["stop-opacity"]) if "stop-opacity" in param_dict else 1.
+
+        return offset, color, opacity
+
+    def parseGradient(self, gradient_node, parent):
+        tag = OptimizableSvg.remove_namespace(gradient_node.tag)
+        id = gradient_node.attrib["id"] if "id" in gradient_node.attrib else "<No ID>"
+        if self.verbose:
+            print(self.offset_str("Parsing {}#{}".format(tag, id)))
+
+        self.depth += 1
+        if "stop" not in [OptimizableSvg.remove_namespace(child.tag) for child in gradient_node]\
+            and "href" not in [OptimizableSvg.remove_namespace(key) for key in gradient_node.attrib.keys()]:
+            raise ValueError("Gradient {} has neither stops nor a href link to them".format(id))
+
+        transform=self.parseTransform(gradient_node)
+        begin=None
+        end = None
+        offsets=[]
+        stops=[]
+        href=None
+
+        if "x1" in gradient_node.attrib or "y1" in gradient_node.attrib:
+            begin=np.array([0.,0.])
+            if "x1" in gradient_node.attrib:
+                begin[0] = float(gradient_node.attrib["x1"])
+            if "y1" in gradient_node.attrib:
+                begin[1] = float(gradient_node.attrib["y1"])
+            begin = torch.tensor(begin.transpose(),dtype=torch.float32)
+
+        if "x2" in gradient_node.attrib or "y2" in gradient_node.attrib:
+            end=np.array([0.,0.])
+            if "x2" in gradient_node.attrib:
+                end[0] = float(gradient_node.attrib["x2"])
+            if "y2" in gradient_node.attrib:
+                end[1] = float(gradient_node.attrib["y2"])
+            end=torch.tensor(end.transpose(),dtype=torch.float32)
+
+        stop_nodes=[node for node in list(gradient_node) if OptimizableSvg.remove_namespace(node.tag)=="stop"]
+        if len(stop_nodes)>0:
+            stop_nodes=sorted(stop_nodes,key=lambda n: float(n.attrib["offset"]))
+
+            for stop in stop_nodes:
+                offset, color, opacity = self.parseGradientStop(stop)
+                offsets.append(offset)
+                stops.append(np.concatenate((color,np.array([opacity]))))
+
+        hkey=next((value for key,value in gradient_node.attrib.items() if OptimizableSvg.remove_namespace(key)=="href"),None)
+        if hkey is not None:
+            href=self.defs[hkey.lstrip("#")]
+
+        parent.children.append(OptimizableSvg.GradientNode(id,transform,self.settings,begin.to(self.device) if begin is not None else begin,end.to(self.device) if end is not None else end,torch.tensor(offsets,dtype=torch.float32,device=self.device) if len(offsets)>0 else None,torch.tensor(np.array(stops),dtype=torch.float32,device=self.device) if len(stops)>0 else None,href))
+
+        self.depth -= 1
+
+    def parseViewport(self, root):
+        if "width" in root.attrib and "height" in root.attrib:
+            self.canvas = np.array([int(math.ceil(float(root.attrib["width"]))), int(math.ceil(float(root.attrib["height"])))])
+        elif "viewBox" in root.attrib:
+            s=root.attrib["viewBox"].split(" ")
+            w=s[2]
+            h=s[3]
+            self.canvas = np.array(
+                [int(math.ceil(float(w))), int(math.ceil(float(h)))])
+        else:
+            raise ValueError("Size information is missing from document definition")
+    #endregion
+
+    #region writing
+    def write_xml(self):
+        tree=self.root.write_xml(self)
+        
+        return minidom.parseString(etree.tostring(tree, 'utf-8')).toprettyxml(indent="  ")
+
+    def write_defs(self,root):
+        if len(self.defs)==0:
+            return
+
+        defnode = etree.SubElement(root, 'defs')
+        stylenode = etree.SubElement(root,'style')
+        stylenode.set('type','text/css')
+        stylenode.text=""
+
+        defcpy=copy.copy(self.defs)
+        while len(defcpy)>0:
+            torem=[]
+            for key,value in defcpy.items():
+                if issubclass(value.__class__,OptimizableSvg.SvgNode):
+                    if value.href is None or value.href not in defcpy:
+                        value.write_xml(defnode)
+                        torem.append(key)
+                    else:
+                        continue
+                else:
+                    #this is a string, and hence a CSS attribute
+                    stylenode.text+=key+" {"+value+"}\n"
+                    torem.append(key)
+
+            for key in torem:
+                del defcpy[key]
+    #endregion
+
+
diff --git a/pydiffvg/parse_svg.py b/pydiffvg/parse_svg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb1f3fc286074f3cd82b37baffbdd00440b72a8a
--- /dev/null
+++ b/pydiffvg/parse_svg.py
@@ -0,0 +1,583 @@
+import torch
+import xml.etree.ElementTree as etree
+import numpy as np
+import diffvg
+import os
+import pydiffvg
+import svgpathtools
+import svgpathtools.parser
+import re
+import warnings
+import cssutils
+import logging
+import matplotlib.colors 
+cssutils.log.setLevel(logging.ERROR)
+
+def remove_namespaces(s):
+    """
+        {...} ... -> ...
+    """
+    return re.sub('{.*}', '', s)
+
+def parse_style(s, defs):
+    style_dict = {}
+    for e in s.split(';'):
+        key_value = e.split(':')
+        if len(key_value) == 2:
+            key = key_value[0].strip()
+            value = key_value[1].strip()
+            if key == 'fill' or key == 'stroke':
+                # Special case: convert colors into tensor in definitions so
+                # that different shapes can share the same color
+                value = parse_color(value, defs)
+            style_dict[key] = value
+    return style_dict
+
+def parse_hex(s):
+    """
+        Hex to tuple
+    """
+    s = s.lstrip('#')
+    if len(s) == 3:
+        s = s[0] + s[0] + s[1] + s[1] + s[2] + s[2]
+    rgb = tuple(int(s[i:i+2], 16) for i in (0, 2, 4))
+    # sRGB to RGB
+    # return torch.pow(torch.tensor([rgb[0] / 255.0, rgb[1] / 255.0, rgb[2] / 255.0]), 2.2)
+    return torch.pow(torch.tensor([rgb[0] / 255.0, rgb[1] / 255.0, rgb[2] / 255.0]), 1.0)
+
+def parse_int(s):
+    """
+        trim alphabets
+    """
+    return int(float(''.join(i for i in s if (not i.isalpha()))))
+
+def parse_color(s, defs):
+    if s is None:
+        return None
+    if isinstance(s, torch.Tensor):
+        return s
+    s = s.lstrip(' ')
+    color = torch.tensor([0.0, 0.0, 0.0, 1.0])
+    if s[0] == '#':
+        color[:3] = parse_hex(s)
+    elif s[:3] == 'url':
+        # url(#id)
+        color = defs[s[4:-1].lstrip('#')]
+    elif s == 'none':
+        color = None
+    elif s[:4] == 'rgb(':
+        rgb = s[4:-1].split(',')
+        color = torch.tensor([int(rgb[0]) / 255.0, int(rgb[1]) / 255.0, int(rgb[2]) / 255.0, 1.0])
+    elif s == 'none':
+        return None
+    else:
+        try : 
+            rgba = matplotlib.colors.to_rgba(s)
+            color = torch.tensor(rgba)
+        except ValueError : 
+            warnings.warn('Unknown color command ' + s)
+    return color
+
+# https://github.com/mathandy/svgpathtools/blob/7ebc56a831357379ff22216bec07e2c12e8c5bc6/svgpathtools/parser.py
+def _parse_transform_substr(transform_substr):
+    type_str, value_str = transform_substr.split('(')
+    value_str = value_str.replace(',', ' ')
+    values = list(map(float, filter(None, value_str.split(' '))))
+
+    transform = np.identity(3)
+    if 'matrix' in type_str:
+        transform[0:2, 0:3] = np.array([values[0:6:2], values[1:6:2]])
+    elif 'translate' in transform_substr:
+        transform[0, 2] = values[0]
+        if len(values) > 1:
+            transform[1, 2] = values[1]
+    elif 'scale' in transform_substr:
+        x_scale = values[0]
+        y_scale = values[1] if (len(values) > 1) else x_scale
+        transform[0, 0] = x_scale
+        transform[1, 1] = y_scale
+    elif 'rotate' in transform_substr:
+        angle = values[0] * np.pi / 180.0
+        if len(values) == 3:
+            offset = values[1:3]
+        else:
+            offset = (0, 0)
+        tf_offset = np.identity(3)
+        tf_offset[0:2, 2:3] = np.array([[offset[0]], [offset[1]]])
+        tf_rotate = np.identity(3)
+        tf_rotate[0:2, 0:2] = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
+        tf_offset_neg = np.identity(3)
+        tf_offset_neg[0:2, 2:3] = np.array([[-offset[0]], [-offset[1]]])
+
+        transform = tf_offset.dot(tf_rotate).dot(tf_offset_neg)
+    elif 'skewX' in transform_substr:
+        transform[0, 1] = np.tan(values[0] * np.pi / 180.0)
+    elif 'skewY' in transform_substr:
+        transform[1, 0] = np.tan(values[0] * np.pi / 180.0)
+    else:
+        # Return an identity matrix if the type of transform is unknown, and warn the user
+        warnings.warn('Unknown SVG transform type: {0}'.format(type_str))
+    return transform
+
+def parse_transform(transform_str):
+    """
+        Converts a valid SVG transformation string into a 3x3 matrix.
+        If the string is empty or null, this returns a 3x3 identity matrix
+    """
+    if not transform_str:
+        return np.identity(3)
+    elif not isinstance(transform_str, str):
+        raise TypeError('Must provide a string to parse')
+
+    total_transform = np.identity(3)
+    transform_substrs = transform_str.split(')')[:-1]  # Skip the last element, because it should be empty
+    for substr in transform_substrs:
+        total_transform = total_transform.dot(_parse_transform_substr(substr))
+
+    return torch.from_numpy(total_transform).type(torch.float32)
+
+def parse_linear_gradient(node, transform, defs):
+    begin = torch.tensor([0.0, 0.0])
+    end = torch.tensor([0.0, 0.0])
+    offsets = []
+    stop_colors = []
+    # Inherit from parent
+    for key in node.attrib:
+        if remove_namespaces(key) == 'href':
+            value = node.attrib[key]
+            parent = defs[value.lstrip('#')]
+            begin = parent.begin
+            end = parent.end
+            offsets = parent.offsets
+            stop_colors = parent.stop_colors
+
+    for attrib in node.attrib:
+        attrib = remove_namespaces(attrib)
+        if attrib == 'x1':
+            begin[0] = float(node.attrib['x1'])
+        elif attrib == 'y1':
+            begin[1] = float(node.attrib['y1'])
+        elif attrib == 'x2':
+            end[0] = float(node.attrib['x2'])
+        elif attrib == 'y2':
+            end[1] = float(node.attrib['y2'])
+        elif attrib == 'gradientTransform':
+            transform = transform @ parse_transform(node.attrib['gradientTransform'])
+
+    begin = transform @ torch.cat((begin, torch.ones([1])))
+    begin = begin / begin[2]
+    begin = begin[:2]
+    end = transform @ torch.cat((end, torch.ones([1])))
+    end = end / end[2]
+    end = end[:2]
+
+    for child in node:
+        tag = remove_namespaces(child.tag)
+        if tag == 'stop':
+            offset = float(child.attrib['offset'])
+            color = [0.0, 0.0, 0.0, 1.0]
+            if 'stop-color' in child.attrib:
+                c = parse_color(child.attrib['stop-color'], defs)
+                color[:3] = [c[0], c[1], c[2]]
+            if 'stop-opacity' in child.attrib:
+                color[3] = float(child.attrib['stop-opacity'])
+            if 'style' in child.attrib:
+                style = parse_style(child.attrib['style'], defs)
+                if 'stop-color' in style:
+                    c = parse_color(style['stop-color'], defs)
+                    color[:3] = [c[0], c[1], c[2]]
+                if 'stop-opacity' in style:
+                    color[3] = float(style['stop-opacity'])
+            offsets.append(offset)
+            stop_colors.append(color)
+    if isinstance(offsets, list):
+        offsets = torch.tensor(offsets)
+    if isinstance(stop_colors, list):
+        stop_colors = torch.tensor(stop_colors)
+
+    return pydiffvg.LinearGradient(begin, end, offsets, stop_colors)
+
+
+def parse_radial_gradient(node, transform, defs):
+    begin = torch.tensor([0.0, 0.0])
+    end = torch.tensor([0.0, 0.0])
+    center = torch.tensor([0.0, 0.0])
+    radius = torch.tensor([0.0, 0.0])
+    offsets = []
+    stop_colors = []
+    # Inherit from parent
+    for key in node.attrib:
+        if remove_namespaces(key) == 'href':
+            value = node.attrib[key]
+            parent = defs[value.lstrip('#')]
+            begin = parent.begin
+            end = parent.end
+            offsets = parent.offsets
+            stop_colors = parent.stop_colors
+
+    for attrib in node.attrib:
+        attrib = remove_namespaces(attrib)
+        if attrib == 'cx':
+            center[0] = float(node.attrib['cx'])
+        elif attrib == 'cy':
+            center[1] = float(node.attrib['cy'])
+        elif attrib == 'fx':
+            radius[0] = float(node.attrib['fx'])
+        elif attrib == 'fy':
+            radius[1] = float(node.attrib['fy'])
+        elif attrib == 'fr':
+            radius[0] = float(node.attrib['fr'])
+            radius[1] = float(node.attrib['fr'])
+        elif attrib == 'gradientTransform':
+            transform = transform @ parse_transform(node.attrib['gradientTransform'])
+
+    # TODO: this is incorrect
+    center = transform @ torch.cat((center, torch.ones([1])))
+    center = center / center[2]
+    center = center[:2]
+
+    for child in node:
+        tag = remove_namespaces(child.tag)
+        if tag == 'stop':
+            offset = float(child.attrib['offset'])
+            color = [0.0, 0.0, 0.0, 1.0]
+            if 'stop-color' in child.attrib:
+                c = parse_color(child.attrib['stop-color'], defs)
+                color[:3] = [c[0], c[1], c[2]]
+            if 'stop-opacity' in child.attrib:
+                color[3] = float(child.attrib['stop-opacity'])
+            if 'style' in child.attrib:
+                style = parse_style(child.attrib['style'], defs)
+                if 'stop-color' in style:
+                    c = parse_color(style['stop-color'], defs)
+                    color[:3] = [c[0], c[1], c[2]]
+                if 'stop-opacity' in style:
+                    color[3] = float(style['stop-opacity'])
+            offsets.append(offset)
+            stop_colors.append(color)
+    if isinstance(offsets, list):
+        offsets = torch.tensor(offsets)
+    if isinstance(stop_colors, list):
+        stop_colors = torch.tensor(stop_colors)
+
+    return pydiffvg.RadialGradient(begin, end, offsets, stop_colors)
+
+def parse_stylesheet(node, transform, defs):
+    # collect CSS classes
+    sheet = cssutils.parseString(node.text)
+    for rule in sheet:
+        if hasattr(rule, 'selectorText') and hasattr(rule, 'style'):
+            name = rule.selectorText
+            if len(name) >= 2 and name[0] == '.':
+                defs[name[1:]] = parse_style(rule.style.getCssText(), defs)
+    return defs
+
+def parse_defs(node, transform, defs):
+    for child in node:
+        tag = remove_namespaces(child.tag)
+        if tag == 'linearGradient':
+            if 'id' in child.attrib:
+                defs[child.attrib['id']] = parse_linear_gradient(child, transform, defs)
+        elif tag == 'radialGradient':
+            if 'id' in child.attrib:
+                defs[child.attrib['id']] = parse_radial_gradient(child, transform, defs)
+        elif tag == 'style':
+            defs = parse_stylesheet(child, transform, defs)
+    return defs
+
+def parse_common_attrib(node, transform, fill_color, defs):
+    attribs = {}
+    if 'class' in node.attrib:
+        attribs.update(defs[node.attrib['class']])
+    attribs.update(node.attrib)
+
+    name = ''
+    if 'id' in node.attrib:
+        name = node.attrib['id']
+
+    stroke_color = None
+    stroke_width = torch.tensor(0.5)
+    use_even_odd_rule = False
+
+    new_transform = transform
+    if 'transform' in attribs:
+        new_transform = transform @ parse_transform(attribs['transform'])
+    if 'fill' in attribs:
+        fill_color = parse_color(attribs['fill'], defs)
+    fill_opacity = 1.0
+    if 'fill-opacity' in attribs:
+        fill_opacity *= float(attribs['fill-opacity'])
+    if 'opacity' in attribs:
+        fill_opacity *= float(attribs['opacity'])
+    # Ignore opacity if the color is a gradient
+    if isinstance(fill_color, torch.Tensor):
+        fill_color[3] = fill_opacity
+
+    if 'fill-rule' in attribs:
+        if attribs['fill-rule'] == "evenodd":
+            use_even_odd_rule = True
+        elif attribs['fill-rule'] == "nonzero":
+            use_even_odd_rule = False
+        else:
+            warnings.warn('Unknown fill-rule: {}'.format(attribs['fill-rule']))
+
+    if 'stroke' in attribs:
+        stroke_color = parse_color(attribs['stroke'], defs)
+
+    if 'stroke-width' in attribs:
+        stroke_width = attribs['stroke-width']
+        if stroke_width[-2:] == 'px':
+            stroke_width = stroke_width[:-2]
+        stroke_width = torch.tensor(float(stroke_width) / 2.0)
+
+    if 'style' in attribs:
+        style = parse_style(attribs['style'], defs)
+        if 'fill' in style:
+            fill_color = parse_color(style['fill'], defs)
+        fill_opacity = 1.0
+        if 'fill-opacity' in style:
+            fill_opacity *= float(style['fill-opacity'])
+        if 'opacity' in style:
+            fill_opacity *= float(style['opacity'])
+        if 'fill-rule' in style:
+            if style['fill-rule'] == "evenodd":
+                use_even_odd_rule = True
+            elif style['fill-rule'] == "nonzero":
+                use_even_odd_rule = False
+            else:
+                warnings.warn('Unknown fill-rule: {}'.format(style['fill-rule']))
+        # Ignore opacity if the color is a gradient
+        if isinstance(fill_color, torch.Tensor):
+            fill_color[3] = fill_opacity
+        if 'stroke' in style:
+            if style['stroke'] != 'none':
+                stroke_color = parse_color(style['stroke'], defs)
+                # Ignore opacity if the color is a gradient
+                if isinstance(stroke_color, torch.Tensor):
+                    if 'stroke-opacity' in style:
+                        stroke_color[3] = float(style['stroke-opacity'])
+                    if 'opacity' in style:
+                        stroke_color[3] *= float(style['opacity'])
+                if 'stroke-width' in style:
+                    stroke_width = style['stroke-width']
+                    if stroke_width[-2:] == 'px':
+                        stroke_width = stroke_width[:-2]
+                    stroke_width = torch.tensor(float(stroke_width) / 2.0)
+
+        if isinstance(fill_color, pydiffvg.LinearGradient):
+            fill_color.begin = new_transform @ torch.cat((fill_color.begin, torch.ones([1])))
+            fill_color.begin = fill_color.begin / fill_color.begin[2]
+            fill_color.begin = fill_color.begin[:2]
+            fill_color.end = new_transform @ torch.cat((fill_color.end, torch.ones([1])))
+            fill_color.end = fill_color.end / fill_color.end[2]
+            fill_color.end = fill_color.end[:2]
+        if isinstance(stroke_color, pydiffvg.LinearGradient):
+            stroke_color.begin = new_transform @ torch.cat((stroke_color.begin, torch.ones([1])))
+            stroke_color.begin = stroke_color.begin / stroke_color.begin[2]
+            stroke_color.begin = stroke_color.begin[:2]
+            stroke_color.end = new_transform @ torch.cat((stroke_color.end, torch.ones([1])))
+            stroke_color.end = stroke_color.end / stroke_color.end[2]
+            stroke_color.end = stroke_color.end[:2]
+        if 'filter' in style:
+            print('*** WARNING ***: Ignoring filter for path with id "{}"'.format(name))
+
+    return new_transform, fill_color, stroke_color, stroke_width, use_even_odd_rule
+
+def is_shape(tag):
+    return tag == 'path' or tag == 'polygon' or tag == 'line' or tag == 'circle' or tag == 'rect'
+
+def parse_shape(node, transform, fill_color, shapes, shape_groups, defs):
+    tag = remove_namespaces(node.tag)
+    new_transform, new_fill_color, stroke_color, stroke_width, use_even_odd_rule = \
+        parse_common_attrib(node, transform, fill_color, defs)
+    if tag == 'path':
+        d = node.attrib['d']
+        name = ''
+        if 'id' in node.attrib:
+            name = node.attrib['id']
+        force_closing = new_fill_color is not None
+        paths = pydiffvg.from_svg_path(d, new_transform, force_closing)
+        for idx, path in enumerate(paths):
+            assert(path.points.shape[1] == 2)
+            path.stroke_width = stroke_width
+            path.source_id = name
+            path.id = "{}-{}".format(name,idx) if len(paths)>1 else name
+        prev_shapes_size = len(shapes)
+        shapes = shapes + paths
+        shape_ids = torch.tensor(list(range(prev_shapes_size, len(shapes))))
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            id = name))
+    elif tag == 'polygon':
+        name = ''
+        if 'id' in node.attrib:
+            name = node.attrib['id']
+        force_closing = new_fill_color is not None
+        pts = node.attrib['points'].strip()
+        pts = pts.split(' ')
+        # import ipdb; ipdb.set_trace()
+        pts = [[float(y) for y in re.split(',| ', x)] for x in pts if x]
+        pts = torch.tensor(pts, dtype=torch.float32).view(-1, 2)
+        polygon = pydiffvg.Polygon(pts, force_closing)
+        polygon.stroke_width = stroke_width
+        shape_ids = torch.tensor([len(shapes)])
+        shapes.append(polygon)
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            shape_to_canvas = new_transform,
+            id = name))
+    elif tag == 'line':
+        x1 = float(node.attrib['x1'])
+        y1 = float(node.attrib['y1'])
+        x2 = float(node.attrib['x2'])
+        y2 = float(node.attrib['y2'])
+        p1 = torch.tensor([x1, y1])
+        p2 = torch.tensor([x2, y2])
+        points = torch.stack((p1, p2))
+        line = pydiffvg.Polygon(points, False)
+        line.stroke_width = stroke_width
+        shape_ids = torch.tensor([len(shapes)])
+        shapes.append(line)
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            shape_to_canvas = new_transform))
+    elif tag == 'circle':
+        radius = float(node.attrib['r'])
+        cx = float(node.attrib['cx'])
+        cy = float(node.attrib['cy'])
+        name = ''
+        if 'id' in node.attrib:
+            name = node.attrib['id']
+        center = torch.tensor([cx, cy])
+        circle = pydiffvg.Circle(radius = torch.tensor(radius),
+                                 center = center)
+        circle.stroke_width = stroke_width
+        shape_ids = torch.tensor([len(shapes)])
+        shapes.append(circle)
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            shape_to_canvas = new_transform))
+    elif tag == 'ellipse':
+        rx = float(node.attrib['rx'])
+        ry = float(node.attrib['ry'])
+        cx = float(node.attrib['cx'])
+        cy = float(node.attrib['cy'])
+        name = ''
+        if 'id' in node.attrib:
+            name = node.attrib['id']
+        center = torch.tensor([cx, cy])
+        circle = pydiffvg.Circle(radius = torch.tensor(radius),
+                                 center = center)
+        circle.stroke_width = stroke_width
+        shape_ids = torch.tensor([len(shapes)])
+        shapes.append(circle)
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            shape_to_canvas = new_transform))
+    elif tag == 'rect':
+        x = 0.0
+        y = 0.0
+        if x in node.attrib:
+            x = float(node.attrib['x'])
+        if y in node.attrib:
+            y = float(node.attrib['y'])
+        w = float(node.attrib['width'])
+        h = float(node.attrib['height'])
+        p_min = torch.tensor([x, y])
+        p_max = torch.tensor([x + w, x + h])
+        rect = pydiffvg.Rect(p_min = p_min, p_max = p_max)
+        rect.stroke_width = stroke_width
+        shape_ids = torch.tensor([len(shapes)])
+        shapes.append(rect)
+        shape_groups.append(pydiffvg.ShapeGroup(\
+            shape_ids = shape_ids,
+            fill_color = new_fill_color,
+            stroke_color = stroke_color,
+            use_even_odd_rule = use_even_odd_rule,
+            shape_to_canvas = new_transform))
+    return shapes, shape_groups
+
+def parse_group(node, transform, fill_color, shapes, shape_groups, defs):
+    if 'transform' in node.attrib:
+        transform = transform @ parse_transform(node.attrib['transform'])
+    if 'fill' in node.attrib:
+        fill_color = parse_color(node.attrib['fill'], defs)
+    for child in node:
+        tag = remove_namespaces(child.tag)
+        if is_shape(tag):
+            shapes, shape_groups = parse_shape(\
+                child, transform, fill_color, shapes, shape_groups, defs)
+        elif tag == 'g':
+            shapes, shape_groups = parse_group(\
+                child, transform, fill_color, shapes, shape_groups, defs)
+    return shapes, shape_groups
+
+def parse_scene(node):
+    canvas_width = -1
+    canvas_height = -1
+    defs = {}
+    shapes = []
+    shape_groups = []
+    fill_color = torch.tensor([0.0, 0.0, 0.0, 1.0])
+    transform = torch.eye(3)
+    if 'viewBox' in node.attrib:
+        view_box_array = node.attrib['viewBox'].split()
+        canvas_width = parse_int(view_box_array[2])
+        canvas_height = parse_int(view_box_array[3])
+    else:
+        if 'width' in node.attrib:
+            canvas_width = parse_int(node.attrib['width'])
+        else:
+            print('Warning: Can\'t find canvas width.')
+        if 'height' in node.attrib:
+            canvas_height = parse_int(node.attrib['height'])
+        else:
+            print('Warning: Can\'t find canvas height.')
+    for child in node:
+        tag = remove_namespaces(child.tag)
+        if tag == 'defs':
+            defs = parse_defs(child, transform, defs)
+        elif tag == 'style':
+            defs = parse_stylesheet(child, transform, defs)
+        elif tag == 'linearGradient':
+            if 'id' in child.attrib:
+                defs[child.attrib['id']] = parse_linear_gradient(child, transform, defs)
+        elif tag == 'radialGradient':
+            if 'id' in child.attrib:
+                defs[child.attrib['id']] = parse_radial_gradient(child, transform, defs)
+        elif is_shape(tag):
+            shapes, shape_groups = parse_shape(\
+                child, transform, fill_color, shapes, shape_groups, defs)
+        elif tag == 'g':
+            shapes, shape_groups = parse_group(\
+                child, transform, fill_color, shapes, shape_groups, defs)
+    return canvas_width, canvas_height, shapes, shape_groups
+
+def svg_to_scene(filename):
+    """
+        Load from a SVG file and convert to PyTorch tensors.
+    """
+
+    tree = etree.parse(filename)
+    root = tree.getroot()
+    cwd = os.getcwd()
+    if (os.path.dirname(filename) != ''):
+        os.chdir(os.path.dirname(filename))
+    ret = parse_scene(root)
+    os.chdir(cwd)
+    return ret
diff --git a/pydiffvg/pixel_filter.py b/pydiffvg/pixel_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ff22507613e01a0fb9ac9701d1c49c68266e8
--- /dev/null
+++ b/pydiffvg/pixel_filter.py
@@ -0,0 +1,9 @@
+import torch
+import pydiffvg
+
+class PixelFilter:
+    def __init__(self,
+                 type,
+                 radius = torch.tensor(0.5)):
+        self.type = type
+        self.radius = radius
diff --git a/pydiffvg/render_pytorch.py b/pydiffvg/render_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b776ce67a0cdb587c8bd558fe5060a6d96e51e3c
--- /dev/null
+++ b/pydiffvg/render_pytorch.py
@@ -0,0 +1,870 @@
+import torch
+import diffvg
+import pydiffvg
+import time
+from enum import IntEnum
+import warnings
+
+print_timing = False
+
+def set_print_timing(val):
+    global print_timing
+    print_timing=val
+
+class OutputType(IntEnum):
+    color = 1
+    sdf = 2
+
+class RenderFunction(torch.autograd.Function):
+    """
+        The PyTorch interface of diffvg.
+    """
+    @staticmethod
+    def serialize_scene(canvas_width,
+                        canvas_height,
+                        shapes,
+                        shape_groups,
+                        filter = pydiffvg.PixelFilter(type = diffvg.FilterType.box,
+                                                      radius = torch.tensor(0.5)),
+                        output_type = OutputType.color,
+                        use_prefiltering = False,
+                        eval_positions = torch.tensor([])):
+        """
+            Given a list of shapes, convert them to a linear list of argument,
+            so that we can use it in PyTorch.
+        """
+        num_shapes = len(shapes)
+        num_shape_groups = len(shape_groups)
+        args = []
+        args.append(canvas_width)
+        args.append(canvas_height)
+        args.append(num_shapes)
+        args.append(num_shape_groups)
+        args.append(output_type)
+        args.append(use_prefiltering)
+        args.append(eval_positions.to(pydiffvg.get_device()))
+        for shape in shapes:
+            use_thickness = False
+            if isinstance(shape, pydiffvg.Circle):
+                assert(shape.center.is_contiguous())
+                args.append(diffvg.ShapeType.circle)
+                args.append(shape.radius.cpu())
+                args.append(shape.center.cpu())
+            elif isinstance(shape, pydiffvg.Ellipse):
+                assert(shape.radius.is_contiguous())
+                assert(shape.center.is_contiguous())
+                args.append(diffvg.ShapeType.ellipse)
+                args.append(shape.radius.cpu())
+                args.append(shape.center.cpu())
+            elif isinstance(shape, pydiffvg.Path):
+                assert(shape.num_control_points.is_contiguous())
+                assert(shape.points.is_contiguous())
+                assert(shape.points.shape[1] == 2)
+                assert(torch.isfinite(shape.points).all())
+                args.append(diffvg.ShapeType.path)
+                args.append(shape.num_control_points.to(torch.int32).cpu())
+                args.append(shape.points.cpu())
+                if len(shape.stroke_width.shape) > 0 and shape.stroke_width.shape[0] > 1:
+                    assert(torch.isfinite(shape.stroke_width).all())
+                    use_thickness = True
+                    args.append(shape.stroke_width.cpu())
+                else:
+                    args.append(None)
+                args.append(shape.is_closed)
+                args.append(shape.use_distance_approx)
+            elif isinstance(shape, pydiffvg.Polygon):
+                assert(shape.points.is_contiguous())
+                assert(shape.points.shape[1] == 2)
+                args.append(diffvg.ShapeType.path)
+                if shape.is_closed:
+                    args.append(torch.zeros(shape.points.shape[0], dtype = torch.int32))
+                else:
+                    args.append(torch.zeros(shape.points.shape[0] - 1, dtype = torch.int32))
+                args.append(shape.points.cpu())
+                args.append(None)  
+                args.append(shape.is_closed)
+                args.append(False) # use_distance_approx
+            elif isinstance(shape, pydiffvg.Rect):
+                assert(shape.p_min.is_contiguous())
+                assert(shape.p_max.is_contiguous())
+                args.append(diffvg.ShapeType.rect)
+                args.append(shape.p_min.cpu())
+                args.append(shape.p_max.cpu())
+            else:
+                assert(False)
+            if use_thickness:
+                args.append(torch.tensor(0.0))
+            else:
+                args.append(shape.stroke_width.cpu())
+
+        for shape_group in shape_groups:
+            assert(shape_group.shape_ids.is_contiguous())
+            args.append(shape_group.shape_ids.to(torch.int32).cpu())
+            # Fill color
+            if shape_group.fill_color is None:
+                args.append(None)
+            elif isinstance(shape_group.fill_color, torch.Tensor):
+                assert(shape_group.fill_color.is_contiguous())
+                args.append(diffvg.ColorType.constant)
+                args.append(shape_group.fill_color.cpu())
+            elif isinstance(shape_group.fill_color, pydiffvg.LinearGradient):
+                assert(shape_group.fill_color.begin.is_contiguous())
+                assert(shape_group.fill_color.end.is_contiguous())
+                assert(shape_group.fill_color.offsets.is_contiguous())
+                assert(shape_group.fill_color.stop_colors.is_contiguous())
+                args.append(diffvg.ColorType.linear_gradient)
+                args.append(shape_group.fill_color.begin.cpu())
+                args.append(shape_group.fill_color.end.cpu())
+                args.append(shape_group.fill_color.offsets.cpu())
+                args.append(shape_group.fill_color.stop_colors.cpu())
+            elif isinstance(shape_group.fill_color, pydiffvg.RadialGradient):
+                assert(shape_group.fill_color.center.is_contiguous())
+                assert(shape_group.fill_color.radius.is_contiguous())
+                assert(shape_group.fill_color.offsets.is_contiguous())
+                assert(shape_group.fill_color.stop_colors.is_contiguous())
+                args.append(diffvg.ColorType.radial_gradient)
+                args.append(shape_group.fill_color.center.cpu())
+                args.append(shape_group.fill_color.radius.cpu())
+                args.append(shape_group.fill_color.offsets.cpu())
+                args.append(shape_group.fill_color.stop_colors.cpu())
+
+            if shape_group.fill_color is not None:
+                # go through the underlying shapes and check if they are all closed
+                for shape_id in shape_group.shape_ids:
+                    if isinstance(shapes[shape_id], pydiffvg.Path):
+                        if not shapes[shape_id].is_closed:
+                            warnings.warn("Detected non-closed paths with fill color. This might causes unexpected results.", Warning)
+
+            # Stroke color
+            if shape_group.stroke_color is None:
+                args.append(None)
+            elif isinstance(shape_group.stroke_color, torch.Tensor):
+                assert(shape_group.stroke_color.is_contiguous())
+                args.append(diffvg.ColorType.constant)
+                args.append(shape_group.stroke_color.cpu())
+            elif isinstance(shape_group.stroke_color, pydiffvg.LinearGradient):
+                assert(shape_group.stroke_color.begin.is_contiguous())
+                assert(shape_group.stroke_color.end.is_contiguous())
+                assert(shape_group.stroke_color.offsets.is_contiguous())
+                assert(shape_group.stroke_color.stop_colors.is_contiguous())
+                assert(torch.isfinite(shape_group.stroke_color.stop_colors).all())
+                args.append(diffvg.ColorType.linear_gradient)
+                args.append(shape_group.stroke_color.begin.cpu())
+                args.append(shape_group.stroke_color.end.cpu())
+                args.append(shape_group.stroke_color.offsets.cpu())
+                args.append(shape_group.stroke_color.stop_colors.cpu())
+            elif isinstance(shape_group.stroke_color, pydiffvg.RadialGradient):
+                assert(shape_group.stroke_color.center.is_contiguous())
+                assert(shape_group.stroke_color.radius.is_contiguous())
+                assert(shape_group.stroke_color.offsets.is_contiguous())
+                assert(shape_group.stroke_color.stop_colors.is_contiguous())
+                assert(torch.isfinite(shape_group.stroke_color.stop_colors).all())
+                args.append(diffvg.ColorType.radial_gradient)
+                args.append(shape_group.stroke_color.center.cpu())
+                args.append(shape_group.stroke_color.radius.cpu())
+                args.append(shape_group.stroke_color.offsets.cpu())
+                args.append(shape_group.stroke_color.stop_colors.cpu())
+            args.append(shape_group.use_even_odd_rule)
+            # Transformation
+            args.append(shape_group.shape_to_canvas.contiguous().cpu())
+        args.append(filter.type)
+        args.append(filter.radius.cpu())
+        return args
+
+    @staticmethod
+    def forward(ctx,
+                width,
+                height,
+                num_samples_x,
+                num_samples_y,
+                seed,
+                background_image,
+                *args):
+        """
+            Forward rendering pass.
+        """
+        # Unpack arguments
+        current_index = 0
+        canvas_width = args[current_index]
+        current_index += 1
+        canvas_height = args[current_index]
+        current_index += 1
+        num_shapes = args[current_index]
+        current_index += 1
+        num_shape_groups = args[current_index]
+        current_index += 1
+        output_type = args[current_index]
+        current_index += 1
+        use_prefiltering = args[current_index]
+        current_index += 1
+        eval_positions = args[current_index]
+        current_index += 1
+        shapes = []
+        shape_groups = []
+        shape_contents = [] # Important to avoid GC deleting the shapes
+        color_contents = [] # Same as above
+        for shape_id in range(num_shapes):
+            shape_type = args[current_index]
+            current_index += 1
+            if shape_type == diffvg.ShapeType.circle:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Circle(radius, diffvg.Vector2f(center[0], center[1]))
+            elif shape_type == diffvg.ShapeType.ellipse:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Ellipse(diffvg.Vector2f(radius[0], radius[1]),
+                                       diffvg.Vector2f(center[0], center[1]))
+            elif shape_type == diffvg.ShapeType.path:
+                num_control_points = args[current_index]
+                current_index += 1
+                points = args[current_index]
+                current_index += 1
+                thickness = args[current_index]
+                current_index += 1
+                is_closed = args[current_index]
+                current_index += 1
+                use_distance_approx = args[current_index]
+                current_index += 1
+                shape = diffvg.Path(diffvg.int_ptr(num_control_points.data_ptr()),
+                                    diffvg.float_ptr(points.data_ptr()),
+                                    diffvg.float_ptr(thickness.data_ptr() if thickness is not None else 0),
+                                    num_control_points.shape[0],
+                                    points.shape[0],
+                                    is_closed,
+                                    use_distance_approx)
+            elif shape_type == diffvg.ShapeType.rect:
+                p_min = args[current_index]
+                current_index += 1
+                p_max = args[current_index]
+                current_index += 1
+                shape = diffvg.Rect(diffvg.Vector2f(p_min[0], p_min[1]),
+                                    diffvg.Vector2f(p_max[0], p_max[1]))
+            else:
+                assert(False)
+            stroke_width = args[current_index]
+            current_index += 1
+            shapes.append(diffvg.Shape(\
+                shape_type, shape.get_ptr(), stroke_width.item()))
+            shape_contents.append(shape)
+
+        for shape_group_id in range(num_shape_groups):
+            shape_ids = args[current_index]
+            current_index += 1
+            fill_color_type = args[current_index]
+            current_index += 1
+            if fill_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                fill_color = diffvg.Constant(\
+                    diffvg.Vector4f(color[0], color[1], color[2], color[3]))
+            elif fill_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.LinearGradient(diffvg.Vector2f(beg[0], beg[1]),
+                                                   diffvg.Vector2f(end[0], end[1]),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(offsets.data_ptr()),
+                                                   diffvg.float_ptr(stop_colors.data_ptr()))
+            elif fill_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.RadialGradient(diffvg.Vector2f(center[0], center[1]),
+                                                   diffvg.Vector2f(radius[0], radius[1]),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(offsets.data_ptr()),
+                                                   diffvg.float_ptr(stop_colors.data_ptr()))
+            elif fill_color_type is None:
+                fill_color = None
+            else:
+                assert(False)
+            stroke_color_type = args[current_index]
+            current_index += 1
+            if stroke_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                stroke_color = diffvg.Constant(\
+                    diffvg.Vector4f(color[0], color[1], color[2], color[3]))
+            elif stroke_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.LinearGradient(diffvg.Vector2f(beg[0], beg[1]),
+                                                     diffvg.Vector2f(end[0], end[1]),
+                                                     offsets.shape[0],
+                                                     diffvg.float_ptr(offsets.data_ptr()),
+                                                     diffvg.float_ptr(stop_colors.data_ptr()))
+            elif stroke_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.RadialGradient(diffvg.Vector2f(center[0], center[1]),
+                                                     diffvg.Vector2f(radius[0], radius[1]),
+                                                     offsets.shape[0],
+                                                     diffvg.float_ptr(offsets.data_ptr()),
+                                                     diffvg.float_ptr(stop_colors.data_ptr()))
+            elif stroke_color_type is None:
+                stroke_color = None
+            else:
+                assert(False)
+            use_even_odd_rule = args[current_index]
+            current_index += 1
+            shape_to_canvas = args[current_index]
+            current_index += 1
+
+            if fill_color is not None:
+                color_contents.append(fill_color)
+            if stroke_color is not None:
+                color_contents.append(stroke_color)
+            shape_groups.append(diffvg.ShapeGroup(\
+                diffvg.int_ptr(shape_ids.data_ptr()),
+                shape_ids.shape[0],
+                diffvg.ColorType.constant if fill_color_type is None else fill_color_type,
+                diffvg.void_ptr(0) if fill_color is None else fill_color.get_ptr(),
+                diffvg.ColorType.constant if stroke_color_type is None else stroke_color_type,
+                diffvg.void_ptr(0) if stroke_color is None else stroke_color.get_ptr(),
+                use_even_odd_rule,
+                diffvg.float_ptr(shape_to_canvas.data_ptr())))
+
+        filter_type = args[current_index]
+        current_index += 1
+        filter_radius = args[current_index]
+        current_index += 1
+        filt = diffvg.Filter(filter_type, filter_radius)
+
+        start = time.time()
+        scene = diffvg.Scene(canvas_width, canvas_height,
+            shapes, shape_groups, filt, pydiffvg.get_use_gpu(),
+            pydiffvg.get_device().index if pydiffvg.get_device().index is not None else -1)
+        time_elapsed = time.time() - start
+        global print_timing
+        if print_timing:
+            print('Scene construction, time: %.5f s' % time_elapsed)
+
+        if output_type == OutputType.color:
+            assert(eval_positions.shape[0] == 0)
+            rendered_image = torch.zeros(height, width, 4, device = pydiffvg.get_device())
+        else:
+            assert(output_type == OutputType.sdf)          
+            if eval_positions.shape[0] == 0:
+                rendered_image = torch.zeros(height, width, 1, device = pydiffvg.get_device())
+            else:
+                rendered_image = torch.zeros(eval_positions.shape[0], 1, device = pydiffvg.get_device())
+
+        if background_image is not None:
+            background_image = background_image.to(pydiffvg.get_device())
+            if background_image.shape[2] == 3:
+                background_image = torch.cat((\
+                    background_image, torch.ones(background_image.shape[0], background_image.shape[1], 1,
+                        device = background_image.device)), dim = 2)
+            background_image = background_image.contiguous()
+            assert(background_image.shape[0] == rendered_image.shape[0])
+            assert(background_image.shape[1] == rendered_image.shape[1])
+            assert(background_image.shape[2] == 4)
+
+        start = time.time()
+        diffvg.render(scene,
+                      diffvg.float_ptr(background_image.data_ptr() if background_image is not None else 0),
+                      diffvg.float_ptr(rendered_image.data_ptr() if output_type == OutputType.color else 0),
+                      diffvg.float_ptr(rendered_image.data_ptr() if output_type == OutputType.sdf else 0),
+                      width,
+                      height,
+                      num_samples_x,
+                      num_samples_y,
+                      seed,
+                      diffvg.float_ptr(0), # d_background_image
+                      diffvg.float_ptr(0), # d_render_image
+                      diffvg.float_ptr(0), # d_render_sdf
+                      diffvg.float_ptr(0), # d_translation
+                      use_prefiltering,
+                      diffvg.float_ptr(eval_positions.data_ptr()),
+                      eval_positions.shape[0])
+        assert(torch.isfinite(rendered_image).all())
+        time_elapsed = time.time() - start
+        if print_timing:
+            print('Forward pass, time: %.5f s' % time_elapsed)
+
+        ctx.scene = scene
+        ctx.background_image = background_image
+        ctx.shape_contents = shape_contents
+        ctx.color_contents = color_contents
+        ctx.filter = filt
+        ctx.width = width
+        ctx.height = height
+        ctx.num_samples_x = num_samples_x
+        ctx.num_samples_y = num_samples_y
+        ctx.seed = seed
+        ctx.output_type = output_type
+        ctx.use_prefiltering = use_prefiltering
+        ctx.eval_positions = eval_positions
+        return rendered_image
+
+    @staticmethod
+    def render_grad(grad_img,
+                    width,
+                    height,
+                    num_samples_x,
+                    num_samples_y,
+                    seed,
+                    background_image,
+                    *args):
+        if not grad_img.is_contiguous():
+            grad_img = grad_img.contiguous()
+        assert(torch.isfinite(grad_img).all())
+
+        # Unpack arguments
+        current_index = 0
+        canvas_width = args[current_index]
+        current_index += 1
+        canvas_height = args[current_index]
+        current_index += 1
+        num_shapes = args[current_index]
+        current_index += 1
+        num_shape_groups = args[current_index]
+        current_index += 1
+        output_type = args[current_index]
+        current_index += 1
+        use_prefiltering = args[current_index]
+        current_index += 1
+        eval_positions = args[current_index]
+        current_index += 1        
+        shapes = []
+        shape_groups = []
+        shape_contents = [] # Important to avoid GC deleting the shapes
+        color_contents = [] # Same as above
+        for shape_id in range(num_shapes):
+            shape_type = args[current_index]
+            current_index += 1
+            if shape_type == diffvg.ShapeType.circle:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Circle(radius, diffvg.Vector2f(center[0], center[1]))
+            elif shape_type == diffvg.ShapeType.ellipse:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Ellipse(diffvg.Vector2f(radius[0], radius[1]),
+                                       diffvg.Vector2f(center[0], center[1]))
+            elif shape_type == diffvg.ShapeType.path:
+                num_control_points = args[current_index]
+                current_index += 1
+                points = args[current_index]
+                current_index += 1
+                thickness = args[current_index]
+                current_index += 1
+                is_closed = args[current_index]
+                current_index += 1
+                use_distance_approx = args[current_index]
+                current_index += 1
+                shape = diffvg.Path(diffvg.int_ptr(num_control_points.data_ptr()),
+                                    diffvg.float_ptr(points.data_ptr()),
+                                    diffvg.float_ptr(thickness.data_ptr() if thickness is not None else 0),
+                                    num_control_points.shape[0],
+                                    points.shape[0],
+                                    is_closed,
+                                    use_distance_approx)
+            elif shape_type == diffvg.ShapeType.rect:
+                p_min = args[current_index]
+                current_index += 1
+                p_max = args[current_index]
+                current_index += 1
+                shape = diffvg.Rect(diffvg.Vector2f(p_min[0], p_min[1]),
+                                    diffvg.Vector2f(p_max[0], p_max[1]))
+            else:
+                assert(False)
+            stroke_width = args[current_index]
+            current_index += 1
+            shapes.append(diffvg.Shape(\
+                shape_type, shape.get_ptr(), stroke_width.item()))
+            shape_contents.append(shape)
+
+        for shape_group_id in range(num_shape_groups):
+            shape_ids = args[current_index]
+            current_index += 1
+            fill_color_type = args[current_index]
+            current_index += 1
+            if fill_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                fill_color = diffvg.Constant(\
+                    diffvg.Vector4f(color[0], color[1], color[2], color[3]))
+            elif fill_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.LinearGradient(diffvg.Vector2f(beg[0], beg[1]),
+                                                   diffvg.Vector2f(end[0], end[1]),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(offsets.data_ptr()),
+                                                   diffvg.float_ptr(stop_colors.data_ptr()))
+            elif fill_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.RadialGradient(diffvg.Vector2f(center[0], center[1]),
+                                                   diffvg.Vector2f(radius[0], radius[1]),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(offsets.data_ptr()),
+                                                   diffvg.float_ptr(stop_colors.data_ptr()))
+            elif fill_color_type is None:
+                fill_color = None
+            else:
+                assert(False)
+            stroke_color_type = args[current_index]
+            current_index += 1
+            if stroke_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                stroke_color = diffvg.Constant(\
+                    diffvg.Vector4f(color[0], color[1], color[2], color[3]))
+            elif stroke_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.LinearGradient(diffvg.Vector2f(beg[0], beg[1]),
+                                                     diffvg.Vector2f(end[0], end[1]),
+                                                     offsets.shape[0],
+                                                     diffvg.float_ptr(offsets.data_ptr()),
+                                                     diffvg.float_ptr(stop_colors.data_ptr()))
+            elif stroke_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.RadialGradient(diffvg.Vector2f(center[0], center[1]),
+                                                     diffvg.Vector2f(radius[0], radius[1]),
+                                                     offsets.shape[0],
+                                                     diffvg.float_ptr(offsets.data_ptr()),
+                                                     diffvg.float_ptr(stop_colors.data_ptr()))
+            elif stroke_color_type is None:
+                stroke_color = None
+            else:
+                assert(False)
+            use_even_odd_rule = args[current_index]
+            current_index += 1
+            shape_to_canvas = args[current_index]
+            current_index += 1
+
+            if fill_color is not None:
+                color_contents.append(fill_color)
+            if stroke_color is not None:
+                color_contents.append(stroke_color)
+            shape_groups.append(diffvg.ShapeGroup(\
+                diffvg.int_ptr(shape_ids.data_ptr()),
+                shape_ids.shape[0],
+                diffvg.ColorType.constant if fill_color_type is None else fill_color_type,
+                diffvg.void_ptr(0) if fill_color is None else fill_color.get_ptr(),
+                diffvg.ColorType.constant if stroke_color_type is None else stroke_color_type,
+                diffvg.void_ptr(0) if stroke_color is None else stroke_color.get_ptr(),
+                use_even_odd_rule,
+                diffvg.float_ptr(shape_to_canvas.data_ptr())))
+
+        filter_type = args[current_index]
+        current_index += 1
+        filter_radius = args[current_index]
+        current_index += 1
+        filt = diffvg.Filter(filter_type, filter_radius)
+
+        scene = diffvg.Scene(canvas_width, canvas_height,
+            shapes, shape_groups, filt, pydiffvg.get_use_gpu(),
+            pydiffvg.get_device().index if pydiffvg.get_device().index is not None else -1)
+
+        if output_type == OutputType.color:
+            assert(grad_img.shape[2] == 4)
+        else:
+            assert(grad_img.shape[2] == 1)
+
+        if background_image is not None:
+            background_image = background_image.to(pydiffvg.get_device())
+            if background_image.shape[2] == 3:
+                background_image = torch.cat((\
+                    background_image, torch.ones(background_image.shape[0], background_image.shape[1], 1,
+                        device = background_image.device)), dim = 2)
+            background_image = background_image.contiguous()
+            assert(background_image.shape[0] == rendered_image.shape[0])
+            assert(background_image.shape[1] == rendered_image.shape[1])
+            assert(background_image.shape[2] == 4)
+
+        translation_grad_image = \
+            torch.zeros(height, width, 2, device = pydiffvg.get_device())
+        start = time.time()
+        diffvg.render(scene,
+                      diffvg.float_ptr(background_image.data_ptr() if background_image is not None else 0),
+                      diffvg.float_ptr(0), # render_image
+                      diffvg.float_ptr(0), # render_sdf
+                      width,
+                      height,
+                      num_samples_x,
+                      num_samples_y,
+                      seed,
+                      diffvg.float_ptr(0), # d_background_image
+                      diffvg.float_ptr(grad_img.data_ptr() if output_type == OutputType.color else 0),
+                      diffvg.float_ptr(grad_img.data_ptr() if output_type == OutputType.sdf else 0),
+                      diffvg.float_ptr(translation_grad_image.data_ptr()),
+                      use_prefiltering,
+                      diffvg.float_ptr(eval_positions.data_ptr()),
+                      eval_positions.shape[0])
+        time_elapsed = time.time() - start
+        if print_timing:
+            print('Gradient pass, time: %.5f s' % time_elapsed)
+        assert(torch.isfinite(translation_grad_image).all())
+
+        return translation_grad_image
+
+    @staticmethod
+    def backward(ctx,
+                 grad_img):
+        if not grad_img.is_contiguous():
+            grad_img = grad_img.contiguous()
+        assert(torch.isfinite(grad_img).all())
+
+        scene = ctx.scene
+        width = ctx.width
+        height = ctx.height
+        num_samples_x = ctx.num_samples_x
+        num_samples_y = ctx.num_samples_y
+        seed = ctx.seed
+        output_type = ctx.output_type
+        use_prefiltering = ctx.use_prefiltering
+        eval_positions = ctx.eval_positions
+        background_image = ctx.background_image
+
+        if background_image is not None:
+            d_background_image = torch.zeros_like(background_image)
+        else:
+            d_background_image = None
+
+        start = time.time()
+        diffvg.render(scene,
+                      diffvg.float_ptr(background_image.data_ptr() if background_image is not None else 0),
+                      diffvg.float_ptr(0), # render_image
+                      diffvg.float_ptr(0), # render_sdf
+                      width,
+                      height,
+                      num_samples_x,
+                      num_samples_y,
+                      seed,
+                      diffvg.float_ptr(d_background_image.data_ptr() if background_image is not None else 0),
+                      diffvg.float_ptr(grad_img.data_ptr() if output_type == OutputType.color else 0),
+                      diffvg.float_ptr(grad_img.data_ptr() if output_type == OutputType.sdf else 0),
+                      diffvg.float_ptr(0), # d_translation
+                      use_prefiltering,
+                      diffvg.float_ptr(eval_positions.data_ptr()),
+                      eval_positions.shape[0])
+        time_elapsed = time.time() - start
+        global print_timing
+        if print_timing:
+            print('Backward pass, time: %.5f s' % time_elapsed)
+
+        d_args = []
+        d_args.append(None) # width
+        d_args.append(None) # height
+        d_args.append(None) # num_samples_x
+        d_args.append(None) # num_samples_y
+        d_args.append(None) # seed
+        d_args.append(d_background_image)
+        d_args.append(None) # canvas_width
+        d_args.append(None) # canvas_height
+        d_args.append(None) # num_shapes
+        d_args.append(None) # num_shape_groups
+        d_args.append(None) # output_type
+        d_args.append(None) # use_prefiltering
+        d_args.append(None) # eval_positions
+        for shape_id in range(scene.num_shapes):
+            d_args.append(None) # type
+            d_shape = scene.get_d_shape(shape_id)
+            use_thickness = False
+            if d_shape.type == diffvg.ShapeType.circle:
+                d_circle = d_shape.as_circle()
+                radius = torch.tensor(d_circle.radius)
+                assert(torch.isfinite(radius).all())
+                d_args.append(radius)
+                c = d_circle.center
+                c = torch.tensor((c.x, c.y))
+                assert(torch.isfinite(c).all())
+                d_args.append(c)
+            elif d_shape.type == diffvg.ShapeType.ellipse:
+                d_ellipse = d_shape.as_ellipse()
+                r = d_ellipse.radius
+                r = torch.tensor((d_ellipse.radius.x, d_ellipse.radius.y))
+                assert(torch.isfinite(r).all())
+                d_args.append(r)
+                c = d_ellipse.center
+                c = torch.tensor((c.x, c.y))
+                assert(torch.isfinite(c).all())
+                d_args.append(c)
+            elif d_shape.type == diffvg.ShapeType.path:
+                d_path = d_shape.as_path()
+                points = torch.zeros((d_path.num_points, 2))
+                thickness = None
+                if d_path.has_thickness():
+                    use_thickness = True
+                    thickness = torch.zeros(d_path.num_points)
+                    d_path.copy_to(diffvg.float_ptr(points.data_ptr()), diffvg.float_ptr(thickness.data_ptr()))
+                else:
+                    d_path.copy_to(diffvg.float_ptr(points.data_ptr()), diffvg.float_ptr(0))
+                assert(torch.isfinite(points).all())
+                if thickness is not None:
+                    assert(torch.isfinite(thickness).all())
+                d_args.append(None) # num_control_points
+                d_args.append(points)
+                d_args.append(thickness)
+                d_args.append(None) # is_closed
+                d_args.append(None) # use_distance_approx
+            elif d_shape.type == diffvg.ShapeType.rect:
+                d_rect = d_shape.as_rect()
+                p_min = torch.tensor((d_rect.p_min.x, d_rect.p_min.y))
+                p_max = torch.tensor((d_rect.p_max.x, d_rect.p_max.y))
+                assert(torch.isfinite(p_min).all())
+                assert(torch.isfinite(p_max).all())
+                d_args.append(p_min)
+                d_args.append(p_max)
+            else:
+                assert(False)
+            if use_thickness:
+                d_args.append(None)
+            else:
+                w = torch.tensor((d_shape.stroke_width))
+                assert(torch.isfinite(w).all())
+                d_args.append(w)
+
+        for group_id in range(scene.num_shape_groups):
+            d_shape_group = scene.get_d_shape_group(group_id)
+            d_args.append(None) # shape_ids
+            d_args.append(None) # fill_color_type
+            if d_shape_group.has_fill_color():
+                if d_shape_group.fill_color_type == diffvg.ColorType.constant:
+                    d_constant = d_shape_group.fill_color_as_constant()
+                    c = d_constant.color
+                    d_args.append(torch.tensor((c.x, c.y, c.z, c.w)))
+                elif d_shape_group.fill_color_type == diffvg.ColorType.linear_gradient:
+                    d_linear_gradient = d_shape_group.fill_color_as_linear_gradient()
+                    beg = d_linear_gradient.begin
+                    d_args.append(torch.tensor((beg.x, beg.y)))
+                    end = d_linear_gradient.end
+                    d_args.append(torch.tensor((end.x, end.y)))
+                    offsets = torch.zeros((d_linear_gradient.num_stops))
+                    stop_colors = torch.zeros((d_linear_gradient.num_stops, 4))
+                    d_linear_gradient.copy_to(\
+                        diffvg.float_ptr(offsets.data_ptr()),
+                        diffvg.float_ptr(stop_colors.data_ptr()))
+                    assert(torch.isfinite(stop_colors).all())
+                    d_args.append(offsets)
+                    d_args.append(stop_colors)
+                elif d_shape_group.fill_color_type == diffvg.ColorType.radial_gradient:
+                    d_radial_gradient = d_shape_group.fill_color_as_radial_gradient()
+                    center = d_radial_gradient.center
+                    d_args.append(torch.tensor((center.x, center.y)))
+                    radius = d_radial_gradient.radius
+                    d_args.append(torch.tensor((radius.x, radius.y)))
+                    offsets = torch.zeros((d_radial_gradient.num_stops))
+                    stop_colors = torch.zeros((d_radial_gradient.num_stops, 4))
+                    d_radial_gradient.copy_to(\
+                        diffvg.float_ptr(offsets.data_ptr()),
+                        diffvg.float_ptr(stop_colors.data_ptr()))
+                    assert(torch.isfinite(stop_colors).all())
+                    d_args.append(offsets)
+                    d_args.append(stop_colors)
+                else:
+                    assert(False)
+            d_args.append(None) # stroke_color_type
+            if d_shape_group.has_stroke_color():
+                if d_shape_group.stroke_color_type == diffvg.ColorType.constant:
+                    d_constant = d_shape_group.stroke_color_as_constant()
+                    c = d_constant.color
+                    d_args.append(torch.tensor((c.x, c.y, c.z, c.w)))
+                elif d_shape_group.stroke_color_type == diffvg.ColorType.linear_gradient:
+                    d_linear_gradient = d_shape_group.stroke_color_as_linear_gradient()
+                    beg = d_linear_gradient.begin
+                    d_args.append(torch.tensor((beg.x, beg.y)))
+                    end = d_linear_gradient.end
+                    d_args.append(torch.tensor((end.x, end.y)))
+                    offsets = torch.zeros((d_linear_gradient.num_stops))
+                    stop_colors = torch.zeros((d_linear_gradient.num_stops, 4))
+                    d_linear_gradient.copy_to(\
+                        diffvg.float_ptr(offsets.data_ptr()),
+                        diffvg.float_ptr(stop_colors.data_ptr()))
+                    assert(torch.isfinite(stop_colors).all())
+                    d_args.append(offsets)
+                    d_args.append(stop_colors)
+                elif d_shape_group.fill_color_type == diffvg.ColorType.radial_gradient:
+                    d_radial_gradient = d_shape_group.stroke_color_as_radial_gradient()
+                    center = d_radial_gradient.center
+                    d_args.append(torch.tensor((center.x, center.y)))
+                    radius = d_radial_gradient.radius
+                    d_args.append(torch.tensor((radius.x, radius.y)))
+                    offsets = torch.zeros((d_radial_gradient.num_stops))
+                    stop_colors = torch.zeros((d_radial_gradient.num_stops, 4))
+                    d_radial_gradient.copy_to(\
+                        diffvg.float_ptr(offsets.data_ptr()),
+                        diffvg.float_ptr(stop_colors.data_ptr()))
+                    assert(torch.isfinite(stop_colors).all())
+                    d_args.append(offsets)
+                    d_args.append(stop_colors)
+                else:
+                    assert(False)
+            d_args.append(None) # use_even_odd_rule
+            d_shape_to_canvas = torch.zeros((3, 3))
+            d_shape_group.copy_to(diffvg.float_ptr(d_shape_to_canvas.data_ptr()))
+            assert(torch.isfinite(d_shape_to_canvas).all())
+            d_args.append(d_shape_to_canvas)
+        d_args.append(None) # filter_type
+        d_args.append(torch.tensor(scene.get_d_filter_radius()))
+
+        return tuple(d_args)
diff --git a/pydiffvg/save_svg.py b/pydiffvg/save_svg.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f5641a63849cfec25fa2f560d50e92dc78576c3
--- /dev/null
+++ b/pydiffvg/save_svg.py
@@ -0,0 +1,167 @@
+import torch
+import pydiffvg
+import xml.etree.ElementTree as etree
+from xml.dom import minidom
+def prettify(elem):
+    """Return a pretty-printed XML string for the Element.
+    """
+    rough_string = etree.tostring(elem, 'utf-8')
+    reparsed = minidom.parseString(rough_string)
+    return reparsed.toprettyxml(indent="  ")
+def save_svg(filename, width, height, shapes, shape_groups, use_gamma = False, background=None):
+    root = etree.Element('svg')
+    root.set('version', '1.1')
+    root.set('xmlns', 'http://www.w3.org/2000/svg')
+    root.set('width', str(width))
+    root.set('height', str(height))
+    if background is not None:
+        print(f"setting background to {background}")
+        root.set('style', str(background))
+    defs = etree.SubElement(root, 'defs')
+    g = etree.SubElement(root, 'g')
+    if use_gamma:
+        f = etree.SubElement(defs, 'filter')
+        f.set('id', 'gamma')
+        f.set('x', '0')
+        f.set('y', '0')
+        f.set('width', '100%')
+        f.set('height', '100%')
+        gamma = etree.SubElement(f, 'feComponentTransfer')
+        gamma.set('color-interpolation-filters', 'sRGB')
+        feFuncR = etree.SubElement(gamma, 'feFuncR')
+        feFuncR.set('type', 'gamma')
+        feFuncR.set('amplitude', str(1))
+        feFuncR.set('exponent', str(1/2.2))
+        feFuncG = etree.SubElement(gamma, 'feFuncG')
+        feFuncG.set('type', 'gamma')
+        feFuncG.set('amplitude', str(1))
+        feFuncG.set('exponent', str(1/2.2))
+        feFuncB = etree.SubElement(gamma, 'feFuncB')
+        feFuncB.set('type', 'gamma')
+        feFuncB.set('amplitude', str(1))
+        feFuncB.set('exponent', str(1/2.2))
+        feFuncA = etree.SubElement(gamma, 'feFuncA')
+        feFuncA.set('type', 'gamma')
+        feFuncA.set('amplitude', str(1))
+        feFuncA.set('exponent', str(1/2.2))
+        g.set('style', 'filter:url(#gamma)')
+    # Store color
+    for i, shape_group in enumerate(shape_groups):
+        def add_color(shape_color, name):
+            if isinstance(shape_color, pydiffvg.LinearGradient):
+                lg = shape_color
+                color = etree.SubElement(defs, 'linearGradient')
+                color.set('id', name)
+                color.set('x1', str(lg.begin[0].item()/width))
+                color.set('y1', str(lg.begin[1].item()/height))
+                color.set('x2', str(lg.end[0].item()/width))
+                color.set('y2', str(lg.end[1].item()/height))
+                offsets = lg.offsets.data.cpu().numpy()
+                stop_colors = lg.stop_colors.data.cpu().numpy()
+                for j in range(offsets.shape[0]):
+                    stop = etree.SubElement(color, 'stop')
+                    stop.set('offset', str(offsets[j]))
+                    c = lg.stop_colors[j, :]
+                    stop.set('stop-color', 'rgb({}, {}, {})'.format(\
+                        int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                    stop.set('stop-opacity', '{}'.format(c[3]))
+            if isinstance(shape_color, pydiffvg.RadialGradient):
+                lg = shape_color
+                color = etree.SubElement(defs, 'radialGradient')
+                color.set('id', name)
+                color.set('cx', str(lg.center[0].item()/width))
+                color.set('cy', str(lg.center[1].item()/height))
+                # this only support width=height
+                color.set('r', str(lg.radius[0].item()/width))
+                offsets = lg.offsets.data.cpu().numpy()
+                stop_colors = lg.stop_colors.data.cpu().numpy()
+                for j in range(offsets.shape[0]):
+                    stop = etree.SubElement(color, 'stop')
+                    stop.set('offset', str(offsets[j]))
+                    c = lg.stop_colors[j, :]
+                    stop.set('stop-color', 'rgb({}, {}, {})'.format(\
+                        int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                    stop.set('stop-opacity', '{}'.format(c[3]))
+        if shape_group.fill_color is not None:
+            add_color(shape_group.fill_color, 'shape_{}_fill'.format(i))
+        if shape_group.stroke_color is not None:
+            add_color(shape_group.stroke_color, 'shape_{}_stroke'.format(i))
+    for i, shape_group in enumerate(shape_groups):
+        shape = shapes[shape_group.shape_ids[0]]
+        if isinstance(shape, pydiffvg.Circle):
+            shape_node = etree.SubElement(g, 'circle')
+            shape_node.set('r', str(shape.radius.item()))
+            shape_node.set('cx', str(shape.center[0].item()))
+            shape_node.set('cy', str(shape.center[1].item()))
+        elif isinstance(shape, pydiffvg.Polygon):
+            shape_node = etree.SubElement(g, 'polygon')
+            points = shape.points.data.cpu().numpy()
+            path_str = ''
+            for j in range(0, shape.points.shape[0]):
+                path_str += '{} {}'.format(points[j, 0], points[j, 1])
+                if j != shape.points.shape[0] - 1:
+                    path_str +=  ' '
+            shape_node.set('points', path_str)
+        elif isinstance(shape, pydiffvg.Path):
+            shape_node = etree.SubElement(g, 'path')
+            num_segments = shape.num_control_points.shape[0]
+            num_control_points = shape.num_control_points.data.cpu().numpy()
+            points = shape.points.data.cpu().numpy()
+            num_points = shape.points.shape[0]
+            path_str = 'M {} {}'.format(points[0, 0], points[0, 1])
+            point_id = 1
+            for j in range(0, num_segments):
+                if num_control_points[j] == 0:
+                    p = point_id % num_points
+                    path_str += ' L {} {}'.format(\
+                            points[p, 0], points[p, 1])
+                    point_id += 1
+                elif num_control_points[j] == 1:
+                    p1 = (point_id + 1) % num_points
+                    path_str += ' Q {} {} {} {}'.format(\
+                            points[point_id, 0], points[point_id, 1],
+                            points[p1, 0], points[p1, 1])
+                    point_id += 2
+                elif num_control_points[j] == 2:
+                    p2 = (point_id + 2) % num_points
+                    path_str += ' C {} {} {} {} {} {}'.format(\
+                            points[point_id, 0], points[point_id, 1],
+                            points[point_id + 1, 0], points[point_id + 1, 1],
+                            points[p2, 0], points[p2, 1])
+                    point_id += 3
+            shape_node.set('d', path_str)
+        elif isinstance(shape, pydiffvg.Rect):
+            shape_node = etree.SubElement(g, 'rect')
+            shape_node.set('x', str(shape.p_min[0].item()))
+            shape_node.set('y', str(shape.p_min[1].item()))
+            shape_node.set('width', str(shape.p_max[0].item() - shape.p_min[0].item()))
+            shape_node.set('height', str(shape.p_max[1].item() - shape.p_min[1].item()))
+        else:
+            assert(False)
+        shape_node.set('stroke-width', str(2 * shape.stroke_width.data.cpu().item()))
+        if shape_group.fill_color is not None:
+            if isinstance(shape_group.fill_color, pydiffvg.LinearGradient):
+                shape_node.set('fill', 'url(#shape_{}_fill)'.format(i))
+            elif isinstance(shape_group.fill_color, pydiffvg.RadialGradient):
+                shape_node.set('fill', 'url(#shape_{}_fill)'.format(i))
+            else:
+                c = shape_group.fill_color.data.cpu().numpy()
+                shape_node.set('fill', 'rgb({}, {}, {})'.format(\
+                    int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                shape_node.set('opacity', str(c[3]))
+        else:
+            shape_node.set('fill', 'none')
+        if shape_group.stroke_color is not None:
+            if isinstance(shape_group.stroke_color, pydiffvg.LinearGradient):
+                shape_node.set('stroke', 'url(#shape_{}_stroke)'.format(i))
+            elif isinstance(shape_group.stroke_color, pydiffvg.LinearGradient):
+                shape_node.set('stroke', 'url(#shape_{}_stroke)'.format(i))
+            else:
+                c = shape_group.stroke_color.data.cpu().numpy()
+                shape_node.set('stroke', 'rgb({}, {}, {})'.format(\
+                    int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                shape_node.set('stroke-opacity', str(c[3]))
+            shape_node.set('stroke-linecap', 'round')
+            shape_node.set('stroke-linejoin', 'round')
+    with open(filename, "w") as f:
+        f.write(prettify(root))
diff --git a/pydiffvg/shape.py b/pydiffvg/shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..a87e9e501b10a933afec844709f8d58670bb4ba9
--- /dev/null
+++ b/pydiffvg/shape.py
@@ -0,0 +1,172 @@
+import torch
+import svgpathtools
+import math
+
+class Circle:
+    def __init__(self, radius, center, stroke_width = torch.tensor(1.0), id = ''):
+        self.radius = radius
+        self.center = center
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Ellipse:
+    def __init__(self, radius, center, stroke_width = torch.tensor(1.0), id = ''):
+        self.radius = radius
+        self.center = center
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Path:
+    def __init__(self,
+                 num_control_points,
+                 points,
+                 is_closed,
+                 stroke_width = torch.tensor(1.0),
+                 id = '',
+                 use_distance_approx = False):
+        self.num_control_points = num_control_points
+        self.points = points
+        self.is_closed = is_closed
+        self.stroke_width = stroke_width
+        self.id = id
+        self.use_distance_approx = use_distance_approx
+
+class Polygon:
+    def __init__(self, points, is_closed, stroke_width = torch.tensor(1.0), id = ''):
+        self.points = points
+        self.is_closed = is_closed
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Rect:
+    def __init__(self, p_min, p_max, stroke_width = torch.tensor(1.0), id = ''):
+        self.p_min = p_min
+        self.p_max = p_max
+        self.stroke_width = stroke_width
+        self.id = id
+
+class ShapeGroup:
+    def __init__(self,
+                 shape_ids,
+                 fill_color,
+                 use_even_odd_rule = True,
+                 stroke_color = None,
+                 shape_to_canvas = torch.eye(3),
+                 id = ''):
+        self.shape_ids = shape_ids
+        self.fill_color = fill_color
+        self.use_even_odd_rule = use_even_odd_rule
+        self.stroke_color = stroke_color
+        self.shape_to_canvas = shape_to_canvas
+        self.id = id
+
+def from_svg_path(path_str, shape_to_canvas = torch.eye(3), force_close = False):
+    path = svgpathtools.parse_path(path_str)
+    if len(path) == 0:
+        return []
+    ret_paths = []
+    subpaths = path.continuous_subpaths()
+    for subpath in subpaths:
+        if subpath.isclosed():
+            if len(subpath) > 1 and isinstance(subpath[-1], svgpathtools.Line) and subpath[-1].length() < 1e-5:
+                subpath.remove(subpath[-1])
+                subpath[-1].end = subpath[0].start # Force closing the path
+                subpath.end = subpath[-1].end
+                assert(subpath.isclosed())
+        else:
+            beg = subpath[0].start
+            end = subpath[-1].end
+            if abs(end - beg) < 1e-5:
+                subpath[-1].end = beg # Force closing the path
+                subpath.end = subpath[-1].end
+                assert(subpath.isclosed())
+            elif force_close:
+                subpath.append(svgpathtools.Line(end, beg))
+                subpath.end = subpath[-1].end
+                assert(subpath.isclosed())
+
+        num_control_points = []
+        points = []
+
+        for i, e in enumerate(subpath):
+            if i == 0:
+                points.append((e.start.real, e.start.imag))
+            else:
+                # Must begin from the end of previous segment
+                assert(e.start.real == points[-1][0])
+                assert(e.start.imag == points[-1][1])
+            if isinstance(e, svgpathtools.Line):
+                num_control_points.append(0)
+            elif isinstance(e, svgpathtools.QuadraticBezier):
+                num_control_points.append(1)
+                points.append((e.control.real, e.control.imag))
+            elif isinstance(e, svgpathtools.CubicBezier):
+                num_control_points.append(2)
+                points.append((e.control1.real, e.control1.imag))
+                points.append((e.control2.real, e.control2.imag))
+            elif isinstance(e, svgpathtools.Arc):
+                # Convert to Cubic curves
+                # https://www.joecridge.me/content/pdf/bezier-arcs.pdf
+                start = e.theta * math.pi / 180.0
+                stop = (e.theta + e.delta) * math.pi / 180.0
+
+                sign = 1.0
+                if stop < start:
+                    sign = -1.0
+
+                epsilon = 0.00001
+                debug = abs(e.delta) >= 90.0
+                while (sign * (stop - start) > epsilon):
+                    arc_to_draw = stop - start
+                    if arc_to_draw > 0.0:
+                        arc_to_draw = min(arc_to_draw, 0.5 * math.pi)
+                    else:
+                        arc_to_draw = max(arc_to_draw, -0.5 * math.pi)
+                    alpha = arc_to_draw / 2.0
+                    cos_alpha = math.cos(alpha)
+                    sin_alpha = math.sin(alpha)
+                    cot_alpha = 1.0 / math.tan(alpha)
+                    phi = start + alpha
+                    cos_phi = math.cos(phi)
+                    sin_phi = math.sin(phi)
+                    lambda_ = (4.0 - cos_alpha) / 3.0
+                    mu = sin_alpha + (cos_alpha - lambda_) * cot_alpha
+                    last = sign * (stop - (start + arc_to_draw)) <= epsilon
+                    num_control_points.append(2)
+                    rx = e.radius.real
+                    ry = e.radius.imag
+                    cx = e.center.real
+                    cy = e.center.imag
+                    rot = e.phi * math.pi / 180.0
+                    cos_rot = math.cos(rot)
+                    sin_rot = math.sin(rot)
+                    x = lambda_ * cos_phi + mu * sin_phi
+                    y = lambda_ * sin_phi - mu * cos_phi
+                    xx = x * cos_rot - y * sin_rot
+                    yy = x * sin_rot + y * cos_rot
+                    points.append((cx + rx * xx, cy + ry * yy))
+                    x = lambda_ * cos_phi - mu * sin_phi
+                    y = lambda_ * sin_phi + mu * cos_phi
+                    xx = x * cos_rot - y * sin_rot
+                    yy = x * sin_rot + y * cos_rot
+                    points.append((cx + rx * xx, cy + ry * yy))
+                    if not last:
+                        points.append((cx + rx * math.cos(rot + start + arc_to_draw),
+                                       cy + ry * math.sin(rot + start + arc_to_draw)))
+                    start += arc_to_draw
+                    first = False
+            if i != len(subpath) - 1:
+                points.append((e.end.real, e.end.imag))
+            else:
+                if subpath.isclosed():
+                    # Must end at the beginning of first segment
+                    assert(e.end.real == points[0][0])
+                    assert(e.end.imag == points[0][1])
+                else:
+                    points.append((e.end.real, e.end.imag))
+        points = torch.tensor(points)
+        points = torch.cat((points, torch.ones([points.shape[0], 1])), dim = 1) @ torch.transpose(shape_to_canvas, 0, 1)
+        points = points / points[:, 2:3]
+        points = points[:, :2].contiguous()
+        ret_paths.append(Path(torch.tensor(num_control_points), points, subpath.isclosed()))
+    return ret_paths
diff --git a/pydiffvg_tensorflow/__init__.py b/pydiffvg_tensorflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..268652487893d06c52d96b459fa05da3a6363da6
--- /dev/null
+++ b/pydiffvg_tensorflow/__init__.py
@@ -0,0 +1,24 @@
+import tensorflow as tf
+try:
+    import diffvg
+except ImportError:
+    print("Warning: diffvg is not installed when you import pydiffvg_tensorflow.")
+from .device import *
+from .shape import *
+from .pixel_filter import *
+from .render_tensorflow import *
+from .image import *
+from .color import *
+import os.path
+
+print(os.path.dirname(diffvg.__file__))
+
+if tf.__cxx11_abi_flag__ == 0:
+    __data_ptr_module = tf.load_op_library(os.path.join(os.path.dirname(diffvg.__file__), 'libdiffvg_tf_data_ptr_no_cxx11_abi.so'))
+else:
+    assert(tf.__cxx11_abi_flag__ == 1)
+    __data_ptr_module = tf.load_op_library(os.path.join(os.path.dirname(diffvg.__file__), 'libdiffvg_tf_data_ptr_cxx11_abi.so'))
+
+def data_ptr(tensor):    
+    addr_as_uint64 = __data_ptr_module.data_ptr(tensor)
+    return int(addr_as_uint64)
diff --git a/pydiffvg_tensorflow/color.py b/pydiffvg_tensorflow/color.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0db61215407dfbee8e4021aa0b32e70df473ddb
--- /dev/null
+++ b/pydiffvg_tensorflow/color.py
@@ -0,0 +1,23 @@
+import tensorflow as tf
+
+class LinearGradient:
+    def __init__(self,
+                 begin = tf.constant([0.0, 0.0]),
+                 end = tf.constant([0.0, 0.0]),
+                 offsets = tf.constant([0.0]),
+                 stop_colors = tf.constant([0.0, 0.0, 0.0, 0.0])):
+        self.begin = begin
+        self.end = end
+        self.offsets = offsets
+        self.stop_colors = stop_colors
+
+class RadialGradient:
+    def __init__(self,
+                 center = tf.constant([0.0, 0.0]),
+                 radius = tf.constant([0.0, 0.0]),
+                 offsets = tf.constant([0.0]),
+                 stop_colors = tf.constant([0.0, 0.0, 0.0, 0.0])):
+        self.center = center
+        self.radius = radius
+        self.offsets = offsets
+        self.stop_colors = stop_colors
diff --git a/pydiffvg_tensorflow/custom_ops/CMakeLists.txt b/pydiffvg_tensorflow/custom_ops/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e15c953c45f1bbb7f3bd646b4fee5eed304d9bca
--- /dev/null
+++ b/pydiffvg_tensorflow/custom_ops/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.12)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+project(diffvgTFCustomOp)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+include_directories(SYSTEM ${TensorFlow_INCLUDE_DIR})
+
+# Compile two versions of the library
+add_library(diffvg_tf_data_ptr_cxx11_abi SHARED data_ptr.cc)
+set_target_properties(diffvg_tf_data_ptr_cxx11_abi PROPERTIES COMPILE_FLAGS -D_GLIBCXX_USE_CXX11_ABI=1)
+set_target_properties(diffvg_tf_data_ptr_cxx11_abi PROPERTIES LINK_FLAGS -D_GLIBCXX_USE_CXX11_ABI=1)
+if(APPLE)
+    # .so instead of .dylib
+    set_target_properties(diffvg_tf_data_ptr_cxx11_abi PROPERTIES SUFFIX .so)
+endif()
+target_link_libraries(diffvg_tf_data_ptr_cxx11_abi ${TensorFlow_LIBRARY})
+
+add_library(diffvg_tf_data_ptr_no_cxx11_abi SHARED data_ptr.cc)
+set_target_properties(diffvg_tf_data_ptr_no_cxx11_abi PROPERTIES COMPILE_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0)
+set_target_properties(diffvg_tf_data_ptr_no_cxx11_abi PROPERTIES LINK_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0)
+if(APPLE)
+    # .so instead of .dylib
+    set_target_properties(diffvg_tf_data_ptr_no_cxx11_abi PROPERTIES SUFFIX .so)
+endif()
+target_link_libraries(diffvg_tf_data_ptr_no_cxx11_abi ${TensorFlow_LIBRARY})
diff --git a/pydiffvg_tensorflow/custom_ops/data_ptr.cc b/pydiffvg_tensorflow/custom_ops/data_ptr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb3caff33daef92c30ddb12ce035176fdd01e308
--- /dev/null
+++ b/pydiffvg_tensorflow/custom_ops/data_ptr.cc
@@ -0,0 +1,88 @@
+// TODO: add back acknowledgement to the original author when release.
+
+#pragma warning(disable : 4003 4061 4100 4127 4242 4244 4267 4355 4365 4388 4464 4514 4574 4623 4625 4626 4647 4668 4710 4820 4946 5026 5027 5031 5039)
+
+// For windows
+#define NOMINMAX
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include <stdint.h>
+#include <climits>
+
+using namespace tensorflow;
+
+/* Tensorflow custom ops does not allow parameter types of list of 
+   various data types. Therefore, we can't pass a list but we have
+   to pass each objects individually. 
+
+   Consult Tensorflow source code: /tensorflow/core/framework/tensor.h
+   for what is supported by Tensorflow
+*/
+
+REGISTER_OP("DataPtr")
+    .Attr("T: {float, int32} = DT_INT32")  // To preserve backwards compatibility, you should specify a default value when adding an attr to an existing op:
+    .Input("input: T")  // Tensor
+    .Output("output: uint64")  // scalar
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, {}); // scalar
+      return Status::OK();
+    });
+
+template <typename T>
+class DataPtrOp : public OpKernel {
+ public:
+  explicit DataPtrOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input_tensor = context->input(0);
+    const T *tensor = input_tensor.flat<T>().data();
+
+    // Create an output tensor
+    // NOTE: The output datatype must match the Ops definition!!!.
+    Tensor* output_tensor = NULL;
+    // Always allocate on CPU
+    AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    OP_REQUIRES_OK(context, 
+      context->allocate_output(0, {},  // Initialize a one-element scalar
+      &output_tensor,
+      alloc_attr)
+    );
+    auto output_flat = output_tensor->flat<uint64>();
+
+    // Cast pointer to unsigned long int
+    uintptr_t addr = (uintptr_t)tensor;
+
+    // Cast unsigned long int -> unsigned int64
+    uint64 addr_converted = addr;
+
+    output_flat(0) = addr_converted;
+  }
+};
+
+// Polymorphism: https://www.tensorflow.org/guide/extend/op#polymorphism
+REGISTER_KERNEL_BUILDER(
+  Name("DataPtr")
+  .Device(DEVICE_CPU)
+  .TypeConstraint<int32>("T"),
+  DataPtrOp<int32>);
+REGISTER_KERNEL_BUILDER(
+  Name("DataPtr")
+  .Device(DEVICE_CPU)
+  .TypeConstraint<float>("T"),
+  DataPtrOp<float>);
+REGISTER_KERNEL_BUILDER(
+  Name("DataPtr")
+  .Device(DEVICE_GPU)
+  .TypeConstraint<int32>("T")
+  .HostMemory("output"),
+  DataPtrOp<int32>);
+REGISTER_KERNEL_BUILDER(
+  Name("DataPtr")
+  .Device(DEVICE_GPU)
+  .TypeConstraint<float>("T")
+  .HostMemory("output"),
+  DataPtrOp<float>);
diff --git a/pydiffvg_tensorflow/device.py b/pydiffvg_tensorflow/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..271b6bdb261894fddd398a47db5dd5000b5de775
--- /dev/null
+++ b/pydiffvg_tensorflow/device.py
@@ -0,0 +1,59 @@
+import tensorflow as tf
+
+use_gpu = tf.test.is_gpu_available(
+    cuda_only=True,
+    min_cuda_compute_capability=None
+)
+cpu_device_id = 0
+gpu_device_id = 0
+
+def get_device_name():
+    """
+        Get the current tensorflow device name we are using.
+    """
+    global use_gpu
+    global cpu_device_id
+    global gpu_device_id
+    return '/device:gpu:' + str(gpu_device_id) if use_gpu else '/device:cpu:' + str(cpu_device_id)
+
+def set_use_gpu(v: bool):
+    """
+        Set whether to use CUDA or not.
+    """
+    global use_gpu
+    use_gpu = v
+
+def get_use_gpu():
+    """
+        Get whether we are using CUDA or not.
+    """
+    global use_gpu
+    return use_gpu
+
+def set_cpu_device_id(did: int):
+    """
+        Set the cpu device id we are using.
+    """
+    global cpu_device_id
+    cpu_device_id = did
+
+def get_cpu_device_id():
+    """
+        Get the cpu device id we are using.
+    """
+    global cpu_device_id
+    return cpu_device_id
+
+def set_gpu_device_id(did: int):
+    """
+        Set the gpu device id we are using.
+    """
+    global gpu_device_id
+    gpu_device_id = did
+
+def get_gpu_device_id():
+    """
+        Get the gpu device id we are using.
+    """
+    global gpu_device_id
+    return gpu_device_id
diff --git a/pydiffvg_tensorflow/image.py b/pydiffvg_tensorflow/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..18eb1e6b66ae077b1c9d4b534a5fce250fe3958a
--- /dev/null
+++ b/pydiffvg_tensorflow/image.py
@@ -0,0 +1,22 @@
+import numpy as np
+import skimage
+import skimage.io
+import os
+
+def imwrite(img, filename, gamma = 2.2, normalize = False):
+    directory = os.path.dirname(filename)
+    if directory != '' and not os.path.exists(directory):
+        os.makedirs(directory)
+
+    if not isinstance(img, np.ndarray):
+        img = img.numpy()
+    if normalize:
+        img_rng = np.max(img) - np.min(img)
+        if img_rng > 0:
+            img = (img - np.min(img)) / img_rng
+    img = np.clip(img, 0.0, 1.0)
+    if img.ndim==2:
+        #repeat along the third dimension
+        img=np.expand_dims(img,2)
+    img[:, :, :3] = np.power(img[:, :, :3], 1.0/gamma)
+    skimage.io.imsave(filename, (img * 255).astype(np.uint8))
\ No newline at end of file
diff --git a/pydiffvg_tensorflow/pixel_filter.py b/pydiffvg_tensorflow/pixel_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eff01742bfcea55240dc4d2c50006e3dd42aadb
--- /dev/null
+++ b/pydiffvg_tensorflow/pixel_filter.py
@@ -0,0 +1,8 @@
+import tensorflow as tf
+
+class PixelFilter:
+    def __init__(self,
+                 type,
+                 radius = tf.constant(0.5)):
+        self.type = type
+        self.radius = radius
diff --git a/pydiffvg_tensorflow/render_tensorflow.py b/pydiffvg_tensorflow/render_tensorflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a7efaa3fddef32fc2619c3fcaa88881354a7e9f
--- /dev/null
+++ b/pydiffvg_tensorflow/render_tensorflow.py
@@ -0,0 +1,664 @@
+import os
+import tensorflow as tf
+import diffvg
+import pydiffvg_tensorflow as pydiffvg
+import time
+from enum import IntEnum
+import warnings
+
+print_timing = False
+__EMPTY_TENSOR = tf.constant([])
+
+def is_empty_tensor(tensor):
+    return  tf.equal(tf.size(tensor), 0)
+
+def set_print_timing(val):
+    global print_timing
+    print_timing=val
+
+class OutputType(IntEnum):
+    color = 1
+    sdf = 2
+
+class ShapeType:
+    __shapetypes = [
+        diffvg.ShapeType.circle,
+        diffvg.ShapeType.ellipse,
+        diffvg.ShapeType.path,
+        diffvg.ShapeType.rect
+    ]
+
+    @staticmethod
+    def asTensor(type):
+        for i in range(len(ShapeType.__shapetypes)):
+            if ShapeType.__shapetypes[i] == type:
+                return tf.constant(i)
+
+    @staticmethod
+    def asShapeType(index: tf.Tensor):
+        if is_empty_tensor(index):
+            return None
+        try:
+            type = ShapeType.__shapetypes[index]
+        except IndexError:
+            print(f'{index} is out of range: [0, {len(ShapeType.__shapetypes)})')
+            import sys
+            sys.exit()
+        else:
+            return type
+
+class ColorType:
+    __colortypes = [
+        diffvg.ColorType.constant,
+        diffvg.ColorType.linear_gradient,
+        diffvg.ColorType.radial_gradient
+    ]
+
+    @staticmethod
+    def asTensor(type):
+        for i in range(len(ColorType.__colortypes)):
+            if ColorType.__colortypes[i] == type:
+                return tf.constant(i)
+
+    @staticmethod
+    def asColorType(index: tf.Tensor):
+        if is_empty_tensor(index):
+            return None
+        try:
+            type = ColorType.__colortypes[index]
+        except IndexError:
+            print(f'{index} is out of range: [0, {len(ColorType.__colortypes)})')
+            import sys
+            sys.exit()
+        else:
+            return type
+
+class FilterType:
+    __filtertypes = [
+        diffvg.FilterType.box,
+        diffvg.FilterType.tent,
+        diffvg.FilterType.hann
+    ]
+
+    @staticmethod
+    def asTensor(type):
+        for i in range(len(FilterType.__filtertypes)):
+            if FilterType.__filtertypes[i] == type:
+                return tf.constant(i)    
+
+    @staticmethod
+    def asFilterType(index: tf.Tensor):
+        if is_empty_tensor(index):
+            return None
+        try:
+            type = FilterType.__filtertypes[index]
+        except IndexError:
+            print(f'{index} is out of range: [0, {len(FilterType.__filtertypes)})')
+            import sys
+            sys.exit()
+        else:
+            return type
+
+def serialize_scene(canvas_width,
+                    canvas_height,
+                    shapes,
+                    shape_groups,
+                    filter = pydiffvg.PixelFilter(type = diffvg.FilterType.box,
+                                                  radius = tf.constant(0.5)),
+                    output_type = OutputType.color,
+                    use_prefiltering = False):
+    """
+        Given a list of shapes, convert them to a linear list of argument,
+        so that we can use it in TF.
+    """
+    with tf.device('/device:cpu:' + str(pydiffvg.get_cpu_device_id())):
+        num_shapes = len(shapes)
+        num_shape_groups = len(shape_groups)
+        args = []
+        args.append(tf.constant(canvas_width))
+        args.append(tf.constant(canvas_height))
+        args.append(tf.constant(num_shapes))
+        args.append(tf.constant(num_shape_groups))
+        args.append(tf.constant(output_type))
+        args.append(tf.constant(use_prefiltering))
+        for shape in shapes:
+            if isinstance(shape, pydiffvg.Circle):
+                args.append(ShapeType.asTensor(diffvg.ShapeType.circle))
+                args.append(tf.identity(shape.radius))
+                args.append(tf.identity(shape.center))
+            elif isinstance(shape, pydiffvg.Ellipse):
+                args.append(ShapeType.asTensor(diffvg.ShapeType.ellipse))
+                args.append(tf.identity(shape.radius))
+                args.append(tf.identity(shape.center))
+            elif isinstance(shape, pydiffvg.Path):
+                assert(shape.points.shape[1] == 2)
+                args.append(ShapeType.asTensor(diffvg.ShapeType.path))
+                args.append(tf.identity(shape.num_control_points))
+                args.append(tf.identity(shape.points))
+                args.append(tf.constant(shape.is_closed))
+                args.append(tf.constant(shape.use_distance_approx))
+            elif isinstance(shape, pydiffvg.Polygon):
+                assert(shape.points.shape[1] == 2)
+                args.append(ShapeType.asTensor(diffvg.ShapeType.path))
+                if shape.is_closed:
+                    args.append(tf.zeros(shape.points.shape[0], dtype = tf.int32))
+                else:
+                    args.append(tf.zeros(shape.points.shape[0] - 1, dtype = tf.int32))
+                args.append(tf.identity(shape.points))
+                args.append(tf.constant(shape.is_closed))
+            elif isinstance(shape, pydiffvg.Rect):
+                args.append(ShapeType.asTensor(diffvg.ShapeType.rect))
+                args.append(tf.identity(shape.p_min))
+                args.append(tf.identity(shape.p_max))
+            else:
+                assert(False)
+            args.append(tf.identity(shape.stroke_width))
+
+        for shape_group in shape_groups:
+            args.append(tf.identity(shape_group.shape_ids))
+            # Fill color
+            if shape_group.fill_color is None:
+                args.append(__EMPTY_TENSOR)
+            elif tf.is_tensor(shape_group.fill_color):
+                args.append(ColorType.asTensor(diffvg.ColorType.constant))
+                args.append(tf.identity(shape_group.fill_color))
+            elif isinstance(shape_group.fill_color, pydiffvg.LinearGradient):
+                args.append(ColorType.asTensor(diffvg.ColorType.linear_gradient))
+                args.append(tf.identity(shape_group.fill_color.begin))
+                args.append(tf.identity(shape_group.fill_color.end))
+                args.append(tf.identity(shape_group.fill_color.offsets))
+                args.append(tf.identity(shape_group.fill_color.stop_colors))
+            elif isinstance(shape_group.fill_color, pydiffvg.RadialGradient):
+                args.append(ColorType.asTensor(diffvg.ColorType.radial_gradient))
+                args.append(tf.identity(shape_group.fill_color.center))
+                args.append(tf.identity(shape_group.fill_color.radius))
+                args.append(tf.identity(shape_group.fill_color.offsets))
+                args.append(tf.identity(shape_group.fill_color.stop_colors))
+
+            if shape_group.fill_color is not None:
+                # go through the underlying shapes and check if they are all closed
+                for shape_id in shape_group.shape_ids:
+                    if isinstance(shapes[shape_id], pydiffvg.Path):
+                        if not shapes[shape_id].is_closed:
+                            warnings.warn("Detected non-closed paths with fill color. This might causes unexpected results.", Warning)
+
+            # Stroke color
+            if shape_group.stroke_color is None:
+                args.append(__EMPTY_TENSOR)
+            elif tf.is_tensor(shape_group.stroke_color):
+                args.append(tf.constant(0))
+                args.append(tf.identity(shape_group.stroke_color))
+            elif isinstance(shape_group.stroke_color, pydiffvg.LinearGradient):
+                args.append(ColorType.asTensor(diffvg.ColorType.linear_gradient))
+                args.append(tf.identity(shape_group.stroke_color.begin))
+                args.append(tf.identity(shape_group.stroke_color.end))
+                args.append(tf.identity(shape_group.stroke_color.offsets))
+                args.append(tf.identity(shape_group.stroke_color.stop_colors))
+            elif isinstance(shape_group.stroke_color, pydiffvg.RadialGradient):
+                args.append(ColorType.asTensor(diffvg.ColorType.radial_gradient))
+                args.append(tf.identity(shape_group.stroke_color.center))
+                args.append(tf.identity(shape_group.stroke_color.radius))
+                args.append(tf.identity(shape_group.stroke_color.offsets))
+                args.append(tf.identity(shape_group.stroke_color.stop_colors))
+            args.append(tf.constant(shape_group.use_even_odd_rule))
+            # Transformation
+            args.append(tf.identity(shape_group.shape_to_canvas))
+        args.append(FilterType.asTensor(filter.type))
+        args.append(tf.constant(filter.radius))
+    return args
+
+class Context: pass
+
+def forward(width,
+            height,
+            num_samples_x,
+            num_samples_y,
+            seed,
+            *args):
+    """
+        Forward rendering pass: given a serialized scene and output an image.
+    """
+    # Unpack arguments
+    with tf.device('/device:cpu:' + str(pydiffvg.get_cpu_device_id())):
+        current_index = 0
+        canvas_width = int(args[current_index])
+        current_index += 1
+        canvas_height = int(args[current_index])
+        current_index += 1
+        num_shapes = int(args[current_index])
+        current_index += 1
+        num_shape_groups = int(args[current_index])
+        current_index += 1
+        output_type = OutputType(int(args[current_index]))
+        current_index += 1
+        use_prefiltering = bool(args[current_index])
+        current_index += 1
+        shapes = []
+        shape_groups = []
+        shape_contents = [] # Important to avoid GC deleting the shapes
+        color_contents = [] # Same as above
+        for shape_id in range(num_shapes):
+            shape_type = ShapeType.asShapeType(args[current_index])
+            current_index += 1
+            if shape_type == diffvg.ShapeType.circle:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Circle(float(radius),
+                                      diffvg.Vector2f(float(center[0]), float(center[1])))
+            elif shape_type == diffvg.ShapeType.ellipse:
+                radius = args[current_index]
+                current_index += 1
+                center = args[current_index]
+                current_index += 1
+                shape = diffvg.Ellipse(diffvg.Vector2f(float(radius[0]), float(radius[1])),
+                                       diffvg.Vector2f(float(center[0]), float(center[1])))
+            elif shape_type == diffvg.ShapeType.path:
+                num_control_points = args[current_index]
+                current_index += 1
+                points = args[current_index]
+                current_index += 1
+                is_closed = args[current_index]
+                current_index += 1
+                use_distance_approx = args[current_index]
+                current_index += 1
+                shape = diffvg.Path(diffvg.int_ptr(pydiffvg.data_ptr(num_control_points)),
+                                    diffvg.float_ptr(pydiffvg.data_ptr(points)),
+                                    diffvg.float_ptr(0), # thickness
+                                    num_control_points.shape[0],
+                                    points.shape[0],
+                                    is_closed,
+                                    use_distance_approx)
+            elif shape_type == diffvg.ShapeType.rect:
+                p_min = args[current_index]
+                current_index += 1
+                p_max = args[current_index]
+                current_index += 1
+                shape = diffvg.Rect(diffvg.Vector2f(float(p_min[0]), float(p_min[1])),
+                                    diffvg.Vector2f(float(p_max[0]), float(p_max[1])))
+            else:
+                assert(False)
+            stroke_width = args[current_index]
+            current_index += 1
+            shapes.append(diffvg.Shape(\
+                shape_type, shape.get_ptr(), float(stroke_width)))
+            shape_contents.append(shape)
+
+        for shape_group_id in range(num_shape_groups):
+            shape_ids = args[current_index]
+            current_index += 1
+            fill_color_type = ColorType.asColorType(args[current_index])
+            current_index += 1
+            if fill_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                fill_color = diffvg.Constant(\
+                    diffvg.Vector4f(color[0], color[1], color[2], color[3]))
+            elif fill_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.LinearGradient(diffvg.Vector2f(float(beg[0]), float(beg[1])),
+                                                   diffvg.Vector2f(float(end[0]), float(end[1])),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                                                   diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+            elif fill_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                fill_color = diffvg.RadialGradient(diffvg.Vector2f(float(center[0]), float(center[1])),
+                                                   diffvg.Vector2f(float(radius[0]), float(radius[1])),
+                                                   offsets.shape[0],
+                                                   diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                                                   diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+            elif fill_color_type is None:
+                fill_color = None
+            else:
+                assert(False)
+
+            stroke_color_type = ColorType.asColorType(args[current_index])
+            current_index += 1
+            if stroke_color_type == diffvg.ColorType.constant:
+                color = args[current_index]
+                current_index += 1
+                stroke_color = diffvg.Constant(\
+                    diffvg.Vector4f(float(color[0]),
+                                    float(color[1]),
+                                    float(color[2]),
+                                    float(color[3])))
+            elif stroke_color_type == diffvg.ColorType.linear_gradient:
+                beg = args[current_index]
+                current_index += 1
+                end = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.LinearGradient(\
+                    diffvg.Vector2f(float(beg[0]), float(beg[1])),
+                    diffvg.Vector2f(float(end[0]), float(end[1])),
+                    offsets.shape[0],
+                    diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                    diffvg.float_ptr(stop_colors.data_ptr()))
+            elif stroke_color_type == diffvg.ColorType.radial_gradient:
+                center = args[current_index]
+                current_index += 1
+                radius = args[current_index]
+                current_index += 1
+                offsets = args[current_index]
+                current_index += 1
+                stop_colors = args[current_index]
+                current_index += 1
+                assert(offsets.shape[0] == stop_colors.shape[0])
+                stroke_color = diffvg.RadialGradient(\
+                    diffvg.Vector2f(float(center[0]), float(center[1])),
+                    diffvg.Vector2f(float(radius[0]), float(radius[1])),
+                    offsets.shape[0],
+                    diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                    diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+            elif stroke_color_type is None:
+                stroke_color = None
+            else:
+                assert(False)
+            use_even_odd_rule = bool(args[current_index])
+            current_index += 1
+            shape_to_canvas = args[current_index]
+            current_index += 1
+
+            if fill_color is not None:
+                color_contents.append(fill_color)
+            if stroke_color is not None:
+                color_contents.append(stroke_color)
+            shape_groups.append(diffvg.ShapeGroup(\
+                diffvg.int_ptr(pydiffvg.data_ptr(shape_ids)),
+                shape_ids.shape[0],
+                diffvg.ColorType.constant if fill_color_type is None else fill_color_type,
+                diffvg.void_ptr(0) if fill_color is None else fill_color.get_ptr(),
+                diffvg.ColorType.constant if stroke_color_type is None else stroke_color_type,
+                diffvg.void_ptr(0) if stroke_color is None else stroke_color.get_ptr(),
+                use_even_odd_rule,
+                diffvg.float_ptr(pydiffvg.data_ptr(shape_to_canvas))))
+
+        filter_type = FilterType.asFilterType(args[current_index])
+        current_index += 1
+        filter_radius = args[current_index]
+        current_index += 1
+        filt = diffvg.Filter(filter_type, filter_radius)
+
+    device_name = pydiffvg.get_device_name()
+    device_spec = tf.DeviceSpec.from_string(device_name)
+    use_gpu = device_spec.device_type == 'GPU'
+    gpu_index = device_spec.device_index if device_spec.device_index is not None else 0
+
+    start = time.time()
+    scene = diffvg.Scene(canvas_width,
+                         canvas_height,
+                         shapes,
+                         shape_groups,
+                         filt,
+                         use_gpu,
+                         gpu_index)
+    time_elapsed = time.time() - start
+    global print_timing
+    if print_timing:
+        print('Scene construction, time: %.5f s' % time_elapsed)
+
+    with tf.device(device_name):
+        if output_type == OutputType.color:
+            rendered_image = tf.zeros((int(height), int(width), 4), dtype = tf.float32)
+        else:
+            assert(output_type == OutputType.sdf)
+            rendered_image = tf.zeros((int(height), int(width), 1), dtype = tf.float32)
+
+        start = time.time()
+        diffvg.render(scene,
+                      diffvg.float_ptr(0), # background image
+                      diffvg.float_ptr(pydiffvg.data_ptr(rendered_image) if output_type == OutputType.color else 0),
+                      diffvg.float_ptr(pydiffvg.data_ptr(rendered_image) if output_type == OutputType.sdf else 0),
+                      width,
+                      height,
+                      int(num_samples_x),
+                      int(num_samples_y),
+                      seed,
+                      diffvg.float_ptr(0), # d_background_image
+                      diffvg.float_ptr(0), # d_render_image
+                      diffvg.float_ptr(0), # d_render_sdf
+                      diffvg.float_ptr(0), # d_translation
+                      use_prefiltering,
+                      diffvg.float_ptr(0), # eval_positions
+                      0 ) # num_eval_positions (automatically set to entire raster)
+        time_elapsed = time.time() - start
+        if print_timing:
+            print('Forward pass, time: %.5f s' % time_elapsed)
+
+    ctx = Context()
+    ctx.scene = scene
+    ctx.shape_contents = shape_contents
+    ctx.color_contents = color_contents
+    ctx.filter = filt
+    ctx.width = width
+    ctx.height = height
+    ctx.num_samples_x = num_samples_x
+    ctx.num_samples_y = num_samples_y
+    ctx.seed = seed
+    ctx.output_type = output_type
+    ctx.use_prefiltering = use_prefiltering
+    return rendered_image, ctx
+
+@tf.custom_gradient
+def render(*x):
+    """
+        The main TensorFlow interface of C++ diffvg.
+    """
+    assert(tf.executing_eagerly())
+    if pydiffvg.get_use_gpu() and os.environ.get('TF_FORCE_GPU_ALLOW_GROWTH') != 'true':
+        print('******************** WARNING ********************')
+        print('Tensorflow by default allocates all GPU memory,')
+        print('causing huge amount of page faults when rendering.')
+        print('Please set the environment variable TF_FORCE_GPU_ALLOW_GROWTH to true,')
+        print('so that Tensorflow allocates memory on demand.')
+        print('*************************************************')
+
+    width = x[0]
+    height = x[1]
+    num_samples_x = x[2]
+    num_samples_y = x[3]
+    seed = x[4]
+    args = x[5:]
+    img, ctx = forward(width, height, num_samples_x, num_samples_y, seed, *args)
+
+    def backward(grad_img):
+        scene = ctx.scene
+        width = ctx.width
+        height = ctx.height
+        num_samples_x = ctx.num_samples_x
+        num_samples_y = ctx.num_samples_y
+        seed = ctx.seed
+        output_type = ctx.output_type
+        use_prefiltering = ctx.use_prefiltering
+
+        start = time.time()
+        with tf.device(pydiffvg.get_device_name()):
+            diffvg.render(scene,
+                          diffvg.float_ptr(0), # background_image
+                          diffvg.float_ptr(0), # render_image
+                          diffvg.float_ptr(0), # render_sdf
+                          width,
+                          height,
+                          num_samples_x,
+                          num_samples_y,
+                          seed,
+                          diffvg.float_ptr(0), # d_background_image
+                          diffvg.float_ptr(pydiffvg.data_ptr(grad_img) if output_type == OutputType.color else 0),
+                          diffvg.float_ptr(pydiffvg.data_ptr(grad_img) if output_type == OutputType.sdf else 0),
+                          diffvg.float_ptr(0), # d_translation
+                          use_prefiltering,
+                          diffvg.float_ptr(0), # eval_positions
+                          0 ) # num_eval_positions (automatically set to entire raster))
+        time_elapsed = time.time() - start
+        global print_timing
+        if print_timing:
+            print('Backward pass, time: %.5f s' % time_elapsed)
+
+        with tf.device('/device:cpu:' + str(pydiffvg.get_cpu_device_id())):
+            d_args = []
+            d_args.append(None) # width
+            d_args.append(None) # height
+            d_args.append(None) # num_samples_x
+            d_args.append(None) # num_samples_y
+            d_args.append(None) # seed
+            d_args.append(None) # canvas_width
+            d_args.append(None) # canvas_height
+            d_args.append(None) # num_shapes
+            d_args.append(None) # num_shape_groups
+            d_args.append(None) # output_type
+            d_args.append(None) # use_prefiltering
+            for shape_id in range(scene.num_shapes):
+                d_args.append(None) # type
+                d_shape = scene.get_d_shape(shape_id)
+                if d_shape.type == diffvg.ShapeType.circle:
+                    d_circle = d_shape.as_circle()
+                    radius = tf.constant(d_circle.radius)
+                    d_args.append(radius)
+                    c = d_circle.center
+                    c = tf.constant((c.x, c.y))
+                    d_args.append(c)
+                elif d_shape.type == diffvg.ShapeType.ellipse:
+                    d_ellipse = d_shape.as_ellipse()
+                    r = d_ellipse.radius
+                    r = tf.constant((d_ellipse.radius.x, d_ellipse.radius.y))
+                    d_args.append(r)
+                    c = d_ellipse.center
+                    c = tf.constant((c.x, c.y))
+                    d_args.append(c)
+                elif d_shape.type == diffvg.ShapeType.path:
+                    d_path = d_shape.as_path()
+                    points = tf.zeros((d_path.num_points, 2), dtype=tf.float32)
+                    d_path.copy_to(diffvg.float_ptr(pydiffvg.data_ptr(points)),diffvg.float_ptr(0))
+                    d_args.append(None) # num_control_points
+                    d_args.append(points)
+                    d_args.append(None) # is_closed
+                    d_args.append(None) # use_distance_approx
+                elif d_shape.type == diffvg.ShapeType.rect:
+                    d_rect = d_shape.as_rect()
+                    p_min = tf.constant((d_rect.p_min.x, d_rect.p_min.y))
+                    p_max = tf.constant((d_rect.p_max.x, d_rect.p_max.y))
+                    d_args.append(p_min)
+                    d_args.append(p_max)
+                else:
+                    assert(False)
+                w = tf.constant((d_shape.stroke_width))
+                d_args.append(w)
+
+            for group_id in range(scene.num_shape_groups):
+                d_shape_group = scene.get_d_shape_group(group_id)
+                d_args.append(None) # shape_ids
+                d_args.append(None) # fill_color_type
+                if d_shape_group.has_fill_color():
+                    if d_shape_group.fill_color_type == diffvg.ColorType.constant:
+                        d_constant = d_shape_group.fill_color_as_constant()
+                        c = d_constant.color
+                        d_args.append(tf.constant((c.x, c.y, c.z, c.w)))
+                    elif d_shape_group.fill_color_type == diffvg.ColorType.linear_gradient:
+                        d_linear_gradient = d_shape_group.fill_color_as_linear_gradient()
+                        beg = d_linear_gradient.begin
+                        d_args.append(tf.constant((beg.x, beg.y)))
+                        end = d_linear_gradient.end
+                        d_args.append(tf.constant((end.x, end.y)))
+                        offsets = tf.zeros((d_linear_gradient.num_stops), dtype=tf.float32)
+                        stop_colors = tf.zeros((d_linear_gradient.num_stops, 4), dtype=tf.float32)
+                        # HACK: tensorflow's eager mode uses a cache to store scalar
+                        #       constants to avoid memory copy. If we pass scalar tensors
+                        #       into the C++ code and modify them, we would corrupt the
+                        #       cache, causing incorrect result in future scalar constant
+                        #       creations. Thus we force tensorflow to copy by plusing a zero.
+                        # (also see https://github.com/tensorflow/tensorflow/issues/11186
+                        #  for more discussion regarding copying tensors)
+                        if offsets.shape.num_elements() == 1:
+                            offsets = offsets + 0
+                        d_linear_gradient.copy_to(\
+                            diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                            diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+                        d_args.append(offsets)
+                        d_args.append(stop_colors)
+                    elif d_shape_group.fill_color_type == diffvg.ColorType.radial_gradient:
+                        d_radial_gradient = d_shape_group.fill_color_as_radial_gradient()
+                        center = d_radial_gradient.center
+                        d_args.append(tf.constant((center.x, center.y)))
+                        radius = d_radial_gradient.radius
+                        d_args.append(tf.constant((radius.x, radius.y)))
+                        offsets = tf.zeros((d_radial_gradient.num_stops))
+                        if offsets.shape.num_elements() == 1:
+                            offsets = offsets + 0
+                        stop_colors = tf.zeros((d_radial_gradient.num_stops, 4))
+                        d_radial_gradient.copy_to(\
+                            diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                            diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+                        d_args.append(offsets)
+                        d_args.append(stop_colors)
+                    else:
+                        assert(False)
+                d_args.append(None) # stroke_color_type
+                if d_shape_group.has_stroke_color():
+                    if d_shape_group.stroke_color_type == diffvg.ColorType.constant:
+                        d_constant = d_shape_group.stroke_color_as_constant()
+                        c = d_constant.color
+                        d_args.append(tf.constant((c.x, c.y, c.z, c.w)))
+                    elif d_shape_group.stroke_color_type == diffvg.ColorType.linear_gradient:
+                        d_linear_gradient = d_shape_group.stroke_color_as_linear_gradient()
+                        beg = d_linear_gradient.begin
+                        d_args.append(tf.constant((beg.x, beg.y)))
+                        end = d_linear_gradient.end
+                        d_args.append(tf.constant((end.x, end.y)))
+                        offsets = tf.zeros((d_linear_gradient.num_stops))
+                        stop_colors = tf.zeros((d_linear_gradient.num_stops, 4))
+                        if offsets.shape.num_elements() == 1:
+                            offsets = offsets + 0
+                        d_linear_gradient.copy_to(\
+                            diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                            diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+                        d_args.append(offsets)
+                        d_args.append(stop_colors)
+                    elif d_shape_group.fill_color_type == diffvg.ColorType.radial_gradient:
+                        d_radial_gradient = d_shape_group.stroke_color_as_radial_gradient()
+                        center = d_radial_gradient.center
+                        d_args.append(tf.constant((center.x, center.y)))
+                        radius = d_radial_gradient.radius
+                        d_args.append(tf.constant((radius.x, radius.y)))
+                        offsets = tf.zeros((d_radial_gradient.num_stops))
+                        stop_colors = tf.zeros((d_radial_gradient.num_stops, 4))
+                        if offsets.shape.num_elements() == 1:
+                            offsets = offsets + 0
+                        d_radial_gradient.copy_to(\
+                            diffvg.float_ptr(pydiffvg.data_ptr(offsets)),
+                            diffvg.float_ptr(pydiffvg.data_ptr(stop_colors)))
+                        d_args.append(offsets)
+                        d_args.append(stop_colors)
+                    else:
+                        assert(False)
+                d_args.append(None) # use_even_odd_rule
+                d_shape_to_canvas = tf.zeros((3, 3), dtype = tf.float32)
+                d_shape_group.copy_to(diffvg.float_ptr(pydiffvg.data_ptr(d_shape_to_canvas)))
+                d_args.append(d_shape_to_canvas)
+            d_args.append(None) # filter_type
+            d_args.append(tf.constant(scene.get_d_filter_radius()))
+
+        return d_args
+
+    return img, backward
diff --git a/pydiffvg_tensorflow/shape.py b/pydiffvg_tensorflow/shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..432a3b5dc2fd1b8eb03c306a8123c76e6b9302ff
--- /dev/null
+++ b/pydiffvg_tensorflow/shape.py
@@ -0,0 +1,54 @@
+import tensorflow as tf
+import math
+
+class Circle:
+    def __init__(self, radius, center, stroke_width = tf.constant(1.0), id = ''):
+        self.radius = radius
+        self.center = center
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Ellipse:
+    def __init__(self, radius, center, stroke_width = tf.constant(1.0), id = ''):
+        self.radius = radius
+        self.center = center
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Path:
+    def __init__(self, num_control_points, points, is_closed, stroke_width = tf.constant(1.0), id = '', use_distance_approx = False):
+        self.num_control_points = num_control_points
+        self.points = points
+        self.is_closed = is_closed
+        self.stroke_width = stroke_width
+        self.id = id
+        self.use_distance_approx = use_distance_approx
+
+class Polygon:
+    def __init__(self, points, is_closed, stroke_width = tf.constant(1.0), id = ''):
+        self.points = points
+        self.is_closed = is_closed
+        self.stroke_width = stroke_width
+        self.id = id
+
+class Rect:
+    def __init__(self, p_min, p_max, stroke_width = tf.constant(1.0), id = ''):
+        self.p_min = p_min
+        self.p_max = p_max
+        self.stroke_width = stroke_width
+        self.id = id
+
+class ShapeGroup:
+    def __init__(self,
+                 shape_ids,
+                 fill_color,
+                 use_even_odd_rule = True,
+                 stroke_color = None,
+                 shape_to_canvas = tf.eye(3),
+                 id = ''):
+        self.shape_ids = shape_ids
+        self.fill_color = fill_color
+        self.use_even_odd_rule = use_even_odd_rule
+        self.stroke_color = stroke_color
+        self.shape_to_canvas = shape_to_canvas
+        self.id = id
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..5fa45d0c52bba60af064d64b035a11c5944968ec
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,26 @@
+[tool.poetry]
+name = "diffvg"
+version = "0.1.0"
+description = ""
+authors = ["Marco Lee <marco@goodnotesapp.com>"]
+
+[tool.poetry.dependencies]
+python = "=3.8"
+pygame = "^2.0.1"
+
+[tool.poetry.dev-dependencies]
+torch = "^1.8.1"
+torchvision = "^0.9.1"
+numpy = "^1.20.2"
+scikit-image = "^0.18.1"
+svgwrite = "^1.4.1"
+svgpathtools = "^1.4.1"
+cssutils = "^2.2.0"
+numba = "^0.53.1"
+torch-tools = "^0.1.5"
+visdom = "^0.1.8"
+cmake = "^3.18.4"
+
+[build-system]
+requires = ["poetry-core>=1.0.0a5"]
+build-backend = "poetry.core.masonry.api"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3be824e99dd6af86cc7203802c4d68505b8f4a08
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,55 @@
+# LIVE
+torch
+torchvision
+numpy
+scikit-image
+cmake
+ffmpeg
+svgwrite
+svgpathtools
+cssutils
+numba
+torch-tools
+scikit-fmm
+easydict
+visdom
+opencv-python==4.5.4.60
+PyYAML>=5.3.1
+
+
+
+# Base ----------------------------------------
+# matplotlib>=3.2.2
+# numpy>=1.18.5
+# opencv-python-headless>=4.5.5.64
+# Pillow>=7.1.2
+# PyYAML>=5.3.1
+# requests>=2.23.0
+# scipy>=1.4.1
+# torch>=1.7.0
+# torchvision>=0.8.1
+# tqdm>=4.41.0
+
+# Logging -------------------------------------
+# tensorboard>=2.4.1
+# wandb
+
+# Plotting ------------------------------------
+# pandas>=1.1.4
+# seaborn>=0.11.0
+
+# Export --------------------------------------
+# coremltools>=4.1  # CoreML export
+# onnx>=1.9.0  # ONNX export
+# onnx-simplifier>=0.3.6  # ONNX simplifier
+# scikit-learn==0.19.2  # CoreML quantization
+# tensorflow>=2.4.1  # TFLite export
+# tensorflowjs>=3.9.0  # TF.js export
+# openvino-dev  # OpenVINO export
+
+# Extras --------------------------------------
+# albumentations>=1.0.3
+# Cython  # for pycocotools https://github.com/cocodataset/cocoapi/issues/172
+# pycocotools>=2.0  # COCO mAP
+# roboflow
+# thop  # FLOPs computation
diff --git a/sample_boundary.h b/sample_boundary.h
new file mode 100644
index 0000000000000000000000000000000000000000..28af12959f578c9f72872c85b59b957729c5ba68
--- /dev/null
+++ b/sample_boundary.h
@@ -0,0 +1,454 @@
+#pragma once
+
+#include "diffvg.h"
+#include "shape.h"
+#include "scene.h"
+#include "vector.h"
+#include "cdf.h"
+
+struct PathBoundaryData {
+    int base_point_id;
+    int point_id;
+    float t;
+};
+
+struct BoundaryData {
+    PathBoundaryData path;
+    bool is_stroke;
+};
+
+DEVICE
+Vector2f sample_boundary(const Circle &circle,
+                         float t,
+                         Vector2f &normal,
+                         float &pdf,
+                         BoundaryData &,
+                         float stroke_perturb_direction,
+                         float stroke_radius) {
+    // Parametric form of a circle (t in [0, 1)):
+    // x = center.x + r * cos(2pi * t)
+    // y = center.y + r * sin(2pi * t)
+    auto offset = Vector2f{
+        circle.radius * cos(2 * float(M_PI) * t),
+        circle.radius * sin(2 * float(M_PI) * t)
+    };
+    normal = normalize(offset);
+    pdf /= (2 * float(M_PI) * circle.radius);
+    auto ret = circle.center + offset;
+    if (stroke_perturb_direction != 0.f) {
+        ret += stroke_perturb_direction * stroke_radius * normal;
+        if (stroke_perturb_direction < 0) {
+            // normal should point towards the perturb direction
+            normal = -normal;
+        }
+    }
+    return ret;
+}
+
+DEVICE
+Vector2f sample_boundary(const Ellipse &ellipse,
+                         float t,
+                         Vector2f &normal,
+                         float &pdf,
+                         BoundaryData &,
+                         float stroke_perturb_direction,
+                         float stroke_radius) {
+    // Parametric form of a ellipse (t in [0, 1)):
+    // x = center.x + r.x * cos(2pi * t)
+    // y = center.y + r.y * sin(2pi * t)
+    const auto &r = ellipse.radius;
+    auto offset = Vector2f{
+        r.x * cos(2 * float(M_PI) * t),
+        r.y * sin(2 * float(M_PI) * t)
+    };
+    auto dxdt = -r.x * sin(2 * float(M_PI) * t) * 2 * float(M_PI);
+    auto dydt = r.y * cos(2 * float(M_PI) * t) * 2 * float(M_PI);
+    // tangent is normalize(dxdt, dydt)
+    normal = normalize(Vector2f{dydt, -dxdt});
+    pdf /= sqrt(square(dxdt) + square(dydt));
+    auto ret = ellipse.center + offset;
+    if (stroke_perturb_direction != 0.f) {
+        ret += stroke_perturb_direction * stroke_radius * normal;
+        if (stroke_perturb_direction < 0) {
+            // normal should point towards the perturb direction
+            normal = -normal;
+        }
+    }
+    return ret;
+}
+
+DEVICE
+Vector2f sample_boundary(const Path &path,
+                         const float *path_length_cdf,
+                         const float *path_length_pmf,
+                         const int *point_id_map,
+                         float path_length,
+                         float t,
+                         Vector2f &normal,
+                         float &pdf,
+                         BoundaryData &data,
+                         float stroke_perturb_direction,
+                         float stroke_radius) {
+    if (stroke_perturb_direction != 0.f && !path.is_closed) {
+        // We need to samples the "caps" of the path
+        // length of a cap is pi * abs(stroke_perturb_direction)
+        // there are two caps
+        auto cap_length = 0.f;
+        if (path.thickness != nullptr) {
+            auto r0 = path.thickness[0];
+            auto r1 = path.thickness[path.num_points - 1];
+            cap_length = float(M_PI) * (r0 + r1);
+        } else {
+            cap_length = 2 * float(M_PI) * stroke_radius;
+        }
+        auto cap_prob = cap_length / (cap_length + path_length);
+        if (t < cap_prob) {
+            t = t / cap_prob;
+            pdf *= cap_prob;
+            auto r0 = stroke_radius;
+            auto r1 = stroke_radius;
+            if (path.thickness != nullptr) {
+                r0 = path.thickness[0];
+                r1 = path.thickness[path.num_points - 1];
+            }
+            // HACK: in theory we want to compute the tangent and
+            //       sample the hemi-circle, but here we just sample the
+            //       full circle since it's less typing
+            if (stroke_perturb_direction < 0) {
+                // Sample the cap at the beginning
+                auto p0 = Vector2f{path.points[0], path.points[1]};
+                auto offset = Vector2f{
+                    r0 * cos(2 * float(M_PI) * t),
+                    r0 * sin(2 * float(M_PI) * t)
+                };
+                normal = normalize(offset);
+                pdf /= (2 * float(M_PI) * r0);
+                data.path.base_point_id = 0;
+                data.path.point_id = 0;
+                data.path.t = 0;
+                return p0 + offset;
+            } else {
+                // Sample the cap at the end
+                auto p0 = Vector2f{path.points[2 * (path.num_points - 1)],
+                                   path.points[2 * (path.num_points - 1) + 1]};
+                auto offset = Vector2f{
+                    r1 * cos(2 * float(M_PI) * t),
+                    r1 * sin(2 * float(M_PI) * t)
+                };
+                normal = normalize(offset);
+                pdf /= (2 * float(M_PI) * r1);
+                data.path.base_point_id = path.num_base_points - 1;
+                data.path.point_id = path.num_points - 2 - 
+                                     path.num_control_points[data.path.base_point_id];
+                data.path.t = 1;
+                return p0 + offset;
+            }
+        } else {
+            t = (t - cap_prob) / (1 - cap_prob);
+            pdf *= (1 - cap_prob);
+        }
+    }
+    // Binary search on path_length_cdf
+    auto sample_id = sample(path_length_cdf,
+                            path.num_base_points,
+                            t,
+                            &t);
+    assert(sample_id >= 0 && sample_id < path.num_base_points);
+    auto point_id = point_id_map[sample_id];
+    if (path.num_control_points[sample_id] == 0) {
+        // Straight line
+        auto i0 = point_id;
+        auto i1 = (i0 + 1) % path.num_points;
+        assert(i0 < path.num_points);
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        data.path.base_point_id = sample_id;
+        data.path.point_id = point_id;
+        data.path.t = t;
+        if (t < -1e-3f || t > 1+1e-3f) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        auto tangent = (p1 - p0);
+        auto tan_len = length(tangent);
+        if (tan_len == 0) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        normal = Vector2f{-tangent.y, tangent.x} / tan_len;
+        // length of tangent is the Jacobian of the sampling transformation
+        pdf *= path_length_pmf[sample_id] / tan_len;
+        auto ret = p0 + t * (p1 - p0);
+        if (stroke_perturb_direction != 0.f) {
+            auto r0 = stroke_radius;
+            auto r1 = stroke_radius;
+            if (path.thickness != nullptr) {
+                r0 = path.thickness[i0];
+                r1 = path.thickness[i1];
+            }
+            auto r = r0 + t * (r1 - r0);
+            ret += stroke_perturb_direction * r * normal;
+            if (stroke_perturb_direction < 0) {
+                // normal should point towards the perturb direction
+                normal = -normal;
+            }
+        }
+        return ret;
+    } else if (path.num_control_points[sample_id] == 1) {
+        // Quadratic Bezier curve
+        auto i0 = point_id;
+        auto i1 = i0 + 1;
+        auto i2 = (i0 + 2) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        auto eval = [&](float t) -> Vector2f {
+            auto tt = 1 - t;
+            return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+        };
+        data.path.base_point_id = sample_id;
+        data.path.point_id = point_id;
+        data.path.t = t;
+        if (t < -1e-3f || t > 1+1e-3f) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        auto tangent = 2 * (1 - t) * (p1 - p0) + 2 * t * (p2 - p1);
+        auto tan_len = length(tangent);
+        if (tan_len == 0) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        normal = Vector2f{-tangent.y, tangent.x} / tan_len;
+        // length of tangent is the Jacobian of the sampling transformation
+        pdf *= path_length_pmf[sample_id] / tan_len;
+        auto ret = eval(t);
+        if (stroke_perturb_direction != 0.f) {
+            auto r0 = stroke_radius;
+            auto r1 = stroke_radius;
+            auto r2 = stroke_radius;
+            if (path.thickness != nullptr) {
+                r0 = path.thickness[i0];
+                r1 = path.thickness[i1];
+                r2 = path.thickness[i2];
+            }
+            auto tt = 1 - t;
+            auto r = (tt*tt)*r0 + (2*tt*t)*r1 + (t*t)*r2;
+            ret += stroke_perturb_direction * r * normal;
+            if (stroke_perturb_direction < 0) {
+                // normal should point towards the perturb direction
+                normal = -normal;
+            }
+        }
+        return ret;
+    } else if (path.num_control_points[sample_id] == 2) {
+        // Cubic Bezier curve
+        auto i0 = point_id;
+        auto i1 = point_id + 1;
+        auto i2 = point_id + 2;
+        auto i3 = (point_id + 3) % path.num_points;
+        assert(i0 >= 0 && i2 < path.num_points);
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+        auto eval = [&](float t) -> Vector2f {
+            auto tt = 1 - t;
+            return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+        };
+        data.path.base_point_id = sample_id;
+        data.path.point_id = point_id;
+        data.path.t = t;
+        if (t < -1e-3f || t > 1+1e-3f) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        auto tangent = 3 * square(1 - t) * (p1 - p0) + 6 * (1 - t) * t * (p2 - p1) + 3 * t * t * (p3 - p2);
+        auto tan_len = length(tangent);
+        if (tan_len == 0) {
+            // return invalid sample
+            pdf = 0;
+            return Vector2f{0, 0};
+        }
+        normal = Vector2f{-tangent.y, tangent.x} / tan_len;
+        // length of tangent is the Jacobian of the sampling transformation
+        pdf *= path_length_pmf[sample_id] / tan_len;
+        auto ret = eval(t);
+        if (stroke_perturb_direction != 0.f) {
+            auto r0 = stroke_radius;
+            auto r1 = stroke_radius;
+            auto r2 = stroke_radius;
+            auto r3 = stroke_radius;
+            if (path.thickness != nullptr) {
+                r0 = path.thickness[i0];
+                r1 = path.thickness[i1];
+                r2 = path.thickness[i2];
+                r3 = path.thickness[i3];
+            }
+            auto tt = 1 - t;
+            auto r = (tt*tt*tt)*r0 + (3*tt*tt*t)*r1 + (3*tt*t*t)*r2 + (t*t*t)*r3;
+            ret += stroke_perturb_direction * r * normal;
+            if (stroke_perturb_direction < 0) {
+                // normal should point towards the perturb direction
+                normal = -normal;
+            }
+        }
+        return ret;
+    } else {
+        assert(false);
+    }
+    assert(false);
+    return Vector2f{0, 0};
+}
+
+DEVICE
+Vector2f sample_boundary(const Rect &rect,
+                         float t, Vector2f &normal,
+                         float &pdf,
+                         BoundaryData &,
+                         float stroke_perturb_direction,
+                         float stroke_radius) {
+    // Roll a dice to decide whether to sample width or height
+    auto w = rect.p_max.x - rect.p_min.x;
+    auto h = rect.p_max.y - rect.p_min.y;
+    pdf /= (2 * (w +h));
+    if (t <= w / (w + h)) {
+        // Sample width
+        // reuse t for the next dice
+        t *= (w + h) / w;
+        // Roll a dice to decide whether to sample upper width or lower width
+        if (t < 0.5f) {
+            // Sample upper width
+            normal = Vector2f{0, -1};
+            auto ret = rect.p_min + 2 * t * Vector2f{rect.p_max.x - rect.p_min.x, 0.f};
+            if (stroke_perturb_direction != 0.f) {
+                ret += stroke_perturb_direction * stroke_radius * normal;
+                if (stroke_perturb_direction < 0) {
+                    // normal should point towards the perturb direction
+                    normal = -normal;
+                }
+            }
+            return ret;
+        } else {
+            // Sample lower width
+            normal = Vector2f{0, 1};
+            auto ret = Vector2f{rect.p_min.x, rect.p_max.y} +
+                2 * (t - 0.5f) * Vector2f{rect.p_max.x - rect.p_min.x, 0.f};
+            if (stroke_perturb_direction != 0.f) {
+                ret += stroke_perturb_direction * stroke_radius * normal;
+                if (stroke_perturb_direction < 0) {
+                    // normal should point towards the perturb direction
+                    normal = -normal;
+                }
+            }
+            return ret;
+        }
+    } else {
+        // Sample height
+        // reuse t for the next dice
+        assert(h > 0);
+        t = (t - w / (w + h)) * (w + h) / h;
+        // Roll a dice to decide whether to sample left height or right height
+        if (t < 0.5f) {
+            // Sample left height
+            normal = Vector2f{-1, 0};
+            auto ret = rect.p_min + 2 * t * Vector2f{0.f, rect.p_max.y - rect.p_min.y};
+            if (stroke_perturb_direction != 0.f) {
+                ret += stroke_perturb_direction * stroke_radius * normal;
+                if (stroke_perturb_direction < 0) {
+                    // normal should point towards the perturb direction
+                    normal = -normal;
+                }
+            }
+            return ret;
+        } else {
+            // Sample right height
+            normal = Vector2f{1, 0};
+            auto ret = Vector2f{rect.p_max.x, rect.p_min.y} +
+                2 * (t - 0.5f) * Vector2f{0.f, rect.p_max.y - rect.p_min.y};
+            if (stroke_perturb_direction != 0.f) {
+                ret += stroke_perturb_direction * stroke_radius * normal;
+                if (stroke_perturb_direction < 0) {
+                    // normal should point towards the perturb direction
+                    normal = -normal;
+                }
+            }
+            return ret;
+        }
+    }
+}
+
+DEVICE
+Vector2f sample_boundary(const SceneData &scene,
+                         int shape_group_id,
+                         int shape_id,
+                         float t,
+                         Vector2f &normal,
+                         float &pdf,
+                         BoundaryData &data) {
+    const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+    const Shape &shape = scene.shapes[shape_id];
+    pdf = 1;
+    // Choose which one to sample: stroke discontinuities or fill discontinuities.
+    // TODO: we don't need to sample fill discontinuities when stroke alpha is 1 and both
+    // fill and stroke color exists
+    auto stroke_perturb = false;
+    if (shape_group.fill_color != nullptr && shape_group.stroke_color != nullptr) {
+        if (t < 0.5f) {
+            stroke_perturb = false;
+            t = 2 * t;
+            pdf = 0.5f;
+        } else {
+            stroke_perturb = true;
+            t = 2 * (t - 0.5f);
+            pdf = 0.5f;
+        }
+    } else if (shape_group.stroke_color != nullptr) {
+        stroke_perturb = true;
+    }
+    data.is_stroke = stroke_perturb;
+    auto stroke_perturb_direction = 0.f;
+    if (stroke_perturb) {
+        if (t < 0.5f) {
+            stroke_perturb_direction = -1.f;
+            t = 2 * t;
+            pdf *= 0.5f;
+        } else {
+            stroke_perturb_direction = 1.f;
+            t = 2 * (t - 0.5f);
+            pdf *= 0.5f;
+        }
+    }
+    switch (shape.type) {
+        case ShapeType::Circle:
+            return sample_boundary(
+                *(const Circle *)shape.ptr, t, normal, pdf, data, stroke_perturb_direction, shape.stroke_width);
+        case ShapeType::Ellipse:
+            return sample_boundary(
+                *(const Ellipse *)shape.ptr, t, normal, pdf, data, stroke_perturb_direction, shape.stroke_width);
+        case ShapeType::Path:
+            return sample_boundary(
+                *(const Path *)shape.ptr,
+                scene.path_length_cdf[shape_id],
+                scene.path_length_pmf[shape_id],
+                scene.path_point_id_map[shape_id],
+                scene.shapes_length[shape_id],
+                t,
+                normal,
+                pdf,
+                data,
+                stroke_perturb_direction,
+                shape.stroke_width);
+        case ShapeType::Rect:
+            return sample_boundary(
+                *(const Rect *)shape.ptr, t, normal, pdf, data, stroke_perturb_direction, shape.stroke_width);
+    }
+    assert(false);
+    return Vector2f{};
+}
+
diff --git a/scene.cpp b/scene.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1799c962146fbca169594e73f304daa76aa36d0b
--- /dev/null
+++ b/scene.cpp
@@ -0,0 +1,1035 @@
+#include "scene.h"
+#include "aabb.h"
+#include "cuda_utils.h"
+#include "filter.h"
+#include "shape.h"
+#include <numeric>
+#include <algorithm>
+#include <cstring>
+#include <chrono>
+#include <cstddef>
+
+size_t align(size_t s) {
+    auto a = alignof(std::max_align_t);
+    return ((s + a - 1) / a) * a;
+}
+
+template <typename T>
+void allocate(bool use_gpu, T **p) {
+    if (use_gpu) {
+#ifdef __NVCC__
+        checkCuda(cudaMallocManaged(p, sizeof(T)));
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    } else {
+        *p = (T*)malloc(sizeof(T));
+    }
+}
+
+template <typename T>
+void allocate(bool use_gpu, size_t size, T **p) {
+    if (use_gpu) {
+#ifdef __NVCC__
+        checkCuda(cudaMallocManaged(p, size * sizeof(T)));
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    } else {
+        *p = (T*)malloc(size * sizeof(T));
+    }
+}
+
+void copy_and_init_shapes(Scene &scene,
+                          const std::vector<const Shape *> &shape_list) {
+    for (int shape_id = 0; shape_id < scene.num_shapes; shape_id++) {
+        switch (shape_list[shape_id]->type) {
+            case ShapeType::Circle: {
+                Circle *p = (Circle *)scene.shapes[shape_id].ptr;
+                const Circle *p_ = (const Circle*)(shape_list[shape_id]->ptr);
+                *p = *p_;
+                Circle *d_p = (Circle *)scene.d_shapes[shape_id].ptr;
+                d_p->radius = 0;
+                d_p->center = Vector2f{0, 0};
+                break;
+            } case ShapeType::Ellipse: {
+                Ellipse *p = (Ellipse *)scene.shapes[shape_id].ptr;
+                const Ellipse *p_ = (const Ellipse*)(shape_list[shape_id]->ptr);
+                *p = *p_;
+                Ellipse *d_p = (Ellipse *)scene.d_shapes[shape_id].ptr;
+                d_p->radius = Vector2f{0, 0};
+                d_p->center = Vector2f{0, 0};
+                break;
+            } case ShapeType::Path: {
+                Path *p = (Path *)scene.shapes[shape_id].ptr;
+                const Path *p_ = (const Path*)(shape_list[shape_id]->ptr);
+                p->num_points = p_->num_points;
+                p->num_base_points = p_->num_base_points;
+                for (int i = 0; i < p_->num_base_points; i++) {
+                    p->num_control_points[i] = p_->num_control_points[i];
+                }
+                for (int i = 0; i < 2 * p_->num_points; i++) {
+                    p->points[i] = p_->points[i];
+                }
+                p->is_closed = p_->is_closed;
+                p->use_distance_approx = p_->use_distance_approx;
+                Path *d_p = (Path *)scene.d_shapes[shape_id].ptr;
+                d_p->num_points = p_->num_points;
+                d_p->num_base_points = p_->num_base_points;
+                for (int i = 0; i < 2 * p_->num_points; i++) {
+                    d_p->points[i] = 0;
+                }
+                d_p->is_closed = p_->is_closed;
+                if (p_->thickness != nullptr) {
+                    for (int i = 0; i < p_->num_points; i++) {
+                        p->thickness[i] = p_->thickness[i];
+                        d_p->thickness[i] = 0;
+                    }
+                }
+                d_p->use_distance_approx = p_->use_distance_approx;
+                break;
+            } case ShapeType::Rect: {
+                Rect *p = (Rect *)scene.shapes[shape_id].ptr;
+                const Rect *p_ = (const Rect*)(shape_list[shape_id]->ptr);
+                *p = *p_;
+                Rect *d_p = (Rect *)scene.d_shapes[shape_id].ptr;
+                d_p->p_min = Vector2f{0, 0};
+                d_p->p_max = Vector2f{0, 0};
+                break;
+            } default: {
+                assert(false);
+                break;
+            }
+        }
+        scene.shapes[shape_id].type = shape_list[shape_id]->type;
+        scene.shapes[shape_id].stroke_width = shape_list[shape_id]->stroke_width;
+        scene.d_shapes[shape_id].type = shape_list[shape_id]->type;
+        scene.d_shapes[shape_id].stroke_width = 0;
+    }
+}
+
+std::vector<float>
+compute_shape_length(const std::vector<const Shape *> &shape_list) {
+    int num_shapes = (int)shape_list.size();
+    std::vector<float> shape_length_list(num_shapes, 0.f);
+    for (int shape_id = 0; shape_id < num_shapes; shape_id++) {
+        auto shape_length = 0.f;
+        switch (shape_list[shape_id]->type) {
+            case ShapeType::Circle: {
+                const Circle *p_ = (const Circle*)(shape_list[shape_id]->ptr);
+                shape_length += float(2.f * M_PI) * p_->radius;
+                break;
+            } case ShapeType::Ellipse: {
+                const Ellipse *p_ = (const Ellipse*)(shape_list[shape_id]->ptr);
+                // https://en.wikipedia.org/wiki/Ellipse#Circumference
+                // Ramanujan's ellipse circumference approximation
+                auto a = p_->radius.x;
+                auto b = p_->radius.y;
+                shape_length += float(M_PI) * (3 * (a + b) - sqrt((3 * a + b) * (a + 3 * b)));
+                break;
+            } case ShapeType::Path: {
+                const Path *p_ = (const Path*)(shape_list[shape_id]->ptr);
+                auto length = 0.f;
+                auto point_id = 0;
+                for (int i = 0; i < p_->num_base_points; i++) {
+                    if (p_->num_control_points[i] == 0) {
+                        // Straight line
+                        auto i0 = point_id;
+                        assert(i0 < p_->num_points);
+                        auto i1 = (i0 + 1) % p_->num_points;
+                        point_id += 1;
+                        auto p0 = Vector2f{p_->points[2 * i0], p_->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p_->points[2 * i1], p_->points[2 * i1 + 1]};
+                        length += distance(p1, p0);
+                    } else if (p_->num_control_points[i] == 1) {
+                        // Quadratic Bezier curve
+                        auto i0 = point_id;
+                        auto i1 = i0 + 1;
+                        auto i2 = (i0 + 2) % p_->num_points;
+                        point_id += 2;
+                        auto p0 = Vector2f{p_->points[2 * i0], p_->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p_->points[2 * i1], p_->points[2 * i1 + 1]};
+                        auto p2 = Vector2f{p_->points[2 * i2], p_->points[2 * i2 + 1]};
+                        auto eval = [&](float t) -> Vector2f {
+                            auto tt = 1 - t;
+                            return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+                        };
+                        // We use 3-point samples to approximate the length
+                        auto v0 = p0;
+                        auto v1 = eval(0.5f);
+                        auto v2 = p2;
+                        length += distance(v1, v0) + distance(v1, v2);
+                    } else if (p_->num_control_points[i] == 2) {
+                        // Cubic Bezier curve
+                        auto i0 = point_id;
+                        auto i1 = i0 + 1;
+                        auto i2 = i0 + 2;
+                        auto i3 = (i0 + 3) % p_->num_points;
+                        point_id += 3;
+                        auto p0 = Vector2f{p_->points[2 * i0], p_->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p_->points[2 * i1], p_->points[2 * i1 + 1]};
+                        auto p2 = Vector2f{p_->points[2 * i2], p_->points[2 * i2 + 1]};
+                        auto p3 = Vector2f{p_->points[2 * i3], p_->points[2 * i3 + 1]};
+                        auto eval = [&](float t) -> Vector2f {
+                            auto tt = 1 - t;
+                            return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+                        };
+                        // We use 4-point samples to approximate the length
+                        auto v0 = p0;
+                        auto v1 = eval(1.f/3.f);
+                        auto v2 = eval(2.f/3.f);
+                        auto v3 = p3;
+                        length += distance(v1, v0) + distance(v1, v2) + distance(v2, v3);
+                    } else {
+                        assert(false);
+                    }
+                }
+                assert(isfinite(length));
+                shape_length += length;
+                break;
+            } case ShapeType::Rect: {
+                const Rect *p_ = (const Rect*)(shape_list[shape_id]->ptr);
+                shape_length += 2 * (p_->p_max.x - p_->p_min.x + p_->p_max.y - p_->p_min.y);
+                break;
+            } default: {
+                assert(false);
+                break;
+            }
+        }
+        assert(isfinite(shape_length));
+        shape_length_list[shape_id] = shape_length;
+    }
+    return shape_length_list;
+}
+
+void build_shape_cdfs(Scene &scene,
+                      const std::vector<const ShapeGroup *> &shape_group_list,
+                      const std::vector<float> &shape_length_list) {
+    int sample_id = 0;
+    for (int shape_group_id = 0; shape_group_id < (int)shape_group_list.size(); shape_group_id++) {
+        const ShapeGroup *shape_group = shape_group_list[shape_group_id];
+        for (int i = 0; i < shape_group->num_shapes; i++) {
+            int shape_id = shape_group->shape_ids[i];
+            float length = shape_length_list[shape_id];
+            scene.sample_shape_id[sample_id] = shape_id;
+            if (sample_id == 0) {
+                scene.sample_shapes_cdf[sample_id] = length;
+            } else {
+                scene.sample_shapes_cdf[sample_id] = length +
+                    scene.sample_shapes_cdf[sample_id - 1];
+            }
+            assert(isfinite(length));
+            scene.sample_shapes_pmf[sample_id] = length;
+            scene.sample_group_id[sample_id] = shape_group_id;
+            sample_id++;
+        }
+    }
+    assert(sample_id == scene.num_total_shapes);
+    auto normalization = scene.sample_shapes_cdf[scene.num_total_shapes - 1];
+    if (normalization <= 0) {
+        char buf[256];
+        sprintf(buf, "The total length of the shape boundaries in the scene is equal or less than 0. Length = %f", normalization);
+        throw std::runtime_error(buf);
+    }
+    if (!isfinite(normalization)) {
+        char buf[256];
+        sprintf(buf, "The total length of the shape boundaries in the scene is not a number. Length = %f", normalization);
+        throw std::runtime_error(buf);
+    }
+    assert(normalization > 0);
+    for (int sample_id = 0; sample_id < scene.num_total_shapes; sample_id++) {
+        scene.sample_shapes_cdf[sample_id] /= normalization;
+        scene.sample_shapes_pmf[sample_id] /= normalization;
+    }
+}
+
+void build_path_cdfs(Scene &scene,
+                     const std::vector<const Shape *> &shape_list,
+                     const std::vector<float> &shape_length_list) {
+    for (int shape_id = 0; shape_id < scene.num_shapes; shape_id++) {
+        if (shape_list[shape_id]->type == ShapeType::Path) {
+            const Path &path = shape_list[shape_id]->as_path();
+            float *pmf = scene.path_length_pmf[shape_id];
+            float *cdf = scene.path_length_cdf[shape_id];
+            int *point_id_map = scene.path_point_id_map[shape_id];
+            auto path_length = shape_length_list[shape_id];
+            auto inv_length = 1.f / path_length;
+            auto point_id = 0;
+            for (int i = 0; i < path.num_base_points; i++) {
+                point_id_map[i] = point_id;
+                if (path.num_control_points[i] == 0) {
+                    // Straight line
+                    auto i0 = point_id;
+                    auto i1 = (i0 + 1) % path.num_points;
+                    point_id += 1;
+                    auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                    auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                    auto d = distance(p0, p1) * inv_length;
+                    pmf[i] = d;
+                    if (i == 0) {
+                        cdf[i] = d;
+                    } else {
+                        cdf[i] = d + cdf[i - 1];
+                    }
+                } else if (path.num_control_points[i] == 1) {
+                    // Quadratic Bezier curve
+                    auto i0 = point_id;
+                    auto i1 = i0 + 1;
+                    auto i2 = (i0 + 2) % path.num_points;
+                    point_id += 2;
+                    auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                    auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                    auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                    auto eval = [&](float t) -> Vector2f {
+                        auto tt = 1 - t;
+                        return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+                    };
+                    // We use 3-point samples to approximate the length
+                    auto v0 = p0;
+                    auto v1 = eval(0.5f);
+                    auto v2 = p2;
+                    auto d = (distance(v0, v1) + distance(v1, v2)) * inv_length;
+                    pmf[i] = d;
+                    if (i == 0) {
+                        cdf[i] = d;
+                    } else {
+                        cdf[i] = d + cdf[i - 1];
+                    }
+                } else if (path.num_control_points[i] == 2) {
+                    // Cubic Bezier curve
+                    auto i0 = point_id;
+                    auto i1 = point_id + 1;
+                    auto i2 = point_id + 2;
+                    auto i3 = (point_id + 3) % path.num_points;
+                    point_id += 3;
+                    auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                    auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                    auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                    auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+                    auto eval = [&](float t) -> Vector2f {
+                        auto tt = 1 - t;
+                        return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+                    };
+                    // We use 4-point samples to approximate the length
+                    auto v0 = p0;
+                    auto v1 = eval(1.f/3.f);
+                    auto v2 = eval(2.f/3.f);
+                    auto v3 = p3;
+                    auto d = (distance(v1, v0) + distance(v1, v2) + distance(v2, v3)) * inv_length;
+                    pmf[i] = d;
+                    if (i == 0) {
+                        cdf[i] = d;
+                    } else {
+                        cdf[i] = d + cdf[i - 1];
+                    }
+                } else {
+                    assert(false);
+                }
+            }
+        }
+    }
+}
+
+void copy_and_init_shape_groups(Scene &scene,
+                                const std::vector<const ShapeGroup *> &shape_group_list) {
+    for (int group_id = 0; group_id < scene.num_shape_groups; group_id++) {
+        const ShapeGroup *shape_group = shape_group_list[group_id];
+        auto copy_and_init_color = [&](const ColorType &color_type, void *color_ptr, void *target_ptr, void *d_target_ptr) {
+            switch (color_type) {
+                case ColorType::Constant: {
+                    Constant *c = (Constant*)target_ptr;
+                    Constant *d_c = (Constant*)d_target_ptr;
+                    const Constant *c_ = (const Constant*)color_ptr;
+                    *c = *c_;
+                    d_c->color = Vector4{0, 0, 0, 0};
+                    break;
+                } case ColorType::LinearGradient: {
+                    LinearGradient *c = (LinearGradient*)target_ptr;
+                    LinearGradient *d_c = (LinearGradient*)d_target_ptr;
+                    const LinearGradient *c_ = (const LinearGradient*)color_ptr;
+                    c->begin = c_->begin;
+                    c->end = c_->end;
+                    c->num_stops = c_->num_stops;
+                    for (int i = 0; i < c_->num_stops; i++) {
+                        c->stop_offsets[i] = c_->stop_offsets[i];
+                    }
+                    for (int i = 0; i < 4 * c_->num_stops; i++) {
+                        c->stop_colors[i] = c_->stop_colors[i];
+                    }
+                    d_c->begin = Vector2f{0, 0};
+                    d_c->end = Vector2f{0, 0};
+                    d_c->num_stops = c_->num_stops;
+                    for (int i = 0; i < c_->num_stops; i++) {
+                        d_c->stop_offsets[i] = 0;
+                    }
+                    for (int i = 0; i < 4 * c_->num_stops; i++) {
+                        d_c->stop_colors[i] = 0;
+                    }
+                    break;
+                } case ColorType::RadialGradient: {
+                    RadialGradient *c = (RadialGradient*)target_ptr;
+                    RadialGradient *d_c = (RadialGradient*)d_target_ptr;
+                    const RadialGradient *c_ = (const RadialGradient*)color_ptr;
+                    c->center = c_->center;
+                    c->radius = c_->radius;
+                    c->num_stops = c_->num_stops;
+                    for (int i = 0; i < c_->num_stops; i++) {
+                        c->stop_offsets[i] = c_->stop_offsets[i];
+                    }
+                    for (int i = 0; i < 4 * c_->num_stops; i++) {
+                        c->stop_colors[i] = c_->stop_colors[i];
+                    }
+                    d_c->center = Vector2f{0, 0};
+                    d_c->radius = Vector2f{0, 0};
+                    d_c->num_stops = c_->num_stops;
+                    for (int i = 0; i < c_->num_stops; i++) {
+                        d_c->stop_offsets[i] = 0;
+                    }
+                    for (int i = 0; i < 4 * c_->num_stops; i++) {
+                        d_c->stop_colors[i] = 0;
+                    }
+                    break;
+                } default: {
+                    assert(false);
+                }
+            }
+        };
+        for (int i = 0; i < shape_group->num_shapes; i++) {
+            scene.shape_groups[group_id].shape_ids[i] = shape_group->shape_ids[i];
+        }
+        scene.shape_groups[group_id].num_shapes = shape_group->num_shapes;
+        scene.shape_groups[group_id].use_even_odd_rule = shape_group->use_even_odd_rule;
+        scene.shape_groups[group_id].canvas_to_shape = shape_group->canvas_to_shape;
+        scene.shape_groups[group_id].shape_to_canvas = shape_group->shape_to_canvas;
+        scene.d_shape_groups[group_id].shape_ids = nullptr;
+        scene.d_shape_groups[group_id].num_shapes = shape_group->num_shapes;
+        scene.d_shape_groups[group_id].use_even_odd_rule = shape_group->use_even_odd_rule;
+        scene.d_shape_groups[group_id].canvas_to_shape = Matrix3x3f{};
+        scene.d_shape_groups[group_id].shape_to_canvas = Matrix3x3f{};
+
+        scene.shape_groups[group_id].fill_color_type = shape_group->fill_color_type;
+        scene.d_shape_groups[group_id].fill_color_type = shape_group->fill_color_type;
+        if (shape_group->fill_color != nullptr) {
+            copy_and_init_color(shape_group->fill_color_type,
+                                shape_group->fill_color,
+                                scene.shape_groups[group_id].fill_color,
+                                scene.d_shape_groups[group_id].fill_color);
+        }
+        scene.shape_groups[group_id].stroke_color_type = shape_group->stroke_color_type;
+        scene.d_shape_groups[group_id].stroke_color_type = shape_group->stroke_color_type;
+        if (shape_group->stroke_color != nullptr) {
+            copy_and_init_color(shape_group->stroke_color_type,
+                                shape_group->stroke_color,
+                                scene.shape_groups[group_id].stroke_color,
+                                scene.d_shape_groups[group_id].stroke_color);
+        }
+    }
+}
+
+DEVICE uint32_t morton2D(const Vector2f &p, int canvas_width, int canvas_height) {
+    auto scene_bounds = Vector2f{canvas_width, canvas_height};
+    auto pp = p / scene_bounds;
+    TVector2<uint32_t> pp_i{pp.x * 1023, pp.y * 1023};
+    return (expand_bits(pp_i.x) << 1u) |
+           (expand_bits(pp_i.y) << 0u);
+}
+
+template <bool sort>
+void build_bvh(const Scene &scene, BVHNode *nodes, int num_primitives) {
+    auto bvh_size = 2 * num_primitives - 1;
+    if (bvh_size > 1) {
+        if (sort) {
+            // Sort by Morton code
+            std::sort(nodes, nodes + num_primitives,
+                [&] (const BVHNode &n0, const BVHNode &n1) {
+                    auto p0 = 0.5f * (n0.box.p_min + n0.box.p_max);
+                    auto p1 = 0.5f * (n1.box.p_min + n1.box.p_max);
+                    auto m0 = morton2D(p0, scene.canvas_width, scene.canvas_height);
+                    auto m1 = morton2D(p1, scene.canvas_width, scene.canvas_height);
+                    return m0 < m1;
+            });
+        }
+        for (int i = num_primitives; i < bvh_size; i++) {
+            nodes[i] = BVHNode{-1, -1, AABB{}, 0.f};
+        }
+        int prev_beg = 0;
+        int prev_end = num_primitives;
+        // For handling odd number of nodes at a level
+        int leftover = prev_end % 2 == 0 ? -1 : prev_end - 1;
+        while (prev_end - prev_beg >= 1 || leftover != -1) {
+            int length = (prev_end - prev_beg) / 2;
+            if ((prev_end - prev_beg) % 2 == 1 && leftover != -1 &&
+                    leftover != prev_end - 1) {
+                length += 1;
+            }
+            for (int i = 0; i < length; i++) {
+                BVHNode node;
+                node.child0 = prev_beg + 2 * i;
+                node.child1 = prev_beg + 2 * i + 1;
+                if (node.child1 >= prev_end) {
+                    assert(leftover != -1);
+                    node.child1 = leftover;
+                    leftover = -1;
+                }
+                AABB child0_box = nodes[node.child0].box;
+                AABB child1_box = nodes[node.child1].box;
+                node.box = merge(child0_box, child1_box);
+                node.max_radius = std::max(nodes[node.child0].max_radius,
+                                           nodes[node.child1].max_radius);
+                nodes[prev_end + i] = node;
+            }
+            if (length == 1 && leftover == -1) {
+                break;
+            }
+            prev_beg = prev_end;
+            prev_end = prev_beg + length;
+            if (length % 2 == 1 && leftover == -1) {
+                leftover = prev_end - 1;
+            }
+        }
+    }
+    assert(nodes[2 * num_primitives - 2].child0 != -1);
+}
+
+void compute_bounding_boxes(Scene &scene,
+                            const std::vector<const Shape *> &shape_list,
+                            const std::vector<const ShapeGroup *> &shape_group_list) {
+    for (int shape_id = 0; shape_id < scene.num_shapes; shape_id++) {
+        switch (shape_list[shape_id]->type) {
+            case ShapeType::Circle: {
+                const Circle *p = (const Circle*)(shape_list[shape_id]->ptr);
+                scene.shapes_bbox[shape_id] = AABB{p->center - p->radius,
+                                                   p->center + p->radius};
+                break;
+            } case ShapeType::Ellipse: {
+                const Ellipse *p = (const Ellipse*)(shape_list[shape_id]->ptr);
+                scene.shapes_bbox[shape_id] = AABB{p->center - p->radius,
+                                                   p->center + p->radius};
+                break;
+            } case ShapeType::Path: {
+                const Path *p = (const Path*)(shape_list[shape_id]->ptr);
+                AABB box;
+                if (p->num_points > 0) {
+                    box = AABB{Vector2f{p->points[0], p->points[1]},
+                               Vector2f{p->points[0], p->points[1]}};
+                }
+                for (int i = 1; i < p->num_points; i++) {
+                    box = merge(box, Vector2f{p->points[2 * i], p->points[2 * i + 1]});
+                }
+                scene.shapes_bbox[shape_id] = box;
+                std::vector<AABB> boxes(p->num_base_points);
+                std::vector<float> thickness(p->num_base_points);
+                std::vector<int> first_point_id(p->num_base_points);
+                auto r = shape_list[shape_id]->stroke_width;
+                auto point_id = 0;
+                for (int i = 0; i < p->num_base_points; i++) {
+                    first_point_id[i] = point_id;
+                    if (p->num_control_points[i] == 0) {
+                        // Straight line
+                        auto i0 = point_id;
+                        auto i1 = (i0 + 1) % p->num_points;
+                        point_id += 1;
+                        auto p0 = Vector2f{p->points[2 * i0], p->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p->points[2 * i1], p->points[2 * i1 + 1]};
+                        boxes[i] = AABB();
+                        boxes[i] = merge(boxes[i], p0);
+                        boxes[i] = merge(boxes[i], p1);
+                        auto r0 = r;
+                        auto r1 = r;
+                        // override radius if path has thickness
+                        if (p->thickness != nullptr) {
+                            r0 = p->thickness[i0];
+                            r1 = p->thickness[i1];
+                        }
+                        thickness[i] = max(r0, r1);
+                    } else if (p->num_control_points[i] == 1) {
+                        // Quadratic Bezier curve
+                        auto i0 = point_id;
+                        auto i1 = i0 + 1;
+                        auto i2 = (i0 + 2) % p->num_points;
+                        point_id += 2;
+                        auto p0 = Vector2f{p->points[2 * i0], p->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p->points[2 * i1], p->points[2 * i1 + 1]};
+                        auto p2 = Vector2f{p->points[2 * i2], p->points[2 * i2 + 1]};
+                        boxes[i] = AABB();
+                        boxes[i] = merge(boxes[i], p0);
+                        boxes[i] = merge(boxes[i], p1);
+                        boxes[i] = merge(boxes[i], p2);
+                        auto r0 = r;
+                        auto r1 = r;
+                        auto r2 = r;
+                        // override radius if path has thickness
+                        if (p->thickness != nullptr) {
+                            r0 = p->thickness[i0];
+                            r1 = p->thickness[i1];
+                            r2 = p->thickness[i2];
+                        }
+                        thickness[i] = max(max(r0, r1), r2);
+                    } else if (p->num_control_points[i] == 2) {
+                        // Cubic Bezier curve
+                        auto i0 = point_id;
+                        auto i1 = i0 + 1;
+                        auto i2 = i0 + 2;
+                        auto i3 = (i0 + 3) % p->num_points;
+                        point_id += 3;
+                        auto p0 = Vector2f{p->points[2 * i0], p->points[2 * i0 + 1]};
+                        auto p1 = Vector2f{p->points[2 * i1], p->points[2 * i1 + 1]};
+                        auto p2 = Vector2f{p->points[2 * i2], p->points[2 * i2 + 1]};
+                        auto p3 = Vector2f{p->points[2 * i3], p->points[2 * i3 + 1]};
+                        boxes[i] = AABB();
+                        boxes[i] = merge(boxes[i], p0);
+                        boxes[i] = merge(boxes[i], p1);
+                        boxes[i] = merge(boxes[i], p2);
+                        boxes[i] = merge(boxes[i], p3);
+                        auto r0 = r;
+                        auto r1 = r;
+                        auto r2 = r;
+                        auto r3 = r;
+                        // override radius if path has thickness
+                        if (p->thickness != nullptr) {
+                            r0 = p->thickness[i0];
+                            r1 = p->thickness[i1];
+                            r2 = p->thickness[i2];
+                            r3 = p->thickness[i3];
+                        }
+                        thickness[i] = max(max(max(r0, r1), r2), r3);
+                    } else {
+                        assert(false);
+                    }
+                }
+                // Sort the boxes by y
+                std::vector<int> idx(boxes.size());
+                std::iota(idx.begin(), idx.end(), 0);
+                std::sort(idx.begin(), idx.end(), [&](int i0, int i1) {
+                    const AABB &b0 = boxes[i0];
+                    const AABB &b1 = boxes[i1];
+                    auto b0y = 0.5f * (b0.p_min.y + b0.p_max.y);
+                    auto b1y = 0.5f * (b1.p_min.y + b1.p_max.y);
+                    return b0y < b1y;
+                });
+                BVHNode *nodes = scene.path_bvhs[shape_id];
+                for (int i = 0; i < (int)idx.size(); i++) {
+                    nodes[i] = BVHNode{idx[i],
+                                       -(first_point_id[idx[i]]+1),
+                                       boxes[idx[i]],
+                                       thickness[idx[i]]};
+                }
+                build_bvh<false /*sort*/>(scene, nodes, boxes.size());
+                break;
+            } case ShapeType::Rect: {
+                const Rect *p = (const Rect*)(shape_list[shape_id]->ptr);
+                scene.shapes_bbox[shape_id] = AABB{p->p_min, p->p_max};
+                break;
+            } default: {
+                assert(false);
+                break;
+            }
+        }
+    }
+    
+    for (int shape_group_id = 0; shape_group_id < (int)shape_group_list.size(); shape_group_id++) {
+        const ShapeGroup *shape_group = shape_group_list[shape_group_id];
+        // Build a BVH for each shape group
+        BVHNode *nodes = scene.shape_groups_bvh_nodes[shape_group_id];
+        for (int i = 0; i < shape_group->num_shapes; i++) {
+            auto shape_id = shape_group->shape_ids[i];
+            auto r = shape_group->stroke_color == nullptr ? 0 : shape_list[shape_id]->stroke_width;
+            nodes[i] = BVHNode{shape_id,
+                               -1,
+                               scene.shapes_bbox[shape_id],
+                               r};
+        }
+        build_bvh<true /*sort*/>(scene, nodes, shape_group->num_shapes);
+    }
+
+    BVHNode *nodes = scene.bvh_nodes;
+    for (int shape_group_id = 0; shape_group_id < (int)shape_group_list.size(); shape_group_id++) {
+        const ShapeGroup *shape_group = shape_group_list[shape_group_id];
+        auto max_radius = shape_list[shape_group->shape_ids[0]]->stroke_width;
+        if (shape_list[shape_group->shape_ids[0]]->type == ShapeType::Path) {
+            const Path *p = (const Path*)(shape_list[shape_group->shape_ids[0]]->ptr);
+            if (p->thickness != nullptr) {
+                const BVHNode *nodes = scene.path_bvhs[shape_group->shape_ids[0]];
+                max_radius = nodes[0].max_radius;
+            }
+        }
+        for (int i = 1; i < shape_group->num_shapes; i++) {
+            auto shape_id = shape_group->shape_ids[i];
+            auto shape = shape_list[shape_id];
+            auto r = shape->stroke_width;
+            if (shape->type == ShapeType::Path) {
+                const Path *p = (const Path*)(shape_list[shape_id]->ptr);
+                if (p->thickness != nullptr) {
+                    const BVHNode *nodes = scene.path_bvhs[shape_id];
+                    r = nodes[0].max_radius;
+                }
+            }
+            max_radius = std::max(max_radius, r);
+        }
+        // Fetch group bbox from BVH
+        auto bbox = scene.shape_groups_bvh_nodes[shape_group_id][2 * shape_group->num_shapes - 2].box;
+        // Transform box from local to world space
+        nodes[shape_group_id].child0 = shape_group_id;
+        nodes[shape_group_id].child1 = -1;
+        nodes[shape_group_id].box = transform(shape_group->shape_to_canvas, bbox);
+        if (shape_group->stroke_color == nullptr) {
+            nodes[shape_group_id].max_radius = 0;
+        } else {
+            nodes[shape_group_id].max_radius = max_radius;
+        }
+    }
+    build_bvh<true /*sort*/>(scene, nodes, shape_group_list.size());
+}
+
+template <bool alloc_mode>
+size_t allocate_buffers(Scene &scene,
+                        const std::vector<const Shape *> &shape_list,
+                        const std::vector<const ShapeGroup *> &shape_group_list) {
+    auto num_shapes = shape_list.size();
+    auto num_shape_groups = shape_group_list.size();
+
+    size_t buffer_size = 0;
+    if (alloc_mode) scene.shapes = (Shape*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(Shape) * num_shapes);
+    if (alloc_mode) scene.d_shapes = (Shape*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(Shape) * num_shapes); 
+    if (alloc_mode) scene.shape_groups = (ShapeGroup*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(ShapeGroup) * num_shape_groups);
+    if (alloc_mode) scene.d_shape_groups = (ShapeGroup*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(ShapeGroup) * num_shape_groups);
+    if (alloc_mode) scene.sample_shapes_cdf = (float*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(float) * scene.num_total_shapes);
+    if (alloc_mode) scene.sample_shapes_pmf = (float*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(float) * scene.num_total_shapes);
+    if (alloc_mode) scene.sample_shape_id = (int*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(int) * scene.num_total_shapes);
+    if (alloc_mode) scene.sample_group_id = (int*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(int) * scene.num_total_shapes);
+    if (alloc_mode) scene.shapes_length = (float*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(float) * num_shapes);
+    if (alloc_mode) scene.path_length_cdf = (float**)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(float*) * num_shapes);
+    if (alloc_mode) scene.path_length_pmf = (float**)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(float*) * num_shapes);
+    if (alloc_mode) scene.path_point_id_map = (int**)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(int*) * num_shapes);
+    if (alloc_mode) scene.filter = (Filter*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(Filter));
+    if (alloc_mode) scene.d_filter = (DFilter*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(DFilter));
+    if (alloc_mode) scene.shapes_bbox = (AABB*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(AABB) * num_shapes);
+    if (alloc_mode) scene.path_bvhs = (BVHNode**)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(BVHNode*) * num_shapes);
+    if (alloc_mode) scene.shape_groups_bvh_nodes = (BVHNode**)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(BVHNode*) * num_shape_groups);
+    if (alloc_mode) scene.bvh_nodes = (BVHNode*)&scene.buffer[buffer_size];
+    buffer_size += align(sizeof(BVHNode) * (2 * num_shape_groups - 1));
+
+    if (alloc_mode) {
+        for (int i = 0; i < num_shapes; i++) {
+            scene.path_length_cdf[i] = nullptr;
+            scene.path_length_pmf[i] = nullptr;
+            scene.path_point_id_map[i] = nullptr;
+            scene.path_bvhs[i] = nullptr;
+        }
+    }
+
+    for (int shape_id = 0; shape_id < scene.num_shapes; shape_id++) {
+        switch (shape_list[shape_id]->type) {
+            case ShapeType::Circle: {
+                if (alloc_mode) scene.shapes[shape_id].ptr = (Circle*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Circle)); // scene.shapes[shape_id].ptr
+                if (alloc_mode) scene.d_shapes[shape_id].ptr = (Circle*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Circle)); // scene.d_shapes[shape_id].ptr
+                break;
+            } case ShapeType::Ellipse: {
+                if (alloc_mode) scene.shapes[shape_id].ptr = (Ellipse*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Ellipse)); // scene.shapes[shape_id].ptr
+                if (alloc_mode) scene.d_shapes[shape_id].ptr = (Ellipse*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Ellipse)); // scene.d_shapes[shape_id].ptr
+                break;
+            } case ShapeType::Path: {
+                if (alloc_mode) scene.shapes[shape_id].ptr = (Path*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Path)); // scene.shapes[shape_id].ptr
+                if (alloc_mode) scene.d_shapes[shape_id].ptr = (Path*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Path)); // scene.d_shapes[shape_id].ptr
+
+                const Path *p_ = (const Path*)(shape_list[shape_id]->ptr);
+                Path *p = nullptr, *d_p = nullptr;
+                if (alloc_mode) p = (Path*)scene.shapes[shape_id].ptr;
+                if (alloc_mode) d_p = (Path*)scene.d_shapes[shape_id].ptr; 
+                if (alloc_mode) p->num_control_points = (int*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(int) * p_->num_base_points); // p->num_control_points
+                if (alloc_mode) p->points = (float*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(float) * (2 * p_->num_points)); // p->points
+                if (alloc_mode) d_p->points = (float*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(float) * (2 * p_->num_points)); // d_p->points
+                if (p_->thickness != nullptr) {
+                    if (alloc_mode) p->thickness = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * p_->num_points); // p->thickness
+                    if (alloc_mode) d_p->thickness = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * p_->num_points); // d_p->thickness
+                } else {
+                    if (alloc_mode) p->thickness = nullptr;
+                    if (alloc_mode) d_p->thickness = nullptr;
+                }
+                if (alloc_mode) scene.path_length_pmf[shape_id] = (float*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(float) * p_->num_base_points); // scene.path_length_pmf
+                if (alloc_mode) scene.path_length_cdf[shape_id] = (float*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(float) * p_->num_base_points); // scene.path_length_cdf
+                if (alloc_mode) scene.path_point_id_map[shape_id] = (int*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(int) * p_->num_base_points); // scene.path_point_id_map
+                if (alloc_mode) scene.path_bvhs[shape_id] = (BVHNode*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(BVHNode) * (2 * p_->num_base_points - 1));
+                break;
+            } case ShapeType::Rect: {
+                if (alloc_mode) scene.shapes[shape_id].ptr = (Ellipse*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Rect)); // scene.shapes[shape_id].ptr
+                if (alloc_mode) scene.d_shapes[shape_id].ptr = (Ellipse*)&scene.buffer[buffer_size];
+                buffer_size += align(sizeof(Rect)); // scene.d_shapes[shape_id].ptr
+                break;
+            } default: {
+                assert(false);
+                break;
+            }
+        }
+    }
+
+    for (int group_id = 0; group_id < scene.num_shape_groups; group_id++) {
+        const ShapeGroup *shape_group = shape_group_list[group_id];
+        if (shape_group->fill_color != nullptr) {
+            switch (shape_group->fill_color_type) {
+                case ColorType::Constant: {
+                    if (alloc_mode) scene.shape_groups[group_id].fill_color = (Constant*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(Constant)); // color
+                    if (alloc_mode) scene.d_shape_groups[group_id].fill_color = (Constant*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(Constant)); // d_color
+                    break;
+                } case ColorType::LinearGradient: {
+                    if (alloc_mode) scene.shape_groups[group_id].fill_color = (LinearGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(LinearGradient)); // color
+                    if (alloc_mode) scene.d_shape_groups[group_id].fill_color = (LinearGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(LinearGradient)); // d_color
+
+                    const LinearGradient *c_ = (const LinearGradient *)shape_group->fill_color;
+                    LinearGradient *c = nullptr, *d_c = nullptr;
+                    if (alloc_mode) c = (LinearGradient *)scene.shape_groups[group_id].fill_color;
+                    if (alloc_mode) d_c = (LinearGradient *)scene.d_shape_groups[group_id].fill_color;
+                    if (alloc_mode) c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // c->stop_offsets
+                    if (alloc_mode) c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // c->stop_colors
+                    if (alloc_mode) d_c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // d_c->stop_offsets
+                    if (alloc_mode) d_c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // d_c->stop_colors
+                    break;
+                } case ColorType::RadialGradient: {
+                    if (alloc_mode) scene.shape_groups[group_id].fill_color = (RadialGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(RadialGradient)); // color
+                    if (alloc_mode) scene.d_shape_groups[group_id].fill_color = (RadialGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(RadialGradient)); // d_color
+
+                    const RadialGradient *c_ = (const RadialGradient *)shape_group->fill_color;
+                    RadialGradient *c = nullptr, *d_c = nullptr;
+                    if (alloc_mode) c = (RadialGradient *)scene.shape_groups[group_id].fill_color;
+                    if (alloc_mode) d_c = (RadialGradient *)scene.d_shape_groups[group_id].fill_color;
+                    if (alloc_mode) c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // c->stop_offsets
+                    if (alloc_mode) c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // c->stop_colors
+                    if (alloc_mode) d_c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // d_c->stop_offsets
+                    if (alloc_mode) d_c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // d_c->stop_colors
+                    break;
+                } default: {
+                    assert(false);
+                }
+            }
+        } else {
+            if (alloc_mode) scene.shape_groups[group_id].fill_color = nullptr;
+            if (alloc_mode) scene.d_shape_groups[group_id].fill_color = nullptr;
+        }
+        if (shape_group->stroke_color != nullptr) {
+            switch (shape_group->stroke_color_type) {
+                case ColorType::Constant: {
+                    if (alloc_mode) scene.shape_groups[group_id].stroke_color = (Constant*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(Constant)); // color
+                    if (alloc_mode) scene.d_shape_groups[group_id].stroke_color = (Constant*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(Constant)); // d_color
+                    break;
+                } case ColorType::LinearGradient: {
+                    if (alloc_mode) scene.shape_groups[group_id].stroke_color = (LinearGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(LinearGradient)); // color
+                    if (alloc_mode) scene.shape_groups[group_id].stroke_color = (LinearGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(LinearGradient)); // d_color
+
+                    const LinearGradient *c_ = (const LinearGradient *)shape_group->stroke_color;
+                    LinearGradient *c = nullptr, *d_c = nullptr;
+                    if (alloc_mode) c = (LinearGradient *)scene.shape_groups[group_id].stroke_color;
+                    if (alloc_mode) d_c = (LinearGradient *)scene.d_shape_groups[group_id].stroke_color;
+                    if (alloc_mode) c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // c->stop_offsets
+                    if (alloc_mode) c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // c->stop_colors
+                    if (alloc_mode) d_c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // d_c->stop_offsets
+                    if (alloc_mode) d_c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // d_c->stop_colors
+                    break;
+                } case ColorType::RadialGradient: {
+                    if (alloc_mode) scene.shape_groups[group_id].stroke_color = (RadialGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(RadialGradient)); // color
+                    if (alloc_mode) scene.shape_groups[group_id].stroke_color = (RadialGradient*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(RadialGradient)); // d_color
+
+                    const RadialGradient *c_ = (const RadialGradient *)shape_group->stroke_color;
+                    RadialGradient *c = nullptr, *d_c = nullptr;
+                    if (alloc_mode) c = (RadialGradient *)scene.shape_groups[group_id].stroke_color;
+                    if (alloc_mode) d_c = (RadialGradient *)scene.d_shape_groups[group_id].stroke_color;
+                    if (alloc_mode) c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // c->stop_offsets
+                    if (alloc_mode) c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // c->stop_colors
+                    if (alloc_mode) d_c->stop_offsets = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * c_->num_stops); // d_c->stop_offsets
+                    if (alloc_mode) d_c->stop_colors = (float*)&scene.buffer[buffer_size];
+                    buffer_size += align(sizeof(float) * 4 * c_->num_stops); // d_c->stop_colors
+                    break;
+                } default: {
+                    assert(false);
+                }
+            }
+        } else {
+            if (alloc_mode) scene.shape_groups[group_id].stroke_color = nullptr;
+            if (alloc_mode) scene.d_shape_groups[group_id].stroke_color = nullptr;
+        }
+        if (alloc_mode) scene.shape_groups[group_id].shape_ids = (int*)&scene.buffer[buffer_size];
+        buffer_size += align(sizeof(int) * shape_group->num_shapes); // shape_group->shape_ids
+        if (alloc_mode) scene.shape_groups_bvh_nodes[group_id] = (BVHNode*)&scene.buffer[buffer_size];
+        buffer_size += align(sizeof(BVHNode) * (2 * shape_group->num_shapes - 1)); // scene.shape_groups_bvh_nodes[group_id]
+    }
+    return buffer_size;
+}
+
+Scene::Scene(int canvas_width,
+             int canvas_height,
+             const std::vector<const Shape *> &shape_list,
+             const std::vector<const ShapeGroup *> &shape_group_list,
+             const Filter &filter,
+             bool use_gpu,
+             int gpu_index)
+    : canvas_width(canvas_width),
+      canvas_height(canvas_height),
+      num_shapes(shape_list.size()),
+      num_shape_groups(shape_group_list.size()),
+      use_gpu(use_gpu),
+      gpu_index(gpu_index) {
+    if (num_shapes == 0) {
+        return;
+    }
+    // Shape group may reuse some of the shapes,
+    // record the total number of shapes.
+    int num_total_shapes = 0;
+    for (const ShapeGroup *sg : shape_group_list) {
+        num_total_shapes += sg->num_shapes;
+    }
+    this->num_total_shapes = num_total_shapes;
+
+    // Memory initialization
+#ifdef __NVCC__
+    int old_device_id = -1;
+#endif
+    if (use_gpu) {
+#ifdef __NVCC__
+        checkCuda(cudaGetDevice(&old_device_id));
+        if (gpu_index != -1) {
+            checkCuda(cudaSetDevice(gpu_index));
+        }
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    }
+
+    size_t buffer_size = allocate_buffers<false /*alloc_mode*/>(*this, shape_list, shape_group_list);
+    // Allocate a huge buffer for everything
+    allocate<uint8_t>(use_gpu, buffer_size, &buffer);
+    // memset(buffer, 111, buffer_size);
+    // Actually distribute the buffer
+    allocate_buffers<true /*alloc_mode*/>(*this, shape_list, shape_group_list);
+    copy_and_init_shapes(*this, shape_list);
+    copy_and_init_shape_groups(*this, shape_group_list);
+
+    std::vector<float> shape_length_list = compute_shape_length(shape_list);
+    // Copy shape_length
+    if (use_gpu) {
+#ifdef __NVCC__
+        checkCuda(cudaMemcpy(this->shapes_length, &shape_length_list[0], num_shapes * sizeof(float), cudaMemcpyHostToDevice));
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    } else {
+        memcpy(this->shapes_length, &shape_length_list[0], num_shapes * sizeof(float));
+    }
+    build_shape_cdfs(*this, shape_group_list, shape_length_list);
+    build_path_cdfs(*this, shape_list, shape_length_list);
+    compute_bounding_boxes(*this, shape_list, shape_group_list);
+
+    // Filter initialization
+    *(this->filter) = filter;
+    this->d_filter->radius = 0;
+
+    if (use_gpu) {
+#ifdef __NVCC__
+        if (old_device_id != -1) {
+            checkCuda(cudaSetDevice(old_device_id));
+        }
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+        assert(false);
+#endif
+    }
+}
+
+Scene::~Scene() {
+    if (num_shapes == 0) {
+        return;
+    }
+    if (use_gpu) {
+#ifdef __NVCC__
+        int old_device_id = -1;
+        checkCuda(cudaGetDevice(&old_device_id));
+        if (gpu_index != -1) {
+            checkCuda(cudaSetDevice(gpu_index));
+        }
+
+        checkCuda(cudaFree(buffer));
+
+        checkCuda(cudaSetDevice(old_device_id));
+#else
+        // Don't throw because C++ don't want a destructor to throw.
+        std::cerr << "diffvg not compiled with GPU";
+        exit(1);
+#endif
+    } else {
+        free(buffer);
+    }
+}
+
+Shape Scene::get_d_shape(int shape_id) const {
+    return d_shapes[shape_id];
+}
+
+ShapeGroup Scene::get_d_shape_group(int group_id) const {
+    return d_shape_groups[group_id];
+}
+
+float Scene::get_d_filter_radius() const {
+    return d_filter->radius;
+}
diff --git a/scene.h b/scene.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2f452dd33f139df89805967b416e21b5ffe109f
--- /dev/null
+++ b/scene.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include "diffvg.h"
+#include "aabb.h"
+#include <vector>
+
+struct Shape;
+struct ShapeGroup;
+struct Filter;
+struct DFilter;
+
+struct BVHNode {
+    int child0, child1; // child1 is negative if it is a leaf
+    AABB box;
+    float max_radius;
+};
+
+struct Scene {
+    Scene(int canvas_width,
+          int canvas_height,
+          const std::vector<const Shape *> &shape_list,
+          const std::vector<const ShapeGroup *> &shape_group_list,
+          const Filter &filter,
+          bool use_gpu,
+          int gpu_index);
+
+    ~Scene();
+
+    int canvas_width;
+    int canvas_height;
+
+    uint8_t *buffer;
+
+    Shape *shapes;
+    Shape *d_shapes;
+    ShapeGroup *shape_groups;
+    ShapeGroup *d_shape_groups;
+    Filter *filter;
+    DFilter *d_filter;
+    // For accelerating intersection
+    AABB *shapes_bbox;
+    BVHNode **path_bvhs; // Only for Path
+    BVHNode **shape_groups_bvh_nodes; // One BVH for each shape group
+    BVHNode *bvh_nodes;
+
+    int num_shapes;
+    int num_shape_groups;
+    // shape_groups reuse shape, so the total number of shapes
+    // doesn't equal to num_shapes
+    int num_total_shapes;
+    bool use_gpu;
+    int gpu_index;
+
+    // For edge sampling
+    float *shapes_length;
+    float *sample_shapes_cdf;
+    float *sample_shapes_pmf;
+    int *sample_shape_id;
+    int *sample_group_id;
+    float **path_length_cdf;
+    float **path_length_pmf;
+    int **path_point_id_map;
+
+    ShapeGroup get_d_shape_group(int group_id) const;
+    Shape get_d_shape(int shape_id) const;
+    float get_d_filter_radius() const;
+};
+
+struct SceneData {
+    int canvas_width;
+    int canvas_height;
+    Shape *shapes;
+    Shape *d_shapes;
+    ShapeGroup *shape_groups;
+    ShapeGroup *d_shape_groups;
+    Filter *filter;
+    DFilter *d_filter;
+    AABB *shapes_bbox;
+    BVHNode **path_bvhs; // Only for Path
+    BVHNode **shape_groups_bvh_nodes;
+    BVHNode *bvh_nodes;
+    int num_shapes;
+    int num_shape_groups;
+    int num_total_shapes;
+    // For edge sampling
+    float *shapes_length;
+    float *sample_shapes_cdf;
+    float *sample_shapes_pmf;
+    int *sample_shape_id;
+    int *sample_group_id;
+    float **path_length_cdf;
+    float **path_length_pmf;
+    int **path_point_id_map;
+};
+
+inline SceneData get_scene_data(const Scene &scene) {
+    return SceneData{scene.canvas_width,
+                     scene.canvas_height,
+                     scene.shapes,
+                     scene.d_shapes,
+                     scene.shape_groups,
+                     scene.d_shape_groups,
+                     scene.filter,
+                     scene.d_filter,
+                     scene.shapes_bbox,
+                     scene.path_bvhs,
+                     scene.shape_groups_bvh_nodes,
+                     scene.bvh_nodes,
+                     scene.num_shapes,
+                     scene.num_shape_groups,
+                     scene.num_total_shapes,
+                     scene.shapes_length,
+                     scene.sample_shapes_cdf,
+                     scene.sample_shapes_pmf,
+                     scene.sample_shape_id,
+                     scene.sample_group_id,
+                     scene.path_length_cdf,
+                     scene.path_length_pmf,
+                     scene.path_point_id_map};
+}
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb9f6735b7adb7684bc72cbcb74c4284afd4119
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,98 @@
+# Adapted from https://github.com/pybind/cmake_example/blob/master/setup.py
+import os
+import re
+import sys
+import platform
+import subprocess
+import importlib
+from sysconfig import get_paths
+
+import importlib
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from setuptools.command.install import install
+from distutils.sysconfig import get_config_var
+from distutils.version import LooseVersion
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir, build_with_cuda):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+        self.build_with_cuda = build_with_cuda
+
+class Build(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        super().run()
+
+    def build_extension(self, ext):
+        if isinstance(ext, CMakeExtension):
+            extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+            info = get_paths()
+            include_path = info['include']
+            cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                          '-DPYTHON_INCLUDE_PATH=' + include_path]
+
+            cfg = 'Debug' if self.debug else 'Release'
+            build_args = ['--config', cfg]
+
+            if platform.system() == "Windows":
+                cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir),
+                               '-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+                if sys.maxsize > 2**32:
+                    cmake_args += ['-A', 'x64']
+                build_args += ['--', '/m']
+            else:
+                cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+                build_args += ['--', '-j8']
+
+            if ext.build_with_cuda:
+                cmake_args += ['-DDIFFVG_CUDA=1']
+            else:
+                cmake_args += ['-DDIFFVG_CUDA=0']
+
+            env = os.environ.copy()
+            env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                                  self.distribution.get_version())
+            if not os.path.exists(self.build_temp):
+                os.makedirs(self.build_temp)
+            subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+            subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+        else:
+            super().build_extension(ext)
+
+torch_spec = importlib.util.find_spec("torch")
+tf_spec = importlib.util.find_spec("tensorflow")
+packages = []
+build_with_cuda = False
+if torch_spec is not None:
+    packages.append('pydiffvg')
+    import torch
+    if torch.cuda.is_available():
+        build_with_cuda = True
+if tf_spec is not None and sys.platform != 'win32':
+    packages.append('pydiffvg_tensorflow')
+    if not build_with_cuda:
+        import tensorflow as tf
+        if tf.test.is_gpu_available(cuda_only=True, min_cuda_compute_capability=None):
+            build_with_cuda = True
+if len(packages) == 0:
+    print('Error: PyTorch or Tensorflow must be installed. For Windows platform only PyTorch is supported.')
+    exit()
+# Override build_with_cuda with environment variable
+if 'DIFFVG_CUDA' in os.environ:
+    build_with_cuda = os.environ['DIFFVG_CUDA'] == '1'
+
+setup(name = 'diffvg',
+      version = '0.0.1',
+      install_requires = ["svgpathtools"],
+      description = 'Differentiable Vector Graphics',
+      ext_modules = [CMakeExtension('diffvg', '', build_with_cuda)],
+      cmdclass = dict(build_ext=Build, install=install),
+      packages = packages,
+      zip_safe = False)
diff --git a/shape.cpp b/shape.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..19a30962f7505d8cfdd65254a25c4230c6782a46
--- /dev/null
+++ b/shape.cpp
@@ -0,0 +1,22 @@
+#include "shape.h"
+
+void Path::copy_to(ptr<float> points, ptr<float> thickness) const {
+    float *p = points.get();
+    for (int i = 0; i < 2 * num_points; i++) {
+        p[i] = this->points[i];
+    }
+    if (this->thickness != nullptr) {
+        float *t = thickness.get();
+        for (int i = 0; i < num_points; i++) {
+            t[i] = this->thickness[i];
+        }
+    }
+}
+
+void ShapeGroup::copy_to(ptr<float> shape_to_canvas) const {
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            shape_to_canvas.get()[i * 3 + j] = this->shape_to_canvas(i, j);
+        }
+    }
+}
diff --git a/shape.h b/shape.h
new file mode 100644
index 0000000000000000000000000000000000000000..b549f31e73a65696b1a0ac9814ddeedba20cf121
--- /dev/null
+++ b/shape.h
@@ -0,0 +1,169 @@
+#pragma once
+
+#include "diffvg.h"
+#include "color.h"
+#include "ptr.h"
+#include "vector.h"
+#include "matrix.h"
+
+enum class ShapeType {
+    Circle,
+    Ellipse,
+    Path,
+    Rect
+};
+
+struct Circle {
+    float radius;
+    Vector2f center;
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+
+struct Ellipse {
+    Vector2f radius;
+    Vector2f center;
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+
+struct Path {
+    Path(ptr<int> num_control_points,
+         ptr<float> points,
+         ptr<float> thickness,
+         int num_base_points,
+         int num_points,
+         bool is_closed,
+         bool use_distance_approx) :
+        num_control_points(num_control_points.get()),
+        points(points.get()),
+        thickness(thickness.get()),
+        num_base_points(num_base_points),
+        num_points(num_points),
+        is_closed(is_closed),
+        use_distance_approx(use_distance_approx) {}
+
+    int *num_control_points;
+    float *points;
+    float *thickness;
+    int num_base_points;
+    int num_points;
+    bool is_closed;
+    bool use_distance_approx;
+
+    bool has_thickness() const {
+        return thickness != nullptr;
+    }
+    void copy_to(ptr<float> points, ptr<float> thickness) const;
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+
+struct Rect {
+    Vector2f p_min;
+    Vector2f p_max;
+
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+
+struct Shape {
+    Shape() {}
+    Shape(const ShapeType &type,
+          ptr<void> shape_ptr,
+          float stroke_width)    
+        : type(type), ptr(shape_ptr.get()), stroke_width(stroke_width) {}
+
+    Circle as_circle() const {
+        return *(Circle*)ptr;
+    }
+
+    Ellipse as_ellipse() const {
+        return *(Ellipse*)ptr;
+    }
+
+    Path as_path() const {
+        return *(Path*)ptr;
+    }
+
+    Rect as_rect() const {
+        return *(Rect*)ptr;
+    }
+
+    ShapeType type;
+    void *ptr;
+    float stroke_width;
+};
+
+struct ShapeGroup {
+    ShapeGroup() {}
+    ShapeGroup(ptr<int> shape_ids,
+               int num_shapes,
+               const ColorType &fill_color_type,
+               ptr<void> fill_color,
+               const ColorType &stroke_color_type,
+               ptr<void> stroke_color,
+               bool use_even_odd_rule,
+               ptr<float> shape_to_canvas)
+        : shape_ids(shape_ids.get()),
+          num_shapes(num_shapes),
+          fill_color_type(fill_color_type),
+          fill_color(fill_color.get()),
+          stroke_color_type(stroke_color_type),
+          stroke_color(stroke_color.get()),
+          use_even_odd_rule(use_even_odd_rule),
+          shape_to_canvas(shape_to_canvas.get()) {
+        canvas_to_shape = inverse(this->shape_to_canvas);
+    }
+
+    bool has_fill_color() const {
+        return fill_color != nullptr;
+    }
+
+    Constant fill_color_as_constant() const {
+        return *(Constant*)fill_color;
+    }
+
+    LinearGradient fill_color_as_linear_gradient() const {
+        return *(LinearGradient*)fill_color;
+    }
+
+    RadialGradient fill_color_as_radial_gradient() const {
+        return *(RadialGradient*)fill_color;
+    }
+
+    bool has_stroke_color() const {
+        return stroke_color != nullptr;
+    }
+
+    Constant stroke_color_as_constant() const {
+        return *(Constant*)stroke_color;
+    }
+
+    LinearGradient stroke_color_as_linear_gradient() const {
+        return *(LinearGradient*)stroke_color;
+    }
+
+    RadialGradient stroke_color_as_radial_gradient() const {
+        return *(RadialGradient*)stroke_color;
+    }
+
+    void copy_to(ptr<float> shape_to_canvas) const;
+
+    int *shape_ids;
+    int num_shapes;
+    ColorType fill_color_type;
+    void *fill_color;
+    ColorType stroke_color_type;
+    void *stroke_color;
+    bool use_even_odd_rule;
+    Matrix3x3f canvas_to_shape;
+    Matrix3x3f shape_to_canvas;
+};
diff --git a/solve.h b/solve.h
new file mode 100644
index 0000000000000000000000000000000000000000..99f730d627d4e69b0973073593fb23ac54637f06
--- /dev/null
+++ b/solve.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "diffvg.h"
+
+template <typename T>
+DEVICE
+inline bool solve_quadratic(T a, T b, T c, T *t0, T *t1) {
+    // From https://github.com/mmp/pbrt-v3/blob/master/src/core/pbrt.h#L419
+    T discrim = square(b) - 4 * a * c;
+    if (discrim < 0) {
+        return false;
+    }
+    T root_discrim = sqrt(discrim);
+
+    T q;
+    if (b < 0) {
+        q = -0.5f * (b - root_discrim);
+    } else {
+        q = -0.5f * (b + root_discrim);
+    }
+    *t0 = q / a;
+    *t1 = c / q;
+    if (*t0 > *t1) {
+        swap_(*t0, *t1);
+    }
+    return true;
+}
+
+template <typename T>
+DEVICE
+inline int solve_cubic(T a, T b, T c, T d, T t[3]) {
+    if (fabs(a) < 1e-6f) {
+        if (solve_quadratic(b, c, d, &t[0], &t[1])) {
+            return 2;
+        } else {
+            return 0;
+        }
+    }
+    // normalize cubic equation
+    b /= a;
+    c /= a;
+    d /= a;
+    T Q = (b * b - 3 * c) / 9.f;
+    T R = (2 * b * b * b - 9 * b * c + 27 * d) / 54.f;
+    if (R * R < Q * Q * Q) {
+        // 3 real roots
+        T theta = acos(R / sqrt(Q * Q * Q));
+        t[0] = -2.f * sqrt(Q) * cos(theta / 3.f) - b / 3.f;
+        t[1] = -2.f * sqrt(Q) * cos((theta + 2.f * T(M_PI)) / 3.f) - b / 3.f;
+        t[2] = -2.f * sqrt(Q) * cos((theta - 2.f * T(M_PI)) / 3.f) - b / 3.f;
+        return 3;
+    } else {
+        T A = R > 0 ? -pow(R + sqrt(R * R - Q * Q * Q), T(1./3.)):
+                           pow(-R + sqrt(R * R - Q * Q * Q), T(1./3.));
+        T B = fabs(A) > 1e-6f ? Q / A : T(0);
+        t[0] = (A + B) - b / T(3);
+        return 1;
+    }
+}
diff --git a/thrust/.github/workflows/mirror-main-to-master.yml b/thrust/.github/workflows/mirror-main-to-master.yml
new file mode 100644
index 0000000000000000000000000000000000000000..5c4707573542b45f2a7473993869495b46b888d8
--- /dev/null
+++ b/thrust/.github/workflows/mirror-main-to-master.yml
@@ -0,0 +1,17 @@
+on:
+  push:
+    branches:
+      - 'main'
+
+jobs:
+  mirror_job:
+    runs-on: ubuntu-latest
+    name: Mirror main branch to master branch
+    steps:
+    - name: Mirror action step
+      id: mirror
+      uses: google/mirror-branch-action@v1.0
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        source: 'main'
+        dest: 'master'
diff --git a/thrust/.gitignore b/thrust/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9b1947f8a06078709dbfe557c815fb67ee1ef672
--- /dev/null
+++ b/thrust/.gitignore
@@ -0,0 +1,8 @@
+thrust/system/cuda/detail/.gitignore
+*.bash
+*.log
+.p4config
+run
+build*
+doc/html
+discrete_voronoi.pgm
diff --git a/thrust/.gitmodules b/thrust/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..1d8e604ef7afc2427a88a71e7c3820ca24c240d2
--- /dev/null
+++ b/thrust/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "cub"]
+	path = dependencies/cub
+	url = ../cub.git
diff --git a/thrust/CHANGELOG.md b/thrust/CHANGELOG.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e845a81e6ff0a876dffd0c58136282e5ace4439
--- /dev/null
+++ b/thrust/CHANGELOG.md
@@ -0,0 +1,1659 @@
+# Thrust 1.9.10-1 (NVIDIA HPC SDK 20.7, CUDA Toolkit 11.1)
+
+## Summary
+
+Thrust 1.9.10-1 is the minor release accompanying the NVIDIA HPC SDK 20.7 release
+  and the CUDA Toolkit 11.1 release.
+
+## Bug Fixes
+
+- #1214, NVBug 200619442: Stop using `std::allocator` APIs deprecated in C++17.
+- #1216, NVBug 200540293: Make `thrust::optional` work with Clang when used
+    with older libstdc++.
+- #1207, NVBug 200618218: Don't force C++14 with older compilers that don't
+    support it.
+- #1218: Wrap includes of `<memory>` and `<algorithm>` to avoid circular
+    inclusion with NVC++.
+
+# Thrust 1.9.10 (NVIDIA HPC SDK 20.5)
+
+## Summary
+
+Thrust 1.9.10 is the release accompanying the NVIDIA HPC SDK 20.5 release.
+It adds CMake support for compilation with NVC++ and a number of minor bug fixes
+  for NVC++.
+It also adds CMake `find_package` support, which replaces the broken 3rd-party
+  legacy `FindThrust.cmake` script.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+
+## Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+## New Features
+
+- #1130: CMake `find_package` support.
+  This is significant because there is a legacy `FindThrust.cmake` script
+    authored by a third party in widespread use in the community which has a
+    bug in how it parses Thrust version numbers which will cause it to
+    incorrectly parse 1.9.10.
+  This script only handles the first digit of each part of the Thrust version
+    number correctly: for example, Thrust 17.17.17 would be interpreted as
+    Thrust 1.1.1701717.
+  You can find directions for using the new CMake `find_package` support and
+    migrating away from the legacy `FindThrust.cmake` [here](https://github.com/thrust/thrust/blob/master/thrust/cmake/README.md)
+- #1129: Added `thrust::detail::single_device_tls_caching_allocator`, a
+    convenient way to get an MR caching allocator for device memory, which is
+    used by NVC++.
+
+## Other Enhancements
+
+- #1129: Refactored RDC handling in CMake to be a global option and not create
+    two targets for each example and test.
+
+## Bug Fixes
+
+- #1129: Fix the legacy `thrust::return_temporary_buffer` API to support
+    passing a size.
+  This was necessary to enable usage of Thrust caching MR allocators with
+    synchronous Thrust algorithms.
+  This change has allowed NVC++’s C++17 Parallel Algorithms implementation to
+    switch to use Thrust caching MR allocators for device temporary storage,
+    which gives a 2x speedup on large multi-GPU systems such as V100 and A100
+    DGX where `cudaMalloc` is very slow.
+- #1128: Respect `CUDA_API_PER_THREAD_DEFAULT_STREAM`.
+  Thanks to Rong Ou for this contribution.
+- #1131: Fix the one-policy overload of `thrust::async::copy` to not copy the
+    policy, resolving use-afer-move issues.
+- #1145: When cleaning up type names in `unittest::base_class_name`, only call
+    `std::string::replace` if we found the substring we are looking to replace.
+- #1139: Don't use `cxx::__demangle` in NVC++.
+- #1102: Don't use `thrust::detail::normal_distribution_nvcc` for Feta because
+    it uses `erfcinv`, a non-standard function that Feta doesn't have.
+
+# Thrust 1.9.9 (CUDA Toolkit 11.0)
+
+## Summary
+
+Thrust 1.9.9 adds support for NVC++, which uses Thrust to implement
+  GPU-accelerated C++17 Parallel Algorithms.
+`thrust::zip_function` and `thrust::shuffle` were also added.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+All other deprecated platforms will be dropped in the near future.
+
+## Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP_11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+  `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+## New Features
+
+- #1086: Support for NVC++ aka "Feta".
+  The most significant change is in how we use `__CUDA_ARCH__`.
+  Now, there are four macros that must be used:
+  - `THRUST_IS_DEVICE_CODE`, which should be used in an `if` statement around
+      device-only code.
+  - `THRUST_INCLUDE_DEVICE_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+  - `THRUST_IS_HOST_CODE`, which should be used in an `if` statement around
+      host-only code.
+  - `THRUST_INCLUDE_HOST_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+- #1085: `thrust::shuffle`.
+  Thanks to Rory Mitchell for this contribution.
+- #1029: `thrust::zip_function`, a facility for zipping functions that take N
+    parameters instead of a tuple of N parameters as `thrust::zip_iterator`
+    does.
+  Thanks to Ben Jude for this contribution.
+- #1068: `thrust::system::cuda::managed_memory_pointer`, a universal memory
+    strongly typed pointer compatible with the ISO C++ Standard Library.
+
+## Other Enhancements
+
+- #1029: Thrust is now built and tested with NVCC warnings treated as errors.
+- #1029: MSVC C++11 support.
+- #1029: `THRUST_DEPRECATED` abstraction for generating compile-time
+    deprecation warning messages.
+- #1029: `thrust::pointer<T>::pointer_to(reference)`.
+- #1070: Unit test for `thrust::inclusive_scan` with a user defined types.
+  Thanks to Conor Hoekstra for this contribution.
+
+## Bug Fixes
+
+- #1088: Allow `thrust::replace` to take functions that have non-`const`
+    `operator()`.
+- #1094: Add missing `constexpr` to `par_t` constructors.
+  Thanks to Patrick Stotko for this contribution.
+- #1077: Remove `__device__` from CUDA MR-based device allocators to fix
+    obscure "host function called from host device function" warning that occurs
+    when you use the new Thrust MR-based allocators.
+- #1029: Remove inconsistently-used `THRUST_BEGIN`/`END_NS` macros.
+- #1029: Fix C++ dialect detection on newer MSVC.
+- #1029 Use `_Pragma`/`__pragma` instead of `#pragma` in macros.
+- #1029: Replace raw `__cplusplus` checks with the appropriate Thrust macros.
+- #1105: Add a missing `<math.h>` include.
+- #1103: Fix regression of `thrust::detail::temporary_allocator` with non-CUDA
+    back ends.
+- #1111: Use Thrust's random number engine instead of `std::`s in device code.
+- #1108: Get rid of a GCC 9 warning about deprecated generation of copy ctors.
+
+# Thrust 1.9.8-1 (NVIDIA HPC SDK 20.3)
+
+## Summary
+
+Thrust 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 11.0
+  release.
+
+# Thrust 1.9.8 (CUDA Toolkit 11.0 Early Access)
+
+## Summary
+
+Thrust 1.9.8, which is included in the CUDA Toolkit 11.0 release, removes
+  Thrust's internal derivative of CUB, upstreams all relevant changes too CUB,
+  and adds CUB as a Git submodule.
+It will now be necessary to do `git clone --recursive` when checking out
+  Thrust, and to update the CUB submodule when pulling in new Thrust changes.
+Additionally, CUB is now included as a first class citizen in the CUDA toolkit.
+Thrust 1.9.8 also fixes bugs preventing most Thrust algorithms from working
+  with more than `2^31-1` elements.
+Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+  Thrust) work with large element counts.
+
+## Breaking Changes
+
+- Thrust will now use the version of CUB in your include path instead of its own
+    internal copy.
+  If you are using your own version of CUB, it may be older and incompatible
+    with Thrust.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+
+## Other Enhancements
+
+- Refactor Thrust and CUB to support 64-bit indices in most algorithms.
+  In most cases, Thrust now selects between kernels that use 32-bit indices and
+    64-bit indices at runtime depending on the size of the input.
+  This means large element counts work, but small element counts do not have to
+    pay for the register usage of 64-bit indices if they are not needed.
+  Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+    Thrust) work with more than `2^31-1` elements.
+  Notably, `thrust::sort` is still limited to less than `2^31-1` elements.
+- CUB is now a submodule and the internal copy of CUB has been removed.
+- #1051: Stop specifying the `__launch_bounds__` minimum blocks parameter
+    because it messes up register allocation and increases register pressure,
+    and we don't actually know at compile time how many blocks we will use
+    (aside from single tile kernels).
+
+## Bug Fixes
+
+- #1020: After making a CUDA API call, always clear the global CUDA error state
+    by calling `cudaGetLastError`.
+- #1021: Avoid calling destroy in the destructor of a Thrust vector if the
+    vector is empty.
+- #1046: Actually throw `thrust::bad_alloc` when `thrust::system::cuda::malloc`
+    fails instead of just constructing a temporary and doing nothing with it.
+- Add missing copy constructor or copy assignment operator to all classes that
+    GCC 9's `-Wdeprecated-copy` complains about
+- Add missing move operations to `thrust::system::cuda::vector`.
+- #1015: Check that the backend is CUDA before using CUDA-specifics in
+    `thrust::detail::temporary_allocator`.
+  Thanks to Hugh Winkler for this contribution.
+- #1055: More correctly detect the presence of aligned/sized `new`/`delete`.
+- #1043: Fix ill-formed specialization of `thrust::system::is_error_code_enum`
+    for `thrust::event_errc`.
+  Thanks to Toru Niina for this contribution.
+- #1027: Add tests for `thrust::tuple_for_each` and `thrust::tuple_subset`.
+  Thanks to Ben Jude for this contribution.
+- #1027: Use correct macro in `thrust::tuple_for_each`.
+  Thanks to Ben Jude for this contribution.
+- #1026: Use correct MSVC version formatting in CMake.
+  Thanks to Ben Jude for this contribution.
+- Workaround an NVCC issue with type aliases with template template arguments
+    containing a parameter pack.
+- Remove unused functions from the CUDA backend which call slow CUDA attribute
+    query APIs.
+- Replace `CUB_RUNTIME_FUNCTION` with `THRUST_RUNTIME_FUNCTION`.
+- Correct typo in `thrust::transform` documentation.
+  Thanks to Eden Yefet for this contribution.
+
+## Known Issues
+
+- `thrust::sort` remains limited to `2^31-1` elements for now.
+
+# Thrust 1.9.7-1 (CUDA Toolkit 10.2 for Tegra)
+
+## Summary
+
+Thrust 1.9.7-1 is a minor release accompanying the CUDA Toolkit 10.2 release
+  for Tegra.
+It is nearly identical to 1.9.7.
+
+## Bug Fixes
+
+- Remove support for GCC's broken nodiscard-like attribute.
+
+# Thrust 1.9.7 (CUDA Toolkit 10.2)
+
+## Summary
+
+Thrust 1.9.7 is a minor release accompanying the CUDA Toolkit 10.2 release.
+Unfortunately, although the version and patch numbers are identical, one bug
+  fix present in Thrust 1.9.7 (NVBug 2646034: Fix incorrect dependency handling
+  for stream acquisition in `thrust::future`) was not included in the CUDA
+  Toolkit 10.2 preview release for AArch64 SBSA.
+The tag `cuda-10.2aarch64sbsa` contains the exact version of Thrust present
+  in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+
+## Bug Fixes
+
+- #967, NVBug 2448170: Fix the CUDA backend `thrust::for_each` so that it
+    supports large input sizes with 64-bit indices.
+- NVBug 2646034: Fix incorrect dependency handling for stream acquisition in
+    `thrust::future`.
+  - Not present in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+- #968, NVBug 2612102: Fix the `thrust::mr::polymorphic_adaptor` to actually
+    use its template parameter.
+
+# Thrust 1.9.6-1 (NVIDIA HPC SDK 20.3)
+
+## Summary
+
+Thrust 1.9.6-1 is a variant of 1.9.6 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 10.1
+  Update 2 release.
+
+# Thrust 1.9.6 (CUDA Toolkit 10.1 Update 2)
+
+## Summary
+
+Thrust 1.9.6 is a minor release accompanying the CUDA Toolkit 10.1 Update 2
+  release.
+
+## Bug Fixes
+
+- NVBug 2509847: Inconsistent alignment of `thrust::complex`
+- NVBug 2586774: Compilation failure with Clang + older libstdc++ that doesn't
+    have `std::is_trivially_copyable`
+- NVBug 200488234: CUDA header files contain Unicode characters which leads
+    compiling errors on Windows
+- #949, #973, NVBug 2422333, NVBug 2522259, NVBug 2528822:
+    `thrust::detail::aligned_reinterpret_cast` must be annotated with
+    `__host__ __device__`.
+- NVBug 2599629: Missing include in the OpenMP sort implementation
+- NVBug 200513211: Truncation warning in test code under VC142
+
+# Thrust 1.9.5 (CUDA Toolkit 10.1 Update 1)
+
+## Summary
+
+Thrust 1.9.5 is a minor release accompanying the CUDA Toolkit 10.1 Update 1
+  release.
+
+## Bug Fixes
+
+- NVBug 2502854: Fixed assignment of
+    `thrust::device_vector<thrust::complex<T>>` between host and device.
+
+# Thrust 1.9.4 (CUDA Toolkit 10.1)
+
+## Summary
+
+Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new
+  allocator system including caching allocators and unified memory support, as
+  well as a variety of other enhancements, mostly related to
+  C++11/C++14/C++17/C++20 support.
+The new asynchronous algorithms in the `thrust::async` namespace return
+  `thrust::event` or `thrust::future` objects, which can be waited upon to
+  synchronize with the completion of the parallel operation.
+
+## Breaking Changes
+
+Synchronous Thrust algorithms now block until all of their operations have
+  completed.
+Use the new asynchronous Thrust algorithms for non-blocking behavior.
+
+## New Features
+
+- `thrust::event` and `thrust::future<T>`, uniquely-owned asynchronous handles
+    consisting of a state (ready or not ready), content (some value; for
+    `thrust::future` only), and an optional set of objects that should be
+    destroyed only when the future's value is ready and has been consumed.
+  - The design is loosely based on C++11's `std::future`.
+  - They can be `.wait`'d on, and the value of a future can be waited on and
+      retrieved with `.get` or `.extract`.
+  - Multiple `thrust::event`s and `thrust::future`s can be combined with
+      `thrust::when_all`.
+  - `thrust::future`s can be converted to `thrust::event`s.
+  - Currently, these primitives are only implemented for the CUDA backend and
+      are C++11 only.
+- New asynchronous algorithms that return `thrust::event`/`thrust::future`s,
+    implemented as C++20 range style customization points:
+    - `thrust::async::reduce`.
+    - `thrust::async::reduce_into`, which takes a target location to store the
+        reduction result into.
+    - `thrust::async::copy`, including a two-policy overload that allows
+        explicit cross system copies which execution policy properties can be
+        attached to.
+    - `thrust::async::transform`.
+    - `thrust::async::for_each`.
+    - `thrust::async::stable_sort`.
+    - `thrust::async::sort`.
+    - By default the asynchronous algorithms use the new caching allocators.
+        Deallocation of temporary storage is deferred until the destruction of
+        the returned `thrust::future`. The content of `thrust::future`s is
+        stored in either device or universal memory and transferred to the host
+        only upon request to prevent unnecessary data migration.
+    - Asynchronous algorithms are currently only implemented for the CUDA
+        system and are C++11 only.
+- `exec.after(f, g, ...)`, a new execution policy method that takes a set of
+    `thrust::event`/`thrust::future`s and returns an execution policy that
+    operations on that execution policy should depend upon.
+- New logic and mindset for the type requirements for cross-system sequence
+    copies (currently only used by `thrust::async::copy`), based on:
+  - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR`
+      for detecting/indicating that an iterator points to contiguous storage.
+  - `thrust::is_trivially_relocatable` and
+      `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a
+      type is `memcpy`able (based on principles from
+      [P1144](https://wg21.link/P1144)).
+  - The new approach reduces buffering, increases performance, and increases
+      correctness.
+  - The fast path is now enabled when copying CUDA `__half` and vector types with
+      `thrust::async::copy`.
+- All Thrust synchronous algorithms for the CUDA backend now actually
+    synchronize. Previously, any algorithm that did not allocate temporary
+    storage (counterexample: `thrust::sort`) and did not have a
+    computation-dependent result (counterexample: `thrust::reduce`) would
+    actually be launched asynchronously. Additionally, synchronous algorithms
+    that allocated temporary storage would become asynchronous if a custom
+    allocator was supplied that did not synchronize on allocation/deallocation,
+    unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`,
+    `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some
+    cases this may be a performance regression; if you need asynchrony, use the
+    new asynchronous algorithms.
+- Thrust's allocator framework has been rewritten. It now uses a memory
+    resource system, similar to C++17's `std::pmr` but supporting static
+    polymorphism. Memory resources are objects that allocate untyped storage and
+    allocators are cheap handles to memory resources in this new model. The new
+    facilities live in `<thrust/mr/*>`.
+  - `thrust::mr::memory_resource<Pointer>`, the memory resource base class,
+      which takes a (possibly tagged) pointer to `void` type as a parameter.
+  - `thrust::mr::allocator<T, MemoryResource>`, an allocator backed by a memory
+      resource object.
+  - `thrust::mr::polymorphic_adaptor_resource<Pointer>`, a type-erased memory
+      resource adaptor.
+  - `thrust::mr::polymorphic_allocator<T>`, a C++17-style polymorphic allocator
+      backed by a type-erased memory resource object.
+  - New tunable C++17-style caching memory resources,
+      `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to
+      cache both small object allocations and large repetitive temporary
+      allocations. The disjoint variants use separate storage for management of
+      the pool, which is necessary if the memory being allocated cannot be
+      accessed on the host (e.g.  device memory).
+  - System-specific allocators were rewritten to use the new memory resource
+      framework.
+  - New `thrust::device_memory_resource` for allocating device memory.
+  - New `thrust::universal_memory_resource` for allocating memory that can be
+      accessed from both the host and device (e.g. `cudaMallocManaged`).
+  - New `thrust::universal_host_pinned_memory_resource` for allocating memory
+      that can be accessed from the host and the device but always resides in
+      host memory (e.g. `cudaMallocHost`).
+  - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which
+      lazily create and retrieve a per-device singleton memory resource.
+  - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for
+      `thrust::allocator_traits`.
+  - `thrust::device_make_unique`, a factory function for creating a
+      `std::unique_ptr` to a newly allocated object in device memory.
+  - `<thrust/detail/memory_algorithms>`, a C++11 implementation of the C++17
+      uninitialized memory algorithms.
+  - `thrust::allocate_unique` and friends, based on the proposed C++23
+      [`std::allocate_unique`](https://wg21.link/P0211).
+- New type traits and metaprogramming facilities. Type traits are slowly being
+    migrated out of `thrust::detail::` and `<thrust/detail/*>`; their new home
+    will be `thrust::` and `<thrust/type_traits/*>`.
+  - `thrust::is_execution_policy`.
+  - `thrust::is_operator_less_or_greater_function_object`, which detects
+      `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`.
+  - `thrust::is_operator_plus_function_object``, which detects `thrust::plus`
+      and `std::plus`.
+  - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's
+      `thrust::remove_cvref(_t)?`.
+  - `thrust::void_t`, and various other new type traits.
+  - `thrust::integer_sequence` and friends, a C++11 implementation of C++20's
+      `std::integer_sequence`
+  - `thrust::conjunction`, `thrust::disjunction`, and `thrust::disjunction`, a
+      C++11 implementation of C++17's logical metafunctions.
+  - Some Thrust type traits (such as `thrust::is_constructible`) have been
+      redefined in terms of C++11's type traits when they are available.
+- `<thrust/detail/tuple_algorithms.h>`, new `std::tuple` algorithms:
+  - `thrust::tuple_transform`.
+  - `thrust::tuple_for_each`.
+  - `thrust::tuple_subset`.
+- Miscellaneous new `std::`-like facilities:
+  - `thrust::optional`, a C++11 implementation of C++17's `std::optional`.
+  - `thrust::addressof`, an implementation of C++11's `std::addressof`.
+  - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next`
+      and `std::prev`.
+  - `thrust::square`, a `<functional>` style unary function object that
+      multiplies its argument by itself.
+  - `<thrust/limits.h>` and `thrust::numeric_limits`, a customized version of
+      `<limits>` and `std::numeric_limits`.
+- `<thrust/detail/preprocessor.h>`, new general purpose preprocessor facilities:
+  - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens.
+  - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion.
+  - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading.
+  - `THRUST_PP_BOOL`, boolean conversion.
+  - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement.
+  - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument.
+  - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after
+      the first.
+  - `THRUST_PP_IIF`, bitwise conditional.
+  - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and
+      detecting comma tokens.
+  - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary
+      `__VA_ARGS__`.
+  - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function.
+- New C++11 compatibility macros:
+  - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best
+      equivalent otherwise.
+  - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best
+      equivalent otherwise.
+  - `THRUST_OVERRIDE`, expands to `override` when available and the best
+      equivalent otherwise.
+  - `THRUST_DEFAULT`, expands to `= default;` when available and the best
+      equivalent otherwise.
+  - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best
+      equivalent otherwise.
+  - `THRUST_FINAL`, expands to `final` when available and the best equivalent
+      otherwise.
+  - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and
+      the best equivalent otherwise.
+- `<thrust/detail/type_deduction.h>`, new C++11-only type deduction helpers:
+  - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable
+      conditional `noexcept` qualifiers and trailing return types.
+  - `THRUST_FWD(x)`, expands to `::std::forward<decltype(x)>(x)`.
+  - `THRUST_MVCAP`, expands to a lambda move capture.
+  - `THRUST_RETOF`, expands to a decltype computing the return type of an
+      invocable.
+- New CMake build system.
+
+## New Examples
+
+- `mr_basic` demonstrates how to use the new memory resource allocator system.
+
+## Other Enhancements
+
+- Tagged pointer enhancements:
+  - New `thrust::pointer_traits` specialization for `void const*`.
+  - `nullptr` support to Thrust tagged pointers.
+  - New `explicit operator bool` for Thrust tagged pointers when using C++11
+      for `std::unique_ptr` interoperability.
+  - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast`
+      for casting Thrust tagged pointers.
+- Iterator enhancements:
+  - `thrust::iterator_system` is now SFINAE friendly.
+  - Removed cv qualifiers from iterator types when using
+      `thrust::iterator_system`.
+- Static assert enhancements:
+  - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be
+      used as the error message when possible.
+  - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when
+      it's available.
+  - Introduce a way to test for static assertions.
+- Testing enhancements:
+  - Additional scalar and sequence types, including non-builtin types and
+      vectors with unified memory allocators, have been added to the list of
+      types used by generic unit tests.
+  - The generation of random input data has been improved to increase the range
+      of values used and catch more corner cases.
+  - New `unittest::truncate_to_max_representable` utility for avoiding the
+      generation of ranges that cannot be represented by the underlying element
+      type in generic unit test code.
+  - The test driver now synchronizes with CUDA devices and check for errors
+      after each test, when switching devices, and after each raw kernel launch.
+  - The `warningtester` uber header is now compiled with NVCC to avoid needing
+      to disable CUDA-specific code with the preprocessor.
+  - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s.
+  - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro.
+  - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro.
+  - `thrust::system_error` in the CUDA backend now print out its `cudaError_t`
+      enumerator in addition to the diagnostic message.
+  - Stopped using conditionally signed types like `char`.
+
+## Bug Fixes
+
+- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas
+    with `thrust::reduce` on MSVC.
+- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill`
+    isn't operating on const iterators.
+- #919 Fix compilation failure with `thrust::zip_iterator` and
+    `thrust::complex`.
+- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's
+    `thrust::reduce` to use two functions (one with the pragma for disabling
+    exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes
+    a regression with device compilation that started in CUDA Toolkit 9.2.
+- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a
+    `thrust::complex::operator=` to satisfy GoUDA.
+- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element
+    type being default constructible.
+- NVBug 2289115: Remove flaky `simple_cuda_streams` example.
+- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an
+    allocator parameter.
+- NVBug 2455740: Update the `range_view` example to not use device-side launch.
+- NVBug 2455943: Ensure that sized unit tests that use
+    `thrust::counting_iterator` perform proper truncation.
+- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests.
+
+# Thrust 1.9.3 (CUDA Toolkit 10.0)
+
+## Summary
+
+Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust.
+
+## Bug Fixes
+
+- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix
+    `thrust::device_reference` swapping.
+- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and
+    refactor temporary memory allocation in the CUDA backend to be exception
+    and leak safe.
+- #886, #894, #914: Various documentation typo fixes.
+- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC.
+- #878: Optimize `thrust::min/max_element` to only use
+    `thrust::detail::get_iterator_value` for non-numeric types.
+- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison
+    operators `const`.
+- NVBug 2092152: Remove all includes of `<cuda.h>`.
+- #911: Fix default comparator element type for `thrust::merge_by_key`.
+
+## Acknowledgments
+
+- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces.
+- Thanks to Francisco Facioni for contributing optimizations for
+    `thrust::min/max_element`.
+
+# Thrust 1.9.2 (CUDA Toolkit 9.2)
+
+## Summary
+
+Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test
+  improvements.
+CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on
+  small data types and `thrust::reduce`.
+Changes were applied to `complex` to optimize memory access.
+Thrust now compiles with compiler warnings enabled and treated as errors.
+Additionally, the unit test suite and framework was enhanced to increase
+  coverage.
+
+## Breaking Changes
+
+- The `fallback_allocator` example was removed, as it was buggy and difficult
+    to support.
+
+## New Features
+
+- `<thrust/detail/alignment.h>`, utilities for memory alignment:
+  - `thrust::aligned_reinterpret_cast`.
+  - `thrust::aligned_storage_size`, which computes the amount of storage needed
+      for an object of a particular size and alignment.
+  - `thrust::alignment_of`, a C++03 implementation of C++11's
+      `std::alignment_of`.
+  - `thrust::aligned_storage`, a C++03 implementation of C++11's
+      `std::aligned_storage`.
+  - `thrust::max_align_t`, a C++03 implementation of C++11's
+      `std::max_align_t`.
+
+## Bug Fixes
+
+- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug
+    2058778: Various compiler warning issues.
+- NVBug 200355591: `thrust::reduce` performance issues.
+- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be
+    overlooked but `deallocate` to be called with GCC <= 4.3.
+- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`.
+
+# Thrust 1.9.1-2 (CUDA Toolkit 9.1)
+
+## Summary
+
+Thrust 1.9.1-2 integrates version 1.7.4 of CUB and introduces a new CUDA backend
+  for `thrust::reduce` based on CUB.
+
+## Bug Fixes
+
+- NVBug 1965743: Remove unnecessary static qualifiers.
+- NVBug 1940974: Fix regression causing a compilation error when using
+    `thrust::merge_by_key` with `thrust::constant_iterator`s.
+- NVBug 1904217: Allow callables that take non-const refs to be used with
+    `thrust::reduce` and `thrust::*_scan`.
+
+# Thrust 1.9.0-5 (CUDA Toolkit 9.0)
+
+## Summary
+
+Thrust 1.9.0-5 replaces the original CUDA backend (bulk) with a new one
+  written using CUB, a high performance CUDA collectives library.
+This brings a substantial performance improvement to the CUDA backend across
+  the board.
+
+## Breaking Changes
+
+- Any code depending on CUDA backend implementation details will likely be
+    broken.
+
+## New Features
+
+- New CUDA backend based on CUB which delivers substantially higher performance.
+- `thrust::transform_output_iterator`, a fancy iterator that applies a function
+    to the output before storing the result.
+
+## New Examples
+
+- `transform_output_iterator` demonstrates use of the new fancy iterator
+    `thrust::transform_output_iterator`.
+
+## Other Enhancements
+
+- When C++11 is enabled, functors do not have to inherit from
+    `thrust::(unary|binary)_function` anymore to be used with
+    `thrust::transform_iterator`.
+- Added C++11 only move constructors and move assignment operators for
+    `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`,
+    `thrust::device_vector`, and friends.
+
+## Bug Fixes
+
+- `sin(thrust::complex<double>)` no longer has precision loss to float.
+
+## Acknowledgments
+
+- Thanks to Manuel Schiller for contributing a C++11 based enhancement
+    regarding the deduction of functor return types, improving the performance
+    of `thrust::unique` and implementing `thrust::transform_output_iterator`.
+- Thanks to Thibault Notargiacomo for the implementation of move semantics for
+    the `thrust::vector_base`-based classes.
+- Thanks to Duane Merrill for developing CUB and helping to integrate it into
+    Thrust's backend.
+
+# Thrust 1.8.3 (CUDA Toolkit 8.0)
+
+## Summary
+
+Thrust 1.8.3 is a small bug fix release.
+
+## New Examples
+
+- `range_view` demonstrates the use of a view (a non-owning wrapper for an
+    iterator range with a container-like interface).
+
+## Bug Fixes
+
+- `thrust::(min|max|minmax)_element` can now accept raw device pointers when
+    an explicit device execution policy is used.
+- `thrust::clear` operations on vector types no longer requires the element
+    type to have a default constructor.
+
+# Thrust 1.8.2 (CUDA Toolkit 7.5)
+
+## Summary
+
+Thrust 1.8.2 is a small bug fix release.
+
+## Bug Fixes
+
+- Avoid warnings and errors concerning user functions called from
+    `__host__ __device__` functions.
+- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend.
+- #651: `thrust::copy` between host and device now accepts execution policies
+    with streams attached, i.e. `thrust::::cuda::par.on(stream)`.
+- #664: `thrust::for_each` and algorithms based on it no longer ignore streams
+    attached to execution policys.
+
+## Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+# Thrust 1.8.1 (CUDA Toolkit 7.0)
+
+## Summary
+
+Thrust 1.8.1 is a small bug fix release.
+
+## Bug Fixes
+
+- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on
+    large inputs.
+
+## Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+# Thrust 1.8.0
+
+## Summary
+
+Thrust 1.8.0 introduces support for algorithm invocation from CUDA device
+  code, support for CUDA streams, and algorithm performance improvements.
+Users may now invoke Thrust algorithms from CUDA device code, providing a
+  parallel algorithms library to CUDA programmers authoring custom kernels, as
+  well as allowing Thrust programmers to nest their algorithm calls within
+  functors.
+The `thrust::seq` execution policy allows users to require sequential algorithm
+  execution in the calling thread and makes a sequential algorithms library
+  available to individual CUDA threads.
+The `.on(stream)` syntax allows users to request a CUDA stream for kernels
+  launched during algorithm execution.
+Finally, new CUDA algorithm implementations provide substantial performance
+  improvements.
+
+## New Features
+
+- Algorithms in CUDA Device Code:
+    - Thrust algorithms may now be invoked from CUDA `__device__` and
+        `__host__` __device__ functions.
+      Algorithms invoked in this manner must be invoked with an execution
+        policy as the first parameter.
+      The following execution policies are supported in CUDA __device__ code:
+      - `thrust::seq`
+      - `thrust::cuda::par`
+      - `thrust::device`, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA.
+  - Device-side algorithm execution may not be parallelized unless CUDA Dynamic
+      Parallelism is available.
+- Execution Policies:
+  - CUDA Streams
+    - The `thrust::cuda::par.on(stream)` syntax allows users to request that
+        CUDA kernels launched during algorithm execution should occur on a given
+        stream.
+    - Algorithms executed with a CUDA stream in this manner may still
+        synchronize with other streams when allocating temporary storage or
+        returning results to the CPU.
+  - `thrust::seq`, which allows users to require that an algorithm execute
+      sequentially in the calling thread.
+- `thrust::complex`, a complex number data type.
+
+## New Examples
+
+- simple_cuda_streams demonstrates how to request a CUDA stream during
+    algorithm execution.
+- async_reduce demonstrates ways to achieve algorithm invocations which are
+    asynchronous with the calling thread.
+
+## Other Enhancements
+
+- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for
+    large problem sizes.
+- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes.
+- CUDA sort performance for primitive types is 50% faster on Tesla K20c for
+    large problem sizes.
+- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem
+    sizes.
+- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes.
+- fallback_allocator example is simpler.
+
+## Bug Fixes
+
+- #364: Iterators with unrelated system tags may be used with algorithms invoked
+    with an execution policy
+- #371: Do not redefine `__CUDA_ARCH__`.
+- #379: Fix crash when dereferencing transform_iterator on the host.
+- #391: Avoid use of uppercase variable names.
+- #392: Fix `thrust::copy` between `cusp::complex` and `std::complex`.
+- #396: Program compiled with gcc < 4.3 hangs during comparison sort.
+- #406: `fallback_allocator.cu` example checks device for unified addressing support.
+- #417: Avoid using `std::less<T>` in binary search algorithms.
+- #418: Avoid various warnings.
+- #443: Including version.h no longer configures default systems.
+- #578: NVCC produces warnings when sequential algorithms are used with CPU systems.
+
+## Known Issues
+
+- When invoked with primitive data types, thrust::sort, thrust::sort_by_key,
+    thrust::stable_sort, & thrust::stable_sort_by_key may
+- Sometimes linking fails when compiling with `-rdc=true` with NVCC.
+- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last
+    element in a segment of equivalent keys instead of the first.
+
+## Acknowledgments
+
+- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan
+    implementations.
+- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation.
+- Thanks to Filipe Maia for contributing the implementation of thrust::complex.
+
+# Thrust 1.7.2 (CUDA Toolkit 6.5)
+
+## Summary
+
+Thrust 1.7.2 is a minor bug fix release.
+
+## Bug Fixes
+
+- Avoid use of `std::min` in generic find implementation.
+
+# Thrust 1.7.1 (CUDA Toolkit 6.0)
+
+## Summary
+
+Thrust 1.7.1 is a minor bug fix release.
+
+## Bug Fixes
+
+- Eliminate identifiers in `set_operations.cu` example with leading underscore.
+- Eliminate unused variable warning in CUDA `reduce_by_key` implementation.
+- Avoid deriving function objects from `std::unary_function` and
+    `std::binary_function`.
+
+# Thrust 1.7.0 (CUDA Toolkit 5.5)
+
+## Summary
+
+Thrust 1.7.0 introduces a new interface for controlling algorithm execution as
+  well as several new algorithms and performance improvements.
+With this new interface, users may directly control how algorithms execute as
+  well as details such as the allocation of temporary storage.
+Key/value versions of thrust::merge and the set operation algorithms have been
+  added, as well stencil versions of partitioning algorithms.
+thrust::tabulate has been introduced to tabulate the values of functions taking
+  integers.
+For 32b types, new CUDA merge and set operations provide 2-15x faster
+  performance while a new CUDA comparison sort provides 1.3-4x faster
+  performance.
+Finally, a new TBB reduce_by_key implementation provides 80% faster
+  performance.
+
+## Breaking Changes
+
+- Dispatch:
+  - Custom user backend systems' tag types must now inherit from the
+      corresponding system's execution_policy template (e.g.
+      thrust::cuda::execution_policy) instead of the tag struct (e.g.
+      thrust::cuda::tag). Otherwise, algorithm specializations will silently go
+      unfound during dispatch. See examples/minimal_custom_backend.cu and
+      examples/cuda/fallback_allocator.cu for usage examples.
+  - thrust::advance and thrust::distance are no longer dispatched based on
+      iterator system type and thus may no longer be customized.
+- Iterators:
+  - iterator_facade and iterator_adaptor's Pointer template parameters have
+      been eliminated.
+  - iterator_adaptor has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_adaptor).
+  - iterator_facade has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_facade).
+  - iterator_core_access has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_core_access).
+  - All iterators' nested pointer typedef (the type of the result of
+      operator->) is now void instead of a pointer type to indicate that such
+      expressions are currently impossible.
+  - Floating point counting_iterators' nested difference_type typedef is now a
+      signed integral type instead of a floating point type.
+- Other:
+  - normal_distribution has been moved into the thrust::random namespace
+      (previously thrust::random::experimental::normal_distribution).
+  - Placeholder expressions may no longer include the comma operator.
+
+## New Features
+- Execution Policies:
+  - Users may directly control the dispatch of algorithm invocations with
+      optional execution policy arguments.
+    For example, instead of wrapping raw pointers allocated by cudaMalloc with
+      thrust::device_ptr, the thrust::device execution_policy may be passed as
+      an argument to an algorithm invocation to enable CUDA execution.
+  - The following execution policies are supported in this version:
+    - `thrust::host`
+    - `thrust::device`
+    - `thrust::cpp::par`
+    - `thrust::cuda::par`
+    - `thrust::omp::par`
+    - `thrust::tbb::par`
+- Algorithms:
+  - `thrust::merge_by_key`
+  - `thrust::partition` with stencil
+  - `thrust::partition_copy` with stencil
+  - `thrust::set_difference_by_key`
+  - `thrust::set_intersection_by_key`
+  - `thrust::set_symmetric_difference_by_key`
+  - `thrust::set_union_by_key`
+  - `thrust::stable_partition with stencil`
+  - `thrust::stable_partition_copy with stencil`
+  - `thrust::tabulate`
+- Memory Allocation:
+	- `thrust::malloc`
+	- `thrust::free`
+  - `thrust::get_temporary_buffer`
+  - `thrust::return_temporary_buffer`
+
+## New Examples
+
+- uninitialized_vector demonstrates how to use a custom allocator to avoid the
+    automatic initialization of elements in thrust::device_vector.
+
+## Other Enhancements
+
+- Authors of custom backend systems may manipulate arbitrary state during
+    algorithm dispatch by incorporating it into their execution_policy parameter.
+- Users may control the allocation of temporary storage during algorithm
+    execution by passing standard allocators as parameters via execution policies
+    such as thrust::device.
+- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the
+    device backend.
+- CUDA merge performance is 2-15x faster.
+- CUDA comparison sort performance is 1.3-4x faster.
+- CUDA set operation performance is 1.5-15x faster.
+- TBB reduce_by_key performance is 80% faster.
+- Several algorithms have been parallelized with TBB.
+- Support for user allocators in vectors has been improved.
+- The sparse_vector example is now implemented with merge_by_key instead of
+    sort_by_key.
+- Warnings have been eliminated in various contexts.
+- Warnings about __host__ or __device__-only functions called from __host__
+    __device__ functions have been eliminated in various contexts.
+- Documentation about algorithm requirements have been improved.
+- Simplified the minimal_custom_backend example.
+- Simplified the cuda/custom_temporary_allocation example.
+- Simplified the cuda/fallback_allocator example.
+
+## Bug Fixes
+
+- #248: Fix broken `thrust::counting_iterator<float>` behavior with OpenMP.
+- #231, #209: Fix set operation failures with CUDA.
+- #187: Fix incorrect occupancy calculation with CUDA.
+- #153: Fix broken multi GPU behavior with CUDA.
+- #142: Eliminate warning produced by `thrust::random::taus88` and MSVC 2010.
+- #208: Correctly initialize elements in temporary storage when necessary.
+- #16: Fix compilation error when sorting bool with CUDA.
+- #10: Fix ambiguous overloads of `thrust::reinterpret_tag`.
+
+## Known Issues
+
+- GCC 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly
+    causing infinite recursion in examples such as
+    cuda/custom_temporary_allocation.
+
+## Acknowledgments
+
+- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing
+    a faster merge implementation for CUDA.
+- Thanks to Sean Baxter for contributing a faster set operation implementation
+    for CUDA.
+- Thanks to Cliff Woolley for contributing a correct occupancy calculation
+    algorithm.
+
+# Thrust 1.6.0
+
+## Summary
+
+Thrust 1.6.0 provides an interface for customization and extension and a new
+  backend system based on the Threading Building Blocks library.
+With this new interface, programmers may customize the behavior of specific
+  algorithms as well as control the allocation of temporary storage or invent
+  entirely new backends.
+These enhancements also allow multiple different backend systems
+  such as CUDA and OpenMP to coexist within a single program.
+Support for TBB allows Thrust programs to integrate more naturally into
+  applications which may already employ the TBB task scheduler.
+
+## Breaking Changes
+
+- The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to
+    <thrust/system/cuda/experimental/pinned_allocator.h>
+- thrust::experimental::cuda::pinned_allocator has been moved to
+    thrust::cuda::experimental::pinned_allocator
+- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM
+- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA
+- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP
+- thrust::host_space_tag has been renamed thrust::host_system_tag
+- thrust::device_space_tag has been renamed thrust::device_system_tag
+- thrust::any_space_tag has been renamed thrust::any_system_tag
+- thrust::iterator_space has been renamed thrust::iterator_system
+
+## New Features
+
+- Backend Systems
+  - Threading Building Blocks (TBB) is now supported
+- Algorithms
+  - `thrust::for_each_n`
+  - `thrust::raw_reference_cast`
+- Types
+  - `thrust::pointer`
+  - `thrust::reference`
+
+## New Examples
+
+- `cuda/custom_temporary_allocation`
+- `cuda/fallback_allocator`
+- `device_ptr`
+- `expand`
+- `minimal_custom_backend`
+- `raw_reference_cast`
+- `set_operations`
+
+## Other Enhancements
+- thrust::for_each now returns the end of the input range similar to most other algorithms
+- thrust::pair and thrust::tuple have swap functionality
+- All CUDA algorithms now support large data types
+- Iterators may be dereferenced in user __device__ or __global__ functions
+- The safe use of different backend systems is now possible within a single binary
+
+## Bug Fixes
+
+- #469 `min_element` and `max_element` algorithms no longer require a const comparison operator
+
+## Known Issues
+
+- NVCC may crash when parsing TBB headers on Windows.
+
+# Thrust 1.5.3 (CUDA Toolkit 5.0)
+
+## Summary
+
+Thrust 1.5.3 is a minor bug fix release.
+
+## Bug Fixes
+
+- Avoid warnings about potential race due to `__shared__` non-POD variable
+
+# Thrust 1.5.2 (CUDA Toolkit 4.2)
+
+## Summary
+
+Thrust 1.5.2 is a minor bug fix release.
+
+## Bug Fixes
+
+- Fixed warning about C-style initialization of structures
+
+# Thrust 1.5.1 (CUDA Toolkit 4.1)
+
+## Summary
+
+Thrust 1.5.1 is a minor bug fix release.
+
+## Bug Fixes
+
+- Sorting data referenced by permutation_iterators on CUDA produces invalid results
+
+# Thrust 1.5.0
+
+## Summary
+
+Thrust 1.5.0 provides introduces new programmer productivity and performance
+  enhancements.
+New functionality for creating anonymous "lambda" functions has been added.
+A faster host sort provides 2-10x faster performance for sorting arithmetic
+  types on (single-threaded) CPUs.
+A new OpenMP sort provides 2.5x-3.0x speedup over the host sort using a
+  quad-core CPU.
+When sorting arithmetic types with the OpenMP backend the combined performance
+  improvement is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to
+  14.2x (8-bit types).
+A new CUDA `reduce_by_key` implementation provides 2-3x faster
+  performance.
+
+## Breaking Changes
+- device_ptr<void> no longer unsafely converts to device_ptr<T> without an
+    explicit cast.
+  Use the expression device_pointer_cast(static_cast<int*>(void_ptr.get())) to
+    convert, for example, device_ptr<void> to device_ptr<int>.
+
+## New Features
+
+- Algorithms:
+  - Stencil-less `thrust::transform_if`.
+- Lambda placeholders
+
+## New Examples
+- lambda
+
+## Other Enhancements
+
+- Host sort is 2-10x faster for arithmetic types
+- OMP sort provides speedup over host sort
+- `reduce_by_key` is 2-3x faster
+- `reduce_by_key` no longer requires O(N) temporary storage
+- CUDA scan algorithms are 10-40% faster
+- `host_vector` and `device_vector` are now documented
+- out-of-memory exceptions now provide detailed information from CUDART
+- improved histogram example
+- `device_reference` now has a specialized swap
+- `reduce_by_key` and scan algorithms are compatible with `discard_iterator`
+
+## Bug Fixes
+
+- #44: Allow `thrust::host_vector` to compile when `value_type` uses
+    `__align__`.
+- #198: Allow `thrust::adjacent_difference` to permit safe in-situ operation.
+- #303: Make thrust thread-safe.
+- #313: Avoid race conditions in `thrust::device_vector::insert`.
+- #314: Avoid unintended ADL invocation when dispatching copy.
+- #365: Fix merge and set operation failures.
+
+## Known Issues
+
+- None
+
+## Acknowledgments
+
+- Thanks to Manjunath Kudlur for contributing his Carbon library, from which
+    the lambda functionality is derived.
+- Thanks to Jean-Francois Bastien for suggesting a fix for #303.
+
+# Thrust 1.4.0 (CUDA Toolkit 4.0)
+
+## Summary
+
+Thrust 1.4.0 is the first release of Thrust to be included in the CUDA Toolkit.
+Additionally, it brings many feature and performance improvements.
+New set theoretic algorithms operating on sorted sequences have been added.
+Additionally, a new fancy iterator allows discarding redundant or otherwise
+  unnecessary output from algorithms, conserving memory storage and bandwidth.
+
+## Breaking Changes
+
+- Eliminations
+  - `thrust/is_sorted.h`
+  - `thrust/utility.h`
+  - `thrust/set_intersection.h`
+  - `thrust/experimental/cuda/ogl_interop_allocator.h` and the functionality
+      therein
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::gather` and `thrust::scatter` from host to device and vice versa
+      are no longer supported.
+  - Operations which modify the elements of a thrust::device_vector are no longer
+      available from source code compiled without nvcc when the device backend
+      is CUDA.
+    Instead, use the idiom from the cpp_interop example.
+
+## New Features
+
+- Algorithms:
+  - `thrust::copy_n`
+  - `thrust::merge`
+  - `thrust::set_difference`
+  - `thrust::set_symmetric_difference`
+  - `thrust::set_union`
+
+- Types
+  - `thrust::discard_iterator`
+
+- Device Support:
+  - Compute Capability 2.1 GPUs.
+
+## New Examples
+
+- run_length_decoding
+
+## Other Enhancements
+
+- Compilation warnings are substantially reduced in various contexts.
+- The compilation time of thrust::sort, thrust::stable_sort,
+    thrust::sort_by_key, and thrust::stable_sort_by_key are substantially
+    reduced.
+- A fast sort implementation is used when sorting primitive types with
+    thrust::greater.
+- The performance of thrust::set_intersection is improved.
+- The performance of thrust::fill is improved on SM 1.x devices.
+- A code example is now provided in each algorithm's documentation.
+- thrust::reverse now operates in-place
+
+## Bug Fixes
+
+- #212: `thrust::set_intersection` works correctly for large input sizes.
+- #275: `thrust::counting_iterator` and `thrust::constant_iterator` work
+    correctly with OpenMP as the backend when compiling with optimization.
+- #256: `min` and `max` correctly return their first argument as a tie-breaker
+- #248: `NDEBUG` is interpreted incorrectly
+
+## Known Issues
+
+- NVCC may generate code containing warnings when compiling some Thrust
+    algorithms.
+- When compiling with `-arch=sm_1x`, some Thrust algorithms may cause NVCC to
+    issue benign pointer advisories.
+- When compiling with `-arch=sm_1x` and -G, some Thrust algorithms may fail to
+    execute correctly.
+- `thrust::inclusive_scan`, `thrust::exclusive_scan`,
+    `thrust::inclusive_scan_by_key`, and `thrust::exclusive_scan_by_key` are
+    currently incompatible with `thrust::discard_iterator`.
+
+## Acknowledgments
+
+- Thanks to David Tarjan for improving the performance of set_intersection.
+- Thanks to Duane Merrill for continued help with sort.
+- Thanks to Nathan Whitehead for help with CUDA Toolkit integration.
+
+# Thrust 1.3.0
+
+## Summary
+
+Thrust 1.3.0 provides support for CUDA Toolkit 3.2 in addition to many feature
+  and performance enhancements.
+Performance of the sort and sort_by_key algorithms is improved by as much as 3x
+  in certain situations.
+The performance of stream compaction algorithms, such as copy_if, is improved
+  by as much as 2x.
+CUDA errors are now converted to runtime exceptions using the system_error
+  interface.
+Combined with a debug mode, also new in 1.3, runtime errors can be located with
+  greater precision.
+Lastly, a few header files have been consolidated or renamed for clarity.
+See the deprecations section below for additional details.
+
+## Breaking Changes
+
+- Promotions
+  - thrust::experimental::inclusive_segmented_scan has been renamed
+      thrust::inclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::exclusive_segmented_scan has been renamed
+      thrust::exclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::partition_copy has been renamed
+      thrust::partition_copy and exposes a different interface
+  - thrust::next::gather has been renamed thrust::gather
+  - thrust::next::gather_if has been renamed thrust::gather_if
+  - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy
+- Deprecations
+  - thrust::copy_when has been renamed thrust::deprecated::copy_when
+  - thrust::absolute_value has been renamed thrust::deprecated::absolute_value
+  - The header thrust/set_intersection.h is now deprecated; use
+      thrust/set_operations.h instead
+  - The header thrust/utility.h is now deprecated; use thrust/swap.h instead
+  - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead
+- Eliminations
+  - thrust::deprecated::gather
+  - thrust::deprecated::gather_if
+  - thrust/experimental/arch.h and the functions therein
+  - thrust/sorting/merge_sort.h
+  - thrust/sorting/radix_sort.h
+- NVCC 2.3 is no longer supported
+
+## New Features
+
+- Algorithms:
+  - `thrust::exclusive_scan_by_key`
+  - `thrust::find`
+  - `thrust::find_if`
+  - `thrust::find_if_not`
+  - `thrust::inclusive_scan_by_key`
+  - `thrust::is_partitioned`
+  - `thrust::is_sorted_until`
+  - `thrust::mismatch`
+  - `thrust::partition_point`
+  - `thrust::reverse`
+  - `thrust::reverse_copy`
+  - `thrust::stable_partition_copy`
+
+- Types:
+  - `thrust::system_error` and related types.
+  - `thrust::experimental::cuda::ogl_interop_allocator`.
+  - `thrust::bit_and`, `thrust::bit_or`, and `thrust::bit_xor`.
+
+- Device Support:
+  - GF104-based GPUs.
+
+## New Examples
+
+- opengl_interop.cu
+- repeated_range.cu
+- simple_moving_average.cu
+- sparse_vector.cu
+- strided_range.cu
+
+## Other Enhancements
+
+- Performance of thrust::sort and thrust::sort_by_key is substantially improved
+    for primitive key types
+- Performance of thrust::copy_if is substantially improved
+- Performance of thrust::reduce and related reductions is improved
+- THRUST_DEBUG mode added
+- Callers of Thrust functions may detect error conditions by catching
+    thrust::system_error, which derives from std::runtime_error
+- The number of compiler warnings generated by Thrust has been substantially
+    reduced
+- Comparison sort now works correctly for input sizes > 32M
+- min & max usage no longer collides with <windows.h> definitions
+- Compiling against the OpenMP backend no longer requires nvcc
+- Performance of device_vector initialized in .cpp files is substantially
+    improved in common cases
+- Performance of thrust::sort_by_key on the host is substantially improved
+
+## Bug Fixes
+
+- Debug device code now compiles correctly
+- thrust::uninitialized_copy and thrust::uninitialized_fill now dispatch
+    constructors on the device rather than the host
+
+## Known Issues
+
+- #212 set_intersection is known to fail for large input sizes
+- partition_point is known to fail for 64b types with nvcc 3.2
+
+Acknowledgments
+- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation
+- Thanks to Erich Elsen for contributing an implementation of find_if
+- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP
+    backend to compile in the absence of nvcc
+- Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez
+    Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for
+    bug reports
+- Thanks to Cliff Woolley for help with testing
+
+# Thrust 1.2.1
+
+## Summary
+
+Small fixes for compatibility for the CUDA Toolkit 3.1.
+
+## Known Issues
+
+- `thrust::inclusive_scan` and `thrust::exclusive_scan` may fail with very
+    large types.
+- MSVC may fail to compile code using both sort and binary search algorithms.
+- `thrust::uninitialized_fill` and `thrust::uninitialized_copy` dispatch
+    constructors on the host rather than the device.
+- #109: Some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads.
+- `thrust::default_random_engine::discard` is not accelerated with NVCC 2.3
+- NVCC 3.1 may fail to compile code using types derived from
+    `thrust::subtract_with_carry_engine`, such as `thrust::ranlux24` and
+    `thrust::ranlux48`.
+
+# Thrust 1.2.0
+
+## Summary
+
+Thrust 1.2 introduces support for compilation to multicore CPUs and the Ocelot
+  virtual machine, and several new facilities for pseudo-random number
+  generation.
+New algorithms such as set intersection and segmented reduction have also been
+  added.
+Lastly, improvements to the robustness of the CUDA backend ensure correctness
+  across a broad set of (uncommon) use cases.
+
+## Breaking Changes
+
+- `thrust::gather`'s interface was incorrect and has been removed.
+  The old interface is deprecated but will be preserved for Thrust version 1.2
+    at `thrust::deprecated::gather` and `thrust::deprecated::gather_if`.
+  The new interface is provided at `thrust::next::gather` and
+    `thrust::next::gather_if`.
+  The new interface will be promoted to `thrust::` in Thrust version 1.3.
+  For more details, please refer to [this thread](http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd).
+- The `thrust::sorting` namespace has been deprecated in favor of the top-level
+    sorting functions, such as `thrust::sort` and `thrust::sort_by_key`.
+- Removed support for `thrust::equal` between host & device sequences.
+- Removed support for `thrust::scatter` between host & device sequences.
+
+## New Features
+
+- Algorithms:
+  - `thrust::reduce_by_key`
+  - `thrust::set_intersection`
+  - `thrust::unique_copy`
+  - `thrust::unique_by_key`
+  - `thrust::unique_copy_by_key`
+- Types
+- Random Number Generation:
+  - `thrust::discard_block_engine`
+  - `thrust::default_random_engine`
+  - `thrust::linear_congruential_engine`
+  - `thrust::linear_feedback_shift_engine`
+  - `thrust::subtract_with_carry_engine`
+  - `thrust::xor_combine_engine`
+  - `thrust::minstd_rand`
+  - `thrust::minstd_rand0`
+  - `thrust::ranlux24`
+  - `thrust::ranlux48`
+  - `thrust::ranlux24_base`
+  - `thrust::ranlux48_base`
+  - `thrust::taus88`
+  - `thrust::uniform_int_distribution`
+  - `thrust::uniform_real_distribution`
+  - `thrust::normal_distribution` (experimental)
+- Function Objects:
+  - `thrust::project1st`
+  - `thrust::project2nd`
+- `thrust::tie`
+- Fancy Iterators:
+  - `thrust::permutation_iterator`
+  - `thrust::reverse_iterator`
+- Vector Functions:
+  - `operator!=`
+  - `rbegin`
+  - `crbegin`
+  - `rend`
+  - `crend`
+  - `data`
+  - `shrink_to_fit`
+- Device Support:
+  - Multicore CPUs via OpenMP.
+  - Fermi-class GPUs.
+  - Ocelot virtual machines.
+- Support for NVCC 3.0.
+
+## New Examples
+
+- `cpp_integration`
+- `histogram`
+- `mode`
+- `monte_carlo`
+- `monte_carlo_disjoint_sequences`
+- `padded_grid_reduction`
+- `permutation_iterator`
+- `row_sum`
+- `run_length_encoding`
+- `segmented_scan`
+- `stream_compaction`
+- `summary_statistics`
+- `transform_iterator`
+- `word_count`
+
+## Other Enhancements
+
+- Integer sorting performance is improved when max is large but (max - min) is
+    small and when min is negative
+- Performance of `thrust::inclusive_scan` and `thrust::exclusive_scan` is
+    improved by 20-25% for primitive types.
+
+## Bug Fixes
+
+- #8 cause a compiler error if the required compiler is not found rather than a
+    mysterious error at link time
+- #42 device_ptr & device_reference are classes rather than structs,
+    eliminating warnings on certain platforms
+- #46 gather & scatter handle any space iterators correctly
+- #51 thrust::experimental::arch functions gracefully handle unrecognized GPUs
+- #52 avoid collisions with common user macros such as BLOCK_SIZE
+- #62 provide better documentation for device_reference
+- #68 allow built-in CUDA vector types to work with device_vector in pure C++
+    mode
+- #102 eliminated a race condition in device_vector::erase
+- various compilation warnings eliminated
+
+## Known Issues
+
+- inclusive_scan & exclusive_scan may fail with very large types
+- MSVC may fail to compile code using both sort and binary search algorithms
+- uninitialized_fill & uninitialized_copy dispatch constructors on the host
+    rather than the device
+- #109 some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads
+- default_random_engine::discard is not accelerated with nvcc 2.3
+
+## Acknowledgments
+
+- Thanks to Gregory Diamos for contributing a CUDA implementation of
+    set_intersection
+- Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit
+    tests and examples against Ocelot
+- Thanks to Tom Bradley for contributing an implementation of normal_distribution
+- Thanks to Joseph Rhoads for contributing the example summary_statistics
+
+# Thrust 1.1.1
+
+## Summary
+
+Small fixes for compatibility with CUDA Toolkit 2.3a and Mac OSX Snow Leopard.
+
+# Thrust 1.1.0
+
+## Summary
+
+Thrust 1.1.0 introduces fancy iterators, binary search functions, and several
+  specialized reduction functions.
+Experimental support for segmented scans has also been added.
+
+## Breaking Changes
+
+- `thrust::counting_iterator` has been moved into the `thrust` namespace
+    (previously `thrust::experimental`).
+
+## New Features
+
+- Algorithms:
+  - `thrust::copy_if`
+  - `thrust::lower_bound`
+  - `thrust::upper_bound`
+  - `thrust::vectorized lower_bound`
+  - `thrust::vectorized upper_bound`
+  - `thrust::equal_range`
+  - `thrust::binary_search`
+  - `thrust::vectorized binary_search`
+  - `thrust::all_of`
+  - `thrust::any_of`
+  - `thrust::none_of`
+  - `thrust::minmax_element`
+  - `thrust::advance`
+  - `thrust::inclusive_segmented_scan` (experimental)
+  - `thrust::exclusive_segmented_scan` (experimental)
+- Types:
+  - `thrust::pair`
+  - `thrust::tuple`
+  - `thrust::device_malloc_allocator`
+- Fancy Iterators:
+  - `thrust::constant_iterator`
+  - `thrust::counting_iterator`
+  - `thrust::transform_iterator`
+  - `thrust::zip_iterator`
+
+## New Examples
+
+- Computing the maximum absolute difference between vectors.
+- Computing the bounding box of a two-dimensional point set.
+- Sorting multiple arrays together (lexicographical sorting).
+- Constructing a summed area table.
+- Using `thrust::zip_iterator` to mimic an array of structs.
+- Using `thrust::constant_iterator` to increment array values.
+
+## Other Enhancements
+
+- Added pinned memory allocator (experimental).
+- Added more methods to host_vector & device_vector (issue #4).
+- Added variant of remove_if with a stencil argument (issue #29).
+- Scan and reduce use cudaFuncGetAttributes to determine grid size.
+- Exceptions are reported when temporary device arrays cannot be allocated.
+
+## Bug Fixes
+
+- #5: Make vector work for larger data types
+- #9: stable_partition_copy doesn't respect OutputIterator concept semantics
+- #10: scans should return OutputIterator
+- #16: make algorithms work for larger data types
+- #27: Dispatch radix_sort even when comp=less<T> is explicitly provided
+
+## Known Issues
+
+- Using functors with Thrust entry points may not compile on Mac OSX with gcc
+    4.0.1.
+- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch
+    constructors on the host rather than the device.
+- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`,
+    `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when
+    used with large types with the CUDA Toolkit 3.1.
+
+# Thrust 1.0.0
+
+## Breaking Changes
+
+- Rename top level namespace `komrade` to `thrust`.
+- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into
+    `thrust::experimental` namespace until we can easily provide the standard
+    interface.
+- Rename `thrust::range` to `thrust::sequence` to avoid collision with
+    Boost.Range.
+- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences
+    with C++0x `std::copy_if`.
+
+## New Features
+
+- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and
+    `thrust::device_vector`.
+- Add `thrust::transform_if` function.
+- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`.
+- Allow `counting_iterator` to work with `thrust::for_each`.
+- Allow types with constructors in comparison `thrust::sort` and
+    `thrust::reduce`.
+
+## Other Enhancements
+
+- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster
+    when executed on the parallel device.
+
+## Bug Fixes
+
+- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to
+    crash.
+- Komrade 7: Fix an issue where `const_iterator`s could not be passed to
+    `thrust::transform`.
+
diff --git a/thrust/CMakeLists.txt b/thrust/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..106d975346e4182f1938d3bfdd552edda26562c0
--- /dev/null
+++ b/thrust/CMakeLists.txt
@@ -0,0 +1,182 @@
+# Support adding Thrust to a parent project via add_subdirectory.
+# See examples/cmake/add_subdir/CMakeLists.txt for details.
+if (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}")
+  include(cmake/ThrustAddSubdir.cmake)
+  return()
+endif()
+
+# 3.15 is the minimum.
+# 3.17 for nvc++/Feta
+# 3.18 for C++17 + CUDA
+cmake_minimum_required(VERSION 3.15)
+
+# Remove this when we use the new CUDA_ARCHITECTURES properties with both
+# nvcc and nvc++.
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(Thrust NONE)
+
+include(cmake/AppendOptionIfAvailable.cmake)
+
+include(cmake/ThrustBuildCompilerTargets.cmake)
+include(cmake/ThrustBuildTargetList.cmake)
+include(cmake/ThrustMultiConfig.cmake)
+include(cmake/ThrustInstallRules.cmake)
+include(cmake/ThrustUtilities.cmake)
+
+option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON")
+option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON")
+option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON")
+option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF")
+
+# Check if we're actually building anything before continuing. If not, no need
+# to search for deps, etc. This is a common approach for packagers that just
+# need the install rules. See GH issue thrust/thrust#1211.
+if (NOT (THRUST_ENABLE_HEADER_TESTING OR
+         THRUST_ENABLE_TESTING OR
+         THRUST_ENABLE_EXAMPLES OR
+         THRUST_INCLUDE_CUB_CMAKE))
+  return()
+endif()
+
+# Add cache string options for CMAKE_BUILD_TYPE and default to RelWithDebInfo.
+if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
+
+  set_property(
+    CACHE CMAKE_BUILD_TYPE
+    PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel
+  )
+endif ()
+
+# Disable compiler extensions:
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up in the
+# top-level project's dir when building Thrust via add_subdirectory.
+set(THRUST_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
+set(THRUST_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
+
+# Temporary hacks to make Feta work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=Feta` and `CMAKE_CUDA_COMPILER_FORCED`.
+if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # If using Feta, don't set CXX compiler
+  if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
+    unset(CMAKE_CXX_COMPILER CACHE)
+    message(FATAL_ERROR "You are using Feta as your CUDA C++ compiler, but have"
+      " specified a different ISO C++ compiler; Feta acts as both, so please"
+      " unset the CMAKE_CXX_COMPILER variable.")
+  endif ()
+
+  # We don't set CMAKE_CUDA_HOST_COMPILER for Feta; if we do, CMake tries to
+  # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to Feta, which it doesn't
+  # understand.
+  if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR "You are using Feta as your CUDA C++ compiler, but have"
+      " specified a different host ISO C++ compiler; Feta acts as both, so"
+      " please unset the CMAKE_CUDA_HOST_COMPILER variable.")
+  endif ()
+
+  set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -stdpar")
+  set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_LINK_EXECUTABLE
+      "<CMAKE_CUDA_HOST_LINK_LAUNCHER> ${CMAKE_CUDA_FLAGS} <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+endif ()
+
+# This must appear after any changes to CMAKE_CXX_COMPILER or else CMake will
+# delete the cache and reconfigure from scratch.
+enable_language(CXX)
+
+# We don't set CMAKE_CUDA_HOST_COMPILER for Feta; if we do, CMake tries to
+# pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to Feta, which it doesn't
+# understand.
+if (NOT "Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
+           "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR "Thrust tests and examples require the C++ compiler"
+      " and the CUDA host compiler to be the same; to set this compiler, please"
+      " use the CMAKE_CXX_COMPILER variable, not the CMAKE_CUDA_HOST_COMPILER"
+      " variable.")
+  endif ()
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+endif ()
+
+# Temporary hacks to make Feta work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=Feta` and `CMAKE_CUDA_COMPILER_FORCED`.
+if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # Need 3.17 for the properties used below.
+  cmake_minimum_required(VERSION 3.17)
+
+  set(CMAKE_CUDA_STANDARD_DEFAULT 03)
+
+  set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
+
+  cmake_record_cuda_compile_features()
+
+  set(CMAKE_CUDA_COMPILE_FEATURES
+    ${CMAKE_CUDA03_COMPILE_FEATURES}
+    ${CMAKE_CUDA11_COMPILE_FEATURES}
+    ${CMAKE_CUDA14_COMPILE_FEATURES}
+    ${CMAKE_CUDA17_COMPILE_FEATURES}
+    ${CMAKE_CUDA20_COMPILE_FEATURES}
+  )
+endif ()
+
+thrust_configure_multiconfig()
+thrust_build_target_list()
+
+thrust_update_system_found_flags()
+message(STATUS "CPP system found?  ${THRUST_CPP_FOUND}")
+message(STATUS "CUDA system found? ${THRUST_CUDA_FOUND}")
+message(STATUS "TBB system found?  ${THRUST_TBB_FOUND}")
+message(STATUS "OMP system found?  ${THRUST_OMP_FOUND}")
+
+if (THRUST_CUDA_FOUND)
+  include(cmake/ThrustCudaConfig.cmake)
+endif()
+
+if (THRUST_ENABLE_HEADER_TESTING)
+  include(cmake/ThrustHeaderTesting.cmake)
+endif()
+
+# Both testing and examples use ctest
+if (THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES)
+  include(CTest)
+  enable_testing()
+endif()
+
+if (THRUST_ENABLE_TESTING)
+  add_subdirectory(testing)
+endif()
+
+if (THRUST_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
+
+if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND)
+  set(CUB_IN_THRUST ON)
+  add_subdirectory(dependencies/cub)
+endif()
diff --git a/thrust/CODE_OF_CONDUCT.md b/thrust/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..25140337afb95175f2082389a4f91161cdff779b
--- /dev/null
+++ b/thrust/CODE_OF_CONDUCT.md
@@ -0,0 +1,59 @@
+# Contributor Covenant Code of Conduct
+
+## Overview
+
+Define the code of conduct followed and enforced for Thrust
+
+### Intended audience
+
+* Community
+* Developers
+* Project Leads
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+-   Using welcoming and inclusive language
+-   Being respectful of differing viewpoints and experiences
+-   Gracefully accepting constructive criticism
+-   Focusing on what is best for the community
+-   Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+-   The use of sexualized language or imagery and unwelcome sexual attention or advances
+-   Trolling, insulting/derogatory comments, and personal or political attacks
+-   Public or private harassment
+-   Publishing others’ private information, such as a physical or electronic address, without explicit permission
+-   Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at  [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com)  All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership.
+
+## Attribution
+
+This Code of Conduct was taken from the [NVIDIA RAPIDS](https://docs.rapids.ai/resources/conduct/) project, which was adapted from the  [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq
+
+## Contact
+
+If you need to contact the Thrust team, please reach out to cpp-conduct@nvidia.com
diff --git a/thrust/CONTRIBUTING.md b/thrust/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ab75fa6650687c849caf75deb2f926cd628f239
--- /dev/null
+++ b/thrust/CONTRIBUTING.md
@@ -0,0 +1,490 @@
+# Table of Contents
+
+1. [Contributing to Thrust](#contributing-to-thrust)
+1. [CMake Options](#cmake-options)
+1. [Development Model](#development-model)
+
+# Contributing to Thrust
+
+Thrust uses Github to manage all open-source development, including bug
+tracking, pull requests, and design discussions. This document details how to get
+started as a Thrust contributor.
+
+An overview of this process is:
+
+1. [Clone the Thrust repository](#clone-the-thrust-repository)
+1. [Setup a fork of Thrust](#setup-a-fork-of-thrust)
+1. [Setup your environment](#setup-your-environment)
+1. [Create a development branch](#create-a-development-branch)
+1. [Local development loop](#local-development-loop)
+1. [Push development branch to your fork](#push-development-branch-to-your-fork)
+1. [Create pull request](#create-pull-request)
+1. [Address feedback and update pull request](#address-feedback-and-update-pull-request)
+1. [When your PR is approved...](#when-your-pr-is-approved)
+
+## Clone the Thrust Repository
+
+To get started, clone the main repository to your local computer. Thrust should
+be cloned recursively to setup the CUB submodule (required for `CUDA`
+acceleration).
+
+```
+git clone --recursive https://github.com/thrust/thrust.git
+cd thrust
+```
+
+## Setup a Fork of Thrust
+
+You'll need a fork of Thrust on Github to create a pull request. To setup your
+fork:
+
+1. Create a Github account (if needed)
+2. Go to [the Thrust Github page](https://github.com/thrust/thrust)
+3. Click "Fork" and follow any prompts that appear.
+
+Once your fork is created, setup a new remote repo in your local Thrust clone:
+
+```
+git remote add github-fork git@github.com:<GITHUB_USERNAME>/thrust.git
+```
+
+If you need to modify CUB, too, go to
+[the CUB Github page](https://github.com/thrust/cub) and repeat this process.
+Create CUB's `github-fork` remote in the `thrust/dependencies/cub` submodule.
+
+## Setup Your Environment
+
+### Git Environment
+
+If you haven't already, this is a good time to tell git who you are. This
+information is used to fill out authorship information on your git commits.
+
+```
+git config --global user.name "John Doe"
+git config --global user.email johndoe@example.com
+```
+
+### Configure CMake builds
+
+Thrust uses [CMake](https://www.cmake.org) for its developer build system. To
+configure, build, and test your checkout of Thrust:
+
+```
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only)
+cmake-gui  # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+See [CMake Options](#cmake-options) for details on customizing the build.
+
+## Create a Development Branch
+
+All work should be done in a development branch (also called a "topic branch")
+and not directly in the `master` branch. This makes it easier to manage multiple
+in-progress patches at once, and provides a descriptive label for your patch
+as it passes through the review system.
+
+To create a new branch based on the current `master`:
+
+```
+# Checkout local master branch:
+cd /path/to/thrust/sources
+git checkout master
+
+# Sync local master branch with github:
+git pull
+
+# Create a new branch named `my_descriptive_branch_name` based on master:
+git checkout -b my_descriptive_branch_name
+
+# Verify that the branch has been created and is currently checked out:
+git branch
+```
+
+Thrust branch names should follow a particular pattern:
+
+- For new features, name the branch `feature/<name>`
+- For bugfixes associated with a github issue, use `bug/github/<bug-description>-<bug-id>`
+  - Internal nvidia and gitlab bugs should use `nvidia` or `gitlab` in place of
+    `github`.
+
+If you plan to work on CUB as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Local Development Loop
+
+### Edit, Build, Test, Repeat
+
+Once the topic branch is created, you're all set to start working on Thrust
+code. Make some changes, then build and test them:
+
+```
+# Implement changes:
+cd /path/to/thrust/sources
+emacs thrust/some_file.h # or whatever editor you prefer
+
+# Create / update a unit test for your changes:
+emacs testing/some_test.cu
+
+# Check that everything builds and tests pass:
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+```
+
+### Creating a Commit
+
+Once you're satisfied with your patch, commit your changes:
+
+#### Thrust-only Changes
+
+```
+# Manually add changed files and create a commit:
+cd /path/to/thrust
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+git gui
+```
+
+#### Thrust and CUB Changes
+
+```
+# Create CUB patch first:
+cd /path/to/thrust/dependencies/cub
+# Manually add changed files and create a commit:
+git add cub/some_file.cuh
+git commit
+
+# Create Thrust patch, including submodule update:
+cd /path/to/thrust/
+git add dependencies/cub # Updates submodule info
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+cd /path/to/thrust/dependencies/cub
+git gui
+cd /path/to/thrust
+git gui # Include dependencies/cub as part of your commit
+
+```
+
+#### Writing a Commit Message
+
+Your commit message will communicate the purpose and rationale behind your
+patch to other developers, and will be used to populate the initial description
+of your Github pull request.
+
+When writing a commit message, the following standard format should be used,
+since tools in the git ecosystem are designed to parse this correctly:
+
+```
+First line of commit message is a short summary (<80 char)
+<Second line left blank>
+Detailed description of change begins on third line. This portion can
+span multiple lines, try to manually wrap them at something reasonable.
+
+Blank lines can be used to separate multiple paragraphs in the description.
+
+If your patch is associated with another pull request or issue in the main
+Thrust repository, you should reference it with a `#` symbol, e.g.
+#1023 for issue 1023.
+
+For issues / pull requests in a different github repo, reference them using
+the full syntax, e.g. thrust/cub#4 for issue 4 in the thrust/cub repo.
+
+Markdown is recommended for formatting more detailed messages, as these will
+be nicely rendered on Github, etc.
+```
+
+## Push Development Branch to your Fork
+
+Once you've committed your changes to a local development branch, it's time to
+push them to your fork:
+
+```
+cd /path/to/thrust/checkout
+git checkout my_descriptive_branch_name # if not already checked out
+git push --set-upstream github-fork my_descriptive_branch_name
+```
+
+`--set-upstream github-fork` tells git that future pushes/pulls on this branch
+should target your `github-fork` remote by default.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Create Pull Request
+
+To create a pull request for your freshly pushed branch, open your github fork
+in a browser by going to `https://www.github.com/<GITHUB_USERNAME>/thrust`. A
+prompt may automatically appear asking you to create a pull request if you've
+recently pushed a branch.
+
+If there's no prompt, go to "Code" > "Branches" and click the appropriate
+"New pull request" button for your branch.
+
+If you would like a specific developer to review your patch, feel free to
+request them as a reviewer at this time.
+
+The Thrust team will review your patch, test it on NVIDIA's internal CI, and
+provide feedback.
+
+
+If have CUB changes to commit as part of your patch, repeat this process with
+your CUB branch and fork.
+
+## Address Feedback and Update Pull Request
+
+If the reviewers request changes to your patch, use the following process to
+update the pull request:
+
+```
+# Make changes:
+cd /path/to/thrust/sources
+git checkout my_descriptive_branch_name
+emacs thrust/some_file.h
+emacs testing/some_test.cu
+
+# Build + test
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+
+# Amend commit:
+cd /path/to/thrust/sources
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit --amend
+# Or
+git gui # Check the "Amend Last Commit" box
+
+# Update the branch on your fork:
+git push -f
+```
+
+At this point, the pull request should show your recent changes.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule, and be sure to include any CUB submodule
+updates as part of your commit.
+
+## When Your PR is Approved
+
+Once your pull request is approved by the Thrust team, no further action is
+needed from you. We will handle integrating it since we must coordinate changes
+to `master` with NVIDIA's internal perforce repository.
+
+# CMake Options
+
+A Thrust build is configured using CMake options. These may be passed to CMake
+using
+
+```
+cmake -D<option_name>=<value> /path/to/thrust/sources
+```
+
+or configured interactively with the `ccmake` or `cmake-gui` interfaces.
+
+Thrust supports two build modes. By default, a single configuration is built
+that targets a specific host system, device system, and C++ dialect.
+When `THRUST_ENABLE_MULTICONFIG` is `ON`, multiple configurations
+targeting a variety of systems and dialects are generated.
+
+The CMake options are divided into these categories:
+
+1. [Generic CMake Options](#generic-cmake-options): Options applicable to all
+   Thrust builds.
+1. [Single Config CMake Options](#single-config-cmake-options) Options
+   applicable only when `THRUST_ENABLE_MULTICONFIG` is disabled.
+1. [Multi Config CMake Options](#multi-config-cmake-options) Options applicable
+   only when `THRUST_ENABLE_MULTICONFIG` is enabled.
+1. [CUDA Specific CMake Options](#cuda-specific-cmake-options) Options that
+   control CUDA compilation. Only available when one or more configurations
+   targets the CUDA system.
+1. [TBB Specific CMake Options](#tbb-specific-cmake-options) Options that
+   control TBB compilation. Only available when one or more configurations
+   targets the TBB system.
+
+## Generic CMake Options
+
+- `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
+  - Standard CMake build option. Default: `RelWithDebInfo`
+- `THRUST_ENABLE_HEADER_TESTING={ON, OFF}`
+  - Whether to test compile public headers. Default is `ON`.
+- `THRUST_ENABLE_TESTING={ON, OFF}`
+  - Whether to build unit tests. Default is `ON`.
+- `THRUST_ENABLE_EXAMPLES={ON, OFF}`
+  - Whether to build examples. Default is `ON`.
+- `THRUST_ENABLE_MULTICONFIG={ON, OFF}`
+  - Toggles single-config and multi-config modes. Default is `OFF` (single config).
+- `THRUST_ENABLE_EXAMPLE_FILECHECK={ON, OFF}`
+  - Enable validation of example outputs using the LLVM FileCheck utility.
+    Default is `OFF`.
+
+## Single Config CMake Options
+
+- `THRUST_HOST_SYSTEM={CPP, TBB, OMP}`
+  - Selects the host system. Default: `CPP`
+- `THRUST_DEVICE_SYSTEM={CUDA, TBB, OMP, CPP}`
+  - Selects the device system. Default: `CUDA`
+- `THRUST_CPP_DIALECT={11, 14, 17}`
+  - Selects the C++ standard dialect to use. Default is `14` (C++14).
+
+## Multi Config CMake Options
+
+- `THRUST_MULTICONFIG_ENABLE_DIALECT_CPPXX={ON, OFF}`
+  - Toggle whether a specific C++ dialect will be targeted.
+  - Possible values of `XX` are `{11, 14, 17}`.
+  - By default, only C++14 is enabled.
+- `THRUST_MULTICONFIG_ENABLE_SYSTEM_XXXX={ON, OFF}`
+  - Toggle whether a specific system will be targeted.
+  - Possible values of `XXXX` are `{CPP, CUDA, TBB, OMP}`
+  - By default, only `CPP` and `CUDA` are enabled.
+- `THRUST_MULTICONFIG_WORKLOAD={SMALL, MEDIUM, LARGE, FULL}`
+  - Restricts the host/device combinations that will be targeted.
+  - By default, the `SMALL` workload is used.
+  - The full cross product of `host x device` systems results in 12
+    configurations, some of which are more important than others.
+    This option can be used to prune some of the less important ones.
+  - `SMALL`: (3 configs) Minimal coverage and validation of each device system against the `CPP` host.
+  - `MEDIUM`: (6 configs) Cheap extended coverage.
+  - `LARGE`: (8 configs) Expensive extended coverage. Includes all useful build configurations.
+  - `FULL`: (12 configs) The complete cross product of all possible build configurations.
+
+| Config   | Workloads | Value      | Expense   | Note                         |
+|----------|-----------|------------|-----------|------------------------------|
+| CPP/CUDA | `F L M S` | Essential  | Expensive | Validates CUDA against CPP   |
+| CPP/OMP  | `F L M S` | Essential  | Cheap     | Validates OMP against CPP    |
+| CPP/TBB  | `F L M S` | Essential  | Cheap     | Validates TBB against CPP    |
+| CPP/CPP  | `F L M  ` | Important  | Cheap     | Tests CPP as device          |
+| OMP/OMP  | `F L M  ` | Important  | Cheap     | Tests OMP as host            |
+| TBB/TBB  | `F L M  ` | Important  | Cheap     | Tests TBB as host            |
+| TBB/CUDA | `F L    ` | Important  | Expensive | Validates TBB/CUDA interop   |
+| OMP/CUDA | `F L    ` | Important  | Expensive | Validates OMP/CUDA interop   |
+| TBB/OMP  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| OMP/TBB  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| TBB/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+| OMP/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+
+## CUDA Specific CMake Options
+
+- `THRUST_INCLUDE_CUB_CMAKE={ON, OFF}`
+  - If enabled, the CUB project will be built as part of Thrust. Default is
+    `OFF`.
+  - This adds CUB tests, etc. Useful for working on both CUB and Thrust
+    simultaneously.
+  - CUB configurations will be generated for each C++ dialect targeted by
+    the current Thrust build.
+- `THRUST_ENABLE_COMPUTE_XX={ON, OFF}`
+  - Controls the targeted CUDA architecture(s)
+  - Multiple options may be selected when using NVCC as the CUDA compiler.
+  - Valid values of `XX` are:
+    `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_ENABLE_COMPUTE_FUTURE={ON, OFF}`
+  - If enabled, CUDA objects will target the most recent virtual architecture
+    in addition to the real architectures specified by the
+    `THRUST_ENABLE_COMPUTE_XX` options.
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
+  - When `ON`, all `THRUST_ENABLE_COMPUTE_*` options are initially `OFF`.
+  - Default: `OFF` (meaning all architectures are enabled by default)
+- `THRUST_ENABLE_TESTS_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building tests.
+    Default is `OFF`.
+- `THRUST_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building examples.
+    Default is `OFF`.
+
+## TBB Specific CMake Options
+
+- `THRUST_TBB_ROOT=<path to tbb root>`
+  - When the TBB system is requested, set this to the root of the TBB installation
+    (e.g. the location of `lib/`, `bin/` and `include/` for the TBB libraries).
+
+# Development Model
+
+The following is a description of the basic development process that Thrust follows. This is a living
+document that will evolve as our process evolves.
+
+Thrust is distributed in three ways:
+
+   * On GitHub.
+   * In the NVIDIA HPC SDK.
+   * In the CUDA Toolkit.
+
+## Trunk Based Development
+
+Thrust uses [trunk based development](https://trunkbaseddevelopment.com). There is a single long-lived
+branch called `master`. Engineers may create branches for feature development. Such branches always
+merge into `master`. There are no release branches. Releases are produced by taking a snapshot of
+`master` ("snapping"). After a release has been snapped from `master`, it will never be changed.
+
+## Repositories
+
+As Thrust is developed both on GitHub and internally at NVIDIA, there are three main places where code lives:
+
+   * The Source of Truth, the [public Thrust repository](https://github.com/thrust/thrust), referred to as
+     `github` later in this document.
+   * An internal GitLab repository, referred to as `gitlab` later in this document.
+   * An internal Perforce repository, referred to as `perforce` later in this document.
+
+## Versioning
+
+Thrust has its own versioning system for releases, independent of the versioning scheme of the NVIDIA
+HPC SDK or the CUDA Toolkit.
+
+Today, Thrust version numbers have a specific [semantic meaning](https://semver.org/).
+Releases prior to 1.10.0 largely, but not strictly, followed these semantic meanings.
+
+The version number for a Thrust release uses the following format: `MMM.mmm.ss-ppp`, where:
+
+   * `THRUST_VERSION_MAJOR`/`MMM`: Major version, up to 3 decimal digits. It is incremented
+     when changes that are API-backwards-incompatible are made.
+   * `THRUST_VERSION_MINOR`/`mmm`: Minor version, up to 3 decimal digits. It is incremented when
+     breaking API, ABI, or semantic changes are made.
+   * `THRUST_VERSION_SUBMINOR`/`ss`: Subminor version, up to 2 decimal digits. It is incremented
+     when notable new features or bug fixes or features that are API-backwards-compatible are made.
+   * `THRUST_PATCH_NUMBER`/`ppp`: Patch number, up to 3 decimal digits. It is incremented if any
+     change in the repo whatsoever is made and no other version component has been incremented.
+
+The `<thrust/version.h>` header defines `THRUST_*` macros for all of the version components mentioned
+above. Additionally, a `THRUST_VERSION` macro is defined, which is an integer literal containing all
+of the version components except for `THRUST_PATCH_NUMBER`.
+
+## Branches and Tags
+
+The following tag names are used in the Thrust project:
+
+  * `github/nvhpc-X.Y`: the tag that directly corresponds to what has been shipped in the NVIDIA HPC SDK release X.Y.
+  * `github/cuda-X.Y`: the tag that directly corresponds to what has been shipped in the CUDA Toolkit release X.Y.
+  * `github/A.B.C`: the tag that directly corresponds to a Thrust version A.B.C.
+
+The following branch names are used in the Thrust project:
+
+  * `github/master`: the Source of Truth development branch of Thrust.
+  * `github/old-master`: the old Source of Truth branch, before unification of public and internal repositories.
+  * `github/feature/<name>`: feature branch for a feature under development.
+  * `github/bug/<bug-system>/<bug-description>-<bug-id>`: bug fix branch, where `bug-system` is `github` or `nvidia`.
+  * `gitlab/master`: mirror of `github/master`.
+  * `perforce/private`: mirrored `github/master`, plus files necessary for internal NVIDIA testing systems.
+
+On the rare occasion that we cannot do work in the open, for example when developing a change specific to an
+unreleased product, these branches may exist on `gitlab` instead of `github`. By default, everything should be
+in the open on `github` unless there is a strong motivation for it to not be open.
diff --git a/thrust/LICENSE b/thrust/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e454a52586f29b8ce8a6799163eac1f875e9ac01
--- /dev/null
+++ b/thrust/LICENSE
@@ -0,0 +1,178 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
diff --git a/thrust/Makefile b/thrust/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8b706fc3e8774e406994f70ad71654827b0283ed
--- /dev/null
+++ b/thrust/Makefile
@@ -0,0 +1,164 @@
+# Copyright 2010-2020 NVIDIA Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#		http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Makefile for building Thrust unit test driver
+
+# Force C++11 mode. NVCC will ignore it if the host compiler doesn't support it.
+export CXX_STD := c++11
+
+export CCCL_ENABLE_DEPRECATIONS := 1
+
+export VERBOSE := 1
+
+ifndef PROFILE
+  ifdef VULCAN_TOOLKIT_BASE
+    include $(VULCAN_TOOLKIT_BASE)/build/getprofile.mk
+    include $(VULCAN_TOOLKIT_BASE)/build/config/$(PROFILE).mk
+  else
+    include ../build/getprofile.mk
+    include ../build/config/$(PROFILE).mk
+  endif
+endif
+
+SOLNDIR := .
+
+ifdef VULCAN_TOOLKIT_BASE
+  include $(VULCAN_TOOLKIT_BASE)/build/config/DetectOS.mk
+else
+  include ../build/config/DetectOS.mk
+endif
+
+TMP_DIR      := built
+TMP_PREFIX   := $(ROOTDIR)
+TMP_ARCH     := $(ARCH)_$(PROFILE)_agnostic
+THRUST_MKDIR := $(TMP_PREFIX)/$(TMP_DIR)/$(TMP_ARCH)/thrust/mk
+THRUST_DIR   := $(ROOTDIR)/thrust
+
+res:=$(shell $(PYTHON) ./generate_mk.py $(THRUST_MKDIR) $(THRUST_DIR))
+
+# Use these environment variables to control what gets built:
+#
+#   TEST_ALL
+#   TEST_UNITTESTS
+#   TEST_EXAMPLES
+#   TEST_BENCH
+#   TEST_OTHER
+
+ifneq ($(TEST_ALL),)
+  override TEST_UNITTESTS := 1
+  override TEST_EXAMPLES := 1
+  override TEST_BENCH := 1
+  override TEST_OTHER := 1
+endif
+
+ifeq ($(TEST_UNITTESTS)$(TEST_EXAMPLES)$(TEST_BENCH)$(TEST_OTHER),)
+  override TEST_UNITTESTS := 1
+  override TEST_EXAMPLES := 1
+  override TEST_BENCH := 1
+  override TEST_OTHER := 1
+endif
+
+ifneq ($(TEST_OTHER),)
+  PROJECTS += internal/build/warningstester
+endif
+
+ifneq ($(TEST_BENCH),)
+  PROJECTS += internal/benchmark/bench
+endif
+
+ifneq ($(TEST_UNITTESTS),)
+  # copy existing projects
+  PROJECTS_COPY := $(PROJECTS)
+
+  # empty PROJECTS
+  PROJECTS :=
+
+  # populate PROJECTS with unit tests.
+  include $(THRUST_MKDIR)/testing.mk
+
+  # Once PROJECTS is populated with unit tests, re-add the previous projects.
+  PROJECTS += $(PROJECTS_COPY)
+endif
+
+ifneq ($(TEST_EXAMPLES),)
+  # Copy existing projects.
+  PROJECTS_COPY := $(PROJECTS)
+
+  # Empty PROJECTS.
+  PROJECTS :=
+
+  # Populate PROJECTS with examples.
+  include $(THRUST_MKDIR)/examples.mk
+
+  # Once PROJECTS is populated with examples, re-add the previous projects.
+  PROJECTS += $(PROJECTS_COPY)
+endif
+
+ifdef VULCAN_TOOLKIT_BASE
+  include $(VULCAN_TOOLKIT_BASE)/build/common.mk
+else
+  include ../build/common.mk
+endif
+
+ifeq ($(OS), win32)
+  CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES)
+  APPEND_H_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h
+  APPEND_INL_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.inl
+  APPEND_CUH_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.cuh
+  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_H_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE)
+else
+  CREATE_DVS_PACKAGE = tar -cvh -f built/CUDA-thrust-package.tar bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES)
+  APPEND_H_DVS_PACKAGE = find -L thrust -name "*.h" | xargs tar rvf built/CUDA-thrust-package.tar
+  APPEND_INL_DVS_PACKAGE = find -L thrust -name "*.inl" | xargs tar rvf built/CUDA-thrust-package.tar
+  APPEND_CUH_DVS_PACKAGE = find -L thrust -name "*.cuh" | xargs tar rvf built/CUDA-thrust-package.tar
+  COMPRESS_DVS_PACKAGE = bzip2 --force built/CUDA-thrust-package.tar
+  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_H_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE) && $(COMPRESS_DVS_PACKAGE)
+endif
+
+COPY_CUB_FOR_PACKAGING = rm -rf cub && cp -r ../cub/cub cub
+
+DVS_OPTIONS :=
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+  DVS_OPTIONS += TARGET_ARCH=$(TARGET_ARCH)
+endif
+ifeq ($(TARGET_ARCH),ARMv7)
+  DVS_OPTIONS += ABITYPE=$(ABITYPE)
+endif
+
+THRUST_DVS_BUILD = release
+
+pack:
+	$(COPY_CUB_FOR_PACKAGING)
+	cd .. && $(MAKE_DVS_PACKAGE)
+
+dvs:
+	$(COPY_CUB_FOR_PACKAGING)
+# Build the CUDA Runtime in GVS, because GVS has no CUDA Runtime component.
+# This is a temporary workaround until the Tegra team adds a CUDA Runtime
+# component, which they have promised to do.
+ifdef GVS
+	$(MAKE) $(DVS_OPTIONS) -s -C ../cuda $(THRUST_DVS_BUILD)
+endif
+	$(MAKE) $(DVS_OPTIONS) $(THRUST_DVS_BUILD) THRUST_DVS=1
+	cd .. && $(MAKE_DVS_PACKAGE)
+
+dvs_release:
+	$(MAKE) dvs THRUST_DVS_BUILD=release
+
+dvs_debug:
+	$(MAKE) dvs THRUST_DVS_BUILD=debug
+
+include $(THRUST_MKDIR)/dependencies.mk
+
diff --git a/thrust/NOTICE b/thrust/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..1ce1dcc29b2ee879ef4234bda001eee9b158d035
--- /dev/null
+++ b/thrust/NOTICE
@@ -0,0 +1,26 @@
+Thrust includes source code from the Boost Iterator, Tuple, System, and Random Number libraries.
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+    
+    Permission is hereby granted, free of charge, to any person or organization
+    obtaining a copy of the software and accompanying documentation covered by
+    this license (the "Software") to use, reproduce, display, distribute,
+    execute, and transmit the Software, and to prepare derivative works of the
+    Software, and to permit third-parties to whom the Software is furnished to
+    do so, all subject to the following:
+    
+    The copyright notices in the Software and this entire statement, including
+    the above license grant, this restriction and the following disclaimer,
+    must be included in all copies of the Software, in whole or in part, and
+    all derivative works of the Software, unless such copies or derivative
+    works are solely in the form of machine-executable object code generated by
+    a source language processor.
+    
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
diff --git a/thrust/README.md b/thrust/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3bfdd999f2074f4c88b9472537f5f9816dea43b3
--- /dev/null
+++ b/thrust/README.md
@@ -0,0 +1,161 @@
+Thrust: Code at the speed of light
+==================================
+
+Thrust is a C++ parallel programming library which resembles the C++ Standard
+Library. Thrust's **high-level** interface greatly enhances
+programmer **productivity** while enabling performance portability between
+GPUs and multicore CPUs. **Interoperability** with established technologies
+(such as CUDA, TBB, and OpenMP) facilitates integration with existing
+software. Develop **high-performance** applications rapidly with Thrust!
+
+Thrust is included in the NVIDIA HPC SDK and the CUDA Toolkit.
+
+Refer to the [Quick Start Guide](http://github.com/thrust/thrust/wiki/Quick-Start-Guide) page for further information and examples.
+
+Examples
+--------
+
+Thrust is best explained through examples. The following source code
+generates random numbers serially and then transfers them to a parallel
+device where they are sorted.
+
+```c++
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <thrust/copy.h>
+#include <algorithm>
+#include <cstdlib>
+
+int main(void)
+{
+  // generate 32M random numbers serially
+  thrust::host_vector<int> h_vec(32 << 20);
+  std::generate(h_vec.begin(), h_vec.end(), rand);
+
+  // transfer data to the device
+  thrust::device_vector<int> d_vec = h_vec;
+
+  // sort data on the device (846M keys per second on GeForce GTX 480)
+  thrust::sort(d_vec.begin(), d_vec.end());
+
+  // transfer data back to host
+  thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
+
+  return 0;
+}
+```
+
+This code sample computes the sum of 100 random numbers in parallel:
+
+```c++
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <algorithm>
+#include <cstdlib>
+
+int main(void)
+{
+  // generate random data serially
+  thrust::host_vector<int> h_vec(100);
+  std::generate(h_vec.begin(), h_vec.end(), rand);
+
+  // transfer to device and compute sum
+  thrust::device_vector<int> d_vec = h_vec;
+  int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
+  return 0;
+}
+```
+
+Releases
+--------
+
+Thrust is distributed with the NVIDIA HPC SDK and the CUDA Toolkit in addition
+to GitHub.
+
+See the [changelog](CHANGELOG.md) for details about specific releases.
+
+| Thrust Release    | Included In                             |
+| ----------------- | --------------------------------------- |
+| 1.9.10-1          | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1 |
+| 1.9.10            | NVIDIA HPC SDK 20.5                     |
+| 1.9.9             | CUDA Toolkit 11.0                       |
+| 1.9.8-1           | NVIDIA HPC SDK 20.3                     |
+| 1.9.8             | CUDA Toolkit 11.0 Early Access          |
+| 1.9.7-1           | CUDA Toolkit 10.2 for Tegra             |
+| 1.9.7             | CUDA Toolkit 10.2                       |
+| 1.9.6-1           | NVIDIA HPC SDK 20.3                     |
+| 1.9.6             | CUDA Toolkit 10.1 Update 2              |
+| 1.9.5             | CUDA Toolkit 10.1 Update 1              |
+| 1.9.4             | CUDA Toolkit 10.1                       |
+| 1.9.3             | CUDA Toolkit 10.0                       |
+| 1.9.2             | CUDA Toolkit 9.2                        |
+| 1.9.1-2           | CUDA Toolkit 9.1                        |
+| 1.9.0-5           | CUDA Toolkit 9.0                        |
+| 1.8.3             | CUDA Toolkit 8.0                        |
+| 1.8.2             | CUDA Toolkit 7.5                        |
+| 1.8.1             | CUDA Toolkit 7.0                        |
+| 1.8.0             |                                         |
+| 1.7.2             | CUDA Toolkit 6.5                        |
+| 1.7.1             | CUDA Toolkit 6.0                        |
+| 1.7.0             | CUDA Toolkit 5.5                        |
+| 1.6.0             |                                         |
+| 1.5.3             | CUDA Toolkit 5.0                        |
+| 1.5.2             | CUDA Toolkit 4.2                        |
+| 1.5.1             | CUDA Toolkit 4.1                        |
+| 1.5.0             |                                         |
+| 1.4.0             | CUDA Toolkit 4.0                        |
+| 1.3.0             |                                         |
+| 1.2.1             |                                         |
+| 1.2.0             |                                         |
+| 1.1.1             |                                         |
+| 1.1.0             |                                         |
+| 1.0.0             |                                         |
+
+Adding Thrust To A CMake Project
+--------------------------------
+
+Since Thrust is a header library, there is no need to build or install Thrust
+to use it. The `thrust` directory contains a complete, ready-to-use Thrust
+package upon checkout.
+
+We provide CMake configuration files that make it easy to include Thrust
+from other CMake projects. See the [CMake README](thrust/cmake/README.md)
+for details.
+
+Development Process
+-------------------
+
+Thrust uses the [CMake build system](https://cmake.org/) to build unit tests,
+examples, and header tests. To build Thrust as a developer, the following
+recipe should be followed:
+
+```
+# Clone Thrust and CUB repos recursively:
+git clone --recursive https://github.com/thrust/thrust.git
+cd thrust
+
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only)
+cmake-gui  # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+By default, a serial `CPP` host system, `CUDA` accelerated device system, and
+C++14 standard are used. This can be changed in CMake. More information on
+configuring your Thrust build and creating a pull request can be found in
+[CONTRIBUTING.md](CONTRIBUTING.md).
diff --git a/thrust/cmake/AppendOptionIfAvailable.cmake b/thrust/cmake/AppendOptionIfAvailable.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..52dc12216990dedd45196bb253f2f49e5dc28254
--- /dev/null
+++ b/thrust/cmake/AppendOptionIfAvailable.cmake
@@ -0,0 +1,14 @@
+include_guard(GLOBAL)
+include(CheckCXXCompilerFlag)
+
+macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
+
+string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
+check_cxx_compiler_flag(${_FLAG} ${_VAR})
+
+if (${${_VAR}})
+  list(APPEND ${_LIST} ${_FLAG})
+endif ()
+
+endmacro ()
+
diff --git a/thrust/cmake/PrintNinjaBuildTimes.cmake b/thrust/cmake/PrintNinjaBuildTimes.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..65d243d35facfe10177d5b818b10bbfc049b6cee
--- /dev/null
+++ b/thrust/cmake/PrintNinjaBuildTimes.cmake
@@ -0,0 +1,101 @@
+## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
+## build/link times, sorted longest first.
+##
+## cmake -DLOGFILE=<.ninja_log file> \
+##       -P PrintNinjaBuildTimes.cmake
+##
+## If LOGFILE is omitted, the current directory's .ninja_log file is used.
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  set(LOGFILE ".ninja_log")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH
+    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 3)
+    set(start_ms ${CMAKE_MATCH_1})
+    set(end_ms ${CMAKE_MATCH_2})
+    set(command "${CMAKE_MATCH_3}")
+    math(EXPR runtime_ms "${end_ms} - ${start_ms}")
+
+    # Compute human readable time
+    math(EXPR days         "${runtime_ms} / (1000 * 60 * 60 * 24)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
+    math(EXPR hours        "${runtime_ms} / (1000 * 60 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
+    math(EXPR minutes      "${runtime_ms} / (1000 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${minutes} * 1000 * 60)")
+    math(EXPR seconds      "${runtime_ms} / 1000")
+    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+    pad_string_with_zeros(milliseconds 3)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${command}" key)
+    set(ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries)
+list(REVERSE entries)
+
+# Dump table:
+message(STATUS "-----------------------+----------------------------")
+message(STATUS "Time                   | Command                    ")
+message(STATUS "-----------------------+----------------------------")
+
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/thrust/cmake/ThrustAddSubdir.cmake b/thrust/cmake/ThrustAddSubdir.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..d48aa1415789f8fff6ad35b17404880c481d7b93
--- /dev/null
+++ b/thrust/cmake/ThrustAddSubdir.cmake
@@ -0,0 +1,6 @@
+find_package(Thrust REQUIRED CONFIG
+  NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+  HINTS "${CMAKE_CURRENT_LIST_DIR}/.."
+  COMPONENTS ${THRUST_REQUIRED_SYSTEMS}
+  OPTIONAL_COMPONENTS ${THRUST_OPTIONAL_SYSTEMS}
+)
diff --git a/thrust/cmake/ThrustBuildCompilerTargets.cmake b/thrust/cmake/ThrustBuildCompilerTargets.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6e84ec897b4c6d235fc8afcf50cc6e45bd225114
--- /dev/null
+++ b/thrust/cmake/ThrustBuildCompilerTargets.cmake
@@ -0,0 +1,150 @@
+#
+# This file defines the `thrust_build_compiler_targets()` function, which
+# creates the following interface targets:
+#
+# thrust.compiler_interface
+# - Interface target providing compiler-specific options needed to build
+#   Thrust's tests, examples, etc.
+#
+# thrust.promote_cudafe_warnings
+# - Interface target that adds warning promotion for NVCC cudafe invocations.
+# - Only exists to work around github issue #1174 on tbb.cuda configurations.
+# - May be combined with thrust.compiler_interface when #1174 is fully resolved.
+
+function(thrust_build_compiler_targets)
+  set(cxx_compile_definitions)
+  set(cxx_compile_options)
+
+  thrust_update_system_found_flags()
+
+  if (THRUST_TBB_FOUND)
+    # There's a ton of these in the TBB backend, even though the code is correct.
+    # TODO: silence these warnings in code instead
+    append_option_if_available("-Wno-unused-parameter" cxx_compile_options)
+  endif()
+
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # TODO Enable /Wall instead of W3
+    append_option_if_available("/W3" cxx_compile_options)
+
+    # Treat all warnings as errors:
+    append_option_if_available("/WX" cxx_compile_options)
+
+    # Disabled loss-of-data conversion warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4244" cxx_compile_options)
+    append_option_if_available("/wd4267" cxx_compile_options)
+
+    # Suppress numeric conversion-to-bool warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4800" cxx_compile_options)
+
+    # Disable warning about applying unary operator- to unsigned type.
+    append_option_if_available("/wd4146" cxx_compile_options)
+
+    # MSVC STL assumes that `allocator_traits`'s allocator will use raw pointers,
+    # and the `__DECLSPEC_ALLOCATOR` macro causes issues with thrust's universal
+    # allocators:
+    #   warning C4494: 'std::allocator_traits<_Alloc>::allocate' :
+    #      Ignoring __declspec(allocator) because the function return type is not
+    #      a pointer or reference
+    # See https://github.com/microsoft/STL/issues/696
+    append_option_if_available("/wd4494" cxx_compile_options)
+
+    # Some of the async tests require /bigobj to fit all their sections into the
+    # object files:
+    append_option_if_available("/bigobj" cxx_compile_options)
+
+    # "Oh right, this is Visual Studio."
+    list(APPEND cxx_compile_definitions "NOMINMAX")
+  else()
+    append_option_if_available("-Werror" cxx_compile_options)
+    append_option_if_available("-Wall" cxx_compile_options)
+    append_option_if_available("-Wextra" cxx_compile_options)
+    append_option_if_available("-Winit-self" cxx_compile_options)
+    append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
+    append_option_if_available("-Wcast-qual" cxx_compile_options)
+    append_option_if_available("-Wno-cast-align" cxx_compile_options)
+    append_option_if_available("-Wno-long-long" cxx_compile_options)
+    append_option_if_available("-Wno-variadic-macros" cxx_compile_options)
+    append_option_if_available("-Wno-unused-function" cxx_compile_options)
+    append_option_if_available("-Wno-unused-variable" cxx_compile_options)
+  endif()
+
+  if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.5)
+      # In GCC 4.4, the CUDA backend's kernel launch templates cause
+      # impossible-to-decipher "'<anonymous>' is used uninitialized in this
+      # function" warnings, so we disable uninitialized variable warnings.
+      append_option_if_available("-Wno-uninitialized" cxx_compile_options)
+    endif()
+
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.5)
+      # This isn't available until GCC 4.3, and misfires on TMP code until
+      # GCC 4.5.
+      append_option_if_available("-Wlogical-op" cxx_compile_options)
+    endif()
+
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
+      # GCC 7.3 complains about name mangling changes due to `noexcept`
+      # becoming part of the type system; we don't care.
+      append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
+    endif()
+  endif()
+
+  if (("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
+      ("XL" STREQUAL "${CMAKE_CXX_COMPILER_ID}"))
+    # xlC and Clang warn about unused parameters in uninstantiated templates.
+    # This causes xlC to choke on the OMP backend, which is mostly #ifdef'd out
+    # (and thus has unused parameters) when you aren't using it.
+    append_option_if_available("-Wno-unused-parameters" cxx_compile_options)
+  endif()
+
+  if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # -Wunneeded-internal-declaration misfires in the unit test framework
+    # on older versions of Clang.
+    append_option_if_available("-Wno-unneeded-internal-declaration" cxx_compile_options)
+  endif()
+
+  if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    # Today:
+    # * NVCC accepts CUDA C++ in .cu files but not .cpp files.
+    # * Feta accepts CUDA C++ in .cpp files but not .cu files.
+    # TODO: This won't be necessary in the future.
+    list(APPEND cxx_compile_options -cppsuffix=cu)
+  endif()
+
+  add_library(thrust.compiler_interface INTERFACE)
+
+  foreach (cxx_option IN LISTS cxx_compile_options)
+    target_compile_options(thrust.compiler_interface INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:Feta>>:${cxx_option}>
+      # Only use -Xcompiler with NVCC, not Feta.
+      #
+      # CMake can't split genexs, so this can't be formatted better :(
+      # This is:
+      # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${cxx_option}>
+    )
+  endforeach()
+
+  foreach (cxx_definition IN LISTS cxx_compile_definitions)
+    # Add these for both CUDA and CXX targets:
+    target_compile_definitions(thrust.compiler_interface INTERFACE
+      ${cxx_definition}
+    )
+  endforeach()
+
+  # Display warning numbers from nvcc cudafe errors:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--display_error_number>
+  )
+
+  # This is kept separate for Github issue #1174.
+  add_library(thrust.promote_cudafe_warnings INTERFACE)
+  target_compile_options(thrust.promote_cudafe_warnings INTERFACE
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--promote_warnings>
+  )
+endfunction()
diff --git a/thrust/cmake/ThrustBuildTargetList.cmake b/thrust/cmake/ThrustBuildTargetList.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..4572bf8b8cdb4527ed2709698f55c29f3de84403
--- /dev/null
+++ b/thrust/cmake/ThrustBuildTargetList.cmake
@@ -0,0 +1,283 @@
+# This file provides utilities for building and working with thrust
+# configuration targets.
+#
+# THRUST_TARGETS
+#  - Built by the calling the `thrust_build_target_list()` function.
+#  - Each item is the name of a thrust interface target that is configured for a
+#    certain combination of host/device/dialect.
+#
+# thrust_build_target_list()
+# - Creates the THRUST_TARGETS list.
+#
+# The following functions can be used to test/set metadata on a thrust target:
+#
+# thrust_get_target_property(<prop_var> <target_name> <prop>)
+#   - Checks the ${prop} target property on thrust target ${target_name}
+#     and sets the ${prop_var} variable in the caller's scope.
+#   - <prop_var> is any valid cmake identifier.
+#   - <target_name> is the name of a thrust target.
+#   - <prop> is one of the following:
+#     - HOST: The host system. Valid values: CPP, OMP, TBB.
+#     - DEVICE: The device system. Valid values: CUDA, CPP, OMP, TBB.
+#     - DIALECT: The C++ dialect. Valid values: 11, 14, 17.
+#     - PREFIX: A unique prefix that should be used to name all
+#       targets/tests/examples that use this configuration.
+#
+# thrust_get_target_properties(<target_name>)
+#   - Defines ${target_name}_${prop} in the caller's scope, for `prop` in:
+#     HOST, DEVICE, DIALECT, PREFIX. See above for details.
+#
+# thrust_clone_target_properties(<dst_target> <src_target>)
+#   - Set the HOST, DEVICE, DIALECT, PREFIX metadata on ${dst_target} to match
+#     ${src_target}. See above for details.
+#   - This *MUST* be called on any targets that link to another thrust target
+#     to ensure that dialect information is updated correctly, e.g.
+#     `thrust_clone_target_properties(${my_thrust_test} ${some_thrust_target})`
+
+define_property(TARGET PROPERTY _THRUST_HOST
+  BRIEF_DOCS "A target's host system: CPP, TBB, or OMP."
+  FULL_DOCS "A target's host system: CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DEVICE
+  BRIEF_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+  FULL_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DIALECT
+  BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17."
+  FULL_DOCS "A target's C++ dialect: 11, 14, or 17."
+)
+define_property(TARGET PROPERTY _THRUST_PREFIX
+  BRIEF_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+  FULL_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+)
+
+function(thrust_set_target_properties target_name host device dialect prefix)
+  set_target_properties(${target_name}
+    PROPERTIES
+      _THRUST_HOST ${host}
+      _THRUST_DEVICE ${device}
+      _THRUST_DIALECT ${dialect}
+      _THRUST_PREFIX ${prefix}
+  )
+
+  get_target_property(type ${target_name} TYPE)
+  if (NOT ${type} STREQUAL "INTERFACE_LIBRARY")
+    set_target_properties(${target_name}
+      PROPERTIES
+        CXX_STANDARD ${dialect}
+        CUDA_STANDARD ${dialect}
+        # Must manually request that the standards above are actually respected
+        # or else CMake will silently fail to configure the targets correctly...
+        # Note that this doesn't actually work as of CMake 3.16:
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/20953
+        # We'll leave these properties enabled in hopes that they will someday
+        # work.
+        CXX_STANDARD_REQUIRED ON
+        CUDA_STANDARD_REQUIRED ON
+        ARCHIVE_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        RUNTIME_OUTPUT_DIRECTORY "${THRUST_EXECUTABLE_OUTPUT_DIR}"
+    )
+
+    # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104
+    # is set to OLD. This suppresses the errors for good.
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      set_target_properties(${target_name}
+        PROPERTIES
+          CUDA_ARCHITECTURES OFF
+      )
+    endif()
+
+    if ("CUDA" STREQUAL "${device}" AND
+        "Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+      set_target_properties(${target_name} PROPERTIES
+        CUDA_RESOLVE_DEVICE_SYMBOLS OFF
+      )
+    endif()
+  endif()
+endfunction()
+
+# Get a thrust property from a target and store it in var_name
+# thrust_get_target_property(<var_name> <target_name> [HOST|DEVICE|DIALECT|PREFIX]
+macro(thrust_get_target_property prop_var target_name prop)
+  get_property(${prop_var} TARGET ${target_name} PROPERTY _THRUST_${prop})
+endmacro()
+
+# Defines the following string variables in the caller's scope:
+# - ${target_name}_HOST
+# - ${target_name}_DEVICE
+# - ${target_name}_DIALECT
+# - ${target_name}_PREFIX
+macro(thrust_get_target_properties target_name)
+  thrust_get_target_property(${target_name}_HOST ${target_name} HOST)
+  thrust_get_target_property(${target_name}_DEVICE ${target_name} DEVICE)
+  thrust_get_target_property(${target_name}_DIALECT ${target_name} DIALECT)
+  thrust_get_target_property(${target_name}_PREFIX ${target_name} PREFIX)
+endmacro()
+
+# Set one target's THRUST_* properties to match another target
+function(thrust_clone_target_properties dst_target src_target)
+  thrust_get_target_properties(${src_target})
+  thrust_set_target_properties(${dst_target}
+    ${${src_target}_HOST}
+    ${${src_target}_DEVICE}
+    ${${src_target}_DIALECT}
+    ${${src_target}_PREFIX}
+  )
+endfunction()
+
+# Set ${var_name} to TRUE or FALSE in the caller's scope
+function(_thrust_is_config_valid var_name host device dialect)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_${host} AND
+      THRUST_MULTICONFIG_ENABLE_SYSTEM_${device} AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} AND
+      "${host}_${device}" IN_LIST THRUST_MULTICONFIG_WORKLOAD_${THRUST_MULTICONFIG_WORKLOAD}_CONFIGS)
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_init_target_list)
+  set(THRUST_TARGETS "" CACHE INTERNAL "" FORCE)
+endfunction()
+
+function(_thrust_add_target_to_target_list target_name host device dialect prefix)
+  thrust_set_target_properties(${target_name} ${host} ${device} ${dialect} ${prefix})
+
+  target_link_libraries(${target_name} INTERFACE
+    thrust.compiler_interface
+  )
+
+  # Workaround Github issue #1174. cudafe promote TBB header warnings to
+  # errors, even when they're -isystem includes.
+  if ((NOT host STREQUAL "TBB") OR (NOT device STREQUAL "CUDA"))
+    target_link_libraries(${target_name} INTERFACE
+      thrust.promote_cudafe_warnings
+    )
+  endif()
+
+  set(THRUST_TARGETS ${THRUST_TARGETS} ${target_name} CACHE INTERNAL "" FORCE)
+
+  set(label "${host}.${device}.cpp${dialect}")
+  string(TOLOWER "${label}" label)
+  message(STATUS "Enabling Thrust configuration: ${label}")
+endfunction()
+
+function(_thrust_build_target_list_multiconfig)
+  # Find thrust and all of the required systems:
+  set(req_systems)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+    list(APPEND req_systems CUDA)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP)
+    list(APPEND req_systems CPP)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB)
+    list(APPEND req_systems TBB)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP)
+    list(APPEND req_systems OMP)
+  endif()
+
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+    COMPONENTS ${req_systems}
+  )
+
+  # This must be called after backends are loaded but
+  # before _thrust_add_target_to_target_list.
+  thrust_build_compiler_targets()
+
+  # Build THRUST_TARGETS
+  foreach(host IN LISTS THRUST_HOST_SYSTEM_OPTIONS)
+    foreach(device IN LISTS THRUST_DEVICE_SYSTEM_OPTIONS)
+      foreach(dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        _thrust_is_config_valid(config_valid ${host} ${device} ${dialect})
+        if (config_valid)
+          set(prefix "thrust.${host}.${device}.cpp${dialect}")
+          string(TOLOWER "${prefix}" prefix)
+
+          # Configure a thrust interface target for this host/device
+          set(target_name "${prefix}")
+          thrust_create_target(${target_name}
+            HOST ${host}
+            DEVICE ${device}
+            ${THRUST_TARGET_FLAGS}
+          )
+
+          # Set configuration metadata for this thrust interface target:
+          _thrust_add_target_to_target_list(${target_name}
+            ${host} ${device} ${dialect} ${prefix}
+          )
+
+          # Create a meta target for all targets in this configuration:
+          add_custom_target(${prefix}.all)
+          add_dependencies(thrust.all ${prefix}.all)
+        endif()
+      endforeach() # dialects
+    endforeach() # devices
+  endforeach() # hosts
+
+  list(LENGTH THRUST_TARGETS count)
+  message(STATUS "${count} unique thrust.host.device.dialect configurations generated")
+endfunction()
+
+function(_thrust_build_target_list_singleconfig)
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+  )
+  thrust_create_target(thrust FROM_OPTIONS ${THRUST_TARGET_FLAGS})
+  thrust_debug_target(thrust "${THRUST_VERSION}")
+
+  set(host ${THRUST_HOST_SYSTEM})
+  set(device ${THRUST_DEVICE_SYSTEM})
+  set(dialect ${THRUST_CPP_DIALECT})
+  set(prefix "thrust") # single config
+
+  # This depends on the backends loaded by thrust_create_target, and must
+  # be called before _thrust_add_target_to_target_list.
+  thrust_build_compiler_targets()
+
+  _thrust_add_target_to_target_list(thrust ${host} ${device} ${dialect} ${prefix})
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_build_target_list)
+  # Clear the list of targets:
+  _thrust_init_target_list()
+
+  # Generic config flags:
+  set(THRUST_TARGET_FLAGS)
+  macro(add_flag_option flag docstring default)
+    set(opt "THRUST_${flag}")
+    option(${opt} "${docstring}" "${default}")
+    mark_as_advanced(${opt})
+    if (${${opt}})
+      list(APPEND THRUST_TARGET_FLAGS ${flag})
+    endif()
+  endmacro()
+  add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF)
+  add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF)
+  add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF)
+  add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF)
+
+  # Top level meta-target. Makes it easier to just build thrust targets when
+  # building both CUB and Thrust. Add all project files here so IDEs will be
+  # aware of them. This will not generate build rules.
+  file(GLOB_RECURSE all_sources
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    "${Thrust_SOURCE_DIR}/thrust/*.h"
+    "${Thrust_SOURCE_DIR}/thrust/*.inl"
+  )
+  add_custom_target(thrust.all SOURCES ${all_sources})
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_build_target_list_multiconfig()
+  else()
+    _thrust_build_target_list_singleconfig()
+  endif()
+endfunction()
diff --git a/thrust/cmake/ThrustCudaConfig.cmake b/thrust/cmake/ThrustCudaConfig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..97d2ec9420166415db101dd2fe199d4776fc77e3
--- /dev/null
+++ b/thrust/cmake/ThrustCudaConfig.cmake
@@ -0,0 +1,140 @@
+enable_language(CUDA)
+
+set(THRUST_KNOWN_COMPUTE_ARCHS 35 37 50 52 53 60 61 62 70 72 75 80)
+
+# Split CUDA_FLAGS into 3 parts:
+#
+# THRUST_CUDA_FLAGS_BASE: Common CUDA flags for all targets.
+# THRUST_CUDA_FLAGS_RDC: Additional CUDA flags for targets compiled with RDC.
+# THRUST_CUDA_FLAGS_NO_RDC: Additional CUDA flags for targets compiled without RDC.
+#
+# This is necessary because CUDA SMs 5.3, 6.2, and 7.2 do not support RDC, but
+# we want to always build some targets (e.g. testing/cuda/*) with RDC.
+# We work around this by building the "always RDC" targets without support for
+# those SMs. This requires two sets of CUDA_FLAGS.
+#
+# Enabling any of those SMs along with the ENABLE_RDC options will result in a
+# configuration error.
+#
+# Because of how CMake handles the CMAKE_CUDA_FLAGS variables, every target
+# generated in a given directory will use the same value for CMAKE_CUDA_FLAGS,
+# which is determined at the end of the directory's scope. This means caution
+# should be used when trying to build different targets with different flags,
+# since they might not behave as expected. This will improve with CMake 3.18,
+# which add the DEVICE_LINK genex, fixing the issue with using per-target
+# CUDA_FLAGS: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+set(THRUST_CUDA_FLAGS_BASE "${CMAKE_CUDA_FLAGS}")
+set(THRUST_CUDA_FLAGS_RDC)
+set(THRUST_CUDA_FLAGS_NO_RDC)
+
+# Archs that don't support RDC:
+set(no_rdc_archs 53 62 72)
+
+# Find the highest arch:
+list(SORT THRUST_KNOWN_COMPUTE_ARCHS)
+list(LENGTH THRUST_KNOWN_COMPUTE_ARCHS max_idx)
+math(EXPR max_idx "${max_idx} - 1")
+list(GET THRUST_KNOWN_COMPUTE_ARCHS ${max_idx} highest_arch)
+
+set(option_init OFF)
+if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+option(THRUST_DISABLE_ARCH_BY_DEFAULT
+  "If ON, then all CUDA architectures are disabled on the initial CMake run."
+  ${option_init}
+)
+
+set(option_init ON)
+if (THRUST_DISABLE_ARCH_BY_DEFAULT)
+  set(option_init OFF)
+endif()
+
+set(num_archs_enabled 0)
+foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
+  option(THRUST_ENABLE_COMPUTE_${arch}
+    "Enable code generation for tests for sm_${arch}"
+    ${option_init}
+  )
+
+  if (NOT THRUST_ENABLE_COMPUTE_${arch})
+    continue()
+  endif()
+
+  math(EXPR num_archs_enabled "${num_archs_enabled} + 1")
+
+  if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    if (NOT ${num_archs_enabled} EQUAL 1)
+      message(FATAL_ERROR
+        "Feta does not support compilation for multiple device architectures "
+        "at once."
+      )
+    endif()
+    set(arch_flag "-gpu=cc${arch}")
+  else()
+    set(arch_flag "-gencode arch=compute_${arch},code=sm_${arch}")
+  endif()
+
+  string(APPEND COMPUTE_MESSAGE " sm_${arch}")
+  string(APPEND THRUST_CUDA_FLAGS_NO_RDC " ${arch_flag}")
+  if (NOT arch IN_LIST no_rdc_archs)
+    string(APPEND THRUST_CUDA_FLAGS_RDC " ${arch_flag}")
+  endif()
+endforeach()
+
+if (NOT "Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  option(THRUST_ENABLE_COMPUTE_FUTURE
+    "Enable code generation for tests for compute_${highest_arch}"
+    ${option_init}
+  )
+  if (THRUST_ENABLE_COMPUTE_FUTURE)
+    string(APPEND THRUST_CUDA_FLAGS_BASE
+      " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}"
+    )
+    string(APPEND COMPUTE_MESSAGE " compute_${highest_arch}")
+  endif()
+endif()
+
+message(STATUS "Thrust: Enabled CUDA architectures:${COMPUTE_MESSAGE}")
+
+# RDC is off by default in NVCC and on by default in Feta. Turning off RDC
+# isn't currently supported by Feta. So, we default to RDC off for NVCC and
+# RDC on for Feta.
+set(option_init OFF)
+if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+
+option(THRUST_ENABLE_TESTS_WITH_RDC
+  "Build all Thrust tests with RDC; tests that require RDC are not affected by this option."
+  ${option_init}
+)
+
+option(THRUST_ENABLE_EXAMPLES_WITH_RDC
+  "Build all Thrust examples with RDC; examples which require RDC are not affected by this option."
+  ${option_init}
+)
+
+# Check for RDC/SM compatibility and error/warn if necessary
+foreach (sm IN LISTS no_rdc_archs)
+  set(sm_opt THRUST_ENABLE_COMPUTE_${sm})
+  if (${sm_opt})
+    foreach (opt IN ITEMS TESTS EXAMPLES)
+      set(rdc_opt THRUST_ENABLE_${opt}_WITH_RDC)
+      if (${rdc_opt})
+        message(FATAL_ERROR
+          "${rdc_opt} is incompatible with ${sm_opt}, since sm_${sm} does not "
+          "support RDC."
+        )
+      endif()
+    endforeach()
+
+    message(NOTICE
+      "sm_${sm} does not support RDC. Targets that require RDC will be built "
+      "without support for this architecture."
+    )
+  endif()
+endforeach()
+
+# By default RDC is not used:
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
diff --git a/thrust/cmake/ThrustHeaderTesting.cmake b/thrust/cmake/ThrustHeaderTesting.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..81c6e3174e282f50a6f1ee45a8fbf3f3507baea9
--- /dev/null
+++ b/thrust/cmake/ThrustHeaderTesting.cmake
@@ -0,0 +1,119 @@
+# For every public header, build a translation unit containing `#include <header>`
+# to let the compiler try to figure out warnings in that header if it is not otherwise
+# included in tests, and also to verify if the headers are modular enough.
+# .inl files are not globbed for, because they are not supposed to be used as public
+# entrypoints.
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  string(TOLOWER "${config_host}" host_lower)
+  string(TOLOWER "${config_device}" device_lower)
+
+  # GLOB ALL THE THINGS
+  set(headers_globs thrust/*.h)
+  set(headers_exclude_systems_globs thrust/system/*/*)
+  set(headers_systems_globs
+    thrust/system/${host_lower}/*
+    thrust/system/${device_lower}/*
+  )
+  set(headers_exclude_details_globs
+    thrust/detail/*
+    thrust/*/detail/*
+    thrust/*/*/detail/*
+  )
+
+  # Get all .h files...
+  file(GLOB_RECURSE headers
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_globs}
+  )
+
+  # ...then remove all system specific headers...
+  file(GLOB_RECURSE headers_exclude_systems
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_systems_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_systems})
+
+  # ...then add all headers specific to the selected host and device systems back again...
+  file(GLOB_RECURSE headers_systems
+    RELATIVE ${Thrust_SOURCE_DIR}/thrust
+    CONFIGURE_DEPENDS
+    ${headers_systems_globs}
+  )
+  list(APPEND headers ${headers_systems})
+
+  # ...and remove all the detail headers (also removing the detail headers from the selected systems).
+  file(GLOB_RECURSE headers_exclude_details
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_details_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_details})
+
+  # List of headers that aren't implemented for all backends, but are implemented for CUDA.
+  set(partially_implemented_CUDA
+    async/copy.h
+    async/for_each.h
+    async/reduce.h
+    async/sort.h
+    async/transform.h
+    event.h
+    future.h
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for CPP.
+  set(partially_implemented_CPP
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for TBB.
+  set(partially_implemented_TBB
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for OMP.
+  set(partially_implemented_OMP
+  )
+
+  # List of all partially implemented headers.
+  set(partially_implemented
+    ${partially_implemented_CUDA}
+    ${partially_implemented_CPP}
+    ${partially_implemented_TBB}
+    ${partially_implemented_OMP}
+  )
+  list(REMOVE_DUPLICATES partially_implemented)
+
+  set(headertest_srcs)
+
+  foreach (header IN LISTS headers)
+    if ("${header}" IN_LIST partially_implemented)
+      # This header is partially implemented on _some_ backends...
+      if (NOT "${header}" IN_LIST partially_implemented_${config_device})
+        # ...but not on the selected one.
+        continue()
+      endif()
+    endif()
+
+    set(headertest_src_ext .cpp)
+    if ("CUDA" STREQUAL "${config_device}")
+      set(headertest_src_ext .cu)
+    endif()
+
+    set(headertest_src "headers/${config_prefix}/${header}${headertest_src_ext}")
+    configure_file("${Thrust_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
+
+    list(APPEND headertest_srcs "${headertest_src}")
+  endforeach()
+
+  set(headertest_target ${config_prefix}.headers)
+  add_library(${headertest_target} OBJECT ${headertest_srcs})
+  target_link_libraries(${headertest_target} PUBLIC ${thrust_target})
+  thrust_clone_target_properties(${headertest_target} ${thrust_target})
+
+  add_dependencies(${config_prefix}.all ${headertest_target})
+endforeach()
diff --git a/thrust/cmake/ThrustInstallRules.cmake b/thrust/cmake/ThrustInstallRules.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..552a716685ec8509343ea40eb05d6eed7bbf0420
--- /dev/null
+++ b/thrust/cmake/ThrustInstallRules.cmake
@@ -0,0 +1,25 @@
+# Thrust is a header library; no need to build anything before installing:
+set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
+
+install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust"
+  TYPE INCLUDE
+  FILES_MATCHING
+    PATTERN "*.h"
+    PATTERN "*.inl"
+    PATTERN "*.cmake"
+    PATTERN "*.md"
+)
+
+# Depending on how Thrust is configured, CUB's CMake scripts may or may not be
+# included, so maintain a set of CUB install rules in both projects. By default
+# CUB headers are installed alongside Thrust -- this may be disabled by turning
+# off THRUST_INSTALL_CUB_HEADERS.
+option(THRUST_INSTALL_CUB_HEADERS "Include cub headers when installing." ON)
+if (THRUST_INSTALL_CUB_HEADERS)
+  install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub"
+    TYPE INCLUDE
+    FILES_MATCHING
+      PATTERN "*.cuh"
+      PATTERN "*.cmake"
+  )
+endif()
diff --git a/thrust/cmake/ThrustMultiConfig.cmake b/thrust/cmake/ThrustMultiConfig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2b3a40284e6f9fd5515b0fe708b42a0bcc9d3bf2
--- /dev/null
+++ b/thrust/cmake/ThrustMultiConfig.cmake
@@ -0,0 +1,127 @@
+# This file defines thrust_configure_multiconfig(), which sets up and handles
+# the MultiConfig options that allow multiple host/device/dialect configurations
+# to be generated from a single thrust build.
+
+function(thrust_configure_multiconfig)
+  option(THRUST_ENABLE_MULTICONFIG "Enable multiconfig options for coverage testing." OFF)
+
+  # Dialects:
+  set(THRUST_CPP_DIALECT_OPTIONS
+    11 14 17
+    CACHE INTERNAL "C++ dialects supported by Thrust." FORCE
+  )
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    # Handle dialect options:
+    foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      set(default_value OFF)
+      if (dialect EQUAL 14) # Default to just 14 on:
+        set(default_value ON)
+      endif()
+      option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}
+        "Generate C++${dialect} build configurations."
+        ${default_value}
+      )
+    endforeach()
+
+    # Supported versions of MSVC do not distinguish between C++11 and C++14.
+    # Warn the user that they may be generating a ton of redundant targets.
+    if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
+        THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11)
+      message(WARNING
+        "Supported versions of MSVC (2017+) do not distinguish between C++11 "
+        "and C++14. The requested C++11 targets will be built with C++14."
+      )
+    endif()
+
+    # Systems:
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP "Generate build configurations that use CPP." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA "Generate build configurations that use CUDA." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP "Generate build configurations that use OpenMP." OFF)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB "Generate build configurations that use TBB." OFF)
+
+    # CMake added C++17 support for CUDA targets in 3.18:
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17 AND
+        THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+      cmake_minimum_required(VERSION 3.18)
+    endif()
+
+    # Workload:
+    # - `SMALL`: [3 configs] Minimal coverage and validation of each device system against the `CPP` host.
+    # - `MEDIUM`: [6 configs] Cheap extended coverage.
+    # - `LARGE`: [8 configs] Expensive extended coverage. Include all useful build configurations.
+    # - `FULL`: [12 configs] The complete cross product of all possible build configurations.
+    #
+    # Config   | Workloads | Value      | Expense   | Note
+    # ---------|-----------|------------|-----------|-----------------------------
+    # CPP/CUDA | F L M S   | Essential  | Expensive | Validates CUDA against CPP
+    # CPP/OMP  | F L M S   | Essential  | Cheap     | Validates OMP against CPP
+    # CPP/TBB  | F L M S   | Essential  | Cheap     | Validates TBB against CPP
+    # CPP/CPP  | F L M     | Important  | Cheap     | Tests CPP as device
+    # OMP/OMP  | F L M     | Important  | Cheap     | Tests OMP as host
+    # TBB/TBB  | F L M     | Important  | Cheap     | Tests TBB as host
+    # TBB/CUDA | F L       | Important  | Expensive | Validates TBB/CUDA interop
+    # OMP/CUDA | F L       | Important  | Expensive | Validates OMP/CUDA interop
+    # TBB/OMP  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # OMP/TBB  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # TBB/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+    # OMP/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+
+    set(THRUST_MULTICONFIG_WORKLOAD SMALL CACHE STRING
+      "Limit host/device configs: SMALL (up to 3 h/d combos per dialect), MEDIUM(6), LARGE(8), FULL(12)"
+    )
+    set_property(CACHE THRUST_MULTICONFIG_WORKLOAD PROPERTY STRINGS
+      SMALL MEDIUM LARGE FULL
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS
+      CPP_OMP CPP_TBB CPP_CUDA
+      CACHE INTERNAL "Host/device combos enabled for SMALL workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS}
+      CPP_CPP TBB_TBB OMP_OMP
+      CACHE INTERNAL "Host/device combos enabled for MEDIUM workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS}
+      OMP_CUDA TBB_CUDA
+      CACHE INTERNAL "Host/device combos enabled for LARGE workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_FULL_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS}
+      OMP_CPP TBB_CPP OMP_TBB  TBB_OMP
+      CACHE INTERNAL "Host/device combos enabled for FULL workloads." FORCE
+    )
+
+    # Hide the single config options if they exist from a previous run:
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE INTERNAL)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE INTERNAL)
+    endif()
+    if (DEFINED THRUST_CPP_DIALECT)
+      set_property(CACHE THRUST_CPP_DIALECT PROPERTY TYPE INTERNAL)
+    endif()
+
+  else() # Single config:
+    # Restore system option visibility if these cache options already exist
+    # from a previous run.
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE STRING)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING)
+    endif()
+
+    set(THRUST_CPP_DIALECT 14
+      CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}"
+    )
+    set_property(CACHE THRUST_CPP_DIALECT
+      PROPERTY STRINGS
+      ${THRUST_CPP_DIALECT_OPTIONS}
+    )
+
+    # CMake added C++17 support for CUDA targets in 3.18:
+    if (THRUST_CPP_DIALECT EQUAL 17 AND
+        THRUST_DEVICE_SYSTEM STREQUAL "CUDA")
+      cmake_minimum_required(VERSION 3.18)
+    endif()
+  endif()
+endfunction()
diff --git a/thrust/cmake/ThrustRunExample.cmake b/thrust/cmake/ThrustRunExample.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..24e9dd2bb33b20bd2e6bc1996048efe891460da7
--- /dev/null
+++ b/thrust/cmake/ThrustRunExample.cmake
@@ -0,0 +1,49 @@
+# Inputs:
+#
+# Variable             | Type     | Doc
+# ---------------------|----------|--------------------------------------
+# EXAMPLE_EXECUTABLE   | FilePath | Path to example executable
+# FILECHECK_ENABLED    | Boolean  | Run FileCheck comparison test
+# FILECHECK_EXECUTABLE | FilePath | Path to the LLVM FileCheck utility
+# REFERENCE_FILE       | FilePath | Path to the FileCheck reference file
+
+if (FILECHECK_ENABLED)
+  if (NOT EXISTS "${REFERENCE_FILE}")
+    message(FATAL_ERROR
+      "FileCheck requested for '${EXAMPLE_EXECUTABLE}', but reference file "
+      "does not exist at '${REFERENCE_FILE}`."
+    )
+  endif()
+
+  # If the reference file is empty, validate that the example doesn't
+  # produce any output.
+  file(SIZE "${REFERENCE_FILE}" file_size)
+  message("${REFERENCE_FILE}: ${file_size} bytes")
+
+  if (file_size EQUAL 0)
+    set(check_empty_output TRUE)
+    set(filecheck_command)
+  else()
+    set(check_empty_output FALSE)
+    set(filecheck_command COMMAND "${FILECHECK_EXECUTABLE}" "${REFERENCE_FILE}")
+  endif()
+endif()
+
+execute_process(
+  COMMAND "${EXAMPLE_EXECUTABLE}"
+  ${filecheck_command}
+  RESULT_VARIABLE exit_code
+  OUTPUT_VARIABLE stdout
+  ERROR_VARIABLE stderr
+)
+
+if (NOT 0 EQUAL exit_code)
+  message(FATAL_ERROR "${EXAMPLE_EXECUTABLE} failed (${exit_code}):\n${stderr}")
+endif()
+
+if (check_empty_output)
+  string(LENGTH "${stdout}" stdout_size)
+  if (NOT stdout_size EQUAL 0)
+    message(FATAL_ERROR "${EXAMPLE_EXECUTABLE}: output received, but not expected:\n${stdout}")
+  endif()
+endif()
diff --git a/thrust/cmake/ThrustRunTest.cmake b/thrust/cmake/ThrustRunTest.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0d03129f0160c7918126d3cda7fccf66d2cc43d2
--- /dev/null
+++ b/thrust/cmake/ThrustRunTest.cmake
@@ -0,0 +1,8 @@
+execute_process(
+  COMMAND "${THRUST_BINARY}"
+  RESULT_VARIABLE EXIT_CODE
+)
+
+if (NOT "0" STREQUAL "${EXIT_CODE}")
+    message(FATAL_ERROR "${THRUST_BINARY} failed (${EXIT_CODE})")
+endif ()
diff --git a/thrust/cmake/ThrustUtilities.cmake b/thrust/cmake/ThrustUtilities.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e8fa9be1046554c5b4ed9309edf472dc3d023f4c
--- /dev/null
+++ b/thrust/cmake/ThrustUtilities.cmake
@@ -0,0 +1,25 @@
+# Given a cu_file (e.g. foo/bar.cu) relative to CMAKE_CURRENT_SOURCE_DIR
+# and a thrust_target, create a cpp file that includes the .cu file, and set
+# ${cpp_file_var} in the parent scope to the full path of the new file. The new
+# file will be generated in:
+# ${CMAKE_CURRENT_BINARY_DIR}/<thrust_target_prefix>/${cu_file}.cpp
+function(thrust_wrap_cu_in_cpp cpp_file_var cu_file thrust_target)
+  thrust_get_target_property(prefix ${thrust_target} PREFIX)
+  set(wrapped_source_file "${CMAKE_CURRENT_SOURCE_DIR}/${cu_file}")
+  set(cpp_file "${CMAKE_CURRENT_BINARY_DIR}/${prefix}/${cu_file}.cpp")
+  configure_file("${Thrust_SOURCE_DIR}/cmake/wrap_source_file.cpp.in" "${cpp_file}")
+  set(${cpp_file_var} "${cpp_file}" PARENT_SCOPE)
+endfunction()
+
+# Enable RDC for a CUDA target. Encapsulates compiler hacks:
+function(thrust_enable_rdc_for_cuda_target target_name)
+  if ("Feta" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set_target_properties(${target_name} PROPERTIES
+      COMPILE_FLAGS "-gpu=rdc"
+    )
+  else()
+    set_target_properties(${target_name} PROPERTIES
+      CUDA_SEPARABLE_COMPILATION ON
+    )
+  endif()
+endfunction()
diff --git a/thrust/cmake/header_test.in b/thrust/cmake/header_test.in
new file mode 100644
index 0000000000000000000000000000000000000000..08f8b7e97b7333e8835402aec0963ced819d0640
--- /dev/null
+++ b/thrust/cmake/header_test.in
@@ -0,0 +1,4 @@
+#define THRUST_CPP11_REQUIRED_NO_ERROR
+#define THRUST_CPP14_REQUIRED_NO_ERROR
+#define THRUST_MODERN_GCC_REQUIRED_NO_ERROR
+#include <thrust/${header}>
diff --git a/thrust/cmake/sanity b/thrust/cmake/sanity
new file mode 100644
index 0000000000000000000000000000000000000000..f9db80b7f88c6bfb8c9078d9c7e3cbc99badd527
--- /dev/null
+++ b/thrust/cmake/sanity
@@ -0,0 +1 @@
+SANITY
diff --git a/thrust/cmake/wrap_source_file.cpp.in b/thrust/cmake/wrap_source_file.cpp.in
new file mode 100644
index 0000000000000000000000000000000000000000..3015238cc67143594d505c07b38fd71a92848aaf
--- /dev/null
+++ b/thrust/cmake/wrap_source_file.cpp.in
@@ -0,0 +1 @@
+#include <${wrapped_source_file}>
diff --git a/thrust/cub/agent/agent_histogram.cuh b/thrust/cub/agent/agent_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7559bf126b1c5eb6b266e394c4cf1b60ee48175f
--- /dev/null
+++ b/thrust/cub/agent/agent_histogram.cuh
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ *
+ */
+enum BlockHistogramMemoryPreference
+{
+    GMEM,
+    SMEM,
+    BLEND
+};
+
+
+/**
+ * Parameterizable tuning policy type for AgentHistogram
+ */
+template <
+    int                             _BLOCK_THREADS,                 ///< Threads per thread block
+    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
+    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
+    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
+struct AgentHistogramPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
+        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
+        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
+        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
+ */
+template <
+    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
+    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
+    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
+    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
+struct AgentHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    /// The pixel type of SampleT
+    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
+
+    /// The quad type of SampleT
+    typedef typename CubVector<SampleT, 4>::Type QuadT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
+
+        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
+        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
+        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
+
+        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
+        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
+
+        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+
+        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
+                                        AgentHistogramPolicyT::MEM_PREFERENCE :
+                                        GMEM,
+
+        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
+    };
+
+    /// Cache load modifier for reading input elements
+    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
+            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
+        WrappedSampleIteratorT;
+
+    /// Pixel input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
+        WrappedPixelIteratorT;
+
+    /// Qaud input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
+        WrappedQuadIteratorT;
+
+    /// Parameterized BlockLoad type for samples
+    typedef BlockLoad<
+            SampleT,
+            BLOCK_THREADS,
+            SAMPLES_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadSampleT;
+
+    /// Parameterized BlockLoad type for pixels
+    typedef BlockLoad<
+            PixelT,
+            BLOCK_THREADS,
+            PIXELS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadPixelT;
+
+    /// Parameterized BlockLoad type for quads
+    typedef BlockLoad<
+            QuadT,
+            BLOCK_THREADS,
+            QUADS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadQuadT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)
+
+        int tile_idx;
+
+        // Aliasable storage layout
+        union Aliasable
+        {
+            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
+            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
+            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
+
+        } aliasable;
+    };
+
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Sample input iterator (with cache modifier applied, if possible)
+    WrappedSampleIteratorT d_wrapped_samples;
+
+    /// Native pointer for input samples (possibly NULL if unavailable)
+    SampleT* d_native_samples;
+
+    /// The number of output bins for each channel
+    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// The number of privatized bins for each channel
+    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to gmem privatized histograms for each channel
+    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to final output histograms (gmem)
+    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining privatized counter indices from samples, one for each channel
+    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
+
+    //---------------------------------------------------------------------
+    // Initialize privatized bin counters
+    //---------------------------------------------------------------------
+
+    // Initialize privatized bin counters
+    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Initialize histogram bin counts to zeros
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
+            {
+                privatized_histograms[CHANNEL][privatized_bin] = 0;
+            }
+        }
+
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void InitSmemBinCounters()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        InitBinCounters(privatized_histograms);
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void InitGmemBinCounters()
+    {
+        InitBinCounters(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Update final output histograms
+    //---------------------------------------------------------------------
+
+    // Update final output histograms from privatized histograms
+    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+
+        // Apply privatized bin counts to output bin counts
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_bins = num_privatized_bins[CHANNEL];
+            for (int privatized_bin = threadIdx.x; 
+                    privatized_bin < channel_bins;  
+                    privatized_bin += BLOCK_THREADS)
+            {
+                int         output_bin  = -1;
+                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
+                bool        is_valid    = count > 0;
+
+                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+
+                if (output_bin >= 0)
+                {
+                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+                }
+
+            }
+        }
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void StoreSmemOutput()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        StoreOutput(privatized_histograms);
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void StoreGmemOutput()
+    {
+        StoreOutput(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile accumulation
+    //---------------------------------------------------------------------
+
+    // Accumulate pixels.  Specialized for RLE compression.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<true>      is_rle_compress)
+    {
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            // Bin pixels
+            int bins[PIXELS_PER_THREAD];
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            {
+                bins[PIXEL] = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+            }
+
+            CounterT accumulator = 1;
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+            {
+                if (bins[PIXEL] != bins[PIXEL + 1])
+                {
+                    if (bins[PIXEL] >= 0)
+                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
+
+                     accumulator = 0;
+                }
+                accumulator++;
+            }
+
+            // Last pixel
+            if (bins[PIXELS_PER_THREAD - 1] >= 0)
+                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
+        }
+    }
+
+
+    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<false>     is_rle_compress)
+    {
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                int bin = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
+                if (bin >= 0)
+                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
+            }
+        }
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for smem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateSmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for gmem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateGmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Tile loading
+    //---------------------------------------------------------------------
+
+    // Load full, aligned tile using pixel iterator (multi-channel)
+    template <int _NUM_ACTIVE_CHANNELS>
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples));
+    }
+
+    // Load full, aligned tile using quad iterator (single-channel)
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<1>                     num_active_channels)
+    {
+        typedef QuadT AliasedQuads[QUADS_PER_THREAD];
+
+        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped quad iterator
+        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
+            d_wrapped_quads,
+            reinterpret_cast<AliasedQuads&>(samples));
+    }
+
+    // Load full, aligned tile
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
+    }
+
+    // Load full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        // Load using sample iterator
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples));
+    }
+
+    // Load partially-full, aligned tile using the pixel iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        int valid_pixels = valid_samples / NUM_CHANNELS;
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples),
+            valid_pixels);
+    }
+
+    // Load partially-full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples),
+            valid_samples);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile processing
+    //---------------------------------------------------------------------
+
+    // Consume a tile of data samples
+    template <
+        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
+        bool IS_FULL_TILE>      // Whether the tile is full
+    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
+    {
+        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
+        bool        is_valid[PIXELS_PER_THREAD];
+
+        // Load tile
+        LoadTile(
+            block_offset,
+            valid_samples,
+            samples,
+            Int2Type<IS_FULL_TILE>(),
+            Int2Type<IS_ALIGNED>());
+
+        // Set valid flags
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+
+        // Accumulate samples
+#if CUB_PTX_ARCH >= 120
+        if (prefer_smem)
+            AccumulateSmemPixels(samples, is_valid);
+        else
+            AccumulateGmemPixels(samples, is_valid);
+#else
+        AccumulateGmemPixels(samples, is_valid);
+#endif
+
+    }
+
+
+    // Consume row tiles.  Specialized for work-stealing from queue
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<true>      is_work_stealing)
+    {
+
+        int         num_tiles                   = num_rows * tiles_per_row;
+        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
+        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
+
+        while (tile_idx < num_tiles)
+        {
+            int     row             = tile_idx / tiles_per_row;
+            int     col             = tile_idx - (row * tiles_per_row);
+            OffsetT row_offset      = row * row_stride_samples;
+            OffsetT col_offset      = (col * TILE_SAMPLES);
+            OffsetT tile_offset     = row_offset + col_offset;
+
+            if (col == tiles_per_row - 1)
+            {
+                // Consume a partially-full tile at the end of the row
+                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
+                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+            } 
+            else
+            {
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+            }
+
+            CTA_SYNC();
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
+
+            CTA_SYNC();
+
+            tile_idx = temp_storage.tile_idx;
+        }
+    }
+
+
+    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<false>     is_work_stealing)
+    {
+        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+        {
+            OffsetT row_begin   = row * row_stride_samples;
+            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
+            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
+
+            while (tile_offset < row_end)
+            {
+                OffsetT num_remaining = row_end - tile_offset;
+
+                if (num_remaining < TILE_SAMPLES)
+                {
+                    // Consume partial tile
+                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+                    break;
+                }
+
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+                tile_offset += gridDim.x * TILE_SAMPLES;
+            }
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Parameter extraction
+    //---------------------------------------------------------------------
+
+    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+    template <
+        CacheLoadModifier   _MODIFIER,
+        typename            _ValueT,
+        typename            _OffsetT>
+    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
+    {
+        return itr.ptr;
+    }
+
+    // Return a native pixel pointer (specialized for other types)
+    template <typename IteratorT>
+    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
+    {
+        return NULL;
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentHistogram(
+        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
+        SampleIteratorT     d_samples,                                          ///< Input data to reduce
+        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
+        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
+        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
+        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
+        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    :
+        temp_storage(temp_storage.Alias()),
+        d_wrapped_samples(d_samples),
+        num_output_bins(num_output_bins),
+        num_privatized_bins(num_privatized_bins),
+        d_output_histograms(d_output_histograms),
+        privatized_decode_op(privatized_decode_op),
+        output_decode_op(output_decode_op),
+        d_native_samples(NativePointer(d_wrapped_samples)),
+        prefer_smem((MEM_PREFERENCE == SMEM) ?
+            true :                              // prefer smem privatized histograms
+            (MEM_PREFERENCE == GMEM) ?
+                false :                         // prefer gmem privatized histograms
+                blockIdx.x & 1)                 // prefer blended privatized histograms
+    {
+        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+
+        // Initialize the locations of this block's privatized histograms
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+    }
+
+
+    /**
+     * Consume image
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
+        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
+        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
+        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
+
+        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel
+                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
+                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad
+
+        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
+                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
+                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
+
+        // Whether rows are aligned and can be vectorized
+        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
+            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+        else
+            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+    }
+
+
+    /**
+     * Initialize privatized bin counters.  Specialized for privatized shared-memory counters
+     */
+    __device__ __forceinline__ void InitBinCounters()
+    {
+        if (prefer_smem)
+            InitSmemBinCounters();
+        else
+            InitGmemBinCounters();
+    }
+
+
+    /**
+     * Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
+     */
+    __device__ __forceinline__ void StoreOutput()
+    {
+        if (prefer_smem)
+            StoreSmemOutput();
+        else
+            StoreGmemOutput();
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_radix_sort_downsweep.cuh b/thrust/cub/agent/agent_radix_sort_downsweep.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c861a41e886147731f02e65dc413d551f2c5b2d5
--- /dev/null
+++ b/thrust/cub/agent/agent_radix_sort_downsweep.cuh
@@ -0,0 +1,790 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
+ */
+
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Radix ranking algorithm
+ */
+enum RadixRankAlgorithm
+{
+    RADIX_RANK_BASIC,
+    RADIX_RANK_MEMOIZE,
+    RADIX_RANK_MATCH
+};
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortDownsweep
+ */
+template <
+    int                 NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                 NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename            ComputeT,                       ///< Dominant compute type
+    BlockLoadAlgorithm  _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier   _LOAD_MODIFIER,                 ///< Cache load modifier for reading keys (and values)
+    RadixRankAlgorithm  _RANK_ALGORITHM,                ///< The radix ranking algorithm to use
+    BlockScanAlgorithm  _SCAN_ALGORITHM,                ///< The block scan algorithm to use
+    int                 _RADIX_BITS,                    ///< The number of radix bits, i.e., log2(bins)
+    typename            ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentRadixSortDownsweepPolicy :
+    ScalingType
+{
+    enum
+    {
+        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
+    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
+    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+
+
+
+/**
+ * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
+ */
+template <
+    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
+    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                              ///< KeyT type
+    typename ValueT,                            ///< ValueT type
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct AgentRadixSortDownsweep
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // Appropriate unsigned-bits representation of KeyT
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
+    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
+    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
+
+    enum
+    {
+        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Input iterator wrapper type (for applying cache modifier)s
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
+
+    // Radix ranking type to use
+    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
+            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
+            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
+                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
+            >::Type
+        >::Type BlockRadixRankT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
+    };
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        UnsignedBits,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadKeysT;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadValuesT;
+
+    // Value exchange array type
+    typedef ValueT ValueExchangeT[TILE_ITEMS];
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        typename BlockLoadKeysT::TempStorage    load_keys;
+        typename BlockLoadValuesT::TempStorage  load_values;
+        typename BlockRadixRankT::TempStorage   radix_rank;
+
+        struct
+        {
+            UnsignedBits                        exchange_keys[TILE_ITEMS];
+            OffsetT                             relative_bin_offsets[RADIX_DIGITS];
+        };
+
+        Uninitialized<ValueExchangeT>           exchange_values;
+
+        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+    ValuesItr       d_values_in;
+    UnsignedBits    *d_keys_out;
+    ValueT          *d_values_out;
+
+    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+    // Whether to short-cirucit
+    int             short_circuit;
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Scatter ranked keys through shared memory, then to device-accessible memory
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         valid_items)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
+            UnsignedBits digit          = BFE(key, current_bit, num_bits);
+            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
+
+            // Un-twiddle
+            key = Traits<KeyT>::TwiddleOut(key);
+
+            if (FULL_TILE || 
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter ranked values through shared memory, then to device-accessible memory
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        ValueT      (&values)[ITEMS_PER_THREAD],
+        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int         (&ranks)[ITEMS_PER_THREAD],
+        OffsetT     valid_items)
+    {
+        CTA_SYNC();
+
+        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            exchange_values[ranks[ITEM]] = values[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
+
+            if (FULL_TILE ||
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
+            }
+        }
+    }
+
+    /**
+     * Load a tile of keys (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys, valid_items, oob_item);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
+    }
+
+
+    /**
+     * Load a tile of values (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of values (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values, valid_items);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of items (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
+    }
+
+
+    /**
+     * Load a tile of items (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
+    }
+
+
+    /**
+     * Truck along associated values
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         block_offset,
+        OffsetT         valid_items,
+        Int2Type<false> /*is_keys_only*/)
+    {
+        ValueT values[ITEMS_PER_THREAD];
+
+        CTA_SYNC();
+
+        LoadValues(
+            values,
+            block_offset,
+            valid_items,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        ScatterValues<FULL_TILE>(
+            values,
+            relative_bin_offsets,
+            ranks,
+            valid_items);
+    }
+
+
+    /**
+     * Truck along associated values (specialized for key-only sorting)
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
+        int             (&/*ranks*/)[ITEMS_PER_THREAD],
+        OffsetT         /*block_offset*/,
+        OffsetT         /*valid_items*/,
+        Int2Type<true>  /*is_keys_only*/)
+    {}
+
+
+    /**
+     * Process tile
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        OffsetT block_offset,
+        const OffsetT &valid_items = TILE_ITEMS)
+    {
+        UnsignedBits    keys[ITEMS_PER_THREAD];
+        int             ranks[ITEMS_PER_THREAD];
+        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
+
+        // Assign default (min/max) value to all keys
+        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
+
+        // Load tile of keys
+        LoadKeys(
+            keys,
+            block_offset,
+            valid_items, 
+            default_key,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        // Twiddle key bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
+        }
+
+        // Rank the twiddled keys
+        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
+            keys,
+            ranks,
+            current_bit,
+            num_bits,
+            exclusive_digit_prefix);
+
+        CTA_SYNC();
+
+        // Share exclusive digit prefix
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Store exclusive prefix
+                temp_storage.exclusive_digit_prefix[bin_idx] =
+                    exclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Get inclusive digit prefix
+        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                {
+                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
+                }
+                else
+                {
+                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Update global scatter base offsets for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_offset[track] -= exclusive_digit_prefix[track];
+                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
+                bin_offset[track] += inclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Scatter keys
+        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
+
+        // Gather/scatter values
+        GatherScatterValues<FULL_TILE>(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
+    }
+
+    //---------------------------------------------------------------------
+    // Copy shortcut
+    //---------------------------------------------------------------------
+
+    /**
+     * Copy tiles within the range of input
+     */
+    template <
+        typename InputIteratorT,
+        typename T>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  d_in,
+        T               *d_out,
+        OffsetT         block_offset,
+        OffsetT         block_end)
+    {
+        // Simply copy the input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+            block_offset += TILE_ITEMS;
+        }
+
+        // Clean up last partial tile with guarded-I/O
+        if (block_offset < block_end)
+        {
+            OffsetT valid_items = block_end - block_offset;
+
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
+        }
+    }
+
+
+    /**
+     * Copy tiles within the range of input (specialized for NullType)
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  /*d_in*/,
+        NullType        * /*d_out*/,
+        OffsetT         /*block_offset*/,
+        OffsetT         /*block_end*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
+        OffsetT         num_items,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            this->bin_offset[track] = bin_offset[track];
+
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Short circuit if the histogram has only bin counts of only zeros or problem-size
+                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         num_items,
+        OffsetT         *d_spine,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
+                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+
+                // Load my block's bin offset for my bin
+                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Distribute keys from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT   block_offset,
+        OffsetT   block_end)
+    {
+        if (short_circuit)
+        {
+            // Copy keys
+            Copy(d_keys_in, d_keys_out, block_offset, block_end);
+
+            // Copy values
+            Copy(d_values_in, d_values_out, block_offset, block_end);
+        }
+        else
+        {
+            // Process full tiles of tile_items
+            #pragma unroll 1
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ProcessTile<true>(block_offset);
+                block_offset += TILE_ITEMS;
+
+                CTA_SYNC();
+            }
+
+            // Clean up last partial tile with guarded-I/O
+            if (block_offset < block_end)
+            {
+                ProcessTile<false>(block_offset, block_end - block_offset);
+            }
+
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_radix_sort_upsweep.cuh b/thrust/cub/agent/agent_radix_sort_upsweep.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c65773f12cc01838f0813cede12372bfcda52e95
--- /dev/null
+++ b/thrust/cub/agent/agent_radix_sort_upsweep.cuh
@@ -0,0 +1,527 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../block/block_load.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
+ */
+template <
+    int                 NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                 NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename            ComputeT,                       ///< Dominant compute type
+    CacheLoadModifier   _LOAD_MODIFIER,                 ///< Cache load modifier for reading keys
+    int                 _RADIX_BITS,                    ///< The number of radix bits, i.e., log2(bins)
+    typename            ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentRadixSortUpsweepPolicy :
+    ScalingType
+{
+    enum
+    {
+        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+template <
+    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+    typename KeyT,                          ///< KeyT type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
+{
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    // Integer type for digit counters (to be packed into words of PackedCounters)
+    typedef unsigned char DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef unsigned int PackedCounter;
+
+    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+    enum
+    {
+        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
+        WARP_THREADS            = 1 << LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
+
+        BYTES_PER_COUNTER       = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
+
+        // To prevent counter overflow, we must periodically unpack and aggregate the
+        // digit counters back into registers.  Each counter lane is assigned to a
+        // warp for aggregation.
+
+        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+
+        // Unroll tiles in batches without risk of counter overflow
+        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
+    };
+
+
+    // Input iterator wrapper type (for applying cache modifier)s
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
+        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields (aggregate state bundle)
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Thread-local counters for periodically aggregating composite-counter lanes
+    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+
+
+    //---------------------------------------------------------------------
+    // Helper structure for templated iteration
+    //---------------------------------------------------------------------
+
+    // Iterate
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(
+            AgentRadixSortUpsweep       &cta,
+            UnsignedBits                keys[KEYS_PER_THREAD])
+        {
+            cta.Bucket(keys[COUNT]);
+
+            // Next
+            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+        }
+    };
+
+    // Terminate
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
+    };
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Decode a key and increment corresponding smem digit counter
+     */
+    __device__ __forceinline__ void Bucket(UnsignedBits key)
+    {
+        // Perform transform op
+        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
+
+        // Extract current digit bits
+        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+        // Get sub-counter offset
+        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
+
+        // Get row offset
+        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
+
+        // Increment counter
+        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
+    }
+
+
+    /**
+     * Reset composite counters
+     */
+    __device__ __forceinline__ void ResetDigitCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+        {
+            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
+        }
+    }
+
+
+    /**
+     * Reset the unpacked counters in each thread
+     */
+    __device__ __forceinline__ void ResetUnpackedCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            #pragma unroll
+            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+            {
+                local_counts[LANE][UNPACKED_COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Extracts and aggregates the digit counters for each counter lane
+     * owned by this warp
+     */
+    __device__ __forceinline__ void UnpackDigitCounts()
+    {
+        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid = LaneId();
+
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            const int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                #pragma unroll
+                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+                {
+                    #pragma unroll
+                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                    {
+                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+                        local_counts[LANE][UNPACKED_COUNTER] += counter;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Processes a single, full tile
+     */
+    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
+    {
+        // Tile of keys
+        UnsignedBits keys[KEYS_PER_THREAD];
+
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+        // Prevent hoisting
+        CTA_SYNC();
+
+        // Bucket tile of keys
+        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+    }
+
+
+    /**
+     * Processes a single load (may have some threads masked off)
+     */
+    __device__ __forceinline__ void ProcessPartialTile(
+        OffsetT block_offset,
+        const OffsetT &block_end)
+    {
+        // Process partial tile if necessary using single loads
+        block_offset += threadIdx.x;
+        while (block_offset < block_end)
+        {
+            // Load and bucket key
+            UnsignedBits key = d_keys_in[block_offset];
+            Bucket(key);
+            block_offset += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortUpsweep(
+        TempStorage &temp_storage,
+        const KeyT  *d_keys_in,
+        int         current_bit,
+        int         num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        current_bit(current_bit),
+        num_bits(num_bits)
+    {}
+
+
+    /**
+     * Compute radix digit histograms from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT          block_offset,
+        const OffsetT    &block_end)
+    {
+        // Reset digit counters in smem and unpacked counters in registers
+        ResetDigitCounters();
+        ResetUnpackedCounters();
+
+        // Unroll batches of full tiles
+        while (block_offset + UNROLLED_ELEMENTS <= block_end)
+        {
+            for (int i = 0; i < UNROLL_COUNT; ++i)
+            {
+                ProcessFullTile(block_offset);
+                block_offset += TILE_ITEMS;
+            }
+
+            CTA_SYNC();
+
+            // Aggregate back into local_count registers to prevent overflow
+            UnpackDigitCounts();
+
+            CTA_SYNC();
+
+            // Reset composite counters in lanes
+            ResetDigitCounters();
+        }
+
+        // Unroll single full tiles
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessFullTile(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process partial tile if necessary
+        ProcessPartialTile(
+            block_offset,
+            block_end);
+
+        CTA_SYNC();
+
+        // Aggregate back into local_count registers
+        UnpackDigitCounts();
+    }
+
+
+    /**
+     * Extract counts (saving them to the external array)
+     */
+    template <bool IS_DESCENDING>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT     *counters,
+        int         bin_stride = 1,
+        int         bin_offset = 0)
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+
+        // Whole blocks
+        #pragma unroll
+        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
+            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+            BIN_BASE += BLOCK_THREADS)
+        {
+            int bin_idx = BIN_BASE + threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+
+        // Remainder
+        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+        {
+            int bin_idx = threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+    }
+
+
+    /**
+     * Extract counts
+     */
+    template <int BINS_TRACKED_PER_THREAD>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_count[track] = 0;
+
+                #pragma unroll
+                for (int i = 0; i < WARP_THREADS; ++i)
+                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
+            }
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_reduce.cuh b/thrust/cub/agent/agent_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0f3ba75105a7f4e6afc0e285f1cb2d9bb729709c
--- /dev/null
+++ b/thrust/cub/agent/agent_reduce.cuh
@@ -0,0 +1,386 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+    int                     NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                     NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename                ComputeT,                       ///< Dominant compute type
+    int                     _VECTOR_LOAD_LENGTH,            ///< Number of items per vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,               ///< Cooperative block-wide reduction algorithm to use
+    CacheLoadModifier       _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    typename                ScalingType =  MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentReducePolicy :
+    ScalingType
+{
+    enum
+    {
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
+    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
+    typename InputIteratorT,           ///< Random-access iterator type for input
+    typename OutputIteratorT,          ///< Random-access iterator type for output
+    typename OffsetT,                  ///< Signed integer type for global offsets
+    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct AgentReduce
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    /// Vector type of InputT for data movement
+    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
+                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+    };
+
+    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+    /// Parameterized BlockReduce primitive
+    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        typename BlockReduceT::TempStorage  reduce;
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorT          d_in;               ///< Input data to reduce
+    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+
+
+    //---------------------------------------------------------------------
+    // Utility
+    //---------------------------------------------------------------------
+
+
+    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        d_in,
+        Int2Type<true>  /*can_vectorize*/)
+    {
+        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+    }
+
+    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        /*d_in*/,
+        Int2Type<false> /*can_vectorize*/)
+    {
+        return false;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorT          d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Tile consumption
+    //---------------------------------------------------------------------
+
+    /**
+     * Consume a full tile of input (non-vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        OutputT items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a full tile of input (vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+        // Fabricate a vectorized input iterator
+        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
+            reinterpret_cast<VectorT*>(d_in_unqualified));
+
+        // Load items as vector items
+        InputT input_items[ITEMS_PER_THREAD];
+        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
+        #pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+        // Convert from input type to output type
+        OutputT items[ITEMS_PER_THREAD];
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+            items[i] = input_items[i];
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a partial tile of input
+     */
+    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     valid_items,        ///< The number of valid items in the tile
+        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+            thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+            OutputT item        (d_wrapped_in[block_offset + thread_offset]);
+            thread_aggregate    = reduction_op(thread_aggregate, item);
+            thread_offset       += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    template <int CAN_VECTORIZE>
+    __device__ __forceinline__ OutputT ConsumeRange(
+        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor
+        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        OutputT thread_aggregate;
+
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+        {
+            // First tile isn't full (not all threads have valid items)
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;
+
+        // Consume subsequent full tiles of input
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+        {
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
+        }
+
+        // Consume a partially-full tile
+        if (even_share.block_offset < even_share.block_end)
+        {
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeRange(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeTiles(
+        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
+    {
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_reduce_by_key.cuh b/thrust/cub/agent/agent_reduce_by_key.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..01eded8975c27ec673cf558f6a2b4dfe5413afc1
--- /dev/null
+++ b/thrust/cub/agent/agent_reduce_by_key.cuh
@@ -0,0 +1,547 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduceByKey
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentReduceByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
+    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentReduceByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
+
+    // Tuple type for pairing keys and values
+    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+    // Guarded inequality functor
+    template <typename _EqualityOpT>
+    struct GuardedInequalityWrapper
+    {
+        _EqualityOpT     op;             ///< Wrapped equality operator
+        int             num_remaining;  ///< Items remaining
+
+        /// Constructor
+        __host__ __device__ __forceinline__
+        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
+
+        /// Boolean inequality operator, returns <tt>(a != b)</tt>
+        template <typename T>
+        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
+        {
+            if (idx < num_remaining)
+                return !op(a, b);   // In bounds
+
+            // Return true if first out-of-bounds item, false otherwise
+            return (idx == num_remaining);
+       }
+    };
+
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            KeysInputIteratorT>::Type                                                                   // Directly use the supplied input iterator type
+        WrappedKeysInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            ValuesInputIteratorT>::Type                                                                 // Directly use the supplied input iterator type
+        WrappedValuesInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                            // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            KeyOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadKeysT;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            ValueOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadValuesT;
+
+    // Parameterized BlockDiscontinuity type for keys
+    typedef BlockDiscontinuity<
+            KeyOutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityKeys;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetValuePairT,
+            BLOCK_THREADS,
+            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Key and value exchange types
+    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
+    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadKeysT::TempStorage load_keys;
+
+        // Smem needed for loading values
+        typename BlockLoadValuesT::TempStorage load_values;
+
+        // Smem needed for compacting key value pairs(allows non POD items in this union)
+        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
+    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
+    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
+    EqualityOpT                     equality_op;        ///< KeyT equality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentReduceByKey(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        KeysInputIteratorT          d_keys_in,          ///< Input keys
+        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
+        ValuesInputIteratorT        d_values_in,        ///< Input values
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(d_keys_in),
+        d_unique_out(d_unique_out),
+        d_values_in(d_values_in),
+        d_aggregates_out(d_aggregates_out),
+        d_num_runs_out(d_num_runs_out),
+        equality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Directly scatter flagged items to output offsets
+     */
+    __device__ __forceinline__ void ScatterDirect(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
+    {
+        // Scatter flagged keys and values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
+                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
+            }
+        }
+    }
+
+
+    /**
+     * 2-phase scatter flagged items to output offsets
+     *
+     * The exclusive scan causes each head flag to be paired with the previous
+     * value aggregate: the scatter offsets must be decremented for value aggregates
+     */
+    __device__ __forceinline__ void ScatterTwoPhase(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        CTA_SYNC();
+
+        // Compact and scatter pairs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
+            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
+            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
+        }
+    }
+
+
+    /**
+     * Scatter flagged items
+     */
+    __device__ __forceinline__ void Scatter(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
+        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+        {
+            ScatterTwoPhase(
+                scatter_items,
+                segment_flags,
+                segment_indices,
+                num_tile_segments,
+                num_tile_segments_prefix);
+        }
+        else
+        {
+            ScatterDirect(
+                scatter_items,
+                segment_flags,
+                segment_indices);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
+        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
+        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
+        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
+        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
+        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
+        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
+
+        // Load keys
+        if (IS_LAST_TILE)
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
+        else
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+
+        // Load tile predecessor key in first thread
+        KeyOutputT tile_predecessor;
+        if (threadIdx.x == 0)
+        {
+            tile_predecessor = (tile_idx == 0) ?
+                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
+                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
+        }
+
+        CTA_SYNC();
+
+        // Load values
+        if (IS_LAST_TILE)
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
+        else
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
+
+        CTA_SYNC();
+
+        // Initialize head-flags and shuffle up the previous keys
+        if (IS_LAST_TILE)
+        {
+            // Use custom flag operator to additionally flag the first out-of-bounds item
+            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+        else
+        {
+            InequalityWrapper<EqualityOpT> flag_op(equality_op);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+
+        // Zip values and head flags
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scan_items[ITEM].value  = values[ITEM];
+            scan_items[ITEM].key    = head_flags[ITEM];
+        }
+
+        // Perform exclusive tile scan
+        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
+        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
+        OffsetValuePairT    total_aggregate;        // The tile prefix folded with block_aggregate
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
+            num_segments_prefix     = 0;
+            total_aggregate         = block_aggregate;
+
+            // Update tile status if there are successor tiles
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+
+            block_aggregate         = prefix_op.GetBlockAggregate();
+            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
+            total_aggregate         = prefix_op.GetInclusivePrefix();
+        }
+
+        // Rezip scatter items and segment indices
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scatter_items[ITEM].key     = prev_keys[ITEM];
+            scatter_items[ITEM].value   = scan_items[ITEM].value;
+            segment_indices[ITEM]       = scan_items[ITEM].key;
+        }
+
+        // At this point, each flagged segment head has:
+        //  - The key for the previous segment
+        //  - The reduced value from the previous segment
+        //  - The segment index for the reduced value
+
+        // Scatter flagged keys and values
+        OffsetT num_tile_segments = block_aggregate.key;
+        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
+
+        // Last thread in last tile will output final count (and last pair, if necessary)
+        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
+        {
+            OffsetT num_segments = num_segments_prefix + num_tile_segments;
+
+            // If the last tile is a whole tile, output the final_value
+            if (num_remaining == TILE_ITEMS)
+            {
+                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
+                d_aggregates_out[num_segments]  = total_aggregate.value;
+                num_segments++;
+            }
+
+            // Output the total number of items selected
+            *d_num_runs_out = num_segments;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_rle.cuh b/thrust/cub/agent/agent_rle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..79697b7ec3335c49731f92db59849f648f36bfdc
--- /dev/null
+++ b/thrust/cub/agent/agent_rle.cuh
@@ -0,0 +1,837 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRle
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentRlePolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
+ */
+template <
+    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename    InputIteratorT,         ///< Random-access input iterator type for data
+    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
+    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
+    typename    EqualityOpT,            ///< T equality operator type
+    typename    OffsetT>                ///< Signed integer type for global offsets
+struct AgentRle
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    /// The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    /// Tuple type for scanning (pairs run-length and run-index)
+    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
+
+    /// Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
+        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
+        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// Whether or not to sync after loading data
+        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
+        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
+    };
+
+
+    /**
+     * Special operator that signals all out-of-bounds items are not equal to everything else,
+     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
+     * trivial.
+     */
+    template <bool LAST_TILE>
+    struct OobInequalityOp
+    {
+        OffsetT         num_remaining;
+        EqualityOpT      equality_op;
+
+        __device__ __forceinline__ OobInequalityOp(
+            OffsetT     num_remaining,
+            EqualityOpT  equality_op)
+        :
+            num_remaining(num_remaining),
+            equality_op(equality_op)
+        {}
+
+        template <typename Index>
+        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
+        {
+            if (!LAST_TILE || (idx < num_remaining))
+                return !equality_op(first, second);
+            else
+                return true;
+        }
+    };
+
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,      // Wrap the native input pointer with CacheModifiedVLengthnputIterator
+            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type for data
+    typedef BlockLoad<
+            T,
+            AgentRlePolicyT::BLOCK_THREADS,
+            AgentRlePolicyT::ITEMS_PER_THREAD,
+            AgentRlePolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockDiscontinuity type for data
+    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+    // Parameterized WarpScan type
+    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
+
+    // Reduce-length-by-run scan operator
+    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            LengthOffsetPair,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Warp exchange types
+    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
+
+    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
+
+    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
+    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
+
+    typedef LengthOffsetPair WarpAggregates[WARPS];
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        // Aliasable storage layout
+        union Aliasable
+        {
+            struct
+            {
+                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
+                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
+                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
+                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
+            };
+
+            // Smem needed for input loading
+            typename BlockLoadT::TempStorage                    load;
+
+            // Aliasable layout needed for two-phase scatter
+            union ScatterAliasable
+            {
+                unsigned long long                              align;
+                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
+
+            } scatter_aliasable;
+
+        } aliasable;
+
+        OffsetT             tile_idx;                   // Shared tile index
+        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
+        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+
+    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
+    OffsetsOutputIteratorT          d_offsets_out;      ///< Input run offsets
+    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
+
+    EqualityOpT                     equality_op;        ///< T equality operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentRle(
+        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
+        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
+        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
+        EqualityOpT                 equality_op,        ///< [in] T equality operator
+        OffsetT                     num_items)          ///< [in] Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_offsets_out(d_offsets_out),
+        d_lengths_out(d_lengths_out),
+        equality_op(equality_op),
+        scan_op(cub::Sum()),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    template <bool FIRST_TILE, bool LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT             tile_offset,
+        OffsetT             num_remaining,
+        T                   (&items)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        bool                head_flags[ITEMS_PER_THREAD];
+        bool                tail_flags[ITEMS_PER_THREAD];
+
+        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
+
+        if (FIRST_TILE && LAST_TILE)
+        {
+            // First-and-last-tile always head-flags the first item and tail-flags the last item
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, items, inequality_op);
+        }
+        else if (FIRST_TILE)
+        {
+            // First-tile always head-flags the first item
+
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, tile_successor_item, items, inequality_op);
+        }
+        else if (LAST_TILE)
+        {
+            // Last-tile always flags the last item
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
+        }
+        else
+        {
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
+        }
+
+        // Zip counts and runs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            lengths_and_num_runs[ITEM].key      = head_flags[ITEM] && (!tail_flags[ITEM]);
+            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scan of allocations
+     */
+    __device__ __forceinline__ void WarpScanAllocations(
+        LengthOffsetPair    &tile_aggregate,
+        LengthOffsetPair    &warp_aggregate,
+        LengthOffsetPair    &warp_exclusive_in_tile,
+        LengthOffsetPair    &thread_exclusive_in_warp,
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        // Perform warpscans
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        LengthOffsetPair identity;
+        identity.key = 0;
+        identity.value = 0;
+
+        LengthOffsetPair thread_inclusive;
+        LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
+        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
+            thread_aggregate,
+            thread_inclusive,
+            thread_exclusive_in_warp,
+            identity,
+            scan_op);
+
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
+
+        CTA_SYNC();
+
+        // Accumulate total selected and the warp-wide prefix
+        warp_exclusive_in_tile          = identity;
+        warp_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
+        tile_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[0];
+
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_exclusive_in_tile = tile_aggregate;
+
+            tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for scattering selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Two-phase scatter, specialized for warp time-slicing
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<true>      is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Locally compact items within the warp (first warp)
+        if (warp_id == 0)
+        {
+            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+        }
+
+        // Locally compact items within the warp (remaining warps)
+        #pragma unroll
+        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+            }
+        }
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Two-phase scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<false>     is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Unzip
+        OffsetT run_offsets[ITEMS_PER_THREAD];
+        LengthT run_lengths[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
+            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
+        }
+
+        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
+            run_offsets, thread_num_runs_exclusive_in_warp);
+
+        WARP_SYNC(0xffffffff);
+
+        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
+            run_lengths, thread_num_runs_exclusive_in_warp);
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = run_offsets[ITEM];
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Direct scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    thread_num_runs_exclusive_in_warp[ITEM];
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if (item_offset >= 1)
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OffsetT             tile_num_runs_aggregate,
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
+        {
+            // Direct scatter if the warp has any items
+            if (warp_num_runs_aggregate)
+            {
+                ScatterDirect<FIRST_TILE>(
+                    tile_num_runs_exclusive_in_global,
+                    warp_num_runs_aggregate,
+                    warp_num_runs_exclusive_in_tile,
+                    thread_num_runs_exclusive_in_warp,
+                    lengths_and_offsets);
+            }
+        }
+        else
+        {
+            // Scatter two phase
+            ScatterTwoPhase<FIRST_TILE>(
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets,
+                Int2Type<STORE_WARP_TIME_SLICING>());
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <
+        bool                LAST_TILE>
+    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
+        OffsetT             num_items,          ///< Total number of global input items
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT      &tile_status)       ///< Global list of tile status
+    {
+        if (tile_idx == 0)
+        {
+            // First tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<true, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // Update tile status if this is not the last tile
+            if (!LAST_TILE && (threadIdx.x == 0))
+                tile_status.SetInclusive(0, tile_aggregate);
+
+            // Update thread_exclusive_in_warp to fold in warp run-length
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
+
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+
+            // Downsweep scan through lengths_and_num_runs
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = 0;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<true>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return tile_aggregate;
+        }
+        else
+        {
+            // Not first tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<false, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // First warp computes tile prefix in lane 0
+            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
+            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+            if (warp_id == 0)
+            {
+                prefix_op(tile_aggregate);
+                if (threadIdx.x == 0)
+                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
+            }
+
+            CTA_SYNC();
+
+            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
+
+            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
+            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += thread_exclusive.value;
+
+            // Downsweep scan through lengths_and_num_runs
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<false>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return prefix_op.inclusive_prefix;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumRunsIteratorT>            ///< Output iterator type for recording number of items selected
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_tiles,              ///< Total number of input tiles
+        ScanTileStateT&     tile_status,            ///< Global list of tile status
+        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selected
+                *d_num_runs_out = running_total.key;
+
+                // The inclusive prefix contains accumulated length reduction for the last run
+                if (running_total.key > 0)
+                    d_lengths_out[running_total.key - 1] = running_total.value;
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_scan.cuh b/thrust/cub/agent/agent_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0781b3e9e9fb140dc19ea17351bafa1b36b94e7c
--- /dev/null
+++ b/thrust/cub/agent/agent_scan.cuh
@@ -0,0 +1,469 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScan
+ */
+template <
+    int                         NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                         NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename                    ComputeT,                       ///< Dominant compute type
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM,                ///< The BlockScan algorithm to use
+    typename                    ScalingType =  MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+
+struct AgentScanPolicy :
+    ScalingType
+{
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
+ */
+template <
+    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
+    typename InputIteratorT,        ///< Random-access input iterator type
+    typename OutputIteratorT,       ///< Random-access output iterator type
+    typename ScanOpT,               ///< Scan functor type
+    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+    typename OffsetT>               ///< Signed integer type for global offsets
+struct AgentScan
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OutputT> ScanTileStateT;
+
+    // Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Constants
+    enum
+    {
+        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
+        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockStore type
+    typedef BlockStore<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::STORE_ALGORITHM>
+        BlockStoreT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OutputT,
+            ScanOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef BlockScanRunningPrefixOp<
+            OutputT,
+            ScanOpT>
+        RunningPrefixCallbackOp;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
+        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
+
+        struct
+        {
+            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
+            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&               temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT       d_in;               ///< Input data
+    OutputIteratorT             d_out;              ///< Output data
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    InitValueT                  init_value;         ///< The init_value element for ScanOpT
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        OutputT             init_value,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
+        block_aggregate = scan_op(init_value, block_aggregate);
+    }
+
+
+    /**
+     * Inclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        InitValueT          /*init_value*/,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * Exclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    /**
+     * Inclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentScan(
+        TempStorage&    temp_storage,       ///< Reference to temp_storage
+        InputIteratorT  d_in,               ///< Input data
+        OutputIteratorT d_out,              ///< Output data
+        ScanOpT         scan_op,            ///< Binary scan operator
+        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        init_value(init_value)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan an sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool                        IS_FIRST_TILE,
+        bool                        IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT                     tile_offset,                ///< Tile offset
+        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Block scan
+        if (IS_FIRST_TILE)
+        {
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
+        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(scan_op);
+
+        if (range_offset + TILE_ITEMS <= range_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (range_offset + TILE_ITEMS <= range_end)
+            {
+                ConsumeTile<false, true>(range_offset, prefix_op);
+                range_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (range_offset < range_end)
+            {
+                int valid_items = range_end - range_offset;
+                ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = range_end - range_offset;
+            ConsumeTile<true, false>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
+        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (range_offset + TILE_ITEMS <= range_end)
+        {
+            ConsumeTile<true, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (range_offset < range_end)
+        {
+            int valid_items = range_end - range_offset;
+            ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_segment_fixup.cuh b/thrust/cub/agent/agent_segment_fixup.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9cd524aa21c9a893de5ce561bfa57ab54d04bc51
--- /dev/null
+++ b/thrust/cub/agent/agent_segment_fixup.cuh
@@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSegmentFixup
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSegmentFixupPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentSegmentFixup
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of key-value input iterator
+    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
+
+    // Value type
+    typedef typename KeyValuePairT::Value ValueT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Whether or not do fixup using RLE + global atomics
+        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
+                                (Equals<ValueT, float>::VALUE || 
+                                 Equals<ValueT, int>::VALUE ||
+                                 Equals<ValueT, unsigned int>::VALUE ||
+                                 Equals<ValueT, unsigned long long>::VALUE),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
+        WrappedPairsInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for pairs
+    typedef BlockLoad<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
+        BlockLoadPairs;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            KeyValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadPairs::TempStorage load_pairs;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSegmentFixup(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        PairsInputIteratorT         d_pairs_in,          ///< Input keys
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_pairs_in(d_pairs_in),
+        d_aggregates_out(d_aggregates_out),
+        d_fixup_in(d_aggregates_out),
+        inequality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process input tile.  Specialized for atomic-fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        // RLE 
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
+            if (pairs[ITEM].key != pairs[ITEM - 1].key)
+                atomicAdd(d_scatter, pairs[ITEM - 1].value);
+            else
+                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
+        }
+
+        // Flush last item if valid
+        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
+        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
+            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
+    }
+
+
+    /**
+     * Process input tile.  Specialized for reduce-by-key fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        CTA_SYNC();
+
+        KeyValuePairT tile_aggregate;
+        if (tile_idx == 0)
+        {
+            // Exclusive scan of values and segment_flags
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
+
+            // Update tile status if this is not the last tile
+            if (threadIdx.x == 0)
+            {
+                // Set first segment id to not trigger a flush (invalid from exclusive scan)
+                scatter_pairs[0].key = pairs[0].key;
+
+                if (!IS_LAST_TILE)
+                    tile_state.SetInclusive(0, tile_aggregate);
+
+            }
+        }
+        else
+        {
+            // Exclusive scan of values and segment_flags
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
+            tile_aggregate = prefix_op.GetBlockAggregate();
+        }
+
+        // Scatter updated values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
+            {
+                // Update the value at the key location
+                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
+                value           = reduction_op(value, scatter_pairs[ITEM].value);
+
+                d_aggregates_out[scatter_pairs[ITEM].key] = value;
+            }
+        }
+
+        // Finalize the last item
+        if (IS_LAST_TILE)
+        {
+            // Last thread will output final count and last item, if necessary
+            if (threadIdx.x == BLOCK_THREADS - 1)
+            {
+                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+                if (num_remaining == TILE_ITEMS)
+                {
+                    // Update the value at the key location
+                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
+                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        int                 num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_select_if.cuh b/thrust/cub/agent/agent_select_if.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e9568f3b00f8e693a6927355ecdc25212296b017
--- /dev/null
+++ b/thrust/cub/agent/agent_select_if.cuh
@@ -0,0 +1,703 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSelectIf
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSelectIfPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
+    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct AgentSelectIf
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        USE_SELECT_OP,
+        USE_SELECT_FLAGS,
+        USE_DISCONTINUITY,
+
+        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
+
+        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
+                                    USE_SELECT_OP :
+                                    (!Equals<FlagT, NullType>::VALUE) ?
+                                        USE_SELECT_FLAGS :
+                                        USE_DISCONTINUITY
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,        // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
+        WrappedFlagsInputIteratorT;
+
+    // Parameterized BlockLoad type for input data
+    typedef BlockLoad<
+            OutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockLoad type for flags
+    typedef BlockLoad<
+            FlagT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadFlags;
+
+    // Parameterized BlockDiscontinuity type for items
+    typedef BlockDiscontinuity<
+            OutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetT,
+            BLOCK_THREADS,
+            AgentSelectIfPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetT,
+            cub::Sum,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Item exchange type
+    typedef OutputT ItemExchangeT[TILE_ITEMS];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading items
+        typename BlockLoadT::TempStorage load_items;
+
+        // Smem needed for loading values
+        typename BlockLoadFlags::TempStorage load_flags;
+
+        // Smem needed for compacting items (allows non POD items in this union)
+        Uninitialized<ItemExchangeT> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT           d_in;               ///< Input items
+    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
+    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
+    SelectOpT                       select_op;          ///< Selection operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSelectIf(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        InputIteratorT              d_in,               ///< Input data
+        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,     ///< Output data
+        SelectOpT                   select_op,          ///< Selection operator
+        EqualityOpT                 equality_op,        ///< Equality operator
+        OffsetT                     num_items)          ///< Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_flags_in(d_flags_in),
+        d_selected_out(d_selected_out),
+        select_op(select_op),
+        inequality_op(equality_op),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize selections (specialized for selection operator)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     /*tile_offset*/,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_OP>     /*select_method*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Out-of-bounds items are selection_flags
+            selection_flags[ITEM] = 1;
+
+            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+                selection_flags[ITEM] = select_op(items[ITEM]);
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for valid flags)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
+    {
+        CTA_SYNC();
+
+        FlagT flags[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Out-of-bounds items are selection_flags
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
+        }
+        else
+        {
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+        }
+
+        // Convert flag type to selection_flags type
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            selection_flags[ITEM] = flags[ITEM];
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for discontinuity detection)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_DISCONTINUITY> /*select_method*/)
+    {
+        if (IS_FIRST_TILE)
+        {
+            CTA_SYNC();
+
+            // Set head selection_flags.  First tile sets the first flag for the first item
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
+        }
+        else
+        {
+            OutputT tile_predecessor;
+            if (threadIdx.x == 0)
+                tile_predecessor = d_in[tile_offset - 1];
+
+            CTA_SYNC();
+
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
+        }
+
+        // Set selection flags for out-of-bounds items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Set selection_flags for out-of-bounds items
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scatter flagged items to output offsets (specialized for direct scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OutputT (&items)[ITEMS_PER_THREAD],
+        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
+        OffsetT num_selections)
+    {
+        // Scatter flagged items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (selection_flags[ITEM])
+            {
+                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
+                {
+                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
+        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        // Compact and scatter items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
+            if (selection_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
+        {
+            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
+            int local_rejection_idx     = item_idx - local_selection_idx;
+            int local_scatter_offset    = (selection_flags[ITEM]) ?
+                                            tile_num_rejections + local_selection_idx :
+                                            local_rejection_idx;
+
+            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // Gather items from shared memory and scatter to global
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
+            int rejection_idx       = item_idx;
+            int selection_idx       = item_idx - tile_num_rejections;
+            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
+                                        num_items - num_rejected_prefix - rejection_idx - 1 :
+                                        num_selections_prefix + selection_idx;
+
+            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
+
+            if (!IS_LAST_TILE || (item_idx < num_tile_items))
+            {
+                d_selected_out[scatter_offset] = item;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        OffsetT         num_selections)                             ///< Total number of selections including this tile
+    {
+        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
+        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
+        {
+            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_rejected_prefix,
+                Int2Type<KEEP_REJECTS>());
+        }
+        else
+        {
+            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_selections);
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeFirstTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<true, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();
+
+        // Exclusive scan of selection_flags
+        OffsetT num_tile_selections;
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
+
+        if (threadIdx.x == 0)
+        {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+                tile_state.SetInclusive(0, num_tile_selections);
+        }
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+            num_tile_selections -= (TILE_ITEMS - num_tile_items);
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, true>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            0,
+            0,
+            num_tile_selections);
+
+        return num_tile_selections;
+    }
+
+
+    /**
+     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<false, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();
+
+        // Exclusive scan of values and selection_flags
+        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
+
+        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
+        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
+        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
+        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+        {
+            int num_discount    = TILE_ITEMS - num_tile_items;
+            num_selections      -= num_discount;
+            num_tile_selections -= num_discount;
+        }
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, false>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            num_selections_prefix,
+            num_rejected_prefix,
+            num_selections);
+
+        return num_selections;
+    }
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeTile(
+        int                 num_tile_items,         ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OffsetT num_selections;
+        if (tile_idx == 0)
+        {
+            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
+        }
+        else
+        {
+            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
+        }
+
+        return num_selections;
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
+        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
+        }
+        else
+        {
+            // The last tile (possibly partially-full)
+            OffsetT num_remaining   = num_items - tile_offset;
+            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selection_flags
+                *d_num_selected_out = num_selections;
+            }
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/agent_spmv_orig.cuh b/thrust/cub/agent/agent_spmv_orig.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..810f893fbecf278888de11ed27266105d5ea62f0
--- /dev/null
+++ b/thrust/cub/agent/agent_spmv_orig.cuh
@@ -0,0 +1,670 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_reduce.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../config.cuh"
+#include "../thread/thread_search.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/counting_input_iterator.cuh"
+#include "../iterator/tex_ref_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSpmv
+ */
+template <
+    int                             _BLOCK_THREADS,                         ///< Threads per thread block
+    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
+    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
+    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
+    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
+    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
+    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
+    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
+    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
+struct AgentSpmvPolicy
+{
+    enum
+    {
+        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
+        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
+        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
+    };
+
+    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
+    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
+    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
+    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+template <
+    typename        ValueT,              ///< Matrix and vector value type
+    typename        OffsetT>             ///< Signed integer type for sequence offsets
+struct SpmvParams
+{
+    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+    int             num_rows;            ///< Number of rows of matrix <b>A</b>.
+    int             num_cols;            ///< Number of columns of matrix <b>A</b>.
+    int             num_nonzeros;        ///< Number of nonzero elements of matrix <b>A</b>.
+    ValueT          alpha;               ///< Alpha multiplicand
+    ValueT          beta;                ///< Beta addend-multiplicand
+
+    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;
+};
+
+
+/**
+ * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT,                    ///< Signed integer type for sequence offsets
+    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
+    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
+    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
+struct AgentSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    /// 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    /// Input iterator wrapper types (for applying cache modifiers)
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        ColumnIndicesIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        ValueIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // BlockReduce specialization
+    typedef BlockReduce<
+            ValueT,
+            BLOCK_THREADS,
+            BLOCK_REDUCE_WARP_REDUCTIONS>
+        BlockReduceT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            ValueT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockPrefixSumT;
+
+    // BlockExchange specialization
+    typedef BlockExchange<
+            ValueT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD>
+        BlockExchangeT;
+
+    /// Merge item type (either a non-zero value or a row-end offset)
+    union MergeItem
+    {
+        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
+        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
+
+        OffsetT     row_end_offset;
+        MergeValueT nonzero;
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CoordinateT tile_coords[2];
+
+        union Aliasable
+        {
+            // Smem needed for tile of merge items
+            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
+
+            // Smem needed for block exchange
+            typename BlockExchangeT::TempStorage exchange;
+
+            // Smem needed for block-wide reduction
+            typename BlockReduceT::TempStorage reduce;
+
+            // Smem needed for tile scanning
+            typename BlockScanT::TempStorage scan;
+
+            // Smem needed for tile prefix sum
+            typename BlockPrefixSumT::TempStorage prefix_sum;
+
+        } aliasable;
+    };
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+
+    _TempStorage&                   temp_storage;         /// Reference to temp_storage
+
+    SpmvParams<ValueT, OffsetT>&    spmv_params;
+
+    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    VectorValueIteratorT            wd_vector_y;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentSpmv(
+        TempStorage&                    temp_storage,           ///< Reference to temp_storage
+        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
+    :
+        temp_storage(temp_storage.Alias()),
+        spmv_params(spmv_params),
+        wd_values(spmv_params.d_values),
+        wd_row_end_offsets(spmv_params.d_row_end_offsets),
+        wd_column_indices(spmv_params.d_column_indices),
+        wd_vector_x(spmv_params.d_vector_x),
+        wd_vector_y(spmv_params.d_vector_y)
+    {}
+
+
+
+
+    /**
+     * Consume a merge tile, specialized for direct-load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+
+        ValueT          running_total = 0.0;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
+            OffsetT column_idx          = wd_column_indices[nonzero_idx];
+            ValueT  value               = wd_values[nonzero_idx];
+
+            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+            vector_value                = wd_vector_x[column_idx];
+#endif
+            ValueT  nonzero             = value * vector_value;
+
+            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
+
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                running_total += nonzero;
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = tile_num_rows;
+                ++thread_current_coord.y;
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = thread_current_coord.x;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key   = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (tile_num_rows > 0)
+        {
+            if (threadIdx.x == 0)
+                scan_item.key = -1;
+
+            // Direct scatter
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM].key < tile_num_rows)
+                {
+                    if (scan_item.key == scan_segment[ITEM].key)
+                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
+
+                    if (HAS_ALPHA)
+                    {
+                        scan_segment[ITEM].value *= spmv_params.alpha;
+                    }
+
+                    if (HAS_BETA)
+                    {
+                        // Update the output vector element
+                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
+                        scan_segment[ITEM].value += addend;
+                    }
+
+                    // Set the output vector element
+                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
+                }
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
+
+
+
+    /**
+     * Consume a merge tile, specialized for indirect load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+
+#if (CUB_PTX_ARCH >= 520)
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
+
+            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
+            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
+            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
+
+            if (nonzero_idx < tile_num_nonzeros)
+            {
+
+                OffsetT column_idx              = *ci;
+                ValueT  value                   = *a;
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+                vector_value                    = wd_vector_x[column_idx];
+
+                ValueT  nonzero                 = value * vector_value;
+
+                *s    = nonzero;
+            }
+        }
+
+
+#else
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        if (tile_num_nonzeros > 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
+                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
+
+                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
+                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+                vector_value                    = wd_vector_x[column_idx];
+#endif
+                ValueT  nonzero                 = value * vector_value;
+
+                s_tile_nonzeros[nonzero_idx]    = nonzero;
+            }
+        }
+
+#endif
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        #pragma unroll 1
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+        ValueT          running_total = 0.0;
+
+        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
+        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                scan_segment[ITEM].value    = nonzero;
+                running_total               += nonzero;
+                ++thread_current_coord.y;
+                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = 0.0;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
+            }
+
+            scan_segment[ITEM].key = thread_current_coord.x;
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (threadIdx.x == 0)
+        {
+            scan_item.key = thread_start_coord.x;
+            scan_item.value = 0.0;
+        }
+
+        if (tile_num_rows > 0)
+        {
+
+            CTA_SYNC();
+
+            // Scan downsweep and scatter
+            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
+
+            if (scan_item.key != scan_segment[0].key)
+            {
+                s_partials[scan_item.key] = scan_item.value;
+            }
+            else
+            {
+                scan_segment[0].value += scan_item.value;
+            }
+
+            #pragma unroll
+            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
+                {
+                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
+                }
+                else
+                {
+                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll 1
+            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
+            {
+                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
+
+
+    /**
+     * Consume input tile
+     */
+    __device__ __forceinline__ void ConsumeTile(
+        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
+        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+        int             num_merge_tiles)        ///< [in] Number of merge tiles
+    {
+        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+
+        if (tile_idx >= num_merge_tiles)
+            return;
+
+        // Read our starting coordinates
+        if (threadIdx.x < 2)
+        {
+            if (d_tile_coordinates == NULL)
+            {
+                // Search our starting coordinates
+                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
+                CoordinateT                     tile_coord;
+                CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+                // Search the merge path
+                MergePathSearch(
+                    diagonal,
+                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+                    nonzero_indices,
+                    spmv_params.num_rows,
+                    spmv_params.num_nonzeros,
+                    tile_coord);
+
+                temp_storage.tile_coords[threadIdx.x] = tile_coord;
+            }
+            else
+            {
+                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
+            }
+        }
+
+        CTA_SYNC();
+
+        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
+        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
+
+        // Consume multi-segment tile
+        KeyValuePairT tile_carry = ConsumeTile(
+            tile_idx,
+            tile_start_coord,
+            tile_end_coord,
+            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
+
+        // Output the tile's carry-out
+        if (threadIdx.x == 0)
+        {
+            if (HAS_ALPHA)
+                tile_carry.value *= spmv_params.alpha;
+
+            tile_carry.key += tile_start_coord.x;
+            d_tile_carry_pairs[tile_idx]    = tile_carry;
+        }
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/agent/single_pass_scan_operators.cuh b/thrust/cub/agent/single_pass_scan_operators.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..924ef2a7aca8732918526a5027bb58ea32c05c30
--- /dev/null
+++ b/thrust/cub/agent/single_pass_scan_operators.cuh
@@ -0,0 +1,814 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Callback operator types for supplying BlockScan prefixes
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Prefix functor type for maintaining a running prefix while scanning a
+ * region independent of other thread blocks
+ ******************************************************************************/
+
+/**
+ * Stateful callback operator type for supplying BlockScan prefixes.
+ * Maintains a running prefix that can be applied to consecutive
+ * BlockScan operations.
+ */
+template <
+    typename T,                 ///< BlockScan value type
+    typename ScanOpT>            ///< Wrapped scan operator type
+struct BlockScanRunningPrefixOp
+{
+    ScanOpT     op;                 ///< Wrapped scan operator
+    T           running_total;      ///< Running block-wide prefix
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
+    :
+        op(op)
+    {}
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRunningPrefixOp(
+        T starting_prefix,
+        ScanOpT op)
+    :
+        op(op),
+        running_total(starting_prefix)
+    {}
+
+    /**
+     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
+     */
+    __device__ __forceinline__ T operator()(
+        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
+    {
+        T retval = running_total;
+        running_total = op(running_total, block_aggregate);
+        return retval;
+    }
+};
+
+
+/******************************************************************************
+ * Generic tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Enumerations of tile status
+ */
+enum ScanTileStatus
+{
+    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
+    SCAN_TILE_INVALID = 99, // Not yet processed
+    SCAN_TILE_PARTIAL,      // Tile aggregate is available
+    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
+};
+
+
+/**
+ * Tile status interface.
+ */
+template <
+    typename    T,
+    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
+struct ScanTileState;
+
+
+/**
+ * Tile status interface specialized for scan status and value types
+ * that can be combined into one machine word that can be
+ * read/written coherently in a single access.
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+    // Status word type
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+
+    // Unit word type
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                uchar2>::Type>::Type>::Type TxnWord;
+
+
+    // Device word type
+    struct TileDescriptor
+    {
+        StatusWord  status;
+        T           value;
+    };
+
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+
+    // Device storage
+    TxnWord *d_tile_descriptors;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+        TxnWord val = TxnWord();
+        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+
+        status = tile_descriptor.status;
+        value = tile_descriptor.value;
+    }
+
+};
+
+
+
+/**
+ * Tile status interface specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <typename T>
+struct ScanTileState<T, false>
+{
+    // Status word type
+    typedef char StatusWord;
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Device storage
+    StatusWord  *d_tile_status;
+    T           *d_tile_partial;
+    T           *d_tile_inclusive;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_status(NULL),
+        d_tile_partial(NULL),
+        d_tile_inclusive(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,                          ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        cudaError_t error = cudaSuccess;
+        do
+        {
+            void*   allocations[3] = {};
+            size_t  allocation_sizes[3];
+
+            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
+            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
+            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
+
+            // Compute allocation pointers into the single storage blob
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Alias the offsets
+            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
+            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
+            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        // Specify storage allocation requirements
+        size_t  allocation_sizes[3];
+        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
+        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
+        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
+
+        // Set the necessary size of the blob
+        void* allocations[3] = {};
+        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        // Update tile inclusive value
+        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        // Update tile partial value
+        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        do {
+            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+
+            __threadfence();    // prevent hoisting loads from loop or loads below above this one
+
+        } while (status == SCAN_TILE_INVALID);
+
+        if (status == StatusWord(SCAN_TILE_PARTIAL)) 
+            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        else
+            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+    }
+};
+
+
+/******************************************************************************
+ * ReduceByKey tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Tile status interface for reduction by key.
+ *
+ */
+template <
+    typename    ValueT,
+    typename    KeyT,
+    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>
+struct ReduceByKeyScanTileState;
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <
+    typename    ValueT,
+    typename    KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
+    ScanTileState<KeyValuePair<KeyT, ValueT> >
+{
+    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState() : SuperClass() {}
+};
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * can be combined into one machine word that can be read/written coherently in a single access.
+ */
+template <
+    typename ValueT,
+    typename KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, true>
+{
+    typedef KeyValuePair<KeyT, ValueT>KeyValuePairT;
+
+    // Constants
+    enum
+    {
+        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
+        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
+        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
+
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Status word type
+    typedef typename If<(STATUS_WORD_SIZE == 8),
+        long long,
+        typename If<(STATUS_WORD_SIZE == 4),
+            int,
+            typename If<(STATUS_WORD_SIZE == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Status word type
+    typedef typename If<(TXN_WORD_SIZE == 16),
+        longlong2,
+        typename If<(TXN_WORD_SIZE == 8),
+            long long,
+            int>::Type>::Type TxnWord;
+
+    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
+    struct TileDescriptorBigStatus
+    {
+        KeyT        key;
+        ValueT      value;
+        StatusWord  status;
+    };
+
+    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
+    struct TileDescriptorLittleStatus
+    {
+        ValueT      value;
+        StatusWord  status;
+        KeyT        key;
+    };
+
+    // Device word type
+    typedef typename If<
+            (sizeof(ValueT) == sizeof(KeyT)),
+            TileDescriptorBigStatus,
+            TileDescriptorLittleStatus>::Type
+        TileDescriptor;
+
+
+    // Device storage
+    TxnWord *d_tile_descriptors;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
+        TxnWord         val         = TxnWord();
+        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value   = tile_inclusive.value;
+        tile_descriptor.key     = tile_inclusive.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_PARTIAL;
+        tile_descriptor.value   = tile_partial.value;
+        tile_descriptor.key     = tile_partial.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int                     tile_idx,
+        StatusWord              &status,
+        KeyValuePairT           &value)
+    {
+//        TxnWord         alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+//        TileDescriptor  tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//
+//        while (tile_descriptor.status == SCAN_TILE_INVALID)
+//        {
+//            __threadfence_block(); // prevent hoisting loads from loop
+//
+//            alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+//            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//        }
+//
+//        status      = tile_descriptor.status;
+//        value.value = tile_descriptor.value;
+//        value.key   = tile_descriptor.key;
+
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+
+        status      = tile_descriptor.status;
+        value.value = tile_descriptor.value;
+        value.key   = tile_descriptor.key;
+    }
+
+};
+
+
+/******************************************************************************
+ * Prefix call-back operator for coupling local block scan within a
+ * block-cooperative scan
+ ******************************************************************************/
+
+/**
+ * Stateful block-scan prefix functor.  Provides the the running prefix for
+ * the current tile by using the call-back warp to wait on on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename    T,
+    typename    ScanOpT,
+    typename    ScanTileStateT,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct TilePrefixCallbackOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
+
+    // Temporary storage type
+    struct _TempStorage
+    {
+        typename WarpReduceT::TempStorage   warp_reduce;
+        T                                   exclusive_prefix;
+        T                                   inclusive_prefix;
+        T                                   block_aggregate;
+    };
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileStateT::StatusWord StatusWord;
+
+    // Fields
+    _TempStorage&               temp_storage;       ///< Reference to a warp-reduction instance
+    ScanTileStateT&             tile_status;        ///< Interface to tile status
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    int                         tile_idx;           ///< The current tile index
+    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
+    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor
+    __device__ __forceinline__
+    TilePrefixCallbackOp(
+        ScanTileStateT       &tile_status,
+        TempStorage         &temp_storage,
+        ScanOpT              scan_op,
+        int                 tile_idx)
+    :
+        temp_storage(temp_storage.Alias()),
+        tile_status(tile_status),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
+    {
+        T value;
+        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
+
+        // Perform a segmented reduction to get the prefix for the current window.
+        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
+
+        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
+        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
+            value,
+            tail_flag,
+            SwizzleScanOp<ScanOpT>(scan_op));
+    }
+
+
+    // BlockScan prefix callback functor (called by the first warp)
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+
+        // Update our status with our tile-aggregate
+        if (threadIdx.x == 0)
+        {
+            temp_storage.block_aggregate = block_aggregate;
+            tile_status.SetPartial(tile_idx, block_aggregate);
+        }
+
+        int         predecessor_idx = tile_idx - threadIdx.x - 1;
+        StatusWord  predecessor_status;
+        T           window_aggregate;
+
+        // Wait for the warp-wide window of predecessor tiles to become valid
+        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+
+        // The exclusive tile prefix starts out as the current window aggregate
+        exclusive_prefix = window_aggregate;
+
+        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
+        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
+        {
+            predecessor_idx -= CUB_PTX_WARP_THREADS;
+
+            // Update exclusive tile prefix with the window prefix
+            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
+        }
+
+        // Compute the inclusive tile prefix and update the status for this tile
+        if (threadIdx.x == 0)
+        {
+            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
+            tile_status.SetInclusive(tile_idx, inclusive_prefix);
+
+            temp_storage.exclusive_prefix = exclusive_prefix;
+            temp_storage.inclusive_prefix = inclusive_prefix;
+        }
+
+        // Return exclusive_prefix
+        return exclusive_prefix;
+    }
+
+    // Get the exclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetExclusivePrefix()
+    {
+        return temp_storage.exclusive_prefix;
+    }
+
+    // Get the inclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetInclusivePrefix()
+    {
+        return temp_storage.inclusive_prefix;
+    }
+
+    // Get the block aggregate stored in temporary storage
+    __device__ __forceinline__
+    T GetBlockAggregate()
+    {
+        return temp_storage.block_aggregate;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_adjacent_difference.cuh b/thrust/cub/block/block_adjacent_difference.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c8953756db2fe3d18167352ca150a81abe08bc21
--- /dev/null
+++ b/thrust/cub/block/block_adjacent_difference.cuh
@@ -0,0 +1,596 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockAdjacentDifference
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(b, a, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
+        {
+            return flag_op(b, a);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = temp_storage.last_items[linear_tid - 1];
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/block/block_discontinuity.cuh b/thrust/cub/block/block_discontinuity.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..37b8c29925f94970f6feac2ddc055912f15ffb51
--- /dev/null
+++ b/thrust/cub/block/block_discontinuity.cuh
@@ -0,0 +1,1148 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                The data type to be flagged.
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+ *   that differ from their predecessors (or successors).  For example, head flags are convenient
+ *   for demarcating disjoint data segments as part of a segmented scan or reduction.
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockDiscontinuity}
+ * \par
+ * The code snippet below illustrates the head flagging of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *
+ *     // Allocate shared memory for BlockDiscontinuity
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute head flags for discontinuities in the segment
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+ * The corresponding output \p head_flags in those threads will be
+ * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+ *
+ * \par Performance Considerations
+ * - Incurs zero bank conflicts for most types
+ *
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockDiscontinuity
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(a, b, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
+        {
+            return flag_op(a, b);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+     * The corresponding output \p head_flags in those threads will be
+     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
+    }
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(
+     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
+     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
+     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
+     * The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head & tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = temp_storage.last_items[linear_tid - 1];
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/block/block_exchange.cuh b/thrust/cub/block/block_exchange.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fbe64afc19257b3cad22db7862341baf29eb2a13
--- /dev/null
+++ b/thrust/cub/block/block_exchange.cuh
@@ -0,0 +1,1246 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - It is commonplace for blocks of threads to rearrange data items between
+ *   threads.  For example, the device-accessible memory subsystem prefers access patterns
+ *   where data items are "striped" across threads (where consecutive threads access consecutive items),
+ *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
+ *   (where consecutive items belong to a single thread).
+ * - BlockExchange supports the following types of data exchanges:
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
+ *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockExchange}
+ * \par
+ * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+ * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *
+ *     // Allocate shared memory for BlockExchange
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *
+ *     // Load a tile of data striped across threads
+ *     int thread_data[4];
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ *
+ *     // Collectively exchange data into a blocked arrangement across threads
+ *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of striped input \p thread_data across the block of threads is
+ * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ * \par Performance Considerations
+ * - Proper device-specific padding ensures zero bank conflicts for most types.
+ *
+ */
+template <
+    typename    InputT,
+    int         BLOCK_DIM_X,
+    int         ITEMS_PER_THREAD,
+    bool        WARP_TIME_SLICING   = false,
+    int         BLOCK_DIM_Y         = 1,
+    int         BLOCK_DIM_Z         = 1,
+    int         PTX_ARCH            = CUB_PTX_ARCH>
+class BlockExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
+
+        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
+        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+
+        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+
+        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct __align__(16) _TempStorage
+    {
+        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{BlockExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+    unsigned int lane_id;
+    unsigned int warp_id;
+    unsigned int warp_offset;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        if (warp_id == 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                temp_storage.buff[item_offset] = input_items[ITEM];
+            }
+
+            WARP_SYNC(0xffffffff);
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                output_items[ITEM] = temp_storage.buff[item_offset];
+            }
+        }
+
+        #pragma unroll
+        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // No timeslicing
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        // Warp time-slicing
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Write a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_storage.buff[item_offset] = input_items[ITEM];
+                    }
+                }
+            }
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        #pragma unroll
+        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            CTA_SYNC();
+
+            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true> /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId()),
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        lane_id(LaneId()),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Structured exchanges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a striped arrangement across block threads
+     *     int thread_data[4];
+     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of striped input \p thread_data across the block of threads is
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across block threads into an ordered tile
+     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
+     * preparation for storing to device-accessible memory.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
+     *     int thread_data[4];
+     *     cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of warp-striped input \p thread_data across the block of threads is
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * after loading from device-accessible memory.  (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a warp-striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across warp threads into an ordered tile
+     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Scatter exchanges
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            if (ranks[ITEM] >= 0)
+                temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
+     */
+    template <typename OutputT, typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            if (is_valid[ITEM])
+                temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    //@}  end member group
+
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        StripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToStriped(items, items);
+    }
+
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        WarpStripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToWarpStriped(items, items);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStripedGuarded(items, items, ranks);
+    }
+
+    template <typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
+    {
+        ScatterToStriped(items, items, ranks, is_valid);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+template <
+    typename    T,
+    int         ITEMS_PER_THREAD,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        // Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        T buff[WARP_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{WarpExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    int             lane_id;
+
+public:
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpExchange(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
+        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
+            temp_storage.buff[ranks[ITEM]] = items[ITEM];
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+};
+
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_histogram.cuh b/thrust/cub/block/block_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..030209063baed0b668ddd0927b0db4785517d373
--- /dev/null
+++ b/thrust/cub/block/block_histogram.cuh
@@ -0,0 +1,414 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_histogram_sort.cuh"
+#include "specializations/block_histogram_atomic.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
+ */
+enum BlockHistogramAlgorithm
+{
+
+    /**
+     * \par Overview
+     * Sorting followed by differentiation.  Execution is comprised of two phases:
+     * -# Sort the data using efficient radix sort
+     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     */
+    BLOCK_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * Use atomic addition to update byte counts directly
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     */
+    BLOCK_HISTO_ATOMIC,
+};
+
+
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
+/**
+ * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam BINS                 The number bins within the histogram
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ * - BlockHistogram can be optionally specialized to use different algorithms:
+ *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
+ *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockHistogram}
+ * \par
+ * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ * are partitioned across 128 threads where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ *     // Allocate shared memory for BlockHistogram
+ *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ *     // Allocate shared memory for block-wide histogram bin counts
+ *     __shared__ unsigned int smem_histogram[256];
+ *
+ *     // Obtain input samples per thread
+ *     unsigned char data[4];
+ *     ...
+ *
+ *     // Compute the block-wide histogram
+ *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ *
+ * \endcode
+ *
+ * \par Performance and Usage Considerations
+ * - The histogram output can be constructed in shared or device-accessible memory
+ * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    int                     BINS,
+    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockHistogram
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
+     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
+     * regardless.
+     */
+    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
+            BLOCK_HISTO_SORT :
+            ALGORITHM;
+
+    /// Internal specialization.
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
+        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
+
+    /// Shared memory storage layout type for BlockHistogram
+    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /// \smemstorage{BlockHistogram}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Histogram operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Initialize the shared histogram counters to zero.
+     *
+     * \par Snippet
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <typename CounterT     >
+    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
+    {
+        // Initialize histogram bin counts to zeros
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+    }
+
+
+    /**
+     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+     * are partitioned across 128 threads where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Compute the block-wide histogram
+     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Histogram(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Initialize histogram bin counts to zeros
+        InitHistogram(histogram);
+
+        CTA_SYNC();
+
+        // Composite the histogram
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+
+
+    /**
+     * \brief Updates an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_load.cuh b/thrust/cub/block/block_load.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d932a2c5b53b299441a2dc319a01b242b56ad996
--- /dev/null
+++ b/thrust/cub/block/block_load.cuh
@@ -0,0 +1,1229 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for reading linear tiles of data into the CUDA thread block.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Load directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
+        {
+            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements..
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+}
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Internal implementation for load vectorization
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
+    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T      *block_ptr,                 ///< [in] Input pointer for loading from
+    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Biggest memory access word that T is a whole multiple of
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
+
+        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
+            4 :
+            (TOTAL_WORDS % 2 == 0) ?
+                2 :
+                1,
+
+        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
+    };
+
+    // Vector type
+    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
+
+    // Vector items
+    Vector vec_items[VECTORS_PER_THREAD];
+
+    // Aliased input ptr
+    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
+
+    // Load directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
+    {
+        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
+    }
+
+    // Copy
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
+    }
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
+ *
+ * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD>
+__device__ __forceinline__ void LoadDirectBlockedVectorized(
+    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T   *block_ptr,                 ///< [in] Input pointer for loading from
+    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+}
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
+        {
+            items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]);
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
+        {
+            new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]);
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+
+/** @} */       // end group UtilIo
+
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockLoad abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+enum BlockLoadAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * directly from memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number items per thread).
+     */
+    BLOCK_LOAD_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
+     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the the
+     *   access stride between threads (i.e., the number items per thread) exceeds the
+     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p InputIteratorTis not a simple pointer type
+     *   - The block input offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_LOAD_VECTORIZE,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
+     * efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     */
+    BLOCK_LOAD_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
+     * read efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly larger latencies than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     * - Provisions more shared storage, but incurs smaller latencies than the
+     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     * of data is read directly from memory and then is locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
+     * requirement, only one warp's worth of shared memory is provisioned and is
+     * subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+};
+
+
+/**
+ * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockLoad class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockLoad can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory using CUDA's built-in vectorized loads as a
+ *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ *     // Allocate shared memory for BlockLoad
+ *     __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ *     // Load a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ * The set of \p thread_data across the block of threads in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename            InputT,
+    int                 BLOCK_DIM_X,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockLoad
+{
+private:
+
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Load helper
+    template <BlockLoadAlgorithm _POLICY, int DUMMY>
+    struct LoadInternal;
+
+
+    /**
+     * BLOCK_LOAD_DIRECT specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_VECTORIZE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputT               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            const InputT         *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <
+            CacheLoadModifier   MODIFIER,
+            typename            ValueType,
+            typename            OffsetT>
+        __device__ __forceinline__ void Load(
+            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
+        template <typename _InputIteratorT>
+        __device__ __forceinline__ void Load(
+            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
+            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_TRANSPOSE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal load implementation to use
+    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalLoad::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+    /// \smemstorage{BlockLoad}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockLoad()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockLoad(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Load a linear segment of items from memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items remaining unassigned).
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items)                ///< [in] Number of valid items to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
+     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items are assigned \p -1)
+     *
+     */
+    template <typename InputIteratorT, typename DefaultT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items,                ///< [in] Number of valid items to load
+        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
+    }
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_radix_rank.cuh b/thrust/cub/block/block_radix_rank.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a98976fc2614ff6477c8e19f1a3143ba4a84ea0c
--- /dev/null
+++ b/thrust/cub/block/block_radix_rank.cuh
@@ -0,0 +1,695 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_scan.cuh"
+#include "../block/block_scan.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam RADIX_BITS           The number of radix bits per digit place
+ * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * Blah...
+ * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par Examples
+ * \par
+ * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
+ *      \code
+ *      #include <cub/cub.cuh>
+ *
+ *      template <int BLOCK_THREADS>
+ *      __global__ void ExampleKernel(...)
+ *      {
+ *
+ *      \endcode
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRank
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    // Integer type for digit counters (to be packed into words of type PackedCounters)
+    typedef unsigned short DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
+        unsigned long long,
+        unsigned int>::Type PackedCounter;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        BYTES_PER_COUNTER           = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
+        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
+
+        // The number of packed counters per thread (plus one for padding)
+        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
+        RAKING_SEGMENT              = PADDED_COUNTER_LANES,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    };
+
+private:
+
+
+    /// BlockScan type
+    typedef BlockScan<
+            PackedCounter,
+            BLOCK_DIM_X,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScan;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        union Aliasable
+        {
+            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
+
+        } aliasable;
+
+        // Storage for scanning local ranks
+        typename BlockScan::TempStorage block_scan;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /// Copy of raking segment, promoted to registers
+    PackedCounter cached_segment[RAKING_SEGMENT];
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal storage allocator
+     */
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /**
+     * Performs upsweep raking reduction, returning the aggregate
+     */
+    __device__ __forceinline__ PackedCounter Upsweep()
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
+        PackedCounter *raking_ptr;
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data into registers
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                cached_segment[i] = smem_raking_ptr[i];
+            }
+            raking_ptr = cached_segment;
+        }
+        else
+        {
+            raking_ptr = smem_raking_ptr;
+        }
+
+        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        PackedCounter raking_partial)
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
+
+        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
+            cached_segment :
+            smem_raking_ptr;
+
+        // Exclusive raking downsweep scan
+        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data back to smem
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                smem_raking_ptr[i] = cached_segment[i];
+            }
+        }
+    }
+
+
+    /**
+     * Reset shared memory digit counters
+     */
+    __device__ __forceinline__ void ResetCounters()
+    {
+        // Reset shared memory digit counters
+        #pragma unroll
+        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
+        {
+            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
+        }
+    }
+
+
+    /**
+     * Block-scan prefix callback
+     */
+    struct PrefixCallBack
+    {
+        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
+        {
+            PackedCounter block_prefix = 0;
+
+            // Propagate totals in packed fields
+            #pragma unroll
+            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
+            {
+                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
+            }
+
+            return block_prefix;
+        }
+    };
+
+
+    /**
+     * Scan shared memory digit counters.
+     */
+    __device__ __forceinline__ void ScanCounters()
+    {
+        // Upsweep scan
+        PackedCounter raking_partial = Upsweep();
+
+        // Compute exclusive sum
+        PackedCounter exclusive_partial;
+        PrefixCallBack prefix_call_back;
+        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
+
+        // Downsweep scan with exclusive partial
+        ExclusiveDownsweep(exclusive_partial);
+    }
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
+        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
+
+        // Reset shared memory digit counters
+        ResetCounters();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Get digit
+            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            // Get sub-counter
+            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
+
+            // Get counter lane
+            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
+
+            if (IS_DESCENDING)
+            {
+                sub_counter = PACKING_RATIO - 1 - sub_counter;
+                counter_lane = COUNTER_LANES - 1 - counter_lane;
+            }
+
+            // Pointer to smem digit counter
+            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
+
+            // Load thread-exclusive prefix
+            thread_prefixes[ITEM] = *digit_counters[ITEM];
+
+            // Store inclusive prefix
+            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
+        }
+
+        CTA_SYNC();
+
+        // Scan shared memory counters
+        ScanCounters();
+
+        CTA_SYNC();
+
+        // Extract the local ranks of each key
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Add in thread block exclusive prefix
+            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
+        }
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        // Rank keys
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get the inclusive and exclusive digit totals corresponding to the calling thread.
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
+                // first counter column, resulting in unavoidable bank conflicts.)
+                unsigned int counter_lane   = (bin_idx & (COUNTER_LANES - 1));
+                unsigned int sub_counter    = bin_idx >> (LOG_COUNTER_LANES);
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
+            }
+        }
+    }
+};
+
+
+
+
+
+/**
+ * Radix-rank using match.any
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRankMatch
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    typedef int32_t    RankT;
+    typedef int32_t    DigitCounterT;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
+                                    WARPS + 1 :
+                                    WARPS,
+
+        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
+        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
+                                    RAKING_SEGMENT + 1 :
+                                    RAKING_SEGMENT,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    };
+
+private:
+
+    /// BlockScan type
+    typedef BlockScan<
+            DigitCounterT,
+            BLOCK_THREADS,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScanT;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        typename BlockScanT::TempStorage            block_scan;
+
+        union __align__(16) Aliasable
+        {
+            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
+            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
+
+        } aliasable;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRankMatch(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        // Initialize shared digit counters
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
+
+        CTA_SYNC();
+
+        // Each warp will strip-mine its section of input, one strip at a time
+
+        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
+        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
+        uint32_t                lane_mask_lt    = LaneMaskLt();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // My digit
+            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            if (IS_DESCENDING)
+                digit = RADIX_DIGITS - digit - 1;
+
+            // Mask of peers who have same digit as me
+            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
+
+            // Pointer to smem digit counter for this key
+            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
+
+            // Number of occurrences in previous strips
+            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of peers having same digit as me
+            int32_t digit_count = __popc(peer_mask);
+
+            // Number of lower-ranked peers having same digit seen so far
+            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
+
+            if (peer_digit_prefix == 0)
+            {
+                // First thread for each digit updates the shared warp counter
+                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
+            }
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of prior keys having same digit
+            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
+        }
+
+        CTA_SYNC();
+
+        // Scan warp counters
+
+        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
+
+        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
+
+        CTA_SYNC();
+
+        // Seed ranks with counter values from previous warps
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+            ranks[ITEM] += *digit_counters[ITEM];
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get exclusive count for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/block/block_radix_sort.cuh b/thrust/cub/block/block_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e666902156bab48c938fb46e90cd4d0f2a8563ef
--- /dev/null
+++ b/thrust/cub/block/block_radix_sort.cuh
@@ -0,0 +1,862 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
+ */
+
+
+#pragma once
+
+#include "block_exchange.cuh"
+#include "block_radix_rank.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam KeyT                 KeyT type
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
+ * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ *   items into ascending order.  It relies upon a positional representation for
+ *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ *   characters, etc.) specified from least-significant to most-significant.  For a
+ *   given input sequence of keys and a set of rules specifying a total ordering
+ *   of the symbolic alphabet, the radix sorting method produces a lexicographic
+ *   ordering of those keys.
+ * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
+ *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ *   half-precision floating-point type. Within each key, the implementation treats fixed-length
+ *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
+ *   method can only be applied to unsigned integral types, BlockRadixSort
+ *   is able to sort signed and floating-point types via simple bit-wise transformations
+ *   that ensure lexicographic key ordering.
+ * - \rowmajor
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockRadixSort}
+ * \par
+ * The code snippet below illustrates a sort of 512 integer keys that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+ *
+ *     // Allocate shared memory for BlockRadixSort
+ *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_keys[4];
+ *     ...
+ *
+ *     // Collectively sort the keys
+ *     BlockRadixSort(temp_storage).Sort(thread_keys);
+ *
+ *     ...
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_keys across the block of threads is
+ * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+ * corresponding output \p thread_keys in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename                KeyT,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    typename                ValueT                   = NullType,
+    int                     RADIX_BITS              = 4,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixSort
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        // Whether or not there are values to be trucked along with keys
+        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // KeyT traits and unsigned bits type
+    typedef Traits<KeyT>                        KeyTraits;
+    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
+
+    /// Ascending BlockRadixRank utility type
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            false,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        AscendingBlockRadixRank;
+
+    /// Descending BlockRadixRank utility type
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            true,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        DescendingBlockRadixRank;
+
+    /// BlockExchange utility type for keys
+    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
+
+    /// BlockExchange utility type for values
+    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
+        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
+        typename BlockExchangeKeys::TempStorage        exchange_keys;
+        typename BlockExchangeValues::TempStorage      exchange_values;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+    /// Rank keys (specialized for ascending sort)
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<false> /*is_descending*/)
+    {
+        AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
+            unsigned_keys,
+            ranks,
+            begin_bit,
+            pass_bits);
+    }
+
+    /// Rank keys (specialized for descending sort)
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<true>  /*is_descending*/)
+    {
+        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
+            unsigned_keys,
+            ranks,
+            begin_bit,
+            pass_bits);
+    }
+
+    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<true>  /*is_blocked*/)
+    {
+        CTA_SYNC();
+
+        // Exchange values through shared memory in blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
+    }
+
+    /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<false> /*is_blocked*/)
+    {
+        CTA_SYNC();
+
+        // Exchange values through shared memory in blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
+    }
+
+    /// ExchangeValues (specialized for keys-only sort)
+    template <int IS_BLOCKED>
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
+        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
+        Int2Type<true>          /*is_keys_only*/,
+        Int2Type<IS_BLOCKED>    /*is_blocked*/)
+    {}
+
+    /// Sort blocked arrangement
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlocked(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Twiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;
+
+            CTA_SYNC();
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            // Quit if done
+            if (begin_bit >= end_bit) break;
+
+            CTA_SYNC();
+        }
+
+        // Untwiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Sort blocked -> striped arrangement
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Twiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;
+
+            CTA_SYNC();
+
+            // Check if this is the last pass
+            if (begin_bit >= end_bit)
+            {
+                // Last pass exchanges keys through shared memory in striped arrangement
+                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
+
+                // Last pass exchanges through shared memory in striped arrangement
+                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
+
+                // Quit
+                break;
+            }
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            CTA_SYNC();
+        }
+
+        // Untwiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// \smemstorage{BlockRadixSort}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangements)
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+    /**
+     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangement -> striped arrangement)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_block_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_raking_layout.cuh b/thrust/cub/block/block_raking_layout.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bbacdf3e02fd5123d0bb0248f61b9da639c2442b
--- /dev/null
+++ b/thrust/cub/block/block_raking_layout.cuh
@@ -0,0 +1,150 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                        The data type to be exchanged.
+ * \tparam BLOCK_THREADS            The thread block size in threads.
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// The total number of elements that need to be cooperatively reduced
+        SHARED_ELEMENTS = BLOCK_THREADS,
+
+        /// Maximum number of warp-synchronous raking threads
+        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Number of raking elements per warp-synchronous raking thread (rounded up)
+        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
+        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
+
+        /// Degree of bank conflicts (e.g., 4-way)
+        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
+            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
+            1,
+
+        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
+        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
+
+        /// Total number of elements in the raking grid
+        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
+
+        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
+        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type
+     */
+    struct __align__(16) _TempStorage
+    {
+        T buff[BlockRakingLayout::GRID_ELEMENTS];
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        // Offset for partial
+        unsigned int offset = linear_tid;
+
+        // Add in one padding element for every segment
+        if (USE_SEGMENT_PADDING > 0)
+        {
+            offset += offset / SEGMENT_LENGTH;
+        }
+
+        // Incorporating a block of padding partials every shared memory segment
+        return temp_storage.Alias().buff + offset;
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_reduce.cuh b/thrust/cub/block/block_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1bf971f0f4a832b3c6fad85219934874bc219db1
--- /dev/null
+++ b/thrust/cub/block/block_reduce.cuh
@@ -0,0 +1,607 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_raking_commutative_only.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA thread block.
+ */
+enum BlockReduceAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that only supports commutative
+     * reduction operators (true for most operations, e.g., addition).
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Threads in warps other than the first warp place
+     *    their partial reductions into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within the first
+     *    warp continue to accumulate by raking across segments of shared partial reductions
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
+     *   and is preferable when the reduction operator is commutative.  This variant
+     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators. \blocked.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs more communication than BLOCK_REDUCE_RAKING
+     *   and is only preferable when the reduction operator is non-commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators.
+     *
+     * \par
+     * Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
+     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
+     *   throughput across the GPU.  However turn-around latency may be lower and
+     *   thus useful when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_WARP_REDUCTIONS,
+};
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being reduced
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - \rowmajor
+ * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
+ *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Very efficient (only one synchronization barrier).
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (<b><em>vs.</em></b> generic reduction)
+ *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+ *     typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ *     // Allocate shared memory for BlockReduce
+ *     __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Compute the block-wide sum for thread0
+ *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
+    int                     BLOCK_DIM_Y     = 1,
+    int                     BLOCK_DIM_Z     = 1,
+    int                     PTX_ARCH        = CUB_PTX_ARCH>
+class BlockReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
+    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
+
+    /// Internal specialization type
+    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+        WarpReductions,
+        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
+            RakingCommutativeOnly,
+            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking
+
+    /// Shared memory storage layout type for BlockReduce
+    typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                      ///< [in] Calling thread's input
+        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
+    {
+        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, reduction_op);
+        return Reduce(partial, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
+        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we scan skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_items)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we scan skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+/**
+ * \example example_block_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_scan.cuh b/thrust/cub/block/block_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..513ef358bd7c85996ea3ab3f88c420f4285910f3
--- /dev/null
+++ b/thrust/cub/block/block_scan.cuh
@@ -0,0 +1,2141 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_scan_raking.cuh"
+#include "specializations/block_scan_warp_scans.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
+ */
+enum BlockScanAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_raking.png
+     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer longer turnaround latencies when the
+     *   GPU is under-occupied, it can often provide higher overall throughput
+     *   across the GPU when suitably occupied.
+     */
+    BLOCK_SCAN_RAKING,
+
+
+    /**
+     * \par Overview
+     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+     * the expense of higher register pressure.  Raking threads preserve their
+     * "upsweep" segment of values in registers while performing warp-synchronous
+     * scan, allowing the "downsweep" not to re-read them from shared memory.
+     */
+    BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer lower overall throughput across the
+     *   GPU because due to a heavy reliance on inefficient warpscans, it can
+     *   often provide lower turnaround latencies when the GPU is under-occupied.
+     */
+    BLOCK_SCAN_WARP_SCANS,
+};
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being scanned
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - \rowmajor
+ * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
+ *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Invokes a minimal number of minimal block-wide synchronization barriers (only
+ *   one or two depending on algorithm selection)
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
+ *   - \blocksize
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide exclusive prefix sum
+ *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
+    int                 BLOCK_DIM_Y     = 1,
+    int                 BLOCK_DIM_Z     = 1,
+    int                 PTX_ARCH        = CUB_PTX_ARCH>
+class BlockScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
+     * cannot be used with thread block sizes not a multiple of the
+     * architectural warp size.
+     */
+    static const BlockScanAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
+            BLOCK_SCAN_RAKING :
+            ALGORITHM;
+
+    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
+    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
+
+    /// Define the delegate type for the desired algorithm
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+        WarpScans,
+        Raking>::Type InternalBlockScan;
+
+    /// Shared memory storage layout type for BlockScan
+    typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         BlockScan(temp_storage).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
+     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         int block_aggregate;
+     *         BlockScan(temp_storage.scan).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
+     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+
+    //@}  end member group        // Exclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op,            ///< [in] Binary scan functor 
+        T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage).ExclusiveScan(
+     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group        // Inclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp            scan_op)                      ///< [in] Binary scan functor
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp            scan_op,                      ///< [in] Binary scan functor
+        T                 &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage.scan).ExclusiveScan(
+     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
+     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    //@}  end member group
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, single datum per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp            scan_op)                      ///< [in] Binary scan functor
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+        // Exclusive scan in registers with prefix
+        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
+        // Exclusive scan in registers with prefix
+        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    //@}  end member group
+#endif // DOXYGEN_SHOULD_SKIP_THIS  // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+        InclusiveScan(input, output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InclusiveScan(input, output, cub::Sum(), block_aggregate);
+    }
+
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage).InclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
+     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0]);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be
+     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_aggregate);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage.scan).IncluisveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
+     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_prefix_callback_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op);
+
+            // Inclusive scan in registers with prefix as seed (first thread does not seed)
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename         ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_aggregate);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan (with no initial value)
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
+
+            // Inclusive scan in registers with prefix as seed (first thread does not seed)
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage.scan).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
+     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+        }
+    }
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_block_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_shuffle.cuh b/thrust/cub/block/block_shuffle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ba2e9b59a0078a0f5b946fdc9bd5ba5b30d9a7b4
--- /dev/null
+++ b/thrust/cub/block/block_shuffle.cuh
@@ -0,0 +1,306 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * It is commonplace for blocks of threads to rearrange data items between
+ * threads.  The BlockShuffle abstraction allows threads to efficiently shift items
+ * either (a) up to their successor or (b) down to their predecessor.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockShuffle
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    enum
+    {
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T prev[BLOCK_THREADS];
+        T next[BLOCK_THREADS];
+    };
+
+
+public:
+
+    /// \smemstorage{BlockShuffle}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Shuffle movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Offset(
+        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
+        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
+        int distance = 1)           ///< [in] Offset distance (may be negative)
+    {
+        temp_storage[linear_tid].prev = input;
+
+        CTA_SYNC();
+
+        const int offset_tid = static_cast<int>(linear_tid) + distance;
+        if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS))
+        {
+            output = temp_storage[static_cast<size_t>(offset_tid)].prev;
+        }
+    }
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Rotate(
+        T   input,                  ///< [in] The calling thread's input item
+        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
+        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
+    {
+        temp_storage[linear_tid].prev = input;
+
+        CTA_SYNC();
+
+        unsigned int offset = threadIdx.x + distance;
+        if (offset >= BLOCK_THREADS)
+            offset -= BLOCK_THREADS;
+
+        output = temp_storage[offset].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+    {
+        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+            prev[ITEM] = input[ITEM - 1];
+
+
+        if (linear_tid > 0)
+            prev[0] = temp_storage[linear_tid - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
+    {
+        Up(input, prev);
+        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+    {
+        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+            prev[ITEM] = input[ITEM - 1];
+
+        if (linear_tid > 0)
+            prev[0] = temp_storage[linear_tid - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
+    {
+        Up(input, prev);
+        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
+    }
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/block_store.cuh b/thrust/cub/block/block_store.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..495a1553f37654e6b995c2d00062ccf5cded547d
--- /dev/null
+++ b/thrust/cub/block/block_store.cuh
@@ -0,0 +1,999 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for writing linear segments of data from the CUDA thread block
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
+    // Store directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[ITEM] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
+    // Store directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
+        {
+            thread_itr[ITEM] = items[ITEM];
+        }
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
+ * which is the default starting offset returned by \p cudaMalloc()
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ *
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreDirectBlockedVectorized(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T                   *block_ptr,                 ///< [in] Input pointer for storing from
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    enum
+    {
+        // Maximum CUDA vector size is 4 elements
+        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+        // Vector size must be a power of two and an even divisor of the items per thread
+        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+            MAX_VEC_SIZE :
+            1,
+
+        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+    };
+
+    // Vector type
+    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+
+    // Alias global pointer
+    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
+
+    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+    Vector raw_vector[VECTORS_PER_THREAD];
+    T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+    // Copy
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        raw_items[ITEM] = items[ITEM];
+    }
+
+    // Direct-store using vector types
+    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
+        {
+            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
+        {
+            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+//@}  end member group
+
+
+/** @} */       // end group UtilIo
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockStore abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
+ */
+enum BlockStoreAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+     * directly to memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number items per thread).
+     */
+    BLOCK_STORE_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
+     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
+     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the the
+     *   access stride between threads (i.e., the number items per thread) exceeds the
+     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p OutputIteratorT is not a simple pointer type
+     *   - The block output offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_STORE_VECTORIZE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     * To reduce the shared memory requirement, only one warp's worth of shared
+     * memory is provisioned and is subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+
+};
+
+
+/**
+ * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam T                    The type of data to be written.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockStore class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockStore can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+ *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     int thread_data[4];
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockStore
+{
+private:
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Store helper
+    template <BlockStoreAlgorithm _POLICY, int DUMMY>
+    struct StoreInternal;
+
+
+    /**
+     * BLOCK_STORE_DIRECT specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_VECTORIZE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
+        __device__ __forceinline__ void Store(
+            T                   *block_ptr,                 ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
+        }
+
+        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
+            int               valid_items)                  ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal load implementation to use
+    typedef StoreInternal<ALGORITHM, 0> InternalStore;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalStore::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+
+    /// \smemstorage{BlockStore}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockStore()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Store items into a linear segment of memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     int thread_data[4];
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     int thread_data[4];
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
+     * only the first two threads being unmasked to store portions of valid data.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+        int                 valid_items)                ///< [in] Number of valid items to write
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_histogram_atomic.cuh b/thrust/cub/block/specializations/block_histogram_atomic.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3be0a3dfa6814b07f99217a427b191c88f3bc738
--- /dev/null
+++ b/thrust/cub/block/specializations/block_histogram_atomic.cuh
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <int BINS>
+struct BlockHistogramAtomic
+{
+    /// Shared memory storage layout type
+    struct TempStorage {};
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramAtomic(
+        TempStorage &temp_storage)
+    {}
+
+
+    /// Composite data onto an existing histogram
+    template <
+        typename            T,
+        typename            CounterT,     
+        int                 ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Update histogram
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+              atomicAdd(histogram + items[i], 1);
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_histogram_sort.cuh b/thrust/cub/block/specializations/block_histogram_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f11735541c0c6da344531e44a2da1c6bbb6ab405
--- /dev/null
+++ b/thrust/cub/block/specializations/block_histogram_sort.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <
+    typename    T,                  ///< Sample type
+    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
+    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
+    int         BINS,               ///< The number of bins into which histogram samples may fall
+    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>           ///< The PTX compute capability for which to to specialize this collective
+struct BlockHistogramSort
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<
+            T,
+            BLOCK_DIM_X,
+            ITEMS_PER_THREAD,
+            NullType,
+            4,
+            (PTX_ARCH >= 350) ? true : false,
+            BLOCK_SCAN_WARP_SCANS,
+            cudaSharedMemBankSizeFourByte,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<
+            T,
+            BLOCK_DIM_X,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockDiscontinuityT;
+
+    /// Shared memory
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            unsigned int run_begin[BINS];
+            unsigned int run_end[BINS];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramSort(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    // Discontinuity functor
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate
+        __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    // Composite data onto an existing histogram
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT            histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+        // Sort bytes in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        CTA_SYNC();
+
+        // Initialize the shared memory's run_begin and run_end for each bin
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+
+        CTA_SYNC();
+
+        int flags[ITEMS_PER_THREAD];    // unused
+
+        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item
+        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
+
+        CTA_SYNC();
+
+        // Composite into histogram
+        histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+
+        // Finish up with guarded composition if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_reduce_raking.cuh b/thrust/cub/block/specializations/block_reduce_raking.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2a57521be319ee8b984b4df11d800464abfb9b0f
--- /dev/null
+++ b/thrust/cub/block/specializations/block_reduce_raking.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ *
+ * Supports non-commutative binary reduction operators.  Unlike commutative
+ * reduction operators (e.g., addition), the application of a non-commutative
+ * reduction operator (e.g, string concatenation) across a sequence of inputs must
+ * honor the relative ordering of items and partial reductions when applying the
+ * reduction operator.
+ *
+ * Compared to the implementation of BlockReduceRaking (which does not support
+ * non-commutative operators), this implementation requires a few extra
+ * rounds of inter-thread communication.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceRaking
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two
+        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
+
+        /// Whether or not accesses into smem are unguarded
+        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
+
+    };
+
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
+        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded thread block raking grid
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        T                           *raking_segment,
+        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<ITERATION>         /*iteration*/)
+    {
+        // Update partial if addend is in range
+        if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
+        {
+            T addend = raking_segment[ITERATION];
+            partial = reduction_op(partial, addend);
+        }
+        return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
+    }
+
+    template <bool IS_FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 /*reduction_op*/,   ///< [in] Binary scan operator
+        T                           * /*raking_segment*/,
+        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return partial;
+    }
+
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                IS_FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE>(
+                partial,
+                num_valid,
+                reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid.
+            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = raking_segment[0];
+
+                partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
+
+                int valid_raking_threads = (IS_FULL_TILE) ?
+                    RAKING_THREADS :
+                    (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;
+
+                partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED>(
+                    partial,
+                    valid_raking_threads,
+                    reduction_op);
+
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool IS_FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum reduction_op;
+
+        return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
+    }
+
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/cub/block/specializations/block_reduce_raking_commutative_only.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..78a32b82263461242f390ae8c0d0f90acfa8e8aa
--- /dev/null
+++ b/thrust/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -0,0 +1,199 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "block_reduce_raking.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceRakingCommutativeOnly
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Whether or not to use fall-back
+        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
+
+        /// Number of raking threads
+        RAKING_THREADS = WARP_THREADS,
+
+        /// Number of threads actually sharing items with the raking threads
+        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
+    };
+
+    ///  WarpReduce utility type
+    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        struct
+        {
+            typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
+            typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
+        };
+        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
+        }
+        else
+        {
+            // Place partial into shared memory grid
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
+
+                // Warpscan
+                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
+
+                // Warpscan
+                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
+            }
+        }
+
+        return partial;
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/cub/block/specializations/block_reduce_warp_reductions.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4dd3451b888abecddc24e236d85ac176d1da192c
--- /dev/null
+++ b/thrust/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -0,0 +1,217 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../warp/warp_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceWarpReductions
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// The logical warp size for warp reductions
+        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+
+        /// Whether or not the logical warp size evenly divides the thread block size
+        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
+    };
+
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Buffer for warp-synchronous scan
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
+        T                                   block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    int linear_tid;
+    int warp_id;
+    int lane_id;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceWarpReductions(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
+    {
+        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
+        {
+            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
+            warp_aggregate = reduction_op(warp_aggregate, addend);
+        }
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
+    }
+
+    template <bool FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         /*reduction_op*/,   ///< [in] Binary scan operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<WARPS>     /*successor_warp*/)
+    {
+        return warp_aggregate;
+    }
+
+
+    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         reduction_op,       ///< [in] Binary scan operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        // Share lane aggregates
+        if (lane_id == 0)
+        {
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+        }
+
+        CTA_SYNC();
+
+        // Update total aggregate in warp 0, lane 0
+        if (linear_tid == 0)
+        {
+            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
+        }
+
+        return warp_aggregate;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   input,          ///< [in] Calling thread's input partial reductions
+        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum    reduction_op;
+        int         warp_offset = (warp_id * LOGICAL_WARP_SIZE);
+        int         warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ?
+                            LOGICAL_WARP_SIZE :
+                            num_valid - warp_offset;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
+            input,
+            warp_num_valid,
+            cub::Sum());
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        int         warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        int         warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ?
+                            LOGICAL_WARP_SIZE :
+                            num_valid - warp_offset;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
+            input,
+            warp_num_valid,
+            reduction_op);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_scan_raking.cuh b/thrust/cub/block/specializations/block_scan_raking.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1d6c2f70dc98e0d86ae7075de34e42ddcad79eba
--- /dev/null
+++ b/thrust/cub/block/specializations/block_scan_raking.cuh
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+/**
+ * \file
+ * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../block/block_raking_layout.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_scan.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanRaking
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
+        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
+        T                                           block_aggregate;    ///< Block aggregate
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    T               cached_segment[SEGMENT_LENGTH];
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /// Templated reduction
+    template <int ITERATION, typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                  raking_ptr,         ///< [in] Input array
+        ScanOp              scan_op,            ///< [in] Binary reduction operator
+        T                   raking_partial,     ///< [in] Prefix to seed reduction with
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
+        {
+            T addend = raking_ptr[ITERATION];
+            raking_partial = scan_op(raking_partial, addend);
+        }
+
+        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
+    }
+
+
+    /// Templated reduction (base case)
+    template <typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                          /*raking_ptr*/,    ///< [in] Input array
+        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator
+        T                           raking_partial,    ///< [in] Prefix to seed reduction with
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return raking_partial;
+    }
+
+
+    /// Templated copy
+    template <int ITERATION>
+    __device__ __forceinline__ void CopySegment(
+        T*                  out,            ///< [out] Out array
+        T*                  in,             ///< [in] Input array
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        out[ITERATION] = in[ITERATION];
+        CopySegment(out, in, Int2Type<ITERATION + 1>());
+    }
+
+ 
+    /// Templated copy (base case)
+    __device__ __forceinline__ void CopySegment(
+        T*                  /*out*/,            ///< [out] Out array
+        T*                  /*in*/,             ///< [in] Input array
+        Int2Type<SEGMENT_LENGTH> /*iteration*/)
+    {}
+
+
+    /// Performs upsweep raking reduction, returning the aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data into registers
+        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+
+        T raking_partial = cached_segment[0];
+
+        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    /// Performs inclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            exclusive_output = *placement_ptr;
+        }
+    }
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Exclusive Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial);
+            }
+
+            CTA_SYNC();
+
+            // Grab exclusive partial from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial= Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T inclusive_partial;
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+
+                // Broadcast aggregate to all threads
+                if (linear_tid == RAKING_THREADS - 1)
+                    temp_storage.block_aggregate = inclusive_partial;
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial);
+
+                // Broadcast aggregate to other threads
+                if (linear_tid == 0)
+                    temp_storage.block_aggregate = block_aggregate;
+            }
+
+            CTA_SYNC();
+
+            // Grab exclusive partial from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            output = scan_op(block_prefix, output);
+            if (linear_tid == 0)
+                output = block_prefix;
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                WarpScan warp_scan(temp_storage.warp_scan);
+
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial, block_aggregate;
+                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
+
+                // Obtain block-wide prefix in lane0, then broadcast to other lanes
+                T block_prefix = block_prefix_callback_op(block_aggregate);
+                block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+                // Update prefix with warpscan exclusive partial
+                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
+                if (linear_tid == 0)
+                    downsweep_prefix = block_prefix;
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, downsweep_prefix);
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Exclusive Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T inclusive_partial;
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+
+                // Broadcast aggregate to all threads
+                if (linear_tid == RAKING_THREADS - 1)
+                    temp_storage.block_aggregate = inclusive_partial;
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            // Update prefix with exclusive warpscan partial
+            output = scan_op(block_prefix, output);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                WarpScan warp_scan(temp_storage.warp_scan);
+
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial, block_aggregate;
+                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
+
+                // Obtain block-wide prefix in lane0, then broadcast to other lanes
+                T block_prefix = block_prefix_callback_op(block_aggregate);
+                block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+                // Update prefix with warpscan exclusive partial
+                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
+                if (linear_tid == 0)
+                    downsweep_prefix = block_prefix;
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, downsweep_prefix);
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/cub/block/specializations/block_scan_warp_scans.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3835e484e132c31790e76f91f4af3d673d1dc957
--- /dev/null
+++ b/thrust/cub/block/specializations/block_scan_warp_scans.cuh
@@ -0,0 +1,391 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
+
+    /// Shared memory storage layout type
+
+    struct __align__(32) _TempStorage
+    {
+        T                               warp_aggregates[WARPS];
+        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
+        T                               block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARP>  /*addend_warp*/)
+    {
+        if (warp_id == WARP)
+            warp_prefix = block_aggregate;
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
+    }
+
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &/*warp_prefix*/,       ///< [out] The calling thread's partial reduction
+        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
+        T               &/*block_aggregate*/,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARPS> /*addend_warp*/)
+    {}
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();
+
+        // Accumulate block aggregates and save the one that is our warp's prefix
+        T warp_prefix;
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+
+        warp_prefix = scan_op(initial_value, warp_prefix);
+
+        if (warp_id == 0)
+            warp_prefix = initial_value;
+
+        return warp_prefix;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            exclusive_output = scan_op(warp_prefix, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = warp_prefix;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
+
+        // Apply warp prefix to our lane's partial
+        exclusive_output = scan_op(warp_prefix, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = warp_prefix;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        if (linear_tid > 0)
+        {
+            exclusive_output = scan_op(block_prefix, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            inclusive_output = scan_op(warp_prefix, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        exclusive_output = scan_op(block_prefix, exclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_scan_warp_scans2.cuh b/thrust/cub/block/specializations/block_scan_warp_scans2.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6617160d1be5018d2e63b6494a5c3df6813e247a
--- /dev/null
+++ b/thrust/cub/block/specializations/block_scan_warp_scans2.cuh
@@ -0,0 +1,435 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for warp-synchronous scans
+        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
+        T                                           warp_aggregates[WARPS];
+        T                                           block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARP>  addend_warp)
+    {
+        if (warp_id == WARP)
+            warp_prefix = block_aggregate;
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
+    }
+
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARPS> addend_warp)
+    {}
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();
+
+        // Accumulate block aggregates and save the one that is our warp's prefix
+        T warp_prefix;
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+
+        warp_prefix = scan_op(initial_value, warp_prefix);
+
+        if (warp_id == 0)
+            warp_prefix = initial_value;
+
+        return warp_prefix;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+//--------------------------------------------------
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
+//--------------------------------------------------
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            exclusive_output = scan_op(warp_prefix, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = warp_prefix;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp
+//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
+
+//--------------------------------------------------
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
+//--------------------------------------------------
+
+        // Apply warp prefix to our lane's partial
+        exclusive_output = scan_op(warp_prefix, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = warp_prefix;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        if (linear_tid > 0)
+        {
+            exclusive_output = scan_op(block_prefix, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            inclusive_output = scan_op(warp_prefix, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        exclusive_output = scan_op(block_prefix, exclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/block/specializations/block_scan_warp_scans3.cuh b/thrust/cub/block/specializations/block_scan_warp_scans3.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a8279d5765cd57eb80005b27084d29a963b97067
--- /dev/null
+++ b/thrust/cub/block/specializations/block_scan_warp_scans3.cuh
@@ -0,0 +1,417 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warp threads
+        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
+
+        /// Number of outer scan warps
+        OUTER_WARPS = INNER_WARP_THREADS
+    };
+
+    ///  Outer WarpScan utility type
+    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
+
+    ///  Inner WarpScan utility type
+    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
+
+    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        union Aliasable
+        {
+            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
+            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
+
+        } aliasable;
+
+        T                               warp_aggregates[OUTER_WARPS];
+
+        T                               block_aggregate;                           ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
+        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = outer_warp_exclusive;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+        {
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+        }
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        // Retrieve block aggregate
+        block_aggregate = temp_storage.block_aggregate;
+
+        // Apply warp prefix to our lane's partial
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];
+            T downsweep_prefix, block_aggregate;
+
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
+            if (linear_tid == 0)
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];
+            T downsweep_prefix, block_aggregate;
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
+            if (linear_tid == 0)
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid]    = downsweep_prefix;
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/cmake/cub-config-version.cmake b/thrust/cub/cmake/cub-config-version.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..4260ba66f57769d96f8cb8dbe9ab3ac543a35075
--- /dev/null
+++ b/thrust/cub/cmake/cub-config-version.cmake
@@ -0,0 +1,33 @@
+# Parse version information from version.cuh:
+file(READ "${CMAKE_CURRENT_LIST_DIR}/../version.cuh" CUB_VERSION_HEADER)
+string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
+set(CUB_VERSION_FLAT ${CMAKE_MATCH_1})
+# Note that CUB calls this the PATCH number, CMake calls it the TWEAK number:
+string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
+set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1})
+
+math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000")
+math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000")
+math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch"
+
+# Build comparison versions:
+set(CUB_COMPAT "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}")
+set(CUB_EXACT "${CUB_COMPAT}.${CUB_VERSION_TWEAK}")
+set(FIND_COMPAT "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}")
+set(FIND_EXACT "${FIND_COMPAT}.${PACKAGE_FIND_VERSION_TWEAK}")
+
+# Set default results
+set(PACKAGE_VERSION ${CUB_EXACT})
+set(PACKAGE_VERSION_UNSUITABLE FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_EXACT FALSE)
+
+# Test for compatibility (ignores tweak)
+if (FIND_COMPAT VERSION_EQUAL CUB_COMPAT)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+endif()
+
+# Test for exact (does not ignore tweak)
+if (FIND_EXACT VERSION_EQUAL CUB_EXACT)
+  set(PACKAGE_VERSION_EXACT TRUE)
+endif()
diff --git a/thrust/cub/cmake/cub-config.cmake b/thrust/cub/cmake/cub-config.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0900becd8fbcff9ee791c9b990ed2bf82e26f220
--- /dev/null
+++ b/thrust/cub/cmake/cub-config.cmake
@@ -0,0 +1,62 @@
+#
+# find_package(CUB) config file.
+#
+# Defines a CUB::CUB target that may be linked from user projects to include
+# CUB.
+
+if (TARGET CUB::CUB)
+  return()
+endif()
+
+function(_cub_declare_interface_alias alias_name ugly_name)
+  # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
+  # 2) When an IMPORTED library is linked to another target, its include
+  #    directories are treated as SYSTEM includes.
+  # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
+  #    system includes. This means that the Toolkit CUB will *always* be used
+  #    during compilation, and the include paths of an IMPORTED CUB::CUB
+  #    target will never have any effect.
+  # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
+  #    on EVERY target that links to CUB::CUB. This would be a burden and a
+  #    footgun for our users. Forgetting this would silently pull in the wrong CUB!
+  # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
+  #    configure it, and then ALIAS it into the namespace (or ALIAS and then
+  #    configure, that seems to work too).
+  add_library(${ugly_name} INTERFACE)
+  add_library(${alias_name} ALIAS ${ugly_name})
+endfunction()
+
+#
+# Setup targets
+#
+
+_cub_declare_interface_alias(CUB::CUB _CUB_CUB)
+# Strip out the 'cub/cmake/' from 'cub/cmake/cub-config.cmake':
+get_filename_component(_CUB_INCLUDE_DIR "../.." ABSOLUTE BASE_DIR "${CMAKE_CURRENT_LIST_DIR}")
+target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}")
+
+if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR
+    THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT")
+endif()
+
+if (CUB_IGNORE_DEPRECATED_CPP_11 OR
+    THRUST_IGNORE_DEPRECATED_CPP_11)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11")
+endif()
+
+if (CUB_IGNORE_DEPRECATED_COMPILER OR
+    THRUST_IGNORE_DEPRECATED_COMPILER)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER")
+endif()
+
+#
+# Standardize version info
+#
+
+set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "")
+set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "")
+set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "")
+set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "")
+set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "")
+set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "")
diff --git a/thrust/cub/config.cuh b/thrust/cub/config.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b909bbf723708e59a121b5525c628f6715e24c86
--- /dev/null
+++ b/thrust/cub/config.cuh
@@ -0,0 +1,40 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static configuration header for the CUB project.
+ */
+
+#pragma once
+
+#include "util_arch.cuh"
+#include "util_compiler.cuh"
+#include "util_cpp_dialect.cuh"
+#include "util_deprecated.cuh"
+#include "util_macro.cuh"
+#include "util_namespace.cuh"
diff --git a/thrust/cub/cub.cuh b/thrust/cub/cub.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a71d78fe0d5abdda4df5dc42e15de4ea17034ad4
--- /dev/null
+++ b/thrust/cub/cub.cuh
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * CUB umbrella include file
+ */
+
+#pragma once
+
+// Static configuration
+#include "config.cuh"
+
+// Block
+#include "block/block_histogram.cuh"
+#include "block/block_discontinuity.cuh"
+#include "block/block_exchange.cuh"
+#include "block/block_load.cuh"
+#include "block/block_radix_rank.cuh"
+#include "block/block_radix_sort.cuh"
+#include "block/block_reduce.cuh"
+#include "block/block_scan.cuh"
+#include "block/block_store.cuh"
+//#include "block/block_shift.cuh"
+
+// Device
+#include "device/device_histogram.cuh"
+#include "device/device_partition.cuh"
+#include "device/device_radix_sort.cuh"
+#include "device/device_reduce.cuh"
+#include "device/device_run_length_encode.cuh"
+#include "device/device_scan.cuh"
+#include "device/device_segmented_radix_sort.cuh"
+#include "device/device_segmented_reduce.cuh"
+#include "device/device_select.cuh"
+#include "device/device_spmv.cuh"
+
+// Grid
+//#include "grid/grid_barrier.cuh"
+#include "grid/grid_even_share.cuh"
+#include "grid/grid_mapping.cuh"
+#include "grid/grid_queue.cuh"
+
+// Thread
+#include "thread/thread_load.cuh"
+#include "thread/thread_operators.cuh"
+#include "thread/thread_reduce.cuh"
+#include "thread/thread_scan.cuh"
+#include "thread/thread_store.cuh"
+
+// Warp
+#include "warp/warp_reduce.cuh"
+#include "warp/warp_scan.cuh"
+
+// Iterator
+#include "iterator/arg_index_input_iterator.cuh"
+#include "iterator/cache_modified_input_iterator.cuh"
+#include "iterator/cache_modified_output_iterator.cuh"
+#include "iterator/constant_input_iterator.cuh"
+#include "iterator/counting_input_iterator.cuh"
+#include "iterator/discard_output_iterator.cuh"
+#include "iterator/tex_obj_input_iterator.cuh"
+#include "iterator/tex_ref_input_iterator.cuh"
+#include "iterator/transform_input_iterator.cuh"
+
+// Util
+#include "util_allocator.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_device.cuh"
+#include "util_macro.cuh"
+#include "util_ptx.cuh"
+#include "util_type.cuh"
+
diff --git a/thrust/cub/device/device_histogram.cuh b/thrust/cub/device/device_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4413ff3950d977145d8ac7c8617a0f72f4d64fd8
--- /dev/null
+++ b/thrust/cub/device/device_histogram.cuh
@@ -0,0 +1,866 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_histogram.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceHistogram}
+ *
+ */
+struct DeviceHistogram
+{
+    /******************************************************************//**
+     * \name Evenly-segmented bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
+        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
+        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT              lower_level1[1]     = {lower_level};
+        LevelT              upper_level1[1]     = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_samples,
+            1,
+            sizeof(SampleT) * num_samples,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage  = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
+        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
+        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT              lower_level1[1]     = {lower_level};
+        LevelT              upper_level1[1]     = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_row_samples,
+            num_rows,
+            row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_pixels;         // e.g., 5
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            lower_level,
+            upper_level,
+            num_pixels,
+            1,
+            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX))
+        {
+            // Down-convert OffsetT data type
+
+
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Custom bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of an six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels      // e.g., 7 (seven level boundaries for six bins)
+     * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        CounterT*           d_histogram1[1] = {d_histogram};
+        int                 num_levels1[1]  = {num_levels};
+        LevelT*             d_levels1[1]    = {d_levels};
+
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            d_levels1,
+            num_samples,
+            (OffsetT)1,
+            (size_t)(sizeof(SampleT) * num_samples),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ , , , , , , , ]
+     * int      num_levels          // e.g., 7 (seven level boundaries for six bins)
+     * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT*             d_levels1[1]        = {d_levels};
+
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            d_levels1,
+            num_row_samples,
+            num_rows,
+            row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int            num_pixels;       // e.g., 5
+     * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
+     *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
+     * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int            num_levels[3];    // e.g., {5, 5, 5};
+     * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 3, 0, 1],
+     * //                     [3, 0, 0, 2],
+     * //                     [0, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            d_levels,
+            num_pixels,
+            (OffsetT)1,
+            (size_t)(sizeof(SampleT) * NUM_CHANNELS * num_pixels),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
+     *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int              num_levels[3];      // e.g., {5, 5, 5};
+     * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [2, 3, 0, 1],
+     * //                     [3, 0, 0, 2],
+     * //                     [1, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX))
+        {
+            // Down-convert OffsetT data type
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_partition.cuh b/thrust/cub/device/device_partition.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..65db3b7b5ad4825822ef2ed0d485d638918d11b6
--- /dev/null
+++ b/thrust/cub/device/device_partition.cuh
@@ -0,0 +1,273 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
+ * a specified input sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DevicePartition}
+ *
+ * \par Performance
+ * \linear_performance{partition}
+ *
+ * \par
+ * The following chart illustrates DevicePartition::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected for the first partition.
+ * \plots_below
+ *
+ * \image html partition_if_int32_50_percent.png
+ *
+ */
+struct DevicePartition
+{
+    /**
+     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated partition-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected for the first partition with 50% probability.
+     *
+     * \image html partition_if_int32_50_percent.png
+     * \image html partition_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability for the first partition:
+     *
+     * \image html partition_if_int32_5_percent.png
+     * \image html partition_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            select_op,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_partition_flagged.cu
+ * \example example_device_partition_if.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_radix_sort.cuh b/thrust/cub/device/device_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..df218a7c3561709a8799a56c55fe5f34e4297d65
--- /dev/null
+++ b/thrust/cub/device/device_radix_sort.cuh
@@ -0,0 +1,796 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_radix_sort.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRadixSort}
+ *
+ * \par Performance
+ * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
+ * performance across different CUDA architectures for uniform-random \p uint32 keys.
+ * \plots_below
+ *
+ * \image html lsb_radix_sort_int32_keys.png
+ *
+ */
+struct DeviceRadixSort
+{
+
+    /******************************************************************//**
+     * \name KeyT-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_device_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_reduce.cuh b/thrust/cub/device/device_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4f01c2446abf418d0211d2540c87f4b5e339aa22
--- /dev/null
+++ b/thrust/cub/device/device_reduce.cuh
@@ -0,0 +1,734 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceReduce}
+ *
+ * \par Performance
+ * \linear_performance{reduction, reduce-by-key, and run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::Sum
+ * performance across different CUDA architectures for \p int32 keys.
+ *
+ * \image html reduce_int32.png
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::ReduceByKey (summation)
+ * performance across different CUDA architectures for \p fp32
+ * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+ *
+ * \image html reduce_by_key_fp32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceReduce
+{
+    /**
+     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     __device__ __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;  // e.g., 7
+     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;     // e.g., [-]
+     * CustomMin    min_op;
+     * int          init;       // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam ReductionOpT         <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    ReductionOpT,
+        typename                    T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
+        T                           init,                               ///< [in] Initial value of the reduction
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op,
+            init,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide sum using the addition (\p +) operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction.
+     * - Does not support \p + operators that are non-commutative..
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sum-reduction performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html reduce_int32.png
+     * \image html reduce_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Sum(),
+            OutputT(),            // zero-initialize
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Min(),
+            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+     *
+     * // d_out <-- [{5, 0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_items,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+     *
+     * // d_out <-- [9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Max(),
+            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+     *
+     * // d_out <-- [{6, 9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_items,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
+     *
+     * \par
+     * This operation computes segmented reductions within \p d_values_in using
+     * the specified binary \p reduction_op functor.  The segments are identified by
+     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
+     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
+     * the first key of the run and the corresponding value aggregate of that run are
+     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
+     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following chart illustrates reduction-by-key (sum) performance across
+     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
+     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+     *
+     * \image html reduce_by_key_fp32_len_500.png
+     * \image html reduce_by_key_fp64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html reduce_by_key_fp32_len_5.png
+     * \image html reduce_by_key_fp64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the segmented reduction of \p int values grouped
+     * by runs of associated \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+     * int          *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_num_runs_out;    // e.g., [-]
+     * CustomMin    reduction_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduce-by-key
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam KeysInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
+     * \tparam ValuesInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
+     * \tparam AggregatesOutputIterator <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     * \tparam ReductionOpT              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     */
+    template <
+        typename                    KeysInputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    ValuesInputIteratorT,
+        typename                    AggregatesOutputIteratorT,
+        typename                    NumRunsOutputIteratorT,
+        typename                    ReductionOpT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t ReduceByKey(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
+        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // FlagT iterator type (not used)
+
+        // Selection op (not used)
+
+        // Default == operator
+        typedef Equality EqualityOp;
+
+        return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_unique_out,
+            d_values_in,
+            d_aggregates_out,
+            d_num_runs_out,
+            EqualityOp(),
+            reduction_op,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_run_length_encode.cuh b/thrust/cub/device/device_run_length_encode.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e31ebf0142c1fc511245864602477f13f8728d63
--- /dev/null
+++ b/thrust/cub/device/device_run_length_encode.cuh
@@ -0,0 +1,278 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_rle.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
+ * computes a simple compressed representation of a sequence of input elements such that each
+ * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
+ * count of the elements in that run.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRunLengthEncode}
+ *
+ * \par Performance
+ * \linear_performance{run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
+ * different CUDA architectures for \p int32 items.
+ * Segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html rle_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceRunLengthEncode
+{
+
+    /**
+     * \brief Computes a run-length encoding of the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
+     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
+     *   respectively.
+     * - The total number of runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated encode performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html rle_int32_len_500.png
+     * \image html rle_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html rle_int32_len_5.png
+     * \image html rle_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_counts_out      <-- [1, 2, 1, 3, 1]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    LengthsOutputIteratorT,
+        typename                    NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Encode(
+        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
+        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
+        typedef NullType    SelectOp;                   // Selection op (not used)
+        typedef Equality    EqualityOp;                 // Default == operator
+        typedef cub::Sum    ReductionOp;                // Value reduction operator
+
+        // The lengths output value type
+        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+            OffsetT,                                                                                                    // ... then the OffsetT type,
+            typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+        // Generator type for providing 1s values for run-length reduction
+        typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
+
+        return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            LengthsInputIteratorT((LengthT) 1),
+            d_counts_out,
+            d_num_runs_out,
+            EqualityOp(),
+            ReductionOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
+     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
+     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
+     * - The total number of runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     *
+     * \par Snippet
+     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // d_offsets_out         <-- [1, 4]
+     * // d_lengths_out         <-- [2, 3]
+     * // d_num_runs_out        <-- [2]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OffsetsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                InputIteratorT,
+        typename                OffsetsOutputIteratorT,
+        typename                LengthsOutputIteratorT,
+        typename                NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t NonTrivialRuns(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
+        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
+        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef Equality    EqualityOp;                 // Default == operator
+
+        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs_out,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_scan.cuh b/thrust/cub/device/device_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a5902ceefa905fe6ce4ad773261a01c0fda80
--- /dev/null
+++ b/thrust/cub/device/device_scan.cuh
@@ -0,0 +1,443 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output sequence where each element is computed to be the reduction
+ * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ * the <em>i</em><sup>th</sup> output reduction.
+ *
+ * \par
+ * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our <em>"decoupled look-back"</em> algorithm
+ * for performing global prefix scan with only a single pass through the
+ * input data, as described in our 2016 technical report [1].  The central
+ * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
+ * of global prefix propagation with local computation.  As such, our algorithm requires only
+ * ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
+ * proceeds at "memcpy" speeds.
+ *
+ * \par
+ * [1] [Duane Merrill and Michael Garland.  "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ * \linear_performance{prefix scan}
+ *
+ * \par
+ * The following chart illustrates DeviceScan::ExclusiveSum
+ * performance across different CUDA architectures for \p int32 keys.
+ * \plots_below
+ *
+ * \image html scan_int32.png
+ *
+ */
+struct DeviceScan
+{
+    /******************************************************************//**
+     * \name Exclusive scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide exclusive prefix sum.  The value of 0 is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated exclusive sum performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html scan_int32.png
+     * \image html scan_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix sum
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveSum(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        // Initial value
+        OutputT init_value = 0;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            init_value,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op
+     * ...
+     *
+     * // Determine temporary device storage requirements for exclusive prefix scan
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
+     *
+     * // Allocate temporary storage for exclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix min-scan
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
+     *
+     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam Identity         <b>[inferred]</b> Type of the \p identity value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT,
+        typename        ScanOpT,
+        typename        InitValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveScan(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                            ///< [in] Binary scan functor
+        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            init_value,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix sum.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix sum
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix sum
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix sum
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveSum(
+        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
+        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            NullType(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix scan
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix min-scan
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT,
+        typename        ScanOpT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveScan(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                            ///< [in] Binary scan functor
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            NullType(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_device_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_segmented_radix_sort.cuh b/thrust/cub/device/device_segmented_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2ab2a7dde2d789535b34b89f48b26ca66512f7e8
--- /dev/null
+++ b/thrust/cub/device/device_segmented_radix_sort.cuh
@@ -0,0 +1,875 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_radix_sort.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedRadixSort}
+ *
+ */
+struct DeviceSegmentedRadixSort
+{
+
+    /******************************************************************//**
+     * \name Key-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename                KeyT,
+        typename                ValueT,
+        typename                OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename                KeyT,
+        typename                ValueT,
+        typename                OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_segmented_reduce.cuh b/thrust/cub/device/device_segmented_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..97308c5a5dff9132c80bb7e1ac50eab764c2ab6b
--- /dev/null
+++ b/thrust/cub/device/device_segmented_reduce.cuh
@@ -0,0 +1,619 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedReduce}
+ *
+ */
+struct DeviceSegmentedReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_segments;   // e.g., 3
+     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [-, -, -]
+     * CustomMin    min_op;
+     * int          initial_value;           // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT,
+        typename            ReductionOp,
+        typename            T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor 
+        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            reduction_op,
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction for each segment.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p + operators that are non-commutative..
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [21, 0, 17]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Sum(),
+            OutputT(),            // zero-initialize
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Min(),
+            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [8, INT_MIN, 9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Max(),
+            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_select.cuh b/thrust/cub/device/device_select.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..136d26044a7ccacfbab7744c6ec3b7bb8ba355a6
--- /dev/null
+++ b/thrust/cub/device/device_select.cuh
@@ -0,0 +1,369 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to selectively copy
+ * items from a specified input sequence to a compact output sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSelect}
+ *
+ * \par Performance
+ * \linear_performance{select-flagged, select-if, and select-unique}
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected.
+ *
+ * \image html select_if_int32_50_percent.png
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::Unique
+ * performance across different CUDA architectures for \p int32 items
+ * where segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html select_unique_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceSelect
+{
+    /**
+     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected with 50% probability.
+     *
+     * \image html select_if_int32_50_percent.png
+     * \image html select_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability:
+     *
+     * \image html select_if_int32_5_percent.png
+     * \image html select_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            select_op,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-unique performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html select_unique_int32_len_500.png
+     * \image html select_unique_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html select_unique_int32_len_5.png
+     * \image html select_unique_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [0, 2, 9, 5, 8]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Unique(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef Equality                EqualityOp;     // Default == operator
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_select_flagged.cu
+ * \example example_device_select_if.cu
+ * \example example_device_select_unique.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/device_spmv.cuh b/thrust/cub/device/device_spmv.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0be0c20e7b73a651cb1e0bdc063c160da32942d5
--- /dev/null
+++ b/thrust/cub/device/device_spmv.cuh
@@ -0,0 +1,174 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_spmv_orig.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>SpMV computation</em>](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
+ * performs the matrix-vector operation
+ * <em>y</em> = <em>alpha</em>*<b>A</b>*<em>x</em> + <em>beta</em>*<em>y</em>,
+ * where:
+ *  - <b>A</b> is an <em>m</em>x<em>n</em> sparse matrix whose non-zero structure is specified in
+ *    [<em>compressed-storage-row (CSR) format</em>](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
+ *    (i.e., three arrays: <em>values</em>, <em>row_offsets</em>, and <em>column_indices</em>)
+ *  - <em>x</em> and <em>y</em> are dense vectors
+ *  - <em>alpha</em> and <em>beta</em> are scalar multiplicands
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSpmv}
+ *
+ */
+struct DeviceSpmv
+{
+    /******************************************************************//**
+     * \name CSR matrix operations
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief This function performs the matrix-vector operation <em>y</em> = <b>A</b>*<em>x</em>.
+     *
+     * \par Snippet
+     * The code snippet below illustrates SpMV upon a 9x9 CSR matrix <b>A</b>
+     * representing a 3x3 lattice (24 non-zeros).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
+     * // and output vector y
+     * int    num_rows = 9;
+     * int    num_cols = 9;
+     * int    num_nonzeros = 24;
+     *
+     * float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1]
+     *
+     * int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
+     *                          //        4, 6, 1, 3, 5, 7, 2, 4,
+     *                          //        8, 3, 7, 4, 6, 8, 5, 7]
+     *
+     * int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
+     *
+     * float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
+     * float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros, alpha, beta);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run SpMV
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros, alpha, beta);
+     *
+     * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
+     *
+     * \endcode
+     *
+     * \tparam ValueT       <b>[inferred]</b> Matrix and vector value type (e.g., /p float, /p double, etc.)
+     */
+    template <
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t CsrMV(
+        void*               d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
+        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
+        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
+        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
+        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        SpmvParams<ValueT, int> spmv_params;
+        spmv_params.d_values             = d_values;
+        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
+        spmv_params.d_column_indices     = d_column_indices;
+        spmv_params.d_vector_x           = d_vector_x;
+        spmv_params.d_vector_y           = d_vector_y;
+        spmv_params.num_rows             = num_rows;
+        spmv_params.num_cols             = num_cols;
+        spmv_params.num_nonzeros         = num_nonzeros;
+        spmv_params.alpha                = 1.0;
+        spmv_params.beta                 = 0.0;
+
+        return DispatchSpmv<ValueT, int>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            spmv_params,
+            stream,
+            debug_synchronous);
+    }
+
+    //@}  end member group
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_histogram.cuh b/thrust/cub/device/dispatch/dispatch_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..879d5ddec2adedfbad1bc06da83929ddbe3868c8
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_histogram.cuh
@@ -0,0 +1,1087 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../../agent/agent_histogram.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Histogram kernel entry points
+ *****************************************************************************/
+
+/**
+ * Histogram initialization kernel entry point
+ */
+template <
+    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                        OffsetT>                        ///< Signed integer type for global offsets
+__global__ void DeviceHistogramInitKernel(
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
+    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    if ((threadIdx.x == 0) && (blockIdx.x == 0))
+        tile_queue.ResetDrain();
+
+    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    #pragma unroll
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
+            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
+    }
+}
+
+
+/**
+ * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
+ */
+template <
+    typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
+    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
+    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename                                            OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
+__global__ void DeviceHistogramSweepKernel(
+    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
+    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
+    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
+    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
+    int                                                     tiles_per_row,                      ///< Number of image tiles per row
+    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for compositing input tiles
+    typedef AgentHistogram<
+            AgentHistogramPolicyT,
+            PRIVATIZED_SMEM_BINS,
+            NUM_CHANNELS,
+            NUM_ACTIVE_CHANNELS,
+            SampleIteratorT,
+            CounterT,
+            PrivatizedDecodeOpT,
+            OutputDecodeOpT,
+            OffsetT>
+        AgentHistogramT;
+
+    // Shared memory for AgentHistogram
+    __shared__ typename AgentHistogramT::TempStorage temp_storage;
+
+    AgentHistogramT agent(
+        temp_storage,
+        d_samples,
+        num_output_bins_wrapper.array,
+        num_privatized_bins_wrapper.array,
+        d_output_histograms_wrapper.array,
+        d_privatized_histograms_wrapper.array,
+        output_decode_op_wrapper.array,
+        privatized_decode_op_wrapper.array);
+
+    // Initialize counters
+    agent.InitBinCounters();
+
+    // Consume input tiles
+    agent.ConsumeTiles(
+        num_row_pixels,
+        num_rows,
+        row_stride_samples,
+        tiles_per_row,
+        tile_queue);
+
+    // Store output to global (if necessary)
+    agent.StoreOutput();
+
+}
+
+
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
+ */
+template <
+    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
+    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    LevelT,                     ///< Type for specifying bin level boundaries
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DipatchHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample value type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    enum
+    {
+        // Maximum number of bins per channel for which we will use a privatized smem strategy
+        MAX_PRIVATIZED_SMEM_BINS = 256
+    };
+
+
+    //---------------------------------------------------------------------
+    // Transform functors for converting samples to bin-ids
+    //---------------------------------------------------------------------
+
+    // Searches for bin given a list of bin-boundary levels
+    template <typename LevelIteratorT>
+    struct SearchTransform
+    {
+        LevelIteratorT  d_levels;                   // Pointer to levels array
+        int             num_output_levels;          // Number of levels in array
+
+        // Initializer
+        __host__ __device__ __forceinline__ void Init(
+            LevelIteratorT  d_levels,               // Pointer to levels array
+            int             num_output_levels)      // Number of levels in array
+        {
+            this->d_levels          = d_levels;
+            this->num_output_levels = num_output_levels;
+        }
+
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            /// Level iterator wrapper type
+            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
+                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+                    LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
+                WrappedLevelIteratorT;
+
+            WrappedLevelIteratorT wrapped_levels(d_levels);
+
+            int num_bins = num_output_levels - 1;
+            if (valid)
+            {
+                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
+                if (bin >= num_bins)
+                    bin = -1;
+            }
+        }
+    };
+
+
+    // Scales samples to evenly-spaced bins
+    struct ScaleTransform
+    {
+        int    num_bins;    // Number of levels in array
+        LevelT max;         // Max sample level (exclusive)
+        LevelT min;         // Min sample level (inclusive)
+        LevelT scale;       // Bin scaling factor
+
+        // Initializer
+        template <typename _LevelT>
+        __host__ __device__ __forceinline__ void Init(
+            int     num_output_levels,  // Number of levels in array
+            _LevelT max,                // Max sample level (exclusive)
+            _LevelT min,                // Min sample level (inclusive)
+            _LevelT scale)              // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = scale;
+        }
+
+        // Initializer (float specialization)
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            float   max,                // Max sample level (exclusive)
+            float   min,                // Min sample level (inclusive)
+            float   scale)              // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = float(1.0) / scale;
+        }
+
+        // Initializer (double specialization)
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            double max,                 // Max sample level (exclusive)
+            double min,                 // Min sample level (inclusive)
+            double scale)               // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = double(1.0) / scale;
+        }
+
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) / scale);
+        }
+
+        // Method for converting samples to bin-ids (float specialization)
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+
+        // Method for converting samples to bin-ids (double specialization)
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+    };
+
+
+    // Pass-through bin transform operator
+    struct PassThruTransform
+    {
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            if (valid)
+                bin = (int) sample;
+        }
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    template <int NOMINAL_ITEMS_PER_THREAD>
+    struct TScale
+    {
+        enum
+        {
+            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
+            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
+        };
+    };
+
+
+    /// SM11
+    struct Policy110
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                512,
+                (NUM_CHANNELS == 1) ? 8 : 2,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                (NUM_CHANNELS == 1) ? 256 : 128,
+                (NUM_CHANNELS == 1) ? 8 : 3,
+                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                512,
+                (NUM_CHANNELS == 1) ? 8 : 2,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM35
+    struct Policy350
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                128,
+                TScale<8>::VALUE,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLEND,
+                true>
+            HistogramSweepPolicy;
+    };
+
+    /// SM50
+    struct Policy500
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                384,
+                TScale<16>::VALUE,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t InitConfigs(
+        int             ptx_version,
+        KernelConfig    &histogram_sweep_config)
+    {
+        cudaError_t result = cudaErrorNotSupported;
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 500)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 350)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
+                }
+                else
+                {
+                    result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
+                }
+            #endif
+        }
+        return result;
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration
+     */
+    struct KernelConfig
+    {
+        int                             block_threads;
+        int                             pixels_per_thread;
+
+        template <typename BlockPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t Init()
+        {
+            block_threads               = BlockPolicy::BLOCK_THREADS;
+            pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
+
+            return cudaSuccess;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Privatization-based dispatch routine
+     */
+    template <
+        typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+        typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+        typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
+        typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t PrivatizedDispatch(
+        void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
+        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
+        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
+        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
+        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
+        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histogram_sweep_kernel
+            int histogram_sweep_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histogram_sweep_sm_occupancy,
+                histogram_sweep_kernel,
+                histogram_sweep_config.block_threads))) break;
+
+            // Get device occupancy for histogram_sweep_kernel
+            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
+
+            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
+            {
+                // Treat as a single linear array of samples
+                num_row_pixels      *= num_rows;
+                num_rows            = 1;
+                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
+            }
+
+            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
+            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
+            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
+            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
+            int blocks_per_col      = (blocks_per_row > 0) ?
+                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
+                                        0;
+            int num_thread_blocks   = blocks_per_row * blocks_per_col;
+
+            dim3 sweep_grid_dims;
+            sweep_grid_dims.x = (unsigned int) blocks_per_row;
+            sweep_grid_dims.y = (unsigned int) blocks_per_col;
+            sweep_grid_dims.z = 1;
+
+            // Temporary storage allocation requirements
+            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
+            void*       allocations[NUM_ALLOCATIONS] = {};
+            size_t      allocation_sizes[NUM_ALLOCATIONS];
+
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
+
+            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the grid queue descriptor
+            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
+
+            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
+
+            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
+
+            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
+
+            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
+
+            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
+
+            int histogram_init_block_threads    = 256;
+            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
+
+            // Log DeviceHistogramInitKernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
+                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
+
+            // Invoke histogram_init_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                histogram_init_grid_dims, histogram_init_block_threads, 0,
+                stream
+            ).doit(histogram_init_kernel,
+                num_output_bins_wrapper,
+                d_output_histograms_wrapper,
+                tile_queue);
+
+            // Return if empty problem
+            if ((blocks_per_row == 0) || (blocks_per_col == 0))
+                break;
+
+            // Log histogram_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
+                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
+                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
+
+            // Invoke histogram_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
+            ).doit(histogram_sweep_kernel,
+                d_samples,
+                num_output_bins_wrapper,
+                num_privatized_bins_wrapper,
+                d_output_histograms_wrapper,
+                d_privatized_histograms_wrapper,
+                output_decode_op_wrapper,
+                privatized_decode_op_wrapper,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                tiles_per_row,
+                tile_queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the search transform op for converting samples to privatized bins
+            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            // Dispatch
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Too many bins to keep in shared memory.
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the search transform op for converting privatized bins to output bins
+            typedef SearchTransform<LevelT*> OutputDecodeOpT;
+
+            int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the scale transform op for converting samples to privatized bins
+            typedef ScaleTransform PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+
+                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the scale transform op for converting privatized bins to output bins
+            typedef ScaleTransform OutputDecodeOpT;
+
+            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_radix_sort.cuh b/thrust/cub/device/dispatch/dispatch_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2b0919fa1c2c65969590538e55b10e652eea9756
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -0,0 +1,1660 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_radix_sort_upsweep.cuh"
+#include "../../agent/agent_radix_sort_downsweep.cuh"
+#include "../../agent/agent_scan.cuh"
+#include "../../block/block_radix_sort.cuh"
+#include "../../config.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep digit-counting kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortUpsweepKernel(
+    const KeyT              *d_keys,                        ///< [in] Input keys buffer
+    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
+{
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>
+        ::Type ActiveUpsweepPolicyT;
+
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>
+        ::Type ActiveDownsweepPolicyT;
+
+    enum {
+        TILE_ITEMS = CUB_MAX(
+            ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+            ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    };
+
+    // Parameterize AgentRadixSortUpsweep type for the current configuration
+    typedef AgentRadixSortUpsweep<
+            ActiveUpsweepPolicyT,
+            KeyT,
+            OffsetT>
+        AgentRadixSortUpsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
+
+    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
+
+    CTA_SYNC();
+
+    // Write out digit counts (striped)
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanBinsKernel(
+    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int                     num_counts)                     ///< [in] Total number of bin-counts
+{
+    // Parameterize the AgentScan type for the current configuration
+    typedef AgentScan<
+            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
+            OffsetT*,
+            OffsetT*,
+            cub::Sum,
+            OffsetT,
+            OffsetT>
+        AgentScanT;
+
+    // Shared memory storage
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Block scan instance
+    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
+
+    // Process full input tiles
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
+    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
+        block_offset += AgentScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortDownsweepKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
+{
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>
+        ::Type ActiveUpsweepPolicyT;
+
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>
+        ::Type ActiveDownsweepPolicyT;
+
+    enum {
+        TILE_ITEMS = CUB_MAX(
+            ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+            ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    };
+
+    // Parameterize AgentRadixSortDownsweep type for the current configuration
+    typedef AgentRadixSortDownsweep<
+            ActiveDownsweepPolicyT,
+            IS_DESCENDING,
+            KeyT,
+            ValueT,
+            OffsetT>
+        AgentRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__  typename AgentRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    // Process input tiles
+    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+/**
+ * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceRadixSortSingleTileKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS           = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // BlockRadixSort type
+    typedef BlockRadixSort<
+            KeyT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            ValueT,
+            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
+            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
+        BlockRadixSortT;
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        KeyT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;
+
+    // Unsigned word for key bits
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;
+
+    // Shared memory storage
+    __shared__ union TempStorage
+    {
+        typename BlockRadixSortT::TempStorage       sort;
+        typename BlockLoadKeys::TempStorage         load_keys;
+        typename BlockLoadValues::TempStorage       load_values;
+
+    } temp_storage;
+
+    // Keys and values for the block
+    KeyT            keys[ITEMS_PER_THREAD];
+    ValueT          values[ITEMS_PER_THREAD];
+
+    // Get default (min/max) value for out-of-bounds keys
+    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
+    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);
+
+    // Load keys
+    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);
+
+    CTA_SYNC();
+
+    // Load values
+    if (!KEYS_ONLY)
+    {
+        // Register pressure work-around: moving num_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        num_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(num_items, 0, 0xffffffff);
+
+        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
+
+        CTA_SYNC();
+    }
+
+    // Sort tile
+    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
+        keys,
+        values,
+        current_bit,
+        end_bit,
+        Int2Type<IS_DESCENDING>(),
+        Int2Type<KEYS_ONLY>());
+
+    // Store keys and values
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
+        if (item_offset < num_items)
+        {
+            d_keys_out[item_offset] = keys[ITEM];
+            if (!KEYS_ONLY)
+                d_values_out[item_offset] = values[ITEM];
+        }
+    }
+}
+
+
+/**
+ * Segmented radix sorting pass (one block per segment)
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedRadixSortKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
+{
+    //
+    // Constants
+    //
+
+    typedef typename If<(ALT_DIGIT_BITS),
+        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
+        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
+
+    enum
+    {
+        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
+        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        RADIX_DIGITS        = 1 << RADIX_BITS,
+        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Upsweep type
+    typedef AgentRadixSortUpsweep<SegmentedPolicyT, KeyT, OffsetT> BlockUpsweepT;
+
+    // Digit-scan type
+    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;
+
+    // Downsweep type
+    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
+    };
+
+    //
+    // Process input tiles
+    //
+
+    // Shared memory storage
+    __shared__ union
+    {
+        typename BlockUpsweepT::TempStorage     upsweep;
+        typename BlockDownsweepT::TempStorage   downsweep;
+        struct
+        {
+            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];
+            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];
+            typename DigitScanT::TempStorage        scan;
+        };
+
+    } temp_storage;
+
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+    OffsetT num_items       = segment_end - segment_begin;
+
+    // Check if empty segment
+    if (num_items <= 0)
+        return;
+
+    // Upsweep
+    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
+    upsweep.ProcessRegion(segment_begin, segment_end);
+
+    CTA_SYNC();
+
+    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
+    upsweep.ExtractCounts(bin_count);
+
+    CTA_SYNC();
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin counts
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    // Scan
+    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
+
+    #pragma unroll
+    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+    {
+        bin_offset[track] += segment_begin;
+    }
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin offsets
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    CTA_SYNC();
+
+    // Downsweep
+    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
+    downsweep.ProcessRegion(segment_begin, segment_end);
+}
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+/**
+ * Tuning policy for kernel specialization
+ */
+template <
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT>       ///< Signed integer type for global offsets
+struct DeviceRadixSortPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+    // Dominant-sized key/value type
+    typedef typename If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>::Type DominantT;
+
+    //------------------------------------------------------------------------------
+    // Architecture-specific tuning policies
+    //------------------------------------------------------------------------------
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+
+            // Relative size of KeyT type to a 4-byte word
+            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <512, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
+        };
+
+        // Scan policy
+        typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 9, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef DownsweepPolicyKeys DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 15, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+
+
+    };
+
+
+    /// SM50
+    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <160, 39, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 31, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 11, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
+    };
+
+
+    /// SM60 (GP100)
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <256, 25, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+
+    };
+
+
+    /// SM61 (GP104)
+    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <384, 31, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 35, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// SM62 (Tegra, less RF)
+    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM70 (GV100)
+    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 256 : 128, 47, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>     UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// MaxPolicy
+    typedef Policy700 MaxPolicy;
+
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT,       ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceRadixSortPolicy<KeyT, ValueT, OffsetT> >
+struct DispatchRadixSort :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version
+    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version),
+        is_overwrite_okay(is_overwrite_okay)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block to sort in-core
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Log single_tile_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
+                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
+
+            // Invoke upsweep_kernel with same grid size as downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_keys.Current(),
+                d_keys.Alternate(),
+                d_values.Current(),
+                d_values.Alternate(),
+                num_items,
+                begin_bit,
+                end_bit);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update selector
+            d_keys.selector ^= 1;
+            d_values.selector ^= 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation
+    //------------------------------------------------------------------------------
+
+    /**
+     * Invoke a three-kernel sorting pass at the current bit.
+     */
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        OffsetT         *d_spine,
+        int             spine_length,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log upsweep_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
+                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
+
+            // Invoke upsweep_kernel with same grid size as downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.upsweep_config.block_threads, 0, stream
+            ).doit(pass_config.upsweep_kernel,
+                d_keys_in,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log scan_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
+
+            // Invoke scan_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, pass_config.scan_config.block_threads, 0, stream
+            ).doit(pass_config.scan_kernel,
+                d_spine,
+                spine_length);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log downsweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
+                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
+
+            // Invoke downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.downsweep_config.block_threads, 0, stream
+            ).doit(pass_config.downsweep_kernel,
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+
+    /// Pass configuration structure
+    template <
+        typename UpsweepKernelT,
+        typename ScanKernelT,
+        typename DownsweepKernelT>
+    struct PassConfig
+    {
+        UpsweepKernelT          upsweep_kernel;
+        KernelConfig            upsweep_config;
+        ScanKernelT             scan_kernel;
+        KernelConfig            scan_config;
+        DownsweepKernelT        downsweep_kernel;
+        KernelConfig            downsweep_config;
+        int                     radix_bits;
+        int                     radix_digits;
+        int                     max_downsweep_grid_size;
+        GridEvenShare<OffsetT>  even_share;
+
+        /// Initialize pass configuration
+        template <
+            typename UpsweepPolicyT,
+            typename ScanPolicyT,
+            typename DownsweepPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(
+            UpsweepKernelT      upsweep_kernel,
+            ScanKernelT         scan_kernel,
+            DownsweepKernelT    downsweep_kernel,
+            int                 ptx_version,
+            int                 sm_count,
+            int                 num_items)
+        {
+            cudaError error = cudaSuccess;
+            do
+            {
+                this->upsweep_kernel    = upsweep_kernel;
+                this->scan_kernel       = scan_kernel;
+                this->downsweep_kernel  = downsweep_kernel;
+                radix_bits              = DownsweepPolicyT::RADIX_BITS;
+                radix_digits            = 1 << radix_bits;
+
+                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
+                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
+                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
+
+                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+
+                even_share.DispatchInit(
+                    num_items,
+                    max_downsweep_grid_size,
+                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+            }
+            while (0);
+            return error;
+        }
+
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)upsweep_kernel;
+        (void)alt_upsweep_kernel;
+        (void)scan_kernel;
+        (void)downsweep_kernel;
+        (void)alt_downsweep_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular and alternate-digit kernel configurations
+            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template InitPassConfig<
+                    typename ActivePolicyT::UpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::DownsweepPolicy>(
+                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            if ((error = alt_pass_config.template InitPassConfig<
+                    typename ActivePolicyT::AltUpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::AltDownsweepPolicy>(
+                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            // Get maximum spine length
+            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[3] = {};
+            size_t allocation_sizes[3] =
+            {
+                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+            // Alias the temporary storage allocations
+            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                d_spine, spine_length, current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_spine, spine_length, current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
+
+                // Invert selectors
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
+                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        OffsetT                 num_items,              ///< [in] Number of items to sort
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,              ///< Key type
+    typename ValueT,            ///< Value type
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceRadixSortPolicy<KeyT, ValueT, OffsetT> >
+struct DispatchSegmentedRadixSort :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Parameter members
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version
+    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructors
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        is_overwrite_okay(is_overwrite_okay),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Multi-segment invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a three-kernel sorting pass at the current bit.
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log kernel configuration
+            if (debug_synchronous)
+            {
+              _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
+                      "%lld items per thread, %lld SM occupancy, "
+                      "current bit %d, bit_grain %d\n",
+                      (long long)num_segments,
+                      (long long)pass_config.segmented_config.block_threads,
+                      (long long)stream,
+                      (long long)pass_config.segmented_config.items_per_thread,
+                      (long long)pass_config.segmented_config.sm_occupancy,
+                      current_bit,
+                      pass_bits);
+            }
+
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments, pass_config.segmented_config.block_threads, 0,
+                stream
+            ).doit(pass_config.segmented_kernel,
+                d_keys_in, d_keys_out,
+                d_values_in,  d_values_out,
+                d_begin_offsets, d_end_offsets, num_segments,
+                current_bit, pass_bits);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /// PassConfig data structure
+    template <typename SegmentedKernelT>
+    struct PassConfig
+    {
+        SegmentedKernelT    segmented_kernel;
+        KernelConfig        segmented_config;
+        int                 radix_bits;
+        int                 radix_digits;
+
+        /// Initialize pass configuration
+        template <typename SegmentedPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
+        {
+            this->segmented_kernel  = segmented_kernel;
+            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
+            this->radix_digits      = 1 << radix_bits;
+
+            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
+        }
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+        SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+      (void)segmented_kernel;
+      (void)alt_segmented_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Init regular and alternate kernel configurations
+            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
+            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
+
+            // Temporary storage allocation requirements
+            void* allocations[2] = {};
+            size_t allocation_sizes[2] =
+            {
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                if (temp_storage_bytes == 0)
+                    temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
+            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+                // Invert selectors and update current bit
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+
+    /// Internal dispatch routine
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,              ///< [in] Number of items to sort
+        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchSegmentedRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, num_segments, d_begin_offsets, d_end_offsets,
+                begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_reduce.cuh b/thrust/cub/device/dispatch/dispatch_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c9a5e4fbe0f33c19e774b47dc0231fcc2d1851c7
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_reduce.cuh
@@ -0,0 +1,885 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_reduce.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../config.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
+{
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
+
+    // Output result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = block_aggregate;
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                     ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceReduceSingleTileKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                  init)                       ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Check if empty problem
+    if (num_items == 0)
+    {
+        if (threadIdx.x == 0)
+            *d_out = init;
+        return;
+    }
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        OffsetT(0),
+        num_items);
+
+    // Output result
+    if (threadIdx.x == 0)
+        *d_out = reduction_op(init, block_aggregate);
+}
+
+
+/// Normalize input iterator to segment offset
+template <typename T, typename OffsetT, typename IteratorT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    T &/*val*/,
+    OffsetT /*base_offset*/,
+    IteratorT /*itr*/)
+{}
+
+
+/// Normalize input iterator to segment offset (specialized for arg-index)
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    KeyValuePairT &val,
+    OffsetT base_offset,
+    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
+{
+    val.key -= base_offset;
+}
+
+
+/**
+ * Segmented reduction (one block per segment)
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the sorting data
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                 init)                       ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+
+    // Check if empty problem
+    if (segment_begin == segment_end)
+    {
+        if (threadIdx.x == 0)
+            d_out[blockIdx.x] = init;
+        return;
+    }
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        segment_begin,
+        segment_end);
+
+    // Normalize as needed
+    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
+
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = reduction_op(init, block_aggregate);;
+}
+
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+    typename InputT,            ///< Input data type
+    typename OutputT,           ///< Compute/output data type
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct DeviceReducePolicy
+{
+    //------------------------------------------------------------------------------
+    // Architecture-specific tuning policies
+    //------------------------------------------------------------------------------
+
+    /// SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
+    {
+        // ReducePolicy
+        typedef AgentReducePolicy<
+                128, 8, InputT,                        ///< Threads per block, items per thread, compute type
+                2,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
+        typedef AgentReducePolicy<
+                128, 8, InputT,                        ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
+        typedef AgentReducePolicy<
+                256, 20, InputT,                       ///< Threads per block, items per thread, compute type, compute type
+                2,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
+        typedef AgentReducePolicy<
+                256, 20, InputT,                       ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                               ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+    /// SM60
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+    {
+        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
+        typedef AgentReducePolicy<
+                256, 16, InputT,                       ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                               ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// MaxPolicy
+    typedef Policy600 MaxPolicy;
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OutputT =          ///< Data type of the output iterator
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+    typename SelectedPolicy = DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OutputT,
+        OffsetT,
+        ReductionOpT> >
+struct DispatchReduce :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
+    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
+    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor
+    OutputT             init;                           ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;                    ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_items,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_in(d_in),
+        d_out(d_out),
+        num_items(num_items),
+        reduction_op(reduction_op),
+        init(init),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block block to reduce in-core
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke single_reduce_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_in,
+                d_out,
+                num_items,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation (two-pass)
+    //------------------------------------------------------------------------------
+
+    /// Invoke two-passes to reduce
+    template <
+        typename                ActivePolicyT,              ///< Umbrella policy active for the target device
+        typename                ReduceKernelT,              ///< Function type of cub::DeviceReduceKernel
+        typename                SingleTileKernelT>          ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)                  reduce_kernel;
+        (void)                  single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular kernel configuration
+            KernelConfig reduce_config;
+            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
+            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
+
+            // Even-share work distribution
+            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+            GridEvenShare<OffsetT> even_share;
+            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
+
+            // Temporary storage allocation requirements
+            void* allocations[1] = {};
+            size_t allocation_sizes[1] =
+            {
+                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            OutputT *d_block_reductions = (OutputT*) allocations[0];
+
+            // Get grid size for device_reduce_sweep_kernel
+            int reduce_grid_size = even_share.grid_size;
+
+            // Log device_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_grid_size,
+                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
+                reduce_config.sm_occupancy);
+
+            // Invoke DeviceReduceKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS,
+                0, stream
+            ).doit(reduce_kernel,
+                d_in,
+                d_block_reductions,
+                num_items,
+                even_share,
+                reduction_op);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke DeviceReduceSingleTileKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_block_reductions,
+                d_out,
+                reduce_grid_size,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
+                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
+        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchReduce dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_in, d_out, num_items, reduction_op, init,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OutputT =          ///< Data type of the output iterator
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+    typename SelectedPolicy = DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OutputT,
+        OffsetT,
+        ReductionOpT> >
+struct DispatchSegmentedReduce :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
+    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
+    OutputT             init;                   ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;            ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_in(d_in),
+        d_out(d_out),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        reduction_op(reduction_op),
+        init(init),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <
+        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
+        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)segmented_reduce_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Init kernel configuration
+            KernelConfig segmented_reduce_config;
+            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
+
+            // Log device_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                num_segments,
+                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
+                segmented_reduce_config.sm_occupancy);
+
+            // Invoke DeviceReduceKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments,
+                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream
+            ).doit(segmented_reduce_kernel,
+                d_in,
+                d_out,
+                d_begin_offsets,
+                d_end_offsets,
+                num_segments,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+
+    }
+
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
+        int             num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
+
+        if (num_segments <= 0)
+            return cudaSuccess;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchSegmentedReduce dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_in, d_out,
+                num_segments, d_begin_offsets, d_end_offsets,
+                reduction_op, init,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_reduce_by_key.cuh b/thrust/cub/device/dispatch/dispatch_reduce_by_key.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d8d8dcac41965eb006fbad4e0e94db14359a835b
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -0,0 +1,560 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_reduce_by_key.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Multi-block reduce-by-key sweep kernel entry point
+ */
+template <
+    typename            AgentReduceByKeyPolicyT,                 ///< Parameterized AgentReduceByKeyPolicyT tuning policy type
+    typename            KeysInputIteratorT,                     ///< Random-access input iterator type for keys
+    typename            UniqueOutputIteratorT,                  ///< Random-access output iterator type for keys
+    typename            ValuesInputIteratorT,                   ///< Random-access input iterator type for values
+    typename            AggregatesOutputIteratorT,              ///< Random-access output iterator type for values
+    typename            NumRunsOutputIteratorT,                 ///< Output iterator type for recording number of segments encountered
+    typename            ScanTileStateT,                         ///< Tile status interface type
+    typename            EqualityOpT,                            ///< KeyT equality operator type
+    typename            ReductionOpT,                           ///< ValueT reduction operator type
+    typename            OffsetT>                                ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
+__global__ void DeviceReduceByKeyKernel(
+    KeysInputIteratorT          d_keys_in,                      ///< Pointer to the input sequence of keys
+    UniqueOutputIteratorT       d_unique_out,                   ///< Pointer to the output sequence of unique keys (one key per run)
+    ValuesInputIteratorT        d_values_in,                    ///< Pointer to the input sequence of corresponding values
+    AggregatesOutputIteratorT   d_aggregates_out,               ///< Pointer to the output sequence of value aggregates (one aggregate per run)
+    NumRunsOutputIteratorT      d_num_runs_out,                 ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+    ScanTileStateT              tile_state,                     ///< Tile status interface
+    int                         start_tile,                     ///< The starting tile for the current grid
+    EqualityOpT                 equality_op,                    ///< KeyT equality operator
+    ReductionOpT                reduction_op,                   ///< ValueT reduction operator
+    OffsetT                     num_items)                      ///< Total number of items to select from
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentReduceByKey<
+            AgentReduceByKeyPolicyT,
+            KeysInputIteratorT,
+            UniqueOutputIteratorT,
+            ValuesInputIteratorT,
+            AggregatesOutputIteratorT,
+            NumRunsOutputIteratorT,
+            EqualityOpT,
+            ReductionOpT,
+            OffsetT>
+        AgentReduceByKeyT;
+
+    // Shared memory for AgentReduceByKey
+    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename    KeysInputIteratorT,         ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,      ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,       ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,  ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording number of segments encountered
+    typename    EqualityOpT,                ///< KeyT equality operator type
+    typename    ReductionOpT,               ///< ValueT reduction operator type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchReduceByKey
+{
+    //-------------------------------------------------------------------------
+    // Types and constants
+    //-------------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS     = 128,
+        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
+        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+
+    //-------------------------------------------------------------------------
+    // Tuning policies
+    //-------------------------------------------------------------------------
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM11
+    struct Policy110
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            ReduceByKeyPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &reduce_by_key_config)
+    {
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
+                }
+                else
+                {
+                    reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduce-by-key using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelT,         ///< Function type of cub::DeviceScanInitKernel
+        typename                    ReduceByKeyKernelT>      ///< Function type of cub::DeviceReduceByKeyKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                  ///< [in] Total number of items to select from
+        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelT                init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
+        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+      (void)d_temp_storage;
+      (void)temp_storage_bytes;
+      (void)d_keys_in;
+      (void)d_unique_out;
+      (void)d_values_in;
+      (void)d_aggregates_out;
+      (void)d_num_runs_out;
+      (void)equality_op;
+      (void)reduction_op;
+      (void)num_items;
+      (void)stream;
+      (void)debug_synchronous;
+      (void)init_kernel;
+      (void)reduce_by_key_kernel;
+      (void)reduce_by_key_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
+                tile_state,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for reduce_by_key_kernel
+            int reduce_by_key_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                reduce_by_key_sm_occupancy,            // out
+                reduce_by_key_kernel,
+                reduce_by_key_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log reduce_by_key_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
+
+                // Invoke reduce_by_key_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    scan_grid_size, reduce_by_key_config.block_threads, 0,
+                    stream
+                ).doit(reduce_by_key_kernel,
+                    d_keys_in,
+                    d_unique_out,
+                    d_values_in,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    start_tile,
+                    equality_op,
+                    reduction_op,
+                    num_items);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                      ///< [in] Total number of items to select from
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig reduce_by_key_config;
+            InitConfigs(ptx_version, reduce_by_key_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_unique_out,
+                d_values_in,
+                d_aggregates_out,
+                d_num_runs_out,
+                equality_op,
+                reduction_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
+                reduce_by_key_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_rle.cuh b/thrust/cub/device/dispatch/dispatch_rle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..68f887151604c627901c4bbdd377a73bc95f9537
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_rle.cuh
@@ -0,0 +1,542 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_rle.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOp functor type != NullType
+ * Otherwise performs flag-based selection if FlagIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            ScanTileStateT,              ///< Tile status interface type
+    typename            EqualityOpT,                 ///< T equality operator type
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
+__global__ void DeviceRleSweepKernel(
+    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
+    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
+    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+    ScanTileStateT              tile_status,        ///< [in] Tile status interface
+    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
+    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    typedef AgentRle<
+        AgentRlePolicyT,
+        InputIteratorT,
+        OffsetsOutputIteratorT,
+        LengthsOutputIteratorT,
+        EqualityOpT,
+        OffsetT> AgentRleT;
+
+    // Shared memory for AgentRle
+    __shared__ typename AgentRleT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_runs_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
+ */
+template <
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            EqualityOpT,                ///< T equality operator type
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+struct DeviceRleDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                96,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig&   device_rle_config)
+    {
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                device_rle_config.template Init<PtxRleSweepPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
+                }
+                else
+                {
+                    device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
+     */
+    struct KernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_policy;
+        bool                    store_warp_time_slicing;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename AgentRlePolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
+            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
+            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
+            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
+            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                store_warp_time_slicing,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide run-length-encode using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
+        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernelPtr
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
+        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
+        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log device_scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(device_scan_init_kernel,
+                tile_status,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for device_rle_sweep_kernel
+            int device_rle_kernel_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                device_rle_kernel_sm_occupancy,            // out
+                device_rle_sweep_kernel,
+                device_rle_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log device_rle_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
+
+            // Invoke device_rle_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, device_rle_config.block_threads, 0, stream
+            ).doit(device_rle_sweep_kernel,
+                d_in,
+                d_offsets_out,
+                d_lengths_out,
+                d_num_runs_out,
+                tile_status,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig device_rle_config;
+            InitConfigs(ptx_version, device_rle_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_offsets_out,
+                d_lengths_out,
+                d_num_runs_out,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
+                device_rle_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_scan.cuh b/thrust/cub/device/dispatch/dispatch_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..24b30f102cfca976cfb61b354f2ad7719255e3c8
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_scan.cuh
@@ -0,0 +1,493 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ */
+template <
+    typename            ScanTileStateT>     ///< Tile status interface type
+__global__ void DeviceScanInitKernel(
+    ScanTileStateT      tile_state,         ///< [in] Tile status interface
+    int                 num_tiles)          ///< [in] Number of tiles
+{
+    // Initialize tile status
+    tile_state.InitializeStatus(num_tiles);
+}
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ */
+template <
+    typename                ScanTileStateT,         ///< Tile status interface type
+    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
+__global__ void DeviceCompactInitKernel(
+    ScanTileStateT          tile_state,             ///< [in] Tile status interface
+    int                     num_tiles,              ///< [in] Number of tiles
+    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+{
+    // Initialize tile status
+    tile_state.InitializeStatus(num_tiles);
+
+    // Initialize d_num_selected_out
+    if ((blockIdx.x == 0) && (threadIdx.x == 0))
+        *d_num_selected_out = 0;
+}
+
+
+/**
+ * Scan kernel entry point (multi-block)
+ */
+template <
+    typename            ScanPolicyT,        ///< Parameterized ScanPolicyT tuning policy type
+    typename            InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename            OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename            ScanTileStateT,     ///< Tile status interface type
+    typename            ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename            InitValueT,         ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
+    typename            OffsetT>            ///< Signed integer type for global offsets
+__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
+__global__ void DeviceScanKernel(
+    InputIteratorT      d_in,               ///< Input data
+    OutputIteratorT     d_out,              ///< Output data
+    ScanTileStateT      tile_state,         ///< Tile status interface
+    int                 start_tile,         ///< The starting tile for the current grid
+    ScanOpT             scan_op,            ///< Binary scan functor
+    InitValueT          init_value,         ///< Initial value to seed the exclusive scan
+    OffsetT             num_items)          ///< Total number of scan items for the entire problem
+{
+    // Thread block type for scanning input tiles
+    typedef AgentScan<
+        ScanPolicyT,
+        InputIteratorT,
+        OutputIteratorT,
+        ScanOpT,
+        InitValueT,
+        OffsetT> AgentScanT;
+
+    // Shared memory for AgentScan
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+    typename OutputT> ///< Data type
+struct DeviceScanPolicy
+{
+
+    /// SM10
+    struct Policy100 : ChainedPolicy<100, Policy100, Policy100>
+    {
+        typedef AgentScanPolicy<
+                64, 9,                                          ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy100>
+    {
+        typedef AgentScanPolicy<
+                96, 21,                                         ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicyT;
+    };
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        typedef AgentScanPolicy<
+                256, 9,                                         ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                BLOCK_SCAN_RAKING>
+            ScanPolicyT;
+    };
+
+    /// SM520
+    struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
+    {
+        // Titan X: 32.47B items/s @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM600
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
+    {
+        typedef AgentScanPolicy<
+                128, 15,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// MaxPolicy
+    typedef Policy600 MaxPolicy;
+};
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
+ */
+template <
+    typename InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename InitValueT,          ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
+    typename OffsetT,            ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceScanPolicy<
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type> >
+struct DispatchScan:
+    SelectedPolicy
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128
+    };
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    void*           d_temp_storage;         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t&         temp_storage_bytes;     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT  d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out;                  ///< [out] Pointer to the output sequence of data items
+    ScanOpT         scan_op;                ///< [in] Binary scan functor
+    InitValueT      init_value;             ///< [in] Initial value to seed the exclusive scan
+    OffsetT         num_items;              ///< [in] Total number of input items (i.e., the length of \p d_in)
+    cudaStream_t    stream;                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool            debug_synchronous;
+    int             ptx_version;
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchScan(
+        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        ScanOpT         scan_op,                ///< [in] Binary scan functor
+        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
+        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous,
+        int             ptx_version
+    ):
+    d_temp_storage(d_temp_storage),
+    temp_storage_bytes(temp_storage_bytes),
+    d_in(d_in),
+    d_out(d_out),
+    num_items(num_items),
+    scan_op(scan_op),
+    init_value(init_value),
+    stream(stream),
+    debug_synchronous(debug_synchronous),
+    ptx_version(ptx_version)
+    {}
+
+    template <typename ActivePolicyT, typename InitKernel, typename ScanKernel>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        (void)init_kernel;
+        (void)scan_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        typedef typename ActivePolicyT::ScanPolicyT Policy;
+        typedef typename cub::ScanTileState<OutputT> ScanTileStateT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
+                tile_state,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+
+            // Get SM occupancy for scan_kernel
+            int scan_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                scan_sm_occupancy,            // out
+                scan_kernel,
+                Policy::BLOCK_THREADS))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log scan_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy);
+
+                // Invoke scan_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    scan_grid_size, Policy::BLOCK_THREADS, 0, stream
+                ).doit(scan_kernel,
+                    d_in,
+                    d_out,
+                    tile_state,
+                    start_tile,
+                    scan_op,
+                    init_value,
+                    num_items);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::ScanPolicyT Policy;
+        typedef typename cub::ScanTileState<OutputT> ScanTileStateT;
+        // Ensure kernels are instantiated.
+        return Invoke<ActivePolicyT>(
+            DeviceScanInitKernel<ScanTileStateT>,
+            DeviceScanKernel<Policy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>
+        );
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                ///< [in] Binary scan functor
+        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchScan::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchScan dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            scan_op,
+            init_value,
+            stream,
+            debug_synchronous,
+            ptx_version
+            );
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/device/dispatch/dispatch_select_if.cuh b/thrust/cub/device/dispatch/dispatch_select_if.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5fec4cff72a6e45fed308aaf9658e4ec13190d02
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_select_if.cuh
@@ -0,0 +1,546 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_select_if.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
+    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
+    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
+    typename            ScanTileStateT,             ///< Tile status interface type
+    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename            OffsetT,                    ///< Signed integer type for global offsets
+    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
+__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
+__global__ void DeviceSelectSweepKernel(
+    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
+    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
+    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
+    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+    ScanTileStateT          tile_status,            ///< [in] Tile status interface
+    SelectOpT               select_op,              ///< [in] Selection operator
+    EqualityOpT             equality_op,            ///< [in] Equality operator
+    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    typedef AgentSelectIf<
+        AgentSelectIfPolicyT,
+        InputIteratorT,
+        FlagsInputIteratorT,
+        SelectedOutputIteratorT,
+        SelectOpT,
+        EqualityOpT,
+        OffsetT,
+        KEEP_REJECTS> AgentSelectIfT;
+
+    // Shared memory for AgentSelectIf
+    __shared__ typename AgentSelectIfT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_selected_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
+    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct DispatchSelectIf
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 10,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectIfPolicyT;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            SelectIfPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &select_if_config)
+    {
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                select_if_config.template Init<PtxSelectIfPolicyT>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
+                }
+                else
+                {
+                    select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelPtrT,             ///< Function type of cub::DeviceScanInitKernel
+        typename                    SelectIfKernelPtrT>             ///< Function type of cub::SelectIfKernelPtrT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
+        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+        (void)d_temp_storage;
+        (void)temp_storage_bytes;
+        (void)d_in;
+        (void)d_flags;
+        (void)d_selected_out;
+        (void)d_num_selected_out;
+        (void)select_op;
+        (void)equality_op;
+        (void)num_items;
+        (void)stream;
+        (void)debug_synchronous;
+        (void)scan_init_kernel;
+        (void)select_if_kernel;
+        (void)select_if_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke scan_init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(scan_init_kernel,
+                tile_status,
+                num_tiles,
+                d_num_selected_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for select_if_kernel
+            int range_select_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                range_select_sm_occupancy,            // out
+                select_if_kernel,
+                select_if_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log select_if_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
+
+            // Invoke select_if_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, select_if_config.block_threads, 0, stream
+            ).doit(select_if_kernel,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig select_if_config;
+            InitConfigs(ptx_version, select_if_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
+                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
+                select_if_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/device/dispatch/dispatch_spmv_orig.cuh b/thrust/cub/device/dispatch/dispatch_spmv_orig.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fb431df2cbdd363028f7d76826c185bc8417ab8a
--- /dev/null
+++ b/thrust/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -0,0 +1,850 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/single_pass_scan_operators.cuh"
+#include "../../agent/agent_segment_fixup.cuh"
+#include "../../agent/agent_spmv_orig.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * SpMV kernel entry points
+ *****************************************************************************/
+
+/**
+ * Spmv search kernel. Identifies merge path starting coordinates for each tile.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for sequence offsets
+__global__ void DeviceSpmv1ColKernel(
+    SpmvParams<ValueT, OffsetT> spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
+
+    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (row_idx < spmv_params.num_rows)
+    {
+        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
+        OffsetT     nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];
+
+        ValueT value = 0.0;
+        if (end_nonzero_idx != nonzero_idx)
+        {
+            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
+        }
+
+        spmv_params.d_vector_y[row_idx] = value;
+    }
+}
+
+
+/**
+ * Spmv search kernel. Identifies merge path starting coordinates for each tile.
+ */
+template <
+    typename    SpmvPolicyT,                    ///< Parameterized SpmvPolicy tuning policy type
+    typename    OffsetT,                        ///< Signed integer type for sequence offsets
+    typename    CoordinateT,                    ///< Merge path coordinate type
+    typename    SpmvParamsT>                    ///< SpmvParams type
+__global__ void DeviceSpmvSearchKernel(
+    int             num_merge_tiles,            ///< [in] Number of SpMV merge tiles (spmv grid size)
+    CoordinateT*    d_tile_coordinates,         ///< [out] Pointer to the temporary array of tile starting coordinates
+    SpmvParamsT     spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    typedef CacheModifiedInputIterator<
+            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
+    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (tile_idx < num_merge_tiles + 1)
+    {
+        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
+        CoordinateT                     tile_coordinate;
+        CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+        // Search the merge path
+        MergePathSearch(
+            diagonal,
+            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+            nonzero_indices,
+            spmv_params.num_rows,
+            spmv_params.num_nonzeros,
+            tile_coordinate);
+
+        // Output starting offset
+        d_tile_coordinates[tile_idx] = tile_coordinate;
+    }
+}
+
+
+/**
+ * Spmv agent entry point
+ */
+template <
+    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
+    typename        ScanTileStateT,             ///< Tile status interface type
+    typename        ValueT,                     ///< Matrix and vector value type
+    typename        OffsetT,                    ///< Signed integer type for sequence offsets
+    typename        CoordinateT,                ///< Merge path coordinate type
+    bool            HAS_ALPHA,                  ///< Whether the input parameter Alpha is 1
+    bool            HAS_BETA>                   ///< Whether the input parameter Beta is 0
+__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
+__global__ void DeviceSpmvKernel(
+    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
+    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
+    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+    int                             num_tiles,                  ///< [in] Number of merge tiles
+    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for fixup reduce-by-key kernel
+    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
+{
+    // Spmv agent type specialization
+    typedef AgentSpmv<
+            SpmvPolicyT,
+            ValueT,
+            OffsetT,
+            HAS_ALPHA,
+            HAS_BETA>
+        AgentSpmvT;
+
+    // Shared memory for AgentSpmv
+    __shared__ typename AgentSpmvT::TempStorage temp_storage;
+
+    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
+        d_tile_coordinates,
+        d_tile_carry_pairs,
+        num_tiles);
+
+    // Initialize fixup tile status
+    tile_state.InitializeStatus(num_segment_fixup_tiles);
+
+}
+
+
+/**
+ * Multi-block reduce-by-key sweep kernel entry point
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    typename    ScanTileStateT>                 ///< Tile status interface type
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
+__global__ void DeviceSegmentFixupKernel(
+    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
+    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
+    OffsetT                     num_items,          ///< [in] Total number of items to select from
+    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
+    ScanTileStateT              tile_state)         ///< [in] Tile status interface
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentSegmentFixup<
+            AgentSegmentFixupPolicyT,
+            PairsInputIteratorT,
+            AggregatesOutputIteratorT,
+            cub::Equality,
+            cub::Sum,
+            OffsetT>
+        AgentSegmentFixupT;
+
+    // Shared memory for AgentSegmentFixup
+    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
+        num_items,
+        num_tiles,
+        tile_state);
+}
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
+ */
+template <
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchSpmv
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128
+    };
+
+    // SpmvParams bundle type
+    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;
+
+    // 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    /// SM11
+    struct Policy110
+    {
+        typedef AgentSpmvPolicy<
+                128,
+                1,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                18,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_RAKING>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+
+    /// SM30
+    struct Policy300
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                6,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+    /// SM35
+    struct Policy350
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 96 : 128,
+                (sizeof(ValueT) > 4) ? 4 : 7,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM37
+    struct Policy370
+    {
+
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 128 : 128,
+                (sizeof(ValueT) > 4) ? 9 : 14,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM50
+    struct Policy500
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 6 : 7,
+                LOAD_LDG,
+                LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM60
+    struct Policy600
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 5 : 7,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 600)
+    typedef Policy600 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 370)
+    typedef Policy370 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
+    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &spmv_config,
+        KernelConfig    &segment_fixup_config)
+    {
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                spmv_config.template Init<PtxSpmvPolicyT>();
+                segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 600)
+                {
+                    spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 500)
+                {
+                    spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 370)
+                {
+                    spmv_config.template            Init<typename Policy370::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy370::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 350)
+                {
+                    spmv_config.template            Init<typename Policy350::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy350::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    spmv_config.template            Init<typename Policy300::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy300::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    spmv_config.template            Init<typename Policy200::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy200::SegmentFixupPolicyT>();
+                }
+                else
+                {
+                    spmv_config.template            Init<typename Policy110::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy110::SegmentFixupPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction using the
+     * specified kernel functions.
+     *
+     * If the input is larger than a single tile, this method uses two-passes of
+     * kernel invocations.
+     */
+    template <
+        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
+        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
+        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
+        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
+        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
+        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
+        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
+        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
+        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            if (spmv_params.num_cols == 1)
+            {
+                if (d_temp_storage == NULL)
+                {
+                    // Return if the caller is simply requesting the size of the storage allocation
+                    temp_storage_bytes = 1;
+                    break;
+                }
+
+                // Get search/init grid dims
+                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
+                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+
+                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, 0,
+                    stream
+                ).doit(spmv_1col_kernel,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                break;
+            }
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Total number of spmv work items
+            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
+
+            // Tile sizes of kernels
+            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
+            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
+
+            // Number of tiles for kernels
+            int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
+            int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+
+            // Get SM occupancy for kernels
+            int spmv_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                spmv_sm_occupancy,
+                spmv_kernel,
+                spmv_config.block_threads))) break;
+
+            int segment_fixup_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                segment_fixup_sm_occupancy,
+                segment_fixup_kernel,
+                segment_fixup_config.block_threads))) break;
+
+            // Get grid dimensions
+            dim3 spmv_grid_size(
+                CUB_MIN(num_merge_tiles, max_dim_x),
+                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            dim3 segment_fixup_grid_size(
+                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
+                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            // Get the temporary storage allocation requirements
+            size_t allocation_sizes[3];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
+            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
+            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            void* allocations[3] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Alias the other allocations
+            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
+            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates
+
+            // Get search/init grid dims
+            int search_block_size   = INIT_KERNEL_THREADS;
+            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Init textures
+                    if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+                }
+            #endif
+
+            if (search_grid_size < sm_count)
+//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
+            {
+                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
+                d_tile_coordinates = NULL;
+            }
+            else
+            {
+                // Use separate search kernel if we have enough spmv tiles to saturate the device
+
+                // Log spmv_search_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    search_grid_size, search_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    search_grid_size, search_block_size, 0, stream
+                ).doit(spmv_search_kernel,
+                    num_merge_tiles,
+                    d_tile_coordinates,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log spmv_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
+
+            // Invoke spmv_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                spmv_grid_size, spmv_config.block_threads, 0, stream
+            ).doit(spmv_kernel,
+                spmv_params,
+                d_tile_coordinates,
+                d_tile_carry_pairs,
+                num_merge_tiles,
+                tile_state,
+                num_segment_fixup_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Run reduce-by-key fixup if necessary
+            if (num_merge_tiles > 1)
+            {
+                // Log segment_fixup_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
+
+                // Invoke segment_fixup_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    segment_fixup_grid_size, segment_fixup_config.block_threads,
+                    0, stream
+                ).doit(segment_fixup_kernel,
+                    d_tile_carry_pairs,
+                    spmv_params.d_vector_y,
+                    num_merge_tiles,
+                    num_segment_fixup_tiles,
+                    tile_state);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Free textures
+                    if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+                }
+            #endif
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig spmv_config, segment_fixup_config;
+            InitConfigs(ptx_version, spmv_config, segment_fixup_config);
+
+            if (CubDebug(error = Dispatch(
+                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
+                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
+                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
+                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
+                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
+                spmv_config, segment_fixup_config))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/grid/grid_barrier.cuh b/thrust/cub/grid/grid_barrier.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1bcb533ee47ffa21157528fc895e107a9c9fa65e
--- /dev/null
+++ b/thrust/cub/grid/grid_barrier.cuh
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+
+#pragma once
+
+#include "../util_debug.cuh"
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+class GridBarrier
+{
+protected :
+
+    typedef unsigned int SyncFlag;
+
+    // Counters in global device memory
+    SyncFlag* d_sync;
+
+public:
+
+    /**
+     * Constructor
+     */
+    GridBarrier() : d_sync(NULL) {}
+
+
+    /**
+     * Synchronize
+     */
+    __device__ __forceinline__ void Sync() const
+    {
+        volatile SyncFlag *d_vol_sync = d_sync;
+
+        // Threadfence and syncthreads to make sure global writes are visible before
+        // thread-0 reports in with its sync counter
+        __threadfence();
+        CTA_SYNC();
+
+        if (blockIdx.x == 0)
+        {
+            // Report in ourselves
+            if (threadIdx.x == 0)
+            {
+                d_vol_sync[blockIdx.x] = 1;
+            }
+
+            CTA_SYNC();
+
+            // Wait for everyone else to report in
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+
+            // Let everyone know it's safe to proceed
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                d_vol_sync[peer_block] = 0;
+            }
+        }
+        else
+        {
+            if (threadIdx.x == 0)
+            {
+                // Report in
+                d_vol_sync[blockIdx.x] = 1;
+
+                // Wait for acknowledgment
+                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+        }
+    }
+};
+
+
+/**
+ * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
+ *
+ * Uses RAII for lifetime, i.e., device resources are reclaimed when
+ * the destructor is called.
+ */
+class GridBarrierLifetime : public GridBarrier
+{
+protected:
+
+    // Number of bytes backed by d_sync
+    size_t sync_bytes;
+
+public:
+
+    /**
+     * Constructor
+     */
+    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
+
+
+    /**
+     * DeviceFrees and resets the progress counters
+     */
+    cudaError_t HostReset()
+    {
+        cudaError_t retval = cudaSuccess;
+        if (d_sync)
+        {
+            CubDebug(retval = cudaFree(d_sync));
+            d_sync = NULL;
+        }
+        sync_bytes = 0;
+        return retval;
+    }
+
+
+    /**
+     * Destructor
+     */
+    virtual ~GridBarrierLifetime()
+    {
+        HostReset();
+    }
+
+
+    /**
+     * Sets up the progress counters for the next kernel launch (lazily
+     * allocating and initializing them if necessary)
+     */
+    cudaError_t Setup(int sweep_grid_size)
+    {
+        cudaError_t retval = cudaSuccess;
+        do {
+            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
+            if (new_sync_bytes > sync_bytes)
+            {
+                if (d_sync)
+                {
+                    if (CubDebug(retval = cudaFree(d_sync))) break;
+                }
+
+                sync_bytes = new_sync_bytes;
+
+                // Allocate and initialize to zero
+                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
+                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
+            }
+        } while (0);
+
+        return retval;
+    }
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/grid/grid_even_share.cuh b/thrust/cub/grid/grid_even_share.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d5f8b340ce7c72b701b1192844c9ebc8a9c3b6ef
--- /dev/null
+++ b/thrust/cub/grid/grid_even_share.cuh
@@ -0,0 +1,224 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
+ */
+
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "grid_mapping.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridEvenShare is a descriptor utility for distributing input among
+ * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
+ * the same number of input tiles.
+ *
+ * \par Overview
+ * Each thread block is assigned a consecutive sequence of input tiles.  To help
+ * preserve alignment and eliminate the overhead of guarded loads for all but the
+ * last thread block, to GridEvenShare assigns one of three different amounts of
+ * work to a given thread block: "big", "normal", or "last".  The "big" workloads
+ * are one scheduling grain larger than "normal".  The "last" work unit for the
+ * last thread block may be partially-full if the input is not an even multiple of
+ * the scheduling grain size.
+ *
+ * \par
+ * Before invoking a child grid, a parent thread will typically construct an
+ * instance of GridEvenShare.  The instance can be passed to child thread blocks
+ * which can initialize their per-thread block offsets using \p BlockInit().
+ */
+template <typename OffsetT>
+struct GridEvenShare
+{
+private:
+
+    OffsetT     total_tiles;
+    int         big_shares;
+    OffsetT     big_share_items;
+    OffsetT     normal_share_items;
+    OffsetT     normal_base_offset;
+
+public:
+
+    /// Total number of input items
+    OffsetT     num_items;
+
+    /// Grid size in thread blocks
+    int         grid_size;
+
+    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
+    OffsetT     block_offset;
+
+    /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
+    OffsetT     block_end;
+
+    /// Stride between input tiles
+    OffsetT     block_stride;
+
+
+    /**
+     * \brief Constructor.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare() :
+        total_tiles(0),
+        big_shares(0),
+        big_share_items(0),
+        normal_share_items(0),
+        normal_base_offset(0),
+        num_items(0),
+        grid_size(0),
+        block_offset(0),
+        block_end(0),
+        block_stride(0)
+    {}
+
+
+    /**
+     * \brief Dispatch initializer. To be called prior prior to kernel launch.
+     */
+    __host__ __device__ __forceinline__ void DispatchInit(
+        OffsetT num_items,          ///< Total number of input items
+        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
+        int     tile_items)         ///< Number of data items per input tile
+    {
+        this->block_offset          = num_items;    // Initialize past-the-end
+        this->block_end             = num_items;    // Initialize past-the-end
+        this->num_items             = num_items;
+        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
+        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
+        OffsetT avg_tiles_per_block = total_tiles / grid_size;
+        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
+        this->normal_share_items    = avg_tiles_per_block * tile_items;
+        this->normal_base_offset    = big_shares * tile_items;
+        this->big_share_items       = normal_share_items + tile_items;
+    }
+
+
+    /**
+     * \brief Initializes ranges for the specified thread block index.  Specialized
+     * for a "raking" access pattern in which each thread block is assigned a
+     * consecutive sequence of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
+    {
+        block_stride = TILE_ITEMS;
+        if (block_id < big_shares)
+        {
+            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
+            block_offset = (block_id * big_share_items);
+            block_end = block_offset + big_share_items;
+        }
+        else if (block_id < total_tiles)
+        {
+            // This thread block gets a normal share of grains (avg_tiles_per_block)
+            block_offset = normal_base_offset + (block_id * normal_share_items);
+            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
+        }
+        // Else default past-the-end
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for a "raking" access
+     * pattern in which each thread block is assigned a consecutive sequence
+     * of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
+    {
+        block_stride = grid_size * TILE_ITEMS;
+        block_offset = (block_id * TILE_ITEMS);
+        block_end = num_items;
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for "strip mining" access
+     * pattern in which the input tiles assigned to each thread block are
+     * separated by a stride equal to the the extent of the grid.
+     */
+    template <
+        int TILE_ITEMS,
+        GridMappingStrategy STRATEGY>
+    __device__ __forceinline__ void BlockInit()
+    {
+        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for a "raking" access
+     * pattern in which each thread block is assigned a consecutive sequence
+     * of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        this->block_offset = block_offset;
+        this->block_end = block_end;
+        this->block_stride = TILE_ITEMS;
+    }
+
+
+};
+
+
+
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/grid/grid_mapping.cuh b/thrust/cub/grid/grid_mapping.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..889a94c96ea3d75c7c519034eabc53b3e04db11f
--- /dev/null
+++ b/thrust/cub/grid/grid_mapping.cuh
@@ -0,0 +1,113 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Mapping policies
+ *****************************************************************************/
+
+
+/**
+ * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+enum GridMappingStrategy
+{
+    /**
+     * \brief An a "raking" access pattern in which each thread block is
+     * assigned a consecutive sequence of input tiles
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p segments, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each segment is comprised of
+     * consecutive tiles, where a tile is a small, constant-sized unit of input
+     * to be processed to completion before the thread block terminates or
+     * obtains more work.  The kernel invokes \p p thread blocks, each
+     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
+     * in tile-size increments.
+     */
+    GRID_MAPPING_RAKE,
+
+    /**
+     * \brief An a "strip mining" access pattern in which the input tiles assigned
+     * to each thread block are separated by a stride equal to the the extent of
+     * the grid.
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p sets, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each set is comprised of
+     * data tiles separated by stride \p tiles, where a tile is a small,
+     * constant-sized unit of input to be processed to completion before the
+     * thread block terminates or obtains more work.  The kernel invokes \p p
+     * thread blocks, each of which iteratively consumes a segment of
+     * <em>n</em>/<em>p</em> elements in tile-size increments.
+     */
+    GRID_MAPPING_STRIP_MINE,
+
+    /**
+     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
+     *
+     * \par Overview
+     * The input is treated as a queue to be dynamically consumed by a grid of
+     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
+     * unit of input to be processed to completion before the thread block
+     * terminates or obtains more work.  The grid size \p p is constant,
+     * loosely corresponding to the number of thread blocks that may actively
+     * reside on the target device.
+     */
+    GRID_MAPPING_DYNAMIC,
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/grid/grid_queue.cuh b/thrust/cub/grid/grid_queue.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b5f676b03c09256bf32cc8e6ae56c31d5d11a7a
--- /dev/null
+++ b/thrust/cub/grid/grid_queue.cuh
@@ -0,0 +1,244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provides abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size".  The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items. Threads can safely drain until the array's logical fill-size is
+ * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
+ * will be filling.  (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
+ *
+ * \tparam OffsetT Signed integer type for global offsets
+ */
+template <typename OffsetT>
+class GridQueue
+{
+private:
+
+    /// Counter indices
+    enum
+    {
+        FILL    = 0,
+        DRAIN   = 1,
+    };
+
+    /// Pair of counters
+    OffsetT *d_counters;
+
+public:
+
+    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
+    __host__ __device__ __forceinline__
+    static size_t AllocationSize()
+    {
+        return sizeof(OffsetT) * 2;
+    }
+
+
+    /// Constructs an invalid GridQueue descriptor
+    __host__ __device__ __forceinline__ GridQueue()
+    :
+        d_counters(NULL)
+    {}
+
+
+    /// Constructs a GridQueue descriptor around the device storage allocation
+    __host__ __device__ __forceinline__ GridQueue(
+        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
+    :
+        d_counters((OffsetT*) d_storage)
+    {}
+
+
+    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
+        OffsetT fill_size,
+        cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = fill_size;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                OffsetT counters[2];
+                counters[FILL] = fill_size;
+                counters[DRAIN] = 0;
+                result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
+    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// Returns the fill-size established by the parent or by the previous kernel.
+    __host__ __device__ __forceinline__ cudaError_t FillSize(
+        OffsetT &fill_size,
+        cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                fill_size = d_counters[FILL];
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// Drain \p num_items from the queue.  Returns offset from which to read items.  To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + DRAIN, num_items);
+    }
+
+
+    /// Fill \p num_items into the queue.  Returns offset from which to write items.    To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + FILL, num_items);
+    }
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Reset grid queue (call with 1 block of 1 thread)
+ */
+template <typename OffsetT>
+__global__ void FillAndResetDrainKernel(
+    GridQueue<OffsetT>   grid_queue,
+    OffsetT              num_items)
+{
+    grid_queue.FillAndResetDrain(num_items);
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/cub/host/mutex.cuh b/thrust/cub/host/mutex.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9880dee57ca8d0e0ef878e86a22b88e8f53767af
--- /dev/null
+++ b/thrust/cub/host/mutex.cuh
@@ -0,0 +1,167 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple portable mutex
+ */
+
+#include "../util_cpp_dialect.cuh"
+
+#pragma once
+
+#if CUB_CPP_DIALECT >= 2011
+    #include <mutex>
+#else
+    #if defined(_WIN32) || defined(_WIN64)
+        #include <intrin.h>
+
+        #define WIN32_LEAN_AND_MEAN
+        #define NOMINMAX
+        #include <windows.h>
+        #undef WIN32_LEAN_AND_MEAN
+        #undef NOMINMAX
+
+        /**
+         * Compiler read/write barrier
+         */
+        #pragma intrinsic(_ReadWriteBarrier)
+
+    #endif
+#endif
+
+#include "../config.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Simple portable mutex
+ *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
+ *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
+ */
+struct Mutex
+{
+#if CUB_CPP_DIALECT >= 2011
+
+    std::mutex mtx;
+
+    void Lock()
+    {
+        mtx.lock();
+    }
+
+    void Unlock()
+    {
+        mtx.unlock();
+    }
+
+#else       // C++11
+
+    #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+
+        // Microsoft VC++
+        typedef long Spinlock;
+
+    #else
+
+        // GNU g++
+        typedef int Spinlock;
+
+        /**
+         * Compiler read/write barrier
+         */
+        __forceinline__ void _ReadWriteBarrier()
+        {
+            __sync_synchronize();
+        }
+
+        /**
+         * Atomic exchange
+         */
+        __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
+        {
+            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
+            _ReadWriteBarrier();
+            return __sync_lock_test_and_set(Target, Value);
+        }
+
+        /**
+         * Pause instruction to prevent excess processor bus usage
+         */
+        __forceinline__ void YieldProcessor()
+        {
+        }
+
+    #endif  // MSVC
+
+        /// Lock member
+        volatile Spinlock lock;
+
+        /**
+         * Constructor
+         */
+        Mutex() : lock(0) {}
+
+        /**
+         * Return when the specified spinlock has been acquired
+         */
+        __forceinline__ void Lock()
+        {
+            while (1)
+            {
+                if (!_InterlockedExchange(&lock, 1)) return;
+                while (lock) YieldProcessor();
+            }
+        }
+
+
+        /**
+         * Release the specified spinlock
+         */
+        __forceinline__ void Unlock()
+        {
+            _ReadWriteBarrier();
+            lock = 0;
+        }
+
+#endif      // C++11
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/cub/iterator/arg_index_input_iterator.cuh b/thrust/cub/iterator/arg_index_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f16fab8c26eb489979ff9e9fc9278d923637f9a9
--- /dev/null
+++ b/thrust/cub/iterator/arg_index_input_iterator.cuh
@@ -0,0 +1,259 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+
+#include <thrust/version.h>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
+ *
+ * \par Overview
+ * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT.
+ *   Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose
+ *   \p key field is \p i and whose \p value field is <tt>itr[i]</tt>.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto
+ * dereference an array of doubles
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::ArgIndexInputIterator<double*> itr(d_in);
+ *
+ * // Within device code:
+ * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
+ * Tuple item_offset_pair.key = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 8.0 @ 0
+ *
+ * itr = itr + 6;
+ * item_offset_pair.key = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 9.0 @ 6
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT       The value type of the wrapped input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ * \tparam OutputValueT         The paired value type of the <offset,value> tuple (Default: value type of input iterator)
+ */
+template <
+    typename    InputIteratorT,
+    typename    OffsetT             = ptrdiff_t,
+    typename    OutputValueT        = typename std::iterator_traits<InputIteratorT>::value_type>
+class ArgIndexInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ArgIndexInputIterator                       self_type;              ///< My own type
+    typedef OffsetT                                     difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef KeyValuePair<difference_type, OutputValueT> value_type;             ///< The type of the element the iterator can point to
+    typedef value_type*                                 pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef value_type                                  reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    InputIteratorT  itr;
+    difference_type offset;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArgIndexInputIterator(
+        InputIteratorT  itr,            ///< Input iterator to wrap
+        difference_type offset = 0)     ///< OffsetT (in items) from \p itr denoting the position of the iterator
+    :
+        itr(itr),
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        value_type retval;
+        retval.value = itr[offset];
+        retval.key = offset;
+        return retval;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(itr, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(itr, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((itr == rhs.itr) && (offset == rhs.offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((itr != rhs.itr) || (offset != rhs.offset));
+    }
+
+    /// Normalize
+    __host__ __device__ __forceinline__ void normalize()
+    {
+        itr += offset;
+        offset = 0;
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/cache_modified_input_iterator.cuh b/thrust/cub/iterator/cache_modified_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7a41a5d31a35685f40ea8a199e6b9c224c4850fd
--- /dev/null
+++ b/thrust/cub/iterator/cache_modified_input_iterator.cuh
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
+ *
+ * \par Overview
+ * - CacheModifiedInputIterator is a random-access input iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by reading \p ValueType values through loads modified by \p MODIFIER.
+ * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
+ *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedInputIterator to
+ * dereference a device array of double using the "ldg" PTX load modifier
+ * (i.e., load values through texture cache).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 8.0
+ * printf("%f\n", itr[1]);  // 6.0
+ * printf("%f\n", itr[6]);  // 9.0
+ *
+ * \endcode
+ *
+ * \tparam CacheLoadModifier    The cub::CacheLoadModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedInputIterator          self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+
+public:
+
+    /// Wrapped native pointer
+    ValueType* ptr;
+
+    /// Constructor
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection
+    __device__ __forceinline__ reference operator*() const
+    {
+        return ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return ThreadLoad<MODIFIER>(ptr + n);
+    }
+
+    /// Structure dereference
+    __device__ __forceinline__ pointer operator->()
+    {
+        return &ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/cache_modified_output_iterator.cuh b/thrust/cub/iterator/cache_modified_output_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e1697013c4ccdccea10683a17954932238e4b1b5
--- /dev/null
+++ b/thrust/cub/iterator/cache_modified_output_iterator.cuh
@@ -0,0 +1,254 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
+ *
+ * \par Overview
+ * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by writing \p ValueType values through stores modified by \p MODIFIER.
+ * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
+ *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
+ * dereference a device array of doubles using the "wt" PTX load modifier
+ * (i.e., write-through to system memory).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_out;              // e.g., [, , , , , , ]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
+ *
+ * // Within device code:
+ * itr[0]  = 8.0;
+ * itr[1]  = 66.0;
+ * itr[55] = 24.0;
+ *
+ * \endcode
+ *
+ * \par Usage Considerations
+ * - Can only be dereferenced within device code
+ *
+ * \tparam CacheStoreModifier     The cub::CacheStoreModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedOutputIterator
+{
+private:
+
+    // Proxy object
+    struct Reference
+    {
+        ValueType* ptr;
+
+        /// Constructor
+        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
+
+        /// Assignment
+        __device__ __forceinline__ ValueType operator =(ValueType val)
+        {
+            ThreadStore<MODIFIER>(ptr, val);
+            return val;
+        }
+    };
+
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                                value_type;             ///< The type of the element the iterator can point to
+    typedef void                                pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType* ptr;
+
+public:
+
+    /// Constructor
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return Reference(ptr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return Reference(ptr + n);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/constant_input_iterator.cuh b/thrust/cub/iterator/constant_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..44fb56c920b79afb470a222ffbb37de32115785e
--- /dev/null
+++ b/thrust/cub/iterator/constant_input_iterator.cuh
@@ -0,0 +1,235 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of homogeneous values
+ *
+ * \par Overview
+ * - Read references to a ConstantInputIteratorTiterator always return the supplied constant
+ *   of type \p ValueType.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ConstantInputIteratorTto
+ * dereference a sequence of homogeneous doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
+ *
+ * cub::ConstantInputIterator<double> itr(5.0);
+ *
+ * printf("%f\n", itr[0]);      // 5.0
+ * printf("%f\n", itr[1]);      // 5.0
+ * printf("%f\n", itr[2]);      // 5.0
+ * printf("%f\n", itr[50]);     // 5.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class ConstantInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ConstantInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType   val;
+    OffsetT     offset;
+#ifdef _WIN32
+    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ConstantInputIterator(
+        ValueType   val,            ///< Starting value for the iterator instance to report
+        OffsetT     offset = 0)     ///< Base offset
+    :
+        val(val),
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
+    {
+        return val;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (offset == rhs.offset) && ((val == rhs.val));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (offset != rhs.offset) || (val!= rhs.val);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "," << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/counting_input_iterator.cuh b/thrust/cub/iterator/counting_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c7167a70666bbd3c5ae195b4c80aaea1547a6c9a
--- /dev/null
+++ b/thrust/cub/iterator/counting_input_iterator.cuh
@@ -0,0 +1,228 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
+ *
+ * \par Overview
+ * - After initializing a CountingInputIteratorTto a certain integer \p base, read references
+ *   at \p offset will return the value \p base + \p offset.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CountingInputIteratorTto
+ * dereference a sequence of incrementing integers.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
+ *
+ * cub::CountingInputIterator<int> itr(5);
+ *
+ * printf("%d\n", itr[0]);      // 5
+ * printf("%d\n", itr[1]);      // 6
+ * printf("%d\n", itr[2]);      // 7
+ * printf("%d\n", itr[50]);     // 55
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class CountingInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CountingInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType val;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ CountingInputIterator(
+        const ValueType &val)          ///< Starting value for the iterator instance to report
+    :
+        val(val)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        val++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        val++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val + (ValueType) n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        val += (ValueType) n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val - (ValueType) n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        val -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return (difference_type) (val - other.val);
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return val + (ValueType) n;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (val == rhs.val);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (val != rhs.val);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "]";
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/discard_output_iterator.cuh b/thrust/cub/iterator/discard_output_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e665c784e9ee07b6da7ce61fc8151451edf41b09
--- /dev/null
+++ b/thrust/cub/iterator/discard_output_iterator.cuh
@@ -0,0 +1,219 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A discard iterator
+ */
+template <typename OffsetT = ptrdiff_t>
+class DiscardOutputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef DiscardOutputIterator   self_type;              ///< My own type
+    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                    value_type;             ///< The type of the element the iterator can point to
+    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    OffsetT offset;
+
+#if defined(_WIN32) || !defined(_WIN64)
+    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
+    OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ DiscardOutputIterator(
+        OffsetT offset = 0)     ///< Base offset
+    :
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ self_type& operator*()
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator[](Distance n)
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return;
+    }
+
+    /// Assignment to self (no-op)
+    __host__ __device__ __forceinline__ void operator=(self_type const& other)
+    {
+        offset = other.offset;
+    }
+
+    /// Assignment to anything else (no-op)
+    template<typename T>
+    __host__ __device__ __forceinline__ void operator=(T const&)
+    {}
+
+    /// Cast to void* operator
+    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (offset == rhs.offset);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (offset != rhs.offset);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/tex_obj_input_iterator.cuh b/thrust/cub/iterator/tex_obj_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2bd3a607e1bd718d3507bdde4988d122729dba6b
--- /dev/null
+++ b/thrust/cub/iterator/tex_obj_input_iterator.cuh
@@ -0,0 +1,318 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
+ *
+ * \par Overview
+ * - TexObjInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be
+ *   created by the host thread, but can be used by any descendant kernel.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexRefInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexObjInputIterator<double> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    typename    OffsetT = ptrdiff_t>
+class TexObjInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexObjInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    // Largest texture word we can use in device
+    typedef typename UnitWord<T>::TextureWord TextureWord;
+
+    // Number of texture words per T
+    enum {
+        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+    };
+
+private:
+
+    T*                  ptr;
+    difference_type     tex_offset;
+    cudaTextureObject_t tex_obj;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TexObjInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0),
+        tex_obj(0)
+    {}
+
+    /// Use this iterator to bind \p ptr with a texture reference
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes = size_t(-1),         ///< Number of bytes in the range
+        size_t          tex_offset = 0)     ///< OffsetT (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        this->tex_offset = tex_offset;
+
+        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
+        cudaResourceDesc        res_desc;
+        cudaTextureDesc         tex_desc;
+        memset(&res_desc, 0, sizeof(cudaResourceDesc));
+        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+        res_desc.resType                = cudaResourceTypeLinear;
+        res_desc.res.linear.devPtr      = this->ptr;
+        res_desc.res.linear.desc        = channel_desc;
+        res_desc.res.linear.sizeInBytes = bytes;
+        tex_desc.readMode               = cudaReadModeElementType;
+        return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return CubDebug(cudaDestroyTextureObject(tex_obj));
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                // Simply dereference the pointer on the host
+                return ptr[tex_offset];
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Move array of uninitialized words, then alias and assign to return value
+                TextureWord words[TEXTURE_MULTIPLE];
+
+                #pragma unroll
+                for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+                {
+                    words[i] = tex1Dfetch<TextureWord>(
+                        tex_obj,
+                        (tex_offset * TEXTURE_MULTIPLE) + i);
+                }
+
+                // Load from words
+                return *reinterpret_cast<T*>(words);
+            #else
+                // This is dead code which will never be executed.  It is here
+                // only to avoid warnings about missing return statements.
+                return ptr[tex_offset];
+            #endif
+        }
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/iterator/tex_ref_input_iterator.cuh b/thrust/cub/iterator/tex_ref_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..630882724fc508a8ad6590376b3fa56ed11def6a
--- /dev/null
+++ b/thrust/cub/iterator/tex_ref_input_iterator.cuh
@@ -0,0 +1,379 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../config.cuh"
+
+#if (CUDART_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
+
+#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Static file-scope Tesla/Fermi-style texture references
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Anonymous namespace
+namespace {
+
+/// Global texture reference specialized by type
+template <typename T>
+struct IteratorTexRef
+{
+    /// And by unique ID
+    template <int UNIQUE_ID>
+    struct TexId
+    {
+        // Largest texture word we can use in device
+        typedef typename UnitWord<T>::DeviceWord DeviceWord;
+        typedef typename UnitWord<T>::TextureWord TextureWord;
+
+        // Number of texture words per T
+        enum {
+            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
+            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+        };
+
+        // Texture reference type
+        typedef texture<TextureWord> TexRef;
+
+        // Texture reference
+        static TexRef ref;
+
+        /// Bind texture
+        static cudaError_t BindTexture(void *d_in, size_t &offset)
+        {
+            if (d_in)
+            {
+                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
+                ref.channelDesc = tex_desc;
+                return (CubDebug(cudaBindTexture(&offset, ref, d_in)));
+            }
+
+            return cudaSuccess;
+        }
+
+        /// Unbind texture
+        static cudaError_t UnbindTexture()
+        {
+            return CubDebug(cudaUnbindTexture(ref));
+        }
+
+        /// Fetch element
+        template <typename Distance>
+        static __device__ __forceinline__ T Fetch(Distance tex_offset)
+        {
+            DeviceWord temp[DEVICE_MULTIPLE];
+            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
+
+            #pragma unroll
+            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+            {
+                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
+            }
+
+            return reinterpret_cast<T&>(temp);
+        }
+    };
+};
+
+// Texture reference definitions
+template <typename  T>
+template <int       UNIQUE_ID>
+typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
+
+
+} // Anonymous namespace
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
+ *
+ * \par Overview
+ * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
+ *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
+ *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
+ *   thread, and (4) compilation .o unit.
+ * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
+ *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
+ *   from the host).
+ * - Compatible with Thrust API v1.7 or newer.
+ * - Compatible with CUDA toolkit v5.5 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexRefInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexRefInputIterator<double, __LINE__> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    int         UNIQUE_ID,
+    typename    OffsetT = ptrdiff_t>
+class TexRefInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexRefInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    T*              ptr;
+    difference_type tex_offset;
+
+    // Texture reference wrapper (old Tesla/Fermi-style textures)
+    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
+
+public:
+/*
+    /// Constructor
+    __host__ __device__ __forceinline__ TexRefInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0)
+    {}
+*/
+    /// Use this iterator to bind \p ptr with a texture reference
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          /*bytes*/ = size_t(-1), ///< Number of bytes in the range
+        size_t          tex_offset = 0)         ///< OffsetT (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        size_t offset;
+        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset);
+        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
+        return retval;
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return TexId::UnbindTexture();
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        if (CUB_IS_HOST_CODE) {
+            // Simply dereference the pointer on the host
+            return ptr[tex_offset];
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Use the texture reference
+                return TexId::Fetch(tex_offset);
+            #else
+                // This is dead code that will never be executed.  It is here
+                // only to avoid warnings about missing returns.
+                return ptr[tex_offset];
+            #endif
+        }
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+#endif // CUDART_VERSION
diff --git a/thrust/cub/iterator/transform_input_iterator.cuh b/thrust/cub/iterator/transform_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bce8b817d731fac5549b10aa791fdaf93170a193
--- /dev/null
+++ b/thrust/cub/iterator/transform_input_iterator.cuh
@@ -0,0 +1,252 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for transforming dereferenced values.
+ *
+ * \par Overview
+ * - TransformInputIteratorTwraps a unary conversion functor of type \p
+ *   ConversionOp and a random-access input iterator of type <tt>InputIteratorT</tt>,
+ *   using the former to produce references of type \p ValueType from the latter.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TransformInputIteratorTto
+ * dereference an array of integers, tripling the values and converting them to doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
+ *
+ * // Functor for tripling integer values and converting to doubles
+ * struct TripleDoubler
+ * {
+ *     __host__ __device__ __forceinline__
+ *     double operator()(const int &a) const {
+ *         return double(a * 3);
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize a device array
+ * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * TripleDoubler conversion_op;
+ *
+ * // Create an iterator wrapper
+ * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 24.0
+ * printf("%f\n", itr[1]);  // 18.0
+ * printf("%f\n", itr[6]);  // 27.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
+ * \tparam InputIteratorT       The type of the wrapped input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ *
+ */
+template <
+    typename ValueType,
+    typename ConversionOp,
+    typename InputIteratorT,
+    typename OffsetT = ptrdiff_t>
+class TransformInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TransformInputIterator              self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ConversionOp    conversion_op;
+    InputIteratorT  input_itr;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TransformInputIterator(
+        InputIteratorT      input_itr,          ///< Input iterator to wrap
+        ConversionOp        conversion_op)      ///< Conversion functor to wrap
+    :
+        conversion_op(conversion_op),
+        input_itr(input_itr)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        input_itr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        input_itr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return conversion_op(*input_itr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(input_itr + n, conversion_op);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        input_itr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(input_itr - n, conversion_op);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        input_itr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return input_itr - other.input_itr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return conversion_op(input_itr[n]);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &conversion_op(*input_itr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (input_itr == rhs.input_itr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (input_itr != rhs.input_itr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_load.cuh b/thrust/cub/thread/thread_load.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..31e759602feccfed35b6d613bbcde3683e5fd271
--- /dev/null
+++ b/thrust/cub/thread/thread_load.cuh
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for reading memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory load operations.
+ */
+enum CacheLoadModifier
+{
+    LOAD_DEFAULT,       ///< Default (no modifier)
+    LOAD_CA,            ///< Cache at all levels
+    LOAD_CG,            ///< Cache at global level
+    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
+    LOAD_CV,            ///< Cache as volatile (including cached system lines)
+    LOAD_LDG,           ///< Cache as texture
+    LOAD_VOLATILE,      ///< Volatile (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
+ *
+ * // 32-bit load using cache-global modifier:
+ * int *d_in;
+ * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
+ *
+ * // 16-bit load using default modifier
+ * short *d_in;
+ * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
+ *
+ * // 256-bit load using cache-volatile modifier
+ * double4 *d_in;
+ * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
+ *
+ * // 96-bit load using cache-streaming modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
+ * \endcode
+ *
+ * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
+ * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated load iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadLoad
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
+    {
+        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
+        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
+    }
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
+    {
+        vals[COUNT] = itr[COUNT];
+        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
+    }
+};
+
+
+/// Helper structure for templated load iteration (termination case)
+template <int MAX>
+struct IterateThreadLoad<MAX, MAX>
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
+    {                                                                                       \
+        uint4 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y),                                                                 \
+            "=r"(retval.z),                                                                 \
+            "=r"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
+    {                                                                                       \
+        ulonglong2 retval;                                                                  \
+        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
+            "=l"(retval.x),                                                                 \
+            "=l"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
+    {                                                                                       \
+        ushort4 retval;                                                                     \
+        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
+            "=h"(retval.x),                                                                 \
+            "=h"(retval.y),                                                                 \
+            "=h"(retval.z),                                                                 \
+            "=h"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
+    {                                                                                       \
+        uint2 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
+    {                                                                                       \
+        unsigned long long retval;                                                          \
+        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
+            "=l"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
+    {                                                                                       \
+        unsigned int retval;                                                                \
+        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
+            "=r"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
+        "    cvt.u16.u8 %0, datum;"                                                         \
+        "}" :                                                                               \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return (unsigned char) retval;                                                      \
+    }
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
+ */
+#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
+    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
+    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
+ */
+#if CUB_PTX_ARCH >= 200
+    _CUB_LOAD_ALL(LOAD_CA, ca)
+    _CUB_LOAD_ALL(LOAD_CG, cg)
+    _CUB_LOAD_ALL(LOAD_CS, cs)
+    _CUB_LOAD_ALL(LOAD_CV, cv)
+#else
+    _CUB_LOAD_ALL(LOAD_CA, global)
+    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
+    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
+    _CUB_LOAD_ALL(LOAD_CS, global)
+    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
+#endif
+
+#if CUB_PTX_ARCH >= 350
+    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
+#else
+    _CUB_LOAD_ALL(LOAD_LDG, global)
+#endif
+
+
+// Macro cleanup
+#undef _CUB_LOAD_ALL
+#undef _CUB_LOAD_1
+#undef _CUB_LOAD_2
+#undef _CUB_LOAD_4
+#undef _CUB_LOAD_8
+#undef _CUB_LOAD_16
+
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
+ */
+template <typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
+    InputIteratorT          itr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<false>         /*is_pointer*/)
+{
+    return *itr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    return *ptr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<true>          /*is_primitive*/)
+{
+    T retval = *reinterpret_cast<volatile T*>(ptr);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<false>         /*is_primitive*/)
+{
+    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
+
+    T retval;
+    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
+    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_VOLATILE> /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers on pointer types
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ T ThreadLoad(
+    T const                 *ptr,
+    Int2Type<MODIFIER>      /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
+
+    DeviceWord words[DEVICE_MULTIPLE];
+
+    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
+        words);
+
+    return *reinterpret_cast<T*>(words);
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoad(
+        itr,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<InputIteratorT>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_operators.cuh b/thrust/cub/thread/thread_operators.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6a3192bca3ac0e1526830847eb0ead3c67ebe730
--- /dev/null
+++ b/thrust/cub/thread/thread_operators.cuh
@@ -0,0 +1,316 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple binary operator functor types
+ */
+
+/******************************************************************************
+ * Simple functor operators
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \brief Default equality functor
+ */
+struct Equality
+{
+    /// Boolean equality operator, returns <tt>(a == b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a == b;
+    }
+};
+
+
+/**
+ * \brief Default inequality functor
+ */
+struct Inequality
+{
+    /// Boolean inequality operator, returns <tt>(a != b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a != b;
+    }
+};
+
+
+/**
+ * \brief Inequality functor (wraps equality functor)
+ */
+template <typename EqualityOp>
+struct InequalityWrapper
+{
+    /// Wrapped equality operator
+    EqualityOp op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    InequalityWrapper(EqualityOp op) : op(op) {}
+
+    /// Boolean inequality operator, returns <tt>(a != b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
+    {
+        return !op(a, b);
+    }
+};
+
+
+/**
+ * \brief Default sum functor
+ */
+struct Sum
+{
+    /// Boolean sum operator, returns <tt>a + b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return a + b;
+    }
+};
+
+
+/**
+ * \brief Default max functor
+ */
+struct Max
+{
+    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
+ */
+struct ArgMax
+{
+    /// Boolean max operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default min functor
+ */
+struct Min
+{
+    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+ */
+struct ArgMin
+{
+    /// Boolean min operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default cast functor
+ */
+template <typename B>
+struct CastOp
+{
+    /// Cast operator, returns <tt>(B) a</tt>
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+/**
+ * \brief Binary operator wrapper for switching non-commutative scan arguments
+ */
+template <typename ScanOp>
+class SwizzleScanOp
+{
+private:
+
+    /// Wrapped scan operator
+    ScanOp scan_op;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+    /// Switch the scan arguments
+    template <typename T>
+    __host__ __device__ __forceinline__
+    T operator()(const T &a, const T &b)
+    {
+      T _a(a);
+      T _b(b);
+
+      return scan_op(_b, _a);
+    }
+};
+
+
+/**
+ * \brief Reduce-by-segment functor.
+ *
+ * Given two cub::KeyValuePair inputs \p a and \p b and a
+ * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
+ * an instance of this functor returns a cub::KeyValuePair whose \p key
+ * field is <tt>a.key</tt> + <tt>b.key</tt>, and whose \p value field
+ * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
+ *
+ * ReduceBySegmentOp is an associative, non-commutative binary combining operator
+ * for input sequences of cub::KeyValuePair pairings.  Such
+ * sequences are typically used to represent a segmented set of values to be reduced
+ * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
+ * first value of each segment.
+ *
+ */
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceBySegmentOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePairT>       ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,         ///< First partial reduction
+        const KeyValuePairT &second)        ///< Second partial reduction
+    {
+        KeyValuePairT retval;
+        retval.key = first.key + second.key;
+        retval.value = (second.key) ?
+                second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
+                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+        return retval;
+    }
+};
+
+
+
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceByKeyOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceByKeyOp() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePairT>
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,       ///< First partial reduction
+        const KeyValuePairT &second)      ///< Second partial reduction
+    {
+        KeyValuePairT retval = second;
+
+        if (first.key == second.key)
+            retval.value = op(first.value, retval.value);
+
+        return retval;
+    }
+};
+
+
+
+
+
+
+
+/** @} */       // end group UtilModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_reduce.cuh b/thrust/cub/thread/thread_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..41063f971471a8cce76c951b013e3fd6a261299d
--- /dev/null
+++ b/thrust/cub/thread/thread_reduce.cuh
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential reduction over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+/**
+ * Sequential reduction over statically-sized array types
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*                  input,                  ///< [in] Input array
+    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
+    T                   prefix,                 ///< [in] Prefix to seed reduction with
+    Int2Type<LENGTH>    /*length*/)
+{
+    T retval = prefix;
+
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+        retval = reduction_op(retval, input[i]);
+
+    return retval;
+}
+
+
+/**
+ * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+}
+
+
+/**
+ * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    T prefix = input[0];
+    return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
+}
+
+
+/**
+ * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+}
+
+
+/**
+ * \brief Serial reduction with the specified operator
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    return ThreadReduce<LENGTH>((T*) input, reduction_op);
+}
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_scan.cuh b/thrust/cub/thread/thread_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fd907fcae104aa0bcfd84f24b1583b6143e40dcd
--- /dev/null
+++ b/thrust/cub/thread/thread_scan.cuh
@@ -0,0 +1,268 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential prefix scan over statically-sized array types
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../thread/thread_operators.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \name Sequential prefix scan over statically-sized array types
+ * @{
+ */
+
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T                   inclusive,
+    T                   exclusive,
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)
+{
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(exclusive, input[i]);
+        output[i] = exclusive;
+        exclusive = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+
+/**
+ * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    T inclusive = input[0];
+    if (apply_prefix)
+    {
+        inclusive = scan_op(prefix, inclusive);
+    }
+    output[0] = prefix;
+    T exclusive = inclusive;
+
+    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+}
+
+
+
+
+
+
+
+
+
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T                   inclusive,
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)
+{
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(inclusive, input[i]);
+        output[i] = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    T inclusive = input[0];
+    output[0] = inclusive;
+
+    // Continue scan
+    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    T inclusive = input[0];
+    if (apply_prefix)
+    {
+        inclusive = scan_op(prefix, inclusive);
+    }
+    output[0] = inclusive;
+
+    // Continue scan
+    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+}
+
+
+//@}  end member group
+
+/** @} */       // end group UtilModule
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_search.cuh b/thrust/cub/thread/thread_search.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..96b9e65a56eafb4381a51cac139e47b515ed70df
--- /dev/null
+++ b/thrust/cub/thread/thread_search.cuh
@@ -0,0 +1,156 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential search
+ */
+
+#pragma once
+
+#include <iterator>
+#include "../util_namespace.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Computes the begin offsets into A and B for the specific diagonal
+ */
+template <
+    typename AIteratorT,
+    typename BIteratorT,
+    typename OffsetT,
+    typename CoordinateT>
+__host__ __device__ __forceinline__ void MergePathSearch(
+    OffsetT         diagonal,
+    AIteratorT      a,
+    BIteratorT      b,
+    OffsetT         a_len,
+    OffsetT         b_len,
+    CoordinateT&    path_coordinate)
+{
+    /// The value type of the input iterator
+    typedef typename std::iterator_traits<AIteratorT>::value_type T;
+
+    OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
+    OffsetT split_max = CUB_MIN(diagonal, a_len);
+
+    while (split_min < split_max)
+    {
+        OffsetT split_pivot = (split_min + split_max) >> 1;
+        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
+        {
+            // Move candidate split range up A, down B
+            split_min = split_pivot + 1;
+        }
+        else
+        {
+            // Move candidate split range up B, down A
+            split_max = split_pivot;
+        }
+    }
+
+    path_coordinate.x = CUB_MIN(split_min, a_len);
+    path_coordinate.y = diagonal - split_min;
+}
+
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which does not compare less than \p val
+ */
+template <
+    typename InputIteratorT,
+    typename OffsetT,
+    typename T>
+__device__ __forceinline__ OffsetT LowerBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    OffsetT retval = 0;
+    while (num_items > 0)
+    {
+        OffsetT half = num_items >> 1;
+        if (input[retval + half] < val)
+        {
+            retval = retval + (half + 1);
+            num_items = num_items - (half + 1);
+        }
+        else
+        {
+            num_items = half;
+        }
+    }
+
+    return retval;
+}
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which compares greater than \p val
+ */
+template <
+    typename InputIteratorT,
+    typename OffsetT,
+    typename T>
+__device__ __forceinline__ OffsetT UpperBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    OffsetT retval = 0;
+    while (num_items > 0)
+    {
+        OffsetT half = num_items >> 1;
+        if (val < input[retval + half])
+        {
+            num_items = half;
+        }
+        else
+        {
+            retval = retval + (half + 1);
+            num_items = num_items - (half + 1);
+        }
+    }
+
+    return retval;
+}
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/thread/thread_store.cuh b/thrust/cub/thread/thread_store.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..47d6c6145fecfd51ebc762a8fdbb100643af9fc5
--- /dev/null
+++ b/thrust/cub/thread/thread_store.cuh
@@ -0,0 +1,420 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for writing memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory store operations.
+ */
+enum CacheStoreModifier
+{
+    STORE_DEFAULT,              ///< Default (no modifier)
+    STORE_WB,                   ///< Cache write-back all coherent levels
+    STORE_CG,                   ///< Cache at global level
+    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
+    STORE_WT,                   ///< Cache write-through (to system memory)
+    STORE_VOLATILE,             ///< Volatile shared (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
+ *
+ * // 32-bit store using cache-global modifier:
+ * int *d_out;
+ * int val;
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ *
+ * // 16-bit store using default modifier
+ * short *d_out;
+ * short val;
+ * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
+ *
+ * // 256-bit store using write-through modifier
+ * double4 *d_out;
+ * double4 val;
+ * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
+ *
+ * // 96-bit store using cache-streaming cache modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val;
+ * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
+ * \endcode
+ *
+ * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
+ * \tparam InputIteratorT       <b>[inferred]</b> Output iterator type \iterator
+ * \tparam T                    <b>[inferred]</b> Data type of output value
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            OutputIteratorT,
+    typename            T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated store iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadStore
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals)
+    {
+        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
+        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
+    }
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
+    {
+        ptr[COUNT] = vals[COUNT];
+        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
+    }
+
+};
+
+/// Helper structure for templated store iteration (termination case)
+template <int MAX>
+struct IterateThreadStore<MAX, MAX>
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y),                                                                     \
+            "r"(val.z),                                                                     \
+            "r"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val.x),                                                                     \
+            "l"(val.y));                                                                    \
+    }
+
+
+/**
+ * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val.x),                                                                     \
+            "h"(val.y),                                                                     \
+            "h"(val.z),                                                                     \
+            "h"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val));                                                                      \
+    }
+
+/**
+ * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val));                                                                      \
+    }
+
+
+/**
+ * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val));                                                                      \
+    }
+
+
+/**
+ * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
+    {                                                                                       \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "   cvt.u8.u16 datum, %1;"                                                          \
+        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
+        "}" : :                                                                             \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"((unsigned short) val));                                                               \
+    }
+
+/**
+ * Define powers-of-two ThreadStore specializations for the given Cache load modifier
+ */
+#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
+    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
+    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
+
+
+/**
+ * Define ThreadStore specializations for the various Cache load modifiers
+ */
+#if CUB_PTX_ARCH >= 200
+    _CUB_STORE_ALL(STORE_WB, wb)
+    _CUB_STORE_ALL(STORE_CG, cg)
+    _CUB_STORE_ALL(STORE_CS, cs)
+    _CUB_STORE_ALL(STORE_WT, wt)
+#else
+    _CUB_STORE_ALL(STORE_WB, global)
+    _CUB_STORE_ALL(STORE_CG, global)
+    _CUB_STORE_ALL(STORE_CS, global)
+    _CUB_STORE_ALL(STORE_WT, volatile.global)
+#endif
+
+
+// Macro cleanup
+#undef _CUB_STORE_ALL
+#undef _CUB_STORE_1
+#undef _CUB_STORE_2
+#undef _CUB_STORE_4
+#undef _CUB_STORE_8
+#undef _CUB_STORE_16
+
+
+/**
+ * ThreadStore definition for STORE_DEFAULT modifier on iterator types
+ */
+template <typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(
+    OutputIteratorT             itr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     /*modifier*/,
+    Int2Type<false>             /*is_pointer*/)
+{
+    *itr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_DEFAULT modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    *ptr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T                           *ptr,
+    T                           val,
+    Int2Type<true>              /*is_primitive*/)
+{
+    *reinterpret_cast<volatile T*>(ptr) = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T                           *ptr,
+    T                           val,
+    Int2Type<false>             /*is_primitive*/)
+{
+    // Create a temporary using shuffle-words, then store using volatile-words
+    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
+    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
+
+    VolatileWord words[VOLATILE_MULTIPLE];
+
+    #pragma unroll
+    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
+
+    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_VOLATILE>    /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers on pointer types
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<MODIFIER>          /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    // Create a temporary using shuffle-words, then store using device-words
+    typedef typename UnitWord<T>::DeviceWord    DeviceWord;
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+
+    const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
+    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
+
+    DeviceWord words[DEVICE_MULTIPLE];
+
+    #pragma unroll
+    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
+
+    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(ptr),
+        words);
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers
+ */
+template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
+{
+    ThreadStore(
+        itr,
+        val,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_allocator.cuh b/thrust/cub/util_allocator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fa03996f0b7a667f7936e864a604eaacdc0f937f
--- /dev/null
+++ b/thrust/cub/util_allocator.cuh
@@ -0,0 +1,709 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include <set>
+#include <map>
+
+#include "host/mutex.cuh"
+#include <math.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and stream-safe and is capable of managing cached
+ * device allocations on multiple devices.  It behaves as follows:
+ *
+ * \par
+ * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
+ *   the allocation becomes available immediately for reuse within the \p active_stream
+ *   with which it was associated with during allocation, and it becomes available for
+ *   reuse within other streams when all prior work submitted to \p active_stream has completed.
+ * - Allocations are categorized and cached by bin size.  A new allocation request of
+ *   a given size will only consider cached allocations within the corresponding bin.
+ * - Bin limits progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction.  Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth          = 8
+ * - \p min_bin             = 3
+ * - \p max_bin             = 7
+ * - \p max_cached_bytes    = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
+ * and sets a maximum of 6,291,455 cached bytes per device
+ *
+ */
+struct CachingDeviceAllocator
+{
+
+    //---------------------------------------------------------------------
+    // Constants
+    //---------------------------------------------------------------------
+
+    /// Out-of-bounds bin
+    static const unsigned int INVALID_BIN = (unsigned int) -1;
+
+    /// Invalid size
+    static const size_t INVALID_SIZE = (size_t) -1;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Invalid device ordinal
+    static const int INVALID_DEVICE_ORDINAL = -1;
+
+    //---------------------------------------------------------------------
+    // Type definitions and helper types
+    //---------------------------------------------------------------------
+
+    /**
+     * Descriptor for device memory allocations
+     */
+    struct BlockDescriptor
+    {
+        void*           d_ptr;              // Device pointer
+        size_t          bytes;              // Size of allocation in bytes
+        unsigned int    bin;                // Bin enumeration
+        int             device;             // device ordinal
+        cudaStream_t    associated_stream;  // Associated associated_stream
+        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
+
+        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
+        BlockDescriptor(void *d_ptr, int device) :
+            d_ptr(d_ptr),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
+        BlockDescriptor(int device) :
+            d_ptr(NULL),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Comparison functor for comparing device pointers
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device == b.device)
+                return (a.d_ptr < b.d_ptr);
+            else
+                return (a.device < b.device);
+        }
+
+        // Comparison functor for comparing allocation sizes
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device == b.device)
+                return (a.bytes < b.bytes);
+            else
+                return (a.device < b.device);
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+    class TotalBytes {
+    public:
+        size_t free;
+        size_t live;
+        TotalBytes() { free = live = 0; }
+    };
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, TotalBytes> GpuCachedBytes;
+
+
+    //---------------------------------------------------------------------
+    // Utility functions
+    //---------------------------------------------------------------------
+
+    /**
+     * Integer pow function for unsigned base and exponent
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int retval = 1;
+        while (exp > 0)
+        {
+            if (exp & 1) {
+                retval = retval * base;        // multiply the result by the current base
+            }
+            base = base * base;                // square the base
+            exp = exp >> 1;                    // divide the exponent in half
+        }
+        return retval;
+    }
+
+
+    /**
+     * Round up to the nearest power-of
+     */
+    void NearestPowerOf(
+        unsigned int    &power,
+        size_t          &rounded_bytes,
+        unsigned int    base,
+        size_t          value)
+    {
+        power = 0;
+        rounded_bytes = 1;
+
+        if (value * base < value)
+        {
+            // Overflow
+            power = sizeof(size_t) * 8;
+            rounded_bytes = size_t(0) - 1;
+            return;
+        }
+
+        while (rounded_bytes < value)
+        {
+            rounded_bytes *= base;
+            power++;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    cub::Mutex      mutex;              /// Mutex for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     */
+    CachingDeviceAllocator(
+        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
+        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
+        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
+        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
+        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
+        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
+    :
+        bin_growth(bin_growth),
+        min_bin(min_bin),
+        max_bin(max_bin),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes(max_cached_bytes),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth          = 8
+     * - \p min_bin             = 3
+     * - \p max_bin             = 7
+     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
+     * sets a maximum of 6,291,455 cached bytes per device
+     */
+    CachingDeviceAllocator(
+        bool skip_cleanup = false,
+        bool debug = false)
+    :
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
+     *
+     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
+     * cached-in-reserve) to be freed.  See \p FreeAllCached().
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+        // Lock
+        mutex.Lock();
+
+        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
+
+        this->max_cached_bytes = max_cached_bytes;
+
+        // Unlock
+        mutex.Unlock();
+
+        return cudaSuccess;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceAllocate(
+        int             device,             ///< [in] Device on which to place the allocation
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        *d_ptr                          = NULL;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            device = entrypoint_device;
+        }
+
+        // Create a block descriptor for the requested allocation
+        bool found = false;
+        BlockDescriptor search_key(device);
+        search_key.associated_stream = active_stream;
+        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
+
+        if (search_key.bin > max_bin)
+        {
+            // Bin is greater than our maximum bin: allocate the request
+            // exactly and give out-of-bounds bin.  It will not be cached
+            // for reuse when returned.
+            search_key.bin      = INVALID_BIN;
+            search_key.bytes    = bytes;
+        }
+        else
+        {
+            // Search for a suitable cached allocation: lock
+            mutex.Lock();
+
+            if (search_key.bin < min_bin)
+            {
+                // Bin is less than minimum bin: round up
+                search_key.bin      = min_bin;
+                search_key.bytes    = min_bin_bytes;
+            }
+
+            // Iterate through the range of cached blocks on the same device in the same bin
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            while ((block_itr != cached_blocks.end())
+                    && (block_itr->device == device)
+                    && (block_itr->bin == search_key.bin))
+            {
+                // To prevent races with reusing blocks returned by the host but still
+                // in use by the device, only consider cached blocks that are
+                // either (from the active stream) or (from an idle stream)
+                if ((active_stream == block_itr->associated_stream) ||
+                    (CubDebug(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)))
+                {
+                    // Reuse existing cache block.  Insert into live blocks.
+                    found = true;
+                    search_key = *block_itr;
+                    search_key.associated_stream = active_stream;
+                    live_blocks.insert(search_key);
+
+                    // Remove from free blocks
+                    cached_bytes[device].free -= search_key.bytes;
+                    cached_bytes[device].live += search_key.bytes;
+
+                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
+                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+
+                    cached_blocks.erase(block_itr);
+
+                    break;
+                }
+                block_itr++;
+            }
+
+            // Done searching: unlock
+            mutex.Unlock();
+        }
+
+        // Allocate the block if necessary
+        if (!found)
+        {
+            // Set runtime's current device to specified device (entrypoint may not be set)
+            if (device != entrypoint_device)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+                if (CubDebug(error = cudaSetDevice(device))) return error;
+            }
+
+            // Attempt to allocate
+            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
+            {
+                // The allocation attempt failed: free all cached blocks on device and retry
+                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
+                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+                error = cudaSuccess;    // Reset the error we will return
+                cudaGetLastError();     // Reset CUDART's error
+
+                // Lock
+                mutex.Lock();
+
+                // Iterate the range of free blocks on the same device
+                BlockDescriptor free_key(device);
+                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
+
+                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
+                {
+                    // No need to worry about synchronization with the device: cudaFree is
+                    // blocking and will synchronize across all kernels executing
+                    // on the current device
+
+                    // Free device memory and destroy stream event.
+                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
+                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
+
+                    // Reduce balance and erase entry
+                    cached_bytes[device].free -= block_itr->bytes;
+
+                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+                    cached_blocks.erase(block_itr);
+
+                    block_itr++;
+                }
+
+                // Unlock
+                mutex.Unlock();
+
+                // Return under error
+                if (error) return error;
+
+                // Try to allocate again
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
+            }
+
+            // Create ready event
+            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
+                return error;
+
+            // Insert into live blocks
+            mutex.Lock();
+            live_blocks.insert(search_key);
+            cached_bytes[device].live += search_key.bytes;
+            mutex.Unlock();
+
+            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
+                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+            // Attempt to revert back to previous device if necessary
+            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+            {
+                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+            }
+        }
+
+        // Copy device pointer to output parameter
+        *d_ptr = search_key.d_ptr;
+
+        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+        return error;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceAllocate(
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        int             device,
+        void*           d_ptr)
+    {
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+                return error;
+            device = entrypoint_device;
+        }
+
+        // Lock
+        mutex.Lock();
+
+        // Find corresponding block descriptor
+        bool recached = false;
+        BlockDescriptor search_key(d_ptr, device);
+        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+        if (block_itr != live_blocks.end())
+        {
+            // Remove from live blocks
+            search_key = *block_itr;
+            live_blocks.erase(block_itr);
+            cached_bytes[device].live -= search_key.bytes;
+
+            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
+            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
+            {
+                // Insert returned allocation into free blocks
+                recached = true;
+                cached_blocks.insert(search_key);
+                cached_bytes[device].free += search_key.bytes;
+
+                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+            }
+        }
+
+        // First set to specified device (entrypoint may not be set)
+        if (device != entrypoint_device)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            if (CubDebug(error = cudaSetDevice(device))) return error;
+        }
+
+        if (recached)
+        {
+            // Insert the ready event in the associated stream (must have current device set properly)
+            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
+        }
+
+        // Unlock
+        mutex.Unlock();
+
+        if (!recached)
+        {
+            // Free the allocation from the runtime and cleanup the event.
+            if (CubDebug(error = cudaFree(d_ptr))) return error;
+            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+        }
+
+        // Reset device
+        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        void*           d_ptr)
+    {
+        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices
+     */
+    cudaError_t FreeAllCached()
+    {
+        cudaError_t error         = cudaSuccess;
+        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
+        int current_device        = INVALID_DEVICE_ORDINAL;
+
+        mutex.Lock();
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device].free -= begin->bytes;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
+
+            cached_blocks.erase(begin);
+        }
+
+        mutex.Unlock();
+
+        // Attempt to revert back to entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Destructor
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (!skip_cleanup)
+            FreeAllCached();
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_arch.cuh b/thrust/cub/util_arch.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..58d0c738819e29126e8f77bd51500aca228fcca9
--- /dev/null
+++ b/thrust/cub/util_arch.cuh
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static architectural properties by SM version.
+ */
+
+#pragma once
+
+#include "util_cpp_dialect.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
+        !defined(CUB_USE_COOPERATIVE_GROUPS)
+    #define CUB_USE_COOPERATIVE_GROUPS
+#endif
+
+/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
+/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
+#ifndef CUB_PTX_ARCH
+    #if defined(__NVCOMPILER_CUDA__)
+        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
+        // when compiling both host code and device code. Currently, only one
+        // PTX version can be targeted.
+        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
+    #elif !defined(__CUDA_ARCH__)
+        #define CUB_PTX_ARCH 0
+    #else
+        #define CUB_PTX_ARCH __CUDA_ARCH__
+    #endif
+#endif
+
+#ifndef CUB_IS_DEVICE_CODE
+    #if defined(__NVCOMPILER_CUDA__)
+        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
+        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 1
+    #elif CUB_PTX_ARCH > 0
+        #define CUB_IS_DEVICE_CODE 1
+        #define CUB_IS_HOST_CODE 0
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 0
+    #else
+        #define CUB_IS_DEVICE_CODE 0
+        #define CUB_IS_HOST_CODE 1
+        #define CUB_INCLUDE_DEVICE_CODE 0
+        #define CUB_INCLUDE_HOST_CODE 1
+    #endif
+#endif
+
+/// Maximum number of devices supported.
+#ifndef CUB_MAX_DEVICES
+    #define CUB_MAX_DEVICES 128
+#endif
+
+#if CUB_CPP_DIALECT >= 2011
+    static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
+#endif
+
+/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
+#ifndef CUB_RUNTIME_FUNCTION
+    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
+        #define CUB_RUNTIME_ENABLED
+        #define CUB_RUNTIME_FUNCTION __host__ __device__
+    #else
+        #define CUB_RUNTIME_FUNCTION __host__
+    #endif
+#endif
+
+
+/// Number of threads per warp
+#ifndef CUB_LOG_WARP_THREADS
+    #define CUB_LOG_WARP_THREADS(arch)                      \
+        (5)
+    #define CUB_WARP_THREADS(arch)                          \
+        (1 << CUB_LOG_WARP_THREADS(arch))
+
+    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
+    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
+#endif
+
+
+/// Number of smem banks
+#ifndef CUB_LOG_SMEM_BANKS
+    #define CUB_LOG_SMEM_BANKS(arch)                        \
+        ((arch >= 200) ?                                    \
+            (5) :                                           \
+            (4))
+    #define CUB_SMEM_BANKS(arch)                            \
+        (1 << CUB_LOG_SMEM_BANKS(arch))
+
+    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
+    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
+#endif
+
+
+/// Oversubscription factor
+#ifndef CUB_SUBSCRIPTION_FACTOR
+    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
+        ((arch >= 300) ?                                    \
+            (5) :                                           \
+            ((arch >= 200) ?                                \
+                (3) :                                       \
+                (10)))
+    #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
+#endif
+
+
+/// Prefer padding overhead vs X-way conflicts greater than this threshold
+#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
+    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
+        ((arch >= 300) ?                                    \
+            (1) :                                           \
+            (4))
+    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
+#endif
+
+
+template <
+    int NOMINAL_4B_BLOCK_THREADS,
+    int NOMINAL_4B_ITEMS_PER_THREAD,
+    typename T>
+struct RegBoundScaling
+{
+    enum {
+        ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
+        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
+    };
+};
+
+
+template <
+    int NOMINAL_4B_BLOCK_THREADS,
+    int NOMINAL_4B_ITEMS_PER_THREAD,
+    typename T>
+struct MemBoundScaling
+{
+    enum {
+        ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
+        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
+    };
+};
+
+
+
+
+#endif  // Do not document
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_compiler.cuh b/thrust/cub/util_compiler.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9be94922a50619655af4d5f0092f74e346597607
--- /dev/null
+++ b/thrust/cub/util_compiler.cuh
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Detect compiler information.
+ */
+
+#pragma once
+
+// enumerate host compilers we know about
+#define CUB_HOST_COMPILER_UNKNOWN 0
+#define CUB_HOST_COMPILER_MSVC 1
+#define CUB_HOST_COMPILER_GCC 2
+#define CUB_HOST_COMPILER_CLANG 3
+
+// enumerate device compilers we know about
+#define CUB_DEVICE_COMPILER_UNKNOWN 0
+#define CUB_DEVICE_COMPILER_MSVC 1
+#define CUB_DEVICE_COMPILER_GCC 2
+#define CUB_DEVICE_COMPILER_NVCC 3
+#define CUB_DEVICE_COMPILER_CLANG 4
+
+// figure out which host compiler we're using
+#if defined(_MSC_VER)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC
+#  define CUB_MSVC_VERSION _MSC_VER
+#  define CUB_MSVC_VERSION_FULL _MSC_FULL_VER
+#elif defined(__clang__)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG
+#  define CUB_CLANG_VERSION                                                    \
+    (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
+#elif defined(__GNUC__)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC
+#  define CUB_GCC_VERSION                                                      \
+    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN
+#endif // CUB_HOST_COMPILER
+
+// figure out which device compiler we're using
+#if defined(__CUDACC__)
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
+// CUDA-capable clang should behave similar to NVCC.
+#  if defined(__CUDA__)
+#    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
+#  else
+#    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG
+#  endif
+#else
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN
+#endif
diff --git a/thrust/cub/util_cpp_dialect.cuh b/thrust/cub/util_cpp_dialect.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b4cbe92373ca90ed47ef91a8ea31ae62dd12d6f1
--- /dev/null
+++ b/thrust/cub/util_cpp_dialect.cuh
@@ -0,0 +1,135 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/*! \file
+ *  \brief Detect the version of the C++ standard used by the compiler.
+ */
+
+#pragma once
+
+#include "util_compiler.cuh"
+
+// Deprecation warnings may be silenced by defining the following macros. These
+// may be combined.
+// - CUB_IGNORE_DEPRECATED_CPP_DIALECT:
+//   Ignore all deprecated C++ dialects and outdated compilers.
+// - CUB_IGNORE_DEPRECATED_CPP_11:
+//   Ignore deprecation warnings when compiling with C++11. C++03 and outdated
+//   compilers will still issue warnings.
+// - CUB_IGNORE_DEPRECATED_COMPILER
+//   Ignore deprecation warnings when using deprecated compilers. Compiling
+//   with C++03 and C++11 will still issue warnings.
+
+// Check for the thrust opt-outs as well:
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && \
+     defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
+#  define    CUB_IGNORE_DEPRECATED_CPP_DIALECT
+#endif
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && \
+     defined(THRUST_IGNORE_DEPRECATED_CPP_11)
+#  define    CUB_IGNORE_DEPRECATED_CPP_11
+#endif
+#if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && \
+     defined(THRUST_IGNORE_DEPRECATED_COMPILER)
+#  define    CUB_IGNORE_DEPRECATED_COMPILER
+#endif
+
+#ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT
+#  define CUB_IGNORE_DEPRECATED_CPP_11
+#  define CUB_IGNORE_DEPRECATED_COMPILER
+#endif
+
+// Define this to override the built-in detection.
+#ifndef CUB_CPP_DIALECT
+
+// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
+// This macro is only defined in MSVC 2015U3+.
+#  ifdef _MSVC_LANG // Do not replace with CUB_HOST_COMPILER test (see above)
+// MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11.
+#    if CUB_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */
+#      define CUB_CPLUSPLUS 201103L /* Fix to 2011 */
+#    else
+#      define CUB_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */
+#    endif // MSVC 2015 C++14 fix
+#  else
+#    define CUB_CPLUSPLUS __cplusplus
+#  endif
+
+// Detect current dialect:
+#  if CUB_CPLUSPLUS < 201103L
+#    define CUB_CPP_DIALECT 2003
+#  elif CUB_CPLUSPLUS < 201402L
+#    define CUB_CPP_DIALECT 2011
+#  elif CUB_CPLUSPLUS < 201703L
+#    define CUB_CPP_DIALECT 2014
+#  elif CUB_CPLUSPLUS == 201703L
+#    define CUB_CPP_DIALECT 2017
+#  elif CUB_CPLUSPLUS > 201703L // unknown, but is higher than 2017.
+#    define CUB_CPP_DIALECT 2020
+#  endif
+
+#  undef CUB_CPLUSPLUS // cleanup
+
+#endif // !CUB_CPP_DIALECT
+
+// Define CUB_COMPILER_DEPRECATION macro:
+#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_COMP_DEPR_IMPL(msg) \
+    __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg))
+#  define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x)
+#  define CUB_COMP_DEPR_IMPL1(x) #x
+#else // clang / gcc:
+#  define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg)
+#  define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr)
+#  define CUB_COMP_DEPR_IMPL1 /* intentionally blank */
+#endif
+
+#define CUB_COMPILER_DEPRECATION(REQ, FIX) \
+  CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+// Minimum required compiler checks:
+#ifndef CUB_IGNORE_DEPRECATED_COMPILER
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000
+     CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler);
+#  endif
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000
+     CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler);
+#  endif
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910
+     CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler);
+#  endif
+#endif
+
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && CUB_CPP_DIALECT < 2014 && \
+    (CUB_CPP_DIALECT != 2011 || !defined(CUB_IGNORE_DEPRECATED_CPP_11))
+  CUB_COMPILER_DEPRECATION(C++14, pass -std=c++14 to your compiler);
+#endif
+
+#undef CUB_COMPILER_DEPRECATION
+#undef CUB_COMP_DEPR_IMPL
+#undef CUB_COMP_DEPR_IMPL0
+#undef CUB_COMP_DEPR_IMPL1
diff --git a/thrust/cub/util_debug.cuh b/thrust/cub/util_debug.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8413f7bd4ee476297d6882fbfff860aa39bc4faa
--- /dev/null
+++ b/thrust/cub/util_debug.cuh
@@ -0,0 +1,162 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Error and event logging routines.
+ *
+ * The following macros definitions are supported:
+ * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "util_namespace.cuh"
+#include "util_arch.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/// CUB error reporting macro (prints error messages to stderr)
+#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
+    #define CUB_STDERR
+#endif
+
+
+
+/**
+ * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
+ *
+ * \return The CUDA error.
+ */
+__host__ __device__ __forceinline__ cudaError_t Debug(
+    cudaError_t     error,
+    const char*     filename,
+    int             line)
+{
+    (void)filename;
+    (void)line;
+
+#ifdef CUB_RUNTIME_ENABLED
+    // Clear the global CUDA error state which may have been set by the last
+    // call. Otherwise, errors may "leak" to unrelated kernel launches.
+    cudaGetLastError();
+#endif
+
+#ifdef CUB_STDERR
+    if (error)
+    {
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+                fflush(stderr);
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
+            #endif
+        }
+    }
+#endif
+    return error;
+}
+
+
+/**
+ * \brief Debug macro
+ */
+#ifndef CubDebug
+    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
+#endif
+
+
+/**
+ * \brief Debug macro with exit
+ */
+#ifndef CubDebugExit
+    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
+#endif
+
+
+/**
+ * \brief Log macro for printf statements.
+ */
+#if !defined(_CubLog)
+    #if defined(__NVCOMPILER_CUDA__)
+        #define _CubLog(format, ...) (__builtin_is_device_code() \
+            ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+                     blockIdx.z, blockIdx.y, blockIdx.x, \
+                     threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \
+            : printf(format, __VA_ARGS__));
+    #elif !(defined(__clang__) && defined(__CUDA__))
+        #if (CUB_PTX_ARCH == 0)
+            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
+        #elif (CUB_PTX_ARCH >= 200)
+            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
+        #endif
+    #else
+        // XXX shameless hack for clang around variadic printf...
+        //     Compilies w/o supplying -std=c++11 but shows warning,
+        //     so we sielence them :)
+        #pragma clang diagnostic ignored "-Wc++11-extensions"
+        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
+            template <class... Args>
+            inline __host__ __device__ void va_printf(char const* format, Args const&... args)
+            {
+        #ifdef __CUDA_ARCH__
+              printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
+        #else
+              printf(format, args...);
+        #endif
+            }
+        #ifndef __CUDA_ARCH__
+            #define _CubLog(format, ...) cub::va_printf(format,__VA_ARGS__);
+        #else
+            #define _CubLog(format, ...) cub::va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
+        #endif
+    #endif
+#endif
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_deprecated.cuh b/thrust/cub/util_deprecated.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b2bf4658b54f8cf7ebdc37f8d59356f77b5c30b3
--- /dev/null
+++ b/thrust/cub/util_deprecated.cuh
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Define CUB_DEPRECATED macro.
+ */
+
+#pragma once
+
+#include "util_compiler.cuh"
+
+#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_DEPRECATED __declspec(deprecated)
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
+#  define CUB_DEPRECATED __attribute__((deprecated))
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
+#  define CUB_DEPRECATED __attribute__((deprecated))
+#else
+#  define CUB_DEPRECATED
+#endif
+
diff --git a/thrust/cub/util_device.cuh b/thrust/cub/util_device.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5196f408c6b0c872cd0c6784d0cb273cc04bef2b
--- /dev/null
+++ b/thrust/cub/util_device.cuh
@@ -0,0 +1,715 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Properties of a given CUDA device and the corresponding PTX bundle
+ */
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_cpp_dialect.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+#include <atomic>
+#include <array>
+#include <cassert>
+#endif
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+
+/**
+ * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
+ */
+template <int ALLOCATIONS>
+__host__ __device__ __forceinline__
+cudaError_t AliasTemporaries(
+    void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t& temp_storage_bytes,                ///< [in,out] Size in bytes of \t d_temp_storage allocation
+    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
+    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
+{
+    const int ALIGN_BYTES   = 256;
+    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
+
+    // Compute exclusive prefix sum over allocation requests
+    size_t allocation_offsets[ALLOCATIONS];
+    size_t bytes_needed = 0;
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+        allocation_offsets[i] = bytes_needed;
+        bytes_needed += allocation_bytes;
+    }
+    bytes_needed += ALIGN_BYTES - 1;
+
+    // Check if the caller is simply requesting the size of the storage allocation
+    if (!d_temp_storage)
+    {
+        temp_storage_bytes = bytes_needed;
+        return cudaSuccess;
+    }
+
+    // Check if enough storage provided
+    if (temp_storage_bytes < bytes_needed)
+    {
+        return CubDebug(cudaErrorInvalidValue);
+    }
+
+    // Alias
+    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
+ */
+template <typename T>
+__global__ void EmptyKernel(void) { }
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Returns the current device or -1 if an error occurred.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    int device = -1;
+    if (CubDebug(cudaGetDevice(&device))) return -1;
+    return device;
+
+#else // Device code without the CUDA runtime.
+
+    return -1;
+
+#endif
+}
+
+/**
+ * \brief RAII helper which saves the current device and switches to the
+ *        specified device on construction and switches to the saved device on
+ *        destruction.
+ */
+struct SwitchDevice
+{
+private:
+    int const old_device;
+    bool const needs_reset;
+public:
+    __host__ __forceinline__ SwitchDevice(int new_device)
+      : old_device(CurrentDevice()), needs_reset(old_device != new_device)
+    {
+        if (needs_reset)
+            CubDebug(cudaSetDevice(new_device));
+    }
+
+    __host__ __forceinline__ ~SwitchDevice()
+    {
+        if (needs_reset)
+            CubDebug(cudaSetDevice(old_device));
+    }
+};
+
+/**
+ * \brief Returns the number of CUDA devices available or -1 if an error
+ *        occurred.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached()
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    int count = -1;
+    if (CubDebug(cudaGetDeviceCount(&count)))
+        // CUDA makes no guarantees about the state of the output parameter if
+        // `cudaGetDeviceCount` fails; in practice, they don't, but out of
+        // paranoia we'll reset `count` to `-1`.
+        count = -1;
+    return count;
+
+#else // Device code without the CUDA runtime.
+
+    return -1;
+
+#endif
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+/**
+ * \brief Cache for an arbitrary value produced by a nullary function.
+ */
+template <typename T, T(*Function)()>
+struct ValueCache
+{
+    T const value;
+
+    /**
+     * \brief Call the nullary function to produce the value and construct the
+     *        cache.
+     */
+    __host__ __forceinline__ ValueCache() : value(Function()) {}
+};
+
+#endif
+
+#if CUB_CPP_DIALECT >= 2011
+// Host code, only safely usable in C++11 or newer, where thread-safe
+// initialization of static locals is guaranteed.  This is a separate function
+// to avoid defining a local static in a host/device function.
+__host__ __forceinline__ int DeviceCountCachedValue()
+{
+    static ValueCache<int, DeviceCountUncached> cache;
+    return cache.value;
+}
+#endif
+
+/**
+ * \brief Returns the number of CUDA devices available.
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount()
+{
+    int result = -1;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11.
+                result = DeviceCountCachedValue();
+            #else
+                // Host code and C++98.
+                result = DeviceCountUncached();
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = DeviceCountUncached();
+        #endif
+    }
+    return result;
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+/**
+ * \brief Per-device cache for a CUDA attribute value; the attribute is queried
+ *        and stored for each device upon construction.
+ */
+struct PerDeviceAttributeCache
+{
+    struct DevicePayload
+    {
+        int         attribute;
+        cudaError_t error;
+    };
+
+    // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
+    // `DeviceEntryInitializing` state, and then proceeds to the
+    // `DeviceEntryReady` state. These are the only state transitions allowed;
+    // e.g. a linear sequence of transitions.
+    enum DeviceEntryStatus
+    {
+        DeviceEntryEmpty = 0,
+        DeviceEntryInitializing,
+        DeviceEntryReady
+    };
+
+    struct DeviceEntry
+    {
+        std::atomic<DeviceEntryStatus> flag;
+        DevicePayload                  payload;
+    };
+
+private:
+    std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;
+
+public:
+    /**
+     * \brief Construct the cache.
+     */
+    __host__ __forceinline__ PerDeviceAttributeCache() : entries_()
+    {
+        assert(DeviceCount() <= CUB_MAX_DEVICES);
+    }
+
+    /**
+     * \brief Retrieves the payload of the cached function \p f for \p device.
+     *
+     * \note You must pass a morally equivalent function in to every call or
+     *       this function has undefined behavior.
+     */
+    template <typename Invocable>
+    __host__ DevicePayload operator()(Invocable&& f, int device)
+    {
+        if (device >= DeviceCount())
+            return DevicePayload{0, cudaErrorInvalidDevice};
+
+        auto& entry   = entries_[device];
+        auto& flag    = entry.flag;
+        auto& payload = entry.payload;
+
+        DeviceEntryStatus old_status = DeviceEntryEmpty;
+
+        // First, check for the common case of the entry being ready.
+        if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
+        {
+            // Assume the entry is empty and attempt to lock it so we can fill
+            // it by trying to set the state from `DeviceEntryReady` to
+            // `DeviceEntryInitializing`.
+            if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
+                                             std::memory_order_acq_rel,
+                                             std::memory_order_acquire))
+            {
+                // We successfully set the state to `DeviceEntryInitializing`;
+                // we have the lock and it's our job to initialize this entry
+                // and then release it.
+
+                // We don't use `CubDebug` here because we let the user code
+                // decide whether or not errors are hard errors.
+                if (payload.error = std::forward<Invocable>(f)(payload.attribute))
+                    // Clear the global CUDA error state which may have been
+                    // set by the last call. Otherwise, errors may "leak" to
+                    // unrelated kernel launches.
+                    cudaGetLastError();
+
+                // Release the lock by setting the state to `DeviceEntryReady`.
+                flag.store(DeviceEntryReady, std::memory_order_release);
+            }
+
+            // If the `compare_exchange_weak` failed, then `old_status` has
+            // been updated with the value of `flag` that it observed.
+
+            else if (old_status == DeviceEntryInitializing)
+            {
+                // Another execution agent is initializing this entry; we need
+                // to wait for them to finish; we'll know they're done when we
+                // observe the entry status as `DeviceEntryReady`.
+                do { old_status = flag.load(std::memory_order_acquire); }
+                while (old_status != DeviceEntryReady);
+                // FIXME: Use `atomic::wait` instead when we have access to
+                // host-side C++20 atomics. We could use libcu++, but it only
+                // supports atomics for SM60 and up, even if you're only using
+                // them in host code.
+            }
+        }
+
+        // We now know that the state of our entry is `DeviceEntryReady`, so
+        // just return the entry's payload.
+        return entry.payload;
+    }
+};
+
+#endif
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version)
+{
+    // Instantiate `EmptyKernel<void>` in both host and device code to ensure
+    // it can be called.
+    typedef void (*EmptyKernelPtr)();
+    EmptyKernelPtr empty_kernel = EmptyKernel<void>;
+
+    // This is necessary for unused variable warnings in host compilers. The
+    // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
+    (void)reinterpret_cast<void*>(empty_kernel);
+
+    cudaError_t result = cudaSuccess;
+    if (CUB_IS_HOST_CODE) {
+       #if CUB_INCLUDE_HOST_CODE
+            cudaFuncAttributes empty_kernel_attrs;
+
+            do {
+                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
+                    break;
+            }
+            while(0);
+
+            ptx_version = empty_kernel_attrs.ptxVersion * 10;
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // This is necessary to ensure instantiation of EmptyKernel in device code.
+            // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
+            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
+            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
+
+            ptx_version = CUB_PTX_ARCH;
+        #endif
+    }
+    return result;
+}
+
+/**
+ * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
+ */
+__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device)
+{
+    SwitchDevice sd(device);
+    return PtxVersionUncached(ptx_version);
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+template <typename Tag>
+__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache()
+{
+    // C++11 guarantees that initialization of static locals is thread safe.
+    static PerDeviceAttributeCache cache;
+    return cache;
+}
+
+struct PtxVersionCacheTag {};
+struct SmVersionCacheTag {};
+#endif
+
+/**
+ * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
+{
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+    auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
+      // If this call fails, then we get the error code back in the payload,
+      // which we check with `CubDebug` below.
+      [=] (int& pv) { return PtxVersionUncached(pv, device); },
+      device);
+
+    if (!CubDebug(payload.error))
+        ptx_version = payload.attribute;
+
+    return payload.error;
+
+#else // Pre C++11.
+
+    return PtxVersionUncached(ptx_version, device);
+
+#endif
+}
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11.
+                auto const device = CurrentDevice();
+
+                auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
+                  // If this call fails, then we get the error code back in the payload,
+                  // which we check with `CubDebug` below.
+                  [=] (int& pv) { return PtxVersionUncached(pv, device); },
+                  device);
+
+                if (!CubDebug(payload.error))
+                    ptx_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98.
+                result = PtxVersionUncached(ptx_version);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = PtxVersionUncached(ptx_version);
+        #endif
+    }
+    return result;
+}
+
+/**
+ * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        int major = 0, minor = 0;
+        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
+        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
+        sm_version = major * 100 + minor * 10;
+    }
+    while (0);
+
+    return error;
+
+#else // Device code without the CUDA runtime.
+
+    (void)sm_version;
+    (void)device;
+
+    // CUDA API calls are not supported from this device.
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#endif
+}
+
+/**
+ * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11
+                auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
+                  // If this call fails, then we get the error code back in the payload,
+                  // which we check with `CubDebug` below.
+                  [=] (int& pv) { return SmVersionUncached(pv, device); },
+                  device);
+
+                if (!CubDebug(payload.error))
+                    sm_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98
+                result = SmVersionUncached(sm_version, device);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            result = SmVersionUncached(sm_version, device);
+        #endif
+    }
+    return result;
+}
+
+/**
+ * Synchronize the specified \p stream.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            result = CubDebug(cudaStreamSynchronize(stream));
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
+                (void)stream;
+                // Device can't yet sync on a specific stream
+                result = CubDebug(cudaDeviceSynchronize());
+            #else // Device code without the CUDA runtime.
+                (void)stream;
+                // CUDA API calls are not supported from this device.
+                result = CubDebug(cudaErrorInvalidConfiguration);
+            #endif
+        #endif
+    }
+    return result;
+}
+
+
+/**
+ * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of the MaxSmOccupancy function.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
+ *
+ * template <typename T>
+ * __global__ void ExampleKernel()
+ * {
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ volatile T buffer[4096];
+ *
+ *        ...
+ * }
+ *
+ *     ...
+ *
+ * // Determine SM occupancy for ExampleKernel specialized for unsigned char
+ * int max_sm_occupancy;
+ * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
+ *
+ * // max_sm_occupancy  <-- 4 on SM10
+ * // max_sm_occupancy  <-- 8 on SM20
+ * // max_sm_occupancy  <-- 12 on SM35
+ *
+ * \endcode
+ *
+ */
+template <typename KernelPtr>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t MaxSmOccupancy(
+    int&                max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
+    int                 block_threads,              ///< [in] Number of threads per thread block
+    int                 dynamic_smem_bytes = 0)
+{
+#ifndef CUB_RUNTIME_ENABLED
+
+    (void)dynamic_smem_bytes;
+    (void)block_threads;
+    (void)kernel_ptr;
+    (void)max_sm_occupancy;
+
+    // CUDA API calls not supported from this device
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#else
+
+    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_sm_occupancy,
+        kernel_ptr,
+        block_threads,
+        dynamic_smem_bytes));
+
+#endif  // CUB_RUNTIME_ENABLED
+}
+
+
+/******************************************************************************
+ * Policy management
+ ******************************************************************************/
+
+/**
+ * Kernel dispatch configuration
+ */
+struct KernelConfig
+{
+    int block_threads;
+    int items_per_thread;
+    int tile_size;
+    int sm_occupancy;
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
+
+    template <typename AgentPolicyT, typename KernelPtrT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Init(KernelPtrT kernel_ptr)
+    {
+        block_threads        = AgentPolicyT::BLOCK_THREADS;
+        items_per_thread     = AgentPolicyT::ITEMS_PER_THREAD;
+        tile_size            = block_threads * items_per_thread;
+        cudaError_t retval   = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
+        return retval;
+    }
+};
+
+
+
+/// Helper for dispatching into a policy chain
+template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
+struct ChainedPolicy
+{
+   /// The policy for the active compiler pass
+   typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
+
+   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
+   template <typename FunctorT>
+   CUB_RUNTIME_FUNCTION __forceinline__
+   static cudaError_t Invoke(int ptx_version, FunctorT& op)
+   {
+       if (ptx_version < PTX_VERSION) {
+           return PrevPolicyT::Invoke(ptx_version, op);
+       }
+       return op.template Invoke<PolicyT>();
+   }
+};
+
+/// Helper for dispatching into a policy chain (end-of-chain specialization)
+template <int PTX_VERSION, typename PolicyT>
+struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
+{
+    /// The policy for the active compiler pass
+    typedef PolicyT ActivePolicy;
+
+    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
+    template <typename FunctorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) {
+        return op.template Invoke<PolicyT>();
+    }
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_macro.cuh b/thrust/cub/util_macro.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ff8636542286de8a2fa956522415a46c1b5524ef
--- /dev/null
+++ b/thrust/cub/util_macro.cuh
@@ -0,0 +1,103 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Common C/C++ macro utilities
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+#ifndef CUB_ALIGN
+    #if defined(_WIN32) || defined(_WIN64)
+        /// Align struct
+        #define CUB_ALIGN(bytes) __declspec(align(32))
+    #else
+        /// Align struct
+        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
+    #endif
+#endif
+
+#ifndef CUB_MAX
+    /// Select maximum(a, b)
+    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_MIN
+    /// Select minimum(a, b)
+    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_QUOTIENT_FLOOR
+    /// Quotient of x/y rounded down to nearest integer
+    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
+#endif
+
+#ifndef CUB_QUOTIENT_CEILING
+    /// Quotient of x/y rounded up to nearest integer
+    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
+#endif
+
+#ifndef CUB_ROUND_UP_NEAREST
+    /// x rounded up to the nearest multiple of y
+    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
+#endif
+
+#ifndef CUB_ROUND_DOWN_NEAREST
+    /// x rounded down to the nearest multiple of y
+    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
+#endif
+
+
+#ifndef CUB_STATIC_ASSERT
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+        #define CUB_CAT_(a, b) a ## b
+        #define CUB_CAT(a, b) CUB_CAT_(a, b)
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// Static assert
+    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
+#endif
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_namespace.cuh b/thrust/cub/util_namespace.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4488d97f6bd151e4ed9514d956d5b4590c0e38ce
--- /dev/null
+++ b/thrust/cub/util_namespace.cuh
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Place-holder for prefixing the cub namespace
+ */
+
+#pragma once
+
+#include "version.cuh"
+
+// For example:
+//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
+//#define CUB_NS_POSTFIX } }
+
+#ifndef CUB_NS_PREFIX
+#define CUB_NS_PREFIX
+#endif
+
+#ifndef CUB_NS_POSTFIX
+#define CUB_NS_POSTFIX
+#endif
+
+// Declare these namespaces here for the purpose of Doxygenating them
+
+/*! \namespace cub
+ *  \brief \p cub is the top-level namespace which contains all CUB
+ *         functions and types.
+ */
+namespace cub
+{
+
+}
diff --git a/thrust/cub/util_ptx.cuh b/thrust/cub/util_ptx.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3f20c11bebc2a9f9e2704dd88e18930f97f4c0c8
--- /dev/null
+++ b/thrust/cub/util_ptx.cuh
@@ -0,0 +1,734 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * PTX intrinsics
+ */
+
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilPtx
+ * @{
+ */
+
+
+/******************************************************************************
+ * PTX helper macros
+ ******************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Register modifier for pointer-types (for inlining PTX assembly)
+ */
+#if defined(_WIN64) || defined(__LP64__)
+    #define __CUB_LP64__ 1
+    // 64-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "l"
+    #define _CUB_ASM_PTR_SIZE_ "u64"
+#else
+    #define __CUB_LP64__ 0
+    // 32-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "r"
+    #define _CUB_ASM_PTR_SIZE_ "u32"
+#endif
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Inlined PTX intrinsics
+ ******************************************************************************/
+
+/**
+ * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
+ */
+__device__ __forceinline__ unsigned int SHR_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+    return ret;
+}
+
+
+/**
+ * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
+ */
+__device__ __forceinline__ unsigned int SHL_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Bitfield-extract.
+ */
+template <typename UnsignedBits, int BYTE_LEN>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<BYTE_LEN>      /*byte_len*/)
+{
+    unsigned int bits;
+    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
+    return bits;
+}
+
+
+/**
+ * Bitfield-extract for 64-bit types.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<8>             /*byte_len*/)
+{
+    const unsigned long long MASK = (1ull << num_bits) - 1;
+    return (source >> bit_start) & MASK;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits source,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
+}
+
+
+/**
+ * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
+ */
+__device__ __forceinline__ void BFI(
+    unsigned int &ret,
+    unsigned int x,
+    unsigned int y,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
+        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
+}
+
+
+/**
+ * \brief Three-operand add.  Returns \p x + \p y + \p z.
+ */
+__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
+{
+    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
+    return x;
+}
+
+
+/**
+ * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
+ *
+ * \par
+ * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
+ * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
+ * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
+ * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
+ *
+ * \par Snippet
+ * The code snippet below illustrates byte-permute.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     int a        = 0x03020100;
+ *     int b        = 0x07060504;
+ *     int index    = 0x00007531;
+ *
+ *     int selected = PRMT(a, b, index);    // 0x07050301
+ *
+ * \endcode
+ *
+ */
+__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
+{
+    int ret;
+    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Sync-threads barrier.
+ */
+__device__ __forceinline__ void BAR(int count)
+{
+    asm volatile("bar.sync 1, %0;" : : "r"(count));
+}
+
+/**
+ * CTA barrier
+ */
+__device__  __forceinline__ void CTA_SYNC()
+{
+    __syncthreads();
+}
+
+
+/**
+ * CTA barrier with predicate
+ */
+__device__  __forceinline__ int CTA_SYNC_AND(int p)
+{
+    return __syncthreads_and(p);
+}
+
+
+/**
+ * Warp barrier
+ */
+__device__  __forceinline__ void WARP_SYNC(unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    __syncwarp(member_mask);
+#endif
+}
+
+
+/**
+ * Warp any
+ */
+__device__  __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __any_sync(member_mask, predicate);
+#else
+    return ::__any(predicate);
+#endif
+}
+
+
+/**
+ * Warp any
+ */
+__device__  __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __all_sync(member_mask, predicate);
+#else
+    return ::__all(predicate);
+#endif
+}
+
+
+/**
+ * Warp ballot
+ */
+__device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __ballot_sync(member_mask, predicate);
+#else
+    return __ballot(predicate);
+#endif
+}
+
+/**
+ * Warp synchronous shfl_up
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_down
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_idx
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Floating point multiply. (Mantissa LSB rounds towards zero.)
+ */
+__device__ __forceinline__ float FMUL_RZ(float a, float b)
+{
+    float d;
+    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
+    return d;
+}
+
+
+/**
+ * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
+ */
+__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
+{
+    float d;
+    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
+    return d;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Terminates the calling thread
+ */
+__device__ __forceinline__ void ThreadExit() {
+    asm volatile("exit;");
+}    
+
+
+/**
+ * \brief  Abort execution and generate an interrupt to the host CPU
+ */
+__device__ __forceinline__ void ThreadTrap() {
+    asm volatile("trap;");
+}
+
+
+/**
+ * \brief Returns the row-major linear thread identifier for a multidimensional thread block
+ */
+__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
+{
+    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
+            ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
+            threadIdx.x;
+}
+
+
+/**
+ * \brief Returns the warp lane ID of the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
+    return ret;
+}
+
+
+/**
+ * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
+ */
+__device__ __forceinline__ unsigned int WarpId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskLt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskLe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskGt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskGe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
+    return ret;
+}
+
+/** @} */       // end group UtilPtx
+
+
+
+
+/**
+ * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
+ * \ingroup WarpModule
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * predecessor of its predecessor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks below
+ *     double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleUp(
+    T               input,              ///< [in] The value to broadcast
+    int             src_offset,         ///< [in] The relative down-offset of the peer to read from
+    int             first_thread,       ///< [in] Index of first lane in logical warp (typically 0)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+ 
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
+ * \ingroup WarpModule
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * successor of its successor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks below
+ *     double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleDown(
+    T               input,              ///< [in] The value to broadcast
+    int             src_offset,         ///< [in] The relative up-offset of the peer to read from
+    int             last_thread,        ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word    = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
+ * contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
+ * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \ingroup WarpModule
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
+ *
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from thread 0
+ *     double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleIndex(
+    T               input,                  ///< [in] The value to broadcast
+    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
+    unsigned int    member_mask)            ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
+                                 src_lane,
+                                 SHFL_C,
+                                 member_mask);
+
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
+                                     src_lane,
+                                     SHFL_C,
+                                     member_mask);
+
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+
+/**
+ * Compute a 32b mask of threads having the same least-significant
+ * LABEL_BITS of \p label as the calling thread.
+ */
+template <int LABEL_BITS>
+inline __device__ unsigned int MatchAny(unsigned int label)
+{
+    unsigned int retval;
+
+    // Extract masks of common threads for each bit
+    #pragma unroll
+    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
+    {
+        unsigned int mask;
+        unsigned int current_bit = 1 << BIT;
+        asm ("{\n"
+            "    .reg .pred p;\n"
+            "    and.b32 %0, %1, %2;"
+            "    setp.eq.u32 p, %0, %2;\n"
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
+#else
+            "    vote.ballot.b32 %0, p;\n"
+#endif
+            "    @!p not.b32 %0, %0;\n"
+            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
+
+        // Remove peers who differ
+        retval = (BIT == 0) ? mask : retval & mask;
+    }
+
+    return retval;
+
+//  // VOLTA match
+//    unsigned int retval;
+//    asm ("{\n"
+//         "    match.any.sync.b32 %0, %1, 0xffffffff;\n"
+//         "}\n" : "=r"(retval) : "r"(label));
+//    return retval;
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/util_type.cuh b/thrust/cub/util_type.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0ba41e1ed26e56c11f373fd235fc9dee88fd213c
--- /dev/null
+++ b/thrust/cub/util_type.cuh
@@ -0,0 +1,1167 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Common type manipulation (metaprogramming) utilities
+ */
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+#include <cfloat>
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    #include <cuda_fp16.h>
+#endif
+
+#include "util_macro.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+
+/******************************************************************************
+ * Type equality
+ ******************************************************************************/
+
+/**
+ * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
+ */
+template <bool IF, typename ThenType, typename ElseType>
+struct If
+{
+    /// Conditional type result
+    typedef ThenType Type;      // true
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename ThenType, typename ElseType>
+struct If<false, ThenType, ElseType>
+{
+    typedef ElseType Type;      // false
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Type equality test
+ */
+template <typename A, typename B>
+struct Equals
+{
+    enum {
+        VALUE = 0,
+        NEGATE = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename A>
+struct Equals <A, A>
+{
+    enum {
+        VALUE = 1,
+        NEGATE = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Static math
+ ******************************************************************************/
+
+/**
+ * \brief Statically determine log2(N), rounded up.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ */
+template <int N, int CURRENT_VAL = N, int COUNT = 0>
+struct Log2
+{
+    /// Static logarithm value
+    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <int N, int COUNT>
+struct Log2<N, 0, COUNT>
+{
+    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
+        COUNT :
+        COUNT - 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Statically determine if N is a power-of-two
+ */
+template <int N>
+struct PowerOfTwo
+{
+    enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+
+
+/******************************************************************************
+ * Pointer vs. iterator detection
+ ******************************************************************************/
+
+/**
+ * \brief Pointer vs. iterator
+ */
+template <typename Tp>
+struct IsPointer
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsPointer<Tp*>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Qualifier detection
+ ******************************************************************************/
+
+/**
+ * \brief Volatile modifier test
+ */
+template <typename Tp>
+struct IsVolatile
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsVolatile<Tp volatile>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Qualifier removal
+ ******************************************************************************/
+
+/**
+ * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
+ *
+ * For example:
+ *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
+ */
+template <typename Tp, typename Up = Tp>
+struct RemoveQualifiers
+{
+    /// Type without \p const and \p volatile qualifiers
+    typedef Up Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, volatile Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const volatile Up>
+{
+    typedef Up Type;
+};
+
+
+/******************************************************************************
+ * Marker types
+ ******************************************************************************/
+
+/**
+ * \brief A simple "NULL" marker type
+ */
+struct NullType
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <typename T>
+    __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
+
+    __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; }
+
+    __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+
+/**
+ * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
+ */
+template <int A>
+struct Int2Type
+{
+   enum {VALUE = A};
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/******************************************************************************
+ * Size and alignment
+ ******************************************************************************/
+
+/// Structure alignment
+template <typename T>
+struct AlignBytes
+{
+    struct Pad
+    {
+        T       val;
+        char    byte;
+    };
+
+    enum
+    {
+        /// The "true CUDA" alignment of T in bytes
+        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
+    };
+
+    /// The "truly aligned" type
+    typedef T Type;
+};
+
+// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
+// with device C++ compilers (EDG) on types passed as template parameters through
+// kernel functions
+
+#define __CUB_ALIGN_BYTES(t, b)         \
+    template <> struct AlignBytes<t>    \
+    { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
+
+__CUB_ALIGN_BYTES(short4, 8)
+__CUB_ALIGN_BYTES(ushort4, 8)
+__CUB_ALIGN_BYTES(int2, 8)
+__CUB_ALIGN_BYTES(uint2, 8)
+__CUB_ALIGN_BYTES(long long, 8)
+__CUB_ALIGN_BYTES(unsigned long long, 8)
+__CUB_ALIGN_BYTES(float2, 8)
+__CUB_ALIGN_BYTES(double, 8)
+#ifdef _WIN32
+    __CUB_ALIGN_BYTES(long2, 8)
+    __CUB_ALIGN_BYTES(ulong2, 8)
+#else
+    __CUB_ALIGN_BYTES(long2, 16)
+    __CUB_ALIGN_BYTES(ulong2, 16)
+#endif
+__CUB_ALIGN_BYTES(int4, 16)
+__CUB_ALIGN_BYTES(uint4, 16)
+__CUB_ALIGN_BYTES(float4, 16)
+__CUB_ALIGN_BYTES(long4, 16)
+__CUB_ALIGN_BYTES(ulong4, 16)
+__CUB_ALIGN_BYTES(longlong2, 16)
+__CUB_ALIGN_BYTES(ulonglong2, 16)
+__CUB_ALIGN_BYTES(double2, 16)
+__CUB_ALIGN_BYTES(longlong4, 16)
+__CUB_ALIGN_BYTES(ulonglong4, 16)
+__CUB_ALIGN_BYTES(double4, 16)
+
+template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
+
+
+/// Unit-words of data movement
+template <typename T>
+struct UnitWord
+{
+    enum {
+        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
+    };
+
+    template <typename Unit>
+    struct IsMultiple
+    {
+        enum {
+            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
+            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
+        };
+    };
+
+    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
+        unsigned int,
+        typename If<IsMultiple<short>::IS_MULTIPLE,
+            unsigned short,
+            unsigned char>::Type>::Type         ShuffleWord;
+
+    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
+        unsigned long long,
+        ShuffleWord>::Type                      VolatileWord;
+
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
+        ulonglong2,
+        VolatileWord>::Type                     DeviceWord;
+
+    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
+        uint4,
+        typename If<IsMultiple<int2>::IS_MULTIPLE,
+            uint2,
+            ShuffleWord>::Type>::Type           TextureWord;
+};
+
+
+// float2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float2>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float       VolatileWord;
+    typedef uint2       DeviceWord;
+#else
+    typedef unsigned long long   VolatileWord;
+    typedef unsigned long long   DeviceWord;
+#endif
+    typedef float2      TextureWord;
+};
+
+// float4 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float4>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float               VolatileWord;
+    typedef uint4               DeviceWord;
+#else
+    typedef unsigned long long  VolatileWord;
+    typedef ulonglong2          DeviceWord;
+#endif
+    typedef float4              TextureWord;
+};
+
+
+// char2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <char2>
+{
+    typedef unsigned short      ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef unsigned short      VolatileWord;
+    typedef short               DeviceWord;
+#else
+    typedef unsigned short      VolatileWord;
+    typedef unsigned short      DeviceWord;
+#endif
+    typedef unsigned short      TextureWord;
+};
+
+
+template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Vector type inference utilities.
+ ******************************************************************************/
+
+/**
+ * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
+ */
+template <typename T, int vec_elements> struct CubVector;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+enum
+{
+    /// The maximum number of elements in CUDA vector types
+    MAX_VEC_ELEMENTS = 4,
+};
+
+
+/**
+ * Generic vector-1 type
+ */
+template <typename T>
+struct CubVector<T, 1>
+{
+    T x;
+
+    typedef T BaseType;
+    typedef CubVector<T, 1> Type;
+};
+
+/**
+ * Generic vector-2 type
+ */
+template <typename T>
+struct CubVector<T, 2>
+{
+    T x;
+    T y;
+
+    typedef T BaseType;
+    typedef CubVector<T, 2> Type;
+};
+
+/**
+ * Generic vector-3 type
+ */
+template <typename T>
+struct CubVector<T, 3>
+{
+    T x;
+    T y;
+    T z;
+
+    typedef T BaseType;
+    typedef CubVector<T, 3> Type;
+};
+
+/**
+ * Generic vector-4 type
+ */
+template <typename T>
+struct CubVector<T, 4>
+{
+    T x;
+    T y;
+    T z;
+    T w;
+
+    typedef T BaseType;
+    typedef CubVector<T, 4> Type;
+};
+
+
+/**
+ * Macro for expanding partially-specialized built-in vector types
+ */
+#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
+                                                                                                        \
+    template<> struct CubVector<base_type, 1> : short_type##1                                           \
+    {                                                                                                   \
+      typedef base_type       BaseType;                                                                 \
+      typedef short_type##1   Type;                                                                     \
+      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x + other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x - other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 2> : short_type##2                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##2   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 3> : short_type##3                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##3   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 4> : short_type##4                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##4   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            retval.w = w + other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            retval.w = w - other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };
+
+
+
+// Expand CUDA vector types for built-in primitives
+CUB_DEFINE_VECTOR_TYPE(char,               char)
+CUB_DEFINE_VECTOR_TYPE(signed char,        char)
+CUB_DEFINE_VECTOR_TYPE(short,              short)
+CUB_DEFINE_VECTOR_TYPE(int,                int)
+CUB_DEFINE_VECTOR_TYPE(long,               long)
+CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
+CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
+CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
+CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
+CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
+CUB_DEFINE_VECTOR_TYPE(float,              float)
+CUB_DEFINE_VECTOR_TYPE(double,             double)
+CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
+
+// Undefine macros
+#undef CUB_DEFINE_VECTOR_TYPE
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Wrapper types
+ ******************************************************************************/
+
+/**
+ * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
+ */
+template <typename T>
+struct Uninitialized
+{
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    /// Backing storage
+    DeviceWord storage[WORDS];
+
+    /// Alias
+    __host__ __device__ __forceinline__ T& Alias()
+    {
+        return reinterpret_cast<T&>(*this);
+    }
+};
+
+
+/**
+ * \brief A key identifier paired with a corresponding value
+ */
+template <
+    typename    _Key,
+    typename    _Value
+#if defined(_WIN32) && !defined(_WIN64)
+    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
+    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+    >
+struct KeyValuePair
+{
+    typedef _Key    Key;                ///< Key data type
+    typedef _Value  Value;              ///< Value data type
+
+    Key     key;                        ///< Item key
+    Value   value;                      ///< Item value
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+/**
+ * Win32 won't do 16B alignment.  This can present two problems for
+ * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
+ * 1) If a smaller-aligned item were to be listed first, the host compiler places the
+ *    should-be-16B item at too early an offset (and disagrees with device compiler)
+ * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
+ *    of the struct wrong (and disagrees with device compiler)
+ *
+ * So we put the larger-should-be-aligned item first, and explicitly pad the
+ * end of the struct
+ */
+
+/// Smaller key specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, true, false>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
+
+    Value   value;  // Value has larger would-be alignment and goes first
+    Key     key;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+
+/// Smaller value specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, false, true>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
+
+    Key     key;    // Key has larger would-be alignment and goes first
+    Value   value;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * \brief A wrapper for passing simple static arrays as kernel parameters
+ */
+template <typename T, int COUNT>
+struct ArrayWrapper
+{
+
+    /// Statically-sized array of type \p T
+    T array[COUNT];
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArrayWrapper() {}
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
+ *
+ * Many multi-pass computations require a pair of "ping-pong" storage
+ * buffers (e.g., one for reading from and the other for writing to, and then
+ * vice-versa for the subsequent pass).  This structure wraps a set of device
+ * buffers and a "selector" member to track which is "current".
+ */
+template <typename T>
+struct DoubleBuffer
+{
+    /// Pair of device buffer pointers
+    T *d_buffers[2];
+
+    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
+    int selector;
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer()
+    {
+        selector = 0;
+        d_buffers[0] = NULL;
+        d_buffers[1] = NULL;
+    }
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer(
+        T *d_current,         ///< The currently valid buffer
+        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
+    {
+        selector = 0;
+        d_buffers[0] = d_current;
+        d_buffers[1] = d_alternate;
+    }
+
+    /// \brief Return pointer to the currently valid buffer
+    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
+
+    /// \brief Return pointer to the currently invalid buffer
+    __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; }
+
+};
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
+ */
+#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
+    template <typename T>                                               \
+    struct detector_name                                                \
+    {                                                                   \
+        template <typename C>                                           \
+        static char& test(typename C::nested_type_name*);               \
+        template <typename>                                             \
+        static int& test(...);                                          \
+        enum                                                            \
+        {                                                               \
+            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
+        };                                                              \
+    };
+
+
+
+/******************************************************************************
+ * Simple enable-if (similar to Boost)
+ ******************************************************************************/
+
+/**
+ * \brief Simple enable-if (similar to Boost)
+ */
+template <bool Condition, class T = void>
+struct EnableIf
+{
+    /// Enable-if type for SFINAE dummy variables
+    typedef T Type;
+};
+
+
+template <class T>
+struct EnableIf<false, T> {};
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+/**
+ * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
+ */
+template <typename T, typename BinaryOp>
+struct BinaryOpHasIdxParam
+{
+private:
+/*
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
+*/
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
+/*
+    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
+*/
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+
+    template <typename BinaryOpT> static int Test(...);
+
+public:
+
+    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
+    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
+};
+
+
+
+
+/******************************************************************************
+ * Simple type traits utilities.
+ *
+ * For example:
+ *     Traits<int>::CATEGORY             // SIGNED_INTEGER
+ *     Traits<NullType>::NULL_TYPE       // true
+ *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
+ *     Traits<uint4>::PRIMITIVE;         // false
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Basic type traits categories
+ */
+enum Category
+{
+    NOT_A_NUMBER,
+    SIGNED_INTEGER,
+    UNSIGNED_INTEGER,
+    FLOATING_POINT
+};
+
+
+/**
+ * \brief Basic type traits
+ */
+template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
+struct BaseTraits
+{
+    /// Category
+    static const Category CATEGORY      = _CATEGORY;
+    enum
+    {
+        PRIMITIVE       = _PRIMITIVE,
+        NULL_TYPE       = _NULL_TYPE,
+    };
+};
+
+
+/**
+ * Basic type traits (unsigned primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = UNSIGNED_INTEGER;
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key;
+    }
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key;
+    }
+
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+
+/**
+ * Basic type traits (signed primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = SIGNED_INTEGER;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+template <typename _T>
+struct FpLimits;
+
+template <>
+struct FpLimits<float>
+{
+    static __host__ __device__ __forceinline__ float Max() {
+        return FLT_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ float Lowest() {
+        return FLT_MAX * float(-1);
+    }
+};
+
+template <>
+struct FpLimits<double>
+{
+    static __host__ __device__ __forceinline__ double Max() {
+        return DBL_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ double Lowest() {
+        return DBL_MAX  * double(-1);
+    }
+};
+
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+template <>
+struct FpLimits<__half>
+{
+    static __host__ __device__ __forceinline__ __half Max() {
+        unsigned short max_word = 0x7BFF;
+        return reinterpret_cast<__half&>(max_word);
+    }
+
+    static __host__ __device__ __forceinline__ __half Lowest() {
+        unsigned short lowest_word = 0xFBFF;
+        return reinterpret_cast<__half&>(lowest_word);
+    }
+};
+#endif
+
+
+/**
+ * Basic type traits (fp primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = FLOATING_POINT;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
+        return key ^ mask;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
+        return key ^ mask;
+    };
+
+    static __host__ __device__ __forceinline__ T Max() {
+        return FpLimits<T>::Max();
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest() {
+        return FpLimits<T>::Lowest();
+    }
+};
+
+
+/**
+ * \brief Numeric type traits
+ */
+template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
+
+template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
+
+template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
+template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
+template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
+template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
+template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
+template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
+
+template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
+template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
+template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
+template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
+template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
+
+template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
+template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
+#endif
+
+template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
+
+
+
+/**
+ * \brief Type traits
+ */
+template <typename T>
+struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/version.cuh b/thrust/cub/version.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..122fb9a7625da78287bacf0f4d51e3779dfd2dd5
--- /dev/null
+++ b/thrust/cub/version.cuh
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/*! \file version.h
+ *  \brief Compile-time macros encoding CUB release version
+ *
+ *         <cub/version.h> is the only CUB header that is guaranteed to
+ *         change with every CUB release.
+ *
+ */
+
+#pragma once
+
+/*! \def CUB_VERSION
+ *  \brief The preprocessor macro \p CUB_VERSION encodes the version
+ *         number of the CUB library.
+ *
+ *         <tt>CUB_VERSION % 100</tt> is the sub-minor version.
+ *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
+ *         <tt>CUB_VERSION / 100000</tt> is the major version.
+ */
+#define CUB_VERSION 101000
+
+/*! \def CUB_MAJOR_VERSION
+ *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
+ *         major version number of the CUB library.
+ */
+#define CUB_MAJOR_VERSION     (CUB_VERSION / 100000)
+
+/*! \def CUB_MINOR_VERSION
+ *  \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the
+ *         minor version number of the CUB library.
+ */
+#define CUB_MINOR_VERSION     (CUB_VERSION / 100 % 1000)
+
+/*! \def CUB_SUBMINOR_VERSION
+ *  \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the
+ *         sub-minor version number of the CUB library.
+ */
+#define CUB_SUBMINOR_VERSION  (CUB_VERSION % 100)
+
+/*! \def CUB_PATCH_NUMBER
+ *  \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the
+ *         patch number of the CUB library.
+ */
+#define CUB_PATCH_NUMBER 0
diff --git a/thrust/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/cub/warp/specializations/warp_reduce_shfl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..dbc56ec1bc46ca7548b58daf3b6a797b2b815585
--- /dev/null
+++ b/thrust/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -0,0 +1,542 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+
+#include <stdint.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpReduceShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp reduction steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// Number of logical warps in a PTX warp
+        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+
+    };
+
+    template <typename S>
+    struct IsInteger
+    {
+        enum {
+            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+
+    /// Shared memory storage layout type
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    /// Lane index in logical warp
+    int lane_id;
+
+    /// Logical warp index in 32-thread physical warp
+    int warp_id;
+
+    /// 32-thread physical warp member mask of logical warp
+    uint32_t member_mask;
+
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceShfl(
+        TempStorage &/*temp_storage*/)
+    {
+        lane_id = static_cast<int>(LaneId());
+        warp_id = 0;
+        member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+        if (!IS_ARCH_WARP)
+        {
+            warp_id = lane_id / LOGICAL_WARP_THREADS;
+            lane_id = lane_id % LOGICAL_WARP_THREADS;
+            member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Reduction steps
+    //---------------------------------------------------------------------
+
+    /// Reduction (specialized for summation across uint32 types)
+    __device__ __forceinline__ unsigned int ReduceStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across fp32 types)
+    __device__ __forceinline__ float ReduceStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across unsigned long long types)
+    __device__ __forceinline__ unsigned long long ReduceStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across long long types)
+    __device__ __forceinline__ long long ReduceStep(
+        long long           input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across double types)
+    __device__ __forceinline__ double ReduceStep(
+        double              input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
+    template <typename ValueT, typename KeyT>
+    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
+        KeyValuePair<KeyT, ValueT>                  input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceByKeyOp<cub::Sum> >     /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                                         last_lane,          ///< [in] Index of last lane in segment
+        int                                         offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<KeyT, ValueT> output;
+
+        KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.key, offset, last_lane, member_mask);
+
+        output.key = input.key;
+        output.value = ReduceStep(
+            input.value,
+            cub::Sum(),
+            last_lane,
+            offset,
+            Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key != other_key)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+
+    /// Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
+    template <typename ValueT, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
+        KeyValuePair<OffsetT, ValueT>                 input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >   /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                                           last_lane,          ///< [in] Index of last lane in segment
+        int                                           offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, ValueT> output;
+
+        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+        output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+    /// Reduction step (generic)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T                  input,              ///< [in] Calling thread's input item.
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        _T output = input;
+
+        _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);
+
+        // Perform reduction op if valid
+        if (offset + lane_id <= last_lane)
+            output = reduction_op(input, temp);
+
+        return output;
+    }
+
+
+    /// Reduction step (specialized for small unsigned integers size 32b or less)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    /// Reduction step (specialized for types other than small unsigned integers size 32b or less)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Templated inclusive scan iteration
+    //---------------------------------------------------------------------
+
+    template <typename ReductionOp, int STEP>
+    __device__ __forceinline__ void ReduceStep(
+        T&              input,              ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        Int2Type<STEP>  /*step*/)
+    {
+        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+
+        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
+    }
+
+    template <typename ReductionOp>
+    __device__ __forceinline__ void ReduceStep(
+        T&              /*input*/,              ///< [in] Calling thread's input item.
+        ReductionOp     /*reduction_op*/,       ///< [in] Binary reduction operator
+        int             /*last_lane*/,          ///< [in] Index of last lane in segment
+        Int2Type<STEPS> /*step*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Reduction operations
+    //---------------------------------------------------------------------
+
+    /// Reduction
+    template <
+        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename        ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                  ///< [in] Calling thread's input
+        int             valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
+    {
+        int last_lane = (ALL_LANES_VALID) ?
+                            LOGICAL_WARP_THREADS - 1 :
+                            valid_items - 1;
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+
+
+    /// Segmented reduction
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
+    {
+        // Get the start flags for each thread in the warp.
+        int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        // Convert to tail-segmented
+        if (HEAD_SEGMENTED)
+            warp_flags >>= 1;
+
+        // Mask out the bits below the current thread
+        warp_flags &= LaneMaskGe();
+
+        // Mask of physical lanes outside the logical warp and convert to logical lanemask
+        if (!IS_ARCH_WARP)
+        {
+            warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS);
+        }
+
+        // Mask in the last lane of logical warp
+        warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
+
+        // Find the next set flag
+        int last_lane = __clz(__brev(warp_flags));
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/cub/warp/specializations/warp_reduce_smem.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2442a8c4f24b948e58871ae1387fce17cbb63ffe
--- /dev/null
+++ b/thrust/cub/warp/specializations/warp_reduce_smem.cuh
@@ -0,0 +1,372 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpReduceSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+
+        /// FlagT status (when not using ballot)
+        UNSET   = 0x0,  // Is initially unset
+        SET     = 0x1,  // Is initially set
+        SEEN    = 0x2,  // Has seen another head flag from a successor peer
+    };
+
+    /// Shared memory flag type
+    typedef unsigned char SmemFlag;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    struct _TempStorage
+    {
+        T           reduce[WARP_SMEM_ELEMENTS];
+        SmemFlag    flags[WARP_SMEM_ELEMENTS];
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+    unsigned int    member_mask;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Regular reduction
+    //---------------------------------------------------------------------
+
+    /**
+     * Reduction step
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp,
+        int                 STEP>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp         reduction_op,           ///< [in] Reduction operator
+        Int2Type<STEP>      /*step*/)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share input through buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+        WARP_SYNC(member_mask);
+
+        // Update input if peer_addend is in range
+        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
+        {
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+            input = reduction_op(input, peer_addend);
+        }
+
+        WARP_SYNC(member_mask);
+
+        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
+    }
+
+
+    /**
+     * Reduction step (terminate)
+     */
+    template <
+        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                      ///< [in] Calling thread's input
+        int                 valid_items,                ///< [in] Total number of valid items across the logical warp
+        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator
+        Int2Type<STEPS>     /*step*/)
+    {
+        return input;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Segmented reduction
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Ballot-based segmented reduce
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        // Get the start flags for each thread in the warp.
+        int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        if (!HEAD_SEGMENTED)
+            warp_flags <<= 1;
+
+        // Keep bits above the current thread.
+        warp_flags &= LaneMaskGt();
+
+        // Accommodate packing of multiple logical warps in a single physical warp
+        if (!IS_ARCH_WARP)
+        {
+            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
+        }
+
+        // Find next flag
+        int next_flag = __clz(__brev(warp_flags));
+
+        // Clip the next segment at the warp boundary if necessary
+        if (LOGICAL_WARP_THREADS != 32)
+            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input into buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Update input if peer_addend is in range
+            if (OFFSET + lane_id < next_flag)
+            {
+                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+                input = reduction_op(input, peer_addend);
+            }
+
+            WARP_SYNC(member_mask);
+        }
+
+        return input;
+    }
+
+
+    /**
+     * Smem-based segmented reduce
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        enum
+        {
+            UNSET   = 0x0,  // Is initially unset
+            SET     = 0x1,  // Is initially set
+            SEEN    = 0x2,  // Has seen another head flag from a successor peer
+        };
+
+        // Alias flags onto shared data storage
+        volatile SmemFlag *flag_storage = temp_storage.flags;
+
+        SmemFlag flag_status = (flag) ? SET : UNSET;
+
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input through buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Get peer from buffer
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+
+            WARP_SYNC(member_mask);
+
+            // Share flag through buffer
+            flag_storage[lane_id] = flag_status;
+
+            // Get peer flag from buffer
+            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
+
+            // Update input if peer was in range
+            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+            {
+                if (HEAD_SEGMENTED)
+                {
+                    // Head-segmented
+                    if ((flag_status & SEEN) == 0)
+                    {
+                        // Has not seen a more distant head flag
+                        if (peer_flag_status & SET)
+                        {
+                            // Has now seen a head flag
+                            flag_status |= SEEN;
+                        }
+                        else
+                        {
+                            // Peer is not a head flag: grab its count
+                            input = reduction_op(input, peer_addend);
+                        }
+
+                        // Update seen status to include that of peer
+                        flag_status |= (peer_flag_status & SEEN);
+                    }
+                }
+                else
+                {
+                    // Tail-segmented.  Simply propagate flag status
+                    if (!flag_status)
+                    {
+                        input = reduction_op(input, peer_addend);
+                        flag_status |= peer_flag_status;
+                    }
+
+                }
+            }
+        }
+
+        return input;
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * Reduction
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp         reduction_op)           ///< [in] Reduction operator
+    {
+        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
+    }
+
+
+    /**
+     * Segmented reduction
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT            flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Reduction operator
+    {
+        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/cub/warp/specializations/warp_scan_shfl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..18b46dd9981728995c9bd0a0708c5355cf0dd1a0
--- /dev/null
+++ b/thrust/cub/warp/specializations/warp_scan_shfl.cuh
@@ -0,0 +1,632 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../util_type.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpScanShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+    };
+
+    template <typename S>
+    struct IntegerTraits
+    {
+        enum {
+            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+    /// Shared memory storage layout type
+    struct TempStorage {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    /// Lane index in logical warp
+    unsigned int lane_id;
+
+    /// Logical warp index in 32-thread physical warp
+    unsigned int warp_id;
+
+    /// 32-thread physical warp member mask of logical warp
+    unsigned int member_mask;
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanShfl(
+        TempStorage &/*temp_storage*/)
+    {
+        lane_id = LaneId();
+        warp_id = 0;
+        member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+        if (!IS_ARCH_WARP)
+        {
+            warp_id = lane_id / LOGICAL_WARP_THREADS;
+            lane_id = lane_id % LOGICAL_WARP_THREADS;
+            member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scan steps
+    //---------------------------------------------------------------------
+
+    /// Inclusive prefix scan step (specialized for summation across int32 types)
+    __device__ __forceinline__ int InclusiveScanStep(
+        int             input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+    /// Inclusive prefix scan step (specialized for summation across uint32 types)
+    __device__ __forceinline__ unsigned int InclusiveScanStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp32 types)
+    __device__ __forceinline__ float InclusiveScanStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across unsigned long long types)
+    __device__ __forceinline__ unsigned long long InclusiveScanStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across long long types)
+    __device__ __forceinline__ long long InclusiveScanStep(
+        long long       input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp64 types)
+    __device__ __forceinline__ double InclusiveScanStep(
+        double          input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+/*
+    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, Value> types)
+    template <typename Value, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
+        KeyValuePair<OffsetT, Value>    input,              ///< [in] Calling thread's input item.
+        ReduceBySegmentOp<cub::Sum>     scan_op,            ///< [in] Binary scan operator
+        int                             first_lane,         ///< [in] Index of first lane in segment
+        int                             offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, Value> output;
+
+        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
+        output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+*/
+
+    /// Inclusive prefix scan step (generic)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        _T temp = ShuffleUp<LOGICAL_WARP_THREADS>(input, offset, first_lane, member_mask);
+
+        // Perform scan op if from a valid peer
+        _T output = scan_op(temp, input);
+        if (static_cast<int>(lane_id) < first_lane + offset)
+            output = input;
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for small integers size 32b or less)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+
+    /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        _T              input,              ///< [in] Calling thread's input item.
+        _T              &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op)            ///< [in] Binary scan operator
+    {
+        inclusive_output = input;
+
+        // Iterate scan steps
+        int segment_first_lane = 0;
+
+        // Iterate scan steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output = InclusiveScanStep(
+                inclusive_output,
+                scan_op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+
+    }
+
+    /// Inclusive scan, specialized for reduce-value-by-key
+    template <typename KeyT, typename ValueT, typename ReductionOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        KeyValuePair<KeyT, ValueT>      input,              ///< [in] Calling thread's input item.
+        KeyValuePair<KeyT, ValueT>      &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ReduceByKeyOp<ReductionOpT >    scan_op)            ///< [in] Binary scan operator
+    {
+        inclusive_output = input;
+
+        KeyT pred_key = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive_output.key, 1, 0, member_mask);
+
+        unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
+
+        // Mask away all lanes greater than ours
+        ballot = ballot & LaneMaskLe();
+
+        // Find index of first set bit
+        int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
+
+        // Iterate scan steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output.value = InclusiveScanStep(
+                inclusive_output.value,
+                scan_op.op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Grab aggregate from last warp lane
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Update inclusive and exclusive using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,          ///< [in]
+        T                       &inclusive,         ///< [in, out]
+        T                       &exclusive,         ///< [out]
+        ScanOpT                 /*scan_op*/,        ///< [in]
+        IsIntegerT              /*is_integer*/)     ///< [in]
+    {
+        // initial value unknown
+        exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+    }
+
+    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+        Update(input, inclusive, exclusive, scan_op, is_integer);
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+        Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/warp/specializations/warp_scan_smem.cuh b/thrust/cub/warp/specializations/warp_scan_smem.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ccd1de30f63f82b04694282aa9d60e329185ed81
--- /dev/null
+++ b/thrust/cub/warp/specializations/warp_scan_smem.cuh
@@ -0,0 +1,397 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpScanSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
+    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+    unsigned int    member_mask;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        int         STEP,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &partial,
+        ScanOp                  scan_op,
+        Int2Type<STEP>          /*step*/)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share partial into buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
+
+        WARP_SYNC(member_mask);
+
+        // Update partial if addend is in range
+        if (HAS_IDENTITY || (lane_id >= OFFSET))
+        {
+            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
+            partial = scan_op(addend, partial);
+        }
+        WARP_SYNC(member_mask);
+
+        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+    }
+
+
+    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &/*partial*/,
+        ScanOp                  /*scan_op*/,
+        Int2Type<STEPS>         /*step*/)
+    {}
+
+
+    /// Inclusive prefix scan (specialized for summation across primitive types)
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        Sum                     scan_op,            ///< [in] Binary scan operator
+        Int2Type<true>          /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        T identity = 0;
+        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
+
+        WARP_SYNC(member_mask);
+
+        // Iterate scan steps
+        output = input;
+        ScanStep<true>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /// Inclusive prefix scan
+    template <typename ScanOp, int IS_PRIMITIVE>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp                  scan_op,            ///< [in] Binary scan operator
+        Int2Type<IS_PRIMITIVE>  /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        // Iterate scan steps
+        output = input;
+        ScanStep<false>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        if (lane_id == src_lane)
+        {
+            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
+        }
+
+        WARP_SYNC(member_mask);
+
+        return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Retrieve aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Update inclusive and exclusive using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,      ///< [in]
+        T                       &inclusive,     ///< [in, out]
+        T                       &exclusive,     ///< [out]
+        ScanOpT                 /*scan_op*/,    ///< [in]
+        IsIntegerT              /*is_integer*/) ///< [in]
+    {
+        // initial value unknown
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+    }
+
+    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 /*scan_op*/,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        cub::Sum                /*scan_o*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Broadcast warp aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+
+        // Update inclusive with initial value
+        inclusive = scan_op(initial_value, inclusive);
+
+        // Get exclusive from exclusive
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
+
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/warp/warp_reduce.cuh b/thrust/cub/warp/warp_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..50ee7056c30f4d3bb07e49083cd9ddefae314281
--- /dev/null
+++ b/thrust/cub/warp/warp_reduce.cuh
@@ -0,0 +1,611 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "specializations/warp_reduce_shfl.cuh"
+#include "specializations/warp_reduce_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
+ *
+ * \tparam T                        The reduction input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
+ * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic reduction)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpReduce}
+ * \par
+ * The code snippet below illustrates four concurrent warp sum reductions within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for 4 warps
+ *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ *     int warp_id = threadIdx.x / 32;
+ *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+ * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
+ * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+ *
+ * \par
+ * The code snippet below illustrates a single warp sum reduction within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for one warp
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a reduction
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Return the warp-wide sum to lane0
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+    };
+
+public:
+
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+private:
+
+    /// Shared memory storage layout type for WarpReduce
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias())
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, cub::Sum());
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     *
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT                tail_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63,
+     * \p 95, and \p 127, respectively  (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/cub/warp/warp_scan.cuh b/thrust/cub/warp/warp_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e9e95008a38721a670c3e8531f390ed40ed41cca
--- /dev/null
+++ b/thrust/cub/warp/warp_scan.cuh
@@ -0,0 +1,935 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "specializations/warp_scan_shfl.cuh"
+#include "specializations/warp_scan_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
+ *
+ * \tparam T                        The scan input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - Supports non-commutative scan operators
+ * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
+ * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic scan)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpScan}
+ * \par
+ * The code snippet below illustrates four concurrent warp prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for 4 warps
+ *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Compute warp-wide prefix sums
+ *     int warp_id = threadIdx.x / 32;
+ *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data in each of the four warps of threads will be
+ * <tt>0, 1, 2, 3, ..., 31}</tt>.
+ *
+ * \par
+ * The code snippet below illustrates a single warp prefix sum within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for one warp
+ *     __shared__ typename WarpScan::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a prefix sum
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Compute warp-wide prefix sums
+ *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// Whether the data type is an integer (which has fully-associative addition)
+        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
+    };
+
+    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
+
+    /// Shared memory storage layout type for WarpScan
+    typedef typename InternalWarpScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpScan(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>1, 2, 3, ..., 32}</tt>.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>0, 1, 2, ..., 31}</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
+     *         thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,   ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Combination (inclusive & exclusive) prefix scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data exchange
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the warp-wide broadcasts of values from
+     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Broadcast from lane0 in each warp to all other threads in the warp
+     *     int warp_id = threadIdx.x / 32;
+     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p thread_data will be
+     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
+     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
+     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
+     */
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
+    }
+
+    //@}  end member group
+
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/.cproject b/thrust/dependencies/cub/.cproject
new file mode 100644
index 0000000000000000000000000000000000000000..e76d1da67e7f85eeca10761a4613202c255605af
--- /dev/null
+++ b/thrust/dependencies/cub/.cproject
@@ -0,0 +1,1223 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings>
+					<externalSetting languages="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1945715073"/>
+				</externalSettings>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.Cygwin_PE" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="B40CTrunk" buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311" name="Default" parent="org.eclipse.cdt.build.core.emptycfg">
+					<folderInfo id="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113" name="/" resourcePath="">
+						<toolChain id="cdt.managedbuild.toolchain.gnu.cygwin.base.481495889" name="Cygwin GCC" superClass="cdt.managedbuild.toolchain.gnu.cygwin.base">
+							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.Cygwin_PE" id="cdt.managedbuild.target.gnu.platform.cygwin.base.100038061" name="Debug Platform" osList="win32" superClass="cdt.managedbuild.target.gnu.platform.cygwin.base"/>
+							<builder buildPath="${workspace_loc:/PrivateCub}/Default" id="cdt.managedbuild.target.gnu.builder.cygwin.base.412463247" keepEnvironmentInBuildfile="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.996758685" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
+								<option id="gnu.both.asm.option.include.paths.900454792" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.221302756" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.1353653670" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1401626953" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
+								<option id="gnu.cpp.compiler.option.include.paths.1909687606" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_functions.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
+								</option>
+								<option id="gnu.cpp.compiler.option.preprocessor.def.1893619952" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" useByScannerDiscovery="false" valueType="definedSymbols">
+									<listOptionValue builtIn="false" value="__device__"/>
+									<listOptionValue builtIn="false" value="__global__"/>
+									<listOptionValue builtIn="false" value="__shared__"/>
+									<listOptionValue builtIn="false" value="__forceinline__"/>
+									<listOptionValue builtIn="false" value="__host__"/>
+									<listOptionValue builtIn="false" value="__device_builtin__"/>
+									<listOptionValue builtIn="false" value="__device_builtin_texture_type__"/>
+									<listOptionValue builtIn="false" value="TEST_ARCH=200"/>
+									<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
+									<listOptionValue builtIn="false" value="__align__(...)"/>
+									<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
+									<listOptionValue builtIn="false" value="__CUDACC__=1"/>
+								</option>
+								<option id="gnu.cpp.compiler.option.dialect.std.49639338" name="Language standard" superClass="gnu.cpp.compiler.option.dialect.std" useByScannerDiscovery="true" value="gnu.cpp.compiler.dialect.default" valueType="enumerated"/>
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1708330939" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1940954787" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
+								<option id="gnu.c.compiler.option.include.paths.1945618846" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
+								</option>
+								<option id="gnu.c.compiler.option.preprocessor.def.symbols.1005509663" name="Defined symbols (-D)" superClass="gnu.c.compiler.option.preprocessor.def.symbols" useByScannerDiscovery="false" valueType="definedSymbols">
+									<listOptionValue builtIn="false" value="__device__"/>
+									<listOptionValue builtIn="false" value="__global__"/>
+									<listOptionValue builtIn="false" value="__shared__"/>
+									<listOptionValue builtIn="false" value="__forceinline__"/>
+									<listOptionValue builtIn="false" value="__host__"/>
+									<listOptionValue builtIn="false" value="__device_builtin__"/>
+									<listOptionValue builtIn="false" value="__device_builtin_texture_type__"/>
+									<listOptionValue builtIn="false" value="TEST_ARCH=200"/>
+									<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
+									<listOptionValue builtIn="false" value="__align__(...)"/>
+									<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
+									<listOptionValue builtIn="false" value="__CUDACC__=1"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.469104331" superClass="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.c.linker.cygwin.base.1600375047" name="Cygwin C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base.1176124124" name="Cygwin C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base">
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.958378367" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
+								</inputType>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.pathentry"/>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+			<storageModule moduleId="org.eclipse.cdt.core.language.mapping"/>
+			<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="B40CTrunk.null.1404415602" name="B40CTrunk"/>
+	</storageModule>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Default">
+			<resource resourceType="PROJECT" workspacePath="/GIT_CUB"/>
+		</configuration>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings">
+		<doc-comment-owner id="org.eclipse.cdt.ui.doxygen">
+			<path value=""/>
+		</doc-comment-owner>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+		<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+			<buildOutputProvider>
+				<openAction enabled="true" filePath=""/>
+				<parser enabled="true"/>
+			</buildOutputProvider>
+			<scannerInfoProvider id="specsFile">
+				<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+				<parser enabled="true"/>
+			</scannerInfoProvider>
+		</profile>
+		<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+			<buildOutputProvider>
+				<openAction enabled="true" filePath=""/>
+				<parser enabled="true"/>
+			</buildOutputProvider>
+			<scannerInfoProvider id="makefileGenerator">
+				<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+				<parser enabled="true"/>
+			</scannerInfoProvider>
+		</profile>
+		<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+			<buildOutputProvider>
+				<openAction enabled="true" filePath=""/>
+				<parser enabled="true"/>
+			</buildOutputProvider>
+			<scannerInfoProvider id="specsFile">
+				<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+				<parser enabled="true"/>
+			</scannerInfoProvider>
+		</profile>
+		<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+			<buildOutputProvider>
+				<openAction enabled="true" filePath=""/>
+				<parser enabled="true"/>
+			</buildOutputProvider>
+			<scannerInfoProvider id="specsFile">
+				<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+				<parser enabled="true"/>
+			</scannerInfoProvider>
+		</profile>
+		<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+			<buildOutputProvider>
+				<openAction enabled="true" filePath=""/>
+				<parser enabled="true"/>
+			</buildOutputProvider>
+			<scannerInfoProvider id="specsFile">
+				<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+				<parser enabled="true"/>
+			</scannerInfoProvider>
+		</profile>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1940954787;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.469104331">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1665401269;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.494265807">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.43985841;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1045483126">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1240277003;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1264397663">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.459535216;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.2120860882">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1758599759;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.466964704">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1401626953;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1708330939">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1671954574;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.304556051">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.2110267806;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.903720746">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1850250798;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1752562149">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1296776241;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.268633283">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.265387950;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.563557831">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.629007265;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.450470600">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.2085396856;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1885998497">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.652522784;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1098348915">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1149397878;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1156849140">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.586941236;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1654082299">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1214991320;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.332043455">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.440957653;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1117446939">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311;cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.158380621;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1945715073">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"/>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="makefileGenerator">
+					<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+			<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
+				<buildOutputProvider>
+					<openAction enabled="true" filePath=""/>
+					<parser enabled="true"/>
+				</buildOutputProvider>
+				<scannerInfoProvider id="specsFile">
+					<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
+					<parser enabled="true"/>
+				</scannerInfoProvider>
+			</profile>
+		</scannerConfigBuildInfo>
+	</storageModule>
+</cproject>
diff --git a/thrust/dependencies/cub/.github/workflows/mirror-main-to-master.yml b/thrust/dependencies/cub/.github/workflows/mirror-main-to-master.yml
new file mode 100644
index 0000000000000000000000000000000000000000..5c4707573542b45f2a7473993869495b46b888d8
--- /dev/null
+++ b/thrust/dependencies/cub/.github/workflows/mirror-main-to-master.yml
@@ -0,0 +1,17 @@
+on:
+  push:
+    branches:
+      - 'main'
+
+jobs:
+  mirror_job:
+    runs-on: ubuntu-latest
+    name: Mirror main branch to master branch
+    steps:
+    - name: Mirror action step
+      id: mirror
+      uses: google/mirror-branch-action@v1.0
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        source: 'main'
+        dest: 'master'
diff --git a/thrust/dependencies/cub/.gitignore b/thrust/dependencies/cub/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..3441f55e5e674a73e38d24750d9ee2e4a3947b6d
--- /dev/null
+++ b/thrust/dependencies/cub/.gitignore
@@ -0,0 +1 @@
+.p4config
diff --git a/thrust/dependencies/cub/.project b/thrust/dependencies/cub/.project
new file mode 100644
index 0000000000000000000000000000000000000000..7aca9e046bdb6a5376ec7512f00928df78d24f7e
--- /dev/null
+++ b/thrust/dependencies/cub/.project
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>GIT_CUB</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+	</natures>
+</projectDescription>
diff --git a/thrust/dependencies/cub/.settings/.gitignore b/thrust/dependencies/cub/.settings/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d81d4c414cc298fa21635d0b27c126c1b0a7ff3a
--- /dev/null
+++ b/thrust/dependencies/cub/.settings/.gitignore
@@ -0,0 +1 @@
+/language.settings.xml
diff --git a/thrust/dependencies/cub/.settings/org.eclipse.cdt.codan.core.prefs b/thrust/dependencies/cub/.settings/org.eclipse.cdt.codan.core.prefs
new file mode 100644
index 0000000000000000000000000000000000000000..64da7771b6ab31f4727594a0a40ab201f321bc5d
--- /dev/null
+++ b/thrust/dependencies/cub/.settings/org.eclipse.cdt.codan.core.prefs
@@ -0,0 +1,72 @@
+eclipse.preferences.version=1
+org.eclipse.cdt.codan.checkers.errnoreturn=Warning
+org.eclipse.cdt.codan.checkers.errnoreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.checkers.errreturnvalue=Error
+org.eclipse.cdt.codan.checkers.errreturnvalue.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nocommentinside=-Error
+org.eclipse.cdt.codan.checkers.nocommentinside.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nolinecomment=-Error
+org.eclipse.cdt.codan.checkers.nolinecomment.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.noreturn=Error
+org.eclipse.cdt.codan.checkers.noreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation=Error
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},no_break_comment\=>"no break",last_case_param\=>true,empty_case_param\=>false}
+org.eclipse.cdt.codan.internal.checkers.CatchByReference=Warning
+org.eclipse.cdt.codan.internal.checkers.CatchByReference.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},unknown\=>false,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem=Error
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization=Warning
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},skip\=>true}
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker=-Info
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},pattern\=>"^[a-z]",macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem=Error
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},paramNot\=>false}
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},else\=>false,afterelse\=>false}
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>("@(\#)","$Id")}
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+useParentScope=false
diff --git a/thrust/dependencies/cub/.settings/org.eclipse.cdt.core.prefs b/thrust/dependencies/cub/.settings/org.eclipse.cdt.core.prefs
new file mode 100644
index 0000000000000000000000000000000000000000..80b8e65c79275741503481b6bde1ec8524a00343
--- /dev/null
+++ b/thrust/dependencies/cub/.settings/org.eclipse.cdt.core.prefs
@@ -0,0 +1,177 @@
+eclipse.preferences.version=1
+indexer/indexAllFiles=true
+indexer/indexAllHeaderVersions=false
+indexer/indexAllVersionsSpecificHeaders=
+indexer/indexOnOpen=false
+indexer/indexUnusedHeadersWithAlternateLang=false
+indexer/indexUnusedHeadersWithDefaultLang=true
+indexer/indexerId=org.eclipse.cdt.core.fastIndexer
+indexer/skipFilesLargerThanMB=8
+indexer/skipImplicitReferences=false
+indexer/skipIncludedFilesLargerThanMB=16
+indexer/skipMacroReferences=false
+indexer/skipReferences=false
+indexer/skipTypeReferences=false
+indexer/useHeuristicIncludeResolution=true
+org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation=16
+org.eclipse.cdt.core.formatter.alignment_for_assignment=16
+org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_binary_expression=16
+org.eclipse.cdt.core.formatter.alignment_for_compact_if=0
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression=48
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression_chain=18
+org.eclipse.cdt.core.formatter.alignment_for_constructor_initializer_list=0
+org.eclipse.cdt.core.formatter.alignment_for_declarator_list=16
+org.eclipse.cdt.core.formatter.alignment_for_enumerator_list=48
+org.eclipse.cdt.core.formatter.alignment_for_expression_list=0
+org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer=16
+org.eclipse.cdt.core.formatter.alignment_for_member_access=0
+org.eclipse.cdt.core.formatter.alignment_for_overloaded_left_shift_chain=16
+org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration=48
+org.eclipse.cdt.core.formatter.brace_position_for_array_initializer=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block_in_case=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_method_declaration=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_switch=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_type_declaration=next_line
+org.eclipse.cdt.core.formatter.comment.min_distance_between_code_and_line_comment=1
+org.eclipse.cdt.core.formatter.comment.never_indent_line_comments_on_first_column=true
+org.eclipse.cdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=true
+org.eclipse.cdt.core.formatter.compact_else_if=true
+org.eclipse.cdt.core.formatter.continuation_indentation=1
+org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer=1
+org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_extra_spaces=0
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier=true
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header=false
+org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header=false
+org.eclipse.cdt.core.formatter.indent_empty_lines=false
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_block=true
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_body=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch=false
+org.eclipse.cdt.core.formatter.indentation.size=4
+org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_colon_in_constructor_initializer_list=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_after_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while=insert
+org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.join_wrapped_lines=true
+org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line=false
+org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line=true
+org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.lineSplit=80
+org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve=1
+org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line=true
+org.eclipse.cdt.core.formatter.tabulation.char=space
+org.eclipse.cdt.core.formatter.tabulation.size=4
+org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations=false
diff --git a/thrust/dependencies/cub/.settings/org.eclipse.cdt.ui.prefs b/thrust/dependencies/cub/.settings/org.eclipse.cdt.ui.prefs
new file mode 100644
index 0000000000000000000000000000000000000000..ca73f82defc6bf69c61eca8a2c4e26eb48f3c947
--- /dev/null
+++ b/thrust/dependencies/cub/.settings/org.eclipse.cdt.ui.prefs
@@ -0,0 +1,3 @@
+eclipse.preferences.version=1
+formatter_profile=_B40C
+formatter_settings_version=1
diff --git a/thrust/dependencies/cub/.settings/org.eclipse.core.runtime.prefs b/thrust/dependencies/cub/.settings/org.eclipse.core.runtime.prefs
new file mode 100644
index 0000000000000000000000000000000000000000..2e6330e751422a7636c692290b429116b2c55596
--- /dev/null
+++ b/thrust/dependencies/cub/.settings/org.eclipse.core.runtime.prefs
@@ -0,0 +1,4 @@
+content-types/enabled=true
+content-types/org.eclipse.cdt.core.cxxHeader/file-extensions=cuh
+content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu
+eclipse.preferences.version=1
diff --git a/thrust/dependencies/cub/CHANGELOG.md b/thrust/dependencies/cub/CHANGELOG.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c05ac274c68ae42b31d93dfcc7e06ddf8e28de9
--- /dev/null
+++ b/thrust/dependencies/cub/CHANGELOG.md
@@ -0,0 +1,848 @@
+# CUB 1.9.10-1 (NVIDIA HPC SDK 20.7, CUDA Toolkit 11.1)
+
+## Summary
+
+CUB 1.9.10-1 is the minor release accompanying the NVIDIA HPC SDK 20.7 release
+  and the CUDA Toolkit 11.1 release.
+
+## Bug Fixes
+
+- #1217: Move static local in `cub::DeviceCount` to a separate host-only
+    function because NVC++ doesn't support static locals in host-device
+    functions.
+
+# CUB 1.9.10 (NVIDIA HPC SDK 20.5)
+
+## Summary
+
+Thrust 1.9.10 is the release accompanying the NVIDIA HPC SDK 20.5 release.
+It adds CMake `find_package` support.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+
+## Breaking Changes
+
+- Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `CUB_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `CUB_IGNORE_DEPRECATED_CPP_11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+  `CUB_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+## New Features
+
+- CMake `find_package` support.
+  Just point CMake at the `cmake` folder in your CUB include directory
+    (ex: `cmake -DCUB_DIR=/usr/local/cuda/include/cub/cmake/ .`) and then you
+    can add CUB to your CMake project with `find_package(CUB REQUIRED CONFIG)`.
+
+# CUB 1.9.9 (CUDA 11.0)
+
+## Summary
+
+CUB 1.9.9 is the release accompanying the CUDA Toolkit 11.0 release.
+It introduces CMake support, version macros, platform detection machinery,
+  and support for NVC++, which uses Thrust (and thus CUB) to implement
+  GPU-accelerated C++17 Parallel Algorithms.
+Additionally, the scan dispatch layer was refactored and modernized.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+
+## Breaking Changes
+
+- Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `CUB_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `CUB_IGNORE_DEPRECATED_CPP11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `CUB_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+## New Features
+
+- CMake support.
+  Thanks to Francis Lemaire for this contribution.
+- Refactorized and modernized scan dispatch layer.
+  Thanks to Francis Lemaire for this contribution.
+- Policy hooks for device-wide reduce, scan, and radix sort facilities
+    to simplify tuning and allow users to provide custom policies.
+  Thanks to Francis Lemaire for this contribution.
+- `<cub/version.cuh>`: `CUB_VERSION`, `CUB_VERSION_MAJOR`, `CUB_VERSION_MINOR`,
+    `CUB_VERSION_SUBMINOR`, and `CUB_PATCH_NUMBER`.
+- Platform detection machinery:
+  - `<cub/util_cpp_dialect.cuh>`: Detects the C++ standard dialect.
+  - `<cub/util_compiler.cuh>`: host and device compiler detection.
+  - `<cub/util_deprecated.cuh>`: `CUB_DEPRECATED`.
+  - <cub/config.cuh>`: Includes `<cub/util_arch.cuh>`,
+      `<cub/util_compiler.cuh>`, `<cub/util_cpp_dialect.cuh>`,
+      `<cub/util_deprecated.cuh>`, `<cub/util_macro.cuh>`,
+      `<cub/util_namespace.cuh>`
+- `cub::DeviceCount` and `cub::DeviceCountUncached`, caching abstractions for
+    `cudaGetDeviceCount`.
+
+## Other Enhancements
+
+- Lazily initialize the per-device CUDAattribute caches, because CUDA context
+    creation is expensive and adds up with large CUDA binaries on machines with
+    many GPUs.
+  Thanks to the NVIDIA PyTorch team for bringing this to our attention.
+- Make `cub::SwitchDevice` avoid setting/resetting the device if the current
+    device is the same as the target device.
+
+## Bug Fixes
+
+- Add explicit failure parameter to CAS in the CUB attribute cache to workaround
+    a GCC 4.8 bug.
+- Revert a change in reductions that changed the signedness of the `lane_id`
+    variable to suppress a warning, as this introduces a bug in optimized device
+    code.
+- Fix initialization in `cub::ExclusiveSum`.
+  Thanks to Conor Hoekstra for this contribution.
+- Fix initialization of the `std::array` in the CUB attribute cache.
+- Fix `-Wsign-compare` warnings.
+  Thanks to Elias Stehle for this contribution.
+- Fix `test_block_reduce.cu` to build without parameters.
+  Thanks to Francis Lemaire for this contribution.
+- Add missing includes to `grid_even_share.cuh`.
+  Thanks to Francis Lemaire for this contribution.
+- Add missing includes to `thread_search.cuh`.
+  Thanks to Francis Lemaire for this contribution.
+- Add missing includes to `cub.cuh`.
+  Thanks to Felix Kallenborn for this contribution.
+
+# CUB 1.9.8-1 (NVIDIA HPC SDK 20.3)
+
+## Summary
+
+CUB 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3 release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms.
+
+# CUB 1.9.8 (CUDA 11.0 Early Access)
+
+## Summary
+
+CUB 1.9.8 is the first release of CUB to be officially supported and included
+  in the CUDA Toolkit.
+When compiling CUB in C++11 mode, CUB now caches calls to CUDA attribute query
+  APIs, which improves performance of these queries by 20x to 50x when they
+  are called concurrently by multiple host threads.
+
+## Enhancements
+
+- (C++11 or later) Cache calls to `cudaFuncGetAttributes` and
+    `cudaDeviceGetAttribute` within `cub::PtxVersion` and `cub::SmVersion`.
+    These CUDA APIs acquire locks to CUDA driver/runtime mutex and perform
+    poorly under contention; with the caching, they are 20 to 50x faster when
+    called concurrently.
+  Thanks to Bilge Acun for bringing this issue to our attention.
+- `DispatchReduce` now takes an `OutputT` template parameter so that users can
+    specify the intermediate type explicitly.
+- Radix sort tuning policies updates to fix performance issues for element
+    types smaller than 4 bytes.
+
+## Bug Fixes
+
+- Change initialization style from copy initialization to direct initialization
+    (which is more permissive) in `AgentReduce` to allow a wider range of types
+    to be used with it.
+- Fix bad signed/unsigned comparisons in `WarpReduce`.
+- Fix computation of valid lanes in warp-level reduction primitive to correctly
+    handle the case where there are 0 input items per warp.
+
+# CUB 1.8.0
+
+## Summary
+
+CUB 1.8.0 introduces changes to the `cub::Shuffle*` interfaces.
+
+## Breaking Changes
+
+- The interfaces of `cub::ShuffleIndex`, `cub::ShuffleUp`, and
+    `cub::ShuffleDown` have been changed to allow for better computation of the
+    PTX SHFL control constant for logical warps smaller than 32 threads.
+
+## Bug Fixes
+
+- #112: Fix `cub::WarpScan`'s broadcast of warp-wide aggregate for logical
+    warps smaller than 32 threads.
+
+# CUB 1.7.5
+
+## Summary
+
+CUB 1.7.5 adds support for radix sorting `__half` keys and improved sorting
+  performance for 1 byte keys.
+It was incorporated into Thrust 1.9.2.
+
+## Enhancements
+
+- Radix sort support for `__half` keys.
+- Radix sort tuning policy updates to improve 1 byte key performance.
+
+## Bug Fixes
+
+- Syntax tweaks to mollify Clang.
+- #127: `cub::DeviceRunLengthEncode::Encode` returns incorrect results.
+- #128: 7-bit sorting passes fail for SM61 with large values.
+
+# CUB 1.7.4
+
+## Summary
+
+CUB 1.7.4 is a minor release that was incorporated into Thrust 1.9.1-2.
+
+## Bug Fixes
+
+- #114: Can't pair non-trivially-constructible values in radix sort.
+- #115: `cub::WarpReduce` segmented reduction is broken in CUDA 9 for logical
+    warp sizes smaller than 32.
+
+# CUB 1.7.3
+
+## Summary
+
+CUB 1.7.3 is a minor release.
+
+## Bug Fixes
+
+- #110: `cub::DeviceHistogram` null-pointer exception bug for iterator inputs.
+
+# CUB 1.7.2
+
+## Summary
+
+CUB 1.7.2 is a minor release.
+
+## Bug Fixes
+
+- #104: Device-wide reduction is now "run-to-run" deterministic for
+    pseudo-associative reduction operators (like floating point addition).
+
+# CUB 1.7.1
+
+## Summary
+
+CUB 1.7.1 delivers improved radix sort performance on SM7x (Volta) GPUs and a
+  number of bug fixes.
+
+## Enhancements
+
+- Radix sort tuning policies updated for SM7x (Volta).
+
+## Bug Fixes
+
+- #104: `uint64_t` `cub::WarpReduce` broken for CUB 1.7.0 on CUDA 8 and older.
+- #103: Can't mix Thrust from CUDA 9.0 and CUB.
+- #102: CUB pulls in `windows.h` which defines `min`/`max` macros that conflict
+    with `std::min`/`std::max`.
+- #99: Radix sorting crashes NVCC on Windows 10 for SM52.
+- #98: cuda-memcheck: --tool initcheck failed with lineOfSight.
+- #94: Git clone size.
+- #93: Accept iterators for segment offsets.
+- #87: CUB uses anonymous unions which is not valid C++.
+- #44: Check for C++11 is incorrect for Visual Studio 2013.
+
+# CUB 1.7.0
+
+## Summary
+
+CUB 1.7.0 brings support for CUDA 9.0 and SM7x (Volta) GPUs.
+It is compatible with independent thread scheduling.
+It was incorporated into Thrust 1.9.0-5.
+
+## Breaking Changes
+
+- Remove `cub::WarpAll` and `cub::WarpAny`.
+  These functions served to emulate `__all` and `__any` functionality for
+    SM1x devices, which did not have those operations.
+  However, SM1x devices are now deprecated in CUDA, and the interfaces of these
+    two functions are now lacking the lane-mask needed for collectives to run on
+    SM7x and newer GPUs which have independent thread scheduling.
+
+## Other Enhancements
+
+- Remove any assumptions of implicit warp synchronization to be compatible with
+    SM7x's (Volta) independent thread scheduling.
+
+## Bug Fixes
+
+- #86: Incorrect results with reduce-by-key.
+
+# CUB 1.6.4
+
+## Summary
+
+CUB 1.6.4 improves radix sorting performance for SM5x (Maxwell) and SM6x
+  (Pascal) GPUs.
+
+## Enhancements
+
+- Radix sort tuning policies updated for SM5x (Maxwell) and SM6x (Pascal) -
+    3.5B and 3.4B 32 byte keys/s on TitanX and GTX 1080, respectively.
+
+## Bug Fixes
+
+- Restore fence work-around for scan (reduce-by-key, etc.) hangs in CUDA 8.5.
+- #65: `cub::DeviceSegmentedRadixSort` should allow inputs to have
+    pointer-to-const type.
+- Mollify Clang device-side warnings.
+- Remove out-dated MSVC project files.
+
+# CUB 1.6.3
+
+## Summary
+
+CUB 1.6.3 improves support for Windows, changes
+  `cub::BlockLoad`/`cub::BlockStore` interface to take the local data type,
+  and enhances radix sort performance for SM6x (Pascal) GPUs.
+
+## Breaking Changes
+
+- `cub::BlockLoad` and `cub::BlockStore` are now templated by the local data
+    type, instead of the `Iterator` type.
+  This allows for output iterators having `void` as their `value_type` (e.g.
+    discard iterators).
+
+## Other Enhancements
+
+- Radix sort tuning policies updated for SM6x (Pascal) GPUs - 6.2B 4 byte
+    keys/s on GP100.
+- Improved support for Windows (warnings, alignment, etc).
+
+## Bug Fixes
+
+- #74: `cub::WarpReduce` executes reduction operator for out-of-bounds items.
+- #72: `cub:InequalityWrapper::operator` should be non-const.
+- #71: `cub::KeyValuePair` won't work if `Key` has non-trivial constructor.
+- #69: cub::BlockStore::Store` doesn't compile if `OutputIteratorT::value_type`
+    isn't `T`.
+- #68: `cub::TilePrefixCallbackOp::WarpReduce` doesn't permit PTX arch
+    specialization.
+
+# CUB 1.6.2 (previously 1.5.5)
+
+## Summary
+
+CUB 1.6.2 (previously 1.5.5) improves radix sort performance for SM6x (Pascal)
+  GPUs.
+
+## Enhancements
+
+- Radix sort tuning policies updated for SM6x (Pascal) GPUs.
+
+## Bug Fixes
+
+- Fix AArch64 compilation of `cub::CachingDeviceAllocator`.
+
+# CUB 1.6.1 (previously 1.5.4)
+
+## Summary
+
+CUB 1.6.1 (previously 1.5.4) is a minor release.
+
+## Bug Fixes
+
+- Fix radix sorting bug introduced by scan refactorization.
+
+# CUB 1.6.0 (previously 1.5.3)
+
+## Summary
+
+CUB 1.6.0 changes the scan and reduce interfaces.
+Exclusive scans now accept an "initial value" instead of an "identity value".
+Scans and reductions now support differing input and output sequence types.
+Additionally, many bugs have been fixed.
+
+## Breaking Changes
+
+- Device/block/warp-wide exclusive scans have been revised to now accept an
+    "initial value" (instead of an "identity value") for seeding the computation
+    with an arbitrary prefix.
+- Device-wide reductions and scans can now have input sequence types that are
+    different from output sequence types (as long as they are convertible).
+
+## Other Enhancements
+
+- Reduce repository size by moving the doxygen binary to doc repository.
+- Minor reduction in `cub::BlockScan` instruction counts.
+
+## Bug Fixes
+
+- Issue #55: Warning in `cub/device/dispatch/dispatch_reduce_by_key.cuh`.
+- Issue #59: `cub::DeviceScan::ExclusiveSum` can't prefix sum of float into
+    double.
+- Issue #58: Infinite loop in `cub::CachingDeviceAllocator::NearestPowerOf`.
+- Issue #47: `cub::CachingDeviceAllocator` needs to clean up CUDA global error
+    state upon successful retry.
+- Issue #46: Very high amount of needed memory from the
+    `cub::DeviceHistogram::HistogramEven`.
+- Issue #45: `cub::CachingDeviceAllocator` fails with debug output enabled
+
+# CUB 1.5.2
+
+## Summary
+
+CUB 1.5.2 enhances `cub::CachingDeviceAllocator` and improves scan performance
+  for SM5x (Maxwell).
+
+## Enhancements
+
+- Improved medium-size scan performance on SM5x (Maxwell).
+- Refactored `cub::CachingDeviceAllocator`:
+  - Now spends less time locked.
+  - Uses C++11's `std::mutex` when available.
+  - Failure to allocate a block from the runtime will retry once after
+  		freeing cached allocations.
+  - Now respects max-bin, fixing an issue where blocks in excess of max-bin
+      were still being retained in the free cache.
+
+## Bug fixes:
+
+- Fix for generic-type reduce-by-key `cub::WarpScan` for SM3x and newer GPUs.
+
+# CUB 1.5.1
+
+## Summary
+
+CUB 1.5.1 is a minor release.
+
+## Bug Fixes
+
+- Fix for incorrect `cub::DeviceRadixSort` output for some small problems on
+    SM52 (Mawell) GPUs.
+- Fix for macro redefinition warnings when compiling `thrust::sort`.
+
+# CUB 1.5.0
+
+CUB 1.5.0 introduces segmented sort and reduction primitives.
+
+## New Features:
+
+- Segmented device-wide operations for device-wide sort and reduction primitives.
+
+## Bug Fixes:
+
+- #36: `cub::ThreadLoad` generates compiler errors when loading from
+    pointer-to-const.
+- #29: `cub::DeviceRadixSort::SortKeys<bool>` yields compiler errors.
+- #26: Misaligned address after `cub::DeviceRadixSort::SortKeys`.
+- #25: Fix for incorrect results and crashes when radix sorting 0-length
+    problems.
+- Fix CUDA 7.5 issues on SM52 GPUs with SHFL-based warp-scan and
+    warp-reduction on non-primitive data types (e.g. user-defined structs).
+- Fix small radix sorting problems where 0 temporary bytes were required and
+    users code was invoking `malloc(0)` on some systems where that returns
+    `NULL`.
+  CUB assumed the user was asking for the size again and not running the sort.
+
+# CUB 1.4.1
+
+## Summary
+
+CUB 1.4.1 is a minor release.
+
+## Enhancements
+
+- Allow `cub::DeviceRadixSort` and `cub::BlockRadixSort` on bool types.
+
+## Bug Fixes
+
+- Fix minor CUDA 7.0 performance regressions in `cub::DeviceScan` and
+    `cub::DeviceReduceByKey`.
+- Remove requirement for callers to define the `CUB_CDP` macro
+    when invoking CUB device-wide rountines using CUDA dynamic parallelism.
+- Fix headers not being included in the proper order (or missing includes)
+    for some block-wide functions.
+
+# CUB 1.4.0
+
+## Summary
+
+CUB 1.4.0 adds `cub::DeviceSpmv`, `cub::DeviceRunLength::NonTrivialRuns`,
+  improves `cub::DeviceHistogram`, and introduces support for SM5x (Maxwell)
+  GPUs.
+
+## New Features:
+
+- `cub::DeviceSpmv` methods for multiplying sparse matrices by
+    dense vectors, load-balanced using a merge-based parallel decomposition.
+- `cub::DeviceRadixSort` sorting entry-points that always return
+    the sorted output into the specified buffer, as opposed to the
+    `cub::DoubleBuffer` in which it could end up in either buffer.
+- `cub::DeviceRunLengthEncode::NonTrivialRuns` for finding the starting
+    offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in
+    a given sequence.
+  Useful for top-down partitioning algorithms like MSD sorting of very-large
+    keys.
+
+## Other Enhancements
+
+- Support and performance tuning for SM5x (Maxwell) GPUs.
+- Updated cub::DeviceHistogram implementation that provides the same
+    "histogram-even" and "histogram-range" functionality as IPP/NPP.
+  Provides extremely fast and, perhaps more importantly, very uniform
+    performance response across diverse real-world datasets, including
+    pathological (homogeneous) sample distributions.
+
+# CUB 1.3.2
+
+## Summary
+
+CUB 1.3.2 is a minor release.
+
+## Bug Fixes
+
+- Fix `cub::DeviceReduce` where reductions of small problems (small enough to
+    only dispatch a single thread block) would run in the default stream (stream
+    zero) regardless of whether an alternate stream was specified.
+
+# CUB 1.3.1
+
+## Summary
+
+CUB 1.3.1 is a minor release.
+
+## Bug Fixes
+
+- Workaround for a benign WAW race warning reported by cuda-memcheck
+    in `cub::BlockScan` specialized for `BLOCK_SCAN_WARP_SCANS` algorithm.
+- Fix bug in `cub::DeviceRadixSort` where the algorithm may sort more
+    key bits than the caller specified (up to the nearest radix digit).
+- Fix for ~3% `cub::DeviceRadixSort` performance regression on SM2x (Fermi) and
+    SM3x (Kepler) GPUs.
+
+# CUB 1.3.0
+
+## Summary
+
+CUB 1.3.0 improves how thread blocks are expressed in block- and warp-wide
+  primitives and adds an enhanced version of `cub::WarpScan`.
+
+## Breaking Changes
+
+- CUB's collective (block-wide, warp-wide) primitives underwent a minor
+    interface refactoring:
+  - To provide the appropriate support for multidimensional thread blocks,
+      The interfaces for collective classes are now template-parameterized by
+      X, Y, and Z block dimensions (with `BLOCK_DIM_Y` and `BLOCK_DIM_Z` being
+      optional, and `BLOCK_DIM_X` replacing `BLOCK_THREADS`).
+    Furthermore, the constructors that accept remapped linear
+      thread-identifiers have been removed: all primitives now assume a
+      row-major thread-ranking for multidimensional thread blocks.
+  - To allow the host program (compiled by the host-pass) to accurately
+      determine the device-specific storage requirements for a given collective
+      (compiled for each device-pass), the interfaces for collective classes
+      are now (optionally) template-parameterized by the desired PTX compute
+      capability.
+    This is useful when aliasing collective storage to shared memory that has
+      been allocated dynamically by the host at the kernel call site.
+  - Most CUB programs having typical 1D usage should not require any
+      changes to accomodate these updates.
+
+## New Features
+
+- Added "combination" `cub::WarpScan` methods for efficiently computing
+    both inclusive and exclusive prefix scans (and sums).
+
+## Bug Fixes
+
+- Fix for bug in `cub::WarpScan` (which affected `cub::BlockScan` and
+    `cub::DeviceScan`) where incorrect results (e.g., NAN) would often be
+    returned when parameterized for floating-point types (fp32, fp64).
+- Workaround for ptxas error when compiling with with -G flag on Linux (for
+    debug instrumentation).
+- Fixes for certain scan scenarios using custom scan operators where code
+    compiled for SM1x is run on newer GPUs of higher compute-capability: the
+    compiler could not tell which memory space was being used collective
+    operations and was mistakenly using global ops instead of shared ops.
+
+# CUB 1.2.3
+
+## Summary
+
+CUB 1.2.3 is a minor release.
+
+## Bug Fixes
+
+- Fixed access violation bug in `cub::DeviceReduce::ReduceByKey` for
+    non-primitive value types.
+- Fixed code-snippet bug in `ArgIndexInputIteratorT` documentation.
+
+# CUB 1.2.2
+
+## Summary
+
+CUB 1.2.2 adds a new variant of `cub::BlockReduce` and MSVC project solections
+  for examples.
+
+## New Features
+
+- MSVC project solutions for device-wide and block-wide examples
+- New algorithmic variant of cub::BlockReduce for improved performance
+    when using commutative operators (e.g., numeric addition).
+
+## Bug Fixes
+
+- Inclusion of Thrust headers in a certain order prevented CUB device-wide
+    primitives from working properly.
+
+# CUB 1.2.0
+
+## Summary
+
+CUB 1.2.0 adds `cub::DeviceReduce::ReduceByKey` and
+  `cub::DeviceReduce::RunLengthEncode` and support for CUDA 6.0.
+
+## New Features
+
+- `cub::DeviceReduce::ReduceByKey`.
+- `cub::DeviceReduce::RunLengthEncode`.
+
+## Other Enhancements
+
+- Improved `cub::DeviceScan`, `cub::DeviceSelect`, `cub::DevicePartition`
+    performance.
+- Documentation and testing:
+  - Added performance-portability plots for many device-wide primitives.
+  - Explain that iterator (in)compatibilities with CUDA 5.0 (and older) and
+      Thrust 1.6 (and older).
+- Revised the operation of temporary tile status bookkeeping for
+    `cub::DeviceScan` (and similar) to be safe for current code run on future
+    platforms (now uses proper fences).
+
+## Bug Fixes
+
+- Fix `cub::DeviceScan` bug where Windows alignment disagreements between host
+    and device regarding user-defined data types would corrupt tile status.
+- Fix `cub::BlockScan` bug where certain exclusive scans on custom data types
+    for the `BLOCK_SCAN_WARP_SCANS` variant would return incorrect results for
+    the first thread in the block.
+- Added workaround to make `cub::TexRefInputIteratorT` work with CUDA 6.0.
+
+# CUB 1.1.1
+
+## Summary
+
+CUB 1.1.1 introduces texture and cache modifier iterators, descending sorting,
+  `cub::DeviceSelect`, `cub::DevicePartition`, `cub::Shuffle*`, and
+  `cub::MaxSMOccupancy`.
+Additionally, scan and sort performance for older GPUs has been improved and
+  many bugs have been fixed.
+
+## Breaking Changes
+
+- Refactored block-wide I/O (`cub::BlockLoad` and `cub::BlockStore`), removing
+    cache-modifiers from their interfaces.
+  `cub::CacheModifiedInputIterator` and `cub::CacheModifiedOutputIterator`
+    should now be used with `cub::BlockLoad` and `cub::BlockStore` to effect that
+    behavior.
+
+## New Features
+
+- `cub::TexObjInputIterator`, `cub::TexRefInputIterator`,
+    `cub::CacheModifiedInputIterator`, and `cub::CacheModifiedOutputIterator`
+    types for loading & storing arbitrary types through the cache hierarchy.
+  They are compatible with Thrust.
+- Descending sorting for `cub::DeviceRadixSort` and `cub::BlockRadixSort`.
+- Min, max, arg-min, and arg-max operators for `cub::DeviceReduce`.
+- `cub::DeviceSelect` (select-unique, select-if, and select-flagged).
+- `cub::DevicePartition` (partition-if, partition-flagged).
+- Generic `cub::ShuffleUp`, `cub::ShuffleDown`, and `cub::ShuffleIndex` for
+    warp-wide communication of arbitrary data types (SM3x and up).
+- `cub::MaxSmOccupancy` for accurately determining SM occupancy for any given
+    kernel function pointer.
+
+## Other Enhancements
+
+- Improved `cub::DeviceScan` and `cub::DeviceRadixSort` performance for older
+    GPUs (SM1x to SM3x).
+- Renamed device-wide `stream_synchronous` param to `debug_synchronous` to
+    avoid confusion about usage.
+- Documentation improvements:
+  - Added simple examples of device-wide methods.
+  - Improved doxygen documentation and example snippets.
+- Improved test coverege to include up to 21,000 kernel variants and 851,000
+    unit tests (per architecture, per platform).
+
+## Bug Fixes
+
+- Fix misc `cub::DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when
+    operating on non-primitive types for older architectures SM1x.
+- SHFL-based scans and reductions produced incorrect results for multi-word
+    types (size > 4B) on Linux.
+- For `cub::WarpScan`-based scans, not all threads in the first warp were
+    entering the prefix callback functor.
+- `cub::DeviceRadixSort` had a race condition with key-value pairs for pre-SM35
+    architectures.
+- `cub::DeviceRadixSor` bitfield-extract behavior with long keys on 64-bit
+    Linux was incorrect.
+- `cub::BlockDiscontinuity` failed to compile for types other than
+    `int32_t`/`uint32_t`.
+- CUDA Dynamic Parallelism (CDP, e.g. device-callable) versions of device-wide
+    methods now report the same temporary storage allocation size requirement as
+    their host-callable counterparts.
+
+# CUB 1.0.2
+
+## Summary
+
+CUB 1.0.2 is a minor release.
+
+## Bug Fixes
+
+- Corrections to code snippet examples for `cub::BlockLoad`, `cub::BlockStore`,
+    and `cub::BlockDiscontinuity`.
+- Cleaned up unnecessary/missing header includes.
+  You can now safely include a specific .cuh (instead of `cub.cuh`).
+- Bug/compilation fixes for `cub::BlockHistogram`.
+
+# CUB 1.0.1
+
+## Summary
+
+CUB 1.0.1 adds `cub::DeviceRadixSort` and `cub::DeviceScan`.
+Numerous other performance and correctness fixes and included.
+
+## Breaking Changes
+
+- New collective interface idiom (specialize/construct/invoke).
+
+## New Features
+
+- `cub::DeviceRadixSort`.
+  Implements short-circuiting for homogenous digit passes.
+- `cub::DeviceScan`.
+  Implements single-pass "adaptive-lookback" strategy.
+
+## Other Enhancements
+
+- Significantly improved documentation (with example code snippets).
+- More extensive regression test suit for aggressively testing collective
+    variants.
+- Allow non-trially-constructed types (previously unions had prevented aliasing
+    temporary storage of those types).
+- Improved support for SM3x SHFL (collective ops now use SHFL for types larger
+    than 32 bits).
+- Better code generation for 64-bit addressing within
+    `cub::BlockLoad`/`cub::BlockStore`.
+- `cub::DeviceHistogram` now supports histograms of arbitrary bins.
+- Updates to accommodate CUDA 5.5 dynamic parallelism.
+
+## Bug Fixes
+
+- Workarounds for SM10 codegen issues in uncommonly-used
+    `cub::WarpScan`/`cub::WarpReduce` specializations.
+
+# CUB 0.9.4
+
+## Summary
+
+CUB 0.9.3 is a minor release.
+
+## Enhancements
+
+- Various documentation updates and corrections.
+
+## Bug Fixes
+
+- Fixed compilation errors for SM1x.
+- Fixed compilation errors for some WarpScan entrypoints on SM3x and up.
+
+# CUB 0.9.3
+
+## Summary
+
+CUB 0.9.3 adds histogram algorithms and work management utility descriptors.
+
+## New Features
+
+- `cub::DevicHistogram256`.
+- `cub::BlockHistogram256`.
+- `cub::BlockScan` algorithm variant `BLOCK_SCAN_RAKING_MEMOIZE`, which
+    trades more register consumption for less shared memory I/O.
+- `cub::GridQueue`, `cub::GridEvenShare`, work management utility descriptors.
+
+## Other Enhancements
+
+- Updates to `cub::BlockRadixRank` to use `cub::BlockScan`, which improves
+    performance on SM3x by using SHFL.
+- Allow types other than builtin types to be used in `cub::WarpScan::*Sum`
+    methods if they only have `operator+` overloaded.
+  Previously they also required to support assignment from `int(0)`.
+- Update `cub::BlockReduce`'s `BLOCK_REDUCE_WARP_REDUCTIONS` algorithm to work
+    even when block size is not an even multiple of warp size.
+- Refactoring of `cub::DeviceAllocator` interface and
+    `cub::CachingDeviceAllocator` implementation.
+
+# CUB 0.9.2
+
+## Summary
+
+CUB 0.9.2 adds `cub::WarpReduce`.
+
+## New Features
+
+- `cub::WarpReduce`, which uses the SHFL instruction when applicable.
+  `cub::BlockReduce` now uses this `cub::WarpReduce` instead of implementing
+    its own.
+
+## Enhancements
+
+- Documentation updates and corrections.
+
+## Bug Fixes
+
+- Fixes for 64-bit Linux compilation warnings and errors.
+
+# CUB 0.9.1
+
+## Summary
+
+CUB 0.9.1 is a minor release.
+
+## Bug Fixes
+
+- Fix for ambiguity in `cub::BlockScan::Reduce` between generic reduction and
+    summation.
+  Summation entrypoints are now called `::Sum()`, similar to the
+    convention in `cub::BlockScan`.
+- Small edits to documentation and download tracking.
+
+# CUB 0.9.0
+
+## Summary
+
+Initial preview release.
+CUB is the first durable, high-performance library of cooperative block-level,
+  warp-level, and thread-level primitives for CUDA kernel programming.
+
diff --git a/thrust/dependencies/cub/CMakeLists.txt b/thrust/dependencies/cub/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..39a8824feee3d76894fa992fe4465127d15ba7ff
--- /dev/null
+++ b/thrust/dependencies/cub/CMakeLists.txt
@@ -0,0 +1,65 @@
+# Will be increased to 3.18 when C++17 is enabled:
+cmake_minimum_required(VERSION 3.15)
+
+# Remove this when we use the new CUDA_ARCHITECTURES properties.
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+# CXX is only needed for AppendOptionIfAvailable.
+project(CUB CUDA CXX)
+
+include(cmake/AppendOptionIfAvailable.cmake)
+include(cmake/CubBuildCompilerTargets.cmake)
+include(cmake/CubBuildTargetList.cmake)
+include(cmake/CubCudaConfig.cmake)
+include(cmake/CubInstallRules.cmake)
+
+option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
+option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
+option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
+
+# Check if we're actually building anything before continuing. If not, no need
+# to search for deps, etc. This is a common approach for packagers that just
+# need the install rules. See GH issue thrust/thrust#1211.
+if (NOT (CUB_ENABLE_HEADER_TESTING OR
+         CUB_ENABLE_TESTING OR
+         CUB_ENABLE_EXAMPLES))
+  return()
+endif()
+
+if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
+
+  set_property(
+    CACHE CMAKE_BUILD_TYPE
+    PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel
+  )
+endif ()
+
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside
+# Thrust targets when building as part of Thrust.
+set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
+set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
+
+cub_build_target_list()
+
+if (CUB_ENABLE_HEADER_TESTING)
+  include(cmake/CubHeaderTesting.cmake)
+endif()
+
+# Both testing and examples use ctest
+if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES)
+  include(CTest)
+  enable_testing()
+endif()
+
+if (CUB_ENABLE_TESTING)
+  add_subdirectory(test)
+endif()
+
+if (CUB_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
diff --git a/thrust/dependencies/cub/CODE_OF_CONDUCT.md b/thrust/dependencies/cub/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b5aa85d48b65c3f5700b2e155aa1afcabb0ae96
--- /dev/null
+++ b/thrust/dependencies/cub/CODE_OF_CONDUCT.md
@@ -0,0 +1,59 @@
+# Contributor Covenant Code of Conduct
+
+## Overview
+
+Define the code of conduct followed and enforced for CUB
+
+### Intended audience
+
+COMMUNITY | DEVELOPERS | PROJECT LEADS
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+-   Using welcoming and inclusive language
+-   Being respectful of differing viewpoints and experiences
+-   Gracefully accepting constructive criticism
+-   Focusing on what is best for the community
+-   Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+-   The use of sexualized language or imagery and unwelcome sexual attention or advances
+-   Trolling, insulting/derogatory comments, and personal or political attacks
+-   Public or private harassment
+-   Publishing others’ private information, such as a physical or electronic address, without explicit permission
+-   Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at  [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com)  All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership.
+
+## Attribution
+
+This Code of Conduct was taken from the [NVIDIA RAPIDS](https://docs.rapids.ai/resources/conduct/) project, which was adapted from the  [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq
+
+## Contact
+
+If you need to contact the CUB team, please reach out to one of the following email addresses:
+- cub-conduct@nvidia.com
+- libcudacxx-conduct@nvidia.com
diff --git a/thrust/dependencies/cub/CONTRIBUTING.md b/thrust/dependencies/cub/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd641756387902cb775c44e669564997dd83f325
--- /dev/null
+++ b/thrust/dependencies/cub/CONTRIBUTING.md
@@ -0,0 +1,366 @@
+# Table of Contents
+
+1. [Contributing to CUB](#contributing-to-cub)
+1. [CMake Options](#cmake-options)
+1. [Development Model](#development-model)
+
+# Contributing to CUB
+
+CUB uses Github to manage all open-source development, including bug tracking,
+pull requests, and design discussions. This document details how to get
+started as a CUB contributor.
+
+An overview of this process is:
+
+1. [Clone the CUB repository](#clone-the-cub-repository)
+1. [Setup a fork of CUB](#setup-a-fork-of-cub)
+1. [Setup your environment](#setup-your-environment)
+1. [Create a development branch](#create-a-development-branch)
+1. [Local development loop](#local-development-loop)
+1. [Push development branch to your fork](#push-development-branch-to-your-fork)
+1. [Create pull request](#create-pull-request)
+1. [Address feedback and update pull request](#address-feedback-and-update-pull-request)
+1. [When your PR is approved...](#when-your-pr-is-approved)
+
+## Clone the CUB Repository
+
+To get started, clone the main repository to your local computer:
+
+```
+git clone https://github.com/thrust/cub.git
+cd cub
+```
+
+## Setup a Fork of CUB
+
+You'll need a fork of CUB on Github to create a pull request. To setup your
+fork:
+
+1. Create a Github account (if needed)
+2. Go to [the CUB Github page](https://github.com/thrust/cub)
+3. Click "Fork" and follow any prompts that appear.
+
+Once your fork is created, setup a new remote repo in your local CUB clone:
+
+```
+git remote add github-fork git@github.com:<GITHUB_USERNAME>/cub.git
+```
+
+## Setup Your Environment
+
+### Git Environment
+
+If you haven't already, this is a good time to tell git who you are. This
+information is used to fill out authorship information on your git commits.
+
+```
+git config --global user.name "John Doe"
+git config --global user.email johndoe@example.com
+```
+
+### Configure CMake builds
+
+CUB uses [CMake](https://www.cmake.org) for its developer build system. To
+configure, build, and test your checkout of CUB with default settings:
+
+```
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only)
+cmake-gui  # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+See [CMake Options](#cmake-options) for details on customizing the build.
+
+## Create a Development Branch
+
+All work should be done in a development branch (also called a "topic branch")
+and not directly in the `master` branch. This makes it easier to manage multiple
+in-progress patches at once, and provides a descriptive label for your patch
+as it passes through the review system.
+
+To create a new branch based on the current `master`:
+
+```
+# Checkout local master branch:
+cd /path/to/cub/sources
+git checkout master
+
+# Sync local master branch with github:
+git pull
+
+# Create a new branch named `my_descriptive_branch_name` based on master:
+git checkout -b my_descriptive_branch_name
+
+# Verify that the branch has been created and is currently checked out:
+git branch
+```
+
+CUB branch names should follow a particular pattern:
+
+- For new features, name the branch `feature/<name>`
+- For bugfixes associated with a github issue, use `bug/github/<bug-description>-<bug-id>`
+  - Internal nvidia and gitlab bugs should use `nvidia` or `gitlab` in place of
+    `github`.
+
+## Local Development Loop
+
+### Edit, Build, Test, Repeat
+
+Once the topic branch is created, you're all set to start working on CUB
+code. Make some changes, then build and test them:
+
+```
+# Implement changes:
+cd /path/to/cub/sources
+emacs cub/some_file.cuh # or whatever editor you prefer
+
+# Create / update a unit test for your changes:
+emacs tests/some_test.cu
+
+# Check that everything builds and tests pass:
+cd /path/to/cub/build/directory
+cmake --build . -j <num_jobs> # or make, ninja, etc
+ctest
+```
+
+### Creating a Commit
+
+Once you're satisfied with your patch, commit your changes:
+
+```
+# Manually add changed files and create a commit:
+cd /path/to/cub
+git add cub/some_file.cuh
+git add tests/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+git gui
+```
+
+#### Writing a Commit Message
+
+Your commit message will communicate the purpose and rationale behind your
+patch to other developers, and will be used to populate the initial description
+of your Github pull request.
+
+When writing a commit message, the following standard format should be used,
+since tools in the git ecosystem are designed to parse this correctly:
+
+```
+First line of commit message is a short summary (<80 char)
+<Second line left blank>
+Detailed description of change begins on third line. This portion can
+span multiple lines, try to manually wrap them at something reasonable.
+
+Blank lines can be used to separate multiple paragraphs in the description.
+
+If your patch is associated with another pull request or issue in the main
+CUB repository, you should reference it with a `#` symbol, e.g.
+#1023 for issue 1023.
+
+For issues / pull requests in a different github repo, reference them using
+the full syntax, e.g. thrust/thrust#4 for issue 4 in the thrust/thrust repo.
+
+Markdown is recommended for formatting more detailed messages, as these will
+be nicely rendered on Github, etc.
+```
+
+## Push Development Branch to your Fork
+
+Once you've committed your changes to a local development branch, it's time to
+push them to your fork:
+
+```
+cd /path/to/cub/checkout
+git checkout my_descriptive_branch_name # if not already checked out
+git push --set-upstream github-fork my_descriptive_branch_name
+```
+
+`--set-upstream github-fork` tells git that future pushes/pulls on this branch
+should target your `github-fork` remote by default.
+
+## Create Pull Request
+
+To create a pull request for your freshly pushed branch, open your github fork
+in a browser by going to `https://www.github.com/<GITHUB_USERNAME>/cub`. A
+prompt may automatically appear asking you to create a pull request if you've
+recently pushed a branch.
+
+If there's no prompt, go to "Code" > "Branches" and click the appropriate
+"New pull request" button for your branch.
+
+If you would like a specific developer to review your patch, feel free to
+request them as a reviewer at this time.
+
+The CUB team will review your patch, test it on NVIDIA's internal CI, and
+provide feedback.
+
+## Address Feedback and Update Pull Request
+
+If the reviewers request changes to your patch, use the following process to
+update the pull request:
+
+```
+# Make changes:
+cd /path/to/cub/sources
+git checkout my_descriptive_branch_name
+emacs cub/some_file.cuh
+emacs tests/some_test.cu
+
+# Build + test
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+
+# Amend commit:
+cd /path/to/cub/sources
+git add cub/some_file.cuh
+git add tests/some_test.cu
+git commit --amend
+# Or
+git gui # Check the "Amend Last Commit" box
+
+# Update the branch on your fork:
+git push -f
+```
+
+At this point, the pull request should show your recent changes.
+
+## When Your PR is Approved
+
+Once your pull request is approved by the CUB team, no further action is
+needed from you. We will handle integrating it since we must coordinate changes
+to `master` with NVIDIA's internal perforce repository.
+
+# CMake Options
+
+A CUB build is configured using CMake options. These may be passed to CMake
+using
+
+```
+cmake -D<option_name>=<value> /path/to/cub/sources
+```
+
+or configured interactively with the `ccmake` or `cmake-gui` interfaces.
+
+The configuration options for CUB are:
+
+- `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
+  - Standard CMake build option. Default: `RelWithDebInfo`
+- `CUB_ENABLE_HEADER_TESTING={ON, OFF}`
+  - Whether to test compile public headers. Default is `ON`.
+- `CUB_ENABLE_TESTING={ON, OFF}`
+  - Whether to build unit tests. Default is `ON`.
+- `CUB_ENABLE_EXAMPLES={ON, OFF}`
+  - Whether to build examples. Default is `ON`.
+- `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}`
+  - Toggle whether a specific C++ dialect will be targeted.
+  - Multiple dialects may be targeted in a single build.
+  - Possible values of `XX` are `{11, 14, 17}`.
+  - By default, only C++14 is enabled.
+- `CUB_ENABLE_COMPUTE_XX={ON, OFF}`
+  - Controls the targeted CUDA architecture(s)
+  - Multiple options may be selected when using NVCC as the CUDA compiler.
+  - Valid values of `XX` are:
+    `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
+  - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`:
+- `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}`
+  - If enabled, CUDA objects will target the most recent virtual architecture
+    in addition to the real architectures specified by the
+    `CUB_ENABLE_COMPUTE_XX` options.
+  - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`:
+- `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
+  - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`.
+  - Default: `OFF` (meaning all architectures are enabled by default)
+- `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building tests.
+    Default is `OFF`.
+- `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building examples.
+    Default is `OFF`.
+
+# Development Model
+
+The following is a description of the basic development process that CUB follows. This is a living
+document that will evolve as our process evolves.
+
+CUB is distributed in three ways:
+
+   * On GitHub.
+   * In the NVIDIA HPC SDK.
+   * In the CUDA Toolkit.
+
+## Trunk Based Development
+
+CUB uses [trunk based development](https://trunkbaseddevelopment.com). There is a single long-lived
+branch called `master`. Engineers may create branches for feature development. Such branches always
+merge into `master`. There are no release branches. Releases are produced by taking a snapshot of
+`master` ("snapping"). After a release has been snapped from `master`, it will never be changed.
+
+## Repositories
+
+As CUB is developed both on GitHub and internally at NVIDIA, there are three main places where code lives:
+
+   * The Source of Truth, the [public CUB repository](https://github.com/thrust/cub), referred to as
+     `github` later in this document.
+   * An internal GitLab repository, referred to as `gitlab` later in this document.
+   * An internal Perforce repository, referred to as `perforce` later in this document.
+
+## Versioning
+
+CUB has its own versioning system for releases, independent of the versioning scheme of the NVIDIA
+HPC SDK or the CUDA Toolkit.
+
+Today, CUB version numbers have a specific [semantic meaning](https://semver.org/).
+Releases prior to 1.10.0 largely, but not strictly, followed these semantic meanings.
+
+The version number for a CUB release uses the following format: `MMM.mmm.ss-ppp`, where:
+
+   * `CUB_VERSION_MAJOR`/`MMM`: Major version, up to 3 decimal digits. It is incremented
+     when the fundamental nature of the library evolves, leading to widespread changes across the
+     entire library interface with no guarantee of API, ABI, or semantic compatibility with former
+     versions.
+   * `CUB_VERSION_MINOR`/`mmm`: Minor version, up to 3 decimal digits. It is incremented when
+     breaking API, ABI, or semantic changes are made.
+   * `CUB_VERSION_SUBMINOR`/`ss`: Subminor version, up to 2 decimal digits. It is incremented
+     when notable new features or bug fixes or features that are API, ABI, and semantic backwards
+     compatible are added.
+   * `CUB_PATCH_NUMBER`/`ppp`: Patch number, up to 3 decimal digits. It is incremented if any
+     change in the repo whatsoever is made and no other version component has been incremented.
+
+The `<cub/version.h>` header defines `CUB_*` macros for all of the version components mentioned
+above. Additionally, a `CUB_VERSION` macro is defined, which is an integer literal containing all
+of the version components except for `CUB_PATCH_NUMBER`.
+
+## Branches and Tags
+
+The following tag names are used in the CUB project:
+
+  * `github/nvhpc-X.Y`: the tag that directly corresponds to what has been shipped in the NVIDIA HPC SDK release X.Y.
+  * `github/cuda-X.Y`: the tag that directly corresponds to what has been shipped in the CUDA Toolkit release X.Y.
+  * `github/A.B.C`: the tag that directly corresponds to a CUB version A.B.C.
+
+The following branch names are used in the CUB project:
+
+  * `github/master`: the Source of Truth development branch of CUB.
+  * `github/old-master`: the old Source of Truth branch, before unification of public and internal repositories.
+  * `github/feature/<name>`: feature branch for a feature under development.
+  * `github/bug/<bug-system>/<bug-description>-<bug-id>`: bug fix branch, where `bug-system` is `github` or `nvidia`.
+  * `gitlab/master`: mirror of `github/master`.
+  * `perforce/private`: mirrored `github/master`, plus files necessary for internal NVIDIA testing systems.
+
+On the rare occasion that we cannot do work in the open, for example when developing a change specific to an
+unreleased product, these branches may exist on `gitlab` instead of `github`. By default, everything should be
+in the open on `github` unless there is a strong motivation for it to not be open.
diff --git a/thrust/dependencies/cub/LICENSE.TXT b/thrust/dependencies/cub/LICENSE.TXT
new file mode 100644
index 0000000000000000000000000000000000000000..a678e64f8ccc1d7fdce261ed45a8d3ad4b3274c0
--- /dev/null
+++ b/thrust/dependencies/cub/LICENSE.TXT
@@ -0,0 +1,24 @@
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/thrust/dependencies/cub/README.md b/thrust/dependencies/cub/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..18ad2298fd7d10d864d64a022f17ad6743501697
--- /dev/null
+++ b/thrust/dependencies/cub/README.md
@@ -0,0 +1,189 @@
+<hr>
+<h3>About CUB</h3>
+
+CUB provides state-of-the-art, reusable software components for every layer
+of the CUDA programming model:
+- [<b><em>Device-wide primitives</em></b>](https://nvlabs.github.com/cub/group___device_module.html)
+  - Sort, prefix scan, reduction, histogram, etc.
+  - Compatible with CUDA dynamic parallelism
+- [<b><em>Block-wide "collective" primitives</em></b>](https://nvlabs.github.com/cub/group___block_module.html)
+  - I/O, sort, prefix scan, reduction, histogram, etc.
+  - Compatible with arbitrary thread block sizes and types
+- [<b><em>Warp-wide "collective" primitives</em></b>](https://nvlabs.github.com/cub/group___warp_module.html)
+  - Warp-wide prefix scan, reduction, etc.
+  - Safe and architecture-specific
+- [<b><em>Thread and resource utilities</em></b>](https://nvlabs.github.com/cub/group___thread_module.html)
+  - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc.
+
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)
+
+CUB is included in the NVIDIA HPC SDK and the CUDA Toolkit.
+
+We recommend the [CUB Project Website](http://nvlabs.github.com/cub) for further information and examples.
+
+<br><hr>
+<h3>A Simple Example</h3>
+
+```C++
+#include <cub/cub.cuh>
+
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
+{
+     using namespace cub;
+
+     // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
+     // owning 16 integer items each
+     typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
+     typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
+
+     // Allocate shared memory
+     __shared__ union {
+         typename BlockRadixSort::TempStorage  sort;
+         typename BlockLoad::TempStorage       load;
+         typename BlockStore::TempStorage      store;
+     } temp_storage;
+
+     int block_offset = blockIdx.x * (128 * 16);	  // OffsetT for this block's ment
+
+     // Obtain a segment of 2048 consecutive keys that are blocked across threads
+     int thread_keys[16];
+     BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+     __syncthreads();
+
+     // Collectively sort the keys
+     BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+     __syncthreads();
+
+     // Store the sorted segment
+     BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
+}
+```
+
+Each thread block uses `cub::BlockRadixSort` to collectively sort
+its own input segment.  The class is specialized by the
+data type being sorted, by the number of threads per block, by the number of
+keys per thread, and implicitly by the targeted compilation architecture.
+
+The `cub::BlockLoad` and `cub::BlockStore` classes are similarly specialized.
+Furthermore, to provide coalesced accesses to device memory, these primitives are
+configured to access memory using a striped access pattern (where consecutive threads
+simultaneously access consecutive items) and then <em>transpose</em> the keys into
+a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads.
+
+Once specialized, these classes expose opaque `TempStorage` member types.
+The thread block uses these storage types to statically allocate the union of
+shared memory needed by the thread block.  (Alternatively these storage types
+could be aliased to global memory allocations).
+
+<br><hr>
+<h3>Releases</h3>
+
+CUB is distributed with the NVIDIA HPC SDK and the CUDA Toolkit in addition
+to GitHub.
+
+See the [changelog](CHANGELOG.md) for details about specific releases.
+
+| CUB Release               | Included In                             |
+| ------------------------- | --------------------------------------- |
+| 1.9.10-1                  | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1 |
+| 1.9.10                    | NVIDIA HPC SDK 20.5                     |
+| 1.9.9                     | CUDA Toolkit 11.0                       |
+| 1.9.8-1                   | NVIDIA HPC SDK 20.3                     |
+| 1.9.8                     | CUDA Toolkit 11.0 Early Access          |
+| 1.9.8                     | CUDA 11.0 Early Access                  |
+| 1.8.0                     |                                         |
+| 1.7.5                     | Thrust 1.9.2                            |
+| 1.7.4                     | Thrust 1.9.1-2                          |
+| 1.7.3                     |                                         |
+| 1.7.2                     |                                         |
+| 1.7.1                     |                                         |
+| 1.7.0                     | Thrust 1.9.0-5                          |
+| 1.6.4                     |                                         |
+| 1.6.3                     |                                         |
+| 1.6.2 (previously 1.5.5)  |                                         |
+| 1.6.1 (previously 1.5.4)  |                                         |
+| 1.6.0 (previously 1.5.3)  |                                         |
+| 1.5.2                     |                                         |
+| 1.5.1                     |                                         |
+| 1.5.0                     |                                         |
+| 1.4.1                     |                                         |
+| 1.4.0                     |                                         |
+| 1.3.2                     |                                         |
+| 1.3.1                     |                                         |
+| 1.3.0                     |                                         |
+| 1.2.3                     |                                         |
+| 1.2.2                     |                                         |
+| 1.2.0                     |                                         |
+| 1.1.1                     |                                         |
+| 1.0.2                     |                                         |
+| 1.0.1                     |                                         |
+| 0.9.4                     |                                         |
+| 0.9.2                     |                                         |
+| 0.9.1                     |                                         |
+| 0.9.0                     |                                         |
+
+<br><hr>
+<h3>Development Process</h3>
+
+CUB uses the [CMake build system](https://cmake.org/) to build unit tests,
+examples, and header tests. To build CUB as a developer, the following
+recipe should be followed:
+
+```
+# Clone CUB repo from github:
+git clone https://github.com/thrust/cub.git
+cd cub
+
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only)
+cmake-gui  # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+By default, the C++14 standard is targeted, but this can be changed in CMake.
+More information on configuring your CUB build and creating a pull request is
+found in [CONTRIBUTING.md](CONTRIBUTING.md).
+
+<br><hr>
+<h3>Open Source License</h3>
+
+CUB is available under the "New BSD" open-source license:
+
+```
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/thrust/dependencies/cub/cmake/AppendOptionIfAvailable.cmake b/thrust/dependencies/cub/cmake/AppendOptionIfAvailable.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..478321ec8787ec34b2d4fc5dd6067e9239103a07
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/AppendOptionIfAvailable.cmake
@@ -0,0 +1,13 @@
+include_guard(GLOBAL)
+include(CheckCXXCompilerFlag)
+
+macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
+
+string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
+check_cxx_compiler_flag(${_FLAG} ${_VAR})
+
+if (${${_VAR}})
+  list(APPEND ${_LIST} ${_FLAG})
+endif ()
+
+endmacro ()
diff --git a/thrust/dependencies/cub/cmake/CubBuildCompilerTargets.cmake b/thrust/dependencies/cub/cmake/CubBuildCompilerTargets.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..86016059da2e4e1bc637f6c7f55a4500c96a5c91
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/CubBuildCompilerTargets.cmake
@@ -0,0 +1,102 @@
+#
+# This file defines the `cub_build_compiler_targets()` function, which
+# creates the following interface targets:
+#
+# cub.compiler_interface
+# - Interface target providing compiler-specific options needed to build
+#   Thrust's tests, examples, etc.
+
+function(cub_build_compiler_targets)
+  set(cxx_compile_definitions)
+  set(cxx_compile_options)
+
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # TODO Enable /Wall
+    append_option_if_available("/WX" cxx_compile_options)
+
+    # Disabled loss-of-data conversion warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4244" cxx_compile_options)
+    append_option_if_available("/wd4267" cxx_compile_options)
+
+    # Suppress numeric conversion-to-bool warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4800" cxx_compile_options)
+
+    # Disable warning about applying unary operator- to unsigned type.
+    append_option_if_available("/wd4146" cxx_compile_options)
+
+    # Some tests require /bigobj to fit everything into their object files:
+    append_option_if_available("/bigobj" cxx_compile_options)
+  else()
+    append_option_if_available("-Werror" cxx_compile_options)
+    append_option_if_available("-Wall" cxx_compile_options)
+    append_option_if_available("-Wextra" cxx_compile_options)
+    append_option_if_available("-Winit-self" cxx_compile_options)
+    append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
+    append_option_if_available("-Wcast-qual" cxx_compile_options)
+    append_option_if_available("-Wno-cast-align" cxx_compile_options)
+    append_option_if_available("-Wno-long-long" cxx_compile_options)
+    append_option_if_available("-Wno-variadic-macros" cxx_compile_options)
+    append_option_if_available("-Wno-unused-function" cxx_compile_options)
+    append_option_if_available("-Wno-unused-variable" cxx_compile_options)
+
+    # CUB uses deprecated texture functions (cudaBindTexture, etc). These
+    # need to be replaced, but silence the warnings for now.
+    append_option_if_available("-Wno-deprecated-declarations" cxx_compile_options)
+  endif()
+
+  if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.5)
+      # This isn't available until GCC 4.3, and misfires on TMP code until
+      # GCC 4.5.
+      append_option_if_available("-Wlogical-op" cxx_compile_options)
+    endif()
+
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
+      # GCC 7.3 complains about name mangling changes due to `noexcept`
+      # becoming part of the type system; we don't care.
+      append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
+    endif()
+  endif()
+
+  if (("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
+      ("XL" STREQUAL "${CMAKE_CXX_COMPILER_ID}"))
+    # xlC and Clang warn about unused parameters in uninstantiated templates.
+    # This causes xlC to choke on the OMP backend, which is mostly #ifdef'd out
+    # (and thus has unused parameters) when you aren't using it.
+    append_option_if_available("-Wno-unused-parameters" cxx_compile_options)
+  endif()
+
+  if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # -Wunneeded-internal-declaration misfires in the unit test framework
+    # on older versions of Clang.
+    append_option_if_available("-Wno-unneeded-internal-declaration" cxx_compile_options)
+  endif()
+
+  add_library(cub.compiler_interface INTERFACE)
+
+  foreach (cxx_option IN LISTS cxx_compile_options)
+    target_compile_options(cub.compiler_interface INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
+      # Only use -Xcompiler with NVCC, not Feta.
+      #
+      # CMake can't split genexs, so this can't be formatted better :(
+      # This is:
+      # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${cxx_option}>
+    )
+  endforeach()
+
+  # Add these for both CUDA and CXX targets:
+  target_compile_definitions(cub.compiler_interface INTERFACE
+    ${cxx_compile_definitions}
+  )
+
+  # Promote warnings and display diagnostic numbers for nvcc:
+  target_compile_options(cub.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--display_error_number>
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--promote_warnings>
+  )
+endfunction()
diff --git a/thrust/dependencies/cub/cmake/CubBuildTargetList.cmake b/thrust/dependencies/cub/cmake/CubBuildTargetList.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c887b6afa6000098bff6d34a08d563d3799c0a44
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/CubBuildTargetList.cmake
@@ -0,0 +1,261 @@
+# This file provides utilities for building and working with CUB
+# configuration targets.
+#
+# CUB_TARGETS
+#  - Built by the calling the `cub_build_target_list()` function.
+#  - Each item is the name of a CUB interface target that is configured for a
+#    certain build configuration. Currently only C++ standard dialect is
+#    considered.
+#
+# cub_build_target_list()
+# - Creates the CUB_TARGETS list.
+#
+# The following functions can be used to test/set metadata on a CUB target:
+#
+# cub_get_target_property(<prop_var> <target_name> <prop>)
+#   - Checks the ${prop} target property on CUB target ${target_name}
+#     and sets the ${prop_var} variable in the caller's scope.
+#   - <prop_var> is any valid cmake identifier.
+#   - <target_name> is the name of a CUB target.
+#   - <prop> is one of the following:
+#     - DIALECT: The C++ dialect. Valid values: 11, 14, 17.
+#     - PREFIX: A unique prefix that should be used to name all
+#       targets/tests/examples that use this configuration.
+#
+# cub_get_target_properties(<target_name>)
+#   - Defines ${target_name}_${prop} in the caller's scope, for `prop` in:
+#     {DIALECT, PREFIX}. See above for details.
+#
+# cub_clone_target_properties(<dst_target> <src_target>)
+#   - Set the {DIALECT, PREFIX} metadata on ${dst_target} to match
+#     ${src_target}. See above for details.
+#   - This *MUST* be called on any targets that link to another CUB target
+#     to ensure that dialect information is updated correctly, e.g.
+#     `cub_clone_target_properties(${my_cub_test} ${some_cub_target})`
+
+# Dialects:
+set(CUB_CPP_DIALECT_OPTIONS
+  11 14 17
+  CACHE INTERNAL "C++ dialects supported by CUB." FORCE
+)
+
+define_property(TARGET PROPERTY _CUB_DIALECT
+  BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17."
+  FULL_DOCS "A target's C++ dialect: 11, 14, or 17."
+)
+define_property(TARGET PROPERTY _CUB_PREFIX
+  BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp14'."
+  FULL_DOCS "A prefix describing the config, eg. 'cub.cpp14'."
+)
+
+function(cub_set_target_properties target_name dialect prefix)
+  set_target_properties(${target_name}
+    PROPERTIES
+      _CUB_DIALECT ${dialect}
+      _CUB_PREFIX ${prefix}
+  )
+
+  get_target_property(type ${target_name} TYPE)
+  if (NOT ${type} STREQUAL "INTERFACE_LIBRARY")
+    set_target_properties(${target_name}
+      PROPERTIES
+        CXX_STANDARD ${dialect}
+        CUDA_STANDARD ${dialect}
+        ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}"
+        RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}"
+    )
+
+    # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104
+    # is set to OLD. This suppresses the errors for good.
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      set_target_properties(${target_name}
+        PROPERTIES
+          CUDA_ARCHITECTURES OFF
+      )
+    endif()
+  endif()
+endfunction()
+
+# Get a cub property from a target and store it in var_name
+# cub_get_target_property(<var_name> <target_name> [DIALECT|PREFIX]
+macro(cub_get_target_property prop_var target_name prop)
+  get_property(${prop_var} TARGET ${target_name} PROPERTY _CUB_${prop})
+endmacro()
+
+# Defines the following string variables in the caller's scope:
+# - ${target_name}_DIALECT
+# - ${target_name}_PREFIX
+macro(cub_get_target_properties target_name)
+  cub_get_target_property(${target_name}_DIALECT ${target_name} DIALECT)
+  cub_get_target_property(${target_name}_PREFIX ${target_name} PREFIX)
+endmacro()
+
+# Set one target's _CUB_* properties to match another target
+function(cub_clone_target_properties dst_target src_target)
+  cub_get_target_properties(${src_target})
+  cub_set_target_properties(${dst_target}
+    ${${src_target}_DIALECT}
+    ${${src_target}_PREFIX}
+  )
+endfunction()
+
+# Set ${var_name} to TRUE or FALSE in the caller's scope
+function(_cub_is_config_valid var_name dialect)
+  if (CUB_ENABLE_DIALECT_CPP${dialect})
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_cub_init_target_list)
+  set(CUB_TARGETS "" CACHE INTERNAL "" FORCE)
+endfunction()
+
+function(_cub_add_target_to_target_list target_name dialect prefix)
+  cub_set_target_properties(${target_name} ${dialect} ${prefix})
+
+  target_link_libraries(${target_name} INTERFACE
+    CUB::CUB
+    cub.compiler_interface
+  )
+
+  if (TARGET cub.thrust)
+    target_link_libraries(${target_name} INTERFACE cub.thrust)
+  endif()
+
+  set(CUB_TARGETS ${CUB_TARGETS} ${target_name} CACHE INTERNAL "" FORCE)
+
+  set(label "cpp${dialect}")
+  string(TOLOWER "${label}" label)
+  message(STATUS "Enabling CUB configuration: ${label}")
+endfunction()
+
+# Build a ${CUB_TARGETS} list containing target names for all
+# requested configurations
+function(cub_build_target_list)
+  # Clear the list of targets:
+  _cub_init_target_list()
+
+  # Handle dialect options:
+  foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS)
+    if (CUB_IN_THRUST)
+      # Just use Thrust's settings:
+      if (THRUST_ENABLE_MULTICONFIG)
+        set(CUB_ENABLE_DIALECT_CPP${dialect}
+            ${THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}}
+        )
+      else()
+        set(val OFF)
+        if (dialect EQUAL ${THRUST_CPP_DIALECT})
+          set(val ON)
+        endif()
+        set(CUB_ENABLE_DIALECT_CPP${dialect} ${val})
+      endif()
+    else()
+      # Create CMake options:
+      set(default_value OFF)
+      if (dialect EQUAL 14) # Default to just 14 on:
+        set(default_value ON)
+      endif()
+      option(CUB_ENABLE_DIALECT_CPP${dialect}
+        "Generate C++${dialect} build configurations."
+        ${default_value}
+      )
+    endif()
+  endforeach()
+
+  # CMake added C++17 support for CUDA targets in 3.18:
+  if (CUB_ENABLE_DIALECT_CPP17)
+    cmake_minimum_required(VERSION 3.18)
+  endif()
+
+  # Supported versions of MSVC do not distinguish between C++11 and C++14.
+  # Warn the user that they may be generating a ton of redundant targets.
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
+      CUB_ENABLE_DIALECT_CPP11)
+    message(WARNING
+      "Supported versions of MSVC (2017+) do not distinguish between C++11 "
+      "and C++14. The requested C++11 targets will be built with C++14."
+    )
+  endif()
+
+  # Generic config flags:
+  macro(add_flag_option flag docstring default)
+    set(cub_opt "CUB_${flag}")
+    if (CUB_IN_THRUST)
+      set(thrust_opt "THRUST_${flag}")
+      # Use thrust's settings:
+      set(${cub_opt} ${${thrust_opt}})
+    else()
+      option(${cub_opt} "${docstring}" "${default}")
+      mark_as_advanced(${cub_opt})
+    endif()
+  endmacro()
+  add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF)
+  add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF)
+  add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF)
+
+  # Build cub.compiler_interface with warning flags, etc
+  # This must be called before _cub_add_target_to_target_list.
+  cub_build_compiler_targets()
+
+  # Set up the CUB target while testing out our find_package scripts.
+  find_package(CUB REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${CUB_SOURCE_DIR}"
+  )
+
+  # TODO
+  # Some of the iterators and unittests depend on thrust. We should break the
+  # cyclical dependency by migrating CUB's Thrust bits into Thrust.
+  find_package(Thrust ${CUB_VERSION} EXACT CONFIG
+    HINTS "../../" # Check if we are in thrust/dependencies/cub
+  )
+
+  if (Thrust_FOUND)
+    thrust_set_CUB_target(CUB::CUB)
+    thrust_create_target(cub.thrust HOST CPP DEVICE CUDA)
+  else()
+    message(STATUS
+      "Thrust was not found. Set CMake variable 'Thrust_DIR' to the "
+      "thrust-config.cmake file of a Thrust ${CUB_VERSION} installation to "
+      "enable additional testing."
+    )
+  endif()
+
+  # Build CUB_TARGETS
+  foreach(dialect IN LISTS CUB_CPP_DIALECT_OPTIONS)
+    _cub_is_config_valid(config_valid ${dialect})
+    if (config_valid)
+      set(prefix "cub.cpp${dialect}")
+      string(TOLOWER "${prefix}" prefix)
+      set(target_name "${prefix}")
+
+      add_library(${target_name} INTERFACE)
+
+      # Set configuration metadata for this cub interface target:
+      _cub_add_target_to_target_list(${target_name} ${dialect} ${prefix})
+    endif()
+  endforeach() # dialects
+
+  list(LENGTH CUB_TARGETS count)
+  message(STATUS "${count} unique cub.dialect configurations generated")
+
+  # Top level meta-target. Makes it easier to just build CUB targets when
+  # building both CUB and Thrust. Add all project files here so IDEs will be
+  # aware of them. This will not generate build rules.
+  file(GLOB_RECURSE all_sources
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    "${CUB_SOURCE_DIR}/cub/*.cuh"
+  )
+  add_custom_target(cub.all SOURCES ${all_sources})
+
+  # Create meta targets for each config:
+  foreach(cub_target IN LISTS CUB_TARGETS)
+    cub_get_target_property(config_prefix ${cub_target} PREFIX)
+    add_custom_target(${config_prefix}.all)
+    add_dependencies(cub.all ${config_prefix}.all)
+  endforeach()
+endfunction()
diff --git a/thrust/dependencies/cub/cmake/CubCudaConfig.cmake b/thrust/dependencies/cub/cmake/CubCudaConfig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..74d3a13517ddab3b975ab84cb1b692b04a0db84a
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/CubCudaConfig.cmake
@@ -0,0 +1,133 @@
+if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
+         "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
+  message(FATAL_ERROR
+    "CUB tests and examples require the C++ compiler and the CUDA host "
+    "compiler to be the same; to set this compiler, please use the "
+    "CMAKE_CXX_COMPILER variable, not the CMAKE_CUDA_HOST_COMPILER variable."
+  )
+endif()
+set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+
+#
+# Architecture options:
+#
+
+set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80)
+set(arch_message "CUB: Enabled CUDA architectures:")
+set(enabled_archs)
+
+# Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just
+# reuse them if possible. After we transition to CMake 3.18 CUDA_ARCHITECTURE
+# target properties this will need to be updated.
+if (CUB_IN_THRUST)
+  # Configure to use all flags from thrust:
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+
+  # Update the enabled architectures list from thrust
+  foreach (arch IN LISTS all_archs)
+    if (THRUST_ENABLE_COMPUTE_${arch})
+      set(CUB_ENABLE_COMPUTE_${arch} True)
+      list(APPEND enabled_archs ${arch})
+      string(APPEND arch_message " sm_${arch}")
+    else()
+      set(CUB_ENABLE_COMPUTE_${arch} False)
+    endif()
+  endforeach()
+
+  # Otherwise create cache options and build the flags ourselves:
+else() # NOT CUB_IN_THRUST
+
+  # Find the highest arch:
+  list(SORT all_archs)
+  list(LENGTH all_archs max_idx)
+  math(EXPR max_idx "${max_idx} - 1")
+  list(GET all_archs ${max_idx} highest_arch)
+
+  option(CUB_DISABLE_ARCH_BY_DEFAULT
+    "If ON, then all CUDA architectures are disabled on the initial CMake run."
+    OFF
+  )
+
+  set(option_init ON)
+  if (CUB_DISABLE_ARCH_BY_DEFAULT)
+    set(option_init OFF)
+  endif()
+
+  set(arch_flags)
+  foreach (arch IN LISTS all_archs)
+    option(CUB_ENABLE_COMPUTE_${arch}
+      "Enable code generation for sm_${arch}."
+      ${option_init}
+    )
+    if (CUB_ENABLE_COMPUTE_${arch})
+      list(APPEND enabled_archs ${arch})
+      string(APPEND arch_flags " -gencode arch=compute_${arch},code=sm_${arch}")
+      string(APPEND arch_message " sm_${arch}")
+    endif()
+  endforeach()
+
+  option(CUB_ENABLE_COMPUTE_FUTURE
+    "Enable code generation for tests for compute_${highest_arch}"
+    ${option_init}
+  )
+  if (CUB_ENABLE_COMPUTE_FUTURE)
+    string(APPEND arch_flags
+      " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}"
+    )
+    string(APPEND arch_message " compute_${highest_arch}")
+  endif()
+
+  # TODO Once CMake 3.18 is required, use the CUDA_ARCHITECTURE target props
+  string(APPEND CMAKE_CUDA_FLAGS "${arch_flags}")
+endif()
+
+message(STATUS ${arch_message})
+
+# Create a variable containing the minimal target arch for tests
+list(SORT enabled_archs)
+list(GET enabled_archs 0 CUB_MINIMAL_ENABLED_ARCH)
+
+#
+# RDC options:
+#
+
+option(CUB_ENABLE_TESTS_WITH_RDC
+  "Build all CUB tests with RDC; tests that require RDC are not affected by this option."
+  OFF
+)
+
+option(CUB_ENABLE_EXAMPLES_WITH_RDC
+  "Build all CUB examples with RDC; examples which require RDC are not affected by this option."
+  OFF
+)
+
+# Check for RDC/SM compatibility and error/warn if necessary
+set(no_rdc_archs 53 62 72)
+set(rdc_supported True)
+foreach (arch IN LISTS no_rdc_archs)
+  if (CUB_ENABLE_COMPUTE_${arch})
+    set(rdc_supported False)
+    break()
+  endif()
+endforeach()
+
+set(rdc_opts
+  CUB_ENABLE_TESTS_WITH_RDC
+  CUB_ENABLE_EXAMPLES_WITH_RDC
+)
+set(rdc_requested False)
+foreach (rdc_opt IN LISTS rdc_opts)
+  if (${rdc_opt})
+    set(rdc_requested True)
+    break()
+  endif()
+endforeach()
+
+if (rdc_requested AND NOT rdc_supported)
+  string(JOIN ", " no_rdc ${no_rdc_archs})
+  string(JOIN "\n" opts ${rdc_opts})
+  message(FATAL_ERROR
+    "Architectures {${no_rdc}} do not support RDC and are incompatible with "
+    "these options:\n${opts}"
+  )
+endif()
diff --git a/thrust/dependencies/cub/cmake/CubHeaderTesting.cmake b/thrust/dependencies/cub/cmake/CubHeaderTesting.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..45f20ce5f3b130a76e00bdbacfd7fc00784ba758
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/CubHeaderTesting.cmake
@@ -0,0 +1,29 @@
+# For every public header, build a translation unit containing `#include <header>`
+# to let the compiler try to figure out warnings in that header if it is not otherwise
+# included in tests, and also to verify if the headers are modular enough.
+# .inl files are not globbed for, because they are not supposed to be used as public
+# entrypoints.
+
+file(GLOB_RECURSE headers
+  RELATIVE "${CUB_SOURCE_DIR}/cub"
+  CONFIGURE_DEPENDS
+  cub/*.cuh
+)
+
+set(headertest_srcs)
+foreach (header IN LISTS headers)
+  set(headertest_src "headers/${header}.cu")
+  configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
+  list(APPEND headertest_srcs "${headertest_src}")
+endforeach()
+
+foreach(cub_target IN LISTS CUB_TARGETS)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+
+  set(headertest_target ${config_prefix}.headers)
+  add_library(${headertest_target} OBJECT ${headertest_srcs})
+  target_link_libraries(${headertest_target} PUBLIC ${cub_target})
+  cub_clone_target_properties(${headertest_target} ${cub_target})
+
+  add_dependencies(${config_prefix}.all ${headertest_target})
+endforeach()
diff --git a/thrust/dependencies/cub/cmake/CubInstallRules.cmake b/thrust/dependencies/cub/cmake/CubInstallRules.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..25505e1393016d95548b37e029e308f0a27e9de5
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/CubInstallRules.cmake
@@ -0,0 +1,15 @@
+# Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake
+# if modifying this file.
+if (CUB_IN_THRUST)
+  return()
+endif()
+
+# CUB is a header library; no need to build anything before installing:
+set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
+
+install(DIRECTORY "${CUB_SOURCE_DIR}/cub"
+  TYPE INCLUDE
+  FILES_MATCHING
+    PATTERN "*.cuh"
+    PATTERN "*.cmake"
+)
diff --git a/thrust/dependencies/cub/cmake/header_test.in b/thrust/dependencies/cub/cmake/header_test.in
new file mode 100644
index 0000000000000000000000000000000000000000..cb4121c997927b0aa349bc3f1ab9f158af133e82
--- /dev/null
+++ b/thrust/dependencies/cub/cmake/header_test.in
@@ -0,0 +1 @@
+#include <cub/${header}>
diff --git a/thrust/dependencies/cub/common.mk b/thrust/dependencies/cub/common.mk
new file mode 100644
index 0000000000000000000000000000000000000000..4010ed309826f24cf79c09880f2b640a9b69faf2
--- /dev/null
+++ b/thrust/dependencies/cub/common.mk
@@ -0,0 +1,203 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# *
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# *
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).
+
+COMMA = ,
+ifdef sm
+	SM_ARCH = $(subst $(COMMA),-,$(sm))
+else
+    SM_ARCH = 600
+endif
+
+ifeq (700, $(findstring 700, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_70,code=\"sm_70,compute_70\"
+    SM_DEF 		+= -DSM700
+    TEST_ARCH 	= 700
+endif
+ifeq (620, $(findstring 620, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_62,code=\"sm_62,compute_62\"
+    SM_DEF 		+= -DSM620
+    TEST_ARCH 	= 620
+endif
+ifeq (610, $(findstring 610, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_61,code=\"sm_61,compute_61\"
+    SM_DEF 		+= -DSM610
+    TEST_ARCH 	= 610
+endif
+ifeq (600, $(findstring 600, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+    SM_DEF 		+= -DSM600
+    TEST_ARCH 	= 600
+endif
+ifeq (520, $(findstring 520, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+    SM_DEF 		+= -DSM520
+    TEST_ARCH 	= 520
+endif
+ifeq (370, $(findstring 370, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_37,code=\"sm_37,compute_37\"
+    SM_DEF 		+= -DSM370
+    TEST_ARCH 	= 370
+endif
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+    SM_DEF 		+= -DSM350
+    TEST_ARCH 	= 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+    SM_DEF 		+= -DSM300
+    TEST_ARCH 	= 300
+endif
+
+
+# [cdp=<0|1>] CDP enable option (default: no)
+ifeq ($(cdp), 1)
+	DEFINES += -DCUB_CDP
+	CDP_SUFFIX = cdp
+    NVCCFLAGS += -rdc=true -lcudadevrt
+else
+	CDP_SUFFIX = nocdp
+endif
+
+
+# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
+ifeq ($(force32), 1)
+	CPU_ARCH = -m32
+	CPU_ARCH_SUFFIX = i386
+else
+	CPU_ARCH = -m64
+	CPU_ARCH_SUFFIX = x86_64
+    NPPI = -lnppist
+endif
+
+
+# [abi=<0|1>] CUDA ABI option (enabled by default)
+ifneq ($(abi), 0)
+	ABI_SUFFIX = abi
+else
+	NVCCFLAGS += -Xptxas -abi=no
+	ABI_SUFFIX = noabi
+endif
+
+
+# [open64=<0|1>] Middle-end compiler option (nvvm by default)
+ifeq ($(open64), 1)
+	NVCCFLAGS += -open64
+	PTX_SUFFIX = open64
+else
+	PTX_SUFFIX = nvvm
+endif
+
+
+# [verbose=<0|1>] Verbose toolchain output from nvcc option
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+
+# [keep=<0|1>] Keep intermediate compilation artifacts option
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+# [debug=<0|1>] Generate debug mode code
+ifeq ($(debug), 1)
+	NVCCFLAGS += -G
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
+
+NVCC ?= "$(shell which nvcc)"
+ifdef nvccver
+    NVCC_VERSION = $(nvccver)
+else
+    NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
+NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+    # For MSVC
+    # Enable more warnings and treat as errors
+    NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler /fp:strict
+    # Help the compiler/linker work with huge numbers of kernels on Windows
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+	CC = cl
+
+	# Multithreaded runtime
+	NVCCFLAGS += -Xcompiler /MT
+
+ifneq ($(force32), 1)
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
+else
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
+endif
+	CUDART = "$(shell cygpath -w $(CUDART_CYG))"
+else
+    # For g++
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler -ffloat-store
+    CC = g++
+ifneq ($(force32), 1)
+    CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
+else
+    CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
+endif
+endif
+
+# Suffix to append to each binary
+BIN_SUFFIX = sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CDP_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+CUB_DEPS = 	$(call rwildcard, $(CUB_DIR),*.cuh) \
+			$(CUB_DIR)common.mk
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_histogram.cuh b/thrust/dependencies/cub/cub/agent/agent_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7559bf126b1c5eb6b266e394c4cf1b60ee48175f
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_histogram.cuh
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ *
+ */
+enum BlockHistogramMemoryPreference
+{
+    GMEM,
+    SMEM,
+    BLEND
+};
+
+
+/**
+ * Parameterizable tuning policy type for AgentHistogram
+ */
+template <
+    int                             _BLOCK_THREADS,                 ///< Threads per thread block
+    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
+    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
+    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
+struct AgentHistogramPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
+        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
+        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
+        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
+ */
+template <
+    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
+    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
+    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
+    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
+struct AgentHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    /// The pixel type of SampleT
+    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
+
+    /// The quad type of SampleT
+    typedef typename CubVector<SampleT, 4>::Type QuadT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
+
+        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
+        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
+        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
+
+        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
+        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
+
+        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+
+        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
+                                        AgentHistogramPolicyT::MEM_PREFERENCE :
+                                        GMEM,
+
+        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
+    };
+
+    /// Cache load modifier for reading input elements
+    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
+            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
+        WrappedSampleIteratorT;
+
+    /// Pixel input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
+        WrappedPixelIteratorT;
+
+    /// Qaud input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
+        WrappedQuadIteratorT;
+
+    /// Parameterized BlockLoad type for samples
+    typedef BlockLoad<
+            SampleT,
+            BLOCK_THREADS,
+            SAMPLES_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadSampleT;
+
+    /// Parameterized BlockLoad type for pixels
+    typedef BlockLoad<
+            PixelT,
+            BLOCK_THREADS,
+            PIXELS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadPixelT;
+
+    /// Parameterized BlockLoad type for quads
+    typedef BlockLoad<
+            QuadT,
+            BLOCK_THREADS,
+            QUADS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadQuadT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)
+
+        int tile_idx;
+
+        // Aliasable storage layout
+        union Aliasable
+        {
+            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
+            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
+            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
+
+        } aliasable;
+    };
+
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Sample input iterator (with cache modifier applied, if possible)
+    WrappedSampleIteratorT d_wrapped_samples;
+
+    /// Native pointer for input samples (possibly NULL if unavailable)
+    SampleT* d_native_samples;
+
+    /// The number of output bins for each channel
+    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// The number of privatized bins for each channel
+    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to gmem privatized histograms for each channel
+    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to final output histograms (gmem)
+    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining privatized counter indices from samples, one for each channel
+    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
+
+    //---------------------------------------------------------------------
+    // Initialize privatized bin counters
+    //---------------------------------------------------------------------
+
+    // Initialize privatized bin counters
+    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Initialize histogram bin counts to zeros
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
+            {
+                privatized_histograms[CHANNEL][privatized_bin] = 0;
+            }
+        }
+
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void InitSmemBinCounters()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        InitBinCounters(privatized_histograms);
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void InitGmemBinCounters()
+    {
+        InitBinCounters(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Update final output histograms
+    //---------------------------------------------------------------------
+
+    // Update final output histograms from privatized histograms
+    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+
+        // Apply privatized bin counts to output bin counts
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_bins = num_privatized_bins[CHANNEL];
+            for (int privatized_bin = threadIdx.x; 
+                    privatized_bin < channel_bins;  
+                    privatized_bin += BLOCK_THREADS)
+            {
+                int         output_bin  = -1;
+                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
+                bool        is_valid    = count > 0;
+
+                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+
+                if (output_bin >= 0)
+                {
+                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+                }
+
+            }
+        }
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void StoreSmemOutput()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        StoreOutput(privatized_histograms);
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void StoreGmemOutput()
+    {
+        StoreOutput(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile accumulation
+    //---------------------------------------------------------------------
+
+    // Accumulate pixels.  Specialized for RLE compression.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<true>      is_rle_compress)
+    {
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            // Bin pixels
+            int bins[PIXELS_PER_THREAD];
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            {
+                bins[PIXEL] = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+            }
+
+            CounterT accumulator = 1;
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+            {
+                if (bins[PIXEL] != bins[PIXEL + 1])
+                {
+                    if (bins[PIXEL] >= 0)
+                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
+
+                     accumulator = 0;
+                }
+                accumulator++;
+            }
+
+            // Last pixel
+            if (bins[PIXELS_PER_THREAD - 1] >= 0)
+                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
+        }
+    }
+
+
+    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<false>     is_rle_compress)
+    {
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                int bin = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
+                if (bin >= 0)
+                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
+            }
+        }
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for smem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateSmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for gmem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateGmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Tile loading
+    //---------------------------------------------------------------------
+
+    // Load full, aligned tile using pixel iterator (multi-channel)
+    template <int _NUM_ACTIVE_CHANNELS>
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples));
+    }
+
+    // Load full, aligned tile using quad iterator (single-channel)
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<1>                     num_active_channels)
+    {
+        typedef QuadT AliasedQuads[QUADS_PER_THREAD];
+
+        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped quad iterator
+        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
+            d_wrapped_quads,
+            reinterpret_cast<AliasedQuads&>(samples));
+    }
+
+    // Load full, aligned tile
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
+    }
+
+    // Load full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        // Load using sample iterator
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples));
+    }
+
+    // Load partially-full, aligned tile using the pixel iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        int valid_pixels = valid_samples / NUM_CHANNELS;
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples),
+            valid_pixels);
+    }
+
+    // Load partially-full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples),
+            valid_samples);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile processing
+    //---------------------------------------------------------------------
+
+    // Consume a tile of data samples
+    template <
+        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
+        bool IS_FULL_TILE>      // Whether the tile is full
+    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
+    {
+        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
+        bool        is_valid[PIXELS_PER_THREAD];
+
+        // Load tile
+        LoadTile(
+            block_offset,
+            valid_samples,
+            samples,
+            Int2Type<IS_FULL_TILE>(),
+            Int2Type<IS_ALIGNED>());
+
+        // Set valid flags
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+
+        // Accumulate samples
+#if CUB_PTX_ARCH >= 120
+        if (prefer_smem)
+            AccumulateSmemPixels(samples, is_valid);
+        else
+            AccumulateGmemPixels(samples, is_valid);
+#else
+        AccumulateGmemPixels(samples, is_valid);
+#endif
+
+    }
+
+
+    // Consume row tiles.  Specialized for work-stealing from queue
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<true>      is_work_stealing)
+    {
+
+        int         num_tiles                   = num_rows * tiles_per_row;
+        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
+        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
+
+        while (tile_idx < num_tiles)
+        {
+            int     row             = tile_idx / tiles_per_row;
+            int     col             = tile_idx - (row * tiles_per_row);
+            OffsetT row_offset      = row * row_stride_samples;
+            OffsetT col_offset      = (col * TILE_SAMPLES);
+            OffsetT tile_offset     = row_offset + col_offset;
+
+            if (col == tiles_per_row - 1)
+            {
+                // Consume a partially-full tile at the end of the row
+                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
+                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+            } 
+            else
+            {
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+            }
+
+            CTA_SYNC();
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
+
+            CTA_SYNC();
+
+            tile_idx = temp_storage.tile_idx;
+        }
+    }
+
+
+    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<false>     is_work_stealing)
+    {
+        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+        {
+            OffsetT row_begin   = row * row_stride_samples;
+            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
+            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
+
+            while (tile_offset < row_end)
+            {
+                OffsetT num_remaining = row_end - tile_offset;
+
+                if (num_remaining < TILE_SAMPLES)
+                {
+                    // Consume partial tile
+                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+                    break;
+                }
+
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+                tile_offset += gridDim.x * TILE_SAMPLES;
+            }
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Parameter extraction
+    //---------------------------------------------------------------------
+
+    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+    template <
+        CacheLoadModifier   _MODIFIER,
+        typename            _ValueT,
+        typename            _OffsetT>
+    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
+    {
+        return itr.ptr;
+    }
+
+    // Return a native pixel pointer (specialized for other types)
+    template <typename IteratorT>
+    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
+    {
+        return NULL;
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentHistogram(
+        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
+        SampleIteratorT     d_samples,                                          ///< Input data to reduce
+        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
+        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
+        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
+        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
+        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    :
+        temp_storage(temp_storage.Alias()),
+        d_wrapped_samples(d_samples),
+        num_output_bins(num_output_bins),
+        num_privatized_bins(num_privatized_bins),
+        d_output_histograms(d_output_histograms),
+        privatized_decode_op(privatized_decode_op),
+        output_decode_op(output_decode_op),
+        d_native_samples(NativePointer(d_wrapped_samples)),
+        prefer_smem((MEM_PREFERENCE == SMEM) ?
+            true :                              // prefer smem privatized histograms
+            (MEM_PREFERENCE == GMEM) ?
+                false :                         // prefer gmem privatized histograms
+                blockIdx.x & 1)                 // prefer blended privatized histograms
+    {
+        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+
+        // Initialize the locations of this block's privatized histograms
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+    }
+
+
+    /**
+     * Consume image
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
+        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
+        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
+        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
+
+        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel
+                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
+                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad
+
+        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
+                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
+                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
+
+        // Whether rows are aligned and can be vectorized
+        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
+            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+        else
+            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+    }
+
+
+    /**
+     * Initialize privatized bin counters.  Specialized for privatized shared-memory counters
+     */
+    __device__ __forceinline__ void InitBinCounters()
+    {
+        if (prefer_smem)
+            InitSmemBinCounters();
+        else
+            InitGmemBinCounters();
+    }
+
+
+    /**
+     * Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
+     */
+    __device__ __forceinline__ void StoreOutput()
+    {
+        if (prefer_smem)
+            StoreSmemOutput();
+        else
+            StoreGmemOutput();
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_radix_sort_downsweep.cuh b/thrust/dependencies/cub/cub/agent/agent_radix_sort_downsweep.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c861a41e886147731f02e65dc413d551f2c5b2d5
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -0,0 +1,790 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
+ */
+
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Radix ranking algorithm
+ */
+enum RadixRankAlgorithm
+{
+    RADIX_RANK_BASIC,
+    RADIX_RANK_MEMOIZE,
+    RADIX_RANK_MATCH
+};
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortDownsweep
+ */
+template <
+    int                 NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                 NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename            ComputeT,                       ///< Dominant compute type
+    BlockLoadAlgorithm  _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier   _LOAD_MODIFIER,                 ///< Cache load modifier for reading keys (and values)
+    RadixRankAlgorithm  _RANK_ALGORITHM,                ///< The radix ranking algorithm to use
+    BlockScanAlgorithm  _SCAN_ALGORITHM,                ///< The block scan algorithm to use
+    int                 _RADIX_BITS,                    ///< The number of radix bits, i.e., log2(bins)
+    typename            ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentRadixSortDownsweepPolicy :
+    ScalingType
+{
+    enum
+    {
+        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
+    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
+    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+
+
+
+/**
+ * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
+ */
+template <
+    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
+    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                              ///< KeyT type
+    typename ValueT,                            ///< ValueT type
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct AgentRadixSortDownsweep
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // Appropriate unsigned-bits representation of KeyT
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
+    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
+    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
+
+    enum
+    {
+        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Input iterator wrapper type (for applying cache modifier)s
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
+
+    // Radix ranking type to use
+    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
+            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
+            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
+                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
+            >::Type
+        >::Type BlockRadixRankT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
+    };
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        UnsignedBits,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadKeysT;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadValuesT;
+
+    // Value exchange array type
+    typedef ValueT ValueExchangeT[TILE_ITEMS];
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        typename BlockLoadKeysT::TempStorage    load_keys;
+        typename BlockLoadValuesT::TempStorage  load_values;
+        typename BlockRadixRankT::TempStorage   radix_rank;
+
+        struct
+        {
+            UnsignedBits                        exchange_keys[TILE_ITEMS];
+            OffsetT                             relative_bin_offsets[RADIX_DIGITS];
+        };
+
+        Uninitialized<ValueExchangeT>           exchange_values;
+
+        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+    ValuesItr       d_values_in;
+    UnsignedBits    *d_keys_out;
+    ValueT          *d_values_out;
+
+    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+    // Whether to short-cirucit
+    int             short_circuit;
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Scatter ranked keys through shared memory, then to device-accessible memory
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         valid_items)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
+            UnsignedBits digit          = BFE(key, current_bit, num_bits);
+            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
+
+            // Un-twiddle
+            key = Traits<KeyT>::TwiddleOut(key);
+
+            if (FULL_TILE || 
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter ranked values through shared memory, then to device-accessible memory
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        ValueT      (&values)[ITEMS_PER_THREAD],
+        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int         (&ranks)[ITEMS_PER_THREAD],
+        OffsetT     valid_items)
+    {
+        CTA_SYNC();
+
+        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            exchange_values[ranks[ITEM]] = values[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
+
+            if (FULL_TILE ||
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
+            }
+        }
+    }
+
+    /**
+     * Load a tile of keys (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys, valid_items, oob_item);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
+    }
+
+
+    /**
+     * Load a tile of values (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of values (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values, valid_items);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of items (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
+    }
+
+
+    /**
+     * Load a tile of items (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
+    }
+
+
+    /**
+     * Truck along associated values
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         block_offset,
+        OffsetT         valid_items,
+        Int2Type<false> /*is_keys_only*/)
+    {
+        ValueT values[ITEMS_PER_THREAD];
+
+        CTA_SYNC();
+
+        LoadValues(
+            values,
+            block_offset,
+            valid_items,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        ScatterValues<FULL_TILE>(
+            values,
+            relative_bin_offsets,
+            ranks,
+            valid_items);
+    }
+
+
+    /**
+     * Truck along associated values (specialized for key-only sorting)
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
+        int             (&/*ranks*/)[ITEMS_PER_THREAD],
+        OffsetT         /*block_offset*/,
+        OffsetT         /*valid_items*/,
+        Int2Type<true>  /*is_keys_only*/)
+    {}
+
+
+    /**
+     * Process tile
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        OffsetT block_offset,
+        const OffsetT &valid_items = TILE_ITEMS)
+    {
+        UnsignedBits    keys[ITEMS_PER_THREAD];
+        int             ranks[ITEMS_PER_THREAD];
+        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
+
+        // Assign default (min/max) value to all keys
+        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
+
+        // Load tile of keys
+        LoadKeys(
+            keys,
+            block_offset,
+            valid_items, 
+            default_key,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        // Twiddle key bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
+        }
+
+        // Rank the twiddled keys
+        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
+            keys,
+            ranks,
+            current_bit,
+            num_bits,
+            exclusive_digit_prefix);
+
+        CTA_SYNC();
+
+        // Share exclusive digit prefix
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Store exclusive prefix
+                temp_storage.exclusive_digit_prefix[bin_idx] =
+                    exclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Get inclusive digit prefix
+        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                {
+                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
+                }
+                else
+                {
+                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Update global scatter base offsets for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_offset[track] -= exclusive_digit_prefix[track];
+                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
+                bin_offset[track] += inclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Scatter keys
+        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
+
+        // Gather/scatter values
+        GatherScatterValues<FULL_TILE>(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
+    }
+
+    //---------------------------------------------------------------------
+    // Copy shortcut
+    //---------------------------------------------------------------------
+
+    /**
+     * Copy tiles within the range of input
+     */
+    template <
+        typename InputIteratorT,
+        typename T>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  d_in,
+        T               *d_out,
+        OffsetT         block_offset,
+        OffsetT         block_end)
+    {
+        // Simply copy the input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+            block_offset += TILE_ITEMS;
+        }
+
+        // Clean up last partial tile with guarded-I/O
+        if (block_offset < block_end)
+        {
+            OffsetT valid_items = block_end - block_offset;
+
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
+        }
+    }
+
+
+    /**
+     * Copy tiles within the range of input (specialized for NullType)
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  /*d_in*/,
+        NullType        * /*d_out*/,
+        OffsetT         /*block_offset*/,
+        OffsetT         /*block_end*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
+        OffsetT         num_items,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            this->bin_offset[track] = bin_offset[track];
+
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Short circuit if the histogram has only bin counts of only zeros or problem-size
+                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         num_items,
+        OffsetT         *d_spine,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
+                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+
+                // Load my block's bin offset for my bin
+                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Distribute keys from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT   block_offset,
+        OffsetT   block_end)
+    {
+        if (short_circuit)
+        {
+            // Copy keys
+            Copy(d_keys_in, d_keys_out, block_offset, block_end);
+
+            // Copy values
+            Copy(d_values_in, d_values_out, block_offset, block_end);
+        }
+        else
+        {
+            // Process full tiles of tile_items
+            #pragma unroll 1
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ProcessTile<true>(block_offset);
+                block_offset += TILE_ITEMS;
+
+                CTA_SYNC();
+            }
+
+            // Clean up last partial tile with guarded-I/O
+            if (block_offset < block_end)
+            {
+                ProcessTile<false>(block_offset, block_end - block_offset);
+            }
+
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_radix_sort_upsweep.cuh b/thrust/dependencies/cub/cub/agent/agent_radix_sort_upsweep.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c65773f12cc01838f0813cede12372bfcda52e95
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -0,0 +1,527 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../block/block_load.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
+ */
+template <
+    int                 NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                 NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename            ComputeT,                       ///< Dominant compute type
+    CacheLoadModifier   _LOAD_MODIFIER,                 ///< Cache load modifier for reading keys
+    int                 _RADIX_BITS,                    ///< The number of radix bits, i.e., log2(bins)
+    typename            ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentRadixSortUpsweepPolicy :
+    ScalingType
+{
+    enum
+    {
+        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+template <
+    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+    typename KeyT,                          ///< KeyT type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
+{
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    // Integer type for digit counters (to be packed into words of PackedCounters)
+    typedef unsigned char DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef unsigned int PackedCounter;
+
+    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+    enum
+    {
+        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
+        WARP_THREADS            = 1 << LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
+
+        BYTES_PER_COUNTER       = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
+
+        // To prevent counter overflow, we must periodically unpack and aggregate the
+        // digit counters back into registers.  Each counter lane is assigned to a
+        // warp for aggregation.
+
+        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+
+        // Unroll tiles in batches without risk of counter overflow
+        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
+    };
+
+
+    // Input iterator wrapper type (for applying cache modifier)s
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
+        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields (aggregate state bundle)
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Thread-local counters for periodically aggregating composite-counter lanes
+    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+
+
+    //---------------------------------------------------------------------
+    // Helper structure for templated iteration
+    //---------------------------------------------------------------------
+
+    // Iterate
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(
+            AgentRadixSortUpsweep       &cta,
+            UnsignedBits                keys[KEYS_PER_THREAD])
+        {
+            cta.Bucket(keys[COUNT]);
+
+            // Next
+            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+        }
+    };
+
+    // Terminate
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
+    };
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Decode a key and increment corresponding smem digit counter
+     */
+    __device__ __forceinline__ void Bucket(UnsignedBits key)
+    {
+        // Perform transform op
+        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
+
+        // Extract current digit bits
+        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+        // Get sub-counter offset
+        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
+
+        // Get row offset
+        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
+
+        // Increment counter
+        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
+    }
+
+
+    /**
+     * Reset composite counters
+     */
+    __device__ __forceinline__ void ResetDigitCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+        {
+            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
+        }
+    }
+
+
+    /**
+     * Reset the unpacked counters in each thread
+     */
+    __device__ __forceinline__ void ResetUnpackedCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            #pragma unroll
+            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+            {
+                local_counts[LANE][UNPACKED_COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Extracts and aggregates the digit counters for each counter lane
+     * owned by this warp
+     */
+    __device__ __forceinline__ void UnpackDigitCounts()
+    {
+        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid = LaneId();
+
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            const int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                #pragma unroll
+                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+                {
+                    #pragma unroll
+                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                    {
+                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+                        local_counts[LANE][UNPACKED_COUNTER] += counter;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Processes a single, full tile
+     */
+    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
+    {
+        // Tile of keys
+        UnsignedBits keys[KEYS_PER_THREAD];
+
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+        // Prevent hoisting
+        CTA_SYNC();
+
+        // Bucket tile of keys
+        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+    }
+
+
+    /**
+     * Processes a single load (may have some threads masked off)
+     */
+    __device__ __forceinline__ void ProcessPartialTile(
+        OffsetT block_offset,
+        const OffsetT &block_end)
+    {
+        // Process partial tile if necessary using single loads
+        block_offset += threadIdx.x;
+        while (block_offset < block_end)
+        {
+            // Load and bucket key
+            UnsignedBits key = d_keys_in[block_offset];
+            Bucket(key);
+            block_offset += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortUpsweep(
+        TempStorage &temp_storage,
+        const KeyT  *d_keys_in,
+        int         current_bit,
+        int         num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        current_bit(current_bit),
+        num_bits(num_bits)
+    {}
+
+
+    /**
+     * Compute radix digit histograms from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT          block_offset,
+        const OffsetT    &block_end)
+    {
+        // Reset digit counters in smem and unpacked counters in registers
+        ResetDigitCounters();
+        ResetUnpackedCounters();
+
+        // Unroll batches of full tiles
+        while (block_offset + UNROLLED_ELEMENTS <= block_end)
+        {
+            for (int i = 0; i < UNROLL_COUNT; ++i)
+            {
+                ProcessFullTile(block_offset);
+                block_offset += TILE_ITEMS;
+            }
+
+            CTA_SYNC();
+
+            // Aggregate back into local_count registers to prevent overflow
+            UnpackDigitCounts();
+
+            CTA_SYNC();
+
+            // Reset composite counters in lanes
+            ResetDigitCounters();
+        }
+
+        // Unroll single full tiles
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessFullTile(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process partial tile if necessary
+        ProcessPartialTile(
+            block_offset,
+            block_end);
+
+        CTA_SYNC();
+
+        // Aggregate back into local_count registers
+        UnpackDigitCounts();
+    }
+
+
+    /**
+     * Extract counts (saving them to the external array)
+     */
+    template <bool IS_DESCENDING>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT     *counters,
+        int         bin_stride = 1,
+        int         bin_offset = 0)
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+
+        // Whole blocks
+        #pragma unroll
+        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
+            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+            BIN_BASE += BLOCK_THREADS)
+        {
+            int bin_idx = BIN_BASE + threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+
+        // Remainder
+        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+        {
+            int bin_idx = threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+    }
+
+
+    /**
+     * Extract counts
+     */
+    template <int BINS_TRACKED_PER_THREAD>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_count[track] = 0;
+
+                #pragma unroll
+                for (int i = 0; i < WARP_THREADS; ++i)
+                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
+            }
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_reduce.cuh b/thrust/dependencies/cub/cub/agent/agent_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0f3ba75105a7f4e6afc0e285f1cb2d9bb729709c
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_reduce.cuh
@@ -0,0 +1,386 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+    int                     NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                     NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename                ComputeT,                       ///< Dominant compute type
+    int                     _VECTOR_LOAD_LENGTH,            ///< Number of items per vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,               ///< Cooperative block-wide reduction algorithm to use
+    CacheLoadModifier       _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    typename                ScalingType =  MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+struct AgentReducePolicy :
+    ScalingType
+{
+    enum
+    {
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
+    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
+    typename InputIteratorT,           ///< Random-access iterator type for input
+    typename OutputIteratorT,          ///< Random-access iterator type for output
+    typename OffsetT,                  ///< Signed integer type for global offsets
+    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct AgentReduce
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    /// Vector type of InputT for data movement
+    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
+                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+    };
+
+    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+    /// Parameterized BlockReduce primitive
+    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        typename BlockReduceT::TempStorage  reduce;
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorT          d_in;               ///< Input data to reduce
+    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+
+
+    //---------------------------------------------------------------------
+    // Utility
+    //---------------------------------------------------------------------
+
+
+    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        d_in,
+        Int2Type<true>  /*can_vectorize*/)
+    {
+        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+    }
+
+    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        /*d_in*/,
+        Int2Type<false> /*can_vectorize*/)
+    {
+        return false;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorT          d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Tile consumption
+    //---------------------------------------------------------------------
+
+    /**
+     * Consume a full tile of input (non-vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        OutputT items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a full tile of input (vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+        // Fabricate a vectorized input iterator
+        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
+            reinterpret_cast<VectorT*>(d_in_unqualified));
+
+        // Load items as vector items
+        InputT input_items[ITEMS_PER_THREAD];
+        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
+        #pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+        // Convert from input type to output type
+        OutputT items[ITEMS_PER_THREAD];
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+            items[i] = input_items[i];
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a partial tile of input
+     */
+    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     valid_items,        ///< The number of valid items in the tile
+        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+            thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+            OutputT item        (d_wrapped_in[block_offset + thread_offset]);
+            thread_aggregate    = reduction_op(thread_aggregate, item);
+            thread_offset       += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    template <int CAN_VECTORIZE>
+    __device__ __forceinline__ OutputT ConsumeRange(
+        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor
+        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        OutputT thread_aggregate;
+
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+        {
+            // First tile isn't full (not all threads have valid items)
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;
+
+        // Consume subsequent full tiles of input
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+        {
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
+        }
+
+        // Consume a partially-full tile
+        if (even_share.block_offset < even_share.block_end)
+        {
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeRange(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeTiles(
+        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
+    {
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_reduce_by_key.cuh b/thrust/dependencies/cub/cub/agent/agent_reduce_by_key.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..01eded8975c27ec673cf558f6a2b4dfe5413afc1
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_reduce_by_key.cuh
@@ -0,0 +1,547 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduceByKey
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentReduceByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
+    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentReduceByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
+
+    // Tuple type for pairing keys and values
+    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+    // Guarded inequality functor
+    template <typename _EqualityOpT>
+    struct GuardedInequalityWrapper
+    {
+        _EqualityOpT     op;             ///< Wrapped equality operator
+        int             num_remaining;  ///< Items remaining
+
+        /// Constructor
+        __host__ __device__ __forceinline__
+        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
+
+        /// Boolean inequality operator, returns <tt>(a != b)</tt>
+        template <typename T>
+        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
+        {
+            if (idx < num_remaining)
+                return !op(a, b);   // In bounds
+
+            // Return true if first out-of-bounds item, false otherwise
+            return (idx == num_remaining);
+       }
+    };
+
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            KeysInputIteratorT>::Type                                                                   // Directly use the supplied input iterator type
+        WrappedKeysInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            ValuesInputIteratorT>::Type                                                                 // Directly use the supplied input iterator type
+        WrappedValuesInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                            // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            KeyOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadKeysT;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            ValueOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadValuesT;
+
+    // Parameterized BlockDiscontinuity type for keys
+    typedef BlockDiscontinuity<
+            KeyOutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityKeys;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetValuePairT,
+            BLOCK_THREADS,
+            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Key and value exchange types
+    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
+    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadKeysT::TempStorage load_keys;
+
+        // Smem needed for loading values
+        typename BlockLoadValuesT::TempStorage load_values;
+
+        // Smem needed for compacting key value pairs(allows non POD items in this union)
+        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
+    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
+    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
+    EqualityOpT                     equality_op;        ///< KeyT equality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentReduceByKey(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        KeysInputIteratorT          d_keys_in,          ///< Input keys
+        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
+        ValuesInputIteratorT        d_values_in,        ///< Input values
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(d_keys_in),
+        d_unique_out(d_unique_out),
+        d_values_in(d_values_in),
+        d_aggregates_out(d_aggregates_out),
+        d_num_runs_out(d_num_runs_out),
+        equality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Directly scatter flagged items to output offsets
+     */
+    __device__ __forceinline__ void ScatterDirect(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
+    {
+        // Scatter flagged keys and values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
+                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
+            }
+        }
+    }
+
+
+    /**
+     * 2-phase scatter flagged items to output offsets
+     *
+     * The exclusive scan causes each head flag to be paired with the previous
+     * value aggregate: the scatter offsets must be decremented for value aggregates
+     */
+    __device__ __forceinline__ void ScatterTwoPhase(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        CTA_SYNC();
+
+        // Compact and scatter pairs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
+            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
+            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
+        }
+    }
+
+
+    /**
+     * Scatter flagged items
+     */
+    __device__ __forceinline__ void Scatter(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
+        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+        {
+            ScatterTwoPhase(
+                scatter_items,
+                segment_flags,
+                segment_indices,
+                num_tile_segments,
+                num_tile_segments_prefix);
+        }
+        else
+        {
+            ScatterDirect(
+                scatter_items,
+                segment_flags,
+                segment_indices);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
+        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
+        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
+        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
+        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
+        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
+        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
+
+        // Load keys
+        if (IS_LAST_TILE)
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
+        else
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+
+        // Load tile predecessor key in first thread
+        KeyOutputT tile_predecessor;
+        if (threadIdx.x == 0)
+        {
+            tile_predecessor = (tile_idx == 0) ?
+                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
+                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
+        }
+
+        CTA_SYNC();
+
+        // Load values
+        if (IS_LAST_TILE)
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
+        else
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
+
+        CTA_SYNC();
+
+        // Initialize head-flags and shuffle up the previous keys
+        if (IS_LAST_TILE)
+        {
+            // Use custom flag operator to additionally flag the first out-of-bounds item
+            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+        else
+        {
+            InequalityWrapper<EqualityOpT> flag_op(equality_op);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+
+        // Zip values and head flags
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scan_items[ITEM].value  = values[ITEM];
+            scan_items[ITEM].key    = head_flags[ITEM];
+        }
+
+        // Perform exclusive tile scan
+        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
+        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
+        OffsetValuePairT    total_aggregate;        // The tile prefix folded with block_aggregate
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
+            num_segments_prefix     = 0;
+            total_aggregate         = block_aggregate;
+
+            // Update tile status if there are successor tiles
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+
+            block_aggregate         = prefix_op.GetBlockAggregate();
+            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
+            total_aggregate         = prefix_op.GetInclusivePrefix();
+        }
+
+        // Rezip scatter items and segment indices
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scatter_items[ITEM].key     = prev_keys[ITEM];
+            scatter_items[ITEM].value   = scan_items[ITEM].value;
+            segment_indices[ITEM]       = scan_items[ITEM].key;
+        }
+
+        // At this point, each flagged segment head has:
+        //  - The key for the previous segment
+        //  - The reduced value from the previous segment
+        //  - The segment index for the reduced value
+
+        // Scatter flagged keys and values
+        OffsetT num_tile_segments = block_aggregate.key;
+        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
+
+        // Last thread in last tile will output final count (and last pair, if necessary)
+        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
+        {
+            OffsetT num_segments = num_segments_prefix + num_tile_segments;
+
+            // If the last tile is a whole tile, output the final_value
+            if (num_remaining == TILE_ITEMS)
+            {
+                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
+                d_aggregates_out[num_segments]  = total_aggregate.value;
+                num_segments++;
+            }
+
+            // Output the total number of items selected
+            *d_num_runs_out = num_segments;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_rle.cuh b/thrust/dependencies/cub/cub/agent/agent_rle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..79697b7ec3335c49731f92db59849f648f36bfdc
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_rle.cuh
@@ -0,0 +1,837 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRle
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentRlePolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
+ */
+template <
+    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename    InputIteratorT,         ///< Random-access input iterator type for data
+    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
+    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
+    typename    EqualityOpT,            ///< T equality operator type
+    typename    OffsetT>                ///< Signed integer type for global offsets
+struct AgentRle
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    /// The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    /// Tuple type for scanning (pairs run-length and run-index)
+    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
+
+    /// Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
+        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
+        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// Whether or not to sync after loading data
+        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
+        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
+    };
+
+
+    /**
+     * Special operator that signals all out-of-bounds items are not equal to everything else,
+     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
+     * trivial.
+     */
+    template <bool LAST_TILE>
+    struct OobInequalityOp
+    {
+        OffsetT         num_remaining;
+        EqualityOpT      equality_op;
+
+        __device__ __forceinline__ OobInequalityOp(
+            OffsetT     num_remaining,
+            EqualityOpT  equality_op)
+        :
+            num_remaining(num_remaining),
+            equality_op(equality_op)
+        {}
+
+        template <typename Index>
+        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
+        {
+            if (!LAST_TILE || (idx < num_remaining))
+                return !equality_op(first, second);
+            else
+                return true;
+        }
+    };
+
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,      // Wrap the native input pointer with CacheModifiedVLengthnputIterator
+            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type for data
+    typedef BlockLoad<
+            T,
+            AgentRlePolicyT::BLOCK_THREADS,
+            AgentRlePolicyT::ITEMS_PER_THREAD,
+            AgentRlePolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockDiscontinuity type for data
+    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+    // Parameterized WarpScan type
+    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
+
+    // Reduce-length-by-run scan operator
+    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            LengthOffsetPair,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Warp exchange types
+    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
+
+    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
+
+    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
+    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
+
+    typedef LengthOffsetPair WarpAggregates[WARPS];
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        // Aliasable storage layout
+        union Aliasable
+        {
+            struct
+            {
+                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
+                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
+                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
+                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
+            };
+
+            // Smem needed for input loading
+            typename BlockLoadT::TempStorage                    load;
+
+            // Aliasable layout needed for two-phase scatter
+            union ScatterAliasable
+            {
+                unsigned long long                              align;
+                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
+
+            } scatter_aliasable;
+
+        } aliasable;
+
+        OffsetT             tile_idx;                   // Shared tile index
+        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
+        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+
+    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
+    OffsetsOutputIteratorT          d_offsets_out;      ///< Input run offsets
+    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
+
+    EqualityOpT                     equality_op;        ///< T equality operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentRle(
+        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
+        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
+        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
+        EqualityOpT                 equality_op,        ///< [in] T equality operator
+        OffsetT                     num_items)          ///< [in] Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_offsets_out(d_offsets_out),
+        d_lengths_out(d_lengths_out),
+        equality_op(equality_op),
+        scan_op(cub::Sum()),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    template <bool FIRST_TILE, bool LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT             tile_offset,
+        OffsetT             num_remaining,
+        T                   (&items)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        bool                head_flags[ITEMS_PER_THREAD];
+        bool                tail_flags[ITEMS_PER_THREAD];
+
+        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
+
+        if (FIRST_TILE && LAST_TILE)
+        {
+            // First-and-last-tile always head-flags the first item and tail-flags the last item
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, items, inequality_op);
+        }
+        else if (FIRST_TILE)
+        {
+            // First-tile always head-flags the first item
+
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, tile_successor_item, items, inequality_op);
+        }
+        else if (LAST_TILE)
+        {
+            // Last-tile always flags the last item
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
+        }
+        else
+        {
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
+        }
+
+        // Zip counts and runs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            lengths_and_num_runs[ITEM].key      = head_flags[ITEM] && (!tail_flags[ITEM]);
+            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scan of allocations
+     */
+    __device__ __forceinline__ void WarpScanAllocations(
+        LengthOffsetPair    &tile_aggregate,
+        LengthOffsetPair    &warp_aggregate,
+        LengthOffsetPair    &warp_exclusive_in_tile,
+        LengthOffsetPair    &thread_exclusive_in_warp,
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        // Perform warpscans
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        LengthOffsetPair identity;
+        identity.key = 0;
+        identity.value = 0;
+
+        LengthOffsetPair thread_inclusive;
+        LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
+        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
+            thread_aggregate,
+            thread_inclusive,
+            thread_exclusive_in_warp,
+            identity,
+            scan_op);
+
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
+
+        CTA_SYNC();
+
+        // Accumulate total selected and the warp-wide prefix
+        warp_exclusive_in_tile          = identity;
+        warp_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
+        tile_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[0];
+
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_exclusive_in_tile = tile_aggregate;
+
+            tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for scattering selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Two-phase scatter, specialized for warp time-slicing
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<true>      is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Locally compact items within the warp (first warp)
+        if (warp_id == 0)
+        {
+            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+        }
+
+        // Locally compact items within the warp (remaining warps)
+        #pragma unroll
+        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+            }
+        }
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Two-phase scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<false>     is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Unzip
+        OffsetT run_offsets[ITEMS_PER_THREAD];
+        LengthT run_lengths[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
+            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
+        }
+
+        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
+            run_offsets, thread_num_runs_exclusive_in_warp);
+
+        WARP_SYNC(0xffffffff);
+
+        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
+            run_lengths, thread_num_runs_exclusive_in_warp);
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = run_offsets[ITEM];
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Direct scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    thread_num_runs_exclusive_in_warp[ITEM];
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if (item_offset >= 1)
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OffsetT             tile_num_runs_aggregate,
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
+        {
+            // Direct scatter if the warp has any items
+            if (warp_num_runs_aggregate)
+            {
+                ScatterDirect<FIRST_TILE>(
+                    tile_num_runs_exclusive_in_global,
+                    warp_num_runs_aggregate,
+                    warp_num_runs_exclusive_in_tile,
+                    thread_num_runs_exclusive_in_warp,
+                    lengths_and_offsets);
+            }
+        }
+        else
+        {
+            // Scatter two phase
+            ScatterTwoPhase<FIRST_TILE>(
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets,
+                Int2Type<STORE_WARP_TIME_SLICING>());
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <
+        bool                LAST_TILE>
+    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
+        OffsetT             num_items,          ///< Total number of global input items
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT      &tile_status)       ///< Global list of tile status
+    {
+        if (tile_idx == 0)
+        {
+            // First tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<true, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // Update tile status if this is not the last tile
+            if (!LAST_TILE && (threadIdx.x == 0))
+                tile_status.SetInclusive(0, tile_aggregate);
+
+            // Update thread_exclusive_in_warp to fold in warp run-length
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
+
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+
+            // Downsweep scan through lengths_and_num_runs
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = 0;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<true>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return tile_aggregate;
+        }
+        else
+        {
+            // Not first tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<false, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // First warp computes tile prefix in lane 0
+            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
+            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+            if (warp_id == 0)
+            {
+                prefix_op(tile_aggregate);
+                if (threadIdx.x == 0)
+                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
+            }
+
+            CTA_SYNC();
+
+            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
+
+            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
+            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += thread_exclusive.value;
+
+            // Downsweep scan through lengths_and_num_runs
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<false>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return prefix_op.inclusive_prefix;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumRunsIteratorT>            ///< Output iterator type for recording number of items selected
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_tiles,              ///< Total number of input tiles
+        ScanTileStateT&     tile_status,            ///< Global list of tile status
+        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selected
+                *d_num_runs_out = running_total.key;
+
+                // The inclusive prefix contains accumulated length reduction for the last run
+                if (running_total.key > 0)
+                    d_lengths_out[running_total.key - 1] = running_total.value;
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_scan.cuh b/thrust/dependencies/cub/cub/agent/agent_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0781b3e9e9fb140dc19ea17351bafa1b36b94e7c
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_scan.cuh
@@ -0,0 +1,469 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScan
+ */
+template <
+    int                         NOMINAL_BLOCK_THREADS_4B,       ///< Threads per thread block
+    int                         NOMINAL_ITEMS_PER_THREAD_4B,    ///< Items per thread (per tile of input)
+    typename                    ComputeT,                       ///< Dominant compute type
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM,                ///< The BlockScan algorithm to use
+    typename                    ScalingType =  MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
+
+struct AgentScanPolicy :
+    ScalingType
+{
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
+ */
+template <
+    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
+    typename InputIteratorT,        ///< Random-access input iterator type
+    typename OutputIteratorT,       ///< Random-access output iterator type
+    typename ScanOpT,               ///< Scan functor type
+    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+    typename OffsetT>               ///< Signed integer type for global offsets
+struct AgentScan
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OutputT> ScanTileStateT;
+
+    // Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Constants
+    enum
+    {
+        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
+        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockStore type
+    typedef BlockStore<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::STORE_ALGORITHM>
+        BlockStoreT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OutputT,
+            ScanOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef BlockScanRunningPrefixOp<
+            OutputT,
+            ScanOpT>
+        RunningPrefixCallbackOp;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
+        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
+
+        struct
+        {
+            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
+            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&               temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT       d_in;               ///< Input data
+    OutputIteratorT             d_out;              ///< Output data
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    InitValueT                  init_value;         ///< The init_value element for ScanOpT
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        OutputT             init_value,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
+        block_aggregate = scan_op(init_value, block_aggregate);
+    }
+
+
+    /**
+     * Inclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        InitValueT          /*init_value*/,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * Exclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    /**
+     * Inclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentScan(
+        TempStorage&    temp_storage,       ///< Reference to temp_storage
+        InputIteratorT  d_in,               ///< Input data
+        OutputIteratorT d_out,              ///< Output data
+        ScanOpT         scan_op,            ///< Binary scan operator
+        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        init_value(init_value)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan an sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool                        IS_FIRST_TILE,
+        bool                        IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT                     tile_offset,                ///< Tile offset
+        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Block scan
+        if (IS_FIRST_TILE)
+        {
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
+        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(scan_op);
+
+        if (range_offset + TILE_ITEMS <= range_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (range_offset + TILE_ITEMS <= range_end)
+            {
+                ConsumeTile<false, true>(range_offset, prefix_op);
+                range_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (range_offset < range_end)
+            {
+                int valid_items = range_end - range_offset;
+                ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = range_end - range_offset;
+            ConsumeTile<true, false>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
+        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (range_offset + TILE_ITEMS <= range_end)
+        {
+            ConsumeTile<true, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (range_offset < range_end)
+        {
+            int valid_items = range_end - range_offset;
+            ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_segment_fixup.cuh b/thrust/dependencies/cub/cub/agent/agent_segment_fixup.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9cd524aa21c9a893de5ce561bfa57ab54d04bc51
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_segment_fixup.cuh
@@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSegmentFixup
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSegmentFixupPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentSegmentFixup
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of key-value input iterator
+    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
+
+    // Value type
+    typedef typename KeyValuePairT::Value ValueT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Whether or not do fixup using RLE + global atomics
+        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
+                                (Equals<ValueT, float>::VALUE || 
+                                 Equals<ValueT, int>::VALUE ||
+                                 Equals<ValueT, unsigned int>::VALUE ||
+                                 Equals<ValueT, unsigned long long>::VALUE),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
+        WrappedPairsInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for pairs
+    typedef BlockLoad<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
+        BlockLoadPairs;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            KeyValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadPairs::TempStorage load_pairs;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSegmentFixup(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        PairsInputIteratorT         d_pairs_in,          ///< Input keys
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_pairs_in(d_pairs_in),
+        d_aggregates_out(d_aggregates_out),
+        d_fixup_in(d_aggregates_out),
+        inequality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process input tile.  Specialized for atomic-fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        // RLE 
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
+            if (pairs[ITEM].key != pairs[ITEM - 1].key)
+                atomicAdd(d_scatter, pairs[ITEM - 1].value);
+            else
+                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
+        }
+
+        // Flush last item if valid
+        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
+        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
+            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
+    }
+
+
+    /**
+     * Process input tile.  Specialized for reduce-by-key fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        CTA_SYNC();
+
+        KeyValuePairT tile_aggregate;
+        if (tile_idx == 0)
+        {
+            // Exclusive scan of values and segment_flags
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
+
+            // Update tile status if this is not the last tile
+            if (threadIdx.x == 0)
+            {
+                // Set first segment id to not trigger a flush (invalid from exclusive scan)
+                scatter_pairs[0].key = pairs[0].key;
+
+                if (!IS_LAST_TILE)
+                    tile_state.SetInclusive(0, tile_aggregate);
+
+            }
+        }
+        else
+        {
+            // Exclusive scan of values and segment_flags
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
+            tile_aggregate = prefix_op.GetBlockAggregate();
+        }
+
+        // Scatter updated values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
+            {
+                // Update the value at the key location
+                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
+                value           = reduction_op(value, scatter_pairs[ITEM].value);
+
+                d_aggregates_out[scatter_pairs[ITEM].key] = value;
+            }
+        }
+
+        // Finalize the last item
+        if (IS_LAST_TILE)
+        {
+            // Last thread will output final count and last item, if necessary
+            if (threadIdx.x == BLOCK_THREADS - 1)
+            {
+                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+                if (num_remaining == TILE_ITEMS)
+                {
+                    // Update the value at the key location
+                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
+                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        int                 num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_select_if.cuh b/thrust/dependencies/cub/cub/agent/agent_select_if.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e9568f3b00f8e693a6927355ecdc25212296b017
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_select_if.cuh
@@ -0,0 +1,703 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../config.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSelectIf
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSelectIfPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
+    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct AgentSelectIf
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        USE_SELECT_OP,
+        USE_SELECT_FLAGS,
+        USE_DISCONTINUITY,
+
+        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
+
+        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
+                                    USE_SELECT_OP :
+                                    (!Equals<FlagT, NullType>::VALUE) ?
+                                        USE_SELECT_FLAGS :
+                                        USE_DISCONTINUITY
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,        // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
+        WrappedFlagsInputIteratorT;
+
+    // Parameterized BlockLoad type for input data
+    typedef BlockLoad<
+            OutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockLoad type for flags
+    typedef BlockLoad<
+            FlagT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadFlags;
+
+    // Parameterized BlockDiscontinuity type for items
+    typedef BlockDiscontinuity<
+            OutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetT,
+            BLOCK_THREADS,
+            AgentSelectIfPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetT,
+            cub::Sum,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Item exchange type
+    typedef OutputT ItemExchangeT[TILE_ITEMS];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading items
+        typename BlockLoadT::TempStorage load_items;
+
+        // Smem needed for loading values
+        typename BlockLoadFlags::TempStorage load_flags;
+
+        // Smem needed for compacting items (allows non POD items in this union)
+        Uninitialized<ItemExchangeT> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT           d_in;               ///< Input items
+    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
+    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
+    SelectOpT                       select_op;          ///< Selection operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSelectIf(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        InputIteratorT              d_in,               ///< Input data
+        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,     ///< Output data
+        SelectOpT                   select_op,          ///< Selection operator
+        EqualityOpT                 equality_op,        ///< Equality operator
+        OffsetT                     num_items)          ///< Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_flags_in(d_flags_in),
+        d_selected_out(d_selected_out),
+        select_op(select_op),
+        inequality_op(equality_op),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize selections (specialized for selection operator)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     /*tile_offset*/,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_OP>     /*select_method*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Out-of-bounds items are selection_flags
+            selection_flags[ITEM] = 1;
+
+            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+                selection_flags[ITEM] = select_op(items[ITEM]);
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for valid flags)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
+    {
+        CTA_SYNC();
+
+        FlagT flags[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Out-of-bounds items are selection_flags
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
+        }
+        else
+        {
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+        }
+
+        // Convert flag type to selection_flags type
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            selection_flags[ITEM] = flags[ITEM];
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for discontinuity detection)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_DISCONTINUITY> /*select_method*/)
+    {
+        if (IS_FIRST_TILE)
+        {
+            CTA_SYNC();
+
+            // Set head selection_flags.  First tile sets the first flag for the first item
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
+        }
+        else
+        {
+            OutputT tile_predecessor;
+            if (threadIdx.x == 0)
+                tile_predecessor = d_in[tile_offset - 1];
+
+            CTA_SYNC();
+
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
+        }
+
+        // Set selection flags for out-of-bounds items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Set selection_flags for out-of-bounds items
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scatter flagged items to output offsets (specialized for direct scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OutputT (&items)[ITEMS_PER_THREAD],
+        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
+        OffsetT num_selections)
+    {
+        // Scatter flagged items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (selection_flags[ITEM])
+            {
+                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
+                {
+                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
+        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        // Compact and scatter items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
+            if (selection_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
+        {
+            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
+            int local_rejection_idx     = item_idx - local_selection_idx;
+            int local_scatter_offset    = (selection_flags[ITEM]) ?
+                                            tile_num_rejections + local_selection_idx :
+                                            local_rejection_idx;
+
+            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // Gather items from shared memory and scatter to global
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
+            int rejection_idx       = item_idx;
+            int selection_idx       = item_idx - tile_num_rejections;
+            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
+                                        num_items - num_rejected_prefix - rejection_idx - 1 :
+                                        num_selections_prefix + selection_idx;
+
+            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
+
+            if (!IS_LAST_TILE || (item_idx < num_tile_items))
+            {
+                d_selected_out[scatter_offset] = item;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        OffsetT         num_selections)                             ///< Total number of selections including this tile
+    {
+        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
+        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
+        {
+            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_rejected_prefix,
+                Int2Type<KEEP_REJECTS>());
+        }
+        else
+        {
+            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_selections);
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeFirstTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<true, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();
+
+        // Exclusive scan of selection_flags
+        OffsetT num_tile_selections;
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
+
+        if (threadIdx.x == 0)
+        {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+                tile_state.SetInclusive(0, num_tile_selections);
+        }
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+            num_tile_selections -= (TILE_ITEMS - num_tile_items);
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, true>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            0,
+            0,
+            num_tile_selections);
+
+        return num_tile_selections;
+    }
+
+
+    /**
+     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<false, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();
+
+        // Exclusive scan of values and selection_flags
+        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
+
+        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
+        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
+        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
+        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+        {
+            int num_discount    = TILE_ITEMS - num_tile_items;
+            num_selections      -= num_discount;
+            num_tile_selections -= num_discount;
+        }
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, false>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            num_selections_prefix,
+            num_rejected_prefix,
+            num_selections);
+
+        return num_selections;
+    }
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeTile(
+        int                 num_tile_items,         ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OffsetT num_selections;
+        if (tile_idx == 0)
+        {
+            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
+        }
+        else
+        {
+            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
+        }
+
+        return num_selections;
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
+        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
+        }
+        else
+        {
+            // The last tile (possibly partially-full)
+            OffsetT num_remaining   = num_items - tile_offset;
+            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selection_flags
+                *d_num_selected_out = num_selections;
+            }
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/agent_spmv_orig.cuh b/thrust/dependencies/cub/cub/agent/agent_spmv_orig.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..810f893fbecf278888de11ed27266105d5ea62f0
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/agent_spmv_orig.cuh
@@ -0,0 +1,670 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_reduce.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../config.cuh"
+#include "../thread/thread_search.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/counting_input_iterator.cuh"
+#include "../iterator/tex_ref_input_iterator.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSpmv
+ */
+template <
+    int                             _BLOCK_THREADS,                         ///< Threads per thread block
+    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
+    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
+    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
+    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
+    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
+    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
+    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
+    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
+struct AgentSpmvPolicy
+{
+    enum
+    {
+        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
+        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
+        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
+    };
+
+    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
+    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
+    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
+    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+template <
+    typename        ValueT,              ///< Matrix and vector value type
+    typename        OffsetT>             ///< Signed integer type for sequence offsets
+struct SpmvParams
+{
+    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+    int             num_rows;            ///< Number of rows of matrix <b>A</b>.
+    int             num_cols;            ///< Number of columns of matrix <b>A</b>.
+    int             num_nonzeros;        ///< Number of nonzero elements of matrix <b>A</b>.
+    ValueT          alpha;               ///< Alpha multiplicand
+    ValueT          beta;                ///< Beta addend-multiplicand
+
+    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;
+};
+
+
+/**
+ * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT,                    ///< Signed integer type for sequence offsets
+    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
+    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
+    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
+struct AgentSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    /// 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    /// Input iterator wrapper types (for applying cache modifiers)
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        ColumnIndicesIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        ValueIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // BlockReduce specialization
+    typedef BlockReduce<
+            ValueT,
+            BLOCK_THREADS,
+            BLOCK_REDUCE_WARP_REDUCTIONS>
+        BlockReduceT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            ValueT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockPrefixSumT;
+
+    // BlockExchange specialization
+    typedef BlockExchange<
+            ValueT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD>
+        BlockExchangeT;
+
+    /// Merge item type (either a non-zero value or a row-end offset)
+    union MergeItem
+    {
+        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
+        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
+
+        OffsetT     row_end_offset;
+        MergeValueT nonzero;
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CoordinateT tile_coords[2];
+
+        union Aliasable
+        {
+            // Smem needed for tile of merge items
+            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
+
+            // Smem needed for block exchange
+            typename BlockExchangeT::TempStorage exchange;
+
+            // Smem needed for block-wide reduction
+            typename BlockReduceT::TempStorage reduce;
+
+            // Smem needed for tile scanning
+            typename BlockScanT::TempStorage scan;
+
+            // Smem needed for tile prefix sum
+            typename BlockPrefixSumT::TempStorage prefix_sum;
+
+        } aliasable;
+    };
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+
+    _TempStorage&                   temp_storage;         /// Reference to temp_storage
+
+    SpmvParams<ValueT, OffsetT>&    spmv_params;
+
+    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    VectorValueIteratorT            wd_vector_y;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentSpmv(
+        TempStorage&                    temp_storage,           ///< Reference to temp_storage
+        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
+    :
+        temp_storage(temp_storage.Alias()),
+        spmv_params(spmv_params),
+        wd_values(spmv_params.d_values),
+        wd_row_end_offsets(spmv_params.d_row_end_offsets),
+        wd_column_indices(spmv_params.d_column_indices),
+        wd_vector_x(spmv_params.d_vector_x),
+        wd_vector_y(spmv_params.d_vector_y)
+    {}
+
+
+
+
+    /**
+     * Consume a merge tile, specialized for direct-load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+
+        ValueT          running_total = 0.0;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
+            OffsetT column_idx          = wd_column_indices[nonzero_idx];
+            ValueT  value               = wd_values[nonzero_idx];
+
+            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+            vector_value                = wd_vector_x[column_idx];
+#endif
+            ValueT  nonzero             = value * vector_value;
+
+            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
+
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                running_total += nonzero;
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = tile_num_rows;
+                ++thread_current_coord.y;
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = thread_current_coord.x;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key   = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (tile_num_rows > 0)
+        {
+            if (threadIdx.x == 0)
+                scan_item.key = -1;
+
+            // Direct scatter
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM].key < tile_num_rows)
+                {
+                    if (scan_item.key == scan_segment[ITEM].key)
+                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
+
+                    if (HAS_ALPHA)
+                    {
+                        scan_segment[ITEM].value *= spmv_params.alpha;
+                    }
+
+                    if (HAS_BETA)
+                    {
+                        // Update the output vector element
+                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
+                        scan_segment[ITEM].value += addend;
+                    }
+
+                    // Set the output vector element
+                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
+                }
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
+
+
+
+    /**
+     * Consume a merge tile, specialized for indirect load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+
+#if (CUB_PTX_ARCH >= 520)
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
+
+            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
+            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
+            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
+
+            if (nonzero_idx < tile_num_nonzeros)
+            {
+
+                OffsetT column_idx              = *ci;
+                ValueT  value                   = *a;
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+                vector_value                    = wd_vector_x[column_idx];
+
+                ValueT  nonzero                 = value * vector_value;
+
+                *s    = nonzero;
+            }
+        }
+
+
+#else
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        if (tile_num_nonzeros > 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
+                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
+
+                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
+                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+                vector_value                    = wd_vector_x[column_idx];
+#endif
+                ValueT  nonzero                 = value * vector_value;
+
+                s_tile_nonzeros[nonzero_idx]    = nonzero;
+            }
+        }
+
+#endif
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        #pragma unroll 1
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+        ValueT          running_total = 0.0;
+
+        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
+        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                scan_segment[ITEM].value    = nonzero;
+                running_total               += nonzero;
+                ++thread_current_coord.y;
+                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = 0.0;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
+            }
+
+            scan_segment[ITEM].key = thread_current_coord.x;
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (threadIdx.x == 0)
+        {
+            scan_item.key = thread_start_coord.x;
+            scan_item.value = 0.0;
+        }
+
+        if (tile_num_rows > 0)
+        {
+
+            CTA_SYNC();
+
+            // Scan downsweep and scatter
+            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
+
+            if (scan_item.key != scan_segment[0].key)
+            {
+                s_partials[scan_item.key] = scan_item.value;
+            }
+            else
+            {
+                scan_segment[0].value += scan_item.value;
+            }
+
+            #pragma unroll
+            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
+                {
+                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
+                }
+                else
+                {
+                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll 1
+            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
+            {
+                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
+
+
+    /**
+     * Consume input tile
+     */
+    __device__ __forceinline__ void ConsumeTile(
+        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
+        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+        int             num_merge_tiles)        ///< [in] Number of merge tiles
+    {
+        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+
+        if (tile_idx >= num_merge_tiles)
+            return;
+
+        // Read our starting coordinates
+        if (threadIdx.x < 2)
+        {
+            if (d_tile_coordinates == NULL)
+            {
+                // Search our starting coordinates
+                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
+                CoordinateT                     tile_coord;
+                CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+                // Search the merge path
+                MergePathSearch(
+                    diagonal,
+                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+                    nonzero_indices,
+                    spmv_params.num_rows,
+                    spmv_params.num_nonzeros,
+                    tile_coord);
+
+                temp_storage.tile_coords[threadIdx.x] = tile_coord;
+            }
+            else
+            {
+                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
+            }
+        }
+
+        CTA_SYNC();
+
+        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
+        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
+
+        // Consume multi-segment tile
+        KeyValuePairT tile_carry = ConsumeTile(
+            tile_idx,
+            tile_start_coord,
+            tile_end_coord,
+            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
+
+        // Output the tile's carry-out
+        if (threadIdx.x == 0)
+        {
+            if (HAS_ALPHA)
+                tile_carry.value *= spmv_params.alpha;
+
+            tile_carry.key += tile_start_coord.x;
+            d_tile_carry_pairs[tile_idx]    = tile_carry;
+        }
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/agent/single_pass_scan_operators.cuh b/thrust/dependencies/cub/cub/agent/single_pass_scan_operators.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..924ef2a7aca8732918526a5027bb58ea32c05c30
--- /dev/null
+++ b/thrust/dependencies/cub/cub/agent/single_pass_scan_operators.cuh
@@ -0,0 +1,814 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Callback operator types for supplying BlockScan prefixes
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Prefix functor type for maintaining a running prefix while scanning a
+ * region independent of other thread blocks
+ ******************************************************************************/
+
+/**
+ * Stateful callback operator type for supplying BlockScan prefixes.
+ * Maintains a running prefix that can be applied to consecutive
+ * BlockScan operations.
+ */
+template <
+    typename T,                 ///< BlockScan value type
+    typename ScanOpT>            ///< Wrapped scan operator type
+struct BlockScanRunningPrefixOp
+{
+    ScanOpT     op;                 ///< Wrapped scan operator
+    T           running_total;      ///< Running block-wide prefix
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
+    :
+        op(op)
+    {}
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRunningPrefixOp(
+        T starting_prefix,
+        ScanOpT op)
+    :
+        op(op),
+        running_total(starting_prefix)
+    {}
+
+    /**
+     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
+     */
+    __device__ __forceinline__ T operator()(
+        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
+    {
+        T retval = running_total;
+        running_total = op(running_total, block_aggregate);
+        return retval;
+    }
+};
+
+
+/******************************************************************************
+ * Generic tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Enumerations of tile status
+ */
+enum ScanTileStatus
+{
+    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
+    SCAN_TILE_INVALID = 99, // Not yet processed
+    SCAN_TILE_PARTIAL,      // Tile aggregate is available
+    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
+};
+
+
+/**
+ * Tile status interface.
+ */
+template <
+    typename    T,
+    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
+struct ScanTileState;
+
+
+/**
+ * Tile status interface specialized for scan status and value types
+ * that can be combined into one machine word that can be
+ * read/written coherently in a single access.
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+    // Status word type
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+
+    // Unit word type
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                uchar2>::Type>::Type>::Type TxnWord;
+
+
+    // Device word type
+    struct TileDescriptor
+    {
+        StatusWord  status;
+        T           value;
+    };
+
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+
+    // Device storage
+    TxnWord *d_tile_descriptors;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+        TxnWord val = TxnWord();
+        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+
+        status = tile_descriptor.status;
+        value = tile_descriptor.value;
+    }
+
+};
+
+
+
+/**
+ * Tile status interface specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <typename T>
+struct ScanTileState<T, false>
+{
+    // Status word type
+    typedef char StatusWord;
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Device storage
+    StatusWord  *d_tile_status;
+    T           *d_tile_partial;
+    T           *d_tile_inclusive;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_status(NULL),
+        d_tile_partial(NULL),
+        d_tile_inclusive(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,                          ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        cudaError_t error = cudaSuccess;
+        do
+        {
+            void*   allocations[3] = {};
+            size_t  allocation_sizes[3];
+
+            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
+            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
+            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
+
+            // Compute allocation pointers into the single storage blob
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Alias the offsets
+            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
+            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
+            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        // Specify storage allocation requirements
+        size_t  allocation_sizes[3];
+        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
+        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
+        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
+
+        // Set the necessary size of the blob
+        void* allocations[3] = {};
+        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        // Update tile inclusive value
+        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        // Update tile partial value
+        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        do {
+            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+
+            __threadfence();    // prevent hoisting loads from loop or loads below above this one
+
+        } while (status == SCAN_TILE_INVALID);
+
+        if (status == StatusWord(SCAN_TILE_PARTIAL)) 
+            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        else
+            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+    }
+};
+
+
+/******************************************************************************
+ * ReduceByKey tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Tile status interface for reduction by key.
+ *
+ */
+template <
+    typename    ValueT,
+    typename    KeyT,
+    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>
+struct ReduceByKeyScanTileState;
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <
+    typename    ValueT,
+    typename    KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
+    ScanTileState<KeyValuePair<KeyT, ValueT> >
+{
+    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState() : SuperClass() {}
+};
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * can be combined into one machine word that can be read/written coherently in a single access.
+ */
+template <
+    typename ValueT,
+    typename KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, true>
+{
+    typedef KeyValuePair<KeyT, ValueT>KeyValuePairT;
+
+    // Constants
+    enum
+    {
+        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
+        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
+        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
+
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Status word type
+    typedef typename If<(STATUS_WORD_SIZE == 8),
+        long long,
+        typename If<(STATUS_WORD_SIZE == 4),
+            int,
+            typename If<(STATUS_WORD_SIZE == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Status word type
+    typedef typename If<(TXN_WORD_SIZE == 16),
+        longlong2,
+        typename If<(TXN_WORD_SIZE == 8),
+            long long,
+            int>::Type>::Type TxnWord;
+
+    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
+    struct TileDescriptorBigStatus
+    {
+        KeyT        key;
+        ValueT      value;
+        StatusWord  status;
+    };
+
+    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
+    struct TileDescriptorLittleStatus
+    {
+        ValueT      value;
+        StatusWord  status;
+        KeyT        key;
+    };
+
+    // Device word type
+    typedef typename If<
+            (sizeof(ValueT) == sizeof(KeyT)),
+            TileDescriptorBigStatus,
+            TileDescriptorLittleStatus>::Type
+        TileDescriptor;
+
+
+    // Device storage
+    TxnWord *d_tile_descriptors;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
+        TxnWord         val         = TxnWord();
+        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value   = tile_inclusive.value;
+        tile_descriptor.key     = tile_inclusive.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_PARTIAL;
+        tile_descriptor.value   = tile_partial.value;
+        tile_descriptor.key     = tile_partial.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int                     tile_idx,
+        StatusWord              &status,
+        KeyValuePairT           &value)
+    {
+//        TxnWord         alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+//        TileDescriptor  tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//
+//        while (tile_descriptor.status == SCAN_TILE_INVALID)
+//        {
+//            __threadfence_block(); // prevent hoisting loads from loop
+//
+//            alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+//            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//        }
+//
+//        status      = tile_descriptor.status;
+//        value.value = tile_descriptor.value;
+//        value.key   = tile_descriptor.key;
+
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+
+        status      = tile_descriptor.status;
+        value.value = tile_descriptor.value;
+        value.key   = tile_descriptor.key;
+    }
+
+};
+
+
+/******************************************************************************
+ * Prefix call-back operator for coupling local block scan within a
+ * block-cooperative scan
+ ******************************************************************************/
+
+/**
+ * Stateful block-scan prefix functor.  Provides the the running prefix for
+ * the current tile by using the call-back warp to wait on on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename    T,
+    typename    ScanOpT,
+    typename    ScanTileStateT,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct TilePrefixCallbackOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
+
+    // Temporary storage type
+    struct _TempStorage
+    {
+        typename WarpReduceT::TempStorage   warp_reduce;
+        T                                   exclusive_prefix;
+        T                                   inclusive_prefix;
+        T                                   block_aggregate;
+    };
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileStateT::StatusWord StatusWord;
+
+    // Fields
+    _TempStorage&               temp_storage;       ///< Reference to a warp-reduction instance
+    ScanTileStateT&             tile_status;        ///< Interface to tile status
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    int                         tile_idx;           ///< The current tile index
+    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
+    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor
+    __device__ __forceinline__
+    TilePrefixCallbackOp(
+        ScanTileStateT       &tile_status,
+        TempStorage         &temp_storage,
+        ScanOpT              scan_op,
+        int                 tile_idx)
+    :
+        temp_storage(temp_storage.Alias()),
+        tile_status(tile_status),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
+    {
+        T value;
+        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
+
+        // Perform a segmented reduction to get the prefix for the current window.
+        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
+
+        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
+        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
+            value,
+            tail_flag,
+            SwizzleScanOp<ScanOpT>(scan_op));
+    }
+
+
+    // BlockScan prefix callback functor (called by the first warp)
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+
+        // Update our status with our tile-aggregate
+        if (threadIdx.x == 0)
+        {
+            temp_storage.block_aggregate = block_aggregate;
+            tile_status.SetPartial(tile_idx, block_aggregate);
+        }
+
+        int         predecessor_idx = tile_idx - threadIdx.x - 1;
+        StatusWord  predecessor_status;
+        T           window_aggregate;
+
+        // Wait for the warp-wide window of predecessor tiles to become valid
+        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+
+        // The exclusive tile prefix starts out as the current window aggregate
+        exclusive_prefix = window_aggregate;
+
+        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
+        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
+        {
+            predecessor_idx -= CUB_PTX_WARP_THREADS;
+
+            // Update exclusive tile prefix with the window prefix
+            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
+        }
+
+        // Compute the inclusive tile prefix and update the status for this tile
+        if (threadIdx.x == 0)
+        {
+            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
+            tile_status.SetInclusive(tile_idx, inclusive_prefix);
+
+            temp_storage.exclusive_prefix = exclusive_prefix;
+            temp_storage.inclusive_prefix = inclusive_prefix;
+        }
+
+        // Return exclusive_prefix
+        return exclusive_prefix;
+    }
+
+    // Get the exclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetExclusivePrefix()
+    {
+        return temp_storage.exclusive_prefix;
+    }
+
+    // Get the inclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetInclusivePrefix()
+    {
+        return temp_storage.inclusive_prefix;
+    }
+
+    // Get the block aggregate stored in temporary storage
+    __device__ __forceinline__
+    T GetBlockAggregate()
+    {
+        return temp_storage.block_aggregate;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_adjacent_difference.cuh b/thrust/dependencies/cub/cub/block/block_adjacent_difference.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c8953756db2fe3d18167352ca150a81abe08bc21
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_adjacent_difference.cuh
@@ -0,0 +1,596 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockAdjacentDifference
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(b, a, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
+        {
+            return flag_op(b, a);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = temp_storage.last_items[linear_tid - 1];
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/block/block_discontinuity.cuh b/thrust/dependencies/cub/cub/block/block_discontinuity.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..37b8c29925f94970f6feac2ddc055912f15ffb51
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_discontinuity.cuh
@@ -0,0 +1,1148 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                The data type to be flagged.
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+ *   that differ from their predecessors (or successors).  For example, head flags are convenient
+ *   for demarcating disjoint data segments as part of a segmented scan or reduction.
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockDiscontinuity}
+ * \par
+ * The code snippet below illustrates the head flagging of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *
+ *     // Allocate shared memory for BlockDiscontinuity
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute head flags for discontinuities in the segment
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+ * The corresponding output \p head_flags in those threads will be
+ * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+ *
+ * \par Performance Considerations
+ * - Incurs zero bank conflicts for most types
+ *
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockDiscontinuity
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(a, b, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
+        {
+            return flag_op(a, b);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+     * The corresponding output \p head_flags in those threads will be
+     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
+    }
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(
+     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
+     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
+     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
+     * The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head & tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = temp_storage.last_items[linear_tid - 1];
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item == ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item == ...
+     *
+     *     // Collectively compute head and flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/block/block_exchange.cuh b/thrust/dependencies/cub/cub/block/block_exchange.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fbe64afc19257b3cad22db7862341baf29eb2a13
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_exchange.cuh
@@ -0,0 +1,1246 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - It is commonplace for blocks of threads to rearrange data items between
+ *   threads.  For example, the device-accessible memory subsystem prefers access patterns
+ *   where data items are "striped" across threads (where consecutive threads access consecutive items),
+ *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
+ *   (where consecutive items belong to a single thread).
+ * - BlockExchange supports the following types of data exchanges:
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
+ *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockExchange}
+ * \par
+ * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+ * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *
+ *     // Allocate shared memory for BlockExchange
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *
+ *     // Load a tile of data striped across threads
+ *     int thread_data[4];
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ *
+ *     // Collectively exchange data into a blocked arrangement across threads
+ *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of striped input \p thread_data across the block of threads is
+ * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ * \par Performance Considerations
+ * - Proper device-specific padding ensures zero bank conflicts for most types.
+ *
+ */
+template <
+    typename    InputT,
+    int         BLOCK_DIM_X,
+    int         ITEMS_PER_THREAD,
+    bool        WARP_TIME_SLICING   = false,
+    int         BLOCK_DIM_Y         = 1,
+    int         BLOCK_DIM_Z         = 1,
+    int         PTX_ARCH            = CUB_PTX_ARCH>
+class BlockExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
+
+        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
+        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+
+        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+
+        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct __align__(16) _TempStorage
+    {
+        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{BlockExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+    unsigned int lane_id;
+    unsigned int warp_id;
+    unsigned int warp_offset;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        if (warp_id == 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                temp_storage.buff[item_offset] = input_items[ITEM];
+            }
+
+            WARP_SYNC(0xffffffff);
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                output_items[ITEM] = temp_storage.buff[item_offset];
+            }
+        }
+
+        #pragma unroll
+        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // No timeslicing
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        // Warp time-slicing
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Write a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_storage.buff[item_offset] = input_items[ITEM];
+                    }
+                }
+            }
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+            new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        #pragma unroll
+        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            CTA_SYNC();
+
+            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true> /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId()),
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        lane_id(LaneId()),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Structured exchanges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a striped arrangement across block threads
+     *     int thread_data[4];
+     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of striped input \p thread_data across the block of threads is
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across block threads into an ordered tile
+     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
+     * preparation for storing to device-accessible memory.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
+     *     int thread_data[4];
+     *     cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of warp-striped input \p thread_data across the block of threads is
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * after loading from device-accessible memory.  (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a warp-striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across warp threads into an ordered tile
+     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Scatter exchanges
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            if (ranks[ITEM] >= 0)
+                temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
+     */
+    template <typename OutputT, typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            if (is_valid[ITEM])
+                temp_storage.buff[item_offset] = input_items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            output_items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+
+    //@}  end member group
+
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        StripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToStriped(items, items);
+    }
+
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        WarpStripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToWarpStriped(items, items);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStripedGuarded(items, items, ranks);
+    }
+
+    template <typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
+    {
+        ScatterToStriped(items, items, ranks, is_valid);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+template <
+    typename    T,
+    int         ITEMS_PER_THREAD,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        // Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        T buff[WARP_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{WarpExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    int             lane_id;
+
+public:
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpExchange(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
+        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
+            temp_storage.buff[ranks[ITEM]] = items[ITEM];
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+};
+
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_histogram.cuh b/thrust/dependencies/cub/cub/block/block_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..030209063baed0b668ddd0927b0db4785517d373
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_histogram.cuh
@@ -0,0 +1,414 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_histogram_sort.cuh"
+#include "specializations/block_histogram_atomic.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
+ */
+enum BlockHistogramAlgorithm
+{
+
+    /**
+     * \par Overview
+     * Sorting followed by differentiation.  Execution is comprised of two phases:
+     * -# Sort the data using efficient radix sort
+     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     */
+    BLOCK_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * Use atomic addition to update byte counts directly
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     */
+    BLOCK_HISTO_ATOMIC,
+};
+
+
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
+/**
+ * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam BINS                 The number bins within the histogram
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ * - BlockHistogram can be optionally specialized to use different algorithms:
+ *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
+ *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockHistogram}
+ * \par
+ * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ * are partitioned across 128 threads where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ *     // Allocate shared memory for BlockHistogram
+ *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ *     // Allocate shared memory for block-wide histogram bin counts
+ *     __shared__ unsigned int smem_histogram[256];
+ *
+ *     // Obtain input samples per thread
+ *     unsigned char data[4];
+ *     ...
+ *
+ *     // Compute the block-wide histogram
+ *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ *
+ * \endcode
+ *
+ * \par Performance and Usage Considerations
+ * - The histogram output can be constructed in shared or device-accessible memory
+ * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    int                     BINS,
+    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockHistogram
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
+     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
+     * regardless.
+     */
+    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
+            BLOCK_HISTO_SORT :
+            ALGORITHM;
+
+    /// Internal specialization.
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
+        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
+
+    /// Shared memory storage layout type for BlockHistogram
+    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /// \smemstorage{BlockHistogram}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Histogram operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Initialize the shared histogram counters to zero.
+     *
+     * \par Snippet
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <typename CounterT     >
+    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
+    {
+        // Initialize histogram bin counts to zeros
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            histogram[histo_offset + linear_tid] = 0;
+        }
+    }
+
+
+    /**
+     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+     * are partitioned across 128 threads where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Compute the block-wide histogram
+     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Histogram(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Initialize histogram bin counts to zeros
+        InitHistogram(histogram);
+
+        CTA_SYNC();
+
+        // Composite the histogram
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+
+
+    /**
+     * \brief Updates an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_load.cuh b/thrust/dependencies/cub/cub/block/block_load.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d932a2c5b53b299441a2dc319a01b242b56ad996
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_load.cuh
@@ -0,0 +1,1229 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for reading linear tiles of data into the CUDA thread block.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Load directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
+        {
+            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements..
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+}
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Internal implementation for load vectorization
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
+    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T      *block_ptr,                 ///< [in] Input pointer for loading from
+    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Biggest memory access word that T is a whole multiple of
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
+
+        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
+            4 :
+            (TOTAL_WORDS % 2 == 0) ?
+                2 :
+                1,
+
+        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
+    };
+
+    // Vector type
+    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
+
+    // Vector items
+    Vector vec_items[VECTORS_PER_THREAD];
+
+    // Aliased input ptr
+    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
+
+    // Load directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
+    {
+        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
+    }
+
+    // Copy
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
+    }
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
+ *
+ * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD>
+__device__ __forceinline__ void LoadDirectBlockedVectorized(
+    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T   *block_ptr,                 ///< [in] Input pointer for loading from
+    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+}
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
+        {
+            items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]);
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
+        {
+            new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]);
+        }
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    // Load directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        items[ITEM] = oob_default;
+
+    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+
+/** @} */       // end group UtilIo
+
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockLoad abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+enum BlockLoadAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * directly from memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number items per thread).
+     */
+    BLOCK_LOAD_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
+     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the the
+     *   access stride between threads (i.e., the number items per thread) exceeds the
+     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p InputIteratorTis not a simple pointer type
+     *   - The block input offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_LOAD_VECTORIZE,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
+     * efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     */
+    BLOCK_LOAD_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
+     * read efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly larger latencies than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     * - Provisions more shared storage, but incurs smaller latencies than the
+     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     * of data is read directly from memory and then is locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
+     * requirement, only one warp's worth of shared memory is provisioned and is
+     * subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+};
+
+
+/**
+ * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockLoad class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockLoad can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory using CUDA's built-in vectorized loads as a
+ *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ *     // Allocate shared memory for BlockLoad
+ *     __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ *     // Load a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ * The set of \p thread_data across the block of threads in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename            InputT,
+    int                 BLOCK_DIM_X,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockLoad
+{
+private:
+
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Load helper
+    template <BlockLoadAlgorithm _POLICY, int DUMMY>
+    struct LoadInternal;
+
+
+    /**
+     * BLOCK_LOAD_DIRECT specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_VECTORIZE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputT               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            const InputT         *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <
+            CacheLoadModifier   MODIFIER,
+            typename            ValueType,
+            typename            OffsetT>
+        __device__ __forceinline__ void Load(
+            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
+        template <typename _InputIteratorT>
+        __device__ __forceinline__ void Load(
+            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
+            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_TRANSPOSE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal load implementation to use
+    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalLoad::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+    /// \smemstorage{BlockLoad}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockLoad()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockLoad(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Load a linear segment of items from memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items remaining unassigned).
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items)                ///< [in] Number of valid items to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
+     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
+     * The set of \p thread_data across the block of threads in those threads will be
+     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items are assigned \p -1)
+     *
+     */
+    template <typename InputIteratorT, typename DefaultT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items,                ///< [in] Number of valid items to load
+        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
+    }
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_radix_rank.cuh b/thrust/dependencies/cub/cub/block/block_radix_rank.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a98976fc2614ff6477c8e19f1a3143ba4a84ea0c
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_radix_rank.cuh
@@ -0,0 +1,695 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_scan.cuh"
+#include "../block/block_scan.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam RADIX_BITS           The number of radix bits per digit place
+ * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * Blah...
+ * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par Examples
+ * \par
+ * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
+ *      \code
+ *      #include <cub/cub.cuh>
+ *
+ *      template <int BLOCK_THREADS>
+ *      __global__ void ExampleKernel(...)
+ *      {
+ *
+ *      \endcode
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRank
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    // Integer type for digit counters (to be packed into words of type PackedCounters)
+    typedef unsigned short DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
+        unsigned long long,
+        unsigned int>::Type PackedCounter;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        BYTES_PER_COUNTER           = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
+        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
+
+        // The number of packed counters per thread (plus one for padding)
+        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
+        RAKING_SEGMENT              = PADDED_COUNTER_LANES,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    };
+
+private:
+
+
+    /// BlockScan type
+    typedef BlockScan<
+            PackedCounter,
+            BLOCK_DIM_X,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScan;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        union Aliasable
+        {
+            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
+
+        } aliasable;
+
+        // Storage for scanning local ranks
+        typename BlockScan::TempStorage block_scan;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /// Copy of raking segment, promoted to registers
+    PackedCounter cached_segment[RAKING_SEGMENT];
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Internal storage allocator
+     */
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /**
+     * Performs upsweep raking reduction, returning the aggregate
+     */
+    __device__ __forceinline__ PackedCounter Upsweep()
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
+        PackedCounter *raking_ptr;
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data into registers
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                cached_segment[i] = smem_raking_ptr[i];
+            }
+            raking_ptr = cached_segment;
+        }
+        else
+        {
+            raking_ptr = smem_raking_ptr;
+        }
+
+        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        PackedCounter raking_partial)
+    {
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
+
+        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
+            cached_segment :
+            smem_raking_ptr;
+
+        // Exclusive raking downsweep scan
+        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Copy data back to smem
+            #pragma unroll
+            for (int i = 0; i < RAKING_SEGMENT; i++)
+            {
+                smem_raking_ptr[i] = cached_segment[i];
+            }
+        }
+    }
+
+
+    /**
+     * Reset shared memory digit counters
+     */
+    __device__ __forceinline__ void ResetCounters()
+    {
+        // Reset shared memory digit counters
+        #pragma unroll
+        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
+        {
+            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
+        }
+    }
+
+
+    /**
+     * Block-scan prefix callback
+     */
+    struct PrefixCallBack
+    {
+        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
+        {
+            PackedCounter block_prefix = 0;
+
+            // Propagate totals in packed fields
+            #pragma unroll
+            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
+            {
+                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
+            }
+
+            return block_prefix;
+        }
+    };
+
+
+    /**
+     * Scan shared memory digit counters.
+     */
+    __device__ __forceinline__ void ScanCounters()
+    {
+        // Upsweep scan
+        PackedCounter raking_partial = Upsweep();
+
+        // Compute exclusive sum
+        PackedCounter exclusive_partial;
+        PrefixCallBack prefix_call_back;
+        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
+
+        // Downsweep scan with exclusive partial
+        ExclusiveDownsweep(exclusive_partial);
+    }
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
+        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
+
+        // Reset shared memory digit counters
+        ResetCounters();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Get digit
+            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            // Get sub-counter
+            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
+
+            // Get counter lane
+            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
+
+            if (IS_DESCENDING)
+            {
+                sub_counter = PACKING_RATIO - 1 - sub_counter;
+                counter_lane = COUNTER_LANES - 1 - counter_lane;
+            }
+
+            // Pointer to smem digit counter
+            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
+
+            // Load thread-exclusive prefix
+            thread_prefixes[ITEM] = *digit_counters[ITEM];
+
+            // Store inclusive prefix
+            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
+        }
+
+        CTA_SYNC();
+
+        // Scan shared memory counters
+        ScanCounters();
+
+        CTA_SYNC();
+
+        // Extract the local ranks of each key
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Add in thread block exclusive prefix
+            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
+        }
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        // Rank keys
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get the inclusive and exclusive digit totals corresponding to the calling thread.
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
+                // first counter column, resulting in unavoidable bank conflicts.)
+                unsigned int counter_lane   = (bin_idx & (COUNTER_LANES - 1));
+                unsigned int sub_counter    = bin_idx >> (LOG_COUNTER_LANES);
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
+            }
+        }
+    }
+};
+
+
+
+
+
+/**
+ * Radix-rank using match.any
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRankMatch
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    typedef int32_t    RankT;
+    typedef int32_t    DigitCounterT;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
+                                    WARPS + 1 :
+                                    WARPS,
+
+        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
+        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
+                                    RAKING_SEGMENT + 1 :
+                                    RAKING_SEGMENT,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    };
+
+private:
+
+    /// BlockScan type
+    typedef BlockScan<
+            DigitCounterT,
+            BLOCK_THREADS,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScanT;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        typename BlockScanT::TempStorage            block_scan;
+
+        union __align__(16) Aliasable
+        {
+            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
+            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
+
+        } aliasable;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRankMatch(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        // Initialize shared digit counters
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
+
+        CTA_SYNC();
+
+        // Each warp will strip-mine its section of input, one strip at a time
+
+        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
+        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
+        uint32_t                lane_mask_lt    = LaneMaskLt();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // My digit
+            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            if (IS_DESCENDING)
+                digit = RADIX_DIGITS - digit - 1;
+
+            // Mask of peers who have same digit as me
+            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
+
+            // Pointer to smem digit counter for this key
+            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
+
+            // Number of occurrences in previous strips
+            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of peers having same digit as me
+            int32_t digit_count = __popc(peer_mask);
+
+            // Number of lower-ranked peers having same digit seen so far
+            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
+
+            if (peer_digit_prefix == 0)
+            {
+                // First thread for each digit updates the shared warp counter
+                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
+            }
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of prior keys having same digit
+            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
+        }
+
+        CTA_SYNC();
+
+        // Scan warp counters
+
+        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
+
+        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
+
+        CTA_SYNC();
+
+        // Seed ranks with counter values from previous warps
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+            ranks[ITEM] += *digit_counters[ITEM];
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get exclusive count for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/block/block_radix_sort.cuh b/thrust/dependencies/cub/cub/block/block_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e666902156bab48c938fb46e90cd4d0f2a8563ef
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_radix_sort.cuh
@@ -0,0 +1,862 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
+ */
+
+
+#pragma once
+
+#include "block_exchange.cuh"
+#include "block_radix_rank.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam KeyT                 KeyT type
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
+ * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ *   items into ascending order.  It relies upon a positional representation for
+ *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ *   characters, etc.) specified from least-significant to most-significant.  For a
+ *   given input sequence of keys and a set of rules specifying a total ordering
+ *   of the symbolic alphabet, the radix sorting method produces a lexicographic
+ *   ordering of those keys.
+ * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
+ *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ *   half-precision floating-point type. Within each key, the implementation treats fixed-length
+ *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
+ *   method can only be applied to unsigned integral types, BlockRadixSort
+ *   is able to sort signed and floating-point types via simple bit-wise transformations
+ *   that ensure lexicographic key ordering.
+ * - \rowmajor
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockRadixSort}
+ * \par
+ * The code snippet below illustrates a sort of 512 integer keys that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+ *
+ *     // Allocate shared memory for BlockRadixSort
+ *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_keys[4];
+ *     ...
+ *
+ *     // Collectively sort the keys
+ *     BlockRadixSort(temp_storage).Sort(thread_keys);
+ *
+ *     ...
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_keys across the block of threads is
+ * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+ * corresponding output \p thread_keys in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename                KeyT,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    typename                ValueT                   = NullType,
+    int                     RADIX_BITS              = 4,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixSort
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        // Whether or not there are values to be trucked along with keys
+        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // KeyT traits and unsigned bits type
+    typedef Traits<KeyT>                        KeyTraits;
+    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
+
+    /// Ascending BlockRadixRank utility type
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            false,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        AscendingBlockRadixRank;
+
+    /// Descending BlockRadixRank utility type
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            true,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        DescendingBlockRadixRank;
+
+    /// BlockExchange utility type for keys
+    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
+
+    /// BlockExchange utility type for values
+    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
+        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
+        typename BlockExchangeKeys::TempStorage        exchange_keys;
+        typename BlockExchangeValues::TempStorage      exchange_values;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+    /// Rank keys (specialized for ascending sort)
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<false> /*is_descending*/)
+    {
+        AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
+            unsigned_keys,
+            ranks,
+            begin_bit,
+            pass_bits);
+    }
+
+    /// Rank keys (specialized for descending sort)
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<true>  /*is_descending*/)
+    {
+        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
+            unsigned_keys,
+            ranks,
+            begin_bit,
+            pass_bits);
+    }
+
+    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<true>  /*is_blocked*/)
+    {
+        CTA_SYNC();
+
+        // Exchange values through shared memory in blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
+    }
+
+    /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<false> /*is_blocked*/)
+    {
+        CTA_SYNC();
+
+        // Exchange values through shared memory in blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
+    }
+
+    /// ExchangeValues (specialized for keys-only sort)
+    template <int IS_BLOCKED>
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
+        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
+        Int2Type<true>          /*is_keys_only*/,
+        Int2Type<IS_BLOCKED>    /*is_blocked*/)
+    {}
+
+    /// Sort blocked arrangement
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlocked(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Twiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;
+
+            CTA_SYNC();
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            // Quit if done
+            if (begin_bit >= end_bit) break;
+
+            CTA_SYNC();
+        }
+
+        // Untwiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Sort blocked -> striped arrangement
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
+        // Twiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;
+
+            CTA_SYNC();
+
+            // Check if this is the last pass
+            if (begin_bit >= end_bit)
+            {
+                // Last pass exchanges keys through shared memory in striped arrangement
+                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
+
+                // Last pass exchanges through shared memory in striped arrangement
+                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
+
+                // Quit
+                break;
+            }
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            CTA_SYNC();
+        }
+
+        // Untwiddle bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// \smemstorage{BlockRadixSort}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangements)
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+    /**
+     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangement -> striped arrangement)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
+    }
+
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_block_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_raking_layout.cuh b/thrust/dependencies/cub/cub/block/block_raking_layout.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bbacdf3e02fd5123d0bb0248f61b9da639c2442b
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_raking_layout.cuh
@@ -0,0 +1,150 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                        The data type to be exchanged.
+ * \tparam BLOCK_THREADS            The thread block size in threads.
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// The total number of elements that need to be cooperatively reduced
+        SHARED_ELEMENTS = BLOCK_THREADS,
+
+        /// Maximum number of warp-synchronous raking threads
+        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Number of raking elements per warp-synchronous raking thread (rounded up)
+        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
+        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
+
+        /// Degree of bank conflicts (e.g., 4-way)
+        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
+            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
+            1,
+
+        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
+        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
+
+        /// Total number of elements in the raking grid
+        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
+
+        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
+        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type
+     */
+    struct __align__(16) _TempStorage
+    {
+        T buff[BlockRakingLayout::GRID_ELEMENTS];
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        // Offset for partial
+        unsigned int offset = linear_tid;
+
+        // Add in one padding element for every segment
+        if (USE_SEGMENT_PADDING > 0)
+        {
+            offset += offset / SEGMENT_LENGTH;
+        }
+
+        // Incorporating a block of padding partials every shared memory segment
+        return temp_storage.Alias().buff + offset;
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_reduce.cuh b/thrust/dependencies/cub/cub/block/block_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1bf971f0f4a832b3c6fad85219934874bc219db1
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_reduce.cuh
@@ -0,0 +1,607 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_raking_commutative_only.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA thread block.
+ */
+enum BlockReduceAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that only supports commutative
+     * reduction operators (true for most operations, e.g., addition).
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Threads in warps other than the first warp place
+     *    their partial reductions into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within the first
+     *    warp continue to accumulate by raking across segments of shared partial reductions
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
+     *   and is preferable when the reduction operator is commutative.  This variant
+     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators. \blocked.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs more communication than BLOCK_REDUCE_RAKING
+     *   and is only preferable when the reduction operator is non-commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators.
+     *
+     * \par
+     * Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
+     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
+     *   throughput across the GPU.  However turn-around latency may be lower and
+     *   thus useful when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_WARP_REDUCTIONS,
+};
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being reduced
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - \rowmajor
+ * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
+ *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Very efficient (only one synchronization barrier).
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (<b><em>vs.</em></b> generic reduction)
+ *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+ *     typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ *     // Allocate shared memory for BlockReduce
+ *     __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Compute the block-wide sum for thread0
+ *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
+    int                     BLOCK_DIM_Y     = 1,
+    int                     BLOCK_DIM_Z     = 1,
+    int                     PTX_ARCH        = CUB_PTX_ARCH>
+class BlockReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
+    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
+
+    /// Internal specialization type
+    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+        WarpReductions,
+        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
+            RakingCommutativeOnly,
+            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking
+
+    /// Shared memory storage layout type for BlockReduce
+    typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                      ///< [in] Calling thread's input
+        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
+    {
+        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, reduction_op);
+        return Reduce(partial, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
+        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we scan skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_items)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we scan skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+/**
+ * \example example_block_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_scan.cuh b/thrust/dependencies/cub/cub/block/block_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..513ef358bd7c85996ea3ab3f88c420f4285910f3
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_scan.cuh
@@ -0,0 +1,2141 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_scan_raking.cuh"
+#include "specializations/block_scan_warp_scans.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
+ */
+enum BlockScanAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_raking.png
+     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer longer turnaround latencies when the
+     *   GPU is under-occupied, it can often provide higher overall throughput
+     *   across the GPU when suitably occupied.
+     */
+    BLOCK_SCAN_RAKING,
+
+
+    /**
+     * \par Overview
+     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+     * the expense of higher register pressure.  Raking threads preserve their
+     * "upsweep" segment of values in registers while performing warp-synchronous
+     * scan, allowing the "downsweep" not to re-read them from shared memory.
+     */
+    BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer lower overall throughput across the
+     *   GPU because due to a heavy reliance on inefficient warpscans, it can
+     *   often provide lower turnaround latencies when the GPU is under-occupied.
+     */
+    BLOCK_SCAN_WARP_SCANS,
+};
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being scanned
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - \rowmajor
+ * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
+ *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Invokes a minimal number of minimal block-wide synchronization barriers (only
+ *   one or two depending on algorithm selection)
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
+ *   - \blocksize
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide exclusive prefix sum
+ *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
+    int                 BLOCK_DIM_Y     = 1,
+    int                 BLOCK_DIM_Z     = 1,
+    int                 PTX_ARCH        = CUB_PTX_ARCH>
+class BlockScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
+     * cannot be used with thread block sizes not a multiple of the
+     * architectural warp size.
+     */
+    static const BlockScanAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
+            BLOCK_SCAN_RAKING :
+            ALGORITHM;
+
+    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
+    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
+
+    /// Define the delegate type for the desired algorithm
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+        WarpScans,
+        Raking>::Type InternalBlockScan;
+
+    /// Shared memory storage layout type for BlockScan
+    typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         BlockScan(temp_storage).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
+     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+#if CUB_CPP_DIALECT < 2011 // T must be able to be initialized from 0 pre-c++11
+        T initial_value = 0;
+#else
+        T initial_value{};
+#endif
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         int block_aggregate;
+     *         BlockScan(temp_storage.scan).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
+     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+
+    //@}  end member group        // Exclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op,            ///< [in] Binary scan functor 
+        T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage).ExclusiveScan(
+     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group        // Inclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp            scan_op)                      ///< [in] Binary scan functor
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp            scan_op,                      ///< [in] Binary scan functor
+        T                 &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage.scan).ExclusiveScan(
+     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
+     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        // Reduce consecutive thread items in registers
+        T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
+
+        // Exclusive scan in registers with prefix as seed
+        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+    }
+
+
+    //@}  end member group
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, single datum per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp            scan_op)                      ///< [in] Binary scan functor
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+        // Exclusive scan in registers with prefix
+        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive thread block-scan
+        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
+        // Exclusive scan in registers with prefix
+        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    //@}  end member group
+#endif // DOXYGEN_SHOULD_SKIP_THIS  // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+        InclusiveScan(input, output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InclusiveScan(input, output, cub::Sum(), block_aggregate);
+    }
+
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage).InclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
+     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0]);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be
+     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_aggregate);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage.scan).IncluisveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
+     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_prefix_callback_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op);
+
+            // Inclusive scan in registers with prefix as seed (first thread does not seed)
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename         ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_aggregate);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan (with no initial value)
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
+
+            // Inclusive scan in registers with prefix as seed (first thread does not seed)
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage.scan).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
+     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+            // Exclusive thread block-scan
+            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
+
+            // Inclusive scan in registers with prefix as seed
+            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+        }
+    }
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_block_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_shuffle.cuh b/thrust/dependencies/cub/cub/block/block_shuffle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ba2e9b59a0078a0f5b946fdc9bd5ba5b30d9a7b4
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_shuffle.cuh
@@ -0,0 +1,306 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * It is commonplace for blocks of threads to rearrange data items between
+ * threads.  The BlockShuffle abstraction allows threads to efficiently shift items
+ * either (a) up to their successor or (b) down to their predecessor.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockShuffle
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    enum
+    {
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T prev[BLOCK_THREADS];
+        T next[BLOCK_THREADS];
+    };
+
+
+public:
+
+    /// \smemstorage{BlockShuffle}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Shuffle movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Offset(
+        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
+        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
+        int distance = 1)           ///< [in] Offset distance (may be negative)
+    {
+        temp_storage[linear_tid].prev = input;
+
+        CTA_SYNC();
+
+        const int offset_tid = static_cast<int>(linear_tid) + distance;
+        if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS))
+        {
+            output = temp_storage[static_cast<size_t>(offset_tid)].prev;
+        }
+    }
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Rotate(
+        T   input,                  ///< [in] The calling thread's input item
+        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
+        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
+    {
+        temp_storage[linear_tid].prev = input;
+
+        CTA_SYNC();
+
+        unsigned int offset = threadIdx.x + distance;
+        if (offset >= BLOCK_THREADS)
+            offset -= BLOCK_THREADS;
+
+        output = temp_storage[offset].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+    {
+        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+            prev[ITEM] = input[ITEM - 1];
+
+
+        if (linear_tid > 0)
+            prev[0] = temp_storage[linear_tid - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
+    {
+        Up(input, prev);
+        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+    {
+        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+            prev[ITEM] = input[ITEM - 1];
+
+        if (linear_tid > 0)
+            prev[0] = temp_storage[linear_tid - 1].prev;
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
+    {
+        Up(input, prev);
+        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
+    }
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/block_store.cuh b/thrust/dependencies/cub/cub/block/block_store.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..495a1553f37654e6b995c2d00062ccf5cded547d
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/block_store.cuh
@@ -0,0 +1,999 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for writing linear segments of data from the CUDA thread block
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
+    // Store directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[ITEM] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
+    // Store directly in thread-blocked order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
+        {
+            thread_itr[ITEM] = items[ITEM];
+        }
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
+ * which is the default starting offset returned by \p cudaMalloc()
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ *
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreDirectBlockedVectorized(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    T                   *block_ptr,                 ///< [in] Input pointer for storing from
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    enum
+    {
+        // Maximum CUDA vector size is 4 elements
+        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+        // Vector size must be a power of two and an even divisor of the items per thread
+        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+            MAX_VEC_SIZE :
+            1,
+
+        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+    };
+
+    // Vector type
+    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+
+    // Alias global pointer
+    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
+
+    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+    Vector raw_vector[VECTORS_PER_THREAD];
+    T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+    // Copy
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        raw_items[ITEM] = items[ITEM];
+    }
+
+    // Direct-store using vector types
+    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
+        {
+            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
+        {
+            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+//@}  end member group
+
+
+/** @} */       // end group UtilIo
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockStore abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
+ */
+enum BlockStoreAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+     * directly to memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number items per thread).
+     */
+    BLOCK_STORE_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
+     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
+     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the the
+     *   access stride between threads (i.e., the number items per thread) exceeds the
+     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p OutputIteratorT is not a simple pointer type
+     *   - The block output offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_STORE_VECTORIZE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     * To reduce the shared memory requirement, only one warp's worth of shared
+     * memory is provisioned and is subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+
+};
+
+
+/**
+ * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam T                    The type of data to be written.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockStore class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockStore can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+ *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     int thread_data[4];
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockStore
+{
+private:
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Store helper
+    template <BlockStoreAlgorithm _POLICY, int DUMMY>
+    struct StoreInternal;
+
+
+    /**
+     * BLOCK_STORE_DIRECT specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_VECTORIZE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
+        __device__ __forceinline__ void Store(
+            T                   *block_ptr,                 ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
+        }
+
+        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
+            int               valid_items)                  ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal load implementation to use
+    typedef StoreInternal<ALGORITHM, 0> InternalStore;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalStore::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+
+    /// \smemstorage{BlockStore}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockStore()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Store items into a linear segment of memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     int thread_data[4];
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     int thread_data[4];
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
+     * only the first two threads being unmasked to store portions of valid data.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+        int                 valid_items)                ///< [in] Number of valid items to write
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_histogram_atomic.cuh b/thrust/dependencies/cub/cub/block/specializations/block_histogram_atomic.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3be0a3dfa6814b07f99217a427b191c88f3bc738
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_histogram_atomic.cuh
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <int BINS>
+struct BlockHistogramAtomic
+{
+    /// Shared memory storage layout type
+    struct TempStorage {};
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramAtomic(
+        TempStorage &temp_storage)
+    {}
+
+
+    /// Composite data onto an existing histogram
+    template <
+        typename            T,
+        typename            CounterT,     
+        int                 ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Update histogram
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+              atomicAdd(histogram + items[i], 1);
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_histogram_sort.cuh b/thrust/dependencies/cub/cub/block/specializations/block_histogram_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f11735541c0c6da344531e44a2da1c6bbb6ab405
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_histogram_sort.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <
+    typename    T,                  ///< Sample type
+    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
+    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
+    int         BINS,               ///< The number of bins into which histogram samples may fall
+    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>           ///< The PTX compute capability for which to to specialize this collective
+struct BlockHistogramSort
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<
+            T,
+            BLOCK_DIM_X,
+            ITEMS_PER_THREAD,
+            NullType,
+            4,
+            (PTX_ARCH >= 350) ? true : false,
+            BLOCK_SCAN_WARP_SCANS,
+            cudaSharedMemBankSizeFourByte,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<
+            T,
+            BLOCK_DIM_X,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockDiscontinuityT;
+
+    /// Shared memory
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            unsigned int run_begin[BINS];
+            unsigned int run_end[BINS];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramSort(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    // Discontinuity functor
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate
+        __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    // Composite data onto an existing histogram
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT            histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+        // Sort bytes in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        CTA_SYNC();
+
+        // Initialize the shared memory's run_begin and run_end for each bin
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+
+        CTA_SYNC();
+
+        int flags[ITEMS_PER_THREAD];    // unused
+
+        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item
+        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
+
+        CTA_SYNC();
+
+        // Composite into histogram
+        histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+
+        // Finish up with guarded composition if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking.cuh b/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2a57521be319ee8b984b4df11d800464abfb9b0f
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ *
+ * Supports non-commutative binary reduction operators.  Unlike commutative
+ * reduction operators (e.g., addition), the application of a non-commutative
+ * reduction operator (e.g, string concatenation) across a sequence of inputs must
+ * honor the relative ordering of items and partial reductions when applying the
+ * reduction operator.
+ *
+ * Compared to the implementation of BlockReduceRaking (which does not support
+ * non-commutative operators), this implementation requires a few extra
+ * rounds of inter-thread communication.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceRaking
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two
+        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
+
+        /// Whether or not accesses into smem are unguarded
+        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
+
+    };
+
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
+        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded thread block raking grid
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        T                           *raking_segment,
+        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<ITERATION>         /*iteration*/)
+    {
+        // Update partial if addend is in range
+        if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
+        {
+            T addend = raking_segment[ITERATION];
+            partial = reduction_op(partial, addend);
+        }
+        return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
+    }
+
+    template <bool IS_FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 /*reduction_op*/,   ///< [in] Binary scan operator
+        T                           * /*raking_segment*/,
+        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return partial;
+    }
+
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                IS_FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE>(
+                partial,
+                num_valid,
+                reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid.
+            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = raking_segment[0];
+
+                partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
+
+                int valid_raking_threads = (IS_FULL_TILE) ?
+                    RAKING_THREADS :
+                    (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;
+
+                partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED>(
+                    partial,
+                    valid_raking_threads,
+                    reduction_op);
+
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool IS_FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum reduction_op;
+
+        return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
+    }
+
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..78a32b82263461242f390ae8c0d0f90acfa8e8aa
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -0,0 +1,199 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "block_reduce_raking.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceRakingCommutativeOnly
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Whether or not to use fall-back
+        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
+
+        /// Number of raking threads
+        RAKING_THREADS = WARP_THREADS,
+
+        /// Number of threads actually sharing items with the raking threads
+        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
+    };
+
+    ///  WarpReduce utility type
+    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        struct
+        {
+            typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
+            typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
+        };
+        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
+        }
+        else
+        {
+            // Place partial into shared memory grid
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
+
+                // Warpscan
+                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
+
+                // Warpscan
+                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
+            }
+        }
+
+        return partial;
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/dependencies/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4dd3451b888abecddc24e236d85ac176d1da192c
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -0,0 +1,217 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../warp/warp_reduce.cuh"
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockReduceWarpReductions
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// The logical warp size for warp reductions
+        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+
+        /// Whether or not the logical warp size evenly divides the thread block size
+        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
+    };
+
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Buffer for warp-synchronous scan
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
+        T                                   block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    int linear_tid;
+    int warp_id;
+    int lane_id;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceWarpReductions(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
+    {
+        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
+        {
+            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
+            warp_aggregate = reduction_op(warp_aggregate, addend);
+        }
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
+    }
+
+    template <bool FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         /*reduction_op*/,   ///< [in] Binary scan operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<WARPS>     /*successor_warp*/)
+    {
+        return warp_aggregate;
+    }
+
+
+    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         reduction_op,       ///< [in] Binary scan operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        // Share lane aggregates
+        if (lane_id == 0)
+        {
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+        }
+
+        CTA_SYNC();
+
+        // Update total aggregate in warp 0, lane 0
+        if (linear_tid == 0)
+        {
+            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
+        }
+
+        return warp_aggregate;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   input,          ///< [in] Calling thread's input partial reductions
+        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum    reduction_op;
+        int         warp_offset = (warp_id * LOGICAL_WARP_SIZE);
+        int         warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ?
+                            LOGICAL_WARP_SIZE :
+                            num_valid - warp_offset;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
+            input,
+            warp_num_valid,
+            cub::Sum());
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        int         warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        int         warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ?
+                            LOGICAL_WARP_SIZE :
+                            num_valid - warp_offset;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
+            input,
+            warp_num_valid,
+            reduction_op);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_scan_raking.cuh b/thrust/dependencies/cub/cub/block/specializations/block_scan_raking.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1d6c2f70dc98e0d86ae7075de34e42ddcad79eba
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_scan_raking.cuh
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+/**
+ * \file
+ * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../block/block_raking_layout.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_scan.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanRaking
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
+        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
+        T                                           block_aggregate;    ///< Block aggregate
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    T               cached_segment[SEGMENT_LENGTH];
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /// Templated reduction
+    template <int ITERATION, typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                  raking_ptr,         ///< [in] Input array
+        ScanOp              scan_op,            ///< [in] Binary reduction operator
+        T                   raking_partial,     ///< [in] Prefix to seed reduction with
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
+        {
+            T addend = raking_ptr[ITERATION];
+            raking_partial = scan_op(raking_partial, addend);
+        }
+
+        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
+    }
+
+
+    /// Templated reduction (base case)
+    template <typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                          /*raking_ptr*/,    ///< [in] Input array
+        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator
+        T                           raking_partial,    ///< [in] Prefix to seed reduction with
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return raking_partial;
+    }
+
+
+    /// Templated copy
+    template <int ITERATION>
+    __device__ __forceinline__ void CopySegment(
+        T*                  out,            ///< [out] Out array
+        T*                  in,             ///< [in] Input array
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        out[ITERATION] = in[ITERATION];
+        CopySegment(out, in, Int2Type<ITERATION + 1>());
+    }
+
+ 
+    /// Templated copy (base case)
+    __device__ __forceinline__ void CopySegment(
+        T*                  /*out*/,            ///< [out] Out array
+        T*                  /*in*/,             ///< [in] Input array
+        Int2Type<SEGMENT_LENGTH> /*iteration*/)
+    {}
+
+
+    /// Performs upsweep raking reduction, returning the aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data into registers
+        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+
+        T raking_partial = cached_segment[0];
+
+        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    /// Performs inclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            exclusive_output = *placement_ptr;
+        }
+    }
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Exclusive Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial);
+            }
+
+            CTA_SYNC();
+
+            // Grab exclusive partial from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial= Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T inclusive_partial;
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+
+                // Broadcast aggregate to all threads
+                if (linear_tid == RAKING_THREADS - 1)
+                    temp_storage.block_aggregate = inclusive_partial;
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, exclusive_partial);
+
+                // Broadcast aggregate to other threads
+                if (linear_tid == 0)
+                    temp_storage.block_aggregate = block_aggregate;
+            }
+
+            CTA_SYNC();
+
+            // Grab exclusive partial from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            output = scan_op(block_prefix, output);
+            if (linear_tid == 0)
+                output = block_prefix;
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                WarpScan warp_scan(temp_storage.warp_scan);
+
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial, block_aggregate;
+                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
+
+                // Obtain block-wide prefix in lane0, then broadcast to other lanes
+                T block_prefix = block_prefix_callback_op(block_aggregate);
+                block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+                // Update prefix with warpscan exclusive partial
+                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
+                if (linear_tid == 0)
+                    downsweep_prefix = block_prefix;
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, downsweep_prefix);
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Exclusive Warp-synchronous scan
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T inclusive_partial;
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+
+                // Broadcast aggregate to all threads
+                if (linear_tid == RAKING_THREADS - 1)
+                    temp_storage.block_aggregate = inclusive_partial;
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            // Update prefix with exclusive warpscan partial
+            output = scan_op(block_prefix, output);
+        }
+        else
+        {
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                WarpScan warp_scan(temp_storage.warp_scan);
+
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan
+                T exclusive_partial, block_aggregate;
+                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
+
+                // Obtain block-wide prefix in lane0, then broadcast to other lanes
+                T block_prefix = block_prefix_callback_op(block_aggregate);
+                block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+                // Update prefix with warpscan exclusive partial
+                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
+                if (linear_tid == 0)
+                    downsweep_prefix = block_prefix;
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, downsweep_prefix);
+            }
+
+            CTA_SYNC();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3835e484e132c31790e76f91f4af3d673d1dc957
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans.cuh
@@ -0,0 +1,391 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
+
+    /// Shared memory storage layout type
+
+    struct __align__(32) _TempStorage
+    {
+        T                               warp_aggregates[WARPS];
+        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
+        T                               block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARP>  /*addend_warp*/)
+    {
+        if (warp_id == WARP)
+            warp_prefix = block_aggregate;
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
+    }
+
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &/*warp_prefix*/,       ///< [out] The calling thread's partial reduction
+        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
+        T               &/*block_aggregate*/,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARPS> /*addend_warp*/)
+    {}
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();
+
+        // Accumulate block aggregates and save the one that is our warp's prefix
+        T warp_prefix;
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+
+        warp_prefix = scan_op(initial_value, warp_prefix);
+
+        if (warp_id == 0)
+            warp_prefix = initial_value;
+
+        return warp_prefix;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            exclusive_output = scan_op(warp_prefix, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = warp_prefix;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
+
+        // Apply warp prefix to our lane's partial
+        exclusive_output = scan_op(warp_prefix, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = warp_prefix;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        if (linear_tid > 0)
+        {
+            exclusive_output = scan_op(block_prefix, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            inclusive_output = scan_op(warp_prefix, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        exclusive_output = scan_op(block_prefix, exclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans2.cuh b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans2.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6617160d1be5018d2e63b6494a5c3df6813e247a
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans2.cuh
@@ -0,0 +1,435 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of active warps
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for warp-synchronous scans
+        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
+        T                                           warp_aggregates[WARPS];
+        T                                           block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARP>  addend_warp)
+    {
+        if (warp_id == WARP)
+            warp_prefix = block_aggregate;
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
+    }
+
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        Int2Type<WARPS> addend_warp)
+    {}
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();
+
+        // Accumulate block aggregates and save the one that is our warp's prefix
+        T warp_prefix;
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+
+        warp_prefix = scan_op(initial_value, warp_prefix);
+
+        if (warp_id == 0)
+            warp_prefix = initial_value;
+
+        return warp_prefix;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+//--------------------------------------------------
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
+//--------------------------------------------------
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            exclusive_output = scan_op(warp_prefix, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = warp_prefix;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp
+//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
+
+//--------------------------------------------------
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
+//--------------------------------------------------
+
+        // Apply warp prefix to our lane's partial
+        exclusive_output = scan_op(warp_prefix, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = warp_prefix;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        if (linear_tid > 0)
+        {
+            exclusive_output = scan_op(block_prefix, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            inclusive_output = scan_op(warp_prefix, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        exclusive_output = scan_op(block_prefix, exclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans3.cuh b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans3.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a8279d5765cd57eb80005b27084d29a963b97067
--- /dev/null
+++ b/thrust/dependencies/cub/cub/block/specializations/block_scan_warp_scans3.cuh
@@ -0,0 +1,417 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warp threads
+        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
+
+        /// Number of outer scan warps
+        OUTER_WARPS = INNER_WARP_THREADS
+    };
+
+    ///  Outer WarpScan utility type
+    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
+
+    ///  Inner WarpScan utility type
+    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
+
+    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        union Aliasable
+        {
+            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
+            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
+
+        } aliasable;
+
+        T                               warp_aggregates[OUTER_WARPS];
+
+        T                               block_aggregate;                           ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
+        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+            if (lane_id == 0)
+                exclusive_output = outer_warp_exclusive;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+        {
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+        }
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        // Retrieve block aggregate
+        block_aggregate = temp_storage.block_aggregate;
+
+        // Apply warp prefix to our lane's partial
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];
+            T downsweep_prefix, block_aggregate;
+
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
+            if (linear_tid == 0)
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];
+            T downsweep_prefix, block_aggregate;
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
+            if (linear_tid == 0)
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid]    = downsweep_prefix;
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/cmake/cub-config-version.cmake b/thrust/dependencies/cub/cub/cmake/cub-config-version.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..4260ba66f57769d96f8cb8dbe9ab3ac543a35075
--- /dev/null
+++ b/thrust/dependencies/cub/cub/cmake/cub-config-version.cmake
@@ -0,0 +1,33 @@
+# Parse version information from version.cuh:
+file(READ "${CMAKE_CURRENT_LIST_DIR}/../version.cuh" CUB_VERSION_HEADER)
+string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
+set(CUB_VERSION_FLAT ${CMAKE_MATCH_1})
+# Note that CUB calls this the PATCH number, CMake calls it the TWEAK number:
+string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
+set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1})
+
+math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000")
+math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000")
+math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch"
+
+# Build comparison versions:
+set(CUB_COMPAT "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}")
+set(CUB_EXACT "${CUB_COMPAT}.${CUB_VERSION_TWEAK}")
+set(FIND_COMPAT "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}")
+set(FIND_EXACT "${FIND_COMPAT}.${PACKAGE_FIND_VERSION_TWEAK}")
+
+# Set default results
+set(PACKAGE_VERSION ${CUB_EXACT})
+set(PACKAGE_VERSION_UNSUITABLE FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_EXACT FALSE)
+
+# Test for compatibility (ignores tweak)
+if (FIND_COMPAT VERSION_EQUAL CUB_COMPAT)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+endif()
+
+# Test for exact (does not ignore tweak)
+if (FIND_EXACT VERSION_EQUAL CUB_EXACT)
+  set(PACKAGE_VERSION_EXACT TRUE)
+endif()
diff --git a/thrust/dependencies/cub/cub/cmake/cub-config.cmake b/thrust/dependencies/cub/cub/cmake/cub-config.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0900becd8fbcff9ee791c9b990ed2bf82e26f220
--- /dev/null
+++ b/thrust/dependencies/cub/cub/cmake/cub-config.cmake
@@ -0,0 +1,62 @@
+#
+# find_package(CUB) config file.
+#
+# Defines a CUB::CUB target that may be linked from user projects to include
+# CUB.
+
+if (TARGET CUB::CUB)
+  return()
+endif()
+
+function(_cub_declare_interface_alias alias_name ugly_name)
+  # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
+  # 2) When an IMPORTED library is linked to another target, its include
+  #    directories are treated as SYSTEM includes.
+  # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
+  #    system includes. This means that the Toolkit CUB will *always* be used
+  #    during compilation, and the include paths of an IMPORTED CUB::CUB
+  #    target will never have any effect.
+  # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
+  #    on EVERY target that links to CUB::CUB. This would be a burden and a
+  #    footgun for our users. Forgetting this would silently pull in the wrong CUB!
+  # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
+  #    configure it, and then ALIAS it into the namespace (or ALIAS and then
+  #    configure, that seems to work too).
+  add_library(${ugly_name} INTERFACE)
+  add_library(${alias_name} ALIAS ${ugly_name})
+endfunction()
+
+#
+# Setup targets
+#
+
+_cub_declare_interface_alias(CUB::CUB _CUB_CUB)
+# Strip out the 'cub/cmake/' from 'cub/cmake/cub-config.cmake':
+get_filename_component(_CUB_INCLUDE_DIR "../.." ABSOLUTE BASE_DIR "${CMAKE_CURRENT_LIST_DIR}")
+target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}")
+
+if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR
+    THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT")
+endif()
+
+if (CUB_IGNORE_DEPRECATED_CPP_11 OR
+    THRUST_IGNORE_DEPRECATED_CPP_11)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11")
+endif()
+
+if (CUB_IGNORE_DEPRECATED_COMPILER OR
+    THRUST_IGNORE_DEPRECATED_COMPILER)
+  target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER")
+endif()
+
+#
+# Standardize version info
+#
+
+set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "")
+set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "")
+set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "")
+set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "")
+set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "")
+set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "")
diff --git a/thrust/dependencies/cub/cub/config.cuh b/thrust/dependencies/cub/cub/config.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b909bbf723708e59a121b5525c628f6715e24c86
--- /dev/null
+++ b/thrust/dependencies/cub/cub/config.cuh
@@ -0,0 +1,40 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static configuration header for the CUB project.
+ */
+
+#pragma once
+
+#include "util_arch.cuh"
+#include "util_compiler.cuh"
+#include "util_cpp_dialect.cuh"
+#include "util_deprecated.cuh"
+#include "util_macro.cuh"
+#include "util_namespace.cuh"
diff --git a/thrust/dependencies/cub/cub/cub.cuh b/thrust/dependencies/cub/cub/cub.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a71d78fe0d5abdda4df5dc42e15de4ea17034ad4
--- /dev/null
+++ b/thrust/dependencies/cub/cub/cub.cuh
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * CUB umbrella include file
+ */
+
+#pragma once
+
+// Static configuration
+#include "config.cuh"
+
+// Block
+#include "block/block_histogram.cuh"
+#include "block/block_discontinuity.cuh"
+#include "block/block_exchange.cuh"
+#include "block/block_load.cuh"
+#include "block/block_radix_rank.cuh"
+#include "block/block_radix_sort.cuh"
+#include "block/block_reduce.cuh"
+#include "block/block_scan.cuh"
+#include "block/block_store.cuh"
+//#include "block/block_shift.cuh"
+
+// Device
+#include "device/device_histogram.cuh"
+#include "device/device_partition.cuh"
+#include "device/device_radix_sort.cuh"
+#include "device/device_reduce.cuh"
+#include "device/device_run_length_encode.cuh"
+#include "device/device_scan.cuh"
+#include "device/device_segmented_radix_sort.cuh"
+#include "device/device_segmented_reduce.cuh"
+#include "device/device_select.cuh"
+#include "device/device_spmv.cuh"
+
+// Grid
+//#include "grid/grid_barrier.cuh"
+#include "grid/grid_even_share.cuh"
+#include "grid/grid_mapping.cuh"
+#include "grid/grid_queue.cuh"
+
+// Thread
+#include "thread/thread_load.cuh"
+#include "thread/thread_operators.cuh"
+#include "thread/thread_reduce.cuh"
+#include "thread/thread_scan.cuh"
+#include "thread/thread_store.cuh"
+
+// Warp
+#include "warp/warp_reduce.cuh"
+#include "warp/warp_scan.cuh"
+
+// Iterator
+#include "iterator/arg_index_input_iterator.cuh"
+#include "iterator/cache_modified_input_iterator.cuh"
+#include "iterator/cache_modified_output_iterator.cuh"
+#include "iterator/constant_input_iterator.cuh"
+#include "iterator/counting_input_iterator.cuh"
+#include "iterator/discard_output_iterator.cuh"
+#include "iterator/tex_obj_input_iterator.cuh"
+#include "iterator/tex_ref_input_iterator.cuh"
+#include "iterator/transform_input_iterator.cuh"
+
+// Util
+#include "util_allocator.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_device.cuh"
+#include "util_macro.cuh"
+#include "util_ptx.cuh"
+#include "util_type.cuh"
+
diff --git a/thrust/dependencies/cub/cub/device/device_histogram.cuh b/thrust/dependencies/cub/cub/device/device_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4413ff3950d977145d8ac7c8617a0f72f4d64fd8
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_histogram.cuh
@@ -0,0 +1,866 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_histogram.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceHistogram}
+ *
+ */
+struct DeviceHistogram
+{
+    /******************************************************************//**
+     * \name Evenly-segmented bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
+        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
+        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT              lower_level1[1]     = {lower_level};
+        LevelT              upper_level1[1]     = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_samples,
+            1,
+            sizeof(SampleT) * num_samples,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage  = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
+        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
+        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT              lower_level1[1]     = {lower_level};
+        LevelT              upper_level1[1]     = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_row_samples,
+            num_rows,
+            row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_pixels;         // e.g., 5
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            lower_level,
+            upper_level,
+            num_pixels,
+            1,
+            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX))
+        {
+            // Down-convert OffsetT data type
+
+
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Custom bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of an six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels      // e.g., 7 (seven level boundaries for six bins)
+     * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        CounterT*           d_histogram1[1] = {d_histogram};
+        int                 num_levels1[1]  = {num_levels};
+        LevelT*             d_levels1[1]    = {d_levels};
+
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            d_levels1,
+            num_samples,
+            (OffsetT)1,
+            (size_t)(sizeof(SampleT) * num_samples),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ , , , , , , , ]
+     * int      num_levels          // e.g., 7 (seven level boundaries for six bins)
+     * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT*             d_levels1[1]        = {d_levels};
+
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            d_levels1,
+            num_row_samples,
+            num_rows,
+            row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int            num_pixels;       // e.g., 5
+     * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
+     *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
+     * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int            num_levels[3];    // e.g., {5, 5, 5};
+     * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 3, 0, 1],
+     * //                     [3, 0, 0, 2],
+     * //                     [0, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            d_levels,
+            num_pixels,
+            (OffsetT)1,
+            (size_t)(sizeof(SampleT) * NUM_CHANNELS * num_pixels),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
+     *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int              num_levels[3];      // e.g., {5, 5, 5};
+     * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [2, 3, 0, 1],
+     * //                     [3, 0, 0, 2],
+     * //                     [1, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX))
+        {
+            // Down-convert OffsetT data type
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_partition.cuh b/thrust/dependencies/cub/cub/device/device_partition.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..65db3b7b5ad4825822ef2ed0d485d638918d11b6
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_partition.cuh
@@ -0,0 +1,273 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
+ * a specified input sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DevicePartition}
+ *
+ * \par Performance
+ * \linear_performance{partition}
+ *
+ * \par
+ * The following chart illustrates DevicePartition::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected for the first partition.
+ * \plots_below
+ *
+ * \image html partition_if_int32_50_percent.png
+ *
+ */
+struct DevicePartition
+{
+    /**
+     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated partition-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected for the first partition with 50% probability.
+     *
+     * \image html partition_if_int32_50_percent.png
+     * \image html partition_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability for the first partition:
+     *
+     * \image html partition_if_int32_5_percent.png
+     * \image html partition_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            select_op,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_partition_flagged.cu
+ * \example example_device_partition_if.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_radix_sort.cuh b/thrust/dependencies/cub/cub/device/device_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..df218a7c3561709a8799a56c55fe5f34e4297d65
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_radix_sort.cuh
@@ -0,0 +1,796 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_radix_sort.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRadixSort}
+ *
+ * \par Performance
+ * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
+ * performance across different CUDA architectures for uniform-random \p uint32 keys.
+ * \plots_below
+ *
+ * \image html lsb_radix_sort_int32_keys.png
+ *
+ */
+struct DeviceRadixSort
+{
+
+    /******************************************************************//**
+     * \name KeyT-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_device_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_reduce.cuh b/thrust/dependencies/cub/cub/device/device_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4f01c2446abf418d0211d2540c87f4b5e339aa22
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_reduce.cuh
@@ -0,0 +1,734 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceReduce}
+ *
+ * \par Performance
+ * \linear_performance{reduction, reduce-by-key, and run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::Sum
+ * performance across different CUDA architectures for \p int32 keys.
+ *
+ * \image html reduce_int32.png
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::ReduceByKey (summation)
+ * performance across different CUDA architectures for \p fp32
+ * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+ *
+ * \image html reduce_by_key_fp32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceReduce
+{
+    /**
+     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     __device__ __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;  // e.g., 7
+     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;     // e.g., [-]
+     * CustomMin    min_op;
+     * int          init;       // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam ReductionOpT         <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    ReductionOpT,
+        typename                    T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
+        T                           init,                               ///< [in] Initial value of the reduction
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op,
+            init,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide sum using the addition (\p +) operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction.
+     * - Does not support \p + operators that are non-commutative..
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sum-reduction performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html reduce_int32.png
+     * \image html reduce_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Sum(),
+            OutputT(),            // zero-initialize
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Min(),
+            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+     *
+     * // d_out <-- [{5, 0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_items,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
+     *
+     * // d_out <-- [9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            cub::Max(),
+            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+     *
+     * // d_out <-- [{6, 9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
+        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_items,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
+     *
+     * \par
+     * This operation computes segmented reductions within \p d_values_in using
+     * the specified binary \p reduction_op functor.  The segments are identified by
+     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
+     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
+     * the first key of the run and the corresponding value aggregate of that run are
+     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
+     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following chart illustrates reduction-by-key (sum) performance across
+     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
+     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+     *
+     * \image html reduce_by_key_fp32_len_500.png
+     * \image html reduce_by_key_fp64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html reduce_by_key_fp32_len_5.png
+     * \image html reduce_by_key_fp64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the segmented reduction of \p int values grouped
+     * by runs of associated \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+     * int          *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_num_runs_out;    // e.g., [-]
+     * CustomMin    reduction_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduce-by-key
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam KeysInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
+     * \tparam ValuesInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
+     * \tparam AggregatesOutputIterator <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     * \tparam ReductionOpT              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     */
+    template <
+        typename                    KeysInputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    ValuesInputIteratorT,
+        typename                    AggregatesOutputIteratorT,
+        typename                    NumRunsOutputIteratorT,
+        typename                    ReductionOpT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t ReduceByKey(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
+        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // FlagT iterator type (not used)
+
+        // Selection op (not used)
+
+        // Default == operator
+        typedef Equality EqualityOp;
+
+        return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_unique_out,
+            d_values_in,
+            d_aggregates_out,
+            d_num_runs_out,
+            EqualityOp(),
+            reduction_op,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_run_length_encode.cuh b/thrust/dependencies/cub/cub/device/device_run_length_encode.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e31ebf0142c1fc511245864602477f13f8728d63
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_run_length_encode.cuh
@@ -0,0 +1,278 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_rle.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
+ * computes a simple compressed representation of a sequence of input elements such that each
+ * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
+ * count of the elements in that run.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRunLengthEncode}
+ *
+ * \par Performance
+ * \linear_performance{run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
+ * different CUDA architectures for \p int32 items.
+ * Segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html rle_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceRunLengthEncode
+{
+
+    /**
+     * \brief Computes a run-length encoding of the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
+     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
+     *   respectively.
+     * - The total number of runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated encode performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html rle_int32_len_500.png
+     * \image html rle_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html rle_int32_len_5.png
+     * \image html rle_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_counts_out      <-- [1, 2, 1, 3, 1]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    LengthsOutputIteratorT,
+        typename                    NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Encode(
+        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
+        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
+        typedef NullType    SelectOp;                   // Selection op (not used)
+        typedef Equality    EqualityOp;                 // Default == operator
+        typedef cub::Sum    ReductionOp;                // Value reduction operator
+
+        // The lengths output value type
+        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+            OffsetT,                                                                                                    // ... then the OffsetT type,
+            typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+        // Generator type for providing 1s values for run-length reduction
+        typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
+
+        return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            LengthsInputIteratorT((LengthT) 1),
+            d_counts_out,
+            d_num_runs_out,
+            EqualityOp(),
+            ReductionOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
+     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
+     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
+     * - The total number of runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     *
+     * \par Snippet
+     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // d_offsets_out         <-- [1, 4]
+     * // d_lengths_out         <-- [2, 3]
+     * // d_num_runs_out        <-- [2]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OffsetsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                InputIteratorT,
+        typename                OffsetsOutputIteratorT,
+        typename                LengthsOutputIteratorT,
+        typename                NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t NonTrivialRuns(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
+        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
+        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
+        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef Equality    EqualityOp;                 // Default == operator
+
+        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs_out,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_scan.cuh b/thrust/dependencies/cub/cub/device/device_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a5902ceefa905fe6ce4ad773261a01c0fda80
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_scan.cuh
@@ -0,0 +1,443 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_scan.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output sequence where each element is computed to be the reduction
+ * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ * the <em>i</em><sup>th</sup> output reduction.
+ *
+ * \par
+ * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our <em>"decoupled look-back"</em> algorithm
+ * for performing global prefix scan with only a single pass through the
+ * input data, as described in our 2016 technical report [1].  The central
+ * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
+ * of global prefix propagation with local computation.  As such, our algorithm requires only
+ * ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
+ * proceeds at "memcpy" speeds.
+ *
+ * \par
+ * [1] [Duane Merrill and Michael Garland.  "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ * \linear_performance{prefix scan}
+ *
+ * \par
+ * The following chart illustrates DeviceScan::ExclusiveSum
+ * performance across different CUDA architectures for \p int32 keys.
+ * \plots_below
+ *
+ * \image html scan_int32.png
+ *
+ */
+struct DeviceScan
+{
+    /******************************************************************//**
+     * \name Exclusive scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide exclusive prefix sum.  The value of 0 is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated exclusive sum performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html scan_int32.png
+     * \image html scan_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix sum
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveSum(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        // Initial value
+        OutputT init_value = 0;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            init_value,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op
+     * ...
+     *
+     * // Determine temporary device storage requirements for exclusive prefix scan
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
+     *
+     * // Allocate temporary storage for exclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix min-scan
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
+     *
+     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam Identity         <b>[inferred]</b> Type of the \p identity value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT,
+        typename        ScanOpT,
+        typename        InitValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveScan(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                            ///< [in] Binary scan functor
+        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            init_value,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix sum.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix sum
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix sum
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix sum
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveSum(
+        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
+        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),
+            NullType(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to a another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix scan
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix min-scan
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT,
+        typename        ScanOpT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveScan(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                            ///< [in] Binary scan functor
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            NullType(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_device_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_segmented_radix_sort.cuh b/thrust/dependencies/cub/cub/device/device_segmented_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2ab2a7dde2d789535b34b89f48b26ca66512f7e8
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_segmented_radix_sort.cuh
@@ -0,0 +1,875 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../config.cuh"
+#include "dispatch/dispatch_radix_sort.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedRadixSort}
+ *
+ */
+struct DeviceSegmentedRadixSort
+{
+
+    /******************************************************************//**
+     * \name Key-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename                KeyT,
+        typename                ValueT,
+        typename                OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename                KeyT,
+        typename                ValueT,
+        typename                OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  d_values;
+
+        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_segmented_reduce.cuh b/thrust/dependencies/cub/cub/device/device_segmented_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..97308c5a5dff9132c80bb7e1ac50eab764c2ab6b
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_segmented_reduce.cuh
@@ -0,0 +1,619 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedReduce}
+ *
+ */
+struct DeviceSegmentedReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_segments;   // e.g., 3
+     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [-, -, -]
+     * CustomMin    min_op;
+     * int          initial_value;           // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT,
+        typename            ReductionOp,
+        typename            T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor 
+        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            reduction_op,
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction for each segment.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p + operators that are non-commutative..
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [21, 0, 17]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Sum(),
+            OutputT(),            // zero-initialize
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Min(),
+            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [8, INT_MIN, 9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Max(),
+            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
+        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_select.cuh b/thrust/dependencies/cub/cub/device/device_select.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..136d26044a7ccacfbab7744c6ec3b7bb8ba355a6
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_select.cuh
@@ -0,0 +1,369 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to selectively copy
+ * items from a specified input sequence to a compact output sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSelect}
+ *
+ * \par Performance
+ * \linear_performance{select-flagged, select-if, and select-unique}
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected.
+ *
+ * \image html select_if_int32_50_percent.png
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::Unique
+ * performance across different CUDA architectures for \p int32 items
+ * where segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html select_unique_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceSelect
+{
+    /**
+     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected with 50% probability.
+     *
+     * \image html select_if_int32_50_percent.png
+     * \image html select_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability:
+     *
+     * \image html select_if_int32_5_percent.png
+     * \image html select_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            select_op,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-unique performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html select_unique_int32_len_500.png
+     * \image html select_unique_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html select_unique_int32_len_5.png
+     * \image html select_unique_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [0, 2, 9, 5, 8]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Unique(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef Equality                EqualityOp;     // Default == operator
+
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_select_flagged.cu
+ * \example example_device_select_if.cu
+ * \example example_device_select_unique.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/device_spmv.cuh b/thrust/dependencies/cub/cub/device/device_spmv.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0be0c20e7b73a651cb1e0bdc063c160da32942d5
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/device_spmv.cuh
@@ -0,0 +1,174 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_spmv_orig.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>SpMV computation</em>](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
+ * performs the matrix-vector operation
+ * <em>y</em> = <em>alpha</em>*<b>A</b>*<em>x</em> + <em>beta</em>*<em>y</em>,
+ * where:
+ *  - <b>A</b> is an <em>m</em>x<em>n</em> sparse matrix whose non-zero structure is specified in
+ *    [<em>compressed-storage-row (CSR) format</em>](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
+ *    (i.e., three arrays: <em>values</em>, <em>row_offsets</em>, and <em>column_indices</em>)
+ *  - <em>x</em> and <em>y</em> are dense vectors
+ *  - <em>alpha</em> and <em>beta</em> are scalar multiplicands
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSpmv}
+ *
+ */
+struct DeviceSpmv
+{
+    /******************************************************************//**
+     * \name CSR matrix operations
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief This function performs the matrix-vector operation <em>y</em> = <b>A</b>*<em>x</em>.
+     *
+     * \par Snippet
+     * The code snippet below illustrates SpMV upon a 9x9 CSR matrix <b>A</b>
+     * representing a 3x3 lattice (24 non-zeros).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
+     * // and output vector y
+     * int    num_rows = 9;
+     * int    num_cols = 9;
+     * int    num_nonzeros = 24;
+     *
+     * float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1]
+     *
+     * int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
+     *                          //        4, 6, 1, 3, 5, 7, 2, 4,
+     *                          //        8, 3, 7, 4, 6, 8, 5, 7]
+     *
+     * int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
+     *
+     * float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
+     * float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros, alpha, beta);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run SpMV
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros, alpha, beta);
+     *
+     * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
+     *
+     * \endcode
+     *
+     * \tparam ValueT       <b>[inferred]</b> Matrix and vector value type (e.g., /p float, /p double, etc.)
+     */
+    template <
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t CsrMV(
+        void*               d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
+        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
+        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
+        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
+        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
+        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        SpmvParams<ValueT, int> spmv_params;
+        spmv_params.d_values             = d_values;
+        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
+        spmv_params.d_column_indices     = d_column_indices;
+        spmv_params.d_vector_x           = d_vector_x;
+        spmv_params.d_vector_y           = d_vector_y;
+        spmv_params.num_rows             = num_rows;
+        spmv_params.num_cols             = num_cols;
+        spmv_params.num_nonzeros         = num_nonzeros;
+        spmv_params.alpha                = 1.0;
+        spmv_params.beta                 = 0.0;
+
+        return DispatchSpmv<ValueT, int>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            spmv_params,
+            stream,
+            debug_synchronous);
+    }
+
+    //@}  end member group
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_histogram.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_histogram.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..879d5ddec2adedfbad1bc06da83929ddbe3868c8
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -0,0 +1,1087 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../../agent/agent_histogram.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Histogram kernel entry points
+ *****************************************************************************/
+
+/**
+ * Histogram initialization kernel entry point
+ */
+template <
+    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                        OffsetT>                        ///< Signed integer type for global offsets
+__global__ void DeviceHistogramInitKernel(
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
+    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    if ((threadIdx.x == 0) && (blockIdx.x == 0))
+        tile_queue.ResetDrain();
+
+    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    #pragma unroll
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
+            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
+    }
+}
+
+
+/**
+ * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
+ */
+template <
+    typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
+    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
+    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename                                            OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
+__global__ void DeviceHistogramSweepKernel(
+    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
+    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
+    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
+    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
+    int                                                     tiles_per_row,                      ///< Number of image tiles per row
+    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for compositing input tiles
+    typedef AgentHistogram<
+            AgentHistogramPolicyT,
+            PRIVATIZED_SMEM_BINS,
+            NUM_CHANNELS,
+            NUM_ACTIVE_CHANNELS,
+            SampleIteratorT,
+            CounterT,
+            PrivatizedDecodeOpT,
+            OutputDecodeOpT,
+            OffsetT>
+        AgentHistogramT;
+
+    // Shared memory for AgentHistogram
+    __shared__ typename AgentHistogramT::TempStorage temp_storage;
+
+    AgentHistogramT agent(
+        temp_storage,
+        d_samples,
+        num_output_bins_wrapper.array,
+        num_privatized_bins_wrapper.array,
+        d_output_histograms_wrapper.array,
+        d_privatized_histograms_wrapper.array,
+        output_decode_op_wrapper.array,
+        privatized_decode_op_wrapper.array);
+
+    // Initialize counters
+    agent.InitBinCounters();
+
+    // Consume input tiles
+    agent.ConsumeTiles(
+        num_row_pixels,
+        num_rows,
+        row_stride_samples,
+        tiles_per_row,
+        tile_queue);
+
+    // Store output to global (if necessary)
+    agent.StoreOutput();
+
+}
+
+
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
+ */
+template <
+    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
+    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    LevelT,                     ///< Type for specifying bin level boundaries
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DipatchHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample value type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    enum
+    {
+        // Maximum number of bins per channel for which we will use a privatized smem strategy
+        MAX_PRIVATIZED_SMEM_BINS = 256
+    };
+
+
+    //---------------------------------------------------------------------
+    // Transform functors for converting samples to bin-ids
+    //---------------------------------------------------------------------
+
+    // Searches for bin given a list of bin-boundary levels
+    template <typename LevelIteratorT>
+    struct SearchTransform
+    {
+        LevelIteratorT  d_levels;                   // Pointer to levels array
+        int             num_output_levels;          // Number of levels in array
+
+        // Initializer
+        __host__ __device__ __forceinline__ void Init(
+            LevelIteratorT  d_levels,               // Pointer to levels array
+            int             num_output_levels)      // Number of levels in array
+        {
+            this->d_levels          = d_levels;
+            this->num_output_levels = num_output_levels;
+        }
+
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            /// Level iterator wrapper type
+            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
+                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+                    LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
+                WrappedLevelIteratorT;
+
+            WrappedLevelIteratorT wrapped_levels(d_levels);
+
+            int num_bins = num_output_levels - 1;
+            if (valid)
+            {
+                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
+                if (bin >= num_bins)
+                    bin = -1;
+            }
+        }
+    };
+
+
+    // Scales samples to evenly-spaced bins
+    struct ScaleTransform
+    {
+        int    num_bins;    // Number of levels in array
+        LevelT max;         // Max sample level (exclusive)
+        LevelT min;         // Min sample level (inclusive)
+        LevelT scale;       // Bin scaling factor
+
+        // Initializer
+        template <typename _LevelT>
+        __host__ __device__ __forceinline__ void Init(
+            int     num_output_levels,  // Number of levels in array
+            _LevelT max,                // Max sample level (exclusive)
+            _LevelT min,                // Min sample level (inclusive)
+            _LevelT scale)              // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = scale;
+        }
+
+        // Initializer (float specialization)
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            float   max,                // Max sample level (exclusive)
+            float   min,                // Min sample level (inclusive)
+            float   scale)              // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = float(1.0) / scale;
+        }
+
+        // Initializer (double specialization)
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            double max,                 // Max sample level (exclusive)
+            double min,                 // Min sample level (inclusive)
+            double scale)               // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = double(1.0) / scale;
+        }
+
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) / scale);
+        }
+
+        // Method for converting samples to bin-ids (float specialization)
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+
+        // Method for converting samples to bin-ids (double specialization)
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+    };
+
+
+    // Pass-through bin transform operator
+    struct PassThruTransform
+    {
+        // Method for converting samples to bin-ids
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            if (valid)
+                bin = (int) sample;
+        }
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    template <int NOMINAL_ITEMS_PER_THREAD>
+    struct TScale
+    {
+        enum
+        {
+            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
+            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
+        };
+    };
+
+
+    /// SM11
+    struct Policy110
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                512,
+                (NUM_CHANNELS == 1) ? 8 : 2,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                (NUM_CHANNELS == 1) ? 256 : 128,
+                (NUM_CHANNELS == 1) ? 8 : 3,
+                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                512,
+                (NUM_CHANNELS == 1) ? 8 : 2,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM35
+    struct Policy350
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                128,
+                TScale<8>::VALUE,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLEND,
+                true>
+            HistogramSweepPolicy;
+    };
+
+    /// SM50
+    struct Policy500
+    {
+        // HistogramSweepPolicy
+        typedef AgentHistogramPolicy<
+                384,
+                TScale<16>::VALUE,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t InitConfigs(
+        int             ptx_version,
+        KernelConfig    &histogram_sweep_config)
+    {
+        cudaError_t result = cudaErrorNotSupported;
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 500)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 350)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
+                }
+                else
+                {
+                    result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
+                }
+            #endif
+        }
+        return result;
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration
+     */
+    struct KernelConfig
+    {
+        int                             block_threads;
+        int                             pixels_per_thread;
+
+        template <typename BlockPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t Init()
+        {
+            block_threads               = BlockPolicy::BLOCK_THREADS;
+            pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
+
+            return cudaSuccess;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Privatization-based dispatch routine
+     */
+    template <
+        typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+        typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+        typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
+        typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t PrivatizedDispatch(
+        void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
+        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
+        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
+        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
+        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
+        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histogram_sweep_kernel
+            int histogram_sweep_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histogram_sweep_sm_occupancy,
+                histogram_sweep_kernel,
+                histogram_sweep_config.block_threads))) break;
+
+            // Get device occupancy for histogram_sweep_kernel
+            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
+
+            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
+            {
+                // Treat as a single linear array of samples
+                num_row_pixels      *= num_rows;
+                num_rows            = 1;
+                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
+            }
+
+            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
+            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
+            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
+            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
+            int blocks_per_col      = (blocks_per_row > 0) ?
+                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
+                                        0;
+            int num_thread_blocks   = blocks_per_row * blocks_per_col;
+
+            dim3 sweep_grid_dims;
+            sweep_grid_dims.x = (unsigned int) blocks_per_row;
+            sweep_grid_dims.y = (unsigned int) blocks_per_col;
+            sweep_grid_dims.z = 1;
+
+            // Temporary storage allocation requirements
+            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
+            void*       allocations[NUM_ALLOCATIONS] = {};
+            size_t      allocation_sizes[NUM_ALLOCATIONS];
+
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
+
+            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the grid queue descriptor
+            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
+
+            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
+
+            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
+
+            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
+
+            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
+
+            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
+
+            int histogram_init_block_threads    = 256;
+            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
+
+            // Log DeviceHistogramInitKernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
+                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
+
+            // Invoke histogram_init_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                histogram_init_grid_dims, histogram_init_block_threads, 0,
+                stream
+            ).doit(histogram_init_kernel,
+                num_output_bins_wrapper,
+                d_output_histograms_wrapper,
+                tile_queue);
+
+            // Return if empty problem
+            if ((blocks_per_row == 0) || (blocks_per_col == 0))
+                break;
+
+            // Log histogram_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
+                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
+                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
+
+            // Invoke histogram_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
+            ).doit(histogram_sweep_kernel,
+                d_samples,
+                num_output_bins_wrapper,
+                num_privatized_bins_wrapper,
+                d_output_histograms_wrapper,
+                d_privatized_histograms_wrapper,
+                output_decode_op_wrapper,
+                privatized_decode_op_wrapper,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                tiles_per_row,
+                tile_queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the search transform op for converting samples to privatized bins
+            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            // Dispatch
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Too many bins to keep in shared memory.
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the search transform op for converting privatized bins to output bins
+            typedef SearchTransform<LevelT*> OutputDecodeOpT;
+
+            int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<false>     /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the scale transform op for converting samples to privatized bins
+            typedef ScaleTransform PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+
+                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+        Int2Type<true>      /*is_byte_sample*/)                         ///< [in] Marker type indicating whether or not SampleT is a 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the scale transform op for converting privatized bins to output bins
+            typedef ScaleTransform OutputDecodeOpT;
+
+            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2b0919fa1c2c65969590538e55b10e652eea9756
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -0,0 +1,1660 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_radix_sort_upsweep.cuh"
+#include "../../agent/agent_radix_sort_downsweep.cuh"
+#include "../../agent/agent_scan.cuh"
+#include "../../block/block_radix_sort.cuh"
+#include "../../config.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep digit-counting kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortUpsweepKernel(
+    const KeyT              *d_keys,                        ///< [in] Input keys buffer
+    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
+{
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>
+        ::Type ActiveUpsweepPolicyT;
+
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>
+        ::Type ActiveDownsweepPolicyT;
+
+    enum {
+        TILE_ITEMS = CUB_MAX(
+            ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+            ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    };
+
+    // Parameterize AgentRadixSortUpsweep type for the current configuration
+    typedef AgentRadixSortUpsweep<
+            ActiveUpsweepPolicyT,
+            KeyT,
+            OffsetT>
+        AgentRadixSortUpsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
+
+    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
+
+    CTA_SYNC();
+
+    // Write out digit counts (striped)
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanBinsKernel(
+    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int                     num_counts)                     ///< [in] Total number of bin-counts
+{
+    // Parameterize the AgentScan type for the current configuration
+    typedef AgentScan<
+            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
+            OffsetT*,
+            OffsetT*,
+            cub::Sum,
+            OffsetT,
+            OffsetT>
+        AgentScanT;
+
+    // Shared memory storage
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Block scan instance
+    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
+
+    // Process full input tiles
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
+    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
+        block_offset += AgentScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortDownsweepKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
+{
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>
+        ::Type ActiveUpsweepPolicyT;
+
+    typedef typename If<
+            (ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>
+        ::Type ActiveDownsweepPolicyT;
+
+    enum {
+        TILE_ITEMS = CUB_MAX(
+            ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+            ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    };
+
+    // Parameterize AgentRadixSortDownsweep type for the current configuration
+    typedef AgentRadixSortDownsweep<
+            ActiveDownsweepPolicyT,
+            IS_DESCENDING,
+            KeyT,
+            ValueT,
+            OffsetT>
+        AgentRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__  typename AgentRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    // Process input tiles
+    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+/**
+ * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceRadixSortSingleTileKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS           = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // BlockRadixSort type
+    typedef BlockRadixSort<
+            KeyT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            ValueT,
+            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
+            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
+        BlockRadixSortT;
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        KeyT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;
+
+    // Unsigned word for key bits
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;
+
+    // Shared memory storage
+    __shared__ union TempStorage
+    {
+        typename BlockRadixSortT::TempStorage       sort;
+        typename BlockLoadKeys::TempStorage         load_keys;
+        typename BlockLoadValues::TempStorage       load_values;
+
+    } temp_storage;
+
+    // Keys and values for the block
+    KeyT            keys[ITEMS_PER_THREAD];
+    ValueT          values[ITEMS_PER_THREAD];
+
+    // Get default (min/max) value for out-of-bounds keys
+    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
+    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);
+
+    // Load keys
+    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);
+
+    CTA_SYNC();
+
+    // Load values
+    if (!KEYS_ONLY)
+    {
+        // Register pressure work-around: moving num_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        num_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(num_items, 0, 0xffffffff);
+
+        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
+
+        CTA_SYNC();
+    }
+
+    // Sort tile
+    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
+        keys,
+        values,
+        current_bit,
+        end_bit,
+        Int2Type<IS_DESCENDING>(),
+        Int2Type<KEYS_ONLY>());
+
+    // Store keys and values
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
+        if (item_offset < num_items)
+        {
+            d_keys_out[item_offset] = keys[ITEM];
+            if (!KEYS_ONLY)
+                d_values_out[item_offset] = values[ITEM];
+        }
+    }
+}
+
+
+/**
+ * Segmented radix sorting pass (one block per segment)
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedRadixSortKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [in] Output values buffer
+    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
+{
+    //
+    // Constants
+    //
+
+    typedef typename If<(ALT_DIGIT_BITS),
+        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
+        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
+
+    enum
+    {
+        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
+        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        RADIX_DIGITS        = 1 << RADIX_BITS,
+        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Upsweep type
+    typedef AgentRadixSortUpsweep<SegmentedPolicyT, KeyT, OffsetT> BlockUpsweepT;
+
+    // Digit-scan type
+    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;
+
+    // Downsweep type
+    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
+    };
+
+    //
+    // Process input tiles
+    //
+
+    // Shared memory storage
+    __shared__ union
+    {
+        typename BlockUpsweepT::TempStorage     upsweep;
+        typename BlockDownsweepT::TempStorage   downsweep;
+        struct
+        {
+            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];
+            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];
+            typename DigitScanT::TempStorage        scan;
+        };
+
+    } temp_storage;
+
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+    OffsetT num_items       = segment_end - segment_begin;
+
+    // Check if empty segment
+    if (num_items <= 0)
+        return;
+
+    // Upsweep
+    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
+    upsweep.ProcessRegion(segment_begin, segment_end);
+
+    CTA_SYNC();
+
+    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
+    upsweep.ExtractCounts(bin_count);
+
+    CTA_SYNC();
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin counts
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    // Scan
+    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
+
+    #pragma unroll
+    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+    {
+        bin_offset[track] += segment_begin;
+    }
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin offsets
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    CTA_SYNC();
+
+    // Downsweep
+    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
+    downsweep.ProcessRegion(segment_begin, segment_end);
+}
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+/**
+ * Tuning policy for kernel specialization
+ */
+template <
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT>       ///< Signed integer type for global offsets
+struct DeviceRadixSortPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+    // Dominant-sized key/value type
+    typedef typename If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>::Type DominantT;
+
+    //------------------------------------------------------------------------------
+    // Architecture-specific tuning policies
+    //------------------------------------------------------------------------------
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+
+            // Relative size of KeyT type to a 4-byte word
+            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <512, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
+        };
+
+        // Scan policy
+        typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, 9, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef DownsweepPolicyKeys DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, 15, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+
+
+    };
+
+
+    /// SM50
+    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <160, 39, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 31, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 11, DominantT,  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
+    };
+
+
+    /// SM60 (GP100)
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <256, 25, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+
+    };
+
+
+    /// SM61 (GP104)
+    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <384, 31, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 35, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// SM62 (Tegra, less RF)
+    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM70 (GV100)
+    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)
+        };
+
+        // ScanPolicy
+        typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 256 : 128, 47, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>     UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy;
+
+        // Single-tile policy
+        typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT,  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies
+        typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT,  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// MaxPolicy
+    typedef Policy700 MaxPolicy;
+
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT,       ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceRadixSortPolicy<KeyT, ValueT, OffsetT> >
+struct DispatchRadixSort :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version
+    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version),
+        is_overwrite_okay(is_overwrite_okay)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block to sort in-core
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Log single_tile_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
+                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
+
+            // Invoke upsweep_kernel with same grid size as downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_keys.Current(),
+                d_keys.Alternate(),
+                d_values.Current(),
+                d_values.Alternate(),
+                num_items,
+                begin_bit,
+                end_bit);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update selector
+            d_keys.selector ^= 1;
+            d_values.selector ^= 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation
+    //------------------------------------------------------------------------------
+
+    /**
+     * Invoke a three-kernel sorting pass at the current bit.
+     */
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        OffsetT         *d_spine,
+        int             spine_length,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log upsweep_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
+                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
+
+            // Invoke upsweep_kernel with same grid size as downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.upsweep_config.block_threads, 0, stream
+            ).doit(pass_config.upsweep_kernel,
+                d_keys_in,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log scan_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
+
+            // Invoke scan_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, pass_config.scan_config.block_threads, 0, stream
+            ).doit(pass_config.scan_kernel,
+                d_spine,
+                spine_length);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log downsweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
+                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
+
+            // Invoke downsweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.downsweep_config.block_threads, 0, stream
+            ).doit(pass_config.downsweep_kernel,
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+
+    /// Pass configuration structure
+    template <
+        typename UpsweepKernelT,
+        typename ScanKernelT,
+        typename DownsweepKernelT>
+    struct PassConfig
+    {
+        UpsweepKernelT          upsweep_kernel;
+        KernelConfig            upsweep_config;
+        ScanKernelT             scan_kernel;
+        KernelConfig            scan_config;
+        DownsweepKernelT        downsweep_kernel;
+        KernelConfig            downsweep_config;
+        int                     radix_bits;
+        int                     radix_digits;
+        int                     max_downsweep_grid_size;
+        GridEvenShare<OffsetT>  even_share;
+
+        /// Initialize pass configuration
+        template <
+            typename UpsweepPolicyT,
+            typename ScanPolicyT,
+            typename DownsweepPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(
+            UpsweepKernelT      upsweep_kernel,
+            ScanKernelT         scan_kernel,
+            DownsweepKernelT    downsweep_kernel,
+            int                 ptx_version,
+            int                 sm_count,
+            int                 num_items)
+        {
+            cudaError error = cudaSuccess;
+            do
+            {
+                this->upsweep_kernel    = upsweep_kernel;
+                this->scan_kernel       = scan_kernel;
+                this->downsweep_kernel  = downsweep_kernel;
+                radix_bits              = DownsweepPolicyT::RADIX_BITS;
+                radix_digits            = 1 << radix_bits;
+
+                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
+                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
+                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
+
+                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+
+                even_share.DispatchInit(
+                    num_items,
+                    max_downsweep_grid_size,
+                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+            }
+            while (0);
+            return error;
+        }
+
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)upsweep_kernel;
+        (void)alt_upsweep_kernel;
+        (void)scan_kernel;
+        (void)downsweep_kernel;
+        (void)alt_downsweep_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular and alternate-digit kernel configurations
+            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template InitPassConfig<
+                    typename ActivePolicyT::UpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::DownsweepPolicy>(
+                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            if ((error = alt_pass_config.template InitPassConfig<
+                    typename ActivePolicyT::AltUpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::AltDownsweepPolicy>(
+                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            // Get maximum spine length
+            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[3] = {};
+            size_t allocation_sizes[3] =
+            {
+                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+            // Alias the temporary storage allocations
+            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                d_spine, spine_length, current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_spine, spine_length, current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
+
+                // Invert selectors
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
+                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        OffsetT                 num_items,              ///< [in] Number of items to sort
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,              ///< Key type
+    typename ValueT,            ///< Value type
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceRadixSortPolicy<KeyT, ValueT, OffsetT> >
+struct DispatchSegmentedRadixSort :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Parameter members
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version
+    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructors
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        is_overwrite_okay(is_overwrite_okay),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Multi-segment invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a three-kernel sorting pass at the current bit.
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log kernel configuration
+            if (debug_synchronous)
+            {
+              _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
+                      "%lld items per thread, %lld SM occupancy, "
+                      "current bit %d, bit_grain %d\n",
+                      (long long)num_segments,
+                      (long long)pass_config.segmented_config.block_threads,
+                      (long long)stream,
+                      (long long)pass_config.segmented_config.items_per_thread,
+                      (long long)pass_config.segmented_config.sm_occupancy,
+                      current_bit,
+                      pass_bits);
+            }
+
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments, pass_config.segmented_config.block_threads, 0,
+                stream
+            ).doit(pass_config.segmented_kernel,
+                d_keys_in, d_keys_out,
+                d_values_in,  d_values_out,
+                d_begin_offsets, d_end_offsets, num_segments,
+                current_bit, pass_bits);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /// PassConfig data structure
+    template <typename SegmentedKernelT>
+    struct PassConfig
+    {
+        SegmentedKernelT    segmented_kernel;
+        KernelConfig        segmented_config;
+        int                 radix_bits;
+        int                 radix_digits;
+
+        /// Initialize pass configuration
+        template <typename SegmentedPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
+        {
+            this->segmented_kernel  = segmented_kernel;
+            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
+            this->radix_digits      = 1 << radix_bits;
+
+            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
+        }
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+        SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+      (void)segmented_kernel;
+      (void)alt_segmented_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Init regular and alternate kernel configurations
+            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
+            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
+
+            // Temporary storage allocation requirements
+            void* allocations[2] = {};
+            size_t allocation_sizes[2] =
+            {
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                if (temp_storage_bytes == 0)
+                    temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
+            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+                // Invert selectors and update current bit
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+
+    /// Internal dispatch routine
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,              ///< [in] Number of items to sort
+        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchSegmentedRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, num_segments, d_begin_offsets, d_end_offsets,
+                begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c9a5e4fbe0f33c19e774b47dc0231fcc2d1851c7
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -0,0 +1,885 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_reduce.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../config.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
+{
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
+
+    // Output result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = block_aggregate;
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                     ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceReduceSingleTileKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                  init)                       ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Check if empty problem
+    if (num_items == 0)
+    {
+        if (threadIdx.x == 0)
+            *d_out = init;
+        return;
+    }
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        OffsetT(0),
+        num_items);
+
+    // Output result
+    if (threadIdx.x == 0)
+        *d_out = reduction_op(init, block_aggregate);
+}
+
+
+/// Normalize input iterator to segment offset
+template <typename T, typename OffsetT, typename IteratorT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    T &/*val*/,
+    OffsetT /*base_offset*/,
+    IteratorT /*itr*/)
+{}
+
+
+/// Normalize input iterator to segment offset (specialized for arg-index)
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    KeyValuePairT &val,
+    OffsetT base_offset,
+    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
+{
+    val.key -= base_offset;
+}
+
+
+/**
+ * Segmented reduction (one block per segment)
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the sorting data
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                 init)                       ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+
+    // Check if empty problem
+    if (segment_begin == segment_end)
+    {
+        if (threadIdx.x == 0)
+            d_out[blockIdx.x] = init;
+        return;
+    }
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        segment_begin,
+        segment_end);
+
+    // Normalize as needed
+    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
+
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = reduction_op(init, block_aggregate);;
+}
+
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+    typename InputT,            ///< Input data type
+    typename OutputT,           ///< Compute/output data type
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct DeviceReducePolicy
+{
+    //------------------------------------------------------------------------------
+    // Architecture-specific tuning policies
+    //------------------------------------------------------------------------------
+
+    /// SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
+    {
+        // ReducePolicy
+        typedef AgentReducePolicy<
+                128, 8, InputT,                        ///< Threads per block, items per thread, compute type
+                2,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
+        typedef AgentReducePolicy<
+                128, 8, InputT,                        ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
+        typedef AgentReducePolicy<
+                256, 20, InputT,                       ///< Threads per block, items per thread, compute type, compute type
+                2,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                           ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
+        typedef AgentReducePolicy<
+                256, 20, InputT,                       ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                               ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+    /// SM60
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+    {
+        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
+        typedef AgentReducePolicy<
+                256, 16, InputT,                       ///< Threads per block, items per thread, compute type
+                4,                                      ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                               ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// MaxPolicy
+    typedef Policy600 MaxPolicy;
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OutputT =          ///< Data type of the output iterator
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+    typename SelectedPolicy = DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OutputT,
+        OffsetT,
+        ReductionOpT> >
+struct DispatchReduce :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
+    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
+    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor
+    OutputT             init;                           ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;                    ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_items,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_in(d_in),
+        d_out(d_out),
+        num_items(num_items),
+        reduction_op(reduction_op),
+        init(init),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block block to reduce in-core
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke single_reduce_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_in,
+                d_out,
+                num_items,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation (two-pass)
+    //------------------------------------------------------------------------------
+
+    /// Invoke two-passes to reduce
+    template <
+        typename                ActivePolicyT,              ///< Umbrella policy active for the target device
+        typename                ReduceKernelT,              ///< Function type of cub::DeviceReduceKernel
+        typename                SingleTileKernelT>          ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)                  reduce_kernel;
+        (void)                  single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular kernel configuration
+            KernelConfig reduce_config;
+            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
+            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
+
+            // Even-share work distribution
+            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+            GridEvenShare<OffsetT> even_share;
+            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
+
+            // Temporary storage allocation requirements
+            void* allocations[1] = {};
+            size_t allocation_sizes[1] =
+            {
+                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            OutputT *d_block_reductions = (OutputT*) allocations[0];
+
+            // Get grid size for device_reduce_sweep_kernel
+            int reduce_grid_size = even_share.grid_size;
+
+            // Log device_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_grid_size,
+                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
+                reduce_config.sm_occupancy);
+
+            // Invoke DeviceReduceKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS,
+                0, stream
+            ).doit(reduce_kernel,
+                d_in,
+                d_block_reductions,
+                num_items,
+                even_share,
+                reduction_op);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke DeviceReduceSingleTileKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
+                d_block_reductions,
+                d_out,
+                reduce_grid_size,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
+                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
+        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchReduce dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_in, d_out, num_items, reduction_op, init,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OutputT =          ///< Data type of the output iterator
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+    typename SelectedPolicy = DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OutputT,
+        OffsetT,
+        ReductionOpT> >
+struct DispatchSegmentedReduce :
+    SelectedPolicy
+{
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
+    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor
+    OutputT             init;                   ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;            ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_in(d_in),
+        d_out(d_out),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        reduction_op(reduction_op),
+        init(init),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <
+        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
+        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)segmented_reduce_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Init kernel configuration
+            KernelConfig segmented_reduce_config;
+            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
+
+            // Log device_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                num_segments,
+                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
+                segmented_reduce_config.sm_occupancy);
+
+            // Invoke DeviceReduceKernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments,
+                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream
+            ).doit(segmented_reduce_kernel,
+                d_in,
+                d_out,
+                d_begin_offsets,
+                d_end_offsets,
+                num_segments,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+
+    }
+
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
+        int             num_segments,                       ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
+
+        if (num_segments <= 0)
+            return cudaSuccess;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchSegmentedReduce dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_in, d_out,
+                num_segments, d_begin_offsets, d_end_offsets,
+                reduction_op, init,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d8d8dcac41965eb006fbad4e0e94db14359a835b
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -0,0 +1,560 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_reduce_by_key.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Multi-block reduce-by-key sweep kernel entry point
+ */
+template <
+    typename            AgentReduceByKeyPolicyT,                 ///< Parameterized AgentReduceByKeyPolicyT tuning policy type
+    typename            KeysInputIteratorT,                     ///< Random-access input iterator type for keys
+    typename            UniqueOutputIteratorT,                  ///< Random-access output iterator type for keys
+    typename            ValuesInputIteratorT,                   ///< Random-access input iterator type for values
+    typename            AggregatesOutputIteratorT,              ///< Random-access output iterator type for values
+    typename            NumRunsOutputIteratorT,                 ///< Output iterator type for recording number of segments encountered
+    typename            ScanTileStateT,                         ///< Tile status interface type
+    typename            EqualityOpT,                            ///< KeyT equality operator type
+    typename            ReductionOpT,                           ///< ValueT reduction operator type
+    typename            OffsetT>                                ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
+__global__ void DeviceReduceByKeyKernel(
+    KeysInputIteratorT          d_keys_in,                      ///< Pointer to the input sequence of keys
+    UniqueOutputIteratorT       d_unique_out,                   ///< Pointer to the output sequence of unique keys (one key per run)
+    ValuesInputIteratorT        d_values_in,                    ///< Pointer to the input sequence of corresponding values
+    AggregatesOutputIteratorT   d_aggregates_out,               ///< Pointer to the output sequence of value aggregates (one aggregate per run)
+    NumRunsOutputIteratorT      d_num_runs_out,                 ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+    ScanTileStateT              tile_state,                     ///< Tile status interface
+    int                         start_tile,                     ///< The starting tile for the current grid
+    EqualityOpT                 equality_op,                    ///< KeyT equality operator
+    ReductionOpT                reduction_op,                   ///< ValueT reduction operator
+    OffsetT                     num_items)                      ///< Total number of items to select from
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentReduceByKey<
+            AgentReduceByKeyPolicyT,
+            KeysInputIteratorT,
+            UniqueOutputIteratorT,
+            ValuesInputIteratorT,
+            AggregatesOutputIteratorT,
+            NumRunsOutputIteratorT,
+            EqualityOpT,
+            ReductionOpT,
+            OffsetT>
+        AgentReduceByKeyT;
+
+    // Shared memory for AgentReduceByKey
+    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename    KeysInputIteratorT,         ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,      ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,       ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,  ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording number of segments encountered
+    typename    EqualityOpT,                ///< KeyT equality operator type
+    typename    ReductionOpT,               ///< ValueT reduction operator type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchReduceByKey
+{
+    //-------------------------------------------------------------------------
+    // Types and constants
+    //-------------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS     = 128,
+        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
+        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+
+    //-------------------------------------------------------------------------
+    // Tuning policies
+    //-------------------------------------------------------------------------
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM11
+    struct Policy110
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef AgentReduceByKeyPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            ReduceByKeyPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &reduce_by_key_config)
+    {
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
+                }
+                else
+                {
+                    reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduce-by-key using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelT,         ///< Function type of cub::DeviceScanInitKernel
+        typename                    ReduceByKeyKernelT>      ///< Function type of cub::DeviceReduceByKeyKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                  ///< [in] Total number of items to select from
+        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelT                init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
+        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+      (void)d_temp_storage;
+      (void)temp_storage_bytes;
+      (void)d_keys_in;
+      (void)d_unique_out;
+      (void)d_values_in;
+      (void)d_aggregates_out;
+      (void)d_num_runs_out;
+      (void)equality_op;
+      (void)reduction_op;
+      (void)num_items;
+      (void)stream;
+      (void)debug_synchronous;
+      (void)init_kernel;
+      (void)reduce_by_key_kernel;
+      (void)reduce_by_key_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
+                tile_state,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for reduce_by_key_kernel
+            int reduce_by_key_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                reduce_by_key_sm_occupancy,            // out
+                reduce_by_key_kernel,
+                reduce_by_key_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log reduce_by_key_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
+
+                // Invoke reduce_by_key_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    scan_grid_size, reduce_by_key_config.block_threads, 0,
+                    stream
+                ).doit(reduce_by_key_kernel,
+                    d_keys_in,
+                    d_unique_out,
+                    d_values_in,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    start_tile,
+                    equality_op,
+                    reduction_op,
+                    num_items);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                      ///< [in] Total number of items to select from
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig reduce_by_key_config;
+            InitConfigs(ptx_version, reduce_by_key_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_unique_out,
+                d_values_in,
+                d_aggregates_out,
+                d_num_runs_out,
+                equality_op,
+                reduction_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
+                reduce_by_key_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_rle.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_rle.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..68f887151604c627901c4bbdd377a73bc95f9537
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -0,0 +1,542 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_rle.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOp functor type != NullType
+ * Otherwise performs flag-based selection if FlagIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            ScanTileStateT,              ///< Tile status interface type
+    typename            EqualityOpT,                 ///< T equality operator type
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
+__global__ void DeviceRleSweepKernel(
+    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
+    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
+    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+    ScanTileStateT              tile_status,        ///< [in] Tile status interface
+    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
+    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    typedef AgentRle<
+        AgentRlePolicyT,
+        InputIteratorT,
+        OffsetsOutputIteratorT,
+        LengthsOutputIteratorT,
+        EqualityOpT,
+        OffsetT> AgentRleT;
+
+    // Shared memory for AgentRle
+    __shared__ typename AgentRleT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_runs_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
+ */
+template <
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            EqualityOpT,                ///< T equality operator type
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+struct DeviceRleDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                96,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig&   device_rle_config)
+    {
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                device_rle_config.template Init<PtxRleSweepPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
+                }
+                else
+                {
+                    device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
+     */
+    struct KernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_policy;
+        bool                    store_warp_time_slicing;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename AgentRlePolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
+            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
+            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
+            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
+            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                store_warp_time_slicing,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide run-length-encode using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
+        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernelPtr
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
+        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
+        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log device_scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(device_scan_init_kernel,
+                tile_status,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for device_rle_sweep_kernel
+            int device_rle_kernel_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                device_rle_kernel_sm_occupancy,            // out
+                device_rle_sweep_kernel,
+                device_rle_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log device_rle_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
+
+            // Invoke device_rle_sweep_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, device_rle_config.block_threads, 0, stream
+            ).doit(device_rle_sweep_kernel,
+                d_in,
+                d_offsets_out,
+                d_lengths_out,
+                d_num_runs_out,
+                tile_status,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig device_rle_config;
+            InitConfigs(ptx_version, device_rle_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_offsets_out,
+                d_lengths_out,
+                d_num_runs_out,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
+                device_rle_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..24b30f102cfca976cfb61b354f2ad7719255e3c8
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -0,0 +1,493 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ */
+template <
+    typename            ScanTileStateT>     ///< Tile status interface type
+__global__ void DeviceScanInitKernel(
+    ScanTileStateT      tile_state,         ///< [in] Tile status interface
+    int                 num_tiles)          ///< [in] Number of tiles
+{
+    // Initialize tile status
+    tile_state.InitializeStatus(num_tiles);
+}
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ */
+template <
+    typename                ScanTileStateT,         ///< Tile status interface type
+    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
+__global__ void DeviceCompactInitKernel(
+    ScanTileStateT          tile_state,             ///< [in] Tile status interface
+    int                     num_tiles,              ///< [in] Number of tiles
+    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+{
+    // Initialize tile status
+    tile_state.InitializeStatus(num_tiles);
+
+    // Initialize d_num_selected_out
+    if ((blockIdx.x == 0) && (threadIdx.x == 0))
+        *d_num_selected_out = 0;
+}
+
+
+/**
+ * Scan kernel entry point (multi-block)
+ */
+template <
+    typename            ScanPolicyT,        ///< Parameterized ScanPolicyT tuning policy type
+    typename            InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename            OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename            ScanTileStateT,     ///< Tile status interface type
+    typename            ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename            InitValueT,         ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
+    typename            OffsetT>            ///< Signed integer type for global offsets
+__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
+__global__ void DeviceScanKernel(
+    InputIteratorT      d_in,               ///< Input data
+    OutputIteratorT     d_out,              ///< Output data
+    ScanTileStateT      tile_state,         ///< Tile status interface
+    int                 start_tile,         ///< The starting tile for the current grid
+    ScanOpT             scan_op,            ///< Binary scan functor
+    InitValueT          init_value,         ///< Initial value to seed the exclusive scan
+    OffsetT             num_items)          ///< Total number of scan items for the entire problem
+{
+    // Thread block type for scanning input tiles
+    typedef AgentScan<
+        ScanPolicyT,
+        InputIteratorT,
+        OutputIteratorT,
+        ScanOpT,
+        InitValueT,
+        OffsetT> AgentScanT;
+
+    // Shared memory for AgentScan
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+    typename OutputT> ///< Data type
+struct DeviceScanPolicy
+{
+
+    /// SM10
+    struct Policy100 : ChainedPolicy<100, Policy100, Policy100>
+    {
+        typedef AgentScanPolicy<
+                64, 9,                                          ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy100>
+    {
+        typedef AgentScanPolicy<
+                96, 21,                                         ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicyT;
+    };
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        typedef AgentScanPolicy<
+                256, 9,                                         ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                BLOCK_SCAN_RAKING>
+            ScanPolicyT;
+    };
+
+    /// SM520
+    struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
+    {
+        // Titan X: 32.47B items/s @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                128, 12,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM600
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
+    {
+        typedef AgentScanPolicy<
+                128, 15,                                        ///< Threads per block, items per thread
+                OutputT,
+                BLOCK_LOAD_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// MaxPolicy
+    typedef Policy600 MaxPolicy;
+};
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
+ */
+template <
+    typename InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename InitValueT,          ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
+    typename OffsetT,            ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceScanPolicy<
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type> >
+struct DispatchScan:
+    SelectedPolicy
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128
+    };
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    void*           d_temp_storage;         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t&         temp_storage_bytes;     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT  d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out;                  ///< [out] Pointer to the output sequence of data items
+    ScanOpT         scan_op;                ///< [in] Binary scan functor
+    InitValueT      init_value;             ///< [in] Initial value to seed the exclusive scan
+    OffsetT         num_items;              ///< [in] Total number of input items (i.e., the length of \p d_in)
+    cudaStream_t    stream;                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool            debug_synchronous;
+    int             ptx_version;
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchScan(
+        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        ScanOpT         scan_op,                ///< [in] Binary scan functor
+        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
+        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous,
+        int             ptx_version
+    ):
+    d_temp_storage(d_temp_storage),
+    temp_storage_bytes(temp_storage_bytes),
+    d_in(d_in),
+    d_out(d_out),
+    num_items(num_items),
+    scan_op(scan_op),
+    init_value(init_value),
+    stream(stream),
+    debug_synchronous(debug_synchronous),
+    ptx_version(ptx_version)
+    {}
+
+    template <typename ActivePolicyT, typename InitKernel, typename ScanKernel>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        (void)init_kernel;
+        (void)scan_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        typedef typename ActivePolicyT::ScanPolicyT Policy;
+        typedef typename cub::ScanTileState<OutputT> ScanTileStateT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
+                tile_state,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+
+            // Get SM occupancy for scan_kernel
+            int scan_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                scan_sm_occupancy,            // out
+                scan_kernel,
+                Policy::BLOCK_THREADS))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log scan_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy);
+
+                // Invoke scan_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    scan_grid_size, Policy::BLOCK_THREADS, 0, stream
+                ).doit(scan_kernel,
+                    d_in,
+                    d_out,
+                    tile_state,
+                    start_tile,
+                    scan_op,
+                    init_value,
+                    num_items);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::ScanPolicyT Policy;
+        typedef typename cub::ScanTileState<OutputT> ScanTileStateT;
+        // Ensure kernels are instantiated.
+        return Invoke<ActivePolicyT>(
+            DeviceScanInitKernel<ScanTileStateT>,
+            DeviceScanKernel<Policy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>
+        );
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                ///< [in] Binary scan functor
+        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchScan::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchScan dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            scan_op,
+            init_value,
+            stream,
+            debug_synchronous,
+            ptx_version
+            );
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_select_if.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_select_if.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5fec4cff72a6e45fed308aaf9658e4ec13190d02
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -0,0 +1,546 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../config.cuh"
+#include "../../agent/agent_select_if.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
+    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
+    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
+    typename            ScanTileStateT,             ///< Tile status interface type
+    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename            OffsetT,                    ///< Signed integer type for global offsets
+    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
+__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
+__global__ void DeviceSelectSweepKernel(
+    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
+    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
+    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
+    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+    ScanTileStateT          tile_status,            ///< [in] Tile status interface
+    SelectOpT               select_op,              ///< [in] Selection operator
+    EqualityOpT             equality_op,            ///< [in] Equality operator
+    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    typedef AgentSelectIf<
+        AgentSelectIfPolicyT,
+        InputIteratorT,
+        FlagsInputIteratorT,
+        SelectedOutputIteratorT,
+        SelectOpT,
+        EqualityOpT,
+        OffsetT,
+        KEEP_REJECTS> AgentSelectIfT;
+
+    // Shared memory for AgentSelectIf
+    __shared__ typename AgentSelectIfT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_selected_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
+    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct DispatchSelectIf
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 10,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectIfPolicyT;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            SelectIfPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &select_if_config)
+    {
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                select_if_config.template Init<PtxSelectIfPolicyT>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
+                }
+                else if (ptx_version >= 130)
+                {
+                    select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
+                }
+                else
+                {
+                    select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelPtrT,             ///< Function type of cub::DeviceScanInitKernel
+        typename                    SelectIfKernelPtrT>             ///< Function type of cub::SelectIfKernelPtrT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
+        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+        (void)d_temp_storage;
+        (void)temp_storage_bytes;
+        (void)d_in;
+        (void)d_flags;
+        (void)d_selected_out;
+        (void)d_num_selected_out;
+        (void)select_op;
+        (void)equality_op;
+        (void)num_items;
+        (void)stream;
+        (void)debug_synchronous;
+        (void)scan_init_kernel;
+        (void)select_if_kernel;
+        (void)select_if_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke scan_init_kernel to initialize tile descriptors
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(scan_init_kernel,
+                tile_status,
+                num_tiles,
+                d_num_selected_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for select_if_kernel
+            int range_select_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                range_select_sm_occupancy,            // out
+                select_if_kernel,
+                select_if_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log select_if_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
+
+            // Invoke select_if_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, select_if_config.block_threads, 0, stream
+            ).doit(select_if_kernel,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig select_if_config;
+            InitConfigs(ptx_version, select_if_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
+                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
+                select_if_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/thrust/dependencies/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fb431df2cbdd363028f7d76826c185bc8417ab8a
--- /dev/null
+++ b/thrust/dependencies/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -0,0 +1,850 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/single_pass_scan_operators.cuh"
+#include "../../agent/agent_segment_fixup.cuh"
+#include "../../agent/agent_spmv_orig.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../config.cuh"
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * SpMV kernel entry points
+ *****************************************************************************/
+
+/**
+ * Spmv search kernel. Identifies merge path starting coordinates for each tile.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for sequence offsets
+__global__ void DeviceSpmv1ColKernel(
+    SpmvParams<ValueT, OffsetT> spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
+
+    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (row_idx < spmv_params.num_rows)
+    {
+        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
+        OffsetT     nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];
+
+        ValueT value = 0.0;
+        if (end_nonzero_idx != nonzero_idx)
+        {
+            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
+        }
+
+        spmv_params.d_vector_y[row_idx] = value;
+    }
+}
+
+
+/**
+ * Spmv search kernel. Identifies merge path starting coordinates for each tile.
+ */
+template <
+    typename    SpmvPolicyT,                    ///< Parameterized SpmvPolicy tuning policy type
+    typename    OffsetT,                        ///< Signed integer type for sequence offsets
+    typename    CoordinateT,                    ///< Merge path coordinate type
+    typename    SpmvParamsT>                    ///< SpmvParams type
+__global__ void DeviceSpmvSearchKernel(
+    int             num_merge_tiles,            ///< [in] Number of SpMV merge tiles (spmv grid size)
+    CoordinateT*    d_tile_coordinates,         ///< [out] Pointer to the temporary array of tile starting coordinates
+    SpmvParamsT     spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    typedef CacheModifiedInputIterator<
+            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
+    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (tile_idx < num_merge_tiles + 1)
+    {
+        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
+        CoordinateT                     tile_coordinate;
+        CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+        // Search the merge path
+        MergePathSearch(
+            diagonal,
+            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+            nonzero_indices,
+            spmv_params.num_rows,
+            spmv_params.num_nonzeros,
+            tile_coordinate);
+
+        // Output starting offset
+        d_tile_coordinates[tile_idx] = tile_coordinate;
+    }
+}
+
+
+/**
+ * Spmv agent entry point
+ */
+template <
+    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
+    typename        ScanTileStateT,             ///< Tile status interface type
+    typename        ValueT,                     ///< Matrix and vector value type
+    typename        OffsetT,                    ///< Signed integer type for sequence offsets
+    typename        CoordinateT,                ///< Merge path coordinate type
+    bool            HAS_ALPHA,                  ///< Whether the input parameter Alpha is 1
+    bool            HAS_BETA>                   ///< Whether the input parameter Beta is 0
+__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
+__global__ void DeviceSpmvKernel(
+    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
+    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
+    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+    int                             num_tiles,                  ///< [in] Number of merge tiles
+    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for fixup reduce-by-key kernel
+    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
+{
+    // Spmv agent type specialization
+    typedef AgentSpmv<
+            SpmvPolicyT,
+            ValueT,
+            OffsetT,
+            HAS_ALPHA,
+            HAS_BETA>
+        AgentSpmvT;
+
+    // Shared memory for AgentSpmv
+    __shared__ typename AgentSpmvT::TempStorage temp_storage;
+
+    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
+        d_tile_coordinates,
+        d_tile_carry_pairs,
+        num_tiles);
+
+    // Initialize fixup tile status
+    tile_state.InitializeStatus(num_segment_fixup_tiles);
+
+}
+
+
+/**
+ * Multi-block reduce-by-key sweep kernel entry point
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    typename    ScanTileStateT>                 ///< Tile status interface type
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
+__global__ void DeviceSegmentFixupKernel(
+    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
+    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
+    OffsetT                     num_items,          ///< [in] Total number of items to select from
+    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
+    ScanTileStateT              tile_state)         ///< [in] Tile status interface
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentSegmentFixup<
+            AgentSegmentFixupPolicyT,
+            PairsInputIteratorT,
+            AggregatesOutputIteratorT,
+            cub::Equality,
+            cub::Sum,
+            OffsetT>
+        AgentSegmentFixupT;
+
+    // Shared memory for AgentSegmentFixup
+    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
+        num_items,
+        num_tiles,
+        tile_state);
+}
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
+ */
+template <
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchSpmv
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128
+    };
+
+    // SpmvParams bundle type
+    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;
+
+    // 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    /// SM11
+    struct Policy110
+    {
+        typedef AgentSpmvPolicy<
+                128,
+                1,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                18,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_RAKING>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+
+    /// SM30
+    struct Policy300
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                6,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+    /// SM35
+    struct Policy350
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 96 : 128,
+                (sizeof(ValueT) > 4) ? 4 : 7,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM37
+    struct Policy370
+    {
+
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 128 : 128,
+                (sizeof(ValueT) > 4) ? 9 : 14,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM50
+    struct Policy500
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 6 : 7,
+                LOAD_LDG,
+                LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM60
+    struct Policy600
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 5 : 7,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 600)
+    typedef Policy600 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 370)
+    typedef Policy370 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
+    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &spmv_config,
+        KernelConfig    &segment_fixup_config)
+    {
+        if (CUB_IS_DEVICE_CODE)
+        {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                spmv_config.template Init<PtxSpmvPolicyT>();
+                segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
+            #endif
+        }
+        else
+        {
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 600)
+                {
+                    spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 500)
+                {
+                    spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 370)
+                {
+                    spmv_config.template            Init<typename Policy370::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy370::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 350)
+                {
+                    spmv_config.template            Init<typename Policy350::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy350::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 300)
+                {
+                    spmv_config.template            Init<typename Policy300::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy300::SegmentFixupPolicyT>();
+                }
+                else if (ptx_version >= 200)
+                {
+                    spmv_config.template            Init<typename Policy200::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy200::SegmentFixupPolicyT>();
+                }
+                else
+                {
+                    spmv_config.template            Init<typename Policy110::SpmvPolicyT>();
+                    segment_fixup_config.template   Init<typename Policy110::SegmentFixupPolicyT>();
+                }
+            #endif
+        }
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction using the
+     * specified kernel functions.
+     *
+     * If the input is larger than a single tile, this method uses two-passes of
+     * kernel invocations.
+     */
+    template <
+        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
+        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
+        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
+        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
+        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
+        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
+        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
+        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
+        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            if (spmv_params.num_cols == 1)
+            {
+                if (d_temp_storage == NULL)
+                {
+                    // Return if the caller is simply requesting the size of the storage allocation
+                    temp_storage_bytes = 1;
+                    break;
+                }
+
+                // Get search/init grid dims
+                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
+                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+
+                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, 0,
+                    stream
+                ).doit(spmv_1col_kernel,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                break;
+            }
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Total number of spmv work items
+            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
+
+            // Tile sizes of kernels
+            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
+            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
+
+            // Number of tiles for kernels
+            int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
+            int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+
+            // Get SM occupancy for kernels
+            int spmv_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                spmv_sm_occupancy,
+                spmv_kernel,
+                spmv_config.block_threads))) break;
+
+            int segment_fixup_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                segment_fixup_sm_occupancy,
+                segment_fixup_kernel,
+                segment_fixup_config.block_threads))) break;
+
+            // Get grid dimensions
+            dim3 spmv_grid_size(
+                CUB_MIN(num_merge_tiles, max_dim_x),
+                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            dim3 segment_fixup_grid_size(
+                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
+                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            // Get the temporary storage allocation requirements
+            size_t allocation_sizes[3];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
+            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
+            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            void* allocations[3] = {};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Alias the other allocations
+            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
+            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates
+
+            // Get search/init grid dims
+            int search_block_size   = INIT_KERNEL_THREADS;
+            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Init textures
+                    if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+                }
+            #endif
+
+            if (search_grid_size < sm_count)
+//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
+            {
+                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
+                d_tile_coordinates = NULL;
+            }
+            else
+            {
+                // Use separate search kernel if we have enough spmv tiles to saturate the device
+
+                // Log spmv_search_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    search_grid_size, search_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    search_grid_size, search_block_size, 0, stream
+                ).doit(spmv_search_kernel,
+                    num_merge_tiles,
+                    d_tile_coordinates,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log spmv_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
+
+            // Invoke spmv_kernel
+            thrust::cuda_cub::launcher::triple_chevron(
+                spmv_grid_size, spmv_config.block_threads, 0, stream
+            ).doit(spmv_kernel,
+                spmv_params,
+                d_tile_coordinates,
+                d_tile_carry_pairs,
+                num_merge_tiles,
+                tile_state,
+                num_segment_fixup_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Run reduce-by-key fixup if necessary
+            if (num_merge_tiles > 1)
+            {
+                // Log segment_fixup_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
+
+                // Invoke segment_fixup_kernel
+                thrust::cuda_cub::launcher::triple_chevron(
+                    segment_fixup_grid_size, segment_fixup_config.block_threads,
+                    0, stream
+                ).doit(segment_fixup_kernel,
+                    d_tile_carry_pairs,
+                    spmv_params.d_vector_y,
+                    num_merge_tiles,
+                    num_segment_fixup_tiles,
+                    tile_state);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Free textures
+                    if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+                }
+            #endif
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig spmv_config, segment_fixup_config;
+            InitConfigs(ptx_version, spmv_config, segment_fixup_config);
+
+            if (CubDebug(error = Dispatch(
+                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
+                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
+                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
+                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
+                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
+                spmv_config, segment_fixup_config))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/grid/grid_barrier.cuh b/thrust/dependencies/cub/cub/grid/grid_barrier.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1bcb533ee47ffa21157528fc895e107a9c9fa65e
--- /dev/null
+++ b/thrust/dependencies/cub/cub/grid/grid_barrier.cuh
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+
+#pragma once
+
+#include "../util_debug.cuh"
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+class GridBarrier
+{
+protected :
+
+    typedef unsigned int SyncFlag;
+
+    // Counters in global device memory
+    SyncFlag* d_sync;
+
+public:
+
+    /**
+     * Constructor
+     */
+    GridBarrier() : d_sync(NULL) {}
+
+
+    /**
+     * Synchronize
+     */
+    __device__ __forceinline__ void Sync() const
+    {
+        volatile SyncFlag *d_vol_sync = d_sync;
+
+        // Threadfence and syncthreads to make sure global writes are visible before
+        // thread-0 reports in with its sync counter
+        __threadfence();
+        CTA_SYNC();
+
+        if (blockIdx.x == 0)
+        {
+            // Report in ourselves
+            if (threadIdx.x == 0)
+            {
+                d_vol_sync[blockIdx.x] = 1;
+            }
+
+            CTA_SYNC();
+
+            // Wait for everyone else to report in
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+
+            // Let everyone know it's safe to proceed
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                d_vol_sync[peer_block] = 0;
+            }
+        }
+        else
+        {
+            if (threadIdx.x == 0)
+            {
+                // Report in
+                d_vol_sync[blockIdx.x] = 1;
+
+                // Wait for acknowledgment
+                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+        }
+    }
+};
+
+
+/**
+ * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
+ *
+ * Uses RAII for lifetime, i.e., device resources are reclaimed when
+ * the destructor is called.
+ */
+class GridBarrierLifetime : public GridBarrier
+{
+protected:
+
+    // Number of bytes backed by d_sync
+    size_t sync_bytes;
+
+public:
+
+    /**
+     * Constructor
+     */
+    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
+
+
+    /**
+     * DeviceFrees and resets the progress counters
+     */
+    cudaError_t HostReset()
+    {
+        cudaError_t retval = cudaSuccess;
+        if (d_sync)
+        {
+            CubDebug(retval = cudaFree(d_sync));
+            d_sync = NULL;
+        }
+        sync_bytes = 0;
+        return retval;
+    }
+
+
+    /**
+     * Destructor
+     */
+    virtual ~GridBarrierLifetime()
+    {
+        HostReset();
+    }
+
+
+    /**
+     * Sets up the progress counters for the next kernel launch (lazily
+     * allocating and initializing them if necessary)
+     */
+    cudaError_t Setup(int sweep_grid_size)
+    {
+        cudaError_t retval = cudaSuccess;
+        do {
+            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
+            if (new_sync_bytes > sync_bytes)
+            {
+                if (d_sync)
+                {
+                    if (CubDebug(retval = cudaFree(d_sync))) break;
+                }
+
+                sync_bytes = new_sync_bytes;
+
+                // Allocate and initialize to zero
+                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
+                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
+            }
+        } while (0);
+
+        return retval;
+    }
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/grid/grid_even_share.cuh b/thrust/dependencies/cub/cub/grid/grid_even_share.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d5f8b340ce7c72b701b1192844c9ebc8a9c3b6ef
--- /dev/null
+++ b/thrust/dependencies/cub/cub/grid/grid_even_share.cuh
@@ -0,0 +1,224 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
+ */
+
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "grid_mapping.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridEvenShare is a descriptor utility for distributing input among
+ * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
+ * the same number of input tiles.
+ *
+ * \par Overview
+ * Each thread block is assigned a consecutive sequence of input tiles.  To help
+ * preserve alignment and eliminate the overhead of guarded loads for all but the
+ * last thread block, to GridEvenShare assigns one of three different amounts of
+ * work to a given thread block: "big", "normal", or "last".  The "big" workloads
+ * are one scheduling grain larger than "normal".  The "last" work unit for the
+ * last thread block may be partially-full if the input is not an even multiple of
+ * the scheduling grain size.
+ *
+ * \par
+ * Before invoking a child grid, a parent thread will typically construct an
+ * instance of GridEvenShare.  The instance can be passed to child thread blocks
+ * which can initialize their per-thread block offsets using \p BlockInit().
+ */
+template <typename OffsetT>
+struct GridEvenShare
+{
+private:
+
+    OffsetT     total_tiles;
+    int         big_shares;
+    OffsetT     big_share_items;
+    OffsetT     normal_share_items;
+    OffsetT     normal_base_offset;
+
+public:
+
+    /// Total number of input items
+    OffsetT     num_items;
+
+    /// Grid size in thread blocks
+    int         grid_size;
+
+    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
+    OffsetT     block_offset;
+
+    /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
+    OffsetT     block_end;
+
+    /// Stride between input tiles
+    OffsetT     block_stride;
+
+
+    /**
+     * \brief Constructor.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare() :
+        total_tiles(0),
+        big_shares(0),
+        big_share_items(0),
+        normal_share_items(0),
+        normal_base_offset(0),
+        num_items(0),
+        grid_size(0),
+        block_offset(0),
+        block_end(0),
+        block_stride(0)
+    {}
+
+
+    /**
+     * \brief Dispatch initializer. To be called prior prior to kernel launch.
+     */
+    __host__ __device__ __forceinline__ void DispatchInit(
+        OffsetT num_items,          ///< Total number of input items
+        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
+        int     tile_items)         ///< Number of data items per input tile
+    {
+        this->block_offset          = num_items;    // Initialize past-the-end
+        this->block_end             = num_items;    // Initialize past-the-end
+        this->num_items             = num_items;
+        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
+        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
+        OffsetT avg_tiles_per_block = total_tiles / grid_size;
+        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
+        this->normal_share_items    = avg_tiles_per_block * tile_items;
+        this->normal_base_offset    = big_shares * tile_items;
+        this->big_share_items       = normal_share_items + tile_items;
+    }
+
+
+    /**
+     * \brief Initializes ranges for the specified thread block index.  Specialized
+     * for a "raking" access pattern in which each thread block is assigned a
+     * consecutive sequence of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
+    {
+        block_stride = TILE_ITEMS;
+        if (block_id < big_shares)
+        {
+            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
+            block_offset = (block_id * big_share_items);
+            block_end = block_offset + big_share_items;
+        }
+        else if (block_id < total_tiles)
+        {
+            // This thread block gets a normal share of grains (avg_tiles_per_block)
+            block_offset = normal_base_offset + (block_id * normal_share_items);
+            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
+        }
+        // Else default past-the-end
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for a "raking" access
+     * pattern in which each thread block is assigned a consecutive sequence
+     * of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
+    {
+        block_stride = grid_size * TILE_ITEMS;
+        block_offset = (block_id * TILE_ITEMS);
+        block_end = num_items;
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for "strip mining" access
+     * pattern in which the input tiles assigned to each thread block are
+     * separated by a stride equal to the the extent of the grid.
+     */
+    template <
+        int TILE_ITEMS,
+        GridMappingStrategy STRATEGY>
+    __device__ __forceinline__ void BlockInit()
+    {
+        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for a "raking" access
+     * pattern in which each thread block is assigned a consecutive sequence
+     * of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        this->block_offset = block_offset;
+        this->block_end = block_end;
+        this->block_stride = TILE_ITEMS;
+    }
+
+
+};
+
+
+
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/grid/grid_mapping.cuh b/thrust/dependencies/cub/cub/grid/grid_mapping.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..889a94c96ea3d75c7c519034eabc53b3e04db11f
--- /dev/null
+++ b/thrust/dependencies/cub/cub/grid/grid_mapping.cuh
@@ -0,0 +1,113 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Mapping policies
+ *****************************************************************************/
+
+
+/**
+ * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+enum GridMappingStrategy
+{
+    /**
+     * \brief An a "raking" access pattern in which each thread block is
+     * assigned a consecutive sequence of input tiles
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p segments, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each segment is comprised of
+     * consecutive tiles, where a tile is a small, constant-sized unit of input
+     * to be processed to completion before the thread block terminates or
+     * obtains more work.  The kernel invokes \p p thread blocks, each
+     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
+     * in tile-size increments.
+     */
+    GRID_MAPPING_RAKE,
+
+    /**
+     * \brief An a "strip mining" access pattern in which the input tiles assigned
+     * to each thread block are separated by a stride equal to the the extent of
+     * the grid.
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p sets, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each set is comprised of
+     * data tiles separated by stride \p tiles, where a tile is a small,
+     * constant-sized unit of input to be processed to completion before the
+     * thread block terminates or obtains more work.  The kernel invokes \p p
+     * thread blocks, each of which iteratively consumes a segment of
+     * <em>n</em>/<em>p</em> elements in tile-size increments.
+     */
+    GRID_MAPPING_STRIP_MINE,
+
+    /**
+     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
+     *
+     * \par Overview
+     * The input is treated as a queue to be dynamically consumed by a grid of
+     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
+     * unit of input to be processed to completion before the thread block
+     * terminates or obtains more work.  The grid size \p p is constant,
+     * loosely corresponding to the number of thread blocks that may actively
+     * reside on the target device.
+     */
+    GRID_MAPPING_DYNAMIC,
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/grid/grid_queue.cuh b/thrust/dependencies/cub/cub/grid/grid_queue.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b5f676b03c09256bf32cc8e6ae56c31d5d11a7a
--- /dev/null
+++ b/thrust/dependencies/cub/cub/grid/grid_queue.cuh
@@ -0,0 +1,244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provides abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size".  The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items. Threads can safely drain until the array's logical fill-size is
+ * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
+ * will be filling.  (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
+ *
+ * \tparam OffsetT Signed integer type for global offsets
+ */
+template <typename OffsetT>
+class GridQueue
+{
+private:
+
+    /// Counter indices
+    enum
+    {
+        FILL    = 0,
+        DRAIN   = 1,
+    };
+
+    /// Pair of counters
+    OffsetT *d_counters;
+
+public:
+
+    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
+    __host__ __device__ __forceinline__
+    static size_t AllocationSize()
+    {
+        return sizeof(OffsetT) * 2;
+    }
+
+
+    /// Constructs an invalid GridQueue descriptor
+    __host__ __device__ __forceinline__ GridQueue()
+    :
+        d_counters(NULL)
+    {}
+
+
+    /// Constructs a GridQueue descriptor around the device storage allocation
+    __host__ __device__ __forceinline__ GridQueue(
+        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
+    :
+        d_counters((OffsetT*) d_storage)
+    {}
+
+
+    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
+        OffsetT fill_size,
+        cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = fill_size;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                OffsetT counters[2];
+                counters[FILL] = fill_size;
+                counters[DRAIN] = 0;
+                result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
+    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// Returns the fill-size established by the parent or by the previous kernel.
+    __host__ __device__ __forceinline__ cudaError_t FillSize(
+        OffsetT &fill_size,
+        cudaStream_t stream = 0)
+    {
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                fill_size = d_counters[FILL];
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
+            #endif
+        }
+        return result;
+    }
+
+
+    /// Drain \p num_items from the queue.  Returns offset from which to read items.  To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + DRAIN, num_items);
+    }
+
+
+    /// Fill \p num_items into the queue.  Returns offset from which to write items.    To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + FILL, num_items);
+    }
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Reset grid queue (call with 1 block of 1 thread)
+ */
+template <typename OffsetT>
+__global__ void FillAndResetDrainKernel(
+    GridQueue<OffsetT>   grid_queue,
+    OffsetT              num_items)
+{
+    grid_queue.FillAndResetDrain(num_items);
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/thrust/dependencies/cub/cub/host/mutex.cuh b/thrust/dependencies/cub/cub/host/mutex.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9880dee57ca8d0e0ef878e86a22b88e8f53767af
--- /dev/null
+++ b/thrust/dependencies/cub/cub/host/mutex.cuh
@@ -0,0 +1,167 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple portable mutex
+ */
+
+#include "../util_cpp_dialect.cuh"
+
+#pragma once
+
+#if CUB_CPP_DIALECT >= 2011
+    #include <mutex>
+#else
+    #if defined(_WIN32) || defined(_WIN64)
+        #include <intrin.h>
+
+        #define WIN32_LEAN_AND_MEAN
+        #define NOMINMAX
+        #include <windows.h>
+        #undef WIN32_LEAN_AND_MEAN
+        #undef NOMINMAX
+
+        /**
+         * Compiler read/write barrier
+         */
+        #pragma intrinsic(_ReadWriteBarrier)
+
+    #endif
+#endif
+
+#include "../config.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Simple portable mutex
+ *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
+ *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
+ */
+struct Mutex
+{
+#if CUB_CPP_DIALECT >= 2011
+
+    std::mutex mtx;
+
+    void Lock()
+    {
+        mtx.lock();
+    }
+
+    void Unlock()
+    {
+        mtx.unlock();
+    }
+
+#else       // C++11
+
+    #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+
+        // Microsoft VC++
+        typedef long Spinlock;
+
+    #else
+
+        // GNU g++
+        typedef int Spinlock;
+
+        /**
+         * Compiler read/write barrier
+         */
+        __forceinline__ void _ReadWriteBarrier()
+        {
+            __sync_synchronize();
+        }
+
+        /**
+         * Atomic exchange
+         */
+        __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
+        {
+            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
+            _ReadWriteBarrier();
+            return __sync_lock_test_and_set(Target, Value);
+        }
+
+        /**
+         * Pause instruction to prevent excess processor bus usage
+         */
+        __forceinline__ void YieldProcessor()
+        {
+        }
+
+    #endif  // MSVC
+
+        /// Lock member
+        volatile Spinlock lock;
+
+        /**
+         * Constructor
+         */
+        Mutex() : lock(0) {}
+
+        /**
+         * Return when the specified spinlock has been acquired
+         */
+        __forceinline__ void Lock()
+        {
+            while (1)
+            {
+                if (!_InterlockedExchange(&lock, 1)) return;
+                while (lock) YieldProcessor();
+            }
+        }
+
+
+        /**
+         * Release the specified spinlock
+         */
+        __forceinline__ void Unlock()
+        {
+            _ReadWriteBarrier();
+            lock = 0;
+        }
+
+#endif      // C++11
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/thrust/dependencies/cub/cub/iterator/arg_index_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/arg_index_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f16fab8c26eb489979ff9e9fc9278d923637f9a9
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/arg_index_input_iterator.cuh
@@ -0,0 +1,259 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+
+#include <thrust/version.h>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
+ *
+ * \par Overview
+ * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT.
+ *   Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose
+ *   \p key field is \p i and whose \p value field is <tt>itr[i]</tt>.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto
+ * dereference an array of doubles
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::ArgIndexInputIterator<double*> itr(d_in);
+ *
+ * // Within device code:
+ * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
+ * Tuple item_offset_pair.key = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 8.0 @ 0
+ *
+ * itr = itr + 6;
+ * item_offset_pair.key = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 9.0 @ 6
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT       The value type of the wrapped input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ * \tparam OutputValueT         The paired value type of the <offset,value> tuple (Default: value type of input iterator)
+ */
+template <
+    typename    InputIteratorT,
+    typename    OffsetT             = ptrdiff_t,
+    typename    OutputValueT        = typename std::iterator_traits<InputIteratorT>::value_type>
+class ArgIndexInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ArgIndexInputIterator                       self_type;              ///< My own type
+    typedef OffsetT                                     difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef KeyValuePair<difference_type, OutputValueT> value_type;             ///< The type of the element the iterator can point to
+    typedef value_type*                                 pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef value_type                                  reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    InputIteratorT  itr;
+    difference_type offset;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArgIndexInputIterator(
+        InputIteratorT  itr,            ///< Input iterator to wrap
+        difference_type offset = 0)     ///< OffsetT (in items) from \p itr denoting the position of the iterator
+    :
+        itr(itr),
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        value_type retval;
+        retval.value = itr[offset];
+        retval.key = offset;
+        return retval;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(itr, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(itr, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((itr == rhs.itr) && (offset == rhs.offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((itr != rhs.itr) || (offset != rhs.offset));
+    }
+
+    /// Normalize
+    __host__ __device__ __forceinline__ void normalize()
+    {
+        itr += offset;
+        offset = 0;
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/cache_modified_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/cache_modified_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7a41a5d31a35685f40ea8a199e6b9c224c4850fd
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/cache_modified_input_iterator.cuh
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
+ *
+ * \par Overview
+ * - CacheModifiedInputIterator is a random-access input iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by reading \p ValueType values through loads modified by \p MODIFIER.
+ * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
+ *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedInputIterator to
+ * dereference a device array of double using the "ldg" PTX load modifier
+ * (i.e., load values through texture cache).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 8.0
+ * printf("%f\n", itr[1]);  // 6.0
+ * printf("%f\n", itr[6]);  // 9.0
+ *
+ * \endcode
+ *
+ * \tparam CacheLoadModifier    The cub::CacheLoadModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedInputIterator          self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+
+public:
+
+    /// Wrapped native pointer
+    ValueType* ptr;
+
+    /// Constructor
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection
+    __device__ __forceinline__ reference operator*() const
+    {
+        return ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return ThreadLoad<MODIFIER>(ptr + n);
+    }
+
+    /// Structure dereference
+    __device__ __forceinline__ pointer operator->()
+    {
+        return &ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/cache_modified_output_iterator.cuh b/thrust/dependencies/cub/cub/iterator/cache_modified_output_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e1697013c4ccdccea10683a17954932238e4b1b5
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/cache_modified_output_iterator.cuh
@@ -0,0 +1,254 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
+ *
+ * \par Overview
+ * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by writing \p ValueType values through stores modified by \p MODIFIER.
+ * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
+ *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
+ * dereference a device array of doubles using the "wt" PTX load modifier
+ * (i.e., write-through to system memory).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_out;              // e.g., [, , , , , , ]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
+ *
+ * // Within device code:
+ * itr[0]  = 8.0;
+ * itr[1]  = 66.0;
+ * itr[55] = 24.0;
+ *
+ * \endcode
+ *
+ * \par Usage Considerations
+ * - Can only be dereferenced within device code
+ *
+ * \tparam CacheStoreModifier     The cub::CacheStoreModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedOutputIterator
+{
+private:
+
+    // Proxy object
+    struct Reference
+    {
+        ValueType* ptr;
+
+        /// Constructor
+        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
+
+        /// Assignment
+        __device__ __forceinline__ ValueType operator =(ValueType val)
+        {
+            ThreadStore<MODIFIER>(ptr, val);
+            return val;
+        }
+    };
+
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                                value_type;             ///< The type of the element the iterator can point to
+    typedef void                                pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType* ptr;
+
+public:
+
+    /// Constructor
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return Reference(ptr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return Reference(ptr + n);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/constant_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/constant_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..44fb56c920b79afb470a222ffbb37de32115785e
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/constant_input_iterator.cuh
@@ -0,0 +1,235 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of homogeneous values
+ *
+ * \par Overview
+ * - Read references to a ConstantInputIteratorTiterator always return the supplied constant
+ *   of type \p ValueType.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ConstantInputIteratorTto
+ * dereference a sequence of homogeneous doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
+ *
+ * cub::ConstantInputIterator<double> itr(5.0);
+ *
+ * printf("%f\n", itr[0]);      // 5.0
+ * printf("%f\n", itr[1]);      // 5.0
+ * printf("%f\n", itr[2]);      // 5.0
+ * printf("%f\n", itr[50]);     // 5.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class ConstantInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ConstantInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType   val;
+    OffsetT     offset;
+#ifdef _WIN32
+    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ConstantInputIterator(
+        ValueType   val,            ///< Starting value for the iterator instance to report
+        OffsetT     offset = 0)     ///< Base offset
+    :
+        val(val),
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
+    {
+        return val;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (offset == rhs.offset) && ((val == rhs.val));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (offset != rhs.offset) || (val!= rhs.val);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "," << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/counting_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/counting_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c7167a70666bbd3c5ae195b4c80aaea1547a6c9a
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/counting_input_iterator.cuh
@@ -0,0 +1,228 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
+ *
+ * \par Overview
+ * - After initializing a CountingInputIteratorTto a certain integer \p base, read references
+ *   at \p offset will return the value \p base + \p offset.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CountingInputIteratorTto
+ * dereference a sequence of incrementing integers.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
+ *
+ * cub::CountingInputIterator<int> itr(5);
+ *
+ * printf("%d\n", itr[0]);      // 5
+ * printf("%d\n", itr[1]);      // 6
+ * printf("%d\n", itr[2]);      // 7
+ * printf("%d\n", itr[50]);     // 55
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class CountingInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CountingInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType val;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ CountingInputIterator(
+        const ValueType &val)          ///< Starting value for the iterator instance to report
+    :
+        val(val)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        val++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        val++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val + (ValueType) n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        val += (ValueType) n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val - (ValueType) n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        val -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return (difference_type) (val - other.val);
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return val + (ValueType) n;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (val == rhs.val);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (val != rhs.val);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "]";
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/discard_output_iterator.cuh b/thrust/dependencies/cub/cub/iterator/discard_output_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e665c784e9ee07b6da7ce61fc8151451edf41b09
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/discard_output_iterator.cuh
@@ -0,0 +1,219 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A discard iterator
+ */
+template <typename OffsetT = ptrdiff_t>
+class DiscardOutputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef DiscardOutputIterator   self_type;              ///< My own type
+    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                    value_type;             ///< The type of the element the iterator can point to
+    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    OffsetT offset;
+
+#if defined(_WIN32) || !defined(_WIN64)
+    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
+    OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ DiscardOutputIterator(
+        OffsetT offset = 0)     ///< Base offset
+    :
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ self_type& operator*()
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator[](Distance n)
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return;
+    }
+
+    /// Assignment to self (no-op)
+    __host__ __device__ __forceinline__ void operator=(self_type const& other)
+    {
+        offset = other.offset;
+    }
+
+    /// Assignment to anything else (no-op)
+    template<typename T>
+    __host__ __device__ __forceinline__ void operator=(T const&)
+    {}
+
+    /// Cast to void* operator
+    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (offset == rhs.offset);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (offset != rhs.offset);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/tex_obj_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/tex_obj_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2bd3a607e1bd718d3507bdde4988d122729dba6b
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/tex_obj_input_iterator.cuh
@@ -0,0 +1,318 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../config.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
+ *
+ * \par Overview
+ * - TexObjInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be
+ *   created by the host thread, but can be used by any descendant kernel.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexRefInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexObjInputIterator<double> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    typename    OffsetT = ptrdiff_t>
+class TexObjInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexObjInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    // Largest texture word we can use in device
+    typedef typename UnitWord<T>::TextureWord TextureWord;
+
+    // Number of texture words per T
+    enum {
+        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+    };
+
+private:
+
+    T*                  ptr;
+    difference_type     tex_offset;
+    cudaTextureObject_t tex_obj;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TexObjInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0),
+        tex_obj(0)
+    {}
+
+    /// Use this iterator to bind \p ptr with a texture reference
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes = size_t(-1),         ///< Number of bytes in the range
+        size_t          tex_offset = 0)     ///< OffsetT (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        this->tex_offset = tex_offset;
+
+        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
+        cudaResourceDesc        res_desc;
+        cudaTextureDesc         tex_desc;
+        memset(&res_desc, 0, sizeof(cudaResourceDesc));
+        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+        res_desc.resType                = cudaResourceTypeLinear;
+        res_desc.res.linear.devPtr      = this->ptr;
+        res_desc.res.linear.desc        = channel_desc;
+        res_desc.res.linear.sizeInBytes = bytes;
+        tex_desc.readMode               = cudaReadModeElementType;
+        return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return CubDebug(cudaDestroyTextureObject(tex_obj));
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                // Simply dereference the pointer on the host
+                return ptr[tex_offset];
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Move array of uninitialized words, then alias and assign to return value
+                TextureWord words[TEXTURE_MULTIPLE];
+
+                #pragma unroll
+                for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+                {
+                    words[i] = tex1Dfetch<TextureWord>(
+                        tex_obj,
+                        (tex_offset * TEXTURE_MULTIPLE) + i);
+                }
+
+                // Load from words
+                return *reinterpret_cast<T*>(words);
+            #else
+                // This is dead code which will never be executed.  It is here
+                // only to avoid warnings about missing return statements.
+                return ptr[tex_offset];
+            #endif
+        }
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/iterator/tex_ref_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/tex_ref_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..630882724fc508a8ad6590376b3fa56ed11def6a
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/tex_ref_input_iterator.cuh
@@ -0,0 +1,379 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../config.cuh"
+
+#if (CUDART_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
+
+#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Static file-scope Tesla/Fermi-style texture references
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Anonymous namespace
+namespace {
+
+/// Global texture reference specialized by type
+template <typename T>
+struct IteratorTexRef
+{
+    /// And by unique ID
+    template <int UNIQUE_ID>
+    struct TexId
+    {
+        // Largest texture word we can use in device
+        typedef typename UnitWord<T>::DeviceWord DeviceWord;
+        typedef typename UnitWord<T>::TextureWord TextureWord;
+
+        // Number of texture words per T
+        enum {
+            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
+            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+        };
+
+        // Texture reference type
+        typedef texture<TextureWord> TexRef;
+
+        // Texture reference
+        static TexRef ref;
+
+        /// Bind texture
+        static cudaError_t BindTexture(void *d_in, size_t &offset)
+        {
+            if (d_in)
+            {
+                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
+                ref.channelDesc = tex_desc;
+                return (CubDebug(cudaBindTexture(&offset, ref, d_in)));
+            }
+
+            return cudaSuccess;
+        }
+
+        /// Unbind texture
+        static cudaError_t UnbindTexture()
+        {
+            return CubDebug(cudaUnbindTexture(ref));
+        }
+
+        /// Fetch element
+        template <typename Distance>
+        static __device__ __forceinline__ T Fetch(Distance tex_offset)
+        {
+            DeviceWord temp[DEVICE_MULTIPLE];
+            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
+
+            #pragma unroll
+            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+            {
+                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
+            }
+
+            return reinterpret_cast<T&>(temp);
+        }
+    };
+};
+
+// Texture reference definitions
+template <typename  T>
+template <int       UNIQUE_ID>
+typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
+
+
+} // Anonymous namespace
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
+ *
+ * \par Overview
+ * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
+ *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
+ *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
+ *   thread, and (4) compilation .o unit.
+ * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
+ *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
+ *   from the host).
+ * - Compatible with Thrust API v1.7 or newer.
+ * - Compatible with CUDA toolkit v5.5 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexRefInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexRefInputIterator<double, __LINE__> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    int         UNIQUE_ID,
+    typename    OffsetT = ptrdiff_t>
+class TexRefInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexRefInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    T*              ptr;
+    difference_type tex_offset;
+
+    // Texture reference wrapper (old Tesla/Fermi-style textures)
+    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
+
+public:
+/*
+    /// Constructor
+    __host__ __device__ __forceinline__ TexRefInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0)
+    {}
+*/
+    /// Use this iterator to bind \p ptr with a texture reference
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          /*bytes*/ = size_t(-1), ///< Number of bytes in the range
+        size_t          tex_offset = 0)         ///< OffsetT (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        size_t offset;
+        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset);
+        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
+        return retval;
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return TexId::UnbindTexture();
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        if (CUB_IS_HOST_CODE) {
+            // Simply dereference the pointer on the host
+            return ptr[tex_offset];
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Use the texture reference
+                return TexId::Fetch(tex_offset);
+            #else
+                // This is dead code that will never be executed.  It is here
+                // only to avoid warnings about missing returns.
+                return ptr[tex_offset];
+            #endif
+        }
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+#endif // CUDART_VERSION
diff --git a/thrust/dependencies/cub/cub/iterator/transform_input_iterator.cuh b/thrust/dependencies/cub/cub/iterator/transform_input_iterator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bce8b817d731fac5549b10aa791fdaf93170a193
--- /dev/null
+++ b/thrust/dependencies/cub/cub/iterator/transform_input_iterator.cuh
@@ -0,0 +1,252 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../config.cuh"
+#include "../util_device.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for transforming dereferenced values.
+ *
+ * \par Overview
+ * - TransformInputIteratorTwraps a unary conversion functor of type \p
+ *   ConversionOp and a random-access input iterator of type <tt>InputIteratorT</tt>,
+ *   using the former to produce references of type \p ValueType from the latter.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TransformInputIteratorTto
+ * dereference an array of integers, tripling the values and converting them to doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
+ *
+ * // Functor for tripling integer values and converting to doubles
+ * struct TripleDoubler
+ * {
+ *     __host__ __device__ __forceinline__
+ *     double operator()(const int &a) const {
+ *         return double(a * 3);
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize a device array
+ * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * TripleDoubler conversion_op;
+ *
+ * // Create an iterator wrapper
+ * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 24.0
+ * printf("%f\n", itr[1]);  // 18.0
+ * printf("%f\n", itr[6]);  // 27.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
+ * \tparam InputIteratorT       The type of the wrapped input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ *
+ */
+template <
+    typename ValueType,
+    typename ConversionOp,
+    typename InputIteratorT,
+    typename OffsetT = ptrdiff_t>
+class TransformInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TransformInputIterator              self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ConversionOp    conversion_op;
+    InputIteratorT  input_itr;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TransformInputIterator(
+        InputIteratorT      input_itr,          ///< Input iterator to wrap
+        ConversionOp        conversion_op)      ///< Conversion functor to wrap
+    :
+        conversion_op(conversion_op),
+        input_itr(input_itr)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        input_itr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        input_itr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return conversion_op(*input_itr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(input_itr + n, conversion_op);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        input_itr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(input_itr - n, conversion_op);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        input_itr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return input_itr - other.input_itr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return conversion_op(input_itr[n]);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &conversion_op(*input_itr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (input_itr == rhs.input_itr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (input_itr != rhs.input_itr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_load.cuh b/thrust/dependencies/cub/cub/thread/thread_load.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..31e759602feccfed35b6d613bbcde3683e5fd271
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_load.cuh
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for reading memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory load operations.
+ */
+enum CacheLoadModifier
+{
+    LOAD_DEFAULT,       ///< Default (no modifier)
+    LOAD_CA,            ///< Cache at all levels
+    LOAD_CG,            ///< Cache at global level
+    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
+    LOAD_CV,            ///< Cache as volatile (including cached system lines)
+    LOAD_LDG,           ///< Cache as texture
+    LOAD_VOLATILE,      ///< Volatile (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
+ *
+ * // 32-bit load using cache-global modifier:
+ * int *d_in;
+ * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
+ *
+ * // 16-bit load using default modifier
+ * short *d_in;
+ * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
+ *
+ * // 256-bit load using cache-volatile modifier
+ * double4 *d_in;
+ * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
+ *
+ * // 96-bit load using cache-streaming modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
+ * \endcode
+ *
+ * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
+ * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated load iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadLoad
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
+    {
+        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
+        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
+    }
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
+    {
+        vals[COUNT] = itr[COUNT];
+        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
+    }
+};
+
+
+/// Helper structure for templated load iteration (termination case)
+template <int MAX>
+struct IterateThreadLoad<MAX, MAX>
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
+    {                                                                                       \
+        uint4 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y),                                                                 \
+            "=r"(retval.z),                                                                 \
+            "=r"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
+    {                                                                                       \
+        ulonglong2 retval;                                                                  \
+        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
+            "=l"(retval.x),                                                                 \
+            "=l"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
+    {                                                                                       \
+        ushort4 retval;                                                                     \
+        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
+            "=h"(retval.x),                                                                 \
+            "=h"(retval.y),                                                                 \
+            "=h"(retval.z),                                                                 \
+            "=h"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
+    {                                                                                       \
+        uint2 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
+    {                                                                                       \
+        unsigned long long retval;                                                          \
+        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
+            "=l"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
+    {                                                                                       \
+        unsigned int retval;                                                                \
+        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
+            "=r"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
+        "    cvt.u16.u8 %0, datum;"                                                         \
+        "}" :                                                                               \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return (unsigned char) retval;                                                      \
+    }
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
+ */
+#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
+    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
+    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
+ */
+#if CUB_PTX_ARCH >= 200
+    _CUB_LOAD_ALL(LOAD_CA, ca)
+    _CUB_LOAD_ALL(LOAD_CG, cg)
+    _CUB_LOAD_ALL(LOAD_CS, cs)
+    _CUB_LOAD_ALL(LOAD_CV, cv)
+#else
+    _CUB_LOAD_ALL(LOAD_CA, global)
+    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
+    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
+    _CUB_LOAD_ALL(LOAD_CS, global)
+    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
+#endif
+
+#if CUB_PTX_ARCH >= 350
+    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
+#else
+    _CUB_LOAD_ALL(LOAD_LDG, global)
+#endif
+
+
+// Macro cleanup
+#undef _CUB_LOAD_ALL
+#undef _CUB_LOAD_1
+#undef _CUB_LOAD_2
+#undef _CUB_LOAD_4
+#undef _CUB_LOAD_8
+#undef _CUB_LOAD_16
+
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
+ */
+template <typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
+    InputIteratorT          itr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<false>         /*is_pointer*/)
+{
+    return *itr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    return *ptr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<true>          /*is_primitive*/)
+{
+    T retval = *reinterpret_cast<volatile T*>(ptr);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<false>         /*is_primitive*/)
+{
+    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
+
+    T retval;
+    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
+    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_VOLATILE> /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers on pointer types
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ T ThreadLoad(
+    T const                 *ptr,
+    Int2Type<MODIFIER>      /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
+
+    DeviceWord words[DEVICE_MULTIPLE];
+
+    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
+        words);
+
+    return *reinterpret_cast<T*>(words);
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoad(
+        itr,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<InputIteratorT>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_operators.cuh b/thrust/dependencies/cub/cub/thread/thread_operators.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6a3192bca3ac0e1526830847eb0ead3c67ebe730
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_operators.cuh
@@ -0,0 +1,316 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple binary operator functor types
+ */
+
+/******************************************************************************
+ * Simple functor operators
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \brief Default equality functor
+ */
+struct Equality
+{
+    /// Boolean equality operator, returns <tt>(a == b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a == b;
+    }
+};
+
+
+/**
+ * \brief Default inequality functor
+ */
+struct Inequality
+{
+    /// Boolean inequality operator, returns <tt>(a != b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a != b;
+    }
+};
+
+
+/**
+ * \brief Inequality functor (wraps equality functor)
+ */
+template <typename EqualityOp>
+struct InequalityWrapper
+{
+    /// Wrapped equality operator
+    EqualityOp op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    InequalityWrapper(EqualityOp op) : op(op) {}
+
+    /// Boolean inequality operator, returns <tt>(a != b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
+    {
+        return !op(a, b);
+    }
+};
+
+
+/**
+ * \brief Default sum functor
+ */
+struct Sum
+{
+    /// Boolean sum operator, returns <tt>a + b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return a + b;
+    }
+};
+
+
+/**
+ * \brief Default max functor
+ */
+struct Max
+{
+    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
+ */
+struct ArgMax
+{
+    /// Boolean max operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default min functor
+ */
+struct Min
+{
+    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+ */
+struct ArgMin
+{
+    /// Boolean min operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default cast functor
+ */
+template <typename B>
+struct CastOp
+{
+    /// Cast operator, returns <tt>(B) a</tt>
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+/**
+ * \brief Binary operator wrapper for switching non-commutative scan arguments
+ */
+template <typename ScanOp>
+class SwizzleScanOp
+{
+private:
+
+    /// Wrapped scan operator
+    ScanOp scan_op;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+    /// Switch the scan arguments
+    template <typename T>
+    __host__ __device__ __forceinline__
+    T operator()(const T &a, const T &b)
+    {
+      T _a(a);
+      T _b(b);
+
+      return scan_op(_b, _a);
+    }
+};
+
+
+/**
+ * \brief Reduce-by-segment functor.
+ *
+ * Given two cub::KeyValuePair inputs \p a and \p b and a
+ * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
+ * an instance of this functor returns a cub::KeyValuePair whose \p key
+ * field is <tt>a.key</tt> + <tt>b.key</tt>, and whose \p value field
+ * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
+ *
+ * ReduceBySegmentOp is an associative, non-commutative binary combining operator
+ * for input sequences of cub::KeyValuePair pairings.  Such
+ * sequences are typically used to represent a segmented set of values to be reduced
+ * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
+ * first value of each segment.
+ *
+ */
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceBySegmentOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePairT>       ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,         ///< First partial reduction
+        const KeyValuePairT &second)        ///< Second partial reduction
+    {
+        KeyValuePairT retval;
+        retval.key = first.key + second.key;
+        retval.value = (second.key) ?
+                second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
+                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+        return retval;
+    }
+};
+
+
+
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceByKeyOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceByKeyOp() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePairT>
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,       ///< First partial reduction
+        const KeyValuePairT &second)      ///< Second partial reduction
+    {
+        KeyValuePairT retval = second;
+
+        if (first.key == second.key)
+            retval.value = op(first.value, retval.value);
+
+        return retval;
+    }
+};
+
+
+
+
+
+
+
+/** @} */       // end group UtilModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_reduce.cuh b/thrust/dependencies/cub/cub/thread/thread_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..41063f971471a8cce76c951b013e3fd6a261299d
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_reduce.cuh
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential reduction over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+/**
+ * Sequential reduction over statically-sized array types
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*                  input,                  ///< [in] Input array
+    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
+    T                   prefix,                 ///< [in] Prefix to seed reduction with
+    Int2Type<LENGTH>    /*length*/)
+{
+    T retval = prefix;
+
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+        retval = reduction_op(retval, input[i]);
+
+    return retval;
+}
+
+
+/**
+ * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+}
+
+
+/**
+ * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    T prefix = input[0];
+    return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
+}
+
+
+/**
+ * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
+}
+
+
+/**
+ * \brief Serial reduction with the specified operator
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
+ * \tparam T          <b>[inferred]</b> The data type to be reduced.
+ * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    return ThreadReduce<LENGTH>((T*) input, reduction_op);
+}
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_scan.cuh b/thrust/dependencies/cub/cub/thread/thread_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fd907fcae104aa0bcfd84f24b1583b6143e40dcd
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_scan.cuh
@@ -0,0 +1,268 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential prefix scan over statically-sized array types
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../thread/thread_operators.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \name Sequential prefix scan over statically-sized array types
+ * @{
+ */
+
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T                   inclusive,
+    T                   exclusive,
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)
+{
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(exclusive, input[i]);
+        output[i] = exclusive;
+        exclusive = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+
+/**
+ * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    T inclusive = input[0];
+    if (apply_prefix)
+    {
+        inclusive = scan_op(prefix, inclusive);
+    }
+    output[0] = prefix;
+    T exclusive = inclusive;
+
+    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+}
+
+
+
+
+
+
+
+
+
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T                   inclusive,
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)
+{
+    #pragma unroll
+    for (int i = 0; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(inclusive, input[i]);
+        output[i] = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    T inclusive = input[0];
+    output[0] = inclusive;
+
+    // Continue scan
+    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    T inclusive = input[0];
+    if (apply_prefix)
+    {
+        inclusive = scan_op(prefix, inclusive);
+    }
+    output[0] = inclusive;
+
+    // Continue scan
+    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+}
+
+
+//@}  end member group
+
+/** @} */       // end group UtilModule
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_search.cuh b/thrust/dependencies/cub/cub/thread/thread_search.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..96b9e65a56eafb4381a51cac139e47b515ed70df
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_search.cuh
@@ -0,0 +1,156 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential search
+ */
+
+#pragma once
+
+#include <iterator>
+#include "../util_namespace.cuh"
+#include "../config.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Computes the begin offsets into A and B for the specific diagonal
+ */
+template <
+    typename AIteratorT,
+    typename BIteratorT,
+    typename OffsetT,
+    typename CoordinateT>
+__host__ __device__ __forceinline__ void MergePathSearch(
+    OffsetT         diagonal,
+    AIteratorT      a,
+    BIteratorT      b,
+    OffsetT         a_len,
+    OffsetT         b_len,
+    CoordinateT&    path_coordinate)
+{
+    /// The value type of the input iterator
+    typedef typename std::iterator_traits<AIteratorT>::value_type T;
+
+    OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
+    OffsetT split_max = CUB_MIN(diagonal, a_len);
+
+    while (split_min < split_max)
+    {
+        OffsetT split_pivot = (split_min + split_max) >> 1;
+        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
+        {
+            // Move candidate split range up A, down B
+            split_min = split_pivot + 1;
+        }
+        else
+        {
+            // Move candidate split range up B, down A
+            split_max = split_pivot;
+        }
+    }
+
+    path_coordinate.x = CUB_MIN(split_min, a_len);
+    path_coordinate.y = diagonal - split_min;
+}
+
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which does not compare less than \p val
+ */
+template <
+    typename InputIteratorT,
+    typename OffsetT,
+    typename T>
+__device__ __forceinline__ OffsetT LowerBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    OffsetT retval = 0;
+    while (num_items > 0)
+    {
+        OffsetT half = num_items >> 1;
+        if (input[retval + half] < val)
+        {
+            retval = retval + (half + 1);
+            num_items = num_items - (half + 1);
+        }
+        else
+        {
+            num_items = half;
+        }
+    }
+
+    return retval;
+}
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which compares greater than \p val
+ */
+template <
+    typename InputIteratorT,
+    typename OffsetT,
+    typename T>
+__device__ __forceinline__ OffsetT UpperBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    OffsetT retval = 0;
+    while (num_items > 0)
+    {
+        OffsetT half = num_items >> 1;
+        if (val < input[retval + half])
+        {
+            num_items = half;
+        }
+        else
+        {
+            retval = retval + (half + 1);
+            num_items = num_items - (half + 1);
+        }
+    }
+
+    return retval;
+}
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/thread/thread_store.cuh b/thrust/dependencies/cub/cub/thread/thread_store.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..47d6c6145fecfd51ebc762a8fdbb100643af9fc5
--- /dev/null
+++ b/thrust/dependencies/cub/cub/thread/thread_store.cuh
@@ -0,0 +1,420 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for writing memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory store operations.
+ */
+enum CacheStoreModifier
+{
+    STORE_DEFAULT,              ///< Default (no modifier)
+    STORE_WB,                   ///< Cache write-back all coherent levels
+    STORE_CG,                   ///< Cache at global level
+    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
+    STORE_WT,                   ///< Cache write-through (to system memory)
+    STORE_VOLATILE,             ///< Volatile shared (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
+ *
+ * // 32-bit store using cache-global modifier:
+ * int *d_out;
+ * int val;
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ *
+ * // 16-bit store using default modifier
+ * short *d_out;
+ * short val;
+ * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
+ *
+ * // 256-bit store using write-through modifier
+ * double4 *d_out;
+ * double4 val;
+ * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
+ *
+ * // 96-bit store using cache-streaming cache modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val;
+ * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
+ * \endcode
+ *
+ * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
+ * \tparam InputIteratorT       <b>[inferred]</b> Output iterator type \iterator
+ * \tparam T                    <b>[inferred]</b> Data type of output value
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            OutputIteratorT,
+    typename            T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated store iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadStore
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals)
+    {
+        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
+        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
+    }
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
+    {
+        ptr[COUNT] = vals[COUNT];
+        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
+    }
+
+};
+
+/// Helper structure for templated store iteration (termination case)
+template <int MAX>
+struct IterateThreadStore<MAX, MAX>
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y),                                                                     \
+            "r"(val.z),                                                                     \
+            "r"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val.x),                                                                     \
+            "l"(val.y));                                                                    \
+    }
+
+
+/**
+ * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val.x),                                                                     \
+            "h"(val.y),                                                                     \
+            "h"(val.z),                                                                     \
+            "h"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val));                                                                      \
+    }
+
+/**
+ * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val));                                                                      \
+    }
+
+
+/**
+ * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val));                                                                      \
+    }
+
+
+/**
+ * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
+ */
+#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
+    {                                                                                       \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "   cvt.u8.u16 datum, %1;"                                                          \
+        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
+        "}" : :                                                                             \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"((unsigned short) val));                                                               \
+    }
+
+/**
+ * Define powers-of-two ThreadStore specializations for the given Cache load modifier
+ */
+#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
+    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
+    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
+
+
+/**
+ * Define ThreadStore specializations for the various Cache load modifiers
+ */
+#if CUB_PTX_ARCH >= 200
+    _CUB_STORE_ALL(STORE_WB, wb)
+    _CUB_STORE_ALL(STORE_CG, cg)
+    _CUB_STORE_ALL(STORE_CS, cs)
+    _CUB_STORE_ALL(STORE_WT, wt)
+#else
+    _CUB_STORE_ALL(STORE_WB, global)
+    _CUB_STORE_ALL(STORE_CG, global)
+    _CUB_STORE_ALL(STORE_CS, global)
+    _CUB_STORE_ALL(STORE_WT, volatile.global)
+#endif
+
+
+// Macro cleanup
+#undef _CUB_STORE_ALL
+#undef _CUB_STORE_1
+#undef _CUB_STORE_2
+#undef _CUB_STORE_4
+#undef _CUB_STORE_8
+#undef _CUB_STORE_16
+
+
+/**
+ * ThreadStore definition for STORE_DEFAULT modifier on iterator types
+ */
+template <typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(
+    OutputIteratorT             itr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     /*modifier*/,
+    Int2Type<false>             /*is_pointer*/)
+{
+    *itr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_DEFAULT modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_DEFAULT>     /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    *ptr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T                           *ptr,
+    T                           val,
+    Int2Type<true>              /*is_primitive*/)
+{
+    *reinterpret_cast<volatile T*>(ptr) = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T                           *ptr,
+    T                           val,
+    Int2Type<false>             /*is_primitive*/)
+{
+    // Create a temporary using shuffle-words, then store using volatile-words
+    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
+    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
+
+    VolatileWord words[VOLATILE_MULTIPLE];
+
+    #pragma unroll
+    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
+
+    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<STORE_VOLATILE>    /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers on pointer types
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ void ThreadStore(
+    T                           *ptr,
+    T                           val,
+    Int2Type<MODIFIER>          /*modifier*/,
+    Int2Type<true>              /*is_pointer*/)
+{
+    // Create a temporary using shuffle-words, then store using device-words
+    typedef typename UnitWord<T>::DeviceWord    DeviceWord;
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+
+    const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
+    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
+
+    DeviceWord words[DEVICE_MULTIPLE];
+
+    #pragma unroll
+    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
+
+    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(ptr),
+        words);
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers
+ */
+template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
+{
+    ThreadStore(
+        itr,
+        val,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_allocator.cuh b/thrust/dependencies/cub/cub/util_allocator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fa03996f0b7a667f7936e864a604eaacdc0f937f
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_allocator.cuh
@@ -0,0 +1,709 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include <set>
+#include <map>
+
+#include "host/mutex.cuh"
+#include <math.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and stream-safe and is capable of managing cached
+ * device allocations on multiple devices.  It behaves as follows:
+ *
+ * \par
+ * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
+ *   the allocation becomes available immediately for reuse within the \p active_stream
+ *   with which it was associated with during allocation, and it becomes available for
+ *   reuse within other streams when all prior work submitted to \p active_stream has completed.
+ * - Allocations are categorized and cached by bin size.  A new allocation request of
+ *   a given size will only consider cached allocations within the corresponding bin.
+ * - Bin limits progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction.  Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth          = 8
+ * - \p min_bin             = 3
+ * - \p max_bin             = 7
+ * - \p max_cached_bytes    = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
+ * and sets a maximum of 6,291,455 cached bytes per device
+ *
+ */
+struct CachingDeviceAllocator
+{
+
+    //---------------------------------------------------------------------
+    // Constants
+    //---------------------------------------------------------------------
+
+    /// Out-of-bounds bin
+    static const unsigned int INVALID_BIN = (unsigned int) -1;
+
+    /// Invalid size
+    static const size_t INVALID_SIZE = (size_t) -1;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Invalid device ordinal
+    static const int INVALID_DEVICE_ORDINAL = -1;
+
+    //---------------------------------------------------------------------
+    // Type definitions and helper types
+    //---------------------------------------------------------------------
+
+    /**
+     * Descriptor for device memory allocations
+     */
+    struct BlockDescriptor
+    {
+        void*           d_ptr;              // Device pointer
+        size_t          bytes;              // Size of allocation in bytes
+        unsigned int    bin;                // Bin enumeration
+        int             device;             // device ordinal
+        cudaStream_t    associated_stream;  // Associated associated_stream
+        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
+
+        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
+        BlockDescriptor(void *d_ptr, int device) :
+            d_ptr(d_ptr),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
+        BlockDescriptor(int device) :
+            d_ptr(NULL),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Comparison functor for comparing device pointers
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device == b.device)
+                return (a.d_ptr < b.d_ptr);
+            else
+                return (a.device < b.device);
+        }
+
+        // Comparison functor for comparing allocation sizes
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device == b.device)
+                return (a.bytes < b.bytes);
+            else
+                return (a.device < b.device);
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+    class TotalBytes {
+    public:
+        size_t free;
+        size_t live;
+        TotalBytes() { free = live = 0; }
+    };
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, TotalBytes> GpuCachedBytes;
+
+
+    //---------------------------------------------------------------------
+    // Utility functions
+    //---------------------------------------------------------------------
+
+    /**
+     * Integer pow function for unsigned base and exponent
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int retval = 1;
+        while (exp > 0)
+        {
+            if (exp & 1) {
+                retval = retval * base;        // multiply the result by the current base
+            }
+            base = base * base;                // square the base
+            exp = exp >> 1;                    // divide the exponent in half
+        }
+        return retval;
+    }
+
+
+    /**
+     * Round up to the nearest power-of
+     */
+    void NearestPowerOf(
+        unsigned int    &power,
+        size_t          &rounded_bytes,
+        unsigned int    base,
+        size_t          value)
+    {
+        power = 0;
+        rounded_bytes = 1;
+
+        if (value * base < value)
+        {
+            // Overflow
+            power = sizeof(size_t) * 8;
+            rounded_bytes = size_t(0) - 1;
+            return;
+        }
+
+        while (rounded_bytes < value)
+        {
+            rounded_bytes *= base;
+            power++;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    cub::Mutex      mutex;              /// Mutex for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     */
+    CachingDeviceAllocator(
+        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
+        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
+        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
+        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
+        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
+        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
+    :
+        bin_growth(bin_growth),
+        min_bin(min_bin),
+        max_bin(max_bin),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes(max_cached_bytes),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth          = 8
+     * - \p min_bin             = 3
+     * - \p max_bin             = 7
+     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
+     * sets a maximum of 6,291,455 cached bytes per device
+     */
+    CachingDeviceAllocator(
+        bool skip_cleanup = false,
+        bool debug = false)
+    :
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
+     *
+     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
+     * cached-in-reserve) to be freed.  See \p FreeAllCached().
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+        // Lock
+        mutex.Lock();
+
+        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
+
+        this->max_cached_bytes = max_cached_bytes;
+
+        // Unlock
+        mutex.Unlock();
+
+        return cudaSuccess;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceAllocate(
+        int             device,             ///< [in] Device on which to place the allocation
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        *d_ptr                          = NULL;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            device = entrypoint_device;
+        }
+
+        // Create a block descriptor for the requested allocation
+        bool found = false;
+        BlockDescriptor search_key(device);
+        search_key.associated_stream = active_stream;
+        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
+
+        if (search_key.bin > max_bin)
+        {
+            // Bin is greater than our maximum bin: allocate the request
+            // exactly and give out-of-bounds bin.  It will not be cached
+            // for reuse when returned.
+            search_key.bin      = INVALID_BIN;
+            search_key.bytes    = bytes;
+        }
+        else
+        {
+            // Search for a suitable cached allocation: lock
+            mutex.Lock();
+
+            if (search_key.bin < min_bin)
+            {
+                // Bin is less than minimum bin: round up
+                search_key.bin      = min_bin;
+                search_key.bytes    = min_bin_bytes;
+            }
+
+            // Iterate through the range of cached blocks on the same device in the same bin
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            while ((block_itr != cached_blocks.end())
+                    && (block_itr->device == device)
+                    && (block_itr->bin == search_key.bin))
+            {
+                // To prevent races with reusing blocks returned by the host but still
+                // in use by the device, only consider cached blocks that are
+                // either (from the active stream) or (from an idle stream)
+                if ((active_stream == block_itr->associated_stream) ||
+                    (CubDebug(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)))
+                {
+                    // Reuse existing cache block.  Insert into live blocks.
+                    found = true;
+                    search_key = *block_itr;
+                    search_key.associated_stream = active_stream;
+                    live_blocks.insert(search_key);
+
+                    // Remove from free blocks
+                    cached_bytes[device].free -= search_key.bytes;
+                    cached_bytes[device].live += search_key.bytes;
+
+                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
+                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+
+                    cached_blocks.erase(block_itr);
+
+                    break;
+                }
+                block_itr++;
+            }
+
+            // Done searching: unlock
+            mutex.Unlock();
+        }
+
+        // Allocate the block if necessary
+        if (!found)
+        {
+            // Set runtime's current device to specified device (entrypoint may not be set)
+            if (device != entrypoint_device)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+                if (CubDebug(error = cudaSetDevice(device))) return error;
+            }
+
+            // Attempt to allocate
+            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
+            {
+                // The allocation attempt failed: free all cached blocks on device and retry
+                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
+                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+                error = cudaSuccess;    // Reset the error we will return
+                cudaGetLastError();     // Reset CUDART's error
+
+                // Lock
+                mutex.Lock();
+
+                // Iterate the range of free blocks on the same device
+                BlockDescriptor free_key(device);
+                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
+
+                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
+                {
+                    // No need to worry about synchronization with the device: cudaFree is
+                    // blocking and will synchronize across all kernels executing
+                    // on the current device
+
+                    // Free device memory and destroy stream event.
+                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
+                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
+
+                    // Reduce balance and erase entry
+                    cached_bytes[device].free -= block_itr->bytes;
+
+                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+                    cached_blocks.erase(block_itr);
+
+                    block_itr++;
+                }
+
+                // Unlock
+                mutex.Unlock();
+
+                // Return under error
+                if (error) return error;
+
+                // Try to allocate again
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
+            }
+
+            // Create ready event
+            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
+                return error;
+
+            // Insert into live blocks
+            mutex.Lock();
+            live_blocks.insert(search_key);
+            cached_bytes[device].live += search_key.bytes;
+            mutex.Unlock();
+
+            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
+                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+            // Attempt to revert back to previous device if necessary
+            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+            {
+                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+            }
+        }
+
+        // Copy device pointer to output parameter
+        *d_ptr = search_key.d_ptr;
+
+        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+        return error;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceAllocate(
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        int             device,
+        void*           d_ptr)
+    {
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+                return error;
+            device = entrypoint_device;
+        }
+
+        // Lock
+        mutex.Lock();
+
+        // Find corresponding block descriptor
+        bool recached = false;
+        BlockDescriptor search_key(d_ptr, device);
+        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+        if (block_itr != live_blocks.end())
+        {
+            // Remove from live blocks
+            search_key = *block_itr;
+            live_blocks.erase(block_itr);
+            cached_bytes[device].live -= search_key.bytes;
+
+            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
+            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
+            {
+                // Insert returned allocation into free blocks
+                recached = true;
+                cached_blocks.insert(search_key);
+                cached_bytes[device].free += search_key.bytes;
+
+                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+            }
+        }
+
+        // First set to specified device (entrypoint may not be set)
+        if (device != entrypoint_device)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            if (CubDebug(error = cudaSetDevice(device))) return error;
+        }
+
+        if (recached)
+        {
+            // Insert the ready event in the associated stream (must have current device set properly)
+            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
+        }
+
+        // Unlock
+        mutex.Unlock();
+
+        if (!recached)
+        {
+            // Free the allocation from the runtime and cleanup the event.
+            if (CubDebug(error = cudaFree(d_ptr))) return error;
+            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+        }
+
+        // Reset device
+        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated with during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        void*           d_ptr)
+    {
+        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices
+     */
+    cudaError_t FreeAllCached()
+    {
+        cudaError_t error         = cudaSuccess;
+        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
+        int current_device        = INVALID_DEVICE_ORDINAL;
+
+        mutex.Lock();
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device].free -= begin->bytes;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
+
+            cached_blocks.erase(begin);
+        }
+
+        mutex.Unlock();
+
+        // Attempt to revert back to entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Destructor
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (!skip_cleanup)
+            FreeAllCached();
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_arch.cuh b/thrust/dependencies/cub/cub/util_arch.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..58d0c738819e29126e8f77bd51500aca228fcca9
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_arch.cuh
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static architectural properties by SM version.
+ */
+
+#pragma once
+
+#include "util_cpp_dialect.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
+        !defined(CUB_USE_COOPERATIVE_GROUPS)
+    #define CUB_USE_COOPERATIVE_GROUPS
+#endif
+
+/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
+/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
+#ifndef CUB_PTX_ARCH
+    #if defined(__NVCOMPILER_CUDA__)
+        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
+        // when compiling both host code and device code. Currently, only one
+        // PTX version can be targeted.
+        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
+    #elif !defined(__CUDA_ARCH__)
+        #define CUB_PTX_ARCH 0
+    #else
+        #define CUB_PTX_ARCH __CUDA_ARCH__
+    #endif
+#endif
+
+#ifndef CUB_IS_DEVICE_CODE
+    #if defined(__NVCOMPILER_CUDA__)
+        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
+        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 1
+    #elif CUB_PTX_ARCH > 0
+        #define CUB_IS_DEVICE_CODE 1
+        #define CUB_IS_HOST_CODE 0
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 0
+    #else
+        #define CUB_IS_DEVICE_CODE 0
+        #define CUB_IS_HOST_CODE 1
+        #define CUB_INCLUDE_DEVICE_CODE 0
+        #define CUB_INCLUDE_HOST_CODE 1
+    #endif
+#endif
+
+/// Maximum number of devices supported.
+#ifndef CUB_MAX_DEVICES
+    #define CUB_MAX_DEVICES 128
+#endif
+
+#if CUB_CPP_DIALECT >= 2011
+    static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
+#endif
+
+/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
+#ifndef CUB_RUNTIME_FUNCTION
+    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
+        #define CUB_RUNTIME_ENABLED
+        #define CUB_RUNTIME_FUNCTION __host__ __device__
+    #else
+        #define CUB_RUNTIME_FUNCTION __host__
+    #endif
+#endif
+
+
+/// Number of threads per warp
+#ifndef CUB_LOG_WARP_THREADS
+    #define CUB_LOG_WARP_THREADS(arch)                      \
+        (5)
+    #define CUB_WARP_THREADS(arch)                          \
+        (1 << CUB_LOG_WARP_THREADS(arch))
+
+    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
+    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
+#endif
+
+
+/// Number of smem banks
+#ifndef CUB_LOG_SMEM_BANKS
+    #define CUB_LOG_SMEM_BANKS(arch)                        \
+        ((arch >= 200) ?                                    \
+            (5) :                                           \
+            (4))
+    #define CUB_SMEM_BANKS(arch)                            \
+        (1 << CUB_LOG_SMEM_BANKS(arch))
+
+    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
+    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
+#endif
+
+
+/// Oversubscription factor
+#ifndef CUB_SUBSCRIPTION_FACTOR
+    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
+        ((arch >= 300) ?                                    \
+            (5) :                                           \
+            ((arch >= 200) ?                                \
+                (3) :                                       \
+                (10)))
+    #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
+#endif
+
+
+/// Prefer padding overhead vs X-way conflicts greater than this threshold
+#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
+    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
+        ((arch >= 300) ?                                    \
+            (1) :                                           \
+            (4))
+    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
+#endif
+
+
+template <
+    int NOMINAL_4B_BLOCK_THREADS,
+    int NOMINAL_4B_ITEMS_PER_THREAD,
+    typename T>
+struct RegBoundScaling
+{
+    enum {
+        ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
+        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
+    };
+};
+
+
+template <
+    int NOMINAL_4B_BLOCK_THREADS,
+    int NOMINAL_4B_ITEMS_PER_THREAD,
+    typename T>
+struct MemBoundScaling
+{
+    enum {
+        ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
+        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
+    };
+};
+
+
+
+
+#endif  // Do not document
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_compiler.cuh b/thrust/dependencies/cub/cub/util_compiler.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9be94922a50619655af4d5f0092f74e346597607
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_compiler.cuh
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Detect compiler information.
+ */
+
+#pragma once
+
+// enumerate host compilers we know about
+#define CUB_HOST_COMPILER_UNKNOWN 0
+#define CUB_HOST_COMPILER_MSVC 1
+#define CUB_HOST_COMPILER_GCC 2
+#define CUB_HOST_COMPILER_CLANG 3
+
+// enumerate device compilers we know about
+#define CUB_DEVICE_COMPILER_UNKNOWN 0
+#define CUB_DEVICE_COMPILER_MSVC 1
+#define CUB_DEVICE_COMPILER_GCC 2
+#define CUB_DEVICE_COMPILER_NVCC 3
+#define CUB_DEVICE_COMPILER_CLANG 4
+
+// figure out which host compiler we're using
+#if defined(_MSC_VER)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC
+#  define CUB_MSVC_VERSION _MSC_VER
+#  define CUB_MSVC_VERSION_FULL _MSC_FULL_VER
+#elif defined(__clang__)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG
+#  define CUB_CLANG_VERSION                                                    \
+    (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
+#elif defined(__GNUC__)
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC
+#  define CUB_GCC_VERSION                                                      \
+    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#  define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN
+#endif // CUB_HOST_COMPILER
+
+// figure out which device compiler we're using
+#if defined(__CUDACC__)
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
+// CUDA-capable clang should behave similar to NVCC.
+#  if defined(__CUDA__)
+#    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
+#  else
+#    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG
+#  endif
+#else
+#  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN
+#endif
diff --git a/thrust/dependencies/cub/cub/util_cpp_dialect.cuh b/thrust/dependencies/cub/cub/util_cpp_dialect.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b4cbe92373ca90ed47ef91a8ea31ae62dd12d6f1
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_cpp_dialect.cuh
@@ -0,0 +1,135 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/*! \file
+ *  \brief Detect the version of the C++ standard used by the compiler.
+ */
+
+#pragma once
+
+#include "util_compiler.cuh"
+
+// Deprecation warnings may be silenced by defining the following macros. These
+// may be combined.
+// - CUB_IGNORE_DEPRECATED_CPP_DIALECT:
+//   Ignore all deprecated C++ dialects and outdated compilers.
+// - CUB_IGNORE_DEPRECATED_CPP_11:
+//   Ignore deprecation warnings when compiling with C++11. C++03 and outdated
+//   compilers will still issue warnings.
+// - CUB_IGNORE_DEPRECATED_COMPILER
+//   Ignore deprecation warnings when using deprecated compilers. Compiling
+//   with C++03 and C++11 will still issue warnings.
+
+// Check for the thrust opt-outs as well:
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && \
+     defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
+#  define    CUB_IGNORE_DEPRECATED_CPP_DIALECT
+#endif
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && \
+     defined(THRUST_IGNORE_DEPRECATED_CPP_11)
+#  define    CUB_IGNORE_DEPRECATED_CPP_11
+#endif
+#if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && \
+     defined(THRUST_IGNORE_DEPRECATED_COMPILER)
+#  define    CUB_IGNORE_DEPRECATED_COMPILER
+#endif
+
+#ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT
+#  define CUB_IGNORE_DEPRECATED_CPP_11
+#  define CUB_IGNORE_DEPRECATED_COMPILER
+#endif
+
+// Define this to override the built-in detection.
+#ifndef CUB_CPP_DIALECT
+
+// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
+// This macro is only defined in MSVC 2015U3+.
+#  ifdef _MSVC_LANG // Do not replace with CUB_HOST_COMPILER test (see above)
+// MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11.
+#    if CUB_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */
+#      define CUB_CPLUSPLUS 201103L /* Fix to 2011 */
+#    else
+#      define CUB_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */
+#    endif // MSVC 2015 C++14 fix
+#  else
+#    define CUB_CPLUSPLUS __cplusplus
+#  endif
+
+// Detect current dialect:
+#  if CUB_CPLUSPLUS < 201103L
+#    define CUB_CPP_DIALECT 2003
+#  elif CUB_CPLUSPLUS < 201402L
+#    define CUB_CPP_DIALECT 2011
+#  elif CUB_CPLUSPLUS < 201703L
+#    define CUB_CPP_DIALECT 2014
+#  elif CUB_CPLUSPLUS == 201703L
+#    define CUB_CPP_DIALECT 2017
+#  elif CUB_CPLUSPLUS > 201703L // unknown, but is higher than 2017.
+#    define CUB_CPP_DIALECT 2020
+#  endif
+
+#  undef CUB_CPLUSPLUS // cleanup
+
+#endif // !CUB_CPP_DIALECT
+
+// Define CUB_COMPILER_DEPRECATION macro:
+#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_COMP_DEPR_IMPL(msg) \
+    __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg))
+#  define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x)
+#  define CUB_COMP_DEPR_IMPL1(x) #x
+#else // clang / gcc:
+#  define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg)
+#  define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr)
+#  define CUB_COMP_DEPR_IMPL1 /* intentionally blank */
+#endif
+
+#define CUB_COMPILER_DEPRECATION(REQ, FIX) \
+  CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+// Minimum required compiler checks:
+#ifndef CUB_IGNORE_DEPRECATED_COMPILER
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000
+     CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler);
+#  endif
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000
+     CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler);
+#  endif
+#  if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910
+     CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler);
+#  endif
+#endif
+
+#if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && CUB_CPP_DIALECT < 2014 && \
+    (CUB_CPP_DIALECT != 2011 || !defined(CUB_IGNORE_DEPRECATED_CPP_11))
+  CUB_COMPILER_DEPRECATION(C++14, pass -std=c++14 to your compiler);
+#endif
+
+#undef CUB_COMPILER_DEPRECATION
+#undef CUB_COMP_DEPR_IMPL
+#undef CUB_COMP_DEPR_IMPL0
+#undef CUB_COMP_DEPR_IMPL1
diff --git a/thrust/dependencies/cub/cub/util_debug.cuh b/thrust/dependencies/cub/cub/util_debug.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8413f7bd4ee476297d6882fbfff860aa39bc4faa
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_debug.cuh
@@ -0,0 +1,162 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Error and event logging routines.
+ *
+ * The following macros definitions are supported:
+ * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "util_namespace.cuh"
+#include "util_arch.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/// CUB error reporting macro (prints error messages to stderr)
+#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
+    #define CUB_STDERR
+#endif
+
+
+
+/**
+ * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
+ *
+ * \return The CUDA error.
+ */
+__host__ __device__ __forceinline__ cudaError_t Debug(
+    cudaError_t     error,
+    const char*     filename,
+    int             line)
+{
+    (void)filename;
+    (void)line;
+
+#ifdef CUB_RUNTIME_ENABLED
+    // Clear the global CUDA error state which may have been set by the last
+    // call. Otherwise, errors may "leak" to unrelated kernel launches.
+    cudaGetLastError();
+#endif
+
+#ifdef CUB_STDERR
+    if (error)
+    {
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+                fflush(stderr);
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
+            #endif
+        }
+    }
+#endif
+    return error;
+}
+
+
+/**
+ * \brief Debug macro
+ */
+#ifndef CubDebug
+    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
+#endif
+
+
+/**
+ * \brief Debug macro with exit
+ */
+#ifndef CubDebugExit
+    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
+#endif
+
+
+/**
+ * \brief Log macro for printf statements.
+ */
+#if !defined(_CubLog)
+    #if defined(__NVCOMPILER_CUDA__)
+        #define _CubLog(format, ...) (__builtin_is_device_code() \
+            ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+                     blockIdx.z, blockIdx.y, blockIdx.x, \
+                     threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \
+            : printf(format, __VA_ARGS__));
+    #elif !(defined(__clang__) && defined(__CUDA__))
+        #if (CUB_PTX_ARCH == 0)
+            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
+        #elif (CUB_PTX_ARCH >= 200)
+            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
+        #endif
+    #else
+        // XXX shameless hack for clang around variadic printf...
+        //     Compilies w/o supplying -std=c++11 but shows warning,
+        //     so we sielence them :)
+        #pragma clang diagnostic ignored "-Wc++11-extensions"
+        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
+            template <class... Args>
+            inline __host__ __device__ void va_printf(char const* format, Args const&... args)
+            {
+        #ifdef __CUDA_ARCH__
+              printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
+        #else
+              printf(format, args...);
+        #endif
+            }
+        #ifndef __CUDA_ARCH__
+            #define _CubLog(format, ...) cub::va_printf(format,__VA_ARGS__);
+        #else
+            #define _CubLog(format, ...) cub::va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
+        #endif
+    #endif
+#endif
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_deprecated.cuh b/thrust/dependencies/cub/cub/util_deprecated.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b2bf4658b54f8cf7ebdc37f8d59356f77b5c30b3
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_deprecated.cuh
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Define CUB_DEPRECATED macro.
+ */
+
+#pragma once
+
+#include "util_compiler.cuh"
+
+#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
+#  define CUB_DEPRECATED __declspec(deprecated)
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
+#  define CUB_DEPRECATED __attribute__((deprecated))
+#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
+#  define CUB_DEPRECATED __attribute__((deprecated))
+#else
+#  define CUB_DEPRECATED
+#endif
+
diff --git a/thrust/dependencies/cub/cub/util_device.cuh b/thrust/dependencies/cub/cub/util_device.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5196f408c6b0c872cd0c6784d0cb273cc04bef2b
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_device.cuh
@@ -0,0 +1,715 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Properties of a given CUDA device and the corresponding PTX bundle
+ */
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_cpp_dialect.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+#include <atomic>
+#include <array>
+#include <cassert>
+#endif
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+
+/**
+ * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
+ */
+template <int ALLOCATIONS>
+__host__ __device__ __forceinline__
+cudaError_t AliasTemporaries(
+    void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t& temp_storage_bytes,                ///< [in,out] Size in bytes of \t d_temp_storage allocation
+    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
+    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
+{
+    const int ALIGN_BYTES   = 256;
+    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
+
+    // Compute exclusive prefix sum over allocation requests
+    size_t allocation_offsets[ALLOCATIONS];
+    size_t bytes_needed = 0;
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+        allocation_offsets[i] = bytes_needed;
+        bytes_needed += allocation_bytes;
+    }
+    bytes_needed += ALIGN_BYTES - 1;
+
+    // Check if the caller is simply requesting the size of the storage allocation
+    if (!d_temp_storage)
+    {
+        temp_storage_bytes = bytes_needed;
+        return cudaSuccess;
+    }
+
+    // Check if enough storage provided
+    if (temp_storage_bytes < bytes_needed)
+    {
+        return CubDebug(cudaErrorInvalidValue);
+    }
+
+    // Alias
+    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
+ */
+template <typename T>
+__global__ void EmptyKernel(void) { }
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Returns the current device or -1 if an error occurred.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    int device = -1;
+    if (CubDebug(cudaGetDevice(&device))) return -1;
+    return device;
+
+#else // Device code without the CUDA runtime.
+
+    return -1;
+
+#endif
+}
+
+/**
+ * \brief RAII helper which saves the current device and switches to the
+ *        specified device on construction and switches to the saved device on
+ *        destruction.
+ */
+struct SwitchDevice
+{
+private:
+    int const old_device;
+    bool const needs_reset;
+public:
+    __host__ __forceinline__ SwitchDevice(int new_device)
+      : old_device(CurrentDevice()), needs_reset(old_device != new_device)
+    {
+        if (needs_reset)
+            CubDebug(cudaSetDevice(new_device));
+    }
+
+    __host__ __forceinline__ ~SwitchDevice()
+    {
+        if (needs_reset)
+            CubDebug(cudaSetDevice(old_device));
+    }
+};
+
+/**
+ * \brief Returns the number of CUDA devices available or -1 if an error
+ *        occurred.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached()
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    int count = -1;
+    if (CubDebug(cudaGetDeviceCount(&count)))
+        // CUDA makes no guarantees about the state of the output parameter if
+        // `cudaGetDeviceCount` fails; in practice, they don't, but out of
+        // paranoia we'll reset `count` to `-1`.
+        count = -1;
+    return count;
+
+#else // Device code without the CUDA runtime.
+
+    return -1;
+
+#endif
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+/**
+ * \brief Cache for an arbitrary value produced by a nullary function.
+ */
+template <typename T, T(*Function)()>
+struct ValueCache
+{
+    T const value;
+
+    /**
+     * \brief Call the nullary function to produce the value and construct the
+     *        cache.
+     */
+    __host__ __forceinline__ ValueCache() : value(Function()) {}
+};
+
+#endif
+
+#if CUB_CPP_DIALECT >= 2011
+// Host code, only safely usable in C++11 or newer, where thread-safe
+// initialization of static locals is guaranteed.  This is a separate function
+// to avoid defining a local static in a host/device function.
+__host__ __forceinline__ int DeviceCountCachedValue()
+{
+    static ValueCache<int, DeviceCountUncached> cache;
+    return cache.value;
+}
+#endif
+
+/**
+ * \brief Returns the number of CUDA devices available.
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount()
+{
+    int result = -1;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11.
+                result = DeviceCountCachedValue();
+            #else
+                // Host code and C++98.
+                result = DeviceCountUncached();
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = DeviceCountUncached();
+        #endif
+    }
+    return result;
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+/**
+ * \brief Per-device cache for a CUDA attribute value; the attribute is queried
+ *        and stored for each device upon construction.
+ */
+struct PerDeviceAttributeCache
+{
+    struct DevicePayload
+    {
+        int         attribute;
+        cudaError_t error;
+    };
+
+    // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
+    // `DeviceEntryInitializing` state, and then proceeds to the
+    // `DeviceEntryReady` state. These are the only state transitions allowed;
+    // e.g. a linear sequence of transitions.
+    enum DeviceEntryStatus
+    {
+        DeviceEntryEmpty = 0,
+        DeviceEntryInitializing,
+        DeviceEntryReady
+    };
+
+    struct DeviceEntry
+    {
+        std::atomic<DeviceEntryStatus> flag;
+        DevicePayload                  payload;
+    };
+
+private:
+    std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;
+
+public:
+    /**
+     * \brief Construct the cache.
+     */
+    __host__ __forceinline__ PerDeviceAttributeCache() : entries_()
+    {
+        assert(DeviceCount() <= CUB_MAX_DEVICES);
+    }
+
+    /**
+     * \brief Retrieves the payload of the cached function \p f for \p device.
+     *
+     * \note You must pass a morally equivalent function in to every call or
+     *       this function has undefined behavior.
+     */
+    template <typename Invocable>
+    __host__ DevicePayload operator()(Invocable&& f, int device)
+    {
+        if (device >= DeviceCount())
+            return DevicePayload{0, cudaErrorInvalidDevice};
+
+        auto& entry   = entries_[device];
+        auto& flag    = entry.flag;
+        auto& payload = entry.payload;
+
+        DeviceEntryStatus old_status = DeviceEntryEmpty;
+
+        // First, check for the common case of the entry being ready.
+        if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
+        {
+            // Assume the entry is empty and attempt to lock it so we can fill
+            // it by trying to set the state from `DeviceEntryReady` to
+            // `DeviceEntryInitializing`.
+            if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
+                                             std::memory_order_acq_rel,
+                                             std::memory_order_acquire))
+            {
+                // We successfully set the state to `DeviceEntryInitializing`;
+                // we have the lock and it's our job to initialize this entry
+                // and then release it.
+
+                // We don't use `CubDebug` here because we let the user code
+                // decide whether or not errors are hard errors.
+                if (payload.error = std::forward<Invocable>(f)(payload.attribute))
+                    // Clear the global CUDA error state which may have been
+                    // set by the last call. Otherwise, errors may "leak" to
+                    // unrelated kernel launches.
+                    cudaGetLastError();
+
+                // Release the lock by setting the state to `DeviceEntryReady`.
+                flag.store(DeviceEntryReady, std::memory_order_release);
+            }
+
+            // If the `compare_exchange_weak` failed, then `old_status` has
+            // been updated with the value of `flag` that it observed.
+
+            else if (old_status == DeviceEntryInitializing)
+            {
+                // Another execution agent is initializing this entry; we need
+                // to wait for them to finish; we'll know they're done when we
+                // observe the entry status as `DeviceEntryReady`.
+                do { old_status = flag.load(std::memory_order_acquire); }
+                while (old_status != DeviceEntryReady);
+                // FIXME: Use `atomic::wait` instead when we have access to
+                // host-side C++20 atomics. We could use libcu++, but it only
+                // supports atomics for SM60 and up, even if you're only using
+                // them in host code.
+            }
+        }
+
+        // We now know that the state of our entry is `DeviceEntryReady`, so
+        // just return the entry's payload.
+        return entry.payload;
+    }
+};
+
+#endif
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version)
+{
+    // Instantiate `EmptyKernel<void>` in both host and device code to ensure
+    // it can be called.
+    typedef void (*EmptyKernelPtr)();
+    EmptyKernelPtr empty_kernel = EmptyKernel<void>;
+
+    // This is necessary for unused variable warnings in host compilers. The
+    // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
+    (void)reinterpret_cast<void*>(empty_kernel);
+
+    cudaError_t result = cudaSuccess;
+    if (CUB_IS_HOST_CODE) {
+       #if CUB_INCLUDE_HOST_CODE
+            cudaFuncAttributes empty_kernel_attrs;
+
+            do {
+                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
+                    break;
+            }
+            while(0);
+
+            ptx_version = empty_kernel_attrs.ptxVersion * 10;
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // This is necessary to ensure instantiation of EmptyKernel in device code.
+            // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
+            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
+            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
+
+            ptx_version = CUB_PTX_ARCH;
+        #endif
+    }
+    return result;
+}
+
+/**
+ * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
+ */
+__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device)
+{
+    SwitchDevice sd(device);
+    return PtxVersionUncached(ptx_version);
+}
+
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+template <typename Tag>
+__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache()
+{
+    // C++11 guarantees that initialization of static locals is thread safe.
+    static PerDeviceAttributeCache cache;
+    return cache;
+}
+
+struct PtxVersionCacheTag {};
+struct SmVersionCacheTag {};
+#endif
+
+/**
+ * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
+{
+#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
+
+    auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
+      // If this call fails, then we get the error code back in the payload,
+      // which we check with `CubDebug` below.
+      [=] (int& pv) { return PtxVersionUncached(pv, device); },
+      device);
+
+    if (!CubDebug(payload.error))
+        ptx_version = payload.attribute;
+
+    return payload.error;
+
+#else // Pre C++11.
+
+    return PtxVersionUncached(ptx_version, device);
+
+#endif
+}
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11.
+                auto const device = CurrentDevice();
+
+                auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
+                  // If this call fails, then we get the error code back in the payload,
+                  // which we check with `CubDebug` below.
+                  [=] (int& pv) { return PtxVersionUncached(pv, device); },
+                  device);
+
+                if (!CubDebug(payload.error))
+                    ptx_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98.
+                result = PtxVersionUncached(ptx_version);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = PtxVersionUncached(ptx_version);
+        #endif
+    }
+    return result;
+}
+
+/**
+ * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
+{
+#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        int major = 0, minor = 0;
+        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
+        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
+        sm_version = major * 100 + minor * 10;
+    }
+    while (0);
+
+    return error;
+
+#else // Device code without the CUDA runtime.
+
+    (void)sm_version;
+    (void)device;
+
+    // CUDA API calls are not supported from this device.
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#endif
+}
+
+/**
+ * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
+ *
+ * \note This function may cache the result internally.
+ *
+ * \note This function is thread safe.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if CUB_CPP_DIALECT >= 2011
+                // Host code and C++11
+                auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
+                  // If this call fails, then we get the error code back in the payload,
+                  // which we check with `CubDebug` below.
+                  [=] (int& pv) { return SmVersionUncached(pv, device); },
+                  device);
+
+                if (!CubDebug(payload.error))
+                    sm_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98
+                result = SmVersionUncached(sm_version, device);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            result = SmVersionUncached(sm_version, device);
+        #endif
+    }
+    return result;
+}
+
+/**
+ * Synchronize the specified \p stream.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
+{
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            result = CubDebug(cudaStreamSynchronize(stream));
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
+                (void)stream;
+                // Device can't yet sync on a specific stream
+                result = CubDebug(cudaDeviceSynchronize());
+            #else // Device code without the CUDA runtime.
+                (void)stream;
+                // CUDA API calls are not supported from this device.
+                result = CubDebug(cudaErrorInvalidConfiguration);
+            #endif
+        #endif
+    }
+    return result;
+}
+
+
+/**
+ * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of the MaxSmOccupancy function.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
+ *
+ * template <typename T>
+ * __global__ void ExampleKernel()
+ * {
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ volatile T buffer[4096];
+ *
+ *        ...
+ * }
+ *
+ *     ...
+ *
+ * // Determine SM occupancy for ExampleKernel specialized for unsigned char
+ * int max_sm_occupancy;
+ * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
+ *
+ * // max_sm_occupancy  <-- 4 on SM10
+ * // max_sm_occupancy  <-- 8 on SM20
+ * // max_sm_occupancy  <-- 12 on SM35
+ *
+ * \endcode
+ *
+ */
+template <typename KernelPtr>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t MaxSmOccupancy(
+    int&                max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
+    int                 block_threads,              ///< [in] Number of threads per thread block
+    int                 dynamic_smem_bytes = 0)
+{
+#ifndef CUB_RUNTIME_ENABLED
+
+    (void)dynamic_smem_bytes;
+    (void)block_threads;
+    (void)kernel_ptr;
+    (void)max_sm_occupancy;
+
+    // CUDA API calls not supported from this device
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#else
+
+    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_sm_occupancy,
+        kernel_ptr,
+        block_threads,
+        dynamic_smem_bytes));
+
+#endif  // CUB_RUNTIME_ENABLED
+}
+
+
+/******************************************************************************
+ * Policy management
+ ******************************************************************************/
+
+/**
+ * Kernel dispatch configuration
+ */
+struct KernelConfig
+{
+    int block_threads;
+    int items_per_thread;
+    int tile_size;
+    int sm_occupancy;
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
+
+    template <typename AgentPolicyT, typename KernelPtrT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Init(KernelPtrT kernel_ptr)
+    {
+        block_threads        = AgentPolicyT::BLOCK_THREADS;
+        items_per_thread     = AgentPolicyT::ITEMS_PER_THREAD;
+        tile_size            = block_threads * items_per_thread;
+        cudaError_t retval   = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
+        return retval;
+    }
+};
+
+
+
+/// Helper for dispatching into a policy chain
+template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
+struct ChainedPolicy
+{
+   /// The policy for the active compiler pass
+   typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
+
+   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
+   template <typename FunctorT>
+   CUB_RUNTIME_FUNCTION __forceinline__
+   static cudaError_t Invoke(int ptx_version, FunctorT& op)
+   {
+       if (ptx_version < PTX_VERSION) {
+           return PrevPolicyT::Invoke(ptx_version, op);
+       }
+       return op.template Invoke<PolicyT>();
+   }
+};
+
+/// Helper for dispatching into a policy chain (end-of-chain specialization)
+template <int PTX_VERSION, typename PolicyT>
+struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
+{
+    /// The policy for the active compiler pass
+    typedef PolicyT ActivePolicy;
+
+    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
+    template <typename FunctorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) {
+        return op.template Invoke<PolicyT>();
+    }
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_macro.cuh b/thrust/dependencies/cub/cub/util_macro.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ff8636542286de8a2fa956522415a46c1b5524ef
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_macro.cuh
@@ -0,0 +1,103 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Common C/C++ macro utilities
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+#ifndef CUB_ALIGN
+    #if defined(_WIN32) || defined(_WIN64)
+        /// Align struct
+        #define CUB_ALIGN(bytes) __declspec(align(32))
+    #else
+        /// Align struct
+        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
+    #endif
+#endif
+
+#ifndef CUB_MAX
+    /// Select maximum(a, b)
+    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_MIN
+    /// Select minimum(a, b)
+    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_QUOTIENT_FLOOR
+    /// Quotient of x/y rounded down to nearest integer
+    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
+#endif
+
+#ifndef CUB_QUOTIENT_CEILING
+    /// Quotient of x/y rounded up to nearest integer
+    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
+#endif
+
+#ifndef CUB_ROUND_UP_NEAREST
+    /// x rounded up to the nearest multiple of y
+    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
+#endif
+
+#ifndef CUB_ROUND_DOWN_NEAREST
+    /// x rounded down to the nearest multiple of y
+    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
+#endif
+
+
+#ifndef CUB_STATIC_ASSERT
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+        #define CUB_CAT_(a, b) a ## b
+        #define CUB_CAT(a, b) CUB_CAT_(a, b)
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// Static assert
+    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
+#endif
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_namespace.cuh b/thrust/dependencies/cub/cub/util_namespace.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4488d97f6bd151e4ed9514d956d5b4590c0e38ce
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_namespace.cuh
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Place-holder for prefixing the cub namespace
+ */
+
+#pragma once
+
+#include "version.cuh"
+
+// For example:
+//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
+//#define CUB_NS_POSTFIX } }
+
+#ifndef CUB_NS_PREFIX
+#define CUB_NS_PREFIX
+#endif
+
+#ifndef CUB_NS_POSTFIX
+#define CUB_NS_POSTFIX
+#endif
+
+// Declare these namespaces here for the purpose of Doxygenating them
+
+/*! \namespace cub
+ *  \brief \p cub is the top-level namespace which contains all CUB
+ *         functions and types.
+ */
+namespace cub
+{
+
+}
diff --git a/thrust/dependencies/cub/cub/util_ptx.cuh b/thrust/dependencies/cub/cub/util_ptx.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3f20c11bebc2a9f9e2704dd88e18930f97f4c0c8
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_ptx.cuh
@@ -0,0 +1,734 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * PTX intrinsics
+ */
+
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilPtx
+ * @{
+ */
+
+
+/******************************************************************************
+ * PTX helper macros
+ ******************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Register modifier for pointer-types (for inlining PTX assembly)
+ */
+#if defined(_WIN64) || defined(__LP64__)
+    #define __CUB_LP64__ 1
+    // 64-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "l"
+    #define _CUB_ASM_PTR_SIZE_ "u64"
+#else
+    #define __CUB_LP64__ 0
+    // 32-bit register modifier for inlined asm
+    #define _CUB_ASM_PTR_ "r"
+    #define _CUB_ASM_PTR_SIZE_ "u32"
+#endif
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Inlined PTX intrinsics
+ ******************************************************************************/
+
+/**
+ * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
+ */
+__device__ __forceinline__ unsigned int SHR_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+    return ret;
+}
+
+
+/**
+ * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
+ */
+__device__ __forceinline__ unsigned int SHL_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Bitfield-extract.
+ */
+template <typename UnsignedBits, int BYTE_LEN>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<BYTE_LEN>      /*byte_len*/)
+{
+    unsigned int bits;
+    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
+    return bits;
+}
+
+
+/**
+ * Bitfield-extract for 64-bit types.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<8>             /*byte_len*/)
+{
+    const unsigned long long MASK = (1ull << num_bits) - 1;
+    return (source >> bit_start) & MASK;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits source,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
+}
+
+
+/**
+ * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
+ */
+__device__ __forceinline__ void BFI(
+    unsigned int &ret,
+    unsigned int x,
+    unsigned int y,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
+        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
+}
+
+
+/**
+ * \brief Three-operand add.  Returns \p x + \p y + \p z.
+ */
+__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
+{
+    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
+    return x;
+}
+
+
+/**
+ * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
+ *
+ * \par
+ * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
+ * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
+ * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
+ * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
+ *
+ * \par Snippet
+ * The code snippet below illustrates byte-permute.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     int a        = 0x03020100;
+ *     int b        = 0x07060504;
+ *     int index    = 0x00007531;
+ *
+ *     int selected = PRMT(a, b, index);    // 0x07050301
+ *
+ * \endcode
+ *
+ */
+__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
+{
+    int ret;
+    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Sync-threads barrier.
+ */
+__device__ __forceinline__ void BAR(int count)
+{
+    asm volatile("bar.sync 1, %0;" : : "r"(count));
+}
+
+/**
+ * CTA barrier
+ */
+__device__  __forceinline__ void CTA_SYNC()
+{
+    __syncthreads();
+}
+
+
+/**
+ * CTA barrier with predicate
+ */
+__device__  __forceinline__ int CTA_SYNC_AND(int p)
+{
+    return __syncthreads_and(p);
+}
+
+
+/**
+ * Warp barrier
+ */
+__device__  __forceinline__ void WARP_SYNC(unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    __syncwarp(member_mask);
+#endif
+}
+
+
+/**
+ * Warp any
+ */
+__device__  __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __any_sync(member_mask, predicate);
+#else
+    return ::__any(predicate);
+#endif
+}
+
+
+/**
+ * Warp any
+ */
+__device__  __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __all_sync(member_mask, predicate);
+#else
+    return ::__all(predicate);
+#endif
+}
+
+
+/**
+ * Warp ballot
+ */
+__device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __ballot_sync(member_mask, predicate);
+#else
+    return __ballot(predicate);
+#endif
+}
+
+/**
+ * Warp synchronous shfl_up
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_down
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_idx
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
+#else
+    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
+#endif
+    return word;
+}
+
+/**
+ * Floating point multiply. (Mantissa LSB rounds towards zero.)
+ */
+__device__ __forceinline__ float FMUL_RZ(float a, float b)
+{
+    float d;
+    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
+    return d;
+}
+
+
+/**
+ * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
+ */
+__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
+{
+    float d;
+    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
+    return d;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Terminates the calling thread
+ */
+__device__ __forceinline__ void ThreadExit() {
+    asm volatile("exit;");
+}    
+
+
+/**
+ * \brief  Abort execution and generate an interrupt to the host CPU
+ */
+__device__ __forceinline__ void ThreadTrap() {
+    asm volatile("trap;");
+}
+
+
+/**
+ * \brief Returns the row-major linear thread identifier for a multidimensional thread block
+ */
+__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
+{
+    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
+            ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
+            threadIdx.x;
+}
+
+
+/**
+ * \brief Returns the warp lane ID of the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
+    return ret;
+}
+
+
+/**
+ * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
+ */
+__device__ __forceinline__ unsigned int WarpId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskLt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskLe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskGt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
+ */
+__device__ __forceinline__ unsigned int LaneMaskGe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
+    return ret;
+}
+
+/** @} */       // end group UtilPtx
+
+
+
+
+/**
+ * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
+ * \ingroup WarpModule
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * predecessor of its predecessor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks below
+ *     double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleUp(
+    T               input,              ///< [in] The value to broadcast
+    int             src_offset,         ///< [in] The relative down-offset of the peer to read from
+    int             first_thread,       ///< [in] Index of first lane in logical warp (typically 0)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+ 
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
+ * \ingroup WarpModule
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * successor of its successor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks below
+ *     double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleDown(
+    T               input,              ///< [in] The value to broadcast
+    int             src_offset,         ///< [in] The relative up-offset of the peer to read from
+    int             last_thread,        ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word    = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
+ * contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
+ * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
+ *
+ * \tparam LOGICAL_WARP_THREADS     The number of threads per "logical" warp.  Must be a power-of-two <= 32.
+ * \tparam T                        <b>[inferred]</b> The input/output element type
+ *
+ * \ingroup WarpModule
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
+ *
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from thread 0
+ *     double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
+ *
+ */
+template <
+    int LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    typename T>
+__device__ __forceinline__ T ShuffleIndex(
+    T               input,                  ///< [in] The value to broadcast
+    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
+    unsigned int    member_mask)            ///< [in] 32-bit mask of participating warp lanes
+{
+    /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+    enum {
+        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
+    };
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
+                                 src_lane,
+                                 SHFL_C,
+                                 member_mask);
+
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
+                                     src_lane,
+                                     SHFL_C,
+                                     member_mask);
+
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+
+/**
+ * Compute a 32b mask of threads having the same least-significant
+ * LABEL_BITS of \p label as the calling thread.
+ */
+template <int LABEL_BITS>
+inline __device__ unsigned int MatchAny(unsigned int label)
+{
+    unsigned int retval;
+
+    // Extract masks of common threads for each bit
+    #pragma unroll
+    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
+    {
+        unsigned int mask;
+        unsigned int current_bit = 1 << BIT;
+        asm ("{\n"
+            "    .reg .pred p;\n"
+            "    and.b32 %0, %1, %2;"
+            "    setp.eq.u32 p, %0, %2;\n"
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
+#else
+            "    vote.ballot.b32 %0, p;\n"
+#endif
+            "    @!p not.b32 %0, %0;\n"
+            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
+
+        // Remove peers who differ
+        retval = (BIT == 0) ? mask : retval & mask;
+    }
+
+    return retval;
+
+//  // VOLTA match
+//    unsigned int retval;
+//    asm ("{\n"
+//         "    match.any.sync.b32 %0, %1, 0xffffffff;\n"
+//         "}\n" : "=r"(retval) : "r"(label));
+//    return retval;
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/util_type.cuh b/thrust/dependencies/cub/cub/util_type.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0ba41e1ed26e56c11f373fd235fc9dee88fd213c
--- /dev/null
+++ b/thrust/dependencies/cub/cub/util_type.cuh
@@ -0,0 +1,1167 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Common type manipulation (metaprogramming) utilities
+ */
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+#include <cfloat>
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    #include <cuda_fp16.h>
+#endif
+
+#include "util_macro.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+
+/******************************************************************************
+ * Type equality
+ ******************************************************************************/
+
+/**
+ * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
+ */
+template <bool IF, typename ThenType, typename ElseType>
+struct If
+{
+    /// Conditional type result
+    typedef ThenType Type;      // true
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename ThenType, typename ElseType>
+struct If<false, ThenType, ElseType>
+{
+    typedef ElseType Type;      // false
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Type equality test
+ */
+template <typename A, typename B>
+struct Equals
+{
+    enum {
+        VALUE = 0,
+        NEGATE = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename A>
+struct Equals <A, A>
+{
+    enum {
+        VALUE = 1,
+        NEGATE = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Static math
+ ******************************************************************************/
+
+/**
+ * \brief Statically determine log2(N), rounded up.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ */
+template <int N, int CURRENT_VAL = N, int COUNT = 0>
+struct Log2
+{
+    /// Static logarithm value
+    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <int N, int COUNT>
+struct Log2<N, 0, COUNT>
+{
+    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
+        COUNT :
+        COUNT - 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Statically determine if N is a power-of-two
+ */
+template <int N>
+struct PowerOfTwo
+{
+    enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+
+
+/******************************************************************************
+ * Pointer vs. iterator detection
+ ******************************************************************************/
+
+/**
+ * \brief Pointer vs. iterator
+ */
+template <typename Tp>
+struct IsPointer
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsPointer<Tp*>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Qualifier detection
+ ******************************************************************************/
+
+/**
+ * \brief Volatile modifier test
+ */
+template <typename Tp>
+struct IsVolatile
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsVolatile<Tp volatile>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Qualifier removal
+ ******************************************************************************/
+
+/**
+ * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
+ *
+ * For example:
+ *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
+ */
+template <typename Tp, typename Up = Tp>
+struct RemoveQualifiers
+{
+    /// Type without \p const and \p volatile qualifiers
+    typedef Up Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, volatile Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const volatile Up>
+{
+    typedef Up Type;
+};
+
+
+/******************************************************************************
+ * Marker types
+ ******************************************************************************/
+
+/**
+ * \brief A simple "NULL" marker type
+ */
+struct NullType
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <typename T>
+    __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
+
+    __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; }
+
+    __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+
+/**
+ * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
+ */
+template <int A>
+struct Int2Type
+{
+   enum {VALUE = A};
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/******************************************************************************
+ * Size and alignment
+ ******************************************************************************/
+
+/// Structure alignment
+template <typename T>
+struct AlignBytes
+{
+    struct Pad
+    {
+        T       val;
+        char    byte;
+    };
+
+    enum
+    {
+        /// The "true CUDA" alignment of T in bytes
+        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
+    };
+
+    /// The "truly aligned" type
+    typedef T Type;
+};
+
+// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
+// with device C++ compilers (EDG) on types passed as template parameters through
+// kernel functions
+
+#define __CUB_ALIGN_BYTES(t, b)         \
+    template <> struct AlignBytes<t>    \
+    { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
+
+__CUB_ALIGN_BYTES(short4, 8)
+__CUB_ALIGN_BYTES(ushort4, 8)
+__CUB_ALIGN_BYTES(int2, 8)
+__CUB_ALIGN_BYTES(uint2, 8)
+__CUB_ALIGN_BYTES(long long, 8)
+__CUB_ALIGN_BYTES(unsigned long long, 8)
+__CUB_ALIGN_BYTES(float2, 8)
+__CUB_ALIGN_BYTES(double, 8)
+#ifdef _WIN32
+    __CUB_ALIGN_BYTES(long2, 8)
+    __CUB_ALIGN_BYTES(ulong2, 8)
+#else
+    __CUB_ALIGN_BYTES(long2, 16)
+    __CUB_ALIGN_BYTES(ulong2, 16)
+#endif
+__CUB_ALIGN_BYTES(int4, 16)
+__CUB_ALIGN_BYTES(uint4, 16)
+__CUB_ALIGN_BYTES(float4, 16)
+__CUB_ALIGN_BYTES(long4, 16)
+__CUB_ALIGN_BYTES(ulong4, 16)
+__CUB_ALIGN_BYTES(longlong2, 16)
+__CUB_ALIGN_BYTES(ulonglong2, 16)
+__CUB_ALIGN_BYTES(double2, 16)
+__CUB_ALIGN_BYTES(longlong4, 16)
+__CUB_ALIGN_BYTES(ulonglong4, 16)
+__CUB_ALIGN_BYTES(double4, 16)
+
+template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
+
+
+/// Unit-words of data movement
+template <typename T>
+struct UnitWord
+{
+    enum {
+        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
+    };
+
+    template <typename Unit>
+    struct IsMultiple
+    {
+        enum {
+            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
+            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
+        };
+    };
+
+    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
+        unsigned int,
+        typename If<IsMultiple<short>::IS_MULTIPLE,
+            unsigned short,
+            unsigned char>::Type>::Type         ShuffleWord;
+
+    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
+        unsigned long long,
+        ShuffleWord>::Type                      VolatileWord;
+
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
+        ulonglong2,
+        VolatileWord>::Type                     DeviceWord;
+
+    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
+        uint4,
+        typename If<IsMultiple<int2>::IS_MULTIPLE,
+            uint2,
+            ShuffleWord>::Type>::Type           TextureWord;
+};
+
+
+// float2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float2>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float       VolatileWord;
+    typedef uint2       DeviceWord;
+#else
+    typedef unsigned long long   VolatileWord;
+    typedef unsigned long long   DeviceWord;
+#endif
+    typedef float2      TextureWord;
+};
+
+// float4 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float4>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float               VolatileWord;
+    typedef uint4               DeviceWord;
+#else
+    typedef unsigned long long  VolatileWord;
+    typedef ulonglong2          DeviceWord;
+#endif
+    typedef float4              TextureWord;
+};
+
+
+// char2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <char2>
+{
+    typedef unsigned short      ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef unsigned short      VolatileWord;
+    typedef short               DeviceWord;
+#else
+    typedef unsigned short      VolatileWord;
+    typedef unsigned short      DeviceWord;
+#endif
+    typedef unsigned short      TextureWord;
+};
+
+
+template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Vector type inference utilities.
+ ******************************************************************************/
+
+/**
+ * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
+ */
+template <typename T, int vec_elements> struct CubVector;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+enum
+{
+    /// The maximum number of elements in CUDA vector types
+    MAX_VEC_ELEMENTS = 4,
+};
+
+
+/**
+ * Generic vector-1 type
+ */
+template <typename T>
+struct CubVector<T, 1>
+{
+    T x;
+
+    typedef T BaseType;
+    typedef CubVector<T, 1> Type;
+};
+
+/**
+ * Generic vector-2 type
+ */
+template <typename T>
+struct CubVector<T, 2>
+{
+    T x;
+    T y;
+
+    typedef T BaseType;
+    typedef CubVector<T, 2> Type;
+};
+
+/**
+ * Generic vector-3 type
+ */
+template <typename T>
+struct CubVector<T, 3>
+{
+    T x;
+    T y;
+    T z;
+
+    typedef T BaseType;
+    typedef CubVector<T, 3> Type;
+};
+
+/**
+ * Generic vector-4 type
+ */
+template <typename T>
+struct CubVector<T, 4>
+{
+    T x;
+    T y;
+    T z;
+    T w;
+
+    typedef T BaseType;
+    typedef CubVector<T, 4> Type;
+};
+
+
+/**
+ * Macro for expanding partially-specialized built-in vector types
+ */
+#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
+                                                                                                        \
+    template<> struct CubVector<base_type, 1> : short_type##1                                           \
+    {                                                                                                   \
+      typedef base_type       BaseType;                                                                 \
+      typedef short_type##1   Type;                                                                     \
+      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x + other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x - other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 2> : short_type##2                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##2   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 3> : short_type##3                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##3   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 4> : short_type##4                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##4   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            retval.w = w + other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            retval.w = w - other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };
+
+
+
+// Expand CUDA vector types for built-in primitives
+CUB_DEFINE_VECTOR_TYPE(char,               char)
+CUB_DEFINE_VECTOR_TYPE(signed char,        char)
+CUB_DEFINE_VECTOR_TYPE(short,              short)
+CUB_DEFINE_VECTOR_TYPE(int,                int)
+CUB_DEFINE_VECTOR_TYPE(long,               long)
+CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
+CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
+CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
+CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
+CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
+CUB_DEFINE_VECTOR_TYPE(float,              float)
+CUB_DEFINE_VECTOR_TYPE(double,             double)
+CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
+
+// Undefine macros
+#undef CUB_DEFINE_VECTOR_TYPE
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Wrapper types
+ ******************************************************************************/
+
+/**
+ * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
+ */
+template <typename T>
+struct Uninitialized
+{
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    /// Backing storage
+    DeviceWord storage[WORDS];
+
+    /// Alias
+    __host__ __device__ __forceinline__ T& Alias()
+    {
+        return reinterpret_cast<T&>(*this);
+    }
+};
+
+
+/**
+ * \brief A key identifier paired with a corresponding value
+ */
+template <
+    typename    _Key,
+    typename    _Value
+#if defined(_WIN32) && !defined(_WIN64)
+    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
+    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+    >
+struct KeyValuePair
+{
+    typedef _Key    Key;                ///< Key data type
+    typedef _Value  Value;              ///< Value data type
+
+    Key     key;                        ///< Item key
+    Value   value;                      ///< Item value
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+/**
+ * Win32 won't do 16B alignment.  This can present two problems for
+ * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
+ * 1) If a smaller-aligned item were to be listed first, the host compiler places the
+ *    should-be-16B item at too early an offset (and disagrees with device compiler)
+ * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
+ *    of the struct wrong (and disagrees with device compiler)
+ *
+ * So we put the larger-should-be-aligned item first, and explicitly pad the
+ * end of the struct
+ */
+
+/// Smaller key specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, true, false>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
+
+    Value   value;  // Value has larger would-be alignment and goes first
+    Key     key;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+
+/// Smaller value specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, false, true>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
+
+    Key     key;    // Key has larger would-be alignment and goes first
+    Value   value;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * \brief A wrapper for passing simple static arrays as kernel parameters
+ */
+template <typename T, int COUNT>
+struct ArrayWrapper
+{
+
+    /// Statically-sized array of type \p T
+    T array[COUNT];
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArrayWrapper() {}
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
+ *
+ * Many multi-pass computations require a pair of "ping-pong" storage
+ * buffers (e.g., one for reading from and the other for writing to, and then
+ * vice-versa for the subsequent pass).  This structure wraps a set of device
+ * buffers and a "selector" member to track which is "current".
+ */
+template <typename T>
+struct DoubleBuffer
+{
+    /// Pair of device buffer pointers
+    T *d_buffers[2];
+
+    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
+    int selector;
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer()
+    {
+        selector = 0;
+        d_buffers[0] = NULL;
+        d_buffers[1] = NULL;
+    }
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer(
+        T *d_current,         ///< The currently valid buffer
+        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
+    {
+        selector = 0;
+        d_buffers[0] = d_current;
+        d_buffers[1] = d_alternate;
+    }
+
+    /// \brief Return pointer to the currently valid buffer
+    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
+
+    /// \brief Return pointer to the currently invalid buffer
+    __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; }
+
+};
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
+ */
+#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
+    template <typename T>                                               \
+    struct detector_name                                                \
+    {                                                                   \
+        template <typename C>                                           \
+        static char& test(typename C::nested_type_name*);               \
+        template <typename>                                             \
+        static int& test(...);                                          \
+        enum                                                            \
+        {                                                               \
+            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
+        };                                                              \
+    };
+
+
+
+/******************************************************************************
+ * Simple enable-if (similar to Boost)
+ ******************************************************************************/
+
+/**
+ * \brief Simple enable-if (similar to Boost)
+ */
+template <bool Condition, class T = void>
+struct EnableIf
+{
+    /// Enable-if type for SFINAE dummy variables
+    typedef T Type;
+};
+
+
+template <class T>
+struct EnableIf<false, T> {};
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+/**
+ * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
+ */
+template <typename T, typename BinaryOp>
+struct BinaryOpHasIdxParam
+{
+private:
+/*
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
+*/
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
+/*
+    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
+*/
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+
+    template <typename BinaryOpT> static int Test(...);
+
+public:
+
+    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
+    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
+};
+
+
+
+
+/******************************************************************************
+ * Simple type traits utilities.
+ *
+ * For example:
+ *     Traits<int>::CATEGORY             // SIGNED_INTEGER
+ *     Traits<NullType>::NULL_TYPE       // true
+ *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
+ *     Traits<uint4>::PRIMITIVE;         // false
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Basic type traits categories
+ */
+enum Category
+{
+    NOT_A_NUMBER,
+    SIGNED_INTEGER,
+    UNSIGNED_INTEGER,
+    FLOATING_POINT
+};
+
+
+/**
+ * \brief Basic type traits
+ */
+template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
+struct BaseTraits
+{
+    /// Category
+    static const Category CATEGORY      = _CATEGORY;
+    enum
+    {
+        PRIMITIVE       = _PRIMITIVE,
+        NULL_TYPE       = _NULL_TYPE,
+    };
+};
+
+
+/**
+ * Basic type traits (unsigned primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = UNSIGNED_INTEGER;
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key;
+    }
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key;
+    }
+
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+
+/**
+ * Basic type traits (signed primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = SIGNED_INTEGER;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+template <typename _T>
+struct FpLimits;
+
+template <>
+struct FpLimits<float>
+{
+    static __host__ __device__ __forceinline__ float Max() {
+        return FLT_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ float Lowest() {
+        return FLT_MAX * float(-1);
+    }
+};
+
+template <>
+struct FpLimits<double>
+{
+    static __host__ __device__ __forceinline__ double Max() {
+        return DBL_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ double Lowest() {
+        return DBL_MAX  * double(-1);
+    }
+};
+
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+template <>
+struct FpLimits<__half>
+{
+    static __host__ __device__ __forceinline__ __half Max() {
+        unsigned short max_word = 0x7BFF;
+        return reinterpret_cast<__half&>(max_word);
+    }
+
+    static __host__ __device__ __forceinline__ __half Lowest() {
+        unsigned short lowest_word = 0xFBFF;
+        return reinterpret_cast<__half&>(lowest_word);
+    }
+};
+#endif
+
+
+/**
+ * Basic type traits (fp primitive specialization)
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = FLOATING_POINT;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
+        return key ^ mask;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
+        return key ^ mask;
+    };
+
+    static __host__ __device__ __forceinline__ T Max() {
+        return FpLimits<T>::Max();
+    }
+
+    static __host__ __device__ __forceinline__ T Lowest() {
+        return FpLimits<T>::Lowest();
+    }
+};
+
+
+/**
+ * \brief Numeric type traits
+ */
+template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
+
+template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
+
+template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
+template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
+template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
+template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
+template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
+template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
+
+template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
+template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
+template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
+template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
+template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
+
+template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
+template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
+#endif
+
+template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
+
+
+
+/**
+ * \brief Type traits
+ */
+template <typename T>
+struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/version.cuh b/thrust/dependencies/cub/cub/version.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..122fb9a7625da78287bacf0f4d51e3779dfd2dd5
--- /dev/null
+++ b/thrust/dependencies/cub/cub/version.cuh
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/*! \file version.h
+ *  \brief Compile-time macros encoding CUB release version
+ *
+ *         <cub/version.h> is the only CUB header that is guaranteed to
+ *         change with every CUB release.
+ *
+ */
+
+#pragma once
+
+/*! \def CUB_VERSION
+ *  \brief The preprocessor macro \p CUB_VERSION encodes the version
+ *         number of the CUB library.
+ *
+ *         <tt>CUB_VERSION % 100</tt> is the sub-minor version.
+ *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
+ *         <tt>CUB_VERSION / 100000</tt> is the major version.
+ */
+#define CUB_VERSION 101000
+
+/*! \def CUB_MAJOR_VERSION
+ *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
+ *         major version number of the CUB library.
+ */
+#define CUB_MAJOR_VERSION     (CUB_VERSION / 100000)
+
+/*! \def CUB_MINOR_VERSION
+ *  \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the
+ *         minor version number of the CUB library.
+ */
+#define CUB_MINOR_VERSION     (CUB_VERSION / 100 % 1000)
+
+/*! \def CUB_SUBMINOR_VERSION
+ *  \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the
+ *         sub-minor version number of the CUB library.
+ */
+#define CUB_SUBMINOR_VERSION  (CUB_VERSION % 100)
+
+/*! \def CUB_PATCH_NUMBER
+ *  \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the
+ *         patch number of the CUB library.
+ */
+#define CUB_PATCH_NUMBER 0
diff --git a/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_shfl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..dbc56ec1bc46ca7548b58daf3b6a797b2b815585
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -0,0 +1,542 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+
+#include <stdint.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpReduceShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp reduction steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// Number of logical warps in a PTX warp
+        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+
+    };
+
+    template <typename S>
+    struct IsInteger
+    {
+        enum {
+            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+
+    /// Shared memory storage layout type
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    /// Lane index in logical warp
+    int lane_id;
+
+    /// Logical warp index in 32-thread physical warp
+    int warp_id;
+
+    /// 32-thread physical warp member mask of logical warp
+    uint32_t member_mask;
+
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceShfl(
+        TempStorage &/*temp_storage*/)
+    {
+        lane_id = static_cast<int>(LaneId());
+        warp_id = 0;
+        member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+        if (!IS_ARCH_WARP)
+        {
+            warp_id = lane_id / LOGICAL_WARP_THREADS;
+            lane_id = lane_id % LOGICAL_WARP_THREADS;
+            member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Reduction steps
+    //---------------------------------------------------------------------
+
+    /// Reduction (specialized for summation across uint32 types)
+    __device__ __forceinline__ unsigned int ReduceStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across fp32 types)
+    __device__ __forceinline__ float ReduceStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across unsigned long long types)
+    __device__ __forceinline__ unsigned long long ReduceStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across long long types)
+    __device__ __forceinline__ long long ReduceStep(
+        long long           input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across double types)
+    __device__ __forceinline__ double ReduceStep(
+        double              input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+        int shfl_c = last_lane | SHFL_C;   // Shuffle control (mask and last_lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
+    template <typename ValueT, typename KeyT>
+    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
+        KeyValuePair<KeyT, ValueT>                  input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceByKeyOp<cub::Sum> >     /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                                         last_lane,          ///< [in] Index of last lane in segment
+        int                                         offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<KeyT, ValueT> output;
+
+        KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.key, offset, last_lane, member_mask);
+
+        output.key = input.key;
+        output.value = ReduceStep(
+            input.value,
+            cub::Sum(),
+            last_lane,
+            offset,
+            Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key != other_key)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+
+    /// Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
+    template <typename ValueT, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
+        KeyValuePair<OffsetT, ValueT>                 input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >   /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                                           last_lane,          ///< [in] Index of last lane in segment
+        int                                           offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, ValueT> output;
+
+        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+        output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+    /// Reduction step (generic)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T                  input,              ///< [in] Calling thread's input item.
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        _T output = input;
+
+        _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);
+
+        // Perform reduction op if valid
+        if (offset + lane_id <= last_lane)
+            output = reduction_op(input, temp);
+
+        return output;
+    }
+
+
+    /// Reduction step (specialized for small unsigned integers size 32b or less)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    /// Reduction step (specialized for types other than small unsigned integers size 32b or less)
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Templated inclusive scan iteration
+    //---------------------------------------------------------------------
+
+    template <typename ReductionOp, int STEP>
+    __device__ __forceinline__ void ReduceStep(
+        T&              input,              ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        Int2Type<STEP>  /*step*/)
+    {
+        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+
+        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
+    }
+
+    template <typename ReductionOp>
+    __device__ __forceinline__ void ReduceStep(
+        T&              /*input*/,              ///< [in] Calling thread's input item.
+        ReductionOp     /*reduction_op*/,       ///< [in] Binary reduction operator
+        int             /*last_lane*/,          ///< [in] Index of last lane in segment
+        Int2Type<STEPS> /*step*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Reduction operations
+    //---------------------------------------------------------------------
+
+    /// Reduction
+    template <
+        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename        ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                  ///< [in] Calling thread's input
+        int             valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
+    {
+        int last_lane = (ALL_LANES_VALID) ?
+                            LOGICAL_WARP_THREADS - 1 :
+                            valid_items - 1;
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+
+
+    /// Segmented reduction
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
+    {
+        // Get the start flags for each thread in the warp.
+        int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        // Convert to tail-segmented
+        if (HEAD_SEGMENTED)
+            warp_flags >>= 1;
+
+        // Mask out the bits below the current thread
+        warp_flags &= LaneMaskGe();
+
+        // Mask of physical lanes outside the logical warp and convert to logical lanemask
+        if (!IS_ARCH_WARP)
+        {
+            warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS);
+        }
+
+        // Mask in the last lane of logical warp
+        warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
+
+        // Find the next set flag
+        int last_lane = __clz(__brev(warp_flags));
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_smem.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2442a8c4f24b948e58871ae1387fce17cbb63ffe
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/specializations/warp_reduce_smem.cuh
@@ -0,0 +1,372 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpReduceSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+
+        /// FlagT status (when not using ballot)
+        UNSET   = 0x0,  // Is initially unset
+        SET     = 0x1,  // Is initially set
+        SEEN    = 0x2,  // Has seen another head flag from a successor peer
+    };
+
+    /// Shared memory flag type
+    typedef unsigned char SmemFlag;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    struct _TempStorage
+    {
+        T           reduce[WARP_SMEM_ELEMENTS];
+        SmemFlag    flags[WARP_SMEM_ELEMENTS];
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+    unsigned int    member_mask;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Regular reduction
+    //---------------------------------------------------------------------
+
+    /**
+     * Reduction step
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp,
+        int                 STEP>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp         reduction_op,           ///< [in] Reduction operator
+        Int2Type<STEP>      /*step*/)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share input through buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+        WARP_SYNC(member_mask);
+
+        // Update input if peer_addend is in range
+        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
+        {
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+            input = reduction_op(input, peer_addend);
+        }
+
+        WARP_SYNC(member_mask);
+
+        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
+    }
+
+
+    /**
+     * Reduction step (terminate)
+     */
+    template <
+        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                      ///< [in] Calling thread's input
+        int                 valid_items,                ///< [in] Total number of valid items across the logical warp
+        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator
+        Int2Type<STEPS>     /*step*/)
+    {
+        return input;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Segmented reduction
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Ballot-based segmented reduce
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        // Get the start flags for each thread in the warp.
+        int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        if (!HEAD_SEGMENTED)
+            warp_flags <<= 1;
+
+        // Keep bits above the current thread.
+        warp_flags &= LaneMaskGt();
+
+        // Accommodate packing of multiple logical warps in a single physical warp
+        if (!IS_ARCH_WARP)
+        {
+            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
+        }
+
+        // Find next flag
+        int next_flag = __clz(__brev(warp_flags));
+
+        // Clip the next segment at the warp boundary if necessary
+        if (LOGICAL_WARP_THREADS != 32)
+            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input into buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Update input if peer_addend is in range
+            if (OFFSET + lane_id < next_flag)
+            {
+                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+                input = reduction_op(input, peer_addend);
+            }
+
+            WARP_SYNC(member_mask);
+        }
+
+        return input;
+    }
+
+
+    /**
+     * Smem-based segmented reduce
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        enum
+        {
+            UNSET   = 0x0,  // Is initially unset
+            SET     = 0x1,  // Is initially set
+            SEEN    = 0x2,  // Has seen another head flag from a successor peer
+        };
+
+        // Alias flags onto shared data storage
+        volatile SmemFlag *flag_storage = temp_storage.flags;
+
+        SmemFlag flag_status = (flag) ? SET : UNSET;
+
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input through buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Get peer from buffer
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+
+            WARP_SYNC(member_mask);
+
+            // Share flag through buffer
+            flag_storage[lane_id] = flag_status;
+
+            // Get peer flag from buffer
+            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
+
+            // Update input if peer was in range
+            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+            {
+                if (HEAD_SEGMENTED)
+                {
+                    // Head-segmented
+                    if ((flag_status & SEEN) == 0)
+                    {
+                        // Has not seen a more distant head flag
+                        if (peer_flag_status & SET)
+                        {
+                            // Has now seen a head flag
+                            flag_status |= SEEN;
+                        }
+                        else
+                        {
+                            // Peer is not a head flag: grab its count
+                            input = reduction_op(input, peer_addend);
+                        }
+
+                        // Update seen status to include that of peer
+                        flag_status |= (peer_flag_status & SEEN);
+                    }
+                }
+                else
+                {
+                    // Tail-segmented.  Simply propagate flag status
+                    if (!flag_status)
+                    {
+                        input = reduction_op(input, peer_addend);
+                        flag_status |= peer_flag_status;
+                    }
+
+                }
+            }
+        }
+
+        return input;
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * Reduction
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
+        ReductionOp         reduction_op)           ///< [in] Reduction operator
+    {
+        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
+    }
+
+
+    /**
+     * Segmented reduction
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT            flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Reduction operator
+    {
+        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/dependencies/cub/cub/warp/specializations/warp_scan_shfl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..18b46dd9981728995c9bd0a0708c5355cf0dd1a0
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -0,0 +1,632 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../util_type.cuh"
+#include "../../util_ptx.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpScanShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+    };
+
+    template <typename S>
+    struct IntegerTraits
+    {
+        enum {
+            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+    /// Shared memory storage layout type
+    struct TempStorage {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    /// Lane index in logical warp
+    unsigned int lane_id;
+
+    /// Logical warp index in 32-thread physical warp
+    unsigned int warp_id;
+
+    /// 32-thread physical warp member mask of logical warp
+    unsigned int member_mask;
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanShfl(
+        TempStorage &/*temp_storage*/)
+    {
+        lane_id = LaneId();
+        warp_id = 0;
+        member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+        if (!IS_ARCH_WARP)
+        {
+            warp_id = lane_id / LOGICAL_WARP_THREADS;
+            lane_id = lane_id % LOGICAL_WARP_THREADS;
+            member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scan steps
+    //---------------------------------------------------------------------
+
+    /// Inclusive prefix scan step (specialized for summation across int32 types)
+    __device__ __forceinline__ int InclusiveScanStep(
+        int             input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+    /// Inclusive prefix scan step (specialized for summation across uint32 types)
+    __device__ __forceinline__ unsigned int InclusiveScanStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp32 types)
+    __device__ __forceinline__ float InclusiveScanStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across unsigned long long types)
+    __device__ __forceinline__ unsigned long long InclusiveScanStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across long long types)
+    __device__ __forceinline__ long long InclusiveScanStep(
+        long long       input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp64 types)
+    __device__ __forceinline__ double InclusiveScanStep(
+        double          input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+/*
+    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, Value> types)
+    template <typename Value, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
+        KeyValuePair<OffsetT, Value>    input,              ///< [in] Calling thread's input item.
+        ReduceBySegmentOp<cub::Sum>     scan_op,            ///< [in] Binary scan operator
+        int                             first_lane,         ///< [in] Index of first lane in segment
+        int                             offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, Value> output;
+
+        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
+        output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+*/
+
+    /// Inclusive prefix scan step (generic)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        _T temp = ShuffleUp<LOGICAL_WARP_THREADS>(input, offset, first_lane, member_mask);
+
+        // Perform scan op if from a valid peer
+        _T output = scan_op(temp, input);
+        if (static_cast<int>(lane_id) < first_lane + offset)
+            output = input;
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for small integers size 32b or less)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+
+    /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        _T              input,              ///< [in] Calling thread's input item.
+        _T              &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op)            ///< [in] Binary scan operator
+    {
+        inclusive_output = input;
+
+        // Iterate scan steps
+        int segment_first_lane = 0;
+
+        // Iterate scan steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output = InclusiveScanStep(
+                inclusive_output,
+                scan_op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+
+    }
+
+    /// Inclusive scan, specialized for reduce-value-by-key
+    template <typename KeyT, typename ValueT, typename ReductionOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        KeyValuePair<KeyT, ValueT>      input,              ///< [in] Calling thread's input item.
+        KeyValuePair<KeyT, ValueT>      &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ReduceByKeyOp<ReductionOpT >    scan_op)            ///< [in] Binary scan operator
+    {
+        inclusive_output = input;
+
+        KeyT pred_key = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive_output.key, 1, 0, member_mask);
+
+        unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
+
+        // Mask away all lanes greater than ours
+        ballot = ballot & LaneMaskLe();
+
+        // Find index of first set bit
+        int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
+
+        // Iterate scan steps
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output.value = InclusiveScanStep(
+                inclusive_output.value,
+                scan_op.op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Grab aggregate from last warp lane
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Update inclusive and exclusive using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,          ///< [in]
+        T                       &inclusive,         ///< [in, out]
+        T                       &exclusive,         ///< [out]
+        ScanOpT                 /*scan_op*/,        ///< [in]
+        IsIntegerT              /*is_integer*/)     ///< [in]
+    {
+        // initial value unknown
+        exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+    }
+
+    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+        Update(input, inclusive, exclusive, scan_op, is_integer);
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+        Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/warp/specializations/warp_scan_smem.cuh b/thrust/dependencies/cub/cub/warp/specializations/warp_scan_smem.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ccd1de30f63f82b04694282aa9d60e329185ed81
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/specializations/warp_scan_smem.cuh
@@ -0,0 +1,397 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../config.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
+struct WarpScanSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
+    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+    unsigned int    member_mask;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        int         STEP,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &partial,
+        ScanOp                  scan_op,
+        Int2Type<STEP>          /*step*/)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share partial into buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
+
+        WARP_SYNC(member_mask);
+
+        // Update partial if addend is in range
+        if (HAS_IDENTITY || (lane_id >= OFFSET))
+        {
+            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
+            partial = scan_op(addend, partial);
+        }
+        WARP_SYNC(member_mask);
+
+        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+    }
+
+
+    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &/*partial*/,
+        ScanOp                  /*scan_op*/,
+        Int2Type<STEPS>         /*step*/)
+    {}
+
+
+    /// Inclusive prefix scan (specialized for summation across primitive types)
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        Sum                     scan_op,            ///< [in] Binary scan operator
+        Int2Type<true>          /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        T identity = 0;
+        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
+
+        WARP_SYNC(member_mask);
+
+        // Iterate scan steps
+        output = input;
+        ScanStep<true>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /// Inclusive prefix scan
+    template <typename ScanOp, int IS_PRIMITIVE>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp                  scan_op,            ///< [in] Binary scan operator
+        Int2Type<IS_PRIMITIVE>  /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        // Iterate scan steps
+        output = input;
+        ScanStep<false>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        if (lane_id == src_lane)
+        {
+            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
+        }
+
+        WARP_SYNC(member_mask);
+
+        return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Retrieve aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Update inclusive and exclusive using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,      ///< [in]
+        T                       &inclusive,     ///< [in, out]
+        T                       &exclusive,     ///< [out]
+        ScanOpT                 /*scan_op*/,    ///< [in]
+        IsIntegerT              /*is_integer*/) ///< [in]
+    {
+        // initial value unknown
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+    }
+
+    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 /*scan_op*/,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        cub::Sum                /*scan_o*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Broadcast warp aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+
+        // Update inclusive with initial value
+        inclusive = scan_op(initial_value, inclusive);
+
+        // Get exclusive from exclusive
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
+
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/warp/warp_reduce.cuh b/thrust/dependencies/cub/cub/warp/warp_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..50ee7056c30f4d3bb07e49083cd9ddefae314281
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/warp_reduce.cuh
@@ -0,0 +1,611 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "specializations/warp_reduce_shfl.cuh"
+#include "specializations/warp_reduce_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
+ *
+ * \tparam T                        The reduction input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
+ * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic reduction)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpReduce}
+ * \par
+ * The code snippet below illustrates four concurrent warp sum reductions within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for 4 warps
+ *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ *     int warp_id = threadIdx.x / 32;
+ *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+ * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
+ * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+ *
+ * \par
+ * The code snippet below illustrates a single warp sum reduction within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for one warp
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a reduction
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Return the warp-wide sum to lane0
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+    };
+
+public:
+
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+private:
+
+    /// Shared memory storage layout type for WarpReduce
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias())
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, cub::Sum());
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     *
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT                tail_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63,
+     * \p 95, and \p 127, respectively  (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/cub/warp/warp_scan.cuh b/thrust/dependencies/cub/cub/warp/warp_scan.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e9e95008a38721a670c3e8531f390ed40ed41cca
--- /dev/null
+++ b/thrust/dependencies/cub/cub/warp/warp_scan.cuh
@@ -0,0 +1,935 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../config.cuh"
+#include "specializations/warp_scan_shfl.cuh"
+#include "specializations/warp_scan_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_type.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
+ *
+ * \tparam T                        The scan input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - Supports non-commutative scan operators
+ * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
+ * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic scan)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpScan}
+ * \par
+ * The code snippet below illustrates four concurrent warp prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for 4 warps
+ *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Compute warp-wide prefix sums
+ *     int warp_id = threadIdx.x / 32;
+ *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data in each of the four warps of threads will be
+ * <tt>0, 1, 2, 3, ..., 31}</tt>.
+ *
+ * \par
+ * The code snippet below illustrates a single warp prefix sum within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for one warp
+ *     __shared__ typename WarpScan::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a prefix sum
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Compute warp-wide prefix sums
+ *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// Whether the data type is an integer (which has fully-associative addition)
+        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
+    };
+
+    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
+
+    /// Shared memory storage layout type for WarpScan
+    typedef typename InternalWarpScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpScan(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>1, 2, 3, ..., 32}</tt>.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>0, 1, 2, ..., 31}</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
+     *         thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,   ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Combination (inclusive & exclusive) prefix scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data exchange
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the warp-wide broadcasts of values from
+     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Broadcast from lane0 in each warp to all other threads in the warp
+     *     int warp_id = threadIdx.x / 32;
+     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p thread_data will be
+     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
+     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
+     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
+     */
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
+    }
+
+    //@}  end member group
+
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/dependencies/cub/eclipse code style profile.xml b/thrust/dependencies/cub/eclipse code style profile.xml
new file mode 100644
index 0000000000000000000000000000000000000000..3ca7f771cc2ea1c55c8e5db8b72e757bd315e9dc
--- /dev/null
+++ b/thrust/dependencies/cub/eclipse code style profile.xml	
@@ -0,0 +1,155 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<profiles version="1">
+<profile kind="CodeFormatterProfile" name="B40C" version="1">
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.lineSplit" value="80"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.tabulation.size" value="4"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_enumerator_list" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_declarator_list" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_empty_lines" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_method_declaration" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_type_declaration" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.continuation_indentation" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_expression_list" value="0"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_conditional_expression" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_compact_if" value="0"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_body" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_block" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.tabulation.char" value="space"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.compact_else_if" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indentation.size" value="4"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_array_initializer" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments" value="do not insert"/>
+</profile>
+</profiles>
diff --git a/thrust/dependencies/cub/examples/CMakeLists.txt b/thrust/dependencies/cub/examples/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eabb7d5009a09c479992133e0cf13d331ad5f097
--- /dev/null
+++ b/thrust/dependencies/cub/examples/CMakeLists.txt
@@ -0,0 +1,58 @@
+# Create meta targets that build all examples for a single configuration:
+foreach(cub_target IN LISTS CUB_TARGETS)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+  set(config_meta_target ${config_prefix}.examples)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+## cub_add_example
+#
+# Add an example executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the example
+#   target. Useful for post-processing target information per-backend.
+# example_name: The name of the example minus "<config_prefix>.example." For
+#   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
+#   would be "cuda.copy".
+# example_src: The source file that implements the example.
+# cub_target: The reference cub target with configuration information.
+#
+function(cub_add_example target_name_var example_name example_src cub_target)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+
+  # The actual name of the test's target:
+  set(example_target ${config_prefix}.example.${example_name})
+  set(${target_name_var} ${example_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_meta_target ${config_prefix}.examples)
+  set(example_meta_target cub.all.example.${example_name})
+
+  add_executable(${example_target} "${example_src}")
+  target_link_libraries(${example_target} ${cub_target})
+  cub_clone_target_properties(${example_target} ${cub_target})
+  target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${example_target})
+
+  # Meta target that builds examples with this name for all configurations:
+  if (NOT TARGET ${example_meta_target})
+    add_custom_target(${example_meta_target})
+  endif()
+  add_dependencies(${example_meta_target} ${example_target})
+
+  if (CUB_ENABLE_EXAMPLES_WITH_RDC)
+    set_target_properties(${example_target} PROPERTIES
+      CUDA_SEPARABLE_COMPILATION ON
+    )
+  endif()
+
+  add_test(NAME ${example_target}
+    COMMAND "$<TARGET_FILE:${example_target}>"
+  )
+endfunction()
+
+add_subdirectory(block)
+add_subdirectory(device)
diff --git a/thrust/dependencies/cub/examples/block/.gitignore b/thrust/dependencies/cub/examples/block/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9dad9639065c8a1ed0ac787f39bf01ca48001c87
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/.gitignore
@@ -0,0 +1,7 @@
+/bin
+/Debug
+/Release
+/cuda55.sdf
+/cuda55.suo
+/cuda60.sdf
+/cuda60.suo
diff --git a/thrust/dependencies/cub/examples/block/CMakeLists.txt b/thrust/dependencies/cub/examples/block/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cfca5720a85baad31c45b6def5ebce35f54524db
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB_RECURSE example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  example_*.cu
+)
+
+foreach (cub_target IN LISTS CUB_TARGETS)
+  foreach (example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WE)
+    string(REGEX REPLACE
+      "^example_block_" "block."
+      example_name "${example_name}"
+    )
+    cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
+  endforeach()
+endforeach()
diff --git a/thrust/dependencies/cub/examples/block/Makefile b/thrust/dependencies/cub/examples/block/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b173c2a02f2c77b8b6f51546e2ed422d2d02d4d2
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/Makefile
@@ -0,0 +1,128 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../../common.mk 
+ 
+ 
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+		
+ALL = 	example_block_radix_sort \
+	 	example_block_reduce \
+	 	example_block_scan
+		
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+#-------------------------------------------------------------------------------
+# make run
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
+
+
+
+
+#-------------------------------------------------------------------------------
+# make example_block_reduce
+#-------------------------------------------------------------------------------
+
+example_block_reduce: bin/example_block_reduce_$(BIN_SUFFIX)
+
+bin/example_block_reduce_$(BIN_SUFFIX) : example_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_reduce_$(BIN_SUFFIX) example_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_block_scan
+#-------------------------------------------------------------------------------
+
+example_block_scan: bin/example_block_scan_$(BIN_SUFFIX)
+
+bin/example_block_scan_$(BIN_SUFFIX) : example_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_scan_$(BIN_SUFFIX) example_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_block_radix_sort
+#-------------------------------------------------------------------------------
+
+example_block_radix_sort: bin/example_block_radix_sort_$(BIN_SUFFIX)
+
+bin/example_block_radix_sort_$(BIN_SUFFIX) : example_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_radix_sort_$(BIN_SUFFIX) example_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
diff --git a/thrust/dependencies/cub/examples/block/example_block_radix_sort.cu b/thrust/dependencies/cub/examples/block/example_block_radix_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2fbeda9016df8dded69b3d6b4e5a765cd5c85159
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/example_block_radix_sort.cu
@@ -0,0 +1,323 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockRadixSort
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+#include <algorithm>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_radix_sort.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output
+bool g_verbose = false;
+
+/// Timing iterations
+int g_timing_iterations = 100;
+
+/// Default grid size
+int g_grid_size = 1;
+
+/// Uniform key samples
+bool g_uniform_keys;
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Simple kernel for performing a block-wide sorting over integers
+ */
+template <
+    typename    Key,
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD>
+__launch_bounds__ (BLOCK_THREADS)
+__global__ void BlockSortKernel(
+    Key         *d_in,          // Tile of input
+    Key         *d_out,         // Tile of output
+    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
+{
+    enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
+    typedef BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
+
+    // Specialize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+    // Shared memory
+    __shared__ union TempStorage
+    {
+        typename BlockLoadT::TempStorage        load;
+        typename BlockRadixSortT::TempStorage   sort;
+    } temp_storage;
+
+    // Per-thread tile items
+    Key items[ITEMS_PER_THREAD];
+
+    // Our current block's offset
+    int block_offset = blockIdx.x * TILE_SIZE;
+
+    // Load items into a blocked arrangement
+    BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+
+    // Barrier for smem reuse
+    __syncthreads();
+
+    // Start cycle timer
+    clock_t start = clock();
+
+    // Sort keys
+    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
+
+    // Stop cycle timer
+    clock_t stop = clock();
+
+    // Store output in striped fashion
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+    // Store elapsed clocks
+    if (threadIdx.x == 0)
+    {
+        d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize sorting problem (and solution).
+ */
+template <typename Key>
+void Initialize(
+    Key *h_in,
+    Key *h_reference,
+    int num_items,
+    int tile_size)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (g_uniform_keys)
+        {
+            h_in[i] = 0;
+        }
+        else
+        {
+            RandomBits(h_in[i]);
+        }
+        h_reference[i] = h_in[i];
+    }
+
+    // Only sort the first tile
+    std::sort(h_reference, h_reference + tile_size);
+}
+
+
+/**
+ * Test BlockScan
+ */
+template <
+    typename    Key,
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD>
+void Test()
+{
+    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays
+    Key *h_in               = new Key[TILE_SIZE * g_grid_size];
+    Key *h_reference        = new Key[TILE_SIZE * g_grid_size];
+    clock_t *h_elapsed      = new clock_t[g_grid_size];
+
+    // Initialize problem and reference output on host
+    Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE);
+
+    // Initialize device arrays
+    Key *d_in       = NULL;
+    Key *d_out      = NULL;
+    clock_t *d_elapsed  = NULL;
+    CubDebugExit(cudaMalloc((void**)&d_in,          sizeof(Key) * TILE_SIZE * g_grid_size));
+    CubDebugExit(cudaMalloc((void**)&d_out,         sizeof(Key) * TILE_SIZE * g_grid_size));
+    CubDebugExit(cudaMalloc((void**)&d_elapsed,     sizeof(clock_t) * g_grid_size));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+            std::cout << h_in[i] << ", ";
+        printf("\n\n");
+    }
+
+    // Kernel props
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));
+
+    // Copy problem to device
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice));
+
+    printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+    fflush(stdout);
+
+    // Run kernel once to prime caches and check result
+    BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
+        d_in,
+        d_out,
+        d_elapsed);
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results
+    printf("\tOutput items: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    fflush(stdout);
+
+    // Run this several times and average the performance results
+    GpuTimer            timer;
+    float               elapsed_millis          = 0.0;
+    unsigned long long  elapsed_clocks          = 0;
+
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        timer.Start();
+
+        // Run kernel
+        BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            d_elapsed);
+
+        timer.Stop();
+        elapsed_millis += timer.ElapsedMillis();
+
+        // Copy clocks from device
+        CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
+        for (int i = 0; i < g_grid_size; i++)
+            elapsed_clocks += h_elapsed[i];
+    }
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Display timing results
+    float avg_millis            = elapsed_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
+    double avg_clocks           = double(elapsed_clocks) / g_timing_iterations / g_grid_size;
+    double avg_clocks_per_item  = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+    fflush(stdout);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_elapsed) delete[] h_elapsed;
+    if (d_in) CubDebugExit(cudaFree(d_in));
+    if (d_out) CubDebugExit(cudaFree(d_out));
+    if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_uniform_keys = args.CheckCmdLineFlag("uniform");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations (default:%d)>]"
+            "[--grid-size=<grid size (default:%d)>]"
+            "[--v] "
+            "\n", argv[0], g_timing_iterations, g_grid_size);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    fflush(stdout);
+
+    // Run tests
+    printf("\nuint32:\n"); fflush(stdout);
+    Test<unsigned int, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    printf("\nfp32:\n"); fflush(stdout);
+    Test<float, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    printf("\nuint8:\n"); fflush(stdout);
+    Test<unsigned char, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/examples/block/example_block_reduce.cu b/thrust/dependencies/cub/examples/block/example_block_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bad8001309e840142e944ed9cebfe98ec90d9f51
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/example_block_reduce.cu
@@ -0,0 +1,290 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockReduce
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_reduce.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output
+bool g_verbose = false;
+
+/// Timing iterations
+int g_timing_iterations = 100;
+
+/// Default grid size
+int g_grid_size = 1;
+
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Simple kernel for performing a block-wide exclusive prefix sum over integers
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockReduceAlgorithm    ALGORITHM>
+__global__ void BlockSumKernel(
+    int         *d_in,          // Tile of input
+    int         *d_out,         // Tile aggregate
+    clock_t     *d_elapsed)     // Elapsed cycle count of block reduction
+{
+    // Specialize BlockReduce type for our thread block
+    typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> BlockReduceT;
+
+    // Shared memory
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    // Per-thread tile data
+    int data[ITEMS_PER_THREAD];
+    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, data);
+
+    // Start cycle timer
+    clock_t start = clock();
+
+    // Compute sum
+    int aggregate = BlockReduceT(temp_storage).Sum(data);
+
+    // Stop cycle timer
+    clock_t stop = clock();
+
+    // Store aggregate and elapsed clocks
+    if (threadIdx.x == 0)
+    {
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+        *d_out = aggregate;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+/**
+ * Initialize reduction problem (and solution).
+ * Returns the aggregate
+ */
+int Initialize(int *h_in, int num_items)
+{
+    int inclusive = 0;
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_in[i] = i % 17;
+        inclusive += h_in[i];
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * Test thread block reduction
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockReduceAlgorithm    ALGORITHM>
+void Test()
+{
+    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays
+    int *h_in           = new int[TILE_SIZE];
+    int *h_gpu          = new int[TILE_SIZE + 1];
+
+    // Initialize problem and reference output on host
+    int h_aggregate = Initialize(h_in, TILE_SIZE);
+
+    // Initialize device arrays
+    int *d_in           = NULL;
+    int *d_out          = NULL;
+    clock_t *d_elapsed  = NULL;
+    cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
+    cudaMalloc((void**)&d_out,         sizeof(int) * 1);
+    cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+            printf("%d, ", h_in[i]);
+        printf("\n\n");
+    }
+
+    // Kernel props
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
+
+    // Copy problem to device
+    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
+
+    printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
+        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+
+    // Run aggregate/prefix kernel
+    BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+        d_in,
+        d_out,
+        d_elapsed);
+
+    // Check total aggregate
+    printf("\tAggregate: ");
+    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Run this several times and average the performance results
+    GpuTimer    timer;
+    float       elapsed_millis          = 0.0;
+    clock_t     elapsed_clocks          = 0;
+
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Copy problem to device
+        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
+
+        timer.Start();
+
+        // Run aggregate/prefix kernel
+        BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            d_elapsed);
+
+        timer.Stop();
+        elapsed_millis += timer.ElapsedMillis();
+
+        // Copy clocks from device
+        clock_t clocks;
+        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
+        elapsed_clocks += clocks;
+
+    }
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Display timing results
+    float avg_millis            = elapsed_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
+    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
+    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_gpu) delete[] h_gpu;
+    if (d_in) cudaFree(d_in);
+    if (d_out) cudaFree(d_out);
+    if (d_elapsed) cudaFree(d_elapsed);
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations>] "
+            "[--grid-size=<grid size>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Run tests
+    Test<1024, 1, BLOCK_REDUCE_RAKING>();
+    Test<512, 2, BLOCK_REDUCE_RAKING>();
+    Test<256, 4, BLOCK_REDUCE_RAKING>();
+    Test<128, 8, BLOCK_REDUCE_RAKING>();
+    Test<64, 16, BLOCK_REDUCE_RAKING>();
+    Test<32, 32, BLOCK_REDUCE_RAKING>();
+    Test<16, 64, BLOCK_REDUCE_RAKING>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>();
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/examples/block/example_block_scan.cu b/thrust/dependencies/cub/examples/block/example_block_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fa709a56cde1120795192d8002a549b53b139f5c
--- /dev/null
+++ b/thrust/dependencies/cub/examples/block/example_block_scan.cu
@@ -0,0 +1,334 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockScan
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_scan.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output
+bool g_verbose = false;
+
+/// Timing iterations
+int g_timing_iterations = 100;
+
+/// Default grid size
+int g_grid_size = 1;
+
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Simple kernel for performing a block-wide exclusive prefix sum over integers
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockScanAlgorithm      ALGORITHM>
+__global__ void BlockPrefixSumKernel(
+    int         *d_in,          // Tile of input
+    int         *d_out,         // Tile of output
+    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
+{
+    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
+    typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
+
+    // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
+    typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
+
+    // Specialize BlockScan type for our thread block
+    typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> BlockScanT;
+
+    // Shared memory
+    __shared__ union TempStorage
+    {
+        typename BlockLoadT::TempStorage    load;
+        typename BlockStoreT::TempStorage   store;
+        typename BlockScanT::TempStorage    scan;
+    } temp_storage;
+
+    // Per-thread tile data
+    int data[ITEMS_PER_THREAD];
+
+    // Load items into a blocked arrangement
+    BlockLoadT(temp_storage.load).Load(d_in, data);
+
+    // Barrier for smem reuse
+    __syncthreads();
+
+    // Start cycle timer
+    clock_t start = clock();
+
+    // Compute exclusive prefix sum
+    int aggregate;
+    BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);
+
+    // Stop cycle timer
+    clock_t stop = clock();
+
+    // Barrier for smem reuse
+    __syncthreads();
+
+    // Store items from a blocked arrangement
+    BlockStoreT(temp_storage.store).Store(d_out, data);
+
+    // Store aggregate and elapsed clocks
+    if (threadIdx.x == 0)
+    {
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+        d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+/**
+ * Initialize exclusive prefix sum problem (and solution).
+ * Returns the aggregate
+ */
+int Initialize(
+    int *h_in,
+    int *h_reference,
+    int num_items)
+{
+    int inclusive = 0;
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_in[i] = i % 17;
+
+        h_reference[i] = inclusive;
+        inclusive += h_in[i];
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * Test thread block scan
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockScanAlgorithm  ALGORITHM>
+void Test()
+{
+    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays
+    int *h_in           = new int[TILE_SIZE];
+    int *h_reference    = new int[TILE_SIZE];
+    int *h_gpu          = new int[TILE_SIZE + 1];
+
+    // Initialize problem and reference output on host
+    int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
+
+    // Initialize device arrays
+    int *d_in           = NULL;
+    int *d_out          = NULL;
+    clock_t *d_elapsed  = NULL;
+    cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
+    cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1));
+    cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+            printf("%d, ", h_in[i]);
+        printf("\n\n");
+    }
+
+    // Kernel props
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
+
+    // Copy problem to device
+    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
+
+    printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
+        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+
+    // Run aggregate/prefix kernel
+    BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+        d_in,
+        d_out,
+        d_elapsed);
+
+    // Check results
+    printf("\tOutput items: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check total aggregate
+    printf("\tAggregate: ");
+    compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Run this several times and average the performance results
+    GpuTimer    timer;
+    float       elapsed_millis          = 0.0;
+    clock_t     elapsed_clocks          = 0;
+
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Copy problem to device
+        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
+
+        timer.Start();
+
+        // Run aggregate/prefix kernel
+        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            d_elapsed);
+
+        timer.Stop();
+        elapsed_millis += timer.ElapsedMillis();
+
+        // Copy clocks from device
+        clock_t clocks;
+        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
+        elapsed_clocks += clocks;
+
+    }
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Display timing results
+    float avg_millis            = elapsed_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
+    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
+    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_gpu) delete[] h_gpu;
+    if (d_in) cudaFree(d_in);
+    if (d_out) cudaFree(d_out);
+    if (d_elapsed) cudaFree(d_elapsed);
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations (default:%d)>]"
+            "[--grid-size=<grid size (default:%d)>]"
+            "[--v] "
+            "\n", argv[0], g_timing_iterations, g_grid_size);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Run tests
+    Test<1024, 1, BLOCK_SCAN_RAKING>();
+    Test<512, 2, BLOCK_SCAN_RAKING>();
+    Test<256, 4, BLOCK_SCAN_RAKING>();
+    Test<128, 8, BLOCK_SCAN_RAKING>();
+    Test<64, 16, BLOCK_SCAN_RAKING>();
+    Test<32, 32, BLOCK_SCAN_RAKING>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_SCAN_WARP_SCANS>();
+    Test<512, 2, BLOCK_SCAN_WARP_SCANS>();
+    Test<256, 4, BLOCK_SCAN_WARP_SCANS>();
+    Test<128, 8, BLOCK_SCAN_WARP_SCANS>();
+    Test<64, 16, BLOCK_SCAN_WARP_SCANS>();
+    Test<32, 32, BLOCK_SCAN_WARP_SCANS>();
+
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/examples/device/.gitignore b/thrust/dependencies/cub/examples/device/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7032b5ac752727d574fb518651131ba5cc25fd79
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/.gitignore
@@ -0,0 +1,8 @@
+/bin
+/Debug
+/ipch
+/Release
+/cuda55.sdf
+/cuda55.suo
+/cuda60.sdf
+/cuda60.suo
diff --git a/thrust/dependencies/cub/examples/device/CMakeLists.txt b/thrust/dependencies/cub/examples/device/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19d412cfb8020b35e345e0ef1e8a1317e35e678d
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB_RECURSE example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  example_*.cu
+)
+
+foreach (cub_target IN LISTS CUB_TARGETS)
+  foreach (example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WE)
+    string(REGEX REPLACE
+      "^example_device_" "device."
+      example_name "${example_name}"
+    )
+    cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
+  endforeach()
+endforeach()
diff --git a/thrust/dependencies/cub/examples/device/Makefile b/thrust/dependencies/cub/examples/device/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..fea1494e823291746dfcfbfd6ca68822aa3824fe
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/Makefile
@@ -0,0 +1,197 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../../common.mk 
+ 
+ 
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+		
+ALL = 	example_device_partition_flagged \
+		example_device_partition_if \
+	 	example_device_radix_sort \
+		example_device_reduce \
+	 	example_device_scan \
+	 	example_device_select_unique \
+		example_device_select_flagged \
+		example_device_select_if \
+		example_device_sort_find_non_trivial_runs
+		
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+#-------------------------------------------------------------------------------
+# make run
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
+
+
+#-------------------------------------------------------------------------------
+# make example_device_reduce
+#-------------------------------------------------------------------------------
+
+example_device_reduce: bin/example_device_reduce_$(BIN_SUFFIX)
+
+bin/example_device_reduce_$(BIN_SUFFIX) : example_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_reduce_$(BIN_SUFFIX) example_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_partition_flagged
+#-------------------------------------------------------------------------------
+
+example_device_partition_flagged: bin/example_device_partition_flagged_$(BIN_SUFFIX)
+
+bin/example_device_partition_flagged_$(BIN_SUFFIX) : example_device_partition_flagged.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_flagged_$(BIN_SUFFIX) example_device_partition_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_partition_if
+#-------------------------------------------------------------------------------
+
+example_device_partition_if: bin/example_device_partition_if_$(BIN_SUFFIX)
+
+bin/example_device_partition_if_$(BIN_SUFFIX) : example_device_partition_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_if_$(BIN_SUFFIX) example_device_partition_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_scan
+#-------------------------------------------------------------------------------
+
+example_device_scan: bin/example_device_scan_$(BIN_SUFFIX)
+
+bin/example_device_scan_$(BIN_SUFFIX) : example_device_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_scan_$(BIN_SUFFIX) example_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_radix_sort
+#-------------------------------------------------------------------------------
+
+example_device_radix_sort: bin/example_device_radix_sort_$(BIN_SUFFIX)
+
+bin/example_device_radix_sort_$(BIN_SUFFIX) : example_device_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_radix_sort_$(BIN_SUFFIX) example_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_select_unique
+#-------------------------------------------------------------------------------
+
+example_device_select_unique: bin/example_device_select_unique_$(BIN_SUFFIX)
+
+bin/example_device_select_unique_$(BIN_SUFFIX) : example_device_select_unique.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_unique_$(BIN_SUFFIX) example_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_select_flagged
+#-------------------------------------------------------------------------------
+
+example_device_select_flagged: bin/example_device_select_flagged_$(BIN_SUFFIX)
+
+bin/example_device_select_flagged_$(BIN_SUFFIX) : example_device_select_flagged.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_flagged_$(BIN_SUFFIX) example_device_select_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_select_if
+#-------------------------------------------------------------------------------
+
+example_device_select_if: bin/example_device_select_if_$(BIN_SUFFIX)
+
+bin/example_device_select_if_$(BIN_SUFFIX) : example_device_select_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_if_$(BIN_SUFFIX) example_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_sort_find_non_trivial_runs
+#-------------------------------------------------------------------------------
+
+example_device_sort_find_non_trivial_runs: bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX)
+
+bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) : example_device_sort_find_non_trivial_runs.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) example_device_sort_find_non_trivial_runs.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_partition_flagged.cu b/thrust/dependencies/cub/examples/device/example_device_partition_flagged.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ae02b3c52566dfa4914160a8645f052f829c6504
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_partition_flagged.cu
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DevicePartition::Flagged().
+ *
+ * Partition flagged items from from a sequence of int keys using a
+ * corresponding sequence of unsigned char flags.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_partition_flagged.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_partition.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem, setting flags at distances of random length
+ * chosen from [1..max_segment]
+ */
+void Initialize(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             num_items,
+    int             max_segment)
+{
+    unsigned short max_short = (unsigned short) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            h_flags[j] = 0;
+            h_in[j] = key;
+            j++;
+        }
+
+        h_flags[i] = 1;
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("Flags:\n");
+        DisplayResults(h_flags, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+int Solve(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (h_flags[i])
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+        else
+        {
+            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int             *h_in        = new int[num_items];
+    int             *h_reference = new int[num_items];
+    unsigned char   *h_flags     = new unsigned char[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, h_flags, num_items, max_segment);
+    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
+
+    printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int             *d_in = NULL;
+    unsigned char   *d_flags = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_partition_if.cu b/thrust/dependencies/cub/examples/device/example_device_partition_if.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7bf1c163e060011f6365410780b80ae3a6b9533c
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_partition_if.cu
@@ -0,0 +1,244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DevicePartition::If().
+ *
+ * Partitions items from from a sequence of int keys using a
+ * section functor (greater-than)
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_partition.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/// Selection functor type
+struct GreaterThan
+{
+    int compare;
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int compare) : compare(compare) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &a) const {
+        return (a > compare);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem, setting runs of random length chosen from [1..max_segment]
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
+        unsigned short max_short = (unsigned short) -1;
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            h_in[j] = key;
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+template <typename SelectOp>
+int Solve(
+    int             *h_in,
+    SelectOp        select_op,
+    int             *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (select_op(h_in[i]))
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+        else
+        {
+            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int *h_in        = new int[num_items];
+    int *h_reference = new int[num_items];
+
+    // DevicePartition a pivot index
+    unsigned int pivot_index;
+    unsigned int max_int = (unsigned int) -1;
+    RandomBits(pivot_index);
+    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
+    printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items, max_segment);
+    GreaterThan select_op(h_in[pivot_index]);
+
+    int num_selected = Solve(h_in, select_op, h_reference, num_items);
+
+    printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_radix_sort.cu b/thrust/dependencies/cub/examples/device/example_device_radix_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ea7e62c457229d6d8c45e9ae79c2c56dbe85ace
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_radix_sort.cu
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceRadixSort::SortPairs().
+ *
+ * Sorts an array of float keys paired with a corresponding array of int values.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Simple key-value pairing for floating point types.  Distinguishes
+ * between positive and negative zero.
+ */
+struct Pair
+{
+    float   key;
+    int     value;
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key)
+            return true;
+
+        if (key > b.key)
+            return false;
+
+        // Return true if key is negative zero and b.key is positive zero
+        unsigned int key_bits   = SafeBitCast<unsigned int>(key);
+        unsigned int b_key_bits = SafeBitCast<unsigned int>(b.key);
+        unsigned int HIGH_BIT   = 1u << 31;
+
+        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
+    }
+};
+
+
+/**
+ * Initialize key-value sorting problem.
+ */
+void Initialize(
+    float           *h_keys,
+    int             *h_values,
+    float           *h_reference_keys,
+    int             *h_reference_values,
+    int             num_items)
+{
+    Pair *h_pairs = new Pair[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        RandomBits(h_keys[i]);
+        RandomBits(h_values[i]);
+        h_pairs[i].key    = h_keys[i];
+        h_pairs[i].value  = h_values[i];
+    }
+
+    if (g_verbose)
+    {
+        printf("Input keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+
+        printf("Input values:\n");
+        DisplayResults(h_values, num_items);
+        printf("\n\n");
+    }
+
+    std::stable_sort(h_pairs, h_pairs + num_items);
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_reference_keys[i]     = h_pairs[i].key;
+        h_reference_values[i]   = h_pairs[i].value;
+    }
+
+    delete[] h_pairs;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n",
+        num_items, int(sizeof(float)), int(sizeof(int)));
+    fflush(stdout);
+
+    // Allocate host arrays
+    float   *h_keys             = new float[num_items];
+    float   *h_reference_keys   = new float[num_items];
+    int     *h_values           = new int[num_items];
+    int     *h_reference_values = new int[num_items];
+
+    // Initialize problem and solution on host
+    Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items);
+
+    // Allocate device arrays
+    DoubleBuffer<float> d_keys;
+    DoubleBuffer<int>   d_values;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
+
+    // Allocate temporary storage
+    size_t  temp_storage_bytes  = 0;
+    void    *d_temp_storage     = NULL;
+
+    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Initialize device arrays
+    CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Run
+    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
+    printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
+    printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_keys) delete[] h_keys;
+    if (h_reference_keys) delete[] h_reference_keys;
+    if (h_values) delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+
+    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
+    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
+    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
+    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_reduce.cu b/thrust/dependencies/cub/examples/device/example_device_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fc8fddb0e205408794dc32d96044b7b523428afe
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_reduce.cu
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceReduce::Sum().
+ *
+ * Sums an array of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_reduce.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_reduce.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem
+ */
+void Initialize(
+    int   *h_in,
+    int     num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+        h_in[i] = i;
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Compute solution
+ */
+void Solve(
+    int           *h_in,
+    int           &h_reference,
+    int             num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (i == 0)
+            h_reference = h_in[0];
+        else
+            h_reference += h_in[i];
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n",
+        num_items, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate host arrays
+    int* h_in = new int[num_items];
+    int  h_reference;
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items);
+    Solve(h_in, h_reference, num_items);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array
+    int *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1));
+
+    // Request and allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_scan.cu b/thrust/dependencies/cub/examples/device/example_device_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3c85526b5b13f1bedb9fe30c31f910f9f8a4f9e6
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_scan.cu
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceScan::ExclusiveSum().
+ *
+ * Computes an exclusive sum of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_scan.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_scan.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+void Initialize(
+    int        *h_in,
+    int          num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+        h_in[i] = i;
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+/**
+ * Solve exclusive-scan problem
+ */
+int Solve(
+    int           *h_in,
+    int           *h_reference,
+    int             num_items)
+{
+    int inclusive = 0;
+    int aggregate = 0;
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_reference[i] = inclusive;
+        inclusive += h_in[i];
+        aggregate += h_in[i];
+    }
+
+    return aggregate;
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n",
+        num_items, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate host arrays
+    int*  h_in = new int[num_items];
+    int*  h_reference = new int[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items);
+    Solve(h_in, h_reference, num_items);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array
+    int *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_select_flagged.cu b/thrust/dependencies/cub/examples/device/example_device_select_flagged.cu
new file mode 100644
index 0000000000000000000000000000000000000000..12581f89e005af29fa7e7e7c94abe27c5ce7781a
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_select_flagged.cu
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::Flagged().
+ *
+ * Selects flagged items from from a sequence of int keys using a
+ * corresponding sequence of unsigned char flags.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem, setting flags at distances of random length
+ * chosen from [1..max_segment]
+ */
+void Initialize(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             num_items,
+    int             max_segment)
+{
+    unsigned short max_short = (unsigned short) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            h_flags[j] = 0;
+            h_in[j] = key;
+            j++;
+        }
+
+        h_flags[i] = 1;
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("Flags:\n");
+        DisplayResults(h_flags, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+int Solve(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (h_flags[i])
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+        else
+        {
+            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int             *h_in        = new int[num_items];
+    int             *h_reference = new int[num_items];
+    unsigned char   *h_flags     = new unsigned char[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, h_flags, num_items, max_segment);
+    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
+
+    printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int             *d_in = NULL;
+    unsigned char   *d_flags = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_select_if.cu b/thrust/dependencies/cub/examples/device/example_device_select_if.cu
new file mode 100644
index 0000000000000000000000000000000000000000..689c99b96d6cb7ff2038a818731e9471da41f185
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_select_if.cu
@@ -0,0 +1,242 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::If().
+ *
+ * Selects items from from a sequence of int keys using a
+ * section functor (greater-than)
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/// Selection functor type
+struct GreaterThan
+{
+    int compare;
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int compare) : compare(compare) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &a) const {
+        return (a > compare);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem, setting runs of random length chosen from [1..max_segment]
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
+        unsigned short max_short = (unsigned short) -1;
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            h_in[j] = key;
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+template <typename SelectOp>
+int Solve(
+    int             *h_in,
+    SelectOp        select_op,
+    int             *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        if (select_op(h_in[i]))
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+        else
+        {
+            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int *h_in        = new int[num_items];
+    int *h_reference = new int[num_items];
+
+    // Select a pivot index
+    unsigned int pivot_index;
+    unsigned int max_int = (unsigned int) -1;
+    RandomBits(pivot_index);
+    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
+    printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items, max_segment);
+    GreaterThan select_op(h_in[pivot_index]);
+
+    int num_selected = Solve(h_in, select_op, h_reference, num_items);
+
+    printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_select_unique.cu b/thrust/dependencies/cub/examples/device/example_device_select_unique.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e9cefd5b8dd30f61c3d136d284b159c0ddc29d35
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_select_unique.cu
@@ -0,0 +1,221 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::Unique().
+ *
+ * Selects the first element from each run of identical values from a sequence
+ * of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem, setting runs of random length chosen from [1..max_segment]
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
+        unsigned short max_short = (unsigned short) -1;
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            h_in[j] = key;
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+int Solve(
+    int         *h_in,
+    int         *h_reference,
+    int         num_items)
+{
+    int num_selected = 0;
+    if (num_items > 0)
+    {
+        h_reference[num_selected] = h_in[0];
+        num_selected++;
+    }
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (h_in[i] != h_in[i - 1])
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int*  h_in        = new int[num_items];
+    int*  h_reference = new int[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items, max_segment);
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n",
+        num_items, (int) sizeof(int), num_selected, num_items / num_selected);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/examples/device/example_device_sort_find_non_trivial_runs.cu b/thrust/dependencies/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ed7024840c2b8b844828becefb8bbc5f04f3ffa7
--- /dev/null
+++ b/thrust/dependencies/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
@@ -0,0 +1,384 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of sorting a sequence of keys and values (each pair is a
+ * randomly-selected int32 paired with its original offset in the unsorted sequence), and then
+ * isolating all maximal, non-trivial (having length > 1) "runs" of duplicates.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Simple key-value pairing for using std::sort on key-value pairs.
+ */
+template <typename Key, typename Value>
+struct Pair
+{
+    Key     key;
+    Value   value;
+
+    bool operator<(const Pair &b) const
+    {
+        return (key < b.key);
+    }
+};
+
+
+/**
+ * Pair ostream operator
+ */
+template <typename Key, typename Value>
+std::ostream& operator<<(std::ostream& os, const Pair<Key, Value>& val)
+{
+    os << '<' << val.key << ',' << val.value << '>';
+    return os;
+}
+
+
+/**
+ * Initialize problem
+ */
+template <typename Key, typename Value>
+void Initialize(
+    Key    *h_keys,
+    Value  *h_values,
+    int    num_items,
+    int    max_key)
+{
+    float scale = float(max_key) / float(UINT_MAX);
+    for (int i = 0; i < num_items; ++i)
+    {
+        Key sample;
+        RandomBits(sample);
+        h_keys[i] = (max_key == -1) ? i : (Key) (scale * sample);
+        h_values[i] = i;
+    }
+
+    if (g_verbose)
+    {
+        printf("Keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+
+        printf("Values:\n");
+        DisplayResults(h_values, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve sorted non-trivial subrange problem.  Returns the number
+ * of non-trivial runs found.
+ */
+template <typename Key, typename Value>
+int Solve(
+    Key     *h_keys,
+    Value   *h_values,
+    int     num_items,
+    int     *h_offsets_reference,
+    int     *h_lengths_reference)
+{
+    // Sort
+
+    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_pairs[i].key    = h_keys[i];
+        h_pairs[i].value  = h_values[i];
+    }
+
+    std::stable_sort(h_pairs, h_pairs + num_items);
+
+    if (g_verbose)
+    {
+        printf("Sorted pairs:\n");
+        DisplayResults(h_pairs, num_items);
+        printf("\n\n");
+    }
+
+    // Find non-trivial runs
+
+    Key     previous        = h_pairs[0].key;
+    int     length          = 1;
+    int     num_runs        = 0;
+    int     run_begin       = 0;
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (previous != h_pairs[i].key)
+        {
+            if (length > 1)
+            {
+                h_offsets_reference[num_runs]     = run_begin;
+                h_lengths_reference[num_runs]     = length;
+                num_runs++;
+            }
+            length = 1;
+            run_begin = i;
+        }
+        else
+        {
+            length++;
+        }
+        previous = h_pairs[i].key;
+    }
+
+    if (length > 1)
+    {
+        h_offsets_reference[num_runs]   = run_begin;
+        h_lengths_reference[num_runs]   = length;
+        num_runs++;
+    }
+
+    delete[] h_pairs;
+
+    return num_runs;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    typedef unsigned int    Key;
+    typedef int             Value;
+
+    int timing_iterations   = 0;
+    int num_items           = 40;
+    Key max_key             = 20;       // Max item
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxkey", max_key);
+    args.GetCmdLineArgument("i", timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations> "
+            "[--n=<input items, default 40> "
+            "[--maxkey=<max key, default 20 (use -1 to test only unique keys)>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays (problem and reference solution)
+
+    Key     *h_keys                 = new Key[num_items];
+    Value   *h_values               = new Value[num_items];
+    int     *h_offsets_reference    = new int[num_items];
+    int     *h_lengths_reference    = new int[num_items];
+
+    // Initialize key-value pairs and compute reference solution (sort them, and identify non-trivial runs)
+    printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key);
+    fflush(stdout);
+
+    Initialize(h_keys, h_values, num_items, max_key);
+    int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);
+
+    printf("%d non-trivial runs\n", num_runs);
+    fflush(stdout);
+
+    // Repeat for performance timing
+    GpuTimer gpu_timer;
+    GpuTimer gpu_rle_timer;
+    float elapsed_millis = 0.0;
+    float elapsed_rle_millis = 0.0;
+    for (int i = 0; i <= timing_iterations; ++i)
+    {
+
+        // Allocate and initialize device arrays for sorting
+        DoubleBuffer<Key>       d_keys;
+        DoubleBuffer<Value>     d_values;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
+
+        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+        // Start timer
+        gpu_timer.Start();
+
+        // Allocate temporary storage for sorting
+        size_t  temp_storage_bytes  = 0;
+        void    *d_temp_storage     = NULL;
+        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Do the sort
+        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+
+        // Free unused buffers and sorting temporary storage
+        if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
+        if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+        // Start timer
+        gpu_rle_timer.Start();
+
+        // Allocate device arrays for enumerating non-trivial runs
+        int     *d_offests_out   = NULL;
+        int     *d_lengths_out   = NULL;
+        int     *d_num_runs      = NULL;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
+
+        // Allocate temporary storage for isolating non-trivial runs
+        d_temp_storage = NULL;
+        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys.d_buffers[d_keys.selector],
+            d_offests_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Do the isolation
+        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys.d_buffers[d_keys.selector],
+            d_offests_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items));
+
+        // Free keys buffer
+        if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
+
+        //
+        // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
+        //
+
+        // Stop sort timer
+        gpu_timer.Stop();
+        gpu_rle_timer.Stop();
+
+        if (i == 0)
+        {
+            // First iteration is a warmup: // Check for correctness (and display results, if specified)
+
+            printf("\nRUN OFFSETS: \n");
+            int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            printf("\nRUN LENGTHS: \n");
+            compare |= CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            printf("\nNUM RUNS: \n");
+            compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            AssertEquals(0, compare);
+        }
+        else
+        {
+            elapsed_millis += gpu_timer.ElapsedMillis();
+            elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
+        }
+
+        // GPU cleanup
+
+        if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
+        if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
+        if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
+        if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    }
+
+    // Host cleanup
+    if (h_keys) delete[] h_keys;
+    if (h_values) delete[] h_values;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+
+    printf("\n\n");
+
+    if (timing_iterations > 0)
+    {
+        printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n",
+            timing_iterations,
+            elapsed_millis / timing_iterations,
+            elapsed_rle_millis / timing_iterations);
+    }
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/experimental/.gitignore b/thrust/dependencies/cub/experimental/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5e56e040ec0902e58df8573adaec65c5da6e9304
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/.gitignore
@@ -0,0 +1 @@
+/bin
diff --git a/thrust/dependencies/cub/experimental/Makefile b/thrust/dependencies/cub/experimental/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7165d93320c0d45af4e6aadc7c7f96af22c89d97
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/Makefile
@@ -0,0 +1,125 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../common.mk 
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# [mkl=<0|1>] compile against Intel MKL
+ifeq ($(mkl), 1)
+	DEFINES 	+= -DCUB_MKL
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+	LIBS 		+=	mkl_intel_lp64.lib mkl_intel_thread.lib  mkl_core.lib libiomp5md.lib
+	NVCCFLAGS 	+= -Xcompiler /openmp
+else
+	LIBS		+= -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm
+	NVCCFLAGS 	+= -Xcompiler -fopenmp
+	
+endif	
+
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+# Includes
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+exp_rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+EXP_DEPS = 	$(call rwildcard, ./,*.cuh) \
+			$(call rwildcard, ./,*.h)
+
+DEPS =				$(CUB_DEPS) \
+					$(EXP_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+
+		
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+
+#-------------------------------------------------------------------------------
+# make histogram_compare
+#-------------------------------------------------------------------------------
+
+histogram_compare: bin/histogram_compare_$(BIN_SUFFIX)
+
+bin/histogram_compare_$(BIN_SUFFIX) : histogram_compare.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/histogram_compare_$(BIN_SUFFIX) histogram_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+
+
+#-------------------------------------------------------------------------------
+# make spmv_compare
+#-------------------------------------------------------------------------------
+
+spmv_compare: bin/spmv_compare_$(BIN_SUFFIX)
+
+bin/spmv_compare_$(BIN_SUFFIX) : spmv_compare.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/spmv_compare_$(BIN_SUFFIX) spmv_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse $(MKL_LIBS) -O3
+	
+
diff --git a/thrust/dependencies/cub/experimental/defunct/example_coo_spmv.cu b/thrust/dependencies/cub/experimental/defunct/example_coo_spmv.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6b33e1f706178d932475a45e23de328a022d0caa
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/defunct/example_coo_spmv.cu
@@ -0,0 +1,1070 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * An implementation of COO SpMV using prefix scan to implement a
+ * reduce-value-by-row strategy
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <stdio.h>
+
+#include <cub/cub.cuh>
+
+#include "coo_graph.cuh"
+#include "../test/test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+/******************************************************************************
+ * Globals, constants, and typedefs
+ ******************************************************************************/
+
+typedef int         VertexId;   // uint32s as vertex ids
+typedef double      Value;      // double-precision floating point values
+
+bool                    g_verbose       = false;
+int                     g_timing_iterations    = 1;
+CachingDeviceAllocator  g_allocator;
+
+
+/******************************************************************************
+ * Texture referencing
+ ******************************************************************************/
+
+/**
+ * Templated texture reference type for multiplicand vector
+ */
+template <typename Value>
+struct TexVector
+{
+    // Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
+    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
+
+    // Texture reference type
+    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
+
+    static TexRef ref;
+
+    /**
+     * Bind textures
+     */
+    static void BindTexture(void *d_in, int elements)
+    {
+        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
+        if (d_in)
+        {
+            size_t offset;
+            size_t bytes = sizeof(CastType) * elements;
+            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
+        }
+    }
+
+    /**
+     * Unbind textures
+     */
+    static void UnbindTexture()
+    {
+        CubDebugExit(cudaUnbindTexture(ref));
+    }
+
+    /**
+     * Load
+     */
+    static __device__ __forceinline__ Value Load(int offset)
+    {
+        Value output;
+        reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
+        return output;
+    }
+};
+
+// Texture reference definitions
+template <typename Value>
+typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
+
+
+/******************************************************************************
+ * Utility types
+ ******************************************************************************/
+
+
+/**
+ * A partial dot-product sum paired with a corresponding row-id
+ */
+template <typename VertexId, typename Value>
+struct PartialProduct
+{
+    VertexId    row;            /// Row-id
+    Value       partial;        /// PartialProduct sum
+};
+
+
+/**
+ * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
+ */
+template <>
+struct PartialProduct<int, double>
+{
+    long long   row;            /// Row-id
+    double      partial;        /// PartialProduct sum
+};
+
+
+/**
+ * Reduce-value-by-row scan operator
+ */
+struct ReduceByKeyOp
+{
+    template <typename PartialProduct>
+    __device__ __forceinline__ PartialProduct operator()(
+        const PartialProduct &first,
+        const PartialProduct &second)
+    {
+        PartialProduct retval;
+
+        retval.partial = (second.row != first.row) ?
+                second.partial :
+                first.partial + second.partial;
+
+        retval.row = second.row;
+        return retval;
+    }
+};
+
+
+/**
+ * Stateful block-wide prefix operator for BlockScan
+ */
+template <typename PartialProduct>
+struct BlockPrefixCallbackOp
+{
+    // Running block-wide prefix
+    PartialProduct running_prefix;
+
+    /**
+     * Returns the block-wide running_prefix in thread-0
+     */
+    __device__ __forceinline__ PartialProduct operator()(
+        const PartialProduct &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
+    {
+        ReduceByKeyOp scan_op;
+
+        PartialProduct retval = running_prefix;
+        running_prefix = scan_op(running_prefix, block_aggregate);
+        return retval;
+    }
+};
+
+
+/**
+ * Operator for detecting discontinuities in a list of row identifiers.
+ */
+struct NewRowOp
+{
+    /// Returns true if row_b is the start of a new row
+    template <typename VertexId>
+    __device__ __forceinline__ bool operator()(
+        const VertexId& row_a,
+        const VertexId& row_b)
+    {
+        return (row_a != row_b);
+    }
+};
+
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * SpMV thread block abstraction for processing a contiguous segment of
+ * sparse COO tiles.
+ */
+template <
+    int             BLOCK_THREADS,
+    int             ITEMS_PER_THREAD,
+    typename        VertexId,
+    typename        Value>
+struct PersistentBlockSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Partial dot product type
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+
+    // Parameterized BlockScan type for reduce-value-by-row scan
+    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
+
+    // Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
+    typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
+
+    // Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
+    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
+
+    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
+    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
+
+    // Shared memory type for this thread block
+    struct TempStorage
+    {
+        union
+        {
+            typename BlockExchangeRows::TempStorage         exchange_rows;      // Smem needed for BlockExchangeRows
+            typename BlockExchangeValues::TempStorage       exchange_values;    // Smem needed for BlockExchangeValues
+            struct
+            {
+                typename BlockScan::TempStorage             scan;               // Smem needed for BlockScan
+                typename BlockDiscontinuity::TempStorage    discontinuity;      // Smem needed for BlockDiscontinuity
+            };
+        };
+
+        VertexId        first_block_row;    ///< The first row-ID seen by this thread block
+        VertexId        last_block_row;     ///< The last row-ID seen by this thread block
+        Value           first_product;      ///< The first dot-product written by this thread block
+    };
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    TempStorage                     &temp_storage;
+    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
+    VertexId                        *d_rows;
+    VertexId                        *d_columns;
+    Value                           *d_values;
+    Value                           *d_vector;
+    Value                           *d_result;
+    PartialProduct                  *d_block_partials;
+    int                             block_offset;
+    int                             block_end;
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__
+    PersistentBlockSpmv(
+        TempStorage                 &temp_storage,
+        VertexId                    *d_rows,
+        VertexId                    *d_columns,
+        Value                       *d_values,
+        Value                       *d_vector,
+        Value                       *d_result,
+        PartialProduct              *d_block_partials,
+        int                         block_offset,
+        int                         block_end)
+    :
+        temp_storage(temp_storage),
+        d_rows(d_rows),
+        d_columns(d_columns),
+        d_values(d_values),
+        d_vector(d_vector),
+        d_result(d_result),
+        d_block_partials(d_block_partials),
+        block_offset(block_offset),
+        block_end(block_end)
+    {
+        // Initialize scalar shared memory values
+        if (threadIdx.x == 0)
+        {
+            VertexId first_block_row            = d_rows[block_offset];
+            VertexId last_block_row             = d_rows[block_end - 1];
+
+            temp_storage.first_block_row        = first_block_row;
+            temp_storage.last_block_row         = last_block_row;
+            temp_storage.first_product          = Value(0);
+
+            // Initialize prefix_op to identity
+            prefix_op.running_prefix.row        = first_block_row;
+            prefix_op.running_prefix.partial    = Value(0);
+        }
+
+        __syncthreads();
+    }
+
+
+    /**
+     * Processes a COO input tile of edges, outputting dot products for each row
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        int block_offset,
+        int guarded_items = 0)
+    {
+        VertexId        columns[ITEMS_PER_THREAD];
+        VertexId        rows[ITEMS_PER_THREAD];
+        Value           values[ITEMS_PER_THREAD];
+        PartialProduct  partial_sums[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a thread block-striped tile of A (sparse row-ids, column-ids, and values)
+        if (FULL_TILE)
+        {
+            // Unguarded loads
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
+        }
+        else
+        {
+            // This is a partial-tile (e.g., the last tile of input).  Extend the coordinates of the last
+            // vertex for out-of-bound items, but zero-valued
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
+        }
+
+        // Load the referenced values from x and compute the dot product partials sums
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+#if CUB_PTX_ARCH >= 350
+            values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
+#else
+            values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
+#endif
+        }
+
+        // Transpose from warp-striped to blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
+
+        __syncthreads();
+
+        // Transpose from warp-striped to blocked arrangement
+        BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
+
+        // Barrier for smem reuse and coherence
+        __syncthreads();
+
+        // FlagT row heads by looking for discontinuities
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            head_flags,                     // (Out) Head flags
+            rows,                           // Original row ids
+            NewRowOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile
+
+        // Assemble partial product structures
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            partial_sums[ITEM].partial = values[ITEM];
+            partial_sums[ITEM].row = rows[ITEM];
+        }
+
+        // Reduce reduce-value-by-row across partial_sums using exclusive prefix scan
+        PartialProduct block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_sums,                   // Scan input
+            partial_sums,                   // Scan output
+            ReduceByKeyOp(),                // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Barrier for smem reuse and coherence
+        __syncthreads();
+
+        // Scatter an accumulated dot product if it is the head of a valid row
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
+
+                // Save off the first partial product that this thread block will scatter
+                if (partial_sums[ITEM].row == temp_storage.first_block_row)
+                {
+                    temp_storage.first_product = partial_sums[ITEM].partial;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over input tiles belonging to this thread block
+     */
+    __device__ __forceinline__
+    void ProcessTiles()
+    {
+        // Process full tiles
+        while (block_offset <= block_end - TILE_ITEMS)
+        {
+            ProcessTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process the last, partially-full tile (if present)
+        int guarded_items = block_end - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, guarded_items);
+        }
+
+        if (threadIdx.x == 0)
+        {
+            if (gridDim.x == 1)
+            {
+                // Scatter the final aggregate (this kernel contains only 1 thread block)
+                d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
+            }
+            else
+            {
+                // Write the first and last partial products from this thread block so
+                // that they can be subsequently "fixed up" in the next kernel.
+
+                PartialProduct first_product;
+                first_product.row       = temp_storage.first_block_row;
+                first_product.partial   = temp_storage.first_product;
+
+                d_block_partials[blockIdx.x * 2]          = first_product;
+                d_block_partials[(blockIdx.x * 2) + 1]    = prefix_op.running_prefix;
+            }
+        }
+    }
+};
+
+
+/**
+ * Threadblock abstraction for "fixing up" an array of interblock SpMV partial products.
+ */
+template <
+    int             BLOCK_THREADS,
+    int             ITEMS_PER_THREAD,
+    typename        VertexId,
+    typename        Value>
+struct FinalizeSpmvBlock
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Partial dot product type
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+
+    // Parameterized BlockScan type for reduce-value-by-row scan
+    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
+
+    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
+    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
+
+    // Shared memory type for this thread block
+    struct TempStorage
+    {
+        typename BlockScan::TempStorage           scan;               // Smem needed for reduce-value-by-row scan
+        typename BlockDiscontinuity::TempStorage  discontinuity;      // Smem needed for head-flagging
+
+        VertexId last_block_row;
+    };
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    TempStorage                     &temp_storage;
+    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
+    Value                           *d_result;
+    PartialProduct                  *d_block_partials;
+    int                             num_partials;
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__
+    FinalizeSpmvBlock(
+        TempStorage                 &temp_storage,
+        Value                       *d_result,
+        PartialProduct              *d_block_partials,
+        int                         num_partials)
+    :
+        temp_storage(temp_storage),
+        d_result(d_result),
+        d_block_partials(d_block_partials),
+        num_partials(num_partials)
+    {
+        // Initialize scalar shared memory values
+        if (threadIdx.x == 0)
+        {
+            VertexId first_block_row            = d_block_partials[0].row;
+            VertexId last_block_row             = d_block_partials[num_partials - 1].row;
+            temp_storage.last_block_row         = last_block_row;
+
+            // Initialize prefix_op to identity
+            prefix_op.running_prefix.row        = first_block_row;
+            prefix_op.running_prefix.partial    = Value(0);
+        }
+
+        __syncthreads();
+    }
+
+
+    /**
+     * Processes a COO input tile of edges, outputting dot products for each row
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__
+    void ProcessTile(
+        int block_offset,
+        int guarded_items = 0)
+    {
+        VertexId        rows[ITEMS_PER_THREAD];
+        PartialProduct  partial_sums[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a tile of block partials from previous kernel
+        if (FULL_TILE)
+        {
+            // Full tile
+#if CUB_PTX_ARCH >= 350
+            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
+#else
+            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
+#endif
+        }
+        else
+        {
+            // Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
+            PartialProduct default_sum;
+            default_sum.row = temp_storage.last_block_row;
+            default_sum.partial = Value(0);
+
+#if CUB_PTX_ARCH >= 350
+            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
+#else
+            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
+#endif
+        }
+
+        // Copy out row IDs for row-head flagging
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            rows[ITEM] = partial_sums[ITEM].row;
+        }
+
+        // FlagT row heads by looking for discontinuities
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            rows,                           // Original row ids
+            head_flags,                     // (Out) Head flags
+            NewRowOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_prefix.row);   // Last row ID from previous tile to compare with first row ID in this tile
+
+        // Reduce reduce-value-by-row across partial_sums using exclusive prefix scan
+        PartialProduct block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_sums,                   // Scan input
+            partial_sums,                   // Scan output
+            ReduceByKeyOp(),                // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Scatter an accumulated dot product if it is the head of a valid row
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over input tiles belonging to this thread block
+     */
+    __device__ __forceinline__
+    void ProcessTiles()
+    {
+        // Process full tiles
+        int block_offset = 0;
+        while (block_offset <= num_partials - TILE_ITEMS)
+        {
+            ProcessTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process final partial tile (if present)
+        int guarded_items = num_partials - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, guarded_items);
+        }
+
+        // Scatter the final aggregate (this kernel contains only 1 thread block)
+        if (threadIdx.x == 0)
+        {
+            d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
+        }
+    }
+};
+
+
+/******************************************************************************
+ * Kernel entrypoints
+ ******************************************************************************/
+
+
+
+/**
+ * SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
+ */
+template <
+    int                             BLOCK_THREADS,
+    int                             ITEMS_PER_THREAD,
+    typename                        VertexId,
+    typename                        Value>
+__launch_bounds__ (BLOCK_THREADS)
+__global__ void CooKernel(
+    GridEvenShare<int>              even_share,
+    PartialProduct<VertexId, Value> *d_block_partials,
+    VertexId                        *d_rows,
+    VertexId                        *d_columns,
+    Value                           *d_values,
+    Value                           *d_vector,
+    Value                           *d_result)
+{
+    // Specialize SpMV thread block abstraction type
+    typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;
+
+    // Shared memory allocation
+    __shared__ typename PersistentBlockSpmv::TempStorage temp_storage;
+
+    // Initialize thread block even-share to tell us where to start and stop our tile-processing
+    even_share.BlockInit();
+
+    // Construct persistent thread block
+    PersistentBlockSpmv persistent_block(
+        temp_storage,
+        d_rows,
+        d_columns,
+        d_values,
+        d_vector,
+        d_result,
+        d_block_partials,
+        even_share.block_offset,
+        even_share.block_end);
+
+    // Process input tiles
+    persistent_block.ProcessTiles();
+}
+
+
+/**
+ * Kernel for "fixing up" an array of interblock SpMV partial products.
+ */
+template <
+    int                             BLOCK_THREADS,
+    int                             ITEMS_PER_THREAD,
+    typename                        VertexId,
+    typename                        Value>
+__launch_bounds__ (BLOCK_THREADS,  1)
+__global__ void CooFinalizeKernel(
+    PartialProduct<VertexId, Value> *d_block_partials,
+    int                             num_partials,
+    Value                           *d_result)
+{
+    // Specialize "fix-up" thread block abstraction type
+    typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;
+
+    // Shared memory allocation
+    __shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;
+
+    // Construct persistent thread block
+    FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);
+
+    // Process input tiles
+    persistent_block.ProcessTiles();
+}
+
+
+
+//---------------------------------------------------------------------
+// Host subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Simple test of device
+ */
+template <
+    int                         COO_BLOCK_THREADS,
+    int                         COO_ITEMS_PER_THREAD,
+    int                         COO_SUBSCRIPTION_FACTOR,
+    int                         FINALIZE_BLOCK_THREADS,
+    int                         FINALIZE_ITEMS_PER_THREAD,
+    typename                    VertexId,
+    typename                    Value>
+void TestDevice(
+    CooGraph<VertexId, Value>&  coo_graph,
+    Value*                      h_vector,
+    Value*                      h_reference)
+{
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+
+    const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;
+
+    // SOA device storage
+    VertexId        *d_rows;             // SOA graph row coordinates
+    VertexId        *d_columns;          // SOA graph col coordinates
+    Value           *d_values;           // SOA graph values
+    Value           *d_vector;           // Vector multiplicand
+    Value           *d_result;           // Output row
+    PartialProduct  *d_block_partials;   // Temporary storage for communicating dot product partials between thread blocks
+
+    // Create SOA version of coo_graph on host
+    int             num_edges   = coo_graph.coo_tuples.size();
+    VertexId        *h_rows     = new VertexId[num_edges];
+    VertexId        *h_columns  = new VertexId[num_edges];
+    Value           *h_values   = new Value[num_edges];
+    for (int i = 0; i < num_edges; i++)
+    {
+        h_rows[i]       = coo_graph.coo_tuples[i].row;
+        h_columns[i]    = coo_graph.coo_tuples[i].col;
+        h_values[i]     = coo_graph.coo_tuples[i].val;
+    }
+
+    // Get CUDA properties
+    Device device_props;
+    CubDebugExit(device_props.Init());
+
+    // Determine launch configuration from kernel properties
+    int coo_sm_occupancy;
+    CubDebugExit(device_props.MaxSmOccupancy(
+        coo_sm_occupancy,
+        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
+        COO_BLOCK_THREADS));
+    int max_coo_grid_size   = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
+
+    // Construct an even-share work distribution
+    GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
+    int coo_grid_size  = even_share.grid_size;
+    int num_partials   = coo_grid_size * 2;
+
+    // Allocate COO device arrays
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows,            sizeof(VertexId) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns,         sizeof(VertexId) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,          sizeof(Value) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector,          sizeof(Value) * coo_graph.col_dim));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result,          sizeof(Value) * coo_graph.row_dim));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials,  sizeof(PartialProduct) * num_partials));
+
+    // Copy host arrays to device
+    CubDebugExit(cudaMemcpy(d_rows,     h_rows,     sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_columns,  h_columns,  sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values,   h_values,   sizeof(Value) * num_edges,          cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_vector,   h_vector,   sizeof(Value) * coo_graph.col_dim,  cudaMemcpyHostToDevice));
+
+    // Bind textures
+    TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
+
+    // Print debug info
+    printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
+        COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
+    if (coo_grid_size > 1)
+    {
+        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
+    }
+    fflush(stdout);
+
+    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
+
+    // Run kernel (always run one iteration without timing)
+    GpuTimer gpu_timer;
+    float elapsed_millis = 0.0;
+    for (int i = 0; i <= g_timing_iterations; i++)
+    {
+        gpu_timer.Start();
+
+        // Initialize output
+        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
+
+        // Run the COO kernel
+        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
+            even_share,
+            d_block_partials,
+            d_rows,
+            d_columns,
+            d_values,
+            d_vector,
+            d_result);
+
+        if (coo_grid_size > 1)
+        {
+            // Run the COO finalize kernel
+            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
+                d_block_partials,
+                num_partials,
+                d_result);
+        }
+
+        gpu_timer.Stop();
+
+        if (i > 0)
+            elapsed_millis += gpu_timer.ElapsedMillis();
+    }
+
+    // Force any kernel stdio to screen
+    CubDebugExit(cudaThreadSynchronize());
+    fflush(stdout);
+
+    // Display timing
+    if (g_timing_iterations > 0)
+    {
+        float avg_elapsed = elapsed_millis / g_timing_iterations;
+        int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
+        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
+            g_timing_iterations,
+            avg_elapsed,
+            total_bytes / avg_elapsed / 1000.0 / 1000.0,
+            num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
+    }
+
+    // Check results
+    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    TexVector<Value>::UnbindTexture();
+    CubDebugExit(g_allocator.DeviceFree(d_block_partials));
+    CubDebugExit(g_allocator.DeviceFree(d_rows));
+    CubDebugExit(g_allocator.DeviceFree(d_columns));
+    CubDebugExit(g_allocator.DeviceFree(d_values));
+    CubDebugExit(g_allocator.DeviceFree(d_vector));
+    CubDebugExit(g_allocator.DeviceFree(d_result));
+    delete[] h_rows;
+    delete[] h_columns;
+    delete[] h_values;
+}
+
+
+/**
+ * Compute reference answer on CPU
+ */
+template <typename VertexId, typename Value>
+void ComputeReference(
+    CooGraph<VertexId, Value>&  coo_graph,
+    Value*                      h_vector,
+    Value*                      h_reference)
+{
+    for (VertexId i = 0; i < coo_graph.row_dim; i++)
+    {
+        h_reference[i] = 0.0;
+    }
+
+    for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
+    {
+        h_reference[coo_graph.coo_tuples[i].row] +=
+            coo_graph.coo_tuples[i].val *
+            h_vector[coo_graph.coo_tuples[i].col];
+    }
+}
+
+
+/**
+ * Assign arbitrary values to vector items
+ */
+template <typename Value>
+void AssignVectorValues(Value *vector, int col_dim)
+{
+    for (int i = 0; i < col_dim; i++)
+    {
+        vector[i] = 1.0;
+    }
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
+            "\t--type=wheel --spokes=<spokes>\n"
+            "\t--type=grid2d --width=<width> [--no-self-loops]\n"
+            "\t--type=grid3d --width=<width> [--no-self-loops]\n"
+            "\t--type=market --file=<file>\n"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get graph type
+    string type;
+    args.GetCmdLineArgument("type", type);
+
+    // Generate graph structure
+
+    CpuTimer timer;
+    timer.Start();
+    CooGraph<VertexId, Value> coo_graph;
+    if (type == string("grid2d"))
+    {
+        VertexId width;
+        args.GetCmdLineArgument("width", width);
+        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
+        printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
+        if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
+    } else if (type == string("grid3d"))
+    {
+        VertexId width;
+        args.GetCmdLineArgument("width", width);
+        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
+        printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
+        if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
+    }
+    else if (type == string("wheel"))
+    {
+        VertexId spokes;
+        args.GetCmdLineArgument("spokes", spokes);
+        printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
+        if (coo_graph.InitWheel(spokes)) exit(1);
+    }
+    else if (type == string("market"))
+    {
+        string filename;
+        args.GetCmdLineArgument("file", filename);
+        printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
+        if (coo_graph.InitMarket(filename)) exit(1);
+    }
+    else
+    {
+        printf("Unsupported graph type\n");
+        exit(1);
+    }
+    timer.Stop();
+    printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
+        timer.ElapsedMillis() / 1000.0,
+        coo_graph.coo_tuples.size(),
+        coo_graph.row_dim,
+        coo_graph.col_dim);
+    fflush(stdout);
+
+    if (g_verbose)
+    {
+        cout << coo_graph << "\n";
+    }
+
+    // Create vector
+    Value *h_vector = new Value[coo_graph.col_dim];
+    AssignVectorValues(h_vector, coo_graph.col_dim);
+    if (g_verbose)
+    {
+        printf("Vector[%d]: ", coo_graph.col_dim);
+        DisplayResults(h_vector, coo_graph.col_dim);
+        printf("\n\n");
+    }
+
+    // Compute reference answer
+    Value *h_reference = new Value[coo_graph.row_dim];
+    ComputeReference(coo_graph, h_vector, h_reference);
+    if (g_verbose)
+    {
+        printf("Results[%d]: ", coo_graph.row_dim);
+        DisplayResults(h_reference, coo_graph.row_dim);
+        printf("\n\n");
+    }
+
+    // Parameterization for SM35
+    enum
+    {
+        COO_BLOCK_THREADS           = 64,
+        COO_ITEMS_PER_THREAD        = 10,
+        COO_SUBSCRIPTION_FACTOR     = 4,
+        FINALIZE_BLOCK_THREADS      = 256,
+        FINALIZE_ITEMS_PER_THREAD   = 4,
+    };
+
+    // Run GPU version
+    TestDevice<
+        COO_BLOCK_THREADS,
+        COO_ITEMS_PER_THREAD,
+        COO_SUBSCRIPTION_FACTOR,
+        FINALIZE_BLOCK_THREADS,
+        FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
+
+    // Cleanup
+    delete[] h_vector;
+    delete[] h_reference;
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/experimental/defunct/test_device_seg_reduce.cu b/thrust/dependencies/cub/experimental/defunct/test_device_seg_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e91233113f2115b21a014bfd94f48952e8839fa4
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/defunct/test_device_seg_reduce.cu
@@ -0,0 +1,2142 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * An implementation of segmented reduction using a load-balanced parallelization
+ * strategy based on the MergePath decision path.
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <stdio.h>
+
+#include <cub/cub.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+/******************************************************************************
+ * Globals, constants, and typedefs
+ ******************************************************************************/
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 1;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/******************************************************************************
+ * Utility routines
+ ******************************************************************************/
+
+
+/**
+ * An pair of index offsets
+ */
+template <typename OffsetT>
+struct IndexPair
+{
+    OffsetT a_idx;
+    OffsetT b_idx;
+};
+
+
+/**
+ * Computes the begin offsets into A and B for the specified
+ * location (diagonal) along the merge decision path
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            IteratorA,
+    typename            IteratorB,
+    typename            OffsetT>
+__device__ __forceinline__ void ParallelMergePathSearch(
+    OffsetT             diagonal,
+    IteratorA           a,
+    IteratorB           b,
+    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
+    IndexPair<OffsetT>  end,            // End offsets into a and b
+    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
+{
+    OffsetT a_split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
+    OffsetT a_split_max = CUB_MIN(diagonal, end.a_idx);
+
+    while (a_split_min < a_split_max)
+    {
+        OffsetT a_distance       = a_split_max - a_split_min;
+        OffsetT a_slice          = (a_distance + BLOCK_THREADS - 1) >> Log2<BLOCK_THREADS>::VALUE;
+        OffsetT a_split_pivot    = CUB_MIN(a_split_min + (threadIdx.x * a_slice), end.a_idx - 1);
+
+        int move_up = (a[a_split_pivot] <= b[diagonal - a_split_pivot - 1]);
+        int num_up = __syncthreads_count(move_up);
+/*
+        _CubLog("a_split_min(%d), a_split_max(%d) a_distance(%d), a_slice(%d), a_split_pivot(%d), move_up(%d), num_up(%d), a_begin(%d), a_end(%d)\n",
+            a_split_min, a_split_max, a_distance, a_slice, a_split_pivot, move_up, num_up, a_begin, a_end);
+*/
+        a_split_max = CUB_MIN(num_up * a_slice, end.a_idx);
+        a_split_min = CUB_MAX(a_split_max - a_slice, begin.a_idx) + 1;
+    }
+
+    intersection.a_idx = CUB_MIN(a_split_min, end.a_idx);
+    intersection.b_idx = CUB_MIN(diagonal - a_split_min, end.b_idx);
+}
+
+/**
+ * Computes the begin offsets into A and B for the specified
+ * location (diagonal) along the merge decision path
+ */
+template <
+    typename            IteratorA,
+    typename            IteratorB,
+    typename            OffsetT>
+__device__ __forceinline__ void MergePathSearch(
+    OffsetT             diagonal,
+    IteratorA           a,
+    IteratorB           b,
+    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
+    IndexPair<OffsetT>  end,            // End offsets into a and b
+    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
+{
+    OffsetT split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
+    OffsetT split_max = CUB_MIN(diagonal, end.a_idx);
+
+    while (split_min < split_max)
+    {
+        OffsetT split_pivot = (split_min + split_max) >> 1;
+        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
+        {
+            // Move candidate split range up A, down B
+            split_min = split_pivot + 1;
+        }
+        else
+        {
+            // Move candidate split range up B, down A
+            split_max = split_pivot;
+        }
+    }
+
+    intersection.a_idx = CUB_MIN(split_min, end.a_idx);
+    intersection.b_idx = CUB_MIN(diagonal - split_min, end.b_idx);
+}
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockSegReduceRegion
+ */
+template <
+    int                     _BLOCK_THREADS,             ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
+    bool                    _USE_SMEM_SEGMENT_CACHE,    ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+    bool                    _USE_SMEM_VALUE_CACHE,      ///< Whether or not to cache incoming values in shared memory before reducing each tile
+    CacheLoadModifier       _LOAD_MODIFIER_SEGMENTS,    ///< Cache load modifier for reading segment offsets
+    CacheLoadModifier       _LOAD_MODIFIER_VALUES,      ///< Cache load modifier for reading values
+    BlockReduceAlgorithm    _REDUCE_ALGORITHM,          ///< The BlockReduce algorithm to use
+    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
+struct BlockSegReduceRegionPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        USE_SMEM_SEGMENT_CACHE  = _USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+        USE_SMEM_VALUE_CACHE    = _USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming upcoming values in shared memory before reducing each tile
+    };
+
+    static const CacheLoadModifier      LOAD_MODIFIER_SEGMENTS  = _LOAD_MODIFIER_SEGMENTS;  ///< Cache load modifier for reading segment offsets
+    static const CacheLoadModifier      LOAD_MODIFIER_VALUES    = _LOAD_MODIFIER_VALUES;    ///< Cache load modifier for reading values
+    static const BlockReduceAlgorithm   REDUCE_ALGORITHM        = _REDUCE_ALGORITHM;        ///< The BlockReduce algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * \brief BlockSegReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide segmented reduction.
+ */
+template <
+    typename BlockSegReduceRegionPolicy,    ///< Parameterized BlockSegReduceRegionPolicy tuning policy
+    typename SegmentOffsetIterator,         ///< Random-access input iterator type for reading segment end-offsets
+    typename ValueIterator,                 ///< Random-access input iterator type for reading values
+    typename OutputIteratorT,               ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                   ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct BlockSegReduceRegion
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockSegReduceRegionPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockSegReduceRegionPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,                     /// Number of work items to be processed per tile
+
+        USE_SMEM_SEGMENT_CACHE  = BlockSegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+        USE_SMEM_VALUE_CACHE    = BlockSegReduceRegionPolicy::USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming upcoming values in shared memory before reducing each tile
+
+        SMEM_SEGMENT_CACHE_ITEMS    = USE_SMEM_SEGMENT_CACHE ? TILE_ITEMS : 1,
+        SMEM_VALUE_CACHE_ITEMS      = USE_SMEM_VALUE_CACHE ? TILE_ITEMS : 1,
+    };
+
+    // Segment offset type
+    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
+
+    // Value type
+    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+    // Counting iterator type
+    typedef CountingInputIterator<SegmentOffsetT, OffsetT> CountingIterator;
+
+    // Segment offsets iterator wrapper type
+    typedef typename If<(IsPointer<SegmentOffsetIterator>::VALUE),
+            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS, SegmentOffsetT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
+            SegmentOffsetIterator>::Type                                                                            // Directly use the supplied input iterator type
+        WrappedSegmentOffsetIterator;
+
+    // Values iterator wrapper type
+    typedef typename If<(IsPointer<ValueIterator>::VALUE),
+            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_VALUES, Value, OffsetT>,        // Wrap the native input pointer with CacheModifiedInputIterator
+            ValueIterator>::Type                                                                                // Directly use the supplied input iterator type
+        WrappedValueIterator;
+
+    // Tail flag type for marking segment discontinuities
+    typedef int TailFlag;
+
+    // Reduce-by-key data type tuple (segment-ID, value)
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Index pair data type
+    typedef IndexPair<OffsetT> IndexPair;
+
+    // BlockScan scan operator for reduction-by-segment
+    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef RunningBlockPrefixCallbackOp<
+            KeyValuePair,
+            ReduceByKeyOp>
+        RunningPrefixCallbackOp;
+
+    // Parameterized BlockShift type for exchanging index pairs
+    typedef BlockShift<
+            IndexPair,
+            BLOCK_THREADS>
+        BlockShift;
+
+    // Parameterized BlockReduce type for block-wide reduction
+    typedef BlockReduce<
+            Value,
+            BLOCK_THREADS,
+            BlockSegReduceRegionPolicy::REDUCE_ALGORITHM>
+        BlockReduce;
+
+    // Parameterized BlockScan type for block-wide reduce-value-by-key
+    typedef BlockScan<
+            KeyValuePair,
+            BLOCK_THREADS,
+            BlockSegReduceRegionPolicy::SCAN_ALGORITHM>
+        BlockScan;
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        union
+        {
+            // Smem needed for BlockScan
+            typename BlockScan::TempStorage scan;
+
+            // Smem needed for BlockReduce
+            typename BlockReduce::TempStorage reduce;
+
+            struct
+            {
+                // Smem needed for communicating start/end indices between threads for a given work tile
+                typename BlockShift::TempStorage shift;
+
+                // Smem needed for caching segment end-offsets
+                SegmentOffset cached_segment_end_offsets[SMEM_SEGMENT_CACHE_ITEMS + 1];
+            };
+
+            // Smem needed for caching values
+            Value cached_values[SMEM_VALUE_CACHE_ITEMS];
+        };
+
+        IndexPair block_region_idx[2];      // The starting [0] and ending [1] pairs of segment and value indices for the thread block's region
+
+        // The first partial reduction tuple scattered by this thread block
+        KeyValuePair first_tuple;
+    };
+
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                    &temp_storage;          ///< Reference to shared storage
+    WrappedSegmentOffsetIterator    d_segment_end_offsets;  ///< A sequence of \p num_segments segment end-offsets
+    WrappedValueIterator            d_values;               ///< A sequence of \p num_values data to reduce
+    OutputIteratorT                  d_output;               ///< A sequence of \p num_segments segment totals
+    CountingIterator                d_value_offsets;        ///< A sequence of \p num_values value-offsets
+    IndexPair                       *d_block_idx;
+    OffsetT                         num_values;             ///< Total number of values to reduce
+    OffsetT                         num_segments;           ///< Number of segments being reduced
+    Value                           identity;               ///< Identity value (for zero-length segments)
+    ReductionOp                     reduction_op;           ///< Reduction operator
+    ReduceByKeyOp                   scan_op;                ///< Reduce-by-key scan operator
+    RunningPrefixCallbackOp         prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__
+    BlockSegReduceRegion(
+        TempStorage             &temp_storage,          ///< Reference to shared storage
+        SegmentOffsetIterator   d_segment_end_offsets,  ///< A sequence of \p num_segments segment end-offsets
+        ValueIterator           d_values,               ///< A sequence of \p num_values values
+        OutputIteratorT          d_output,               ///< A sequence of \p num_segments segment totals
+        IndexPair               *d_block_idx,
+        OffsetT                 num_values,             ///< Number of values to reduce
+        OffsetT                 num_segments,           ///< Number of segments being reduced
+        Value                   identity,               ///< Identity value (for zero-length segments)
+        ReductionOp             reduction_op)           ///< Reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_segment_end_offsets(d_segment_end_offsets),
+        d_values(d_values),
+        d_value_offsets(0),
+        d_output(d_output),
+        d_block_idx(d_block_idx),
+        num_values(num_values),
+        num_segments(num_segments),
+        identity(identity),
+        reduction_op(reduction_op),
+        scan_op(reduction_op),
+        prefix_op(scan_op)
+    {}
+
+
+    /**
+     * Fast-path single-segment tile reduction.  Perform a
+     * simple block-wide reduction and accumulate the result into
+     * the running total.
+     */
+    __device__ __forceinline__ void SingleSegmentTile(
+        IndexPair next_tile_idx,
+        IndexPair block_idx)
+    {
+        OffsetT tile_values = next_tile_idx.b_idx - block_idx.b_idx;
+
+        // Load a tile's worth of values (using identity for out-of-bounds items)
+        Value values[ITEMS_PER_THREAD];
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, values, tile_values, identity);
+
+        // Barrier for smem reuse
+        __syncthreads();
+
+        // Reduce the tile of values and update the running total in thread-0
+        KeyValuePair tile_aggregate;
+        tile_aggregate.key      = block_idx.a_idx;
+        tile_aggregate.value    = BlockReduce(temp_storage.reduce).Reduce(values, reduction_op);
+
+        if (threadIdx.x == 0)
+        {
+            prefix_op.running_total = scan_op(prefix_op.running_total, tile_aggregate);
+        }
+    }
+
+    /**
+     * Fast-path empty-segment tile reduction.  Write out a tile of identity
+     * values to output.
+     */
+    __device__ __forceinline__ void EmptySegmentsTile(
+        IndexPair next_tile_idx,
+        IndexPair block_idx)
+    {
+        Value segment_reductions[ITEMS_PER_THREAD];
+
+        if (threadIdx.x == 0)
+        {
+            // The first segment gets the running segment total
+            segment_reductions[0] = prefix_op.running_total.value;
+
+            // Update the running prefix
+            prefix_op.running_total.value = identity;
+            prefix_op.running_total.key = next_tile_idx.a_idx;
+        }
+        else
+        {
+            // Remainder of segments in this tile get identity
+            segment_reductions[0] = identity;
+        }
+
+        // Remainder of segments in this tile get identity
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            segment_reductions[ITEM] = identity;
+
+        // Store reductions
+        OffsetT tile_segments = next_tile_idx.a_idx - block_idx.a_idx;
+        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_output + block_idx.a_idx, segment_reductions, tile_segments);
+    }
+
+
+    /**
+     * Multi-segment tile reduction.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void MultiSegmentTile(
+        IndexPair block_idx,
+        IndexPair thread_idx,
+        IndexPair next_thread_idx,
+        IndexPair next_tile_idx)
+    {
+        IndexPair local_thread_idx;
+        local_thread_idx.a_idx = thread_idx.a_idx - block_idx.a_idx;
+        local_thread_idx.b_idx = thread_idx.b_idx - block_idx.b_idx;
+
+        // Check if first segment end-offset is in range
+        bool valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
+
+        // Check if first value offset is in range
+        bool valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
+
+        // Load first segment end-offset
+        OffsetT segment_end_offset = (valid_segment) ?
+            (USE_SMEM_SEGMENT_CACHE)?
+                temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx] :
+                d_segment_end_offsets[thread_idx.a_idx] :
+            -1;
+
+        OffsetT segment_ids[ITEMS_PER_THREAD];
+        OffsetT value_offsets[ITEMS_PER_THREAD];
+
+        KeyValuePair first_partial;
+        first_partial.key    = thread_idx.a_idx;
+        first_partial.value  = identity;
+
+        // Get segment IDs and gather-offsets for values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            segment_ids[ITEM]   = -1;
+            value_offsets[ITEM] = -1;
+
+            // Whether or not we slide (a) right along the segment path or (b) down the value path
+            if (valid_segment && (!valid_value || (segment_end_offset <= thread_idx.b_idx)))
+            {
+                // Consume this segment index
+                segment_ids[ITEM] = thread_idx.a_idx;
+                thread_idx.a_idx++;
+                local_thread_idx.a_idx++;
+
+                valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
+
+                // Read next segment end-offset (if valid)
+                if (valid_segment)
+                {
+                    if (USE_SMEM_SEGMENT_CACHE)
+                        segment_end_offset = temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx];
+                    else
+                        segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
+                }
+            }
+            else if (valid_value)
+            {
+                // Consume this value index
+                value_offsets[ITEM] = thread_idx.b_idx;
+                thread_idx.b_idx++;
+                local_thread_idx.b_idx++;
+
+                valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
+            }
+        }
+
+        // Load values
+        Value values[ITEMS_PER_THREAD];
+
+        if (USE_SMEM_VALUE_CACHE)
+        {
+            // Barrier for smem reuse
+            __syncthreads();
+
+            OffsetT tile_values = next_tile_idx.b_idx - block_idx.b_idx;
+
+            // Load a tile's worth of values (using identity for out-of-bounds items)
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, values, tile_values, identity);
+
+            // Store to shared
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_values, values, tile_values);
+
+            // Barrier for smem reuse
+            __syncthreads();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                values[ITEM] = (value_offsets[ITEM] == -1) ?
+                    identity :
+                    temp_storage.cached_values[value_offsets[ITEM] - block_idx.b_idx];
+            }
+        }
+        else
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                values[ITEM] = (value_offsets[ITEM] == -1) ?
+                    identity :
+                    d_values[value_offsets[ITEM]];
+            }
+        }
+
+        // Reduce within thread segments
+        KeyValuePair running_total = first_partial;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_ids[ITEM] != -1)
+            {
+                // Consume this segment index
+                d_output[segment_ids[ITEM]] = running_total.value;
+
+//                _CubLog("Updating segment %d with value %lld\n", segment_ids[ITEM], running_total.value)
+
+                if (first_partial.key == segment_ids[ITEM])
+                    first_partial.value = running_total.value;
+
+                running_total.key    = segment_ids[ITEM];
+                running_total.value  = identity;
+            }
+
+            running_total.value = reduction_op(running_total.value, values[ITEM]);
+        }
+/*
+
+        // Barrier for smem reuse
+        __syncthreads();
+
+        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).InclusiveScan(
+            pairs,                          // Scan input
+            pairs,                          // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+*/
+
+/*
+        // Check if first segment end-offset is in range
+        bool valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx);
+
+        // Check if first value offset is in range
+        bool valid_value = (thread_idx.b_idx < next_thread_idx.b_idx);
+
+        // Load first segment end-offset
+        OffsetT segment_end_offset = (valid_segment) ?
+            d_segment_end_offsets[thread_idx.a_idx] :
+            num_values;                                                     // Out of range (the last segment end-offset is one-past the last value offset)
+
+        // Load first value offset
+        OffsetT value_offset = (valid_value) ?
+            d_value_offsets[thread_idx.b_idx] :
+            num_values;                                                     // Out of range (one-past the last value offset)
+
+        // Assemble segment-demarcating tail flags and partial reduction tuples
+        TailFlag        tail_flags[ITEMS_PER_THREAD];
+        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Default tuple and flag values
+            partial_reductions[ITEM].key    = thread_idx.a_idx;
+            partial_reductions[ITEM].value  = identity;
+            tail_flags[ITEM]                = 0;
+
+            // Whether or not we slide (a) right along the segment path or (b) down the value path
+            if (valid_segment && (!valid_value || (segment_end_offset <= value_offset)))
+            {
+                // Consume this segment index
+
+                // Set tail flag noting the end of the segment
+                tail_flags[ITEM] = 1;
+
+                // Increment segment index
+                thread_idx.a_idx++;
+
+                // Read next segment end-offset (if valid)
+                if ((valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx)))
+                    segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
+            }
+            else if (valid_value)
+            {
+                // Consume this value index
+
+                // Update the tuple's value with the value at this index.
+                partial_reductions[ITEM].value = d_values[value_offset];
+
+                // Increment value index
+                thread_idx.b_idx++;
+
+                // Read next value offset (if valid)
+                if ((valid_value = (thread_idx.b_idx < next_thread_idx.b_idx)))
+                    value_offset = d_value_offsets[thread_idx.b_idx];
+            }
+        }
+
+        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).InclusiveScan(
+            partial_reductions,             // Scan input
+            partial_reductions,             // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // The first segment index for this region (hoist?)
+        OffsetT first_segment_idx = temp_storage.block_idx.a_idx[0];
+
+        // Scatter an accumulated reduction if it is the head of a valid segment
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (tail_flags[ITEM])
+            {
+                OffsetT segment_idx = partial_reductions[ITEM].key;
+                Value   value       = partial_reductions[ITEM].value;
+
+                // Write value reduction to corresponding segment id
+                d_output[segment_idx] = value;
+
+                // Save off the first value product that this thread block will scatter
+                if (segment_idx == first_segment_idx)
+                {
+                    temp_storage.first_tuple.value = value;
+                }
+            }
+        }
+*/
+    }
+
+
+
+    /**
+     * Have the thread block process the specified region of the MergePath decision path
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT         block_diagonal,
+        OffsetT         next_block_diagonal,
+        KeyValuePair    &first_tuple,       // [Out] Valid in thread-0
+        KeyValuePair    &last_tuple)        // [Out] Valid in thread-0
+    {
+        // Thread block initialization
+        if (threadIdx.x < 2)
+        {
+            // Retrieve block starting and ending indices
+            IndexPair block_idx = {0, 0};
+            if (gridDim.x > 1)
+            {
+                block_idx = d_block_idx[blockIdx.x + threadIdx.x];
+            }
+            else if (threadIdx.x > 0)
+            {
+                block_idx.a_idx = num_segments;
+                block_idx.b_idx = num_values;
+            }
+
+            // Share block starting and ending indices
+            temp_storage.block_region_idx[threadIdx.x] = block_idx;
+
+            // Initialize the block's running prefix
+            if (threadIdx.x == 0)
+            {
+                prefix_op.running_total.key    = block_idx.a_idx;
+                prefix_op.running_total.value  = identity;
+
+                // Initialize the "first scattered partial reduction tuple" to the prefix tuple (in case we don't actually scatter one)
+                temp_storage.first_tuple = prefix_op.running_total;
+            }
+        }
+
+        // Ensure coherence of region indices
+        __syncthreads();
+
+        // Read block's starting indices
+        IndexPair block_idx = temp_storage.block_region_idx[0];
+
+        // Have the thread block iterate over the region
+        #pragma unroll 1
+        while (block_diagonal < next_block_diagonal)
+        {
+            // Read block's ending indices (hoist?)
+            IndexPair next_block_idx = temp_storage.block_region_idx[1];
+
+            // Clamp the per-thread search range to within one work-tile of block's current indices
+            IndexPair next_tile_idx;
+            next_tile_idx.a_idx = CUB_MIN(next_block_idx.a_idx, block_idx.a_idx + TILE_ITEMS);
+            next_tile_idx.b_idx = CUB_MIN(next_block_idx.b_idx, block_idx.b_idx + TILE_ITEMS);
+
+            // Have each thread search for the end-indices of its subranges within the segment and value inputs
+            IndexPair next_thread_idx;
+            if (USE_SMEM_SEGMENT_CACHE)
+            {
+                // Search in smem cache
+                OffsetT num_segments = next_tile_idx.a_idx - block_idx.a_idx;
+
+                // Load global
+                SegmentOffset segment_offsets[ITEMS_PER_THREAD];
+                LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_segment_end_offsets + block_idx.a_idx, segment_offsets, num_segments, num_values);
+
+                // Store to shared
+                StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_segment_end_offsets, segment_offsets);
+
+                __syncthreads();
+
+                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
+
+                MergePathSearch(
+                    next_thread_diagonal,                       // Next thread diagonal
+                    temp_storage.cached_segment_end_offsets - block_idx.a_idx,                      // A (segment end-offsets)
+                    d_value_offsets,                            // B (value offsets)
+                    block_idx,                                  // Start indices into A and B
+                    next_tile_idx,                              // End indices into A and B
+                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
+            }
+            else
+            {
+                // Search in global
+
+                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
+
+                MergePathSearch(
+                    next_thread_diagonal,                       // Next thread diagonal
+                    d_segment_end_offsets,                      // A (segment end-offsets)
+                    d_value_offsets,                            // B (value offsets)
+                    block_idx,                                  // Start indices into A and B
+                    next_tile_idx,                              // End indices into A and B
+                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
+            }
+
+            // Share thread end-indices to get thread begin-indices and tile end-indices
+            IndexPair thread_idx;
+
+            BlockShift(temp_storage.shift).Up(
+                next_thread_idx,    // Input item
+                thread_idx,         // [out] Output item
+                block_idx,          // Prefix item to be provided to <em>thread</em><sub>0</sub>
+                next_tile_idx);     // [out] Suffix item shifted out by the <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> to be provided to all threads
+
+//            if (block_idx.a_idx == next_tile_idx.a_idx)
+//            {
+//                // There are no segment end-offsets in this tile.  Perform a
+//                // simple block-wide reduction and accumulate the result into
+//                // the running total.
+//                SingleSegmentTile(next_tile_idx, block_idx);
+//            }
+//          else if (block_idx.b_idx == next_tile_idx.b_idx)
+//            {
+//                // There are no values in this tile (only empty segments).
+//                EmptySegmentsTile(next_tile_idx.a_idx, block_idx.a_idx);
+//            }
+//            else
+            if ((next_tile_idx.a_idx < num_segments) && (next_tile_idx.b_idx < num_values))
+            {
+                // Merge the tile's segment and value indices (full tile)
+                MultiSegmentTile<true>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
+            }
+            else
+            {
+                // Merge the tile's segment and value indices (partially full tile)
+                MultiSegmentTile<false>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
+            }
+
+            // Advance the block's indices in preparation for the next tile
+            block_idx = next_tile_idx;
+
+            // Advance to the next region in the decision path
+            block_diagonal += TILE_ITEMS;
+
+            // Barrier for smem reuse
+            __syncthreads();
+        }
+
+        // Get first and last tuples for the region
+        if (threadIdx.x == 0)
+        {
+            first_tuple = temp_storage.first_tuple;
+            last_tuple = prefix_op.running_total;
+        }
+
+    }
+
+
+};
+
+
+
+
+
+
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockSegReduceRegionByKey
+ */
+template <
+    int                     _BLOCK_THREADS,             ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm      _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
+    bool                    _LOAD_WARP_TIME_SLICING,    ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+    CacheLoadModifier       _LOAD_MODIFIER,             ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
+struct BlockSegReduceRegionByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,      ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)    };
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * \brief BlockSegReduceRegionByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+template <
+    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
+    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
+    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
+    typename    ReductionOp>                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct BlockSegReduceRegionByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockSegReduceRegionByKeyPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // KeyValuePair input type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type KeyValuePair;
+
+    // Signed integer type for global offsets
+    typedef typename KeyValuePair::Key OffsetT;
+
+    // Value type
+    typedef typename KeyValuePair::Value Value;
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Input iterator wrapper type for loading KeyValuePair elements through cache
+    typedef CacheModifiedInputIterator<
+            BlockSegReduceRegionByKeyPolicy::LOAD_MODIFIER,
+            KeyValuePair,
+            OffsetT>
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            WrappedInputIteratorT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            BlockSegReduceRegionByKeyPolicy::LOAD_ALGORITHM,
+            BlockSegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING>
+        BlockLoad;
+
+    // BlockScan scan operator for reduction-by-segment
+    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef RunningBlockPrefixCallbackOp<
+            KeyValuePair,
+            ReduceByKeyOp>
+        RunningPrefixCallbackOp;
+
+    // Parameterized BlockScan type for block-wide reduce-value-by-key
+    typedef BlockScan<
+            KeyValuePair,
+            BLOCK_THREADS,
+            BlockSegReduceRegionByKeyPolicy::SCAN_ALGORITHM>
+        BlockScan;
+
+    // Parameterized BlockDiscontinuity type for identifying key discontinuities
+    typedef BlockDiscontinuity<
+            OffsetT,
+            BLOCK_THREADS>
+        BlockDiscontinuity;
+
+    // Operator for detecting discontinuities in a list of segment identifiers.
+    struct NewSegmentOp
+    {
+        /// Returns true if row_b is the start of a new row
+        __device__ __forceinline__ bool operator()(const OffsetT& b, const OffsetT& a)
+        {
+            return (a != b);
+        }
+    };
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockLoad::TempStorage                 load;           // Smem needed for tile loading
+            struct {
+                typename BlockScan::TempStorage             scan;           // Smem needed for reduce-value-by-segment scan
+                typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for head-flagging
+            };
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                &temp_storage;          ///< Reference to shared storage
+    WrappedInputIteratorT       d_tuple_partials;       ///< A sequence of partial reduction tuples to scan
+    OutputIteratorT              d_output;               ///< A sequence of segment totals
+    Value                       identity;               ///< Identity value (for zero-length segments)
+    ReduceByKeyOp               scan_op;                ///< Reduce-by-key scan operator
+    RunningPrefixCallbackOp     prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__
+    BlockSegReduceRegionByKey(
+        TempStorage             &temp_storage,          ///< Reference to shared storage
+        InputIteratorT          d_tuple_partials,       ///< A sequence of partial reduction tuples to scan
+        OutputIteratorT          d_output,               ///< A sequence of segment totals
+        Value                   identity,               ///< Identity value (for zero-length segments)
+        ReductionOp             reduction_op)           ///< Reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_tuple_partials(d_tuple_partials),
+        d_output(d_output),
+        identity(identity),
+        scan_op(reduction_op),
+        prefix_op(scan_op)
+    {}
+
+
+
+    /**
+     * Processes a reduce-value-by-key input tile, outputting reductions for each segment
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__
+    void ProcessTile(
+        OffsetT block_offset,
+        OffsetT first_segment_idx,
+        OffsetT last_segment_idx,
+        int guarded_items = TILE_ITEMS)
+    {
+        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
+        OffsetT         segment_ids[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a tile of block partials from previous kernel
+        if (FULL_TILE)
+        {
+            // Full tile
+            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions);
+        }
+        else
+        {
+            KeyValuePair oob_default;
+            oob_default.key    = last_segment_idx;       // The last segment ID to be reduced
+            oob_default.value  = identity;
+
+            // Partially-full tile
+            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions, guarded_items, oob_default);
+        }
+
+        // Barrier for shared memory reuse
+        __syncthreads();
+
+        // Copy the segment IDs for head-flagging
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            segment_ids[ITEM] = partial_reductions[ITEM].key;
+        }
+
+        // FlagT segment heads by looking for discontinuities
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            head_flags,                         // [out] Head flags
+            segment_ids,                        // Segment ids
+            NewSegmentOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_total.key);       // Last segment ID from previous tile to compare with first segment ID in this tile
+
+        // Reduce-value-by-segment across partial_reductions using exclusive prefix scan
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_reductions,                   // Scan input
+            partial_reductions,                   // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Scatter an accumulated reduction if it is the head of a valid segment
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_output[partial_reductions[ITEM].key] = partial_reductions[ITEM].value;
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over input tiles belonging to this thread block
+     */
+    __device__ __forceinline__
+    void ProcessRegion(
+        OffsetT block_offset,
+        OffsetT block_end,
+        OffsetT first_segment_idx,
+        OffsetT last_segment_idx)
+    {
+        if (threadIdx.x == 0)
+        {
+            // Initialize running prefix to the first segment index paired with identity
+            prefix_op.running_total.key    = first_segment_idx;
+            prefix_op.running_total.value  = identity;
+        }
+
+        // Process full tiles
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessTile<true>(block_offset, first_segment_idx, last_segment_idx);
+            __syncthreads();
+
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process final value tile (if present)
+        int guarded_items = block_end - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, first_segment_idx, last_segment_idx, guarded_items);
+        }
+    }
+};
+
+
+
+/******************************************************************************
+ * Kernel entrypoints
+ ******************************************************************************/
+
+/**
+ * Segmented reduce region kernel entry point (multi-block).
+ */
+
+template <
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename OffsetT>                           ///< Signed integer type for global offsets
+__global__ void SegReducePartitionKernel(
+    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+    IndexPair<OffsetT>          *d_block_idx,
+    int                         num_partition_samples,
+    OffsetT                     num_values,             ///< [in] Number of values to reduce
+    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
+    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Segment offset type
+    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
+
+    // Counting iterator type
+    typedef CountingInputIterator<SegmentOffsetT, OffsetT> CountingIterator;
+
+    // Cache-modified iterator for segment end-offsets
+    CacheModifiedInputIterator<LOAD_LDG, SegmentOffsetT, OffsetT> d_wrapped_segment_end_offsets(d_segment_end_offsets);
+
+    // Counting iterator for value offsets
+    CountingIterator d_value_offsets(0);
+
+    // Initialize even-share to tell us where to start and stop our tile-processing
+    int partition_id = (blockDim.x * blockIdx.x) + threadIdx.x;
+    even_share.Init(partition_id);
+
+    // Search for block starting and ending indices
+    IndexPair<OffsetT> start_idx = {0, 0};
+    IndexPair<OffsetT> end_idx   = {num_segments, num_values};
+    IndexPair<OffsetT> block_idx;
+
+    MergePathSearch(
+        even_share.block_offset,            // Next thread diagonal
+        d_wrapped_segment_end_offsets,      // A (segment end-offsets)
+        d_value_offsets,                    // B (value offsets)
+        start_idx,                          // Start indices into A and B
+        end_idx,                            // End indices into A and B
+        block_idx);                         // [out] diagonal intersection indices into A and B
+
+    // Write output
+    if (partition_id < num_partition_samples)
+    {
+        d_block_idx[partition_id] = block_idx;
+    }
+}
+
+
+/**
+ * Segmented reduce region kernel entry point (multi-block).
+ */
+template <
+    typename BlockSegReduceRegionPolicy,        ///< Parameterized BlockSegReduceRegionPolicy tuning policy
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename ValueIterator,                     ///< Random-access input iterator type for reading values
+    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT,                           ///< Signed integer type for global offsets
+    typename Value>                             ///< Value type
+__launch_bounds__ (BlockSegReduceRegionPolicy::BLOCK_THREADS)
+__global__ void SegReduceRegionKernel(
+    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+    ValueIterator               d_values,               ///< [in] A sequence of \p num_values values
+    OutputIteratorT              d_output,               ///< [out] A sequence of \p num_segments segment totals
+    KeyValuePair<OffsetT, Value> *d_tuple_partials,      ///< [out] A sequence of (gridDim.x * 2) partial reduction tuples
+    IndexPair<OffsetT>          *d_block_idx,
+    OffsetT                     num_values,             ///< [in] Number of values to reduce
+    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
+    Value                       identity,               ///< [in] Identity value (for zero-length segments)
+    ReductionOp                 reduction_op,           ///< [in] Reduction operator
+    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Specialize thread block abstraction type for reducing a range of segmented values
+    typedef BlockSegReduceRegion<
+            BlockSegReduceRegionPolicy,
+            SegmentOffsetIterator,
+            ValueIterator,
+            OutputIteratorT,
+            ReductionOp,
+            OffsetT>
+        BlockSegReduceRegion;
+
+    // Shared memory allocation
+    __shared__ typename BlockSegReduceRegion::TempStorage temp_storage;
+
+    // Initialize thread block even-share to tell us where to start and stop our tile-processing
+    even_share.BlockInit();
+
+    // Construct persistent thread block
+    BlockSegReduceRegion thread_block(
+        temp_storage,
+        d_segment_end_offsets,
+        d_values,
+        d_output,
+        d_block_idx,
+        num_values,
+        num_segments,
+        identity,
+        reduction_op);
+
+    // First and last partial reduction tuples within the range (valid in thread-0)
+    KeyValuePair first_tuple, last_tuple;
+
+    // Consume block's region of work
+    thread_block.ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end,
+        first_tuple,
+        last_tuple);
+
+    if (threadIdx.x == 0)
+    {
+        if (gridDim.x > 1)
+        {
+            // Special case where the first segment written and the carry-out are for the same segment
+            if (first_tuple.key == last_tuple.key)
+            {
+                first_tuple.value = identity;
+            }
+
+            // Write the first and last partial products from this thread block so
+            // that they can be subsequently "fixed up" in the next kernel.
+            d_tuple_partials[blockIdx.x * 2]          = first_tuple;
+            d_tuple_partials[(blockIdx.x * 2) + 1]    = last_tuple;
+        }
+    }
+
+}
+
+
+/**
+ * Segmented reduce region kernel entry point (single-block).
+ */
+template <
+    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
+    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
+    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
+    typename    ReductionOp,                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename    OffsetT,                                ///< Signed integer type for global offsets
+    typename    Value>                                  ///< Value type
+__launch_bounds__ (BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS, 1)
+__global__ void SegReduceRegionByKeyKernel(
+    InputIteratorT          d_tuple_partials,           ///< [in] A sequence of partial reduction tuples
+    OutputIteratorT          d_output,                   ///< [out] A sequence of \p num_segments segment totals
+    OffsetT                 num_segments,               ///< [in] Number of segments in the \p d_output sequence
+    int                     num_tuple_partials,         ///< [in] Number of partial reduction tuples being reduced
+    Value                   identity,                   ///< [in] Identity value (for zero-length segments)
+    ReductionOp             reduction_op)               ///< [in] Reduction operator
+{
+    // Specialize thread block abstraction type for reducing a range of values by key
+    typedef BlockSegReduceRegionByKey<
+            BlockSegReduceRegionByKeyPolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            ReductionOp>
+        BlockSegReduceRegionByKey;
+
+    // Shared memory allocation
+    __shared__ typename BlockSegReduceRegionByKey::TempStorage temp_storage;
+
+    // Construct persistent thread block
+    BlockSegReduceRegionByKey thread_block(
+        temp_storage,
+        d_tuple_partials,
+        d_output,
+        identity,
+        reduction_op);
+
+    // Process input tiles
+    thread_block.ProcessRegion(
+        0,                          // Region start
+        num_tuple_partials,         // Region end
+        0,                          // First segment ID
+        num_segments);              // Last segment ID (one-past)
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce
+ */
+template <
+    typename ValueIterator,                     ///< Random-access input iterator type for reading values
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct DeviceSegReduceDispatch
+{
+    // Value type
+    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+    // Reduce-by-key data type tuple (segment-ID, value)
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Index pair data type
+    typedef IndexPair<OffsetT>IndexPair;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        // ReduceRegionPolicy
+        typedef BlockSegReduceRegionPolicy<
+                128,                            ///< Threads per thread block
+                6,                              ///< Items per thread (per tile of input)
+                true,                           ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
+                LOAD_LDG,                       ///< Cache load modifier for reading values
+                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionPolicy;
+
+        // ReduceRegionByKeyPolicy
+        typedef BlockSegReduceRegionByKeyPolicy<
+                256,                            ///< Threads per thread block
+                9,                             ///< Items per thread (per tile of input)
+                BLOCK_LOAD_DIRECT,              ///< The BlockLoad algorithm to use
+                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+                LOAD_LDG,                       ///< Cache load modifier for reading input elements
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionByKeyPolicy;
+    };
+
+
+    /// SM10
+    struct Policy100
+    {
+        // ReduceRegionPolicy
+        typedef BlockSegReduceRegionPolicy<
+                128,                            ///< Threads per thread block
+                3,                              ///< Items per thread (per tile of input)
+                false,                          ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading values
+                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
+                BLOCK_SCAN_RAKING>              ///< The BlockScan algorithm to use
+            SegReduceRegionPolicy;
+
+        // ReduceRegionByKeyPolicy
+        typedef BlockSegReduceRegionByKeyPolicy<
+                128,                            ///< Threads per thread block
+                3,                              ///< Items per thread (per tile of input)
+                BLOCK_LOAD_WARP_TRANSPOSE,      ///< The BlockLoad algorithm to use
+                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading input elements
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionByKeyPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+/*
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+*/
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSegReduceRegionPolicy           : PtxPolicy::SegReduceRegionPolicy {};
+    struct PtxSegReduceRegionByKeyPolicy      : PtxPolicy::SegReduceRegionByKeyPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <
+        typename SegReduceKernelConfig,
+        typename SegReduceByKeyKernelConfig>
+    __host__ __device__ __forceinline__
+    static void InitConfigs(
+        int                         ptx_version,
+        SegReduceKernelConfig       &seg_reduce_region_config,
+        SegReduceByKeyKernelConfig  &seg_reduce_region_by_key_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        seg_reduce_region_config.Init<PtxSegReduceRegionPolicy>();
+        seg_reduce_region_by_key_config.Init<PtxSegReduceRegionByKeyPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            seg_reduce_region_config.template          Init<typename Policy350::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy350::SegReduceRegionByKeyPolicy>();
+        }
+/*
+        else if (ptx_version >= 300)
+        {
+            seg_reduce_region_config.template          Init<typename Policy300::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy300::SegReduceRegionByKeyPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            seg_reduce_region_config.template          Init<typename Policy200::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy200::SegReduceRegionByKeyPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            seg_reduce_region_config.template          Init<typename Policy130::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy130::SegReduceRegionByKeyPolicy>();
+        }
+*/
+        else
+        {
+            seg_reduce_region_config.template          Init<typename Policy100::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy100::SegReduceRegionByKeyPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * SegReduceRegionKernel kernel dispatch configuration
+     */
+    struct SegReduceKernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        bool                    use_smem_segment_cache;
+        bool                    use_smem_value_cache;
+        CacheLoadModifier       load_modifier_segments;
+        CacheLoadModifier       load_modifier_values;
+        BlockReduceAlgorithm    reduce_algorithm;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename SegReduceRegionPolicy>
+        __host__ __device__ __forceinline__
+        void Init()
+        {
+            block_threads               = SegReduceRegionPolicy::BLOCK_THREADS;
+            items_per_thread            = SegReduceRegionPolicy::ITEMS_PER_THREAD;
+            use_smem_segment_cache      = SegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE;
+            use_smem_value_cache        = SegReduceRegionPolicy::USE_SMEM_VALUE_CACHE;
+            load_modifier_segments      = SegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS;
+            load_modifier_values        = SegReduceRegionPolicy::LOAD_MODIFIER_VALUES;
+            reduce_algorithm            = SegReduceRegionPolicy::REDUCE_ALGORITHM;
+            scan_algorithm              = SegReduceRegionPolicy::SCAN_ALGORITHM;
+        }
+    };
+
+    /**
+     * SegReduceRegionByKeyKernel kernel dispatch configuration
+     */
+    struct SegReduceByKeyKernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_algorithm;
+        bool                    load_warp_time_slicing;
+        CacheLoadModifier       load_modifier;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename SegReduceRegionByKeyPolicy>
+        __host__ __device__ __forceinline__
+        void Init()
+        {
+            block_threads               = SegReduceRegionByKeyPolicy::BLOCK_THREADS;
+            items_per_thread            = SegReduceRegionByKeyPolicy::ITEMS_PER_THREAD;
+            load_algorithm              = SegReduceRegionByKeyPolicy::LOAD_ALGORITHM;
+            load_warp_time_slicing      = SegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING;
+            load_modifier               = SegReduceRegionByKeyPolicy::LOAD_MODIFIER;
+            scan_algorithm              = SegReduceRegionByKeyPolicy::SCAN_ALGORITHM;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide segmented reduction.
+     */
+    template <
+        typename                        SegReducePartitionKernelPtr,
+        typename                        SegReduceRegionKernelPtr,               ///< Function type of cub::SegReduceRegionKernel
+        typename                        SegReduceRegionByKeyKernelPtr>          ///< Function type of cub::SegReduceRegionByKeyKernel
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
+        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
+        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                            debug_synchronous,                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                             sm_version,                             ///< [in] SM version of target device to use when computing SM occupancy
+        SegReducePartitionKernelPtr     seg_reduce_partition_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionKernel
+        SegReduceRegionKernelPtr        seg_reduce_region_kernel,               ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionKernel
+        SegReduceRegionByKeyKernelPtr   seg_reduce_region_by_key_kernel,        ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionByKeyKernel
+        SegReduceKernelConfig           &seg_reduce_region_config,              ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_kernel was compiled for
+        SegReduceByKeyKernelConfig      &seg_reduce_region_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_by_key_kernel was compiled for
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Dispatch two kernels: (1) a multi-block segmented reduction
+            // to reduce regions by block, and (2) a single-block reduce-by-key kernel
+            // to "fix up" segments spanning more than one region.
+
+            // Tile size of seg_reduce_region_kernel
+            int tile_size = seg_reduce_region_config.block_threads * seg_reduce_region_config.items_per_thread;
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histogram_region_kernel
+            int seg_reduce_region_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                seg_reduce_region_sm_occupancy,
+                sm_version,
+                seg_reduce_region_kernel,
+                seg_reduce_region_config.block_threads))) break;
+
+            // Get device occupancy for histogram_region_kernel
+            int seg_reduce_region_occupancy = seg_reduce_region_sm_occupancy * sm_count;
+
+            // Even-share work distribution
+            int num_diagonals = num_values + num_segments;                  // Total number of work items
+            int subscription_factor = seg_reduce_region_sm_occupancy;       // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+            int max_grid_size = seg_reduce_region_occupancy * subscription_factor;
+            GridEvenShare<OffsetT>even_share(
+                num_diagonals,
+                max_grid_size,
+                tile_size);
+
+            // Get grid size for seg_reduce_region_kernel
+            int seg_reduce_region_grid_size = even_share.grid_size;
+
+            // Number of "fix-up" reduce-by-key tuples (2 per thread block)
+            int num_tuple_partials = seg_reduce_region_grid_size * 2;
+            int num_partition_samples = seg_reduce_region_grid_size + 1;
+
+            // Temporary storage allocation requirements
+            void* allocations[2] = {};
+            size_t allocation_sizes[2] =
+            {
+                num_tuple_partials * sizeof(KeyValuePair),  // bytes needed for "fix-up" reduce-by-key tuples
+                num_partition_samples * sizeof(IndexPair),  // bytes needed block indices
+            };
+
+            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocations
+            KeyValuePair    *d_tuple_partials   = (KeyValuePair*) allocations[0];           // "fix-up" tuples
+            IndexPair       *d_block_idx        = (IndexPair *) allocations[1];             // block starting/ending indices
+
+            // Array of segment end-offsets
+            SegmentOffsetIterator d_segment_end_offsets = d_segment_offsets + 1;
+
+            // Grid launch params for seg_reduce_partition_kernel
+            int partition_block_size = 32;
+            int partition_grid_size = (num_partition_samples + partition_block_size - 1) / partition_block_size;
+
+            // Partition work among multiple thread blocks if necessary
+            if (seg_reduce_region_grid_size > 1)
+            {
+                // Log seg_reduce_partition_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking seg_reduce_partition_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    partition_grid_size, partition_block_size, (long long) stream);
+
+                // Invoke seg_reduce_partition_kernel
+                seg_reduce_partition_kernel<<<partition_grid_size, partition_block_size, 0, stream>>>(
+                    d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+                    d_block_idx,
+                    num_partition_samples,
+                    num_values,             ///< [in] Number of values to reduce
+                    num_segments,           ///< [in] Number of segments being reduced
+                    even_share);            ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+
+                // Sync the stream if specified
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log seg_reduce_region_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking seg_reduce_region_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, (long long) stream, seg_reduce_region_config.items_per_thread, seg_reduce_region_sm_occupancy);
+
+            // Mooch
+            if (CubDebug(error = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte))) break;
+
+            // Invoke seg_reduce_region_kernel
+            seg_reduce_region_kernel<<<seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, 0, stream>>>(
+                d_segment_end_offsets,
+                d_values,
+                d_output,
+                d_tuple_partials,
+                d_block_idx,
+                num_values,
+                num_segments,
+                identity,
+                reduction_op,
+                even_share);
+
+            // Sync the stream if specified
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+/*
+            // Perform "fix-up" of region partial reductions if grid size is greater than one thread block
+            if (seg_reduce_region_grid_size > 1)
+            {
+                // Log seg_reduce_region_by_key_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking seg_reduce_region_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                    1, seg_reduce_region_by_key_config.block_threads, (long long) stream, seg_reduce_region_by_key_config.items_per_thread);
+
+                // Invoke seg_reduce_region_by_key_kernel
+                seg_reduce_region_by_key_kernel<<<1, seg_reduce_region_by_key_config.block_threads, 0, stream>>>(
+                    d_tuple_partials,
+                    d_output,
+                    num_segments,
+                    num_tuple_partials,
+                    identity,
+                    reduction_op);
+
+                // Sync the stream if specified
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+*/
+        }
+
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for computing a device-wide segmented reduction.
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
+        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
+        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                            debug_synchronous)                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel kernel dispatch configurations
+            SegReduceKernelConfig seg_reduce_region_config;
+            SegReduceByKeyKernelConfig seg_reduce_region_by_key_config;
+
+            InitConfigs(ptx_version, seg_reduce_region_config, seg_reduce_region_by_key_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_values,
+                d_segment_offsets,
+                d_output,
+                num_values,
+                num_segments,
+                identity,
+                reduction_op,
+                stream,
+                debug_synchronous,
+                ptx_version,            // Use PTX version instead of SM version because, as a statically known quantity, this improves device-side launch dramatically but at the risk of imprecise occupancy calculation for mismatches
+                SegReducePartitionKernel<SegmentOffsetIterator, OffsetT>,
+                SegReduceRegionKernel<PtxSegReduceRegionPolicy, SegmentOffsetIterator, ValueIterator, OutputIteratorT, ReductionOp, OffsetT, Value>,
+                SegReduceRegionByKeyKernel<PtxSegReduceRegionByKeyPolicy, KeyValuePair*, OutputIteratorT, ReductionOp, OffsetT, Value>,
+                seg_reduce_region_config,
+                seg_reduce_region_by_key_config))) break;
+        }
+        while (0);
+
+        return error;
+
+    }
+};
+
+
+
+
+/******************************************************************************
+ * DeviceSegReduce
+ *****************************************************************************/
+
+/**
+ * \brief DeviceSegReduce provides operations for computing a device-wide, parallel segmented reduction across a sequence of data items residing within global memory.
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceReduce}
+ *
+ */
+struct DeviceSegReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * Does not support non-commutative reduction operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
+     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading segment end-offsets
+     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
+     * \tparam Value                    <b>[inferred]</b> Value type
+     * \tparam ReductionOp              <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename                ValueIterator,
+        typename                SegmentOffsetIterator,
+        typename                OutputIteratorT,
+        typename                Value,
+        typename                ReductionOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t Reduce(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        int                     num_values,                             ///< [in] Total number of values to reduce
+        int                     num_segments,                           ///< [in] Number of segments being reduced
+        Value                   identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp             reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        typedef DeviceSegReduceDispatch<
+                ValueIterator,
+                SegmentOffsetIterator,
+                OutputIteratorT,
+                ReductionOp,
+                OffsetT>
+            DeviceSegReduceDispatch;
+
+        return DeviceSegReduceDispatch::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_values,
+            d_segment_offsets,
+            d_output,
+            num_values,
+            num_segments,
+            identity,
+            reduction_op,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * Does not support non-commutative summation.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
+     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading segment end-offsets
+     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
+     */
+    template <
+        typename                ValueIterator,
+        typename                SegmentOffsetIterator,
+        typename                OutputIteratorT>
+    __host__ __device__ __forceinline__
+    static cudaError_t Sum(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        int                     num_values,                             ///< [in] Total number of values to reduce
+        int                     num_segments,                           ///< [in] Number of segments being reduced
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Value type
+        typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+        Value identity = Value();
+        cub::Sum reduction_op;
+
+        typedef DeviceSegReduceDispatch<
+                ValueIterator,
+                SegmentOffsetIterator,
+                OutputIteratorT,
+                cub::Sum,
+                OffsetT>
+            DeviceSegReduceDispatch;
+
+        return DeviceSegReduceDispatch::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_values,
+            d_segment_offsets,
+            d_output,
+            num_values,
+            num_segments,
+            identity,
+            reduction_op,
+            stream,
+            debug_synchronous);
+    }
+};
+
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem
+ */
+template <typename OffsetT, typename Value>
+void Initialize(
+    GenMode         gen_mode,
+    Value           *h_values,
+    vector<OffsetT> &segment_offsets,
+    int             num_values,
+    int             avg_segment_size)
+{
+    // Initialize values
+//    if (g_verbose) printf("Values: ");
+    for (int i = 0; i < num_values; ++i)
+    {
+        InitValue(gen_mode, h_values[i], i);
+//        if (g_verbose) std::cout << h_values[i] << ", ";
+    }
+//    if (g_verbose) printf("\n\n");
+
+    // Initialize segment lengths
+    const unsigned int  MAX_INTEGER         = -1u;
+    const unsigned int  MAX_SEGMENT_LENGTH  = avg_segment_size * 2;
+    const double        SCALE_FACTOR        = double(MAX_SEGMENT_LENGTH) / double(MAX_INTEGER);
+
+    segment_offsets.push_back(0);
+
+    OffsetT consumed = 0;
+    OffsetT remaining = num_values;
+    while (remaining > 0)
+    {
+        // Randomly sample a 32-bit unsigned int
+        unsigned int segment_length;
+        RandomBits(segment_length);
+
+        // Scale to maximum segment length
+        segment_length = (unsigned int) (double(segment_length) * SCALE_FACTOR);
+        segment_length = CUB_MIN(segment_length, remaining);
+
+        consumed += segment_length;
+        remaining -= segment_length;
+
+        segment_offsets.push_back(consumed);
+    }
+}
+
+
+/**
+ * Compute reference answer
+ */
+template <typename OffsetT, typename Value>
+void ComputeReference(
+    Value       *h_values,
+    OffsetT     *h_segment_offsets,
+    Value       *h_reference,
+    int         num_segments,
+    Value       identity)
+{
+    if (g_verbose) printf("%d segment reductions: ", num_segments);
+    for (int segment = 0; segment < num_segments; ++segment)
+    {
+        h_reference[segment] = identity;
+
+        for (int i = h_segment_offsets[segment]; i < h_segment_offsets[segment + 1]; ++i)
+        {
+            h_reference[segment] += h_values[i];
+        }
+        if (g_verbose) std::cout << h_reference[segment] << ", ";
+    }
+    if (g_verbose) printf("\n\n");
+}
+
+
+/**
+ * Simple test of device
+ */
+template <
+    bool            CDP,
+    typename        OffsetT,
+    typename        Value,
+    typename        ReductionOp>
+void Test(
+    OffsetT         num_values,
+    int             avg_segment_size,
+    ReductionOp     reduction_op,
+    Value           identity,
+    char*           type_string)
+{
+    Value   *h_values = NULL;
+    Value   *h_reference = NULL;
+    OffsetT *h_segment_offsets = NULL;
+
+    printf("%d\n", num_values);
+
+    // Initialize problem on host
+    h_values = new Value[num_values];
+    vector<OffsetT> segment_offsets;
+    Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);
+
+    // Allocate simple offsets array and copy STL vector into it
+    h_segment_offsets = new OffsetT[segment_offsets.size()];
+    for (int i = 0; i < segment_offsets.size(); ++i)
+        h_segment_offsets[i] = segment_offsets[i];
+
+    OffsetT num_segments = segment_offsets.size() - 1;
+    if (g_verbose)
+    {
+        printf("%d segment offsets: ", num_segments);
+        for (int i = 0; i < num_segments; ++i)
+            std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
+        if (g_verbose) std::cout << std::endl << std::endl;
+    }
+
+    // Solve problem on host
+    h_reference = new Value[num_segments];
+    ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);
+
+    printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
+        (CDP) ? "CDP device invoked" : "Host-invoked",
+        (Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
+        num_values, (int) sizeof(Value), type_string,
+        num_segments, (int) sizeof(OffsetT));
+    fflush(stdout);
+
+    // Allocate and initialize problem on device
+    Value   *d_values = NULL;
+    OffsetT *d_segment_offsets = NULL;
+    Value   *d_output = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
+    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    // Request and allocate temporary storage
+    void    *d_temp_storage = NULL;
+    size_t  temp_storage_bytes = 0;
+    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output
+    CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
+    }
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
+        float giga_bandwidth = giga_rate *
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    // Device cleanup
+    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Host cleanup
+    if (h_values)           delete[] h_values;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (h_reference)        delete[] h_reference;
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_values          = 32 * 1024 * 1024;
+    int avg_segment_size    = 500;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_values);
+    args.GetCmdLineArgument("ss", avg_segment_size);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "[--i=<timing iterations>] "
+            "[--n=<input samples>]\n"
+            "[--ss=<average segment size>]\n"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/experimental/histogram/histogram_cub.h b/thrust/dependencies/cub/experimental/histogram/histogram_cub.h
new file mode 100644
index 0000000000000000000000000000000000000000..07c2e4aa2db26a2f788003e950cb8c82f40a7846
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/histogram/histogram_cub.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_histogram.cuh>
+
+using namespace cub;
+
+template <
+    int         NUM_CHANNELS,
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_cub_histogram(
+    PixelType *d_image,
+    int width,
+    int height,
+    unsigned int *d_hist, 
+    bool is_warmup)
+{
+    enum {
+        is_float = Equals<PixelType, float4>::VALUE,
+    };
+
+    typedef typename If<is_float, float, unsigned char>::Type    SampleT;    // Sample type
+    typedef typename If<is_float, float, unsigned int>::Type     LevelT;     // Level type (uint32 for uchar)
+
+    // Setup data structures
+    unsigned int*       d_histogram[ACTIVE_CHANNELS];
+    int                 num_levels[ACTIVE_CHANNELS];            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[ACTIVE_CHANNELS];           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[ACTIVE_CHANNELS];           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        d_histogram[CHANNEL] = d_hist + (CHANNEL * NUM_BINS);
+        num_levels[CHANNEL] = NUM_BINS + 1;
+        lower_level[CHANNEL] = 0;
+        upper_level[CHANNEL] = (is_float) ? 1 : 256;
+    }
+
+    // Allocate temporary storage
+    size_t temp_storage_bytes = 0;
+    void *d_temp_storage = NULL;
+
+    SampleT* d_image_samples = (SampleT*) d_image;
+
+    // Get amount of temporary storage needed
+    DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
+        d_temp_storage,
+        temp_storage_bytes,
+        d_image_samples,
+        d_histogram,
+        num_levels,
+        lower_level,
+        upper_level,
+        width * height, 
+        (cudaStream_t) 0,
+        is_warmup);
+
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    // Compute histogram
+    DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
+        d_temp_storage,
+        temp_storage_bytes,
+        d_image_samples,
+        d_histogram,
+        num_levels,
+        lower_level,
+        upper_level,
+        width * height, 
+        (cudaStream_t) 0,
+        is_warmup);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    cudaFree(d_temp_storage);
+
+    return elapsed_millis;
+}
+
diff --git a/thrust/dependencies/cub/experimental/histogram/histogram_gmem_atomics.h b/thrust/dependencies/cub/experimental/histogram/histogram_gmem_atomics.h
new file mode 100644
index 0000000000000000000000000000000000000000..3308a2851bec88a0b04c17413a92861a74298b89
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/histogram/histogram_gmem_atomics.h
@@ -0,0 +1,185 @@
+/******************************************************************************
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <test/test_util.h>
+
+namespace histogram_gmem_atomics
+{
+    // Decode float4 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        float* samples = reinterpret_cast<float*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+    }
+
+    // Decode uchar4 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+    }
+
+    // Decode uchar1 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        bins[0] = (unsigned int) pixel.x;
+    }
+
+    // First-pass histogram kernel (binning into privatized counters)
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS,
+        typename    PixelType>
+    __global__ void histogram_gmem_atomics(
+        const PixelType *in,
+        int width,
+        int height,
+        unsigned int *out)
+    {
+        // global position and size
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int nx = blockDim.x * gridDim.x;
+        int ny = blockDim.y * gridDim.y;
+
+        // threads in workgroup
+        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
+        int nt = blockDim.x * blockDim.y; // total threads in workgroup
+
+        // group index in 0..ngroups-1
+        int g = blockIdx.x + blockIdx.y * gridDim.x;
+
+        // initialize smem
+        unsigned int *gmem = out + g * NUM_PARTS;
+        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS; i += nt)
+            gmem[i] = 0;
+        __syncthreads();
+
+        // process pixels (updates our group's partial histogram in gmem)
+        for (int col = x; col < width; col += nx)
+        {
+            for (int row = y; row < height; row += ny)
+            {
+                PixelType pixel = in[row * width + col];
+
+                unsigned int bins[ACTIVE_CHANNELS];
+                DecodePixel<NUM_BINS>(pixel, bins);
+
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                    atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
+            }
+        }
+    }
+
+    // Second pass histogram kernel (accumulation)
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS>
+    __global__ void histogram_gmem_accum(
+        const unsigned int *in,
+        int n,
+        unsigned int *out)
+    {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i > ACTIVE_CHANNELS * NUM_BINS)
+            return; // out of range
+
+        unsigned int total = 0;
+        for (int j = 0; j < n; j++)
+            total += in[i + NUM_PARTS * j];
+
+        out[i] = total;
+    }
+
+
+}   // namespace histogram_gmem_atomics
+
+
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_gmem_atomics(
+    PixelType *d_image,
+    int width,
+    int height,
+    unsigned int *d_hist,
+    bool warmup)
+{
+    enum
+    {
+        NUM_PARTS = 1024
+    };
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, 0);
+
+    dim3 block(32, 4);
+    dim3 grid(16, 16);
+    int total_blocks = grid.x * grid.y;
+
+    // allocate partial histogram
+    unsigned int *d_part_hist;
+    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
+
+    dim3 block2(128);
+    dim3 grid2((3 * NUM_BINS + block.x - 1) / block.x);
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
+        d_image,
+        width,
+        height,
+        d_part_hist);
+
+    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
+        d_part_hist,
+        total_blocks,
+        d_hist);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    cudaFree(d_part_hist);
+
+    return elapsed_millis;
+}
+
diff --git a/thrust/dependencies/cub/experimental/histogram/histogram_smem_atomics.h b/thrust/dependencies/cub/experimental/histogram/histogram_smem_atomics.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c70702e2e267ac070ae2d2d04c73bfb6e6b2e88
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/histogram/histogram_smem_atomics.h
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <test/test_util.h>
+
+namespace histogram_smem_atomics
+{
+    // Decode float4 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        float* samples = reinterpret_cast<float*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+    }
+
+    // Decode uchar4 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+    }
+
+    // Decode uchar1 pixel into bins
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        bins[0] = (unsigned int) pixel.x;
+    }
+
+    // First-pass histogram kernel (binning into privatized counters)
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS,
+        typename    PixelType>
+    __global__ void histogram_smem_atomics(
+        const PixelType *in,
+        int width,
+        int height,
+        unsigned int *out)
+    {
+        // global position and size
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int nx = blockDim.x * gridDim.x;
+        int ny = blockDim.y * gridDim.y;
+
+        // threads in workgroup
+        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
+        int nt = blockDim.x * blockDim.y; // total threads in workgroup
+
+        // group index in 0..ngroups-1
+        int g = blockIdx.x + blockIdx.y * gridDim.x;
+
+        // initialize smem
+        __shared__ unsigned int smem[ACTIVE_CHANNELS * NUM_BINS + 3];
+        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS + 3; i += nt)
+            smem[i] = 0;
+        __syncthreads();
+
+        // process pixels
+        // updates our group's partial histogram in smem
+        for (int col = x; col < width; col += nx)
+        {
+            for (int row = y; row < height; row += ny)
+            {
+                PixelType pixel = in[row * width + col];
+
+                unsigned int bins[ACTIVE_CHANNELS];
+                DecodePixel<NUM_BINS>(pixel, bins);
+
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                    atomicAdd(&smem[(NUM_BINS * CHANNEL) + bins[CHANNEL] + CHANNEL], 1);
+            }
+        }
+
+        __syncthreads();
+
+        // move to our workgroup's slice of output
+        out += g * NUM_PARTS;
+
+        // store local output to global
+        for (int i = t; i < NUM_BINS; i += nt)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                out[i + NUM_BINS * CHANNEL] = smem[i + NUM_BINS * CHANNEL + CHANNEL];
+        }
+    }
+
+    // Second pass histogram kernel (accumulation)
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS>
+    __global__ void histogram_smem_accum(
+        const unsigned int *in,
+        int n,
+        unsigned int *out)
+    {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i > ACTIVE_CHANNELS * NUM_BINS) return; // out of range
+        unsigned int total = 0;
+        for (int j = 0; j < n; j++)
+            total += in[i + NUM_PARTS * j];
+        out[i] = total;
+    }
+
+}   // namespace histogram_smem_atomics
+
+
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_smem_atomics(
+    PixelType *d_image,
+    int width,
+    int height,
+    unsigned int *d_hist, 
+    bool warmup)
+{
+    enum
+    {
+        NUM_PARTS = 1024
+    };
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, 0);
+
+    dim3 block(32, 4);
+    dim3 grid(16, 16);
+    int total_blocks = grid.x * grid.y;
+
+    // allocate partial histogram
+    unsigned int *d_part_hist;
+    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
+
+    dim3 block2(128);
+    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x);
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    histogram_smem_atomics::histogram_smem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
+        d_image,
+        width,
+        height,
+        d_part_hist);
+
+    histogram_smem_atomics::histogram_smem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
+        d_part_hist,
+        total_blocks,
+        d_hist);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    cudaFree(d_part_hist);
+
+    return elapsed_millis;
+}
+
diff --git a/thrust/dependencies/cub/experimental/histogram_compare.cu b/thrust/dependencies/cub/experimental/histogram_compare.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7ab66a16aeeb7145814f94652ec95577497681d5
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/histogram_compare.cu
@@ -0,0 +1,635 @@
+/******************************************************************************
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <stdio.h>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cstdio>
+#include <fstream>
+
+#include "histogram/histogram_gmem_atomics.h"
+#include "histogram/histogram_smem_atomics.h"
+#include "histogram/histogram_cub.h"
+
+#include <cub/util_allocator.cuh>
+#include <test/test_util.h>
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants, and type declarations
+//---------------------------------------------------------------------
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+bool                    g_report = false;   // Whether to display a full report in CSV format
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+struct less_than_value
+{
+    inline bool operator()(
+        const std::pair<std::string, double> &a,
+        const std::pair<std::string, double> &b)
+    {
+        return a.second < b.second;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Targa (.tga) image file parsing
+//---------------------------------------------------------------------
+
+/**
+ * TGA image header info
+ */
+struct TgaHeader
+{
+    char idlength;
+    char colormaptype;
+    char datatypecode;
+    short colormaporigin;
+    short colormaplength;
+    char colormapdepth;
+    short x_origin;
+    short y_origin;
+    short width;
+    short height;
+    char bitsperpixel;
+    char imagedescriptor;
+
+    void Parse (FILE *fptr)
+    {
+        idlength = fgetc(fptr);
+        colormaptype = fgetc(fptr);
+        datatypecode = fgetc(fptr);
+        fread(&colormaporigin, 2, 1, fptr);
+        fread(&colormaplength, 2, 1, fptr);
+        colormapdepth = fgetc(fptr);
+        fread(&x_origin, 2, 1, fptr);
+        fread(&y_origin, 2, 1, fptr);
+        fread(&width, 2, 1, fptr);
+        fread(&height, 2, 1, fptr);
+        bitsperpixel = fgetc(fptr);
+        imagedescriptor = fgetc(fptr);
+    }
+
+    void Display (FILE *fptr)
+    {
+        fprintf(fptr, "ID length:           %d\n", idlength);
+        fprintf(fptr, "Color map type:      %d\n", colormaptype);
+        fprintf(fptr, "Image type:          %d\n", datatypecode);
+        fprintf(fptr, "Color map offset:    %d\n", colormaporigin);
+        fprintf(fptr, "Color map length:    %d\n", colormaplength);
+        fprintf(fptr, "Color map depth:     %d\n", colormapdepth);
+        fprintf(fptr, "X origin:            %d\n", x_origin);
+        fprintf(fptr, "Y origin:            %d\n", y_origin);
+        fprintf(fptr, "Width:               %d\n", width);
+        fprintf(fptr, "Height:              %d\n", height);
+        fprintf(fptr, "Bits per pixel:      %d\n", bitsperpixel);
+        fprintf(fptr, "Descriptor:          %d\n", imagedescriptor);
+    }
+};
+
+
+/**
+ * Decode image byte data into pixel
+ */
+void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes)
+{
+    if (bytes == 4)
+    {
+        pixel.x = tga_pixel[2];
+        pixel.y = tga_pixel[1];
+        pixel.z = tga_pixel[0];
+        pixel.w = tga_pixel[3];
+    }
+    else if (bytes == 3)
+    {
+        pixel.x = tga_pixel[2];
+        pixel.y = tga_pixel[1];
+        pixel.z = tga_pixel[0];
+        pixel.w = 0;
+    }
+    else if (bytes == 2)
+    {
+        pixel.x = (tga_pixel[1] & 0x7c) << 1;
+        pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2);
+        pixel.z = (tga_pixel[0] & 0x1f) << 3;
+        pixel.w = (tga_pixel[1] & 0x80);
+    }
+}
+
+
+/**
+ * Reads a .tga image file
+ */
+void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename)
+{
+    // Open the file
+    FILE *fptr;
+    if ((fptr = fopen(filename, "rb")) == NULL)
+    {
+        fprintf(stderr, "File open failed\n");
+        exit(-1);
+    }
+
+    // Parse header
+    TgaHeader header;
+    header.Parse(fptr);
+//    header.Display(stdout);
+    width = header.width;
+    height = header.height;
+
+    // Verify compatibility
+    if (header.datatypecode != 2 && header.datatypecode != 10)
+    {
+        fprintf(stderr, "Can only handle image type 2 and 10\n");
+        exit(-1);
+    }
+    if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32)
+    {
+        fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n");
+        exit(-1);
+    }
+    if (header.colormaptype != 0 && header.colormaptype != 1)
+    {
+        fprintf(stderr, "Can only handle color map types of 0 and 1\n");
+        exit(-1);
+    }
+
+    // Skip unnecessary header info
+    int skip_bytes = header.idlength + (header.colormaptype * header.colormaplength);
+    fseek(fptr, skip_bytes, SEEK_CUR);
+
+    // Read the image
+    int pixel_bytes = header.bitsperpixel / 8;
+
+    // Allocate and initialize pixel data
+    size_t image_bytes = width * height * sizeof(uchar4);
+    if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
+    {
+        fprintf(stderr, "malloc of image failed\n");
+        exit(-1);
+    }
+    memset(pixels, 0, image_bytes);
+
+    // Parse pixels
+    unsigned char   tga_pixel[5];
+    int             current_pixel = 0;
+    while (current_pixel < header.width * header.height)
+    {
+        if (header.datatypecode == 2)
+        {
+            // Uncompressed
+            if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
+            {
+                fprintf(stderr, "Unexpected end of file at pixel %d  (uncompressed)\n", current_pixel);
+                exit(-1);
+            }
+            ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
+            current_pixel++;
+        }
+        else if (header.datatypecode == 10)
+        {
+            // Compressed
+            if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != pixel_bytes + 1)
+            {
+                fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel);
+                exit(-1);
+            }
+            int run_length = tga_pixel[0] & 0x7f;
+            ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
+            current_pixel++;
+
+            if (tga_pixel[0] & 0x80)
+            {
+                // RLE chunk
+                for (int i = 0; i < run_length; i++)
+                {
+                    ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
+                    current_pixel++;
+                }
+            }
+            else
+            {
+                // Normal chunk
+                for (int i = 0; i < run_length; i++)
+                {
+                    if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
+                    {
+                        fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel);
+                        exit(-1);
+                    }
+                    ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
+                    current_pixel++;
+                }
+            }
+        }
+    }
+
+    // Close file
+    fclose(fptr);
+}
+
+
+
+//---------------------------------------------------------------------
+// Random image generation
+//---------------------------------------------------------------------
+
+/**
+ * Generate a random image with specified entropy
+ */
+void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction)
+{
+    int num_pixels = width * height;
+    size_t image_bytes = num_pixels * sizeof(uchar4);
+    if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
+    {
+        fprintf(stderr, "malloc of image failed\n");
+        exit(-1);
+    }
+
+    for (int i = 0; i < num_pixels; ++i)
+    {
+        RandomBits(pixels[i].x, entropy_reduction);
+        RandomBits(pixels[i].y, entropy_reduction);
+        RandomBits(pixels[i].z, entropy_reduction);
+        RandomBits(pixels[i].w, entropy_reduction);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Histogram verification
+//---------------------------------------------------------------------
+
+// Decode float4 pixel into bins
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    float* samples = reinterpret_cast<float*>(&pixel);
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+}
+
+// Decode uchar4 pixel into bins
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+}
+
+// Decode uchar1 pixel into bins
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    bins[0] = (unsigned int) pixel.x;
+}
+
+
+// Compute reference histogram.  Specialized for uchar4
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
+{
+    memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
+
+    for (int i = 0; i < width; i++)
+    {
+        for (int j = 0; j < height; j++)
+        {
+            PixelType pixel = image[i + j * width];
+
+            unsigned int bins[ACTIVE_CHANNELS];
+            DecodePixelGold<NUM_BINS>(pixel, bins);
+
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
+            }
+        }
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Test execution
+//---------------------------------------------------------------------
+
+/**
+ * Run a specific histogram implementation
+ */
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void RunTest(
+    std::vector<std::pair<std::string, double> >&   timings,
+    PixelType*                                      d_pixels,
+    const int                                       width,
+    const int                                       height,
+    unsigned int *                                  d_hist,
+    unsigned int *                                  h_hist,
+    int                                             timing_iterations,
+    const char *                                    long_name,
+    const char *                                    short_name,
+    double (*f)(PixelType*, int, int, unsigned int*, bool))
+{
+    if (!g_report) printf("%s ", long_name); fflush(stdout);
+
+    // Run single test to verify (and code cache)
+    (*f)(d_pixels, width, height, d_hist, !g_report);
+
+    int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
+    if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+
+    double elapsed_ms = 0;
+    for (int i = 0; i < timing_iterations; i++)
+    {
+        elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
+    }
+    double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
+    timings.push_back(std::pair<std::string, double>(short_name, avg_us));
+
+    if (!g_report)
+    {
+        printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
+    }
+    else
+    {
+        printf("%.3f, ", avg_us); fflush(stdout);
+    }
+
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Evaluate corpus of histogram implementations
+ */
+template <
+    int         NUM_CHANNELS,
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void TestMethods(
+    PixelType*  h_pixels,
+    int         height,
+    int         width,
+    int         timing_iterations,
+    double      bandwidth_GBs)
+{
+    // Copy data to gpu
+    PixelType* d_pixels;
+    size_t pixel_bytes = width * height * sizeof(PixelType);
+    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
+    CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
+
+    if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
+
+    // Allocate results arrays on cpu/gpu
+    unsigned int *h_hist;
+    unsigned int *d_hist;
+    size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
+    h_hist = (unsigned int *) malloc(histogram_bytes);
+    g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
+
+    // Compute reference cpu histogram
+    HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
+
+    // Store timings
+    std::vector<std::pair<std::string, double> > timings;
+
+    // Run experiments
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+
+    // Report timings
+    if (!g_report)
+    {
+        std::sort(timings.begin(), timings.end(), less_than_value());
+        printf("Timings (us):\n");
+        for (int i = 0; i < timings.size(); i++)
+        {
+            double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
+            printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
+        }
+        printf("\n");
+    }
+
+    // Free data
+    CubDebugExit(g_allocator.DeviceFree(d_pixels));
+    CubDebugExit(g_allocator.DeviceFree(d_hist));
+    free(h_hist);
+}
+
+
+/**
+ * Test different problem genres
+ */
+void TestGenres(
+    uchar4*     uchar4_pixels,
+    int         height,
+    int         width,
+    int         timing_iterations,
+    double      bandwidth_GBs)
+{
+    int num_pixels = width * height;
+
+    {
+        if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
+
+        size_t      image_bytes     = num_pixels * sizeof(uchar1);
+        uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);
+
+        // Convert to 1-channel (averaging first 3 channels)
+        for (int i = 0; i < num_pixels; ++i)
+        {
+            uchar1_pixels[i].x = (unsigned char)
+                (((unsigned int) uchar4_pixels[i].x +
+                  (unsigned int) uchar4_pixels[i].y +
+                  (unsigned int) uchar4_pixels[i].z) / 3);
+        }
+
+        TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
+        free(uchar1_pixels);
+        if (g_report) printf(", ");
+    }
+
+    {
+        if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
+        TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
+        if (g_report) printf(", ");
+    }
+
+    {
+        if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
+        size_t      image_bytes     = num_pixels * sizeof(float4);
+        float4*     float4_pixels   = (float4*) malloc(image_bytes);
+
+        // Convert to float4 with range [0.0, 1.0)
+        for (int i = 0; i < num_pixels; ++i)
+        {
+            float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
+            float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
+            float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
+            float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
+        }
+        TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
+        free(float4_pixels);
+        if (g_report) printf("\n");
+    }
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char **argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf(
+            "%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "[--i=<timing iterations>] "
+            "\n\t"
+                "--file=<.tga filename> "
+            "\n\t"
+                "--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
+                "[--height=<default: 1080>] "
+                "[--width=<default: 1920>] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    std::string         filename;
+    int                 timing_iterations   = 100;
+    int                 entropy_reduction   = 0;
+    int                 height              = 1080;
+    int                 width               = 1920;
+
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_report = args.CheckCmdLineFlag("report");
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("file", filename);
+    args.GetCmdLineArgument("height", height);
+    args.GetCmdLineArgument("width", width);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get GPU device bandwidth (GB/s)
+    int device_ordinal, bus_width, mem_clock_khz;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
+    double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
+
+    // Run test(s)
+    uchar4* uchar4_pixels = NULL;
+    if (!g_report)
+    {
+        if (!filename.empty())
+        {
+            // Parse targa file
+            ReadTga(uchar4_pixels, width, height, filename.c_str());
+            printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
+        }
+        else
+        {
+            // Generate image
+            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
+            printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
+        }
+
+        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+    }
+    else
+    {
+        // Run test suite
+        printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
+
+        // Entropy reduction tests
+        for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
+        {
+            printf("entropy reduction %d, ", entropy_reduction);
+            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
+            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        }
+        printf("entropy reduction -1, ");
+        GenerateRandomImage(uchar4_pixels, width, height, -1);
+        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        printf("\n");
+
+        // File image tests
+        std::vector<std::string> file_tests;
+        file_tests.push_back("animals");
+        file_tests.push_back("apples");
+        file_tests.push_back("sunset");
+        file_tests.push_back("cheetah");
+        file_tests.push_back("nature");
+        file_tests.push_back("operahouse");
+        file_tests.push_back("austin");
+        file_tests.push_back("cityscape");
+
+        for (int i = 0; i < file_tests.size(); ++i)
+        {
+            printf("%s, ", file_tests[i].c_str());
+            std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
+            ReadTga(uchar4_pixels, width, height, filename.c_str());
+            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        }
+    }
+
+    free(uchar4_pixels);
+
+    CubDebugExit(cudaDeviceSynchronize());
+    printf("\n\n");
+
+    return 0;
+}
diff --git a/thrust/dependencies/cub/experimental/sparse_matrix.h b/thrust/dependencies/cub/experimental/sparse_matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fb52333118c1a2d316ba4f0c6391ce69af0a5b8
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/sparse_matrix.h
@@ -0,0 +1,1244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Matrix data structures and parsing logic
+ ******************************************************************************/
+
+#pragma once
+
+#include <cmath>
+#include <cstring>
+
+#include <iterator>
+#include <string>
+#include <algorithm>
+#include <iostream>
+#include <queue>
+#include <set>
+#include <fstream>
+#include <stdio.h>
+
+#ifdef CUB_MKL
+    #include <numa.h>
+    #include <mkl.h>
+#endif
+
+using namespace std;
+
+/******************************************************************************
+ * COO matrix type
+ ******************************************************************************/
+
+struct GraphStats
+{
+    int         num_rows;
+    int         num_cols;
+    int         num_nonzeros;
+
+    double      diag_dist_mean;         // mean
+    double      diag_dist_std_dev;      // sample std dev
+    double      pearson_r;    // coefficient of variation
+
+    double      row_length_mean;        // mean
+    double      row_length_std_dev;     // sample std_dev
+    double      row_length_variation;   // coefficient of variation
+    double      row_length_skewness;    // skewness
+
+    void Display(bool show_labels = true)
+    {
+        if (show_labels)
+            printf("\n"
+                "\t num_rows: %d\n"
+                "\t num_cols: %d\n"
+                "\t num_nonzeros: %d\n"
+                "\t diag_dist_mean: %.2f\n"
+                "\t diag_dist_std_dev: %.2f\n"
+                "\t pearson_r: %f\n"
+                "\t row_length_mean: %.5f\n"
+                "\t row_length_std_dev: %.5f\n"
+                "\t row_length_variation: %.5f\n"
+                "\t row_length_skewness: %.5f\n",
+                    num_rows,
+                    num_cols,
+                    num_nonzeros,
+                    diag_dist_mean,
+                    diag_dist_std_dev,
+                    pearson_r,
+                    row_length_mean,
+                    row_length_std_dev,
+                    row_length_variation,
+                    row_length_skewness);
+        else
+            printf(
+                "%d, "
+                "%d, "
+                "%d, "
+                "%.2f, "
+                "%.2f, "
+                "%f, "
+                "%.5f, "
+                "%.5f, "
+                "%.5f, "
+                "%.5f, ",
+                    num_rows,
+                    num_cols,
+                    num_nonzeros,
+                    diag_dist_mean,
+                    diag_dist_std_dev,
+                    pearson_r,
+                    row_length_mean,
+                    row_length_std_dev,
+                    row_length_variation,
+                    row_length_skewness);
+    }
+};
+
+
+
+/******************************************************************************
+ * COO matrix type
+ ******************************************************************************/
+
+
+/**
+ * COO matrix type.  A COO matrix is just a vector of edge tuples.  Tuples are sorted
+ * first by row, then by column.
+ */
+template<typename ValueT, typename OffsetT>
+struct CooMatrix
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // COO edge tuple
+    struct CooTuple
+    {
+        OffsetT            row;
+        OffsetT            col;
+        ValueT             val;
+
+        CooTuple() {}
+        CooTuple(OffsetT row, OffsetT col) : row(row), col(col) {}
+        CooTuple(OffsetT row, OffsetT col, ValueT val) : row(row), col(col), val(val) {}
+
+        /**
+         * Comparator for sorting COO sparse format num_nonzeros
+         */
+        bool operator<(const CooTuple &other) const
+        {
+            if ((row < other.row) || ((row == other.row) && (col < other.col)))
+            {
+                return true;
+            }
+
+            return false;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Data members
+    //---------------------------------------------------------------------
+
+    // Fields
+    int                 num_rows;
+    int                 num_cols;
+    int                 num_nonzeros;
+    CooTuple*           coo_tuples;
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    // Constructor
+    CooMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), coo_tuples(NULL) {}
+
+
+    /**
+     * Clear
+     */
+    void Clear()
+    {
+        if (coo_tuples) delete[] coo_tuples;
+        coo_tuples = NULL;
+    }
+
+
+    // Destructor
+    ~CooMatrix()
+    {
+        Clear();
+    }
+
+
+    // Display matrix to stdout
+    void Display()
+    {
+        cout << "COO Matrix (" << num_rows << " rows, " << num_cols << " columns, " << num_nonzeros << " non-zeros):\n";
+        cout << "Ordinal, Row, Column, Value\n";
+        for (int i = 0; i < num_nonzeros; i++)
+        {
+            cout << '\t' << i << ',' << coo_tuples[i].row << ',' << coo_tuples[i].col << ',' << coo_tuples[i].val << "\n";
+        }
+    }
+
+
+    /**
+     * Builds a symmetric COO sparse from an asymmetric CSR matrix.
+     */
+    template <typename CsrMatrixT>
+    void InitCsrSymmetric(CsrMatrixT &csr_matrix)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        num_rows        = csr_matrix.num_cols;
+        num_cols        = csr_matrix.num_rows;
+        num_nonzeros    = csr_matrix.num_nonzeros * 2;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        for (OffsetT row = 0; row < csr_matrix.num_rows; ++row)
+        {
+            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
+            {
+                coo_tuples[nonzero].row = row;
+                coo_tuples[nonzero].col = csr_matrix.column_indices[nonzero];
+                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
+
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].row = coo_tuples[nonzero].col;
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].col = coo_tuples[nonzero].row;
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].val = csr_matrix.values[nonzero];
+
+            }
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+    }
+
+    /**
+     * Builds a COO sparse from a relabeled CSR matrix.
+     */
+    template <typename CsrMatrixT>
+    void InitCsrRelabel(CsrMatrixT &csr_matrix, OffsetT* relabel_indices)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        num_rows        = csr_matrix.num_rows;
+        num_cols        = csr_matrix.num_cols;
+        num_nonzeros    = csr_matrix.num_nonzeros;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
+            {
+                coo_tuples[nonzero].row = relabel_indices[row];
+                coo_tuples[nonzero].col = relabel_indices[csr_matrix.column_indices[nonzero]];
+                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
+            }
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+    }
+
+
+
+    /**
+     * Builds a METIS COO sparse from the given file.
+     */
+    void InitMetis(const string &metis_filename)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        // TODO
+    }
+
+
+    /**
+     * Builds a MARKET COO sparse from the given file.
+     */
+    void InitMarket(
+        const string&   market_filename,
+        ValueT          default_value       = 1.0,
+        bool            verbose             = false)
+    {
+        if (verbose) {
+            printf("Reading... "); fflush(stdout);
+        }
+
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        std::ifstream ifs;
+        ifs.open(market_filename.c_str(), std::ifstream::in);
+        if (!ifs.good())
+        {
+            fprintf(stderr, "Error opening file\n");
+            exit(1);
+        }
+
+        bool    array = false;
+        bool    symmetric = false;
+        bool    skew = false;
+        int     current_edge = -1;
+        char    line[1024];
+
+        if (verbose) {
+            printf("Parsing... "); fflush(stdout);
+        }
+
+        while (true)
+        {
+            ifs.getline(line, 1024);
+            if (!ifs.good())
+            {
+                // Done
+                break;
+            }
+
+            if (line[0] == '%')
+            {
+                // Comment
+                if (line[1] == '%')
+                {
+                    // Banner
+                    symmetric   = (strstr(line, "symmetric") != NULL);
+                    skew        = (strstr(line, "skew") != NULL);
+                    array       = (strstr(line, "array") != NULL);
+
+                    if (verbose) {
+                        printf("(symmetric: %d, skew: %d, array: %d) ", symmetric, skew, array); fflush(stdout);
+                    }
+                }
+            }
+            else if (current_edge == -1)
+            {
+                // Problem description
+                int nparsed = sscanf(line, "%d %d %d", &num_rows, &num_cols, &num_nonzeros);
+                if ((!array) && (nparsed == 3))
+                {
+                    if (symmetric)
+                        num_nonzeros *= 2;
+
+                    // Allocate coo matrix
+                    coo_tuples = new CooTuple[num_nonzeros];
+                    current_edge = 0;
+
+                }
+                else if (array && (nparsed == 2))
+                {
+                    // Allocate coo matrix
+                    num_nonzeros = num_rows * num_cols;
+                    coo_tuples = new CooTuple[num_nonzeros];
+                    current_edge = 0;
+                }
+                else
+                {
+                    fprintf(stderr, "Error parsing MARKET matrix: invalid problem description: %s\n", line);
+                    exit(1);
+                }
+
+            }
+            else
+            {
+                // Edge
+                if (current_edge >= num_nonzeros)
+                {
+                    fprintf(stderr, "Error parsing MARKET matrix: encountered more than %d num_nonzeros\n", num_nonzeros);
+                    exit(1);
+                }
+
+                int row, col;
+                double val;
+
+                if (array)
+                {
+                    if (sscanf(line, "%lf", &val) != 1)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed current_edge: '%s' at edge %d\n", line, current_edge);
+                        exit(1);
+                    }
+                    col = (current_edge / num_rows);
+                    row = (current_edge - (num_rows * col));
+
+                    coo_tuples[current_edge] = CooTuple(row, col, val);    // Convert indices to zero-based
+                }
+                else
+                {
+                    // Parse nonzero (note: using strtol and strtod is 2x faster than sscanf or istream parsing)
+                    char *l = line;
+                    char *t = NULL;
+
+                    // parse row
+                    row = strtol(l, &t, 0);
+                    if (t == l)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed row at edge %d\n", current_edge);
+                        exit(1);
+                    }
+                    l = t;
+
+                    // parse col
+                    col = strtol(l, &t, 0);
+                    if (t == l)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed col at edge %d\n", current_edge);
+                        exit(1);
+                    }
+                    l = t;
+
+                    // parse val
+                    val = strtod(l, &t);
+                    if (t == l)
+                    {
+                        val = default_value;
+                    }
+/*
+                    int nparsed = sscanf(line, "%d %d %lf", &row, &col, &val);
+                    if (nparsed == 2)
+                    {
+                        // No value specified
+                        val = default_value;
+                        
+                    }
+                    else if (nparsed != 3)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix 1: badly formed current_edge: %d parsed at edge %d\n", nparsed, current_edge);
+                        exit(1);
+                    }
+*/
+
+                    coo_tuples[current_edge] = CooTuple(row - 1, col - 1, val);    // Convert indices to zero-based
+
+                }
+
+                current_edge++;
+
+                if (symmetric && (row != col))
+                {
+                    coo_tuples[current_edge].row = coo_tuples[current_edge - 1].col;
+                    coo_tuples[current_edge].col = coo_tuples[current_edge - 1].row;
+                    coo_tuples[current_edge].val = coo_tuples[current_edge - 1].val * (skew ? -1 : 1);
+                    current_edge++;
+                }
+            }
+        }
+
+        // Adjust nonzero count (nonzeros along the diagonal aren't reversed)
+        num_nonzeros = current_edge;
+
+        if (verbose) {
+            printf("done. Ordering..."); fflush(stdout);
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        if (verbose) {
+            printf("done. "); fflush(stdout);
+        }
+
+        ifs.close();
+    }
+
+
+    /**
+     * Builds a dense matrix
+     */
+    int InitDense(
+        OffsetT     num_rows,
+        OffsetT     num_cols,
+        ValueT      default_value   = 1.0,
+        bool        verbose         = false)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        this->num_rows  = num_rows;
+        this->num_cols  = num_cols;
+
+        num_nonzeros    = num_rows * num_cols;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            for (OffsetT col = 0; col < num_cols; ++col)
+            {
+                coo_tuples[(row * num_cols) + col] = CooTuple(row, col, default_value);
+            }
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+    /**
+     * Builds a wheel COO sparse matrix having spokes spokes.
+     */
+    int InitWheel(
+        OffsetT     spokes,
+        ValueT      default_value   = 1.0,
+        bool        verbose         = false)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        num_rows        = spokes + 1;
+        num_cols        = num_rows;
+        num_nonzeros    = spokes * 2;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        // Add spoke num_nonzeros
+        int current_edge = 0;
+        for (OffsetT i = 0; i < spokes; i++)
+        {
+            coo_tuples[current_edge] = CooTuple(0, i + 1, default_value);
+            current_edge++;
+        }
+
+        // Add rim
+        for (OffsetT i = 0; i < spokes; i++)
+        {
+            OffsetT dest = (i + 1) % spokes;
+            coo_tuples[current_edge] = CooTuple(i + 1, dest + 1, default_value);
+            current_edge++;
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+
+    /**
+     * Builds a square 2D grid CSR matrix.  Interior num_vertices have degree 5 when including
+     * a self-loop.
+     *
+     * Returns 0 on success, 1 on failure.
+     */
+    int InitGrid2d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        int     interior_nodes  = (width - 2) * (width - 2);
+        int     edge_nodes      = (width - 2) * 4;
+        int     corner_nodes    = 4;
+        num_rows                       = width * width;
+        num_cols                       = num_rows;
+        num_nonzeros                   = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2);
+
+        if (self_loop)
+            num_nonzeros += num_rows;
+
+        coo_tuples          = new CooTuple[num_nonzeros];
+        int current_edge    = 0;
+
+        for (OffsetT j = 0; j < width; j++)
+        {
+            for (OffsetT k = 0; k < width; k++)
+            {
+                OffsetT me = (j * width) + k;
+
+                // West
+                OffsetT neighbor = (j * width) + (k - 1);
+                if (k - 1 >= 0) {
+                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                    current_edge++;
+                }
+
+                // East
+                neighbor = (j * width) + (k + 1);
+                if (k + 1 < width) {
+                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                    current_edge++;
+                }
+
+                // North
+                neighbor = ((j - 1) * width) + k;
+                if (j - 1 >= 0) {
+                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                    current_edge++;
+                }
+
+                // South
+                neighbor = ((j + 1) * width) + k;
+                if (j + 1 < width) {
+                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                    current_edge++;
+                }
+
+                if (self_loop)
+                {
+                    neighbor = me;
+                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                    current_edge++;
+                }
+            }
+        }
+
+        // Sort by rows, then columns, update dims
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+
+    /**
+     * Builds a square 3D grid COO sparse matrix.  Interior num_vertices have degree 7 when including
+     * a self-loop.  Values are unintialized, coo_tuples are sorted.
+     */
+    int InitGrid3d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            return -1;
+        }
+
+        OffsetT interior_nodes  = (width - 2) * (width - 2) * (width - 2);
+        OffsetT face_nodes      = (width - 2) * (width - 2) * 6;
+        OffsetT edge_nodes      = (width - 2) * 12;
+        OffsetT corner_nodes    = 8;
+        num_cols                       = width * width * width;
+        num_rows                       = num_cols;
+        num_nonzeros                     = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3);
+
+        if (self_loop)
+            num_nonzeros += num_rows;
+
+        coo_tuples          = new CooTuple[num_nonzeros];
+        int current_edge    = 0;
+
+        for (OffsetT i = 0; i < width; i++)
+        {
+            for (OffsetT j = 0; j < width; j++)
+            {
+                for (OffsetT k = 0; k < width; k++)
+                {
+
+                    OffsetT me = (i * width * width) + (j * width) + k;
+
+                    // Up
+                    OffsetT neighbor = (i * width * width) + (j * width) + (k - 1);
+                    if (k - 1 >= 0) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    // Down
+                    neighbor = (i * width * width) + (j * width) + (k + 1);
+                    if (k + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    // West
+                    neighbor = (i * width * width) + ((j - 1) * width) + k;
+                    if (j - 1 >= 0) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    // East
+                    neighbor = (i * width * width) + ((j + 1) * width) + k;
+                    if (j + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    // North
+                    neighbor = ((i - 1) * width * width) + (j * width) + k;
+                    if (i - 1 >= 0) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    // South
+                    neighbor = ((i + 1) * width * width) + (j * width) + k;
+                    if (i + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+
+                    if (self_loop)
+                    {
+                        neighbor = me;
+                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
+                        current_edge++;
+                    }
+                }
+            }
+        }
+
+        // Sort by rows, then columns, update dims
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+};
+
+
+
+/******************************************************************************
+ * COO matrix type
+ ******************************************************************************/
+
+
+/**
+ * CSR sparse format matrix
+ */
+template<
+    typename ValueT,
+    typename OffsetT>
+struct CsrMatrix
+{
+    int         num_rows;
+    int         num_cols;
+    int         num_nonzeros;
+    OffsetT*    row_offsets;
+    OffsetT*    column_indices;
+    ValueT*     values;
+    bool        numa_malloc;
+
+    /**
+     * Constructor
+     */
+    CsrMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), row_offsets(NULL), column_indices(NULL), values(NULL) 
+    {
+#ifdef CUB_MKL
+        numa_malloc = ((numa_available() >= 0) && (numa_num_task_nodes() > 1));
+#else
+        numa_malloc = false;
+#endif
+    }
+
+
+    /**
+     * Clear
+     */
+    void Clear()
+    {
+#ifdef CUB_MKL
+        if (numa_malloc) 
+        {
+            numa_free(row_offsets, sizeof(OffsetT) * (num_rows + 1));
+            numa_free(values, sizeof(ValueT) * num_nonzeros);
+            numa_free(column_indices, sizeof(OffsetT) * num_nonzeros);
+        }
+        else
+        {
+            if (row_offsets)    mkl_free(row_offsets);
+            if (column_indices) mkl_free(column_indices);
+            if (values)         mkl_free(values);
+        }
+
+#else
+        if (row_offsets)    delete[] row_offsets;
+        if (column_indices) delete[] column_indices;
+        if (values)         delete[] values;
+#endif
+
+        row_offsets = NULL;
+        column_indices = NULL;
+        values = NULL;
+    }
+
+    /**
+     * Destructor
+     */
+    ~CsrMatrix()
+    {
+        Clear();
+    }
+
+    GraphStats Stats()
+    {
+        GraphStats stats;
+        stats.num_rows = num_rows;
+        stats.num_cols = num_cols;
+        stats.num_nonzeros = num_nonzeros;
+
+        //
+        // Compute diag-distance statistics
+        //
+
+        OffsetT samples     = 0;
+        double  mean        = 0.0;
+        double  ss_tot      = 0.0;
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+                double x                = (col > row) ? col - row : row - col;
+
+                samples++;
+                double delta            = x - mean;
+                mean                    = mean + (delta / samples);
+                ss_tot                  += delta * (x - mean);
+            }
+        }
+        stats.diag_dist_mean            = mean;
+        double variance                 = ss_tot / samples;
+        stats.diag_dist_std_dev         = sqrt(variance);
+
+
+        //
+        // Compute deming statistics
+        //
+
+        samples         = 0;
+        double mean_x   = 0.0;
+        double mean_y   = 0.0;
+        double ss_x     = 0.0;
+        double ss_y     = 0.0;
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+
+                samples++;
+                double x                = col;
+                double y                = row;
+                double delta;
+
+                delta                   = x - mean_x;
+                mean_x                  = mean_x + (delta / samples);
+                ss_x                    += delta * (x - mean_x);
+
+                delta                   = y - mean_y;
+                mean_y                  = mean_y + (delta / samples);
+                ss_y                    += delta * (y - mean_y);
+            }
+        }
+
+        samples         = 0;
+        double s_xy     = 0.0;
+        double s_xxy    = 0.0;
+        double s_xyy    = 0.0;
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+
+                samples++;
+                double x                = col;
+                double y                = row;
+
+                double xy =             (x - mean_x) * (y - mean_y);
+                double xxy =            (x - mean_x) * (x - mean_x) * (y - mean_y);
+                double xyy =            (x - mean_x) * (y - mean_y) * (y - mean_y);
+                double delta;
+
+                delta                   = xy - s_xy;
+                s_xy                    = s_xy + (delta / samples);
+
+                delta                   = xxy - s_xxy;
+                s_xxy                   = s_xxy + (delta / samples);
+
+                delta                   = xyy - s_xyy;
+                s_xyy                   = s_xyy + (delta / samples);
+            }
+        }
+
+        double s_xx     = ss_x / num_nonzeros;
+        double s_yy     = ss_y / num_nonzeros;
+
+        double deming_slope = (s_yy - s_xx + sqrt(((s_yy - s_xx) * (s_yy - s_xx)) + (4 * s_xy * s_xy))) / (2 * s_xy);
+
+        stats.pearson_r = (num_nonzeros * s_xy) / (sqrt(ss_x) * sqrt(ss_y));
+
+
+        //
+        // Compute row-length statistics
+        //
+
+        // Sample mean
+        stats.row_length_mean       = double(num_nonzeros) / num_rows;
+        variance                    = 0.0;
+        stats.row_length_skewness   = 0.0;
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT length              = row_offsets[row + 1] - row_offsets[row];
+            double delta                = double(length) - stats.row_length_mean;
+            variance   += (delta * delta);
+            stats.row_length_skewness   += (delta * delta * delta);
+        }
+        variance                    /= num_rows;
+        stats.row_length_std_dev    = sqrt(variance);
+        stats.row_length_skewness   = (stats.row_length_skewness / num_rows) / pow(stats.row_length_std_dev, 3.0);
+        stats.row_length_variation  = stats.row_length_std_dev / stats.row_length_mean;
+
+        return stats;
+    }
+
+    /**
+     * Build CSR matrix from sorted COO matrix
+     */
+    void FromCoo(const CooMatrix<ValueT, OffsetT> &coo_matrix)
+    {
+        num_rows        = coo_matrix.num_rows;
+        num_cols        = coo_matrix.num_cols;
+        num_nonzeros    = coo_matrix.num_nonzeros;
+
+#ifdef CUB_MKL
+
+        if (numa_malloc)
+        {
+            numa_set_strict(1);
+//            numa_set_bind_policy(1);
+
+//        values          = (ValueT*) numa_alloc_interleaved(sizeof(ValueT) * num_nonzeros);
+//        row_offsets     = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * (num_rows + 1));
+//        column_indices  = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * num_nonzeros);
+
+            row_offsets     = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * (num_rows + 1), 0);
+            column_indices  = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * num_nonzeros, 0);
+            values          = (ValueT*) numa_alloc_onnode(sizeof(ValueT) * num_nonzeros, 1);
+        }
+        else
+        {
+            values          = (ValueT*) mkl_malloc(sizeof(ValueT) * num_nonzeros, 4096);
+            row_offsets     = (OffsetT*) mkl_malloc(sizeof(OffsetT) * (num_rows + 1), 4096);
+            column_indices  = (OffsetT*) mkl_malloc(sizeof(OffsetT) * num_nonzeros, 4096);
+
+        }
+
+#else
+        row_offsets     = new OffsetT[num_rows + 1];
+        column_indices  = new OffsetT[num_nonzeros];
+        values          = new ValueT[num_nonzeros];
+#endif
+
+        OffsetT prev_row = -1;
+        for (OffsetT current_edge = 0; current_edge < num_nonzeros; current_edge++)
+        {
+            OffsetT current_row = coo_matrix.coo_tuples[current_edge].row;
+
+            // Fill in rows up to and including the current row
+            for (OffsetT row = prev_row + 1; row <= current_row; row++)
+            {
+                row_offsets[row] = current_edge;
+            }
+            prev_row = current_row;
+
+            column_indices[current_edge]    = coo_matrix.coo_tuples[current_edge].col;
+            values[current_edge]            = coo_matrix.coo_tuples[current_edge].val;
+        }
+
+        // Fill out any trailing edgeless vertices (and the end-of-list element)
+        for (OffsetT row = prev_row + 1; row <= num_rows; row++)
+        {
+            row_offsets[row] = num_nonzeros;
+        }
+    }
+
+
+    /**
+     * Display log-histogram to stdout
+     */
+    void DisplayHistogram()
+    {
+        // Initialize
+        int log_counts[9];
+        for (int i = 0; i < 9; i++)
+        {
+            log_counts[i] = 0;
+        }
+
+        // Scan
+        int max_log_length = -1;
+        for (OffsetT row = 0; row < num_rows; row++)
+        {
+            OffsetT length = row_offsets[row + 1] - row_offsets[row];
+
+            int log_length = -1;
+            while (length > 0)
+            {
+                length /= 10;
+                log_length++;
+            }
+            if (log_length > max_log_length)
+            {
+                max_log_length = log_length;
+            }
+
+            log_counts[log_length + 1]++;
+        }
+        printf("CSR matrix (%d rows, %d columns, %d non-zeros):\n", (int) num_rows, (int) num_cols, (int) num_nonzeros);
+        for (int i = -1; i < max_log_length + 1; i++)
+        {
+            printf("\tDegree 1e%d: \t%d (%.2f%%)\n", i, log_counts[i + 1], (float) log_counts[i + 1] * 100.0 / num_cols);
+        }
+        fflush(stdout);
+    }
+
+
+    /**
+     * Display matrix to stdout
+     */
+    void Display()
+    {
+        printf("Input Matrix:\n");
+        for (OffsetT row = 0; row < num_rows; row++)
+        {
+            printf("%d [@%d, #%d]: ", row, row_offsets[row], row_offsets[row + 1] - row_offsets[row]);
+            for (OffsetT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++)
+            {
+                printf("%d (%f), ", column_indices[current_edge], values[current_edge]);
+            }
+            printf("\n");
+        }
+        fflush(stdout);
+    }
+
+
+};
+
+
+
+/******************************************************************************
+ * Matrix transformations
+ ******************************************************************************/
+
+// Comparator for ordering rows by degree (lowest first), then by row-id (lowest first)
+template <typename OffsetT>
+struct OrderByLow
+{
+    OffsetT* row_degrees;
+    OrderByLow(OffsetT* row_degrees) : row_degrees(row_degrees) {}
+
+    bool operator()(const OffsetT &a, const OffsetT &b)
+    {
+        if (row_degrees[a] < row_degrees[b])
+            return true;
+        else if (row_degrees[a] > row_degrees[b])
+            return false;
+        else
+            return (a < b);
+    }
+};
+
+// Comparator for ordering rows by degree (highest first), then by row-id (lowest first)
+template <typename OffsetT>
+struct OrderByHigh
+{
+    OffsetT* row_degrees;
+    OrderByHigh(OffsetT* row_degrees) : row_degrees(row_degrees) {}
+
+    bool operator()(const OffsetT &a, const OffsetT &b)
+    {
+        if (row_degrees[a] > row_degrees[b])
+            return true;
+        else if (row_degrees[a] < row_degrees[b])
+            return false;
+        else
+            return (a < b);
+    }
+};
+
+
+
+/**
+ * Reverse Cuthill-McKee
+ */
+template <typename ValueT, typename OffsetT>
+void RcmRelabel(
+    CsrMatrix<ValueT, OffsetT>&     matrix,
+    OffsetT*                        relabel_indices)
+{
+    // Initialize row degrees
+    OffsetT* row_degrees_in     = new OffsetT[matrix.num_rows];
+    OffsetT* row_degrees_out    = new OffsetT[matrix.num_rows];
+    for (OffsetT row = 0; row < matrix.num_rows; ++row)
+    {
+        row_degrees_in[row]         = 0;
+        row_degrees_out[row]        = matrix.row_offsets[row + 1] - matrix.row_offsets[row];
+    }
+    for (OffsetT nonzero = 0; nonzero < matrix.num_nonzeros; ++nonzero)
+    {
+        row_degrees_in[matrix.column_indices[nonzero]]++;
+    }
+
+    // Initialize unlabeled set 
+    typedef std::set<OffsetT, OrderByLow<OffsetT> > UnlabeledSet;
+    typename UnlabeledSet::key_compare  unlabeled_comp(row_degrees_in);
+    UnlabeledSet                        unlabeled(unlabeled_comp);
+    for (OffsetT row = 0; row < matrix.num_rows; ++row)
+    {
+        relabel_indices[row]    = -1;
+        unlabeled.insert(row);
+    }
+
+    // Initialize queue set
+    std::deque<OffsetT> q;
+
+    // Process unlabeled vertices (traverse connected components)
+    OffsetT relabel_idx = 0;
+    while (!unlabeled.empty())
+    {
+        // Seed the unvisited frontier queue with the unlabeled vertex of lowest-degree
+        OffsetT vertex = *unlabeled.begin();
+        q.push_back(vertex);
+
+        while (!q.empty())
+        {
+            vertex = q.front();
+            q.pop_front();
+
+            if (relabel_indices[vertex] == -1)
+            {
+                // Update this vertex
+                unlabeled.erase(vertex);
+                relabel_indices[vertex] = relabel_idx;
+                relabel_idx++;
+
+                // Sort neighbors by degree
+                OrderByLow<OffsetT> neighbor_comp(row_degrees_in);
+                std::sort(
+                    matrix.column_indices + matrix.row_offsets[vertex],
+                    matrix.column_indices + matrix.row_offsets[vertex + 1],
+                    neighbor_comp);
+
+                // Inspect neighbors, adding to the out frontier if unlabeled
+                for (OffsetT neighbor_idx = matrix.row_offsets[vertex];
+                    neighbor_idx < matrix.row_offsets[vertex + 1];
+                    ++neighbor_idx)
+                {
+                    OffsetT neighbor = matrix.column_indices[neighbor_idx];
+                    q.push_back(neighbor);
+                }
+            }
+        }
+    }
+
+/*
+    // Reverse labels
+    for (int row = 0; row < matrix.num_rows; ++row)
+    {
+        relabel_indices[row] = matrix.num_rows - relabel_indices[row] - 1;
+    }
+*/
+
+    // Cleanup
+    if (row_degrees_in) delete[] row_degrees_in;
+    if (row_degrees_out) delete[] row_degrees_out;
+}
+
+
+/**
+ * Reverse Cuthill-McKee
+ */
+template <typename ValueT, typename OffsetT>
+void RcmRelabel(
+    CsrMatrix<ValueT, OffsetT>&     matrix,
+    bool                            verbose = false)
+{
+    // Do not process if not square
+    if (matrix.num_cols != matrix.num_rows)
+    {
+        if (verbose) {
+            printf("RCM transformation ignored (not square)\n"); fflush(stdout);
+        }
+        return;
+    }
+
+    // Initialize relabel indices
+    OffsetT* relabel_indices = new OffsetT[matrix.num_rows];
+
+    if (verbose) {
+        printf("RCM relabeling... "); fflush(stdout);
+    }
+
+    RcmRelabel(matrix, relabel_indices);
+
+    if (verbose) {
+        printf("done. Reconstituting... "); fflush(stdout);
+    }
+
+    // Create a COO matrix from the relabel indices
+    CooMatrix<ValueT, OffsetT> coo_matrix;
+    coo_matrix.InitCsrRelabel(matrix, relabel_indices);
+
+    // Reconstitute the CSR matrix from the sorted COO tuples
+    if (relabel_indices) delete[] relabel_indices;
+    matrix.Clear();
+    matrix.FromCoo(coo_matrix);
+
+    if (verbose) {
+        printf("done. "); fflush(stdout);
+    }
+}
+
+
+
+
diff --git a/thrust/dependencies/cub/experimental/spmv_compare.cu b/thrust/dependencies/cub/experimental/spmv_compare.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b64297d8db8cbfc912ca60cb1eeab51cb785d22a
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/spmv_compare.cu
@@ -0,0 +1,917 @@
+/******************************************************************************
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIAeBILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+//---------------------------------------------------------------------
+// SpMV comparison tool
+//---------------------------------------------------------------------
+
+#include <stdio.h>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cstdio>
+#include <fstream>
+
+#include <cusparse.h>
+
+#include "sparse_matrix.h"
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_spmv.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <test/test_util.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants, and type declarations
+//---------------------------------------------------------------------
+
+bool                    g_quiet     = false;        // Whether to display stats in CSV format
+bool                    g_verbose   = false;        // Whether to display output to console
+bool                    g_verbose2  = false;        // Whether to display input to console
+CachingDeviceAllocator  g_allocator(true);          // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// SpMV verification
+//---------------------------------------------------------------------
+
+// Compute reference SpMV y = Ax
+template <
+    typename ValueT,
+    typename OffsetT>
+void SpmvGold(
+    CsrMatrix<ValueT, OffsetT>&     a,
+    ValueT*                         vector_x,
+    ValueT*                         vector_y_in,
+    ValueT*                         vector_y_out,
+    ValueT                          alpha,
+    ValueT                          beta)
+{
+    for (OffsetT row = 0; row < a.num_rows; ++row)
+    {
+        ValueT partial = beta * vector_y_in[row];
+        for (
+            OffsetT offset = a.row_offsets[row];
+            offset < a.row_offsets[row + 1];
+            ++offset)
+        {
+            partial += alpha * a.values[offset] * vector_x[a.column_indices[offset]];
+        }
+        vector_y_out[row] = partial;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// GPU I/O proxy
+//---------------------------------------------------------------------
+
+/**
+ * Read every matrix nonzero value, read every corresponding vector value
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ValueT,
+    typename    OffsetT,
+    typename    VectorItr>
+__launch_bounds__ (int(BLOCK_THREADS))
+__global__ void NonZeroIoKernel(
+    SpmvParams<ValueT, OffsetT> params,
+    VectorItr                   d_vector_x)
+{
+    enum
+    {
+        TILE_ITEMS      = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+
+    ValueT nonzero = 0.0;
+
+    int tile_idx = blockIdx.x;
+
+    OffsetT block_offset = tile_idx * TILE_ITEMS;
+
+    OffsetT column_indices[ITEMS_PER_THREAD];
+    ValueT values[ITEMS_PER_THREAD];
+
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        OffsetT nonzero_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
+
+        OffsetT* ci = params.d_column_indices + nonzero_idx;
+        ValueT*a = params.d_values + nonzero_idx;
+
+        column_indices[ITEM]    = (nonzero_idx < params.num_nonzeros) ? *ci : 0;
+        values[ITEM]            = (nonzero_idx < params.num_nonzeros) ? *a : 0.0;
+    }
+
+    __syncthreads();
+
+    // Read vector
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        ValueT vector_value    = ThreadLoad<LOAD_LDG>(params.d_vector_x + column_indices[ITEM]);
+        nonzero                += vector_value * values[ITEM];
+    }
+
+    __syncthreads();
+
+    if (block_offset < params.num_rows)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT row_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
+            if (row_idx < params.num_rows)
+            {
+                OffsetT row_end_offset = ThreadLoad<LOAD_DEFAULT>(params.d_row_end_offsets + row_idx);
+
+                if ((row_end_offset >= 0) && (nonzero == nonzero))
+                    params.d_vector_y[row_idx] = nonzero;
+            }
+        }
+    }
+
+}
+
+
+/**
+ * Run GPU I/O proxy
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+float TestGpuCsrIoProxy(
+    SpmvParams<ValueT, OffsetT>&    params,
+    int                             timing_iterations)
+{
+    enum {
+        BLOCK_THREADS       = 128,
+        ITEMS_PER_THREAD    = 7,
+        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+//    size_t smem = 1024 * 16;
+    size_t smem = 1024 * 0;
+
+    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
+    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
+    unsigned int blocks = std::max(nonzero_blocks, row_blocks);
+
+    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
+    TexItr x_itr;
+    CubDebugExit(x_itr.BindTexture(params.d_vector_x));
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;
+
+
+    int spmv_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));
+
+    if (!g_quiet)
+        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
+
+    // Warmup
+    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
+
+    // Check for failures
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(SyncStream(0));
+
+    // Timing
+    GpuTimer timer;
+    float elapsed_millis = 0.0;
+    timer.Start();
+    for (int it = 0; it < timing_iterations; ++it)
+    {
+        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    CubDebugExit(x_itr.UnbindTexture());
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+
+//---------------------------------------------------------------------
+// cuSparse HybMV
+//---------------------------------------------------------------------
+
+/**
+ * Run cuSparse HYB SpMV (specialized for fp32)
+ */
+template <
+    typename OffsetT>
+float TestCusparseHybmv(
+    float*                          vector_y_in,
+    float*                          reference_vector_y_out,
+    SpmvParams<float, OffsetT>&     params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    CpuTimer cpu_timer;
+    cpu_timer.Start();
+
+    // Construct Hyb matrix
+    cusparseMatDescr_t mat_desc;
+    cusparseHybMat_t hyb_desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
+    cusparseStatus_t status = cusparseScsr2hyb(
+        cusparse,
+        params.num_rows, params.num_cols,
+        mat_desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        hyb_desc,
+        0,
+        CUSPARSE_HYB_PARTITION_AUTO);
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
+
+    cudaDeviceSynchronize();
+    cpu_timer.Stop();
+    float elapsed_millis = cpu_timer.ElapsedMillis();
+    printf("HYB setup ms, %.5f, ", elapsed_millis);
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
+        cusparse,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &params.alpha, mat_desc,
+        hyb_desc,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    elapsed_millis    = 0.0;
+    GpuTimer timer;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
+            cusparse,
+            CUSPARSE_OPERATION_NON_TRANSPOSE,
+            &params.alpha, mat_desc,
+            hyb_desc,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    // Cleanup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+/**
+ * Run cuSparse HYB SpMV (specialized for fp64)
+ */
+template <
+    typename OffsetT>
+float TestCusparseHybmv(
+    double*                         vector_y_in,
+    double*                         reference_vector_y_out,
+    SpmvParams<double, OffsetT>&    params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    CpuTimer cpu_timer;
+    cpu_timer.Start();
+
+    // Construct Hyb matrix
+    cusparseMatDescr_t mat_desc;
+    cusparseHybMat_t hyb_desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
+        cusparse,
+        params.num_rows, params.num_cols,
+        mat_desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        hyb_desc,
+        0,
+        CUSPARSE_HYB_PARTITION_AUTO));
+
+    cudaDeviceSynchronize();
+    cpu_timer.Stop();
+    float elapsed_millis = cpu_timer.ElapsedMillis();
+    printf("HYB setup ms, %.5f, ", elapsed_millis);
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
+        cusparse,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &params.alpha, mat_desc,
+        hyb_desc,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    elapsed_millis    = 0.0;
+    GpuTimer timer;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
+            cusparse,
+            CUSPARSE_OPERATION_NON_TRANSPOSE,
+            &params.alpha, mat_desc,
+            hyb_desc,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    // Cleanup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+
+//---------------------------------------------------------------------
+// cuSparse CsrMV
+//---------------------------------------------------------------------
+
+/**
+ * Run cuSparse SpMV (specialized for fp32)
+ */
+template <
+    typename OffsetT>
+float TestCusparseCsrmv(
+    float*                          vector_y_in,
+    float*                          reference_vector_y_out,
+    SpmvParams<float, OffsetT>&     params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    cusparseMatDescr_t desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
+        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    float elapsed_millis    = 0.0;
+    GpuTimer timer;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
+            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
+    return elapsed_millis / timing_iterations;
+}
+
+
+/**
+ * Run cuSparse SpMV (specialized for fp64)
+ */
+template <
+    typename OffsetT>
+float TestCusparseCsrmv(
+    double*                         vector_y_in,
+    double*                         reference_vector_y_out,
+    SpmvParams<double, OffsetT>&    params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    cusparseMatDescr_t desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
+        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    float elapsed_millis = 0.0;
+    GpuTimer timer;
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
+            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
+    return elapsed_millis / timing_iterations;
+}
+
+//---------------------------------------------------------------------
+// GPU Merge-based SpMV
+//---------------------------------------------------------------------
+
+/**
+ * Run CUB SpMV
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+float TestGpuMergeCsrmv(
+    ValueT*                         vector_y_in,
+    ValueT*                         reference_vector_y_out,
+    SpmvParams<ValueT, OffsetT>&    params,
+    int                             timing_iterations)
+{
+    // Allocate temporary storage
+    size_t temp_storage_bytes = 0;
+    void *d_temp_storage = NULL;
+
+    // Get amount of temporary storage needed
+    CubDebugExit(DeviceSpmv::CsrMV(
+        d_temp_storage, temp_storage_bytes,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, params.d_vector_y,
+        params.num_rows, params.num_cols, params.num_nonzeros,
+// params.alpha, params.beta,
+        (cudaStream_t) 0, false));
+
+    // Allocate
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    CubDebugExit(DeviceSpmv::CsrMV(
+        d_temp_storage, temp_storage_bytes,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, params.d_vector_y,
+        params.num_rows, params.num_cols, params.num_nonzeros, 
+// params.alpha, params.beta,
+        (cudaStream_t) 0, !g_quiet));
+
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    GpuTimer timer;
+    float elapsed_millis = 0.0;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        CubDebugExit(DeviceSpmv::CsrMV(
+            d_temp_storage, temp_storage_bytes,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, params.d_vector_y,
+            params.num_rows, params.num_cols, params.num_nonzeros, 
+// params.alpha, params.beta,
+            (cudaStream_t) 0, false));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    return elapsed_millis / timing_iterations;
+}
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Display perf
+ */
+template <typename ValueT, typename OffsetT>
+void DisplayPerf(
+    float                           device_giga_bandwidth,
+    double                          avg_millis,
+    CsrMatrix<ValueT, OffsetT>&     csr_matrix)
+{
+    double nz_throughput, effective_bandwidth;
+    size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
+        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));
+
+    nz_throughput       = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
+    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;
+
+    if (!g_quiet)
+        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
+            sizeof(ValueT) * 8,
+            avg_millis,
+            2 * nz_throughput,
+            effective_bandwidth,
+            effective_bandwidth / device_giga_bandwidth * 100);
+    else
+        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
+            avg_millis,
+            2 * nz_throughput,
+            effective_bandwidth,
+            effective_bandwidth / device_giga_bandwidth * 100);
+
+    fflush(stdout);
+}
+
+
+
+/**
+ * Run tests
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+void RunTest(
+    bool                        rcm_relabel,
+    ValueT                      alpha,
+    ValueT                      beta,
+    CooMatrix<ValueT, OffsetT>& coo_matrix,
+    int                         timing_iterations,
+    CommandLineArgs&            args)
+{
+    // Adaptive timing iterations: run 16 billion nonzeros through
+    if (timing_iterations == -1)
+        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));
+
+    if (!g_quiet)
+        printf("\t%d timing iterations\n", timing_iterations);
+
+    // Convert to CSR
+    CsrMatrix<ValueT, OffsetT> csr_matrix;
+    csr_matrix.FromCoo(coo_matrix);
+    if (!args.CheckCmdLineFlag("csrmv"))
+        coo_matrix.Clear();
+
+    // Relabel
+    if (rcm_relabel)
+    {
+        if (!g_quiet)
+        {
+            csr_matrix.Stats().Display();
+            printf("\n");
+            csr_matrix.DisplayHistogram();
+            printf("\n");
+            if (g_verbose2)
+                csr_matrix.Display();
+            printf("\n");
+        }
+
+        RcmRelabel(csr_matrix, !g_quiet);
+
+        if (!g_quiet) printf("\n");
+    }
+
+    // Display matrix info
+    csr_matrix.Stats().Display(!g_quiet);
+    if (!g_quiet)
+    {
+        printf("\n");
+        csr_matrix.DisplayHistogram();
+        printf("\n");
+        if (g_verbose2)
+            csr_matrix.Display();
+        printf("\n");
+    }
+    fflush(stdout);
+
+    // Allocate input and output vectors
+    ValueT* vector_x        = new ValueT[csr_matrix.num_cols];
+    ValueT* vector_y_in     = new ValueT[csr_matrix.num_rows];
+    ValueT* vector_y_out    = new ValueT[csr_matrix.num_rows];
+
+    for (int col = 0; col < csr_matrix.num_cols; ++col)
+        vector_x[col] = 1.0;
+
+    for (int row = 0; row < csr_matrix.num_rows; ++row)
+        vector_y_in[row] = 1.0;
+
+    // Compute reference answer
+    SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha, beta);
+
+    float avg_millis;
+
+    if (g_quiet) {
+        printf("%s, %s, ", args.deviceProp.name, (sizeof(ValueT) > 4) ? "fp64" : "fp32"); fflush(stdout);
+    }
+
+    // Get GPU device bandwidth (GB/s)
+    float device_giga_bandwidth = args.device_giga_bandwidth;
+
+    // Allocate and initialize GPU problem
+    SpmvParams<ValueT, OffsetT> params;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_values,          sizeof(ValueT) * csr_matrix.num_nonzeros));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_row_end_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_x,        sizeof(ValueT) * csr_matrix.num_cols));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_y,        sizeof(ValueT) * csr_matrix.num_rows));
+    params.num_rows         = csr_matrix.num_rows;
+    params.num_cols         = csr_matrix.num_cols;
+    params.num_nonzeros     = csr_matrix.num_nonzeros;
+    params.alpha            = alpha;
+    params.beta             = beta;
+
+    CubDebugExit(cudaMemcpy(params.d_values,            csr_matrix.values,          sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_row_end_offsets,   csr_matrix.row_offsets,     sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_column_indices,    csr_matrix.column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_vector_x,          vector_x,                   sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));
+
+    if (!g_quiet) printf("\n\n");
+    printf("GPU CSR I/O Prox, "); fflush(stdout);
+    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
+    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+
+    if (args.CheckCmdLineFlag("csrmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("CUB, "); fflush(stdout);
+        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+    // Initialize cuSparse
+    cusparseHandle_t cusparse;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));
+
+    if (args.CheckCmdLineFlag("csrmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("Cusparse CsrMV, "); fflush(stdout);
+        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+    if (args.CheckCmdLineFlag("hybmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("Cusparse HybMV, "); fflush(stdout);
+
+        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+
+    // Cleanup
+    if (params.d_values)            CubDebugExit(g_allocator.DeviceFree(params.d_values));
+    if (params.d_row_end_offsets)   CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
+    if (params.d_column_indices)    CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
+    if (params.d_vector_x)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
+    if (params.d_vector_y)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));
+
+    if (vector_x)                   delete[] vector_x;
+    if (vector_y_in)                delete[] vector_y_in;
+    if (vector_y_out)               delete[] vector_y_out;
+}
+
+/**
+ * Run tests
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+void RunTests(
+    bool                rcm_relabel,
+    ValueT              alpha,
+    ValueT              beta,
+    const std::string&  mtx_filename,
+    int                 grid2d,
+    int                 grid3d,
+    int                 wheel,
+    int                 dense,
+    int                 timing_iterations,
+    CommandLineArgs&    args)
+{
+    // Initialize matrix in COO form
+    CooMatrix<ValueT, OffsetT> coo_matrix;
+
+    if (!mtx_filename.empty())
+    {
+        // Parse matrix market file
+        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
+        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);
+
+        if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
+        {
+            if (!g_quiet) printf("Trivial dataset\n");
+            exit(0);
+        }
+    }
+    else if (grid2d > 0)
+    {
+        // Generate 2D lattice
+        printf("grid2d_%d, ", grid2d); fflush(stdout);
+        coo_matrix.InitGrid2d(grid2d, false);
+    }
+    else if (grid3d > 0)
+    {
+        // Generate 3D lattice
+        printf("grid3d_%d, ", grid3d); fflush(stdout);
+        coo_matrix.InitGrid3d(grid3d, false);
+    }
+    else if (wheel > 0)
+    {
+        // Generate wheel graph
+        printf("wheel_%d, ", grid2d); fflush(stdout);
+        coo_matrix.InitWheel(wheel);
+    }
+    else if (dense > 0)
+    {
+        // Generate dense graph
+        OffsetT size = 1 << 24; // 16M nnz
+        args.GetCmdLineArgument("size", size);
+
+        OffsetT rows = size / dense;
+        printf("dense_%d_x_%d, ", rows, dense); fflush(stdout);
+        coo_matrix.InitDense(rows, dense);
+    }
+    else
+    {
+        fprintf(stderr, "No graph type specified.\n");
+        exit(1);
+    }
+
+    RunTest(
+        rcm_relabel,
+        alpha,
+        beta,
+        coo_matrix,
+        timing_iterations,
+        args);
+}
+
+
+
+/**
+ * Main
+ */
+int main(int argc, char **argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf(
+            "%s "
+            "[--csrmv | --hybmv | --bsrmv ] "
+            "[--device=<device-id>] "
+            "[--quiet] "
+            "[--v] "
+            "[--i=<timing iterations>] "
+            "[--fp64] "
+            "[--rcm] "
+            "[--alpha=<alpha scalar (default: 1.0)>] "
+            "[--beta=<beta scalar (default: 0.0)>] "
+            "\n\t"
+                "--mtx=<matrix market file> "
+            "\n\t"
+                "--dense=<cols>"
+            "\n\t"
+                "--grid2d=<width>"
+            "\n\t"
+                "--grid3d=<width>"
+            "\n\t"
+                "--wheel=<spokes>"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    bool                fp64;
+    bool                rcm_relabel;
+    std::string         mtx_filename;
+    int                 grid2d              = -1;
+    int                 grid3d              = -1;
+    int                 wheel               = -1;
+    int                 dense               = -1;
+    int                 timing_iterations   = -1;
+    float               alpha               = 1.0;
+    float               beta                = 0.0;
+
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose2 = args.CheckCmdLineFlag("v2");
+    g_quiet = args.CheckCmdLineFlag("quiet");
+    fp64 = args.CheckCmdLineFlag("fp64");
+    rcm_relabel = args.CheckCmdLineFlag("rcm");
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("mtx", mtx_filename);
+    args.GetCmdLineArgument("grid2d", grid2d);
+    args.GetCmdLineArgument("grid3d", grid3d);
+    args.GetCmdLineArgument("wheel", wheel);
+    args.GetCmdLineArgument("dense", dense);
+    args.GetCmdLineArgument("alpha", alpha);
+    args.GetCmdLineArgument("beta", beta);
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Run test(s)
+    if (fp64)
+    {
+        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
+    }
+    else
+    {
+        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
+    }
+
+    CubDebugExit(cudaDeviceSynchronize());
+    printf("\n");
+
+    return 0;
+}
diff --git a/thrust/dependencies/cub/experimental/spmv_script.sh b/thrust/dependencies/cub/experimental/spmv_script.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f43204315a3d136e50c4fc8bee3d57622fa5c7be
--- /dev/null
+++ b/thrust/dependencies/cub/experimental/spmv_script.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216
+do
+	echo `date`, `$1 --dense=$i $2 $3 $4 $5 $6 $7`
+done
+
+echo
+echo
+
+for i in `ls /home/dumerrill/graphs/spmv/*.mtx`
+do
+    if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
+    then
+    	echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null`
+    fi
+done
+
+echo
+echo
+
+for i in `ls /scratch/dumerrill/graphs/mtx/*.mtx`
+#for i in `ls /cygdrive/w/Dev/UFget/mtx/*.mtx`
+do 
+    if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
+    then
+    	echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null`
+    fi
+done 
+
diff --git a/thrust/dependencies/cub/test/.gitignore b/thrust/dependencies/cub/test/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..978ba977dba7ed3214dc442ff9b157095cce3deb
--- /dev/null
+++ b/thrust/dependencies/cub/test/.gitignore
@@ -0,0 +1,3 @@
+/bin
+/link_main.obj
+/dummy/
diff --git a/thrust/dependencies/cub/test/CMakeLists.txt b/thrust/dependencies/cub/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b777cb9d6068a6b7148334c485dbcc0bb07a8e5
--- /dev/null
+++ b/thrust/dependencies/cub/test/CMakeLists.txt
@@ -0,0 +1,136 @@
+# TODO investigate whether this is really needed:
+math(EXPR CUB_TEST_ARCH "${CUB_MINIMAL_ENABLED_ARCH} * 10")
+message(STATUS "CUB Test architecture (TEST_ARCH): ${CUB_TEST_ARCH}")
+
+# Create meta targets that build all tests for a single configuration:
+foreach(cub_target IN LISTS CUB_TARGETS)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+  set(config_meta_target ${config_prefix}.tests)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+file(GLOB test_srcs
+  RELATIVE ${CUB_SOURCE_DIR}/test
+  CONFIGURE_DEPENDS
+  test_*.cu
+)
+
+## cub_add_test
+#
+# Add a test executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the test
+#   target. Useful for post-processing target information.
+# test_name: The name of the test minus "<config_prefix>.test." For example,
+#   testing/vector.cu will be "vector", and testing/cuda/copy.cu will be
+#   "cuda.copy".
+# test_src: The source file that implements the test.
+# cub_target: The reference cub target with configuration information.
+#
+function(cub_add_test target_name_var test_name test_src cub_target)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+
+  # The actual name of the test's target:
+  set(test_target ${config_prefix}.test.${test_name})
+  set(${target_name_var} ${test_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_meta_target ${config_prefix}.tests)
+  set(test_meta_target cub.all.test.${test_name})
+
+  add_executable(${test_target} "${test_src}")
+  target_link_libraries(${test_target} ${cub_target})
+  cub_clone_target_properties(${test_target} ${cub_target})
+  target_compile_definitions(${test_target} PRIVATE TEST_ARCH=${CUB_TEST_ARCH})
+  target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test")
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${test_target})
+
+  # Meta target that builds tests with this name for all configurations:
+  if (NOT TARGET ${test_meta_target})
+    add_custom_target(${test_meta_target})
+  endif()
+  add_dependencies(${test_meta_target} ${test_target})
+
+  if (CUB_ENABLE_TESTS_WITH_RDC)
+    set_target_properties(${test_target} PROPERTIES
+      CUDA_SEPARABLE_COMPILATION ON
+    )
+  endif()
+
+  add_test(NAME ${test_target}
+    COMMAND "$<TARGET_FILE:${test_target}>"
+  )
+endfunction()
+
+# Sets HAS_QUICK_VARIANT / HAS_QUICKER_VARIANT / NO_VARIANTS to True/False in
+# the calling scope.
+# Used to detect variants of unit tests depending on whether a source file
+# contains the strings "QUICK_TEST" or "QUICKER_TEST".
+function(cub_check_for_test_variants src)
+  file(READ "${src}" data)
+
+  string(FIND "${data}" "QUICK_TEST" quick_loc)
+  set(HAS_QUICK_VARIANT False PARENT_SCOPE)
+  if (NOT quick_loc EQUAL -1)
+    set(HAS_QUICK_VARIANT True PARENT_SCOPE)
+  endif()
+
+  string(FIND "${data}" "QUICKER_TEST" quicker_loc)
+  set(HAS_QUICKER_VARIANT False PARENT_SCOPE)
+  if (NOT quicker_loc EQUAL -1)
+    set(HAS_QUICKER_VARIANT True PARENT_SCOPE)
+  endif()
+
+  set(NO_VARIANTS False PARENT_SCOPE)
+  if (NOT (HAS_QUICK_VARIANT OR HAS_QUICKER_VARIANT))
+    set(NO_VARIANTS True PARENT_SCOPE)
+  endif()
+endfunction()
+
+foreach (test_src IN LISTS test_srcs)
+  # TODO: Per-test flags.
+
+  get_filename_component(test_name "${test_src}" NAME_WE)
+  string(REGEX REPLACE "^test_" "" test_name "${test_name}")
+
+  # Some tests change behavior based on whether the compiler defs QUICK_TEST
+  # and/or QUICKER_TEST are defined. Detect these and build variants for each
+  # configuration:
+  cub_check_for_test_variants("${test_src}")
+
+  foreach(cub_target IN LISTS CUB_TARGETS)
+    if (NO_VARIANTS)
+      # Only one version of this test.
+      cub_add_test(test_target ${test_name} "${test_src}" ${cub_target})
+    else()
+      # By default (no flags), the "thorough" version of the test is built:
+      cub_add_test(test_target_thorough
+        ${test_name}.thorough
+        "${test_src}"
+        ${cub_target}
+      )
+
+      # Add the other variants with appropriate suffixes:
+      if (HAS_QUICK_VARIANT)
+        cub_add_test(test_target_quick
+          ${test_name}.quick
+          "${test_src}"
+          ${cub_target}
+        )
+        target_compile_definitions(${test_target_quick} PRIVATE QUICK_TEST)
+      endif()
+
+      if (HAS_QUICKER_VARIANT)
+        cub_add_test(test_target_quicker
+          ${test_name}.quicker
+          "${test_src}"
+          ${cub_target}
+        )
+        target_compile_definitions(${test_target_quicker} PRIVATE QUICKER_TEST)
+      endif()
+    endif()
+  endforeach()
+endforeach()
diff --git a/thrust/dependencies/cub/test/Makefile b/thrust/dependencies/cub/test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..958760a87c068922e6f1840f1cef0f254ea1a698
--- /dev/null
+++ b/thrust/dependencies/cub/test/Makefile
@@ -0,0 +1,468 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>] [quickertest=<0|1>]
+#
+#-------------------------------------------------------------------------------
+
+include ../common.mk 
+ 
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# Testing mode option (quick/thorough)
+ifeq ($(quickertest), 1)
+	NVCCFLAGS += -DQUICKER_TEST
+	TEST_SUFFIX = quicker
+else ifeq ($(quicktest), 1)
+	NVCCFLAGS += -DQUICK_TEST
+	TEST_SUFFIX = quick
+else 
+	TEST_SUFFIX = thorough
+	NPPI = 
+endif
+
+
+# CUDA memcheck (enabled by default) 
+ifeq ($(memcheck), 0)
+	MEMCHECK = 
+else 
+	MEMCHECK = cuda-memcheck
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+# Includes
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+# Suffix to append to each binary
+SUFFIX = $(BIN_SUFFIX)_$(TEST_SUFFIX)
+
+# Define test arch
+DEFINES += -DTEST_ARCH=$(TEST_ARCH)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+
+BLOCK_REDUCE = 		test_block_reduce_raking \
+	 				test_block_reduce_warp_reductions		
+
+
+BLOCK_SCAN = 		test_block_scan_raking \
+	 				test_block_scan_raking_memoize \
+	 				test_block_scan_warp_scans		
+
+
+BLOCK_RADIX_SORT = 	test_block_radix_sort_keys \
+	 				test_block_radix_sort_pairs	
+
+DEVICE_RADIX_SORT = 	test_device_radix_sort \
+	 					test_device_radix_sort_segmented	
+		
+ALL = 				link \
+	 				test_iterator \
+	 				test_allocator \
+	 				test_warp_scan \
+	 				test_warp_reduce \
+	 				$(BLOCK_REDUCE) \
+	 				$(BLOCK_SCAN) \
+	 				$(BLOCK_RADIX_SORT) \
+	 				test_block_load_store \
+	 				test_block_histogram \
+				 	test_device_reduce \
+			 		test_device_histogram \
+			 		test_device_scan \
+	 				$(DEVICE_RADIX_SORT) \
+					test_device_reduce_by_key\
+					test_device_run_length_encode\
+		 			test_device_select_unique \
+					test_device_select_if 
+		
+#	 	test_grid_barrier \		fails on sm110
+#	 	test_device_seg_reduce
+		
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+
+#-------------------------------------------------------------------------------
+# make run
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_reduce : 
+	for i in $(BLOCK_REDUCE); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_scan : 
+	for i in $(BLOCK_SCAN); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_radix_sort : 
+	for i in $(BLOCK_RADIX_SORT); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_device_radix_sort : 
+	for i in $(DEVICE_RADIX_SORT); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+
+#-------------------------------------------------------------------------------
+# make link
+#-------------------------------------------------------------------------------
+
+link : bin/link_$(SUFFIX)
+
+bin/link_$(SUFFIX) : link_a.cu link_b.cu link_main.cpp $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_a.cu -c -o bin/link_a.obj
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_b.cu -c -o bin/link_b.obj
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_main.cpp bin/link_a.obj bin/link_b.obj -o bin/link_$(SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# make test_iterator 
+#-------------------------------------------------------------------------------
+
+test_iterator: bin/test_iterator_$(SUFFIX)
+
+bin/test_iterator_$(SUFFIX) : test_iterator.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_iterator_$(SUFFIX) test_iterator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_allocator 
+#-------------------------------------------------------------------------------
+
+test_allocator: bin/test_allocator_$(SUFFIX)
+
+bin/test_allocator_$(SUFFIX) : test_allocator.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_allocator_$(SUFFIX) test_allocator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+	
+#-------------------------------------------------------------------------------
+# make test_grid_barrier 
+#-------------------------------------------------------------------------------
+
+test_grid_barrier: bin/test_grid_barrier_$(SUFFIX)
+
+bin/test_grid_barrier_$(SUFFIX) : test_grid_barrier.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_grid_barrier_$(SUFFIX) test_grid_barrier.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+	
+
+#-------------------------------------------------------------------------------
+# make test_warp_scan 
+#-------------------------------------------------------------------------------
+
+test_warp_scan: bin/test_warp_scan_$(SUFFIX)
+
+bin/test_warp_scan_$(SUFFIX) : test_warp_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_scan_$(SUFFIX) test_warp_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_warp_reduce 
+#-------------------------------------------------------------------------------
+
+test_warp_reduce: bin/test_warp_reduce_$(SUFFIX)
+
+bin/test_warp_reduce_$(SUFFIX) : test_warp_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_reduce_$(SUFFIX) test_warp_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce_raking
+#-------------------------------------------------------------------------------
+
+test_block_reduce_raking: bin/test_block_reduce_raking_$(SUFFIX)
+
+bin/test_block_reduce_raking_$(SUFFIX) : test_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_reduce_raking_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce_warp_reductions 
+#-------------------------------------------------------------------------------
+
+test_block_reduce_warp_reductions: bin/test_block_reduce_warp_reductions_$(SUFFIX)
+
+bin/test_block_reduce_warp_reductions_$(SUFFIX) : test_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_WARP_REDUCTIONS $(SM_TARGETS) -o bin/test_block_reduce_warp_reductions_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce 
+#-------------------------------------------------------------------------------
+
+test_block_reduce: $(BLOCK_REDUCE)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_raking
+#-------------------------------------------------------------------------------
+
+test_block_scan_raking: bin/test_block_scan_raking_$(SUFFIX)
+
+bin/test_block_scan_raking_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_scan_raking_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_raking_memoize
+#-------------------------------------------------------------------------------
+
+test_block_scan_raking_memoize: bin/test_block_scan_raking_memoize_$(SUFFIX)
+
+bin/test_block_scan_raking_memoize_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING_MEMOIZE $(SM_TARGETS) -o bin/test_block_scan_raking_memoize_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_warp_scans
+#-------------------------------------------------------------------------------
+
+test_block_scan_warp_scans: bin/test_block_scan_warp_scans_$(SUFFIX)
+
+bin/test_block_scan_warp_scans_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_WARP_SCANS $(SM_TARGETS) -o bin/test_block_scan_warp_scans_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan 
+#-------------------------------------------------------------------------------
+
+test_block_scan: $(BLOCK_SCAN)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_load_store 
+#-------------------------------------------------------------------------------
+
+test_block_load_store: bin/test_block_load_store_$(SUFFIX)
+
+bin/test_block_load_store_$(SUFFIX) : test_block_load_store.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_load_store_$(SUFFIX) test_block_load_store.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+	
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort_keys 
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort_keys: bin/test_block_radix_sort_keys_$(SUFFIX)
+
+bin/test_block_radix_sort_keys_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_KEYS_ONLY $(SM_TARGETS) -o bin/test_block_radix_sort_keys_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort_pairs 
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort_pairs: bin/test_block_radix_sort_pairs_$(SUFFIX)
+
+bin/test_block_radix_sort_pairs_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_radix_sort_pairs_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort : $(BLOCK_RADIX_SORT)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_histogram 
+#-------------------------------------------------------------------------------
+
+test_block_histogram: bin/test_block_histogram_$(SUFFIX)
+
+bin/test_block_histogram_$(SUFFIX) : test_block_histogram.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_histogram_$(SUFFIX) test_block_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_reduce
+#-------------------------------------------------------------------------------
+
+test_device_reduce: bin/test_device_reduce_$(SUFFIX)
+
+bin/test_device_reduce_$(SUFFIX) : test_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_$(SUFFIX) test_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_histogram
+#-------------------------------------------------------------------------------
+
+test_device_histogram: bin/test_device_histogram_$(SUFFIX)
+
+bin/test_device_histogram_$(SUFFIX) : test_device_histogram.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_histogram_$(SUFFIX) test_device_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) $(NPPI) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_scan
+#-------------------------------------------------------------------------------
+
+test_device_scan: bin/test_device_scan_$(SUFFIX)
+
+bin/test_device_scan_$(SUFFIX) : test_device_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_scan_$(SUFFIX) test_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_radix_sort
+#-------------------------------------------------------------------------------
+
+test_device_radix_sort: bin/test_device_radix_sort_$(SUFFIX)
+
+bin/test_device_radix_sort_$(SUFFIX) : test_device_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_radix_sort_$(SUFFIX) test_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_radix_sort_segmented
+#-------------------------------------------------------------------------------
+
+test_device_radix_sort_segmented: bin/test_device_radix_sort_segmented_$(SUFFIX)
+
+bin/test_device_radix_sort_segmented_$(SUFFIX) : test_device_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DSEGMENTED_SORT $(SM_TARGETS) -o bin/test_device_radix_sort_segmented_$(SUFFIX) test_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_select_unique
+#-------------------------------------------------------------------------------
+
+test_device_select_unique: bin/test_device_select_unique_$(SUFFIX)
+
+bin/test_device_select_unique_$(SUFFIX) : test_device_select_unique.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_unique_$(SUFFIX) test_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_select_if
+#-------------------------------------------------------------------------------
+
+test_device_select_if: bin/test_device_select_if_$(SUFFIX)
+
+bin/test_device_select_if_$(SUFFIX) : test_device_select_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_if_$(SUFFIX) test_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_device_reduce_by_key
+#-------------------------------------------------------------------------------
+
+test_device_reduce_by_key: bin/test_device_reduce_by_key_$(SUFFIX)
+
+bin/test_device_reduce_by_key_$(SUFFIX) : test_device_reduce_by_key.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_by_key_$(SUFFIX) test_device_reduce_by_key.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_device_run_length_encode
+#-------------------------------------------------------------------------------
+
+test_device_run_length_encode: bin/test_device_run_length_encode_$(SUFFIX)
+
+bin/test_device_run_length_encode_$(SUFFIX) : test_device_run_length_encode.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_run_length_encode_$(SUFFIX) test_device_run_length_encode.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+
+
+#-------------------------------------------------------------------------------
+# make test_device_seg_reduce
+#-------------------------------------------------------------------------------
+#
+#test_device_seg_reduce: bin/test_device_seg_reduce_$(SUFFIX)
+#
+#bin/test_device_seg_reduce_$(SUFFIX) : test_device_seg_reduce.cu $(DEPS)
+#	mkdir -p bin
+#	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_seg_reduce_$(SUFFIX) test_device_seg_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
diff --git a/thrust/dependencies/cub/test/half.h b/thrust/dependencies/cub/test/half.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5255d61fec8e2ed3890bbe8a2ba833de8659fde
--- /dev/null
+++ b/thrust/dependencies/cub/test/half.h
@@ -0,0 +1,317 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * Utilities for interacting with the opaque CUDA __half type
+ */
+
+#include <stdint.h>
+#include <cuda_fp16.h>
+#include <iosfwd>
+
+#include <cub/util_type.cuh>
+
+#ifdef __GNUC__
+// There's a ton of type-punning going on in this file.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
+
+/******************************************************************************
+ * half_t
+ ******************************************************************************/
+
+/**
+ * Host-based fp16 data type compatible and convertible with __half
+ */
+struct half_t
+{
+    uint16_t __x;
+
+    /// Constructor from __half
+    __host__ __device__ __forceinline__
+    half_t(const __half &other)
+    {
+        __x = reinterpret_cast<const uint16_t&>(other);
+    }
+
+    /// Constructor from integer
+    __host__ __device__ __forceinline__
+    half_t(int a)
+    {
+        *this = half_t(float(a));
+    }
+
+    /// Default constructor
+    __host__ __device__ __forceinline__
+    half_t() : __x(0)
+    {}
+
+    /// Constructor from float
+    __host__ __device__ __forceinline__
+    half_t(float a)
+    {
+        // Stolen from Norbert Juffa
+        uint32_t ia = *reinterpret_cast<uint32_t*>(&a);
+        uint16_t ir;
+
+        ir = (ia >> 16) & 0x8000;
+
+        if ((ia & 0x7f800000) == 0x7f800000)
+        {
+            if ((ia & 0x7fffffff) == 0x7f800000)
+            {
+                ir |= 0x7c00; /* infinity */
+            }
+            else
+            {
+                ir = 0x7fff; /* canonical NaN */
+            }
+        }
+        else if ((ia & 0x7f800000) >= 0x33000000)
+        {
+            int32_t shift = (int32_t) ((ia >> 23) & 0xff) - 127;
+            if (shift > 15)
+            {
+                ir |= 0x7c00; /* infinity */
+            }
+            else
+            {
+                ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
+                if (shift < -14)
+                { /* denormal */
+                    ir |= ia >> (-1 - shift);
+                    ia = ia << (32 - (-1 - shift));
+                }
+                else
+                { /* normal */
+                    ir |= ia >> (24 - 11);
+                    ia = ia << (32 - (24 - 11));
+                    ir = ir + ((14 + shift) << 10);
+                }
+                /* IEEE-754 round to nearest of even */
+                if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1)))
+                {
+                    ir++;
+                }
+            }
+        }
+
+        this->__x = ir;
+    }
+
+    /// Cast to __half
+    __host__ __device__ __forceinline__
+    operator __half() const
+    {
+        return reinterpret_cast<const __half&>(__x);
+    }
+
+    /// Cast to float
+    __host__ __device__ __forceinline__
+    operator float() const
+    {
+        // Stolen from Andrew Kerr
+
+        int sign        = ((this->__x >> 15) & 1);
+        int exp         = ((this->__x >> 10) & 0x1f);
+        int mantissa    = (this->__x & 0x3ff);
+        uint32_t f      = 0;
+
+        if (exp > 0 && exp < 31)
+        {
+            // normal
+            exp += 112;
+            f = (sign << 31) | (exp << 23) | (mantissa << 13);
+        }
+        else if (exp == 0)
+        {
+            if (mantissa)
+            {
+                // subnormal
+                exp += 113;
+                while ((mantissa & (1 << 10)) == 0)
+                {
+                    mantissa <<= 1;
+                    exp--;
+                }
+                mantissa &= 0x3ff;
+                f = (sign << 31) | (exp << 23) | (mantissa << 13);
+            }
+            else if (sign)
+            {
+                f = 0x80000000; // negative zero
+            }
+            else
+            {
+                f = 0x0;        // zero
+            }
+        }
+        else if (exp == 31)
+        {
+            if (mantissa)
+            {
+                f = 0x7fffffff;     // not a number
+            }
+            else
+            {
+                f = (0xff << 23) | (sign << 31);    //  inf
+            }
+        }
+        return *reinterpret_cast<float const *>(&f);
+    }
+
+
+    /// Get raw storage
+    __host__ __device__ __forceinline__
+    uint16_t raw()
+    {
+        return this->__x;
+    }
+
+    /// Equality
+    __host__ __device__ __forceinline__
+    bool operator ==(const half_t &other)
+    {
+        return (this->__x == other.__x);
+    }
+
+    /// Inequality
+    __host__ __device__ __forceinline__
+    bool operator !=(const half_t &other)
+    {
+        return (this->__x != other.__x);
+    }
+
+    /// Assignment by sum
+    __host__ __device__ __forceinline__
+    half_t& operator +=(const half_t &rhs)
+    {
+        *this = half_t(float(*this) + float(rhs));
+        return *this;
+    }
+
+    /// Multiply
+    __host__ __device__ __forceinline__
+    half_t operator*(const half_t &other)
+    {
+        return half_t(float(*this) * float(other));
+    }
+
+    /// Add
+    __host__ __device__ __forceinline__
+    half_t operator+(const half_t &other)
+    {
+        return half_t(float(*this) + float(other));
+    }
+
+    /// Less-than
+    __host__ __device__ __forceinline__
+    bool operator<(const half_t &other) const
+    {
+        return float(*this) < float(other);
+    }
+
+    /// Less-than-equal
+    __host__ __device__ __forceinline__
+    bool operator<=(const half_t &other) const
+    {
+        return float(*this) <= float(other);
+    }
+
+    /// Greater-than
+    __host__ __device__ __forceinline__
+    bool operator>(const half_t &other) const
+    {
+        return float(*this) > float(other);
+    }
+
+    /// Greater-than-equal
+    __host__ __device__ __forceinline__
+    bool operator>=(const half_t &other) const
+    {
+        return float(*this) >= float(other);
+    }
+
+    /// numeric_traits<half_t>::max
+    __host__ __device__ __forceinline__
+    static half_t max() {
+        uint16_t max_word = 0x7BFF;
+        return reinterpret_cast<half_t&>(max_word);
+    }
+
+    /// numeric_traits<half_t>::lowest
+    __host__ __device__ __forceinline__
+    static half_t lowest() {
+        uint16_t lowest_word = 0xFBFF;
+        return reinterpret_cast<half_t&>(lowest_word);
+    }
+};
+
+
+/******************************************************************************
+ * I/O stream overloads
+ ******************************************************************************/
+
+/// Insert formatted \p half_t into the output stream
+std::ostream& operator<<(std::ostream &out, const half_t &x)
+{
+    out << (float)x;
+    return out;
+}
+
+
+/// Insert formatted \p __half into the output stream
+std::ostream& operator<<(std::ostream &out, const __half &x)
+{
+    return out << half_t(x);
+}
+
+
+/******************************************************************************
+ * Traits overloads
+ ******************************************************************************/
+
+template <>
+struct cub::FpLimits<half_t>
+{
+    static __host__ __device__ __forceinline__ half_t Max() { return half_t::max(); }
+
+    static __host__ __device__ __forceinline__ half_t Lowest() { return half_t::lowest(); }
+};
+
+template <> struct cub::NumericTraits<half_t> : cub::BaseTraits<FLOATING_POINT, true, false, unsigned short, half_t> {};
+
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
diff --git a/thrust/dependencies/cub/test/link_a.cu b/thrust/dependencies/cub/test/link_a.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a9b19f93d824b64ce5fa16add9374bcdfc89600
--- /dev/null
+++ b/thrust/dependencies/cub/test/link_a.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void a()
+{
+    printf("a() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/thrust/dependencies/cub/test/link_b.cu b/thrust/dependencies/cub/test/link_b.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a19ec407d90791b6c9e8cdaad874aa585fcb46cf
--- /dev/null
+++ b/thrust/dependencies/cub/test/link_b.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void b()
+{
+    printf("b() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/thrust/dependencies/cub/test/link_main.cpp b/thrust/dependencies/cub/test/link_main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef677ee03b4febf543deed0867dd46e73b42e37d
--- /dev/null
+++ b/thrust/dependencies/cub/test/link_main.cpp
@@ -0,0 +1,10 @@
+#include <stdio.h>
+
+extern void a();
+extern void b();
+
+int main()
+{
+    printf("hello world\n");
+    return 0;
+}
diff --git a/thrust/dependencies/cub/test/mersenne.h b/thrust/dependencies/cub/test/mersenne.h
new file mode 100644
index 0000000000000000000000000000000000000000..2807dede70d7b290705d0a051c4d400da60f5872
--- /dev/null
+++ b/thrust/dependencies/cub/test/mersenne.h
@@ -0,0 +1,162 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+ */
+
+#include <stdio.h>
+
+namespace mersenne {
+
+/* Period parameters */
+const unsigned int N          = 624;
+const unsigned int M          = 397;
+const unsigned int MATRIX_A   = 0x9908b0df; /* constant vector a */
+const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */
+const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */
+
+static unsigned int mt[N];  /* the array for the state vector  */
+static int mti = N + 1;     /* mti==N+1 means mt[N] is not initialized */
+
+/* initializes mt[N] with a seed */
+void init_genrand(unsigned int s)
+{
+    mt[0] = s & 0xffffffff;
+    for (mti = 1; mti < static_cast<int>(N); mti++)
+    {
+        mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
+
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for mtiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+
+        mt[mti] &= 0xffffffff;
+        /* for >32 bit machines */
+    }
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+void init_by_array(unsigned int init_key[], int key_length)
+{
+    int i, j, k;
+    init_genrand(19650218);
+    i = 1;
+    j = 0;
+    k = (static_cast<int>(N) > key_length
+	 ? static_cast<int>(N)
+	 : key_length);
+    for (; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525))
+            + init_key[j] + j;  /* non linear */
+        mt[i] &= 0xffffffff;    /* for WORDSIZE > 32 machines */
+        i++;
+        j++;
+        if (i >= static_cast<int>(N))
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+        if (j >= key_length) j = 0;
+    }
+    for (k = N - 1; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */
+        mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
+        i++;
+        if (i >= static_cast<int>(N))
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+    }
+
+    mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+unsigned int genrand_int32(void)
+{
+    unsigned int y;
+    static unsigned int mag01[2] = { 0x0, MATRIX_A };
+
+    /* mag01[x] = x * MATRIX_A  for x=0,1 */
+
+    if (mti >= static_cast<int>(N))
+    { /* generate N words at one time */
+        int kk;
+
+        if (mti == N + 1) /* if init_genrand() has not been called, */
+        init_genrand(5489); /* a defat initial seed is used */
+
+        for (kk = 0; kk < static_cast<int>(N - M); kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1];
+        }
+        for (; kk < static_cast<int>(N - 1); kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1];
+        }
+        y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1];
+
+        mti = 0;
+    }
+
+    y = mt[mti++];
+
+    /* Tempering */
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= (y >> 18);
+
+    return y;
+}
+
+
+
+} // namespace mersenne
diff --git a/thrust/dependencies/cub/test/test_allocator.cu b/thrust/dependencies/cub/test/test_allocator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f7714358c339154e570a4870b2381543daf31489
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_allocator.cu
@@ -0,0 +1,459 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for caching allocator of device memory
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>]"
+            "[--bytes=<timing bytes>]"
+            "[--i=<timing iterations>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+#if (CUB_PTX_ARCH == 0)
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get number of GPUs and current GPU
+    int num_gpus;
+    int initial_gpu;
+    int timing_iterations           = 10000;
+    int timing_bytes                = 1024 * 1024;
+
+    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
+    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("bytes", timing_bytes);
+
+    // Create default allocator (caches up to 6MB in device allocations per GPU)
+    CachingDeviceAllocator allocator;
+    allocator.debug = true;
+
+    printf("Running single-gpu tests...\n"); fflush(stdout);
+
+    //
+    // Test0
+    //
+
+    // Create a new stream
+    cudaStream_t other_stream;
+    CubDebugExit(cudaStreamCreate(&other_stream));
+
+    // Allocate 999 bytes on the current gpu in stream0
+    char *d_999B_stream0_a;
+    char *d_999B_stream0_b;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+
+    // Run some big kernel in stream 0
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
+
+    // Free d_999B_stream0_a
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+
+    // Allocate another 999 bytes in stream 0
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in stream 0
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
+
+    // Free d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Allocate 999 bytes on the current gpu in other_stream
+    char *d_999B_stream_other_a;
+    char *d_999B_stream_other_b;
+    allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream);
+
+    // Check that that we have 1 live blocks on the initial GPU (that we allocated a new one because d_999B_stream0_b is only available for stream 0 until it becomes idle)
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that that we have one cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Run some big kernel in other_stream
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
+
+    // Free d_999B_stream_other
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Check that we can now use both allocations in other_stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
+
+    // Check that that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in other_stream
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
+
+    // Free d_999B_stream_other_a and d_999B_stream_other_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(cudaStreamDestroy(other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Free all cached
+    CubDebugExit(allocator.FreeAllCached());
+
+    //
+    // Test1
+    //
+
+    // Allocate 5 bytes on the current gpu
+    char *d_5B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
+
+    // Check that that we have zero free bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test2
+    //
+
+    // Allocate 4096 bytes on the current gpu
+    char *d_4096B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
+
+    // Check that that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    //
+    // Test3
+    //
+
+    // DeviceFree d_5B
+    CubDebugExit(allocator.DeviceFree(d_5B));
+
+    // Check that that we have min_bin_bytes free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test4
+    //
+
+    // DeviceFree d_4096B
+    CubDebugExit(allocator.DeviceFree(d_4096B));
+
+    // Check that that we have the 4096 + min_bin free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
+
+    // Check that that we have 0 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+    // Check that that we have 2 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 2);
+
+    //
+    // Test5
+    //
+
+    // Allocate 768 bytes on the current gpu
+    char *d_768B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
+
+    // Check that that we have the min_bin free bytes cached on the initial gpu (4096 was reused)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test6
+    //
+
+    // Allocate max_cached_bytes on the current gpu
+    char *d_max_cached;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
+
+    // DeviceFree d_max_cached
+    CubDebugExit(allocator.DeviceFree(d_max_cached));
+
+    // Check that that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that that we still have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test7
+    //
+
+    // Free all cached blocks on all GPUs
+    CubDebugExit(allocator.FreeAllCached());
+
+    // Check that that we have 0 bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that that we have 0 cached blocks across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Check that that still we have 1 live block across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test8
+    //
+
+    // Allocate max cached bytes + 1 on the current gpu
+    char *d_max_cached_plus;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
+
+    // DeviceFree max cached bytes
+    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
+
+    // DeviceFree d_768B
+    CubDebugExit(allocator.DeviceFree(d_768B));
+
+    unsigned int power;
+    size_t rounded_bytes;
+    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
+
+    // Check that that we have 4096 free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+    // Check that that we have 1 cached blocks across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Check that that still we have 0 live block across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+#ifndef CUB_CDP
+    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
+
+    if (num_gpus > 1)
+    {
+        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
+
+        //
+        // Test9
+        //
+
+        // Allocate 768 bytes on the next gpu
+        int next_gpu = (initial_gpu + 1) % num_gpus;
+        char *d_768B_2;
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // DeviceFree d_768B on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Re-allocate 768 bytes on the next gpu
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // Re-free d_768B on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Check that that we have 4096 free bytes cached on the initial gpu
+        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+        // Check that that we have 4096 free bytes cached on the second gpu
+        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
+
+        // Check that that we have 2 cached blocks across all GPUs
+        AssertEquals(allocator.cached_blocks.size(), 2);
+
+        // Check that that still we have 0 live block across all GPUs
+        AssertEquals(allocator.live_blocks.size(), 0);
+    }
+#endif  // CUB_CDP
+
+    //
+    // Performance
+    //
+
+    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // CPU performance comparisons vs cached.  Allocate and free a 1MB block 2000 times
+    CpuTimer    cpu_timer;
+    char        *d_1024MB                       = NULL;
+    allocator.debug                             = false;
+
+    // Prime the caching allocator and the kernel
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+    CubDebugExit(allocator.DeviceFree(d_1024MB));
+    cub::EmptyKernel<void><<<1, 32>>>();
+
+    // CUDA
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    // CUB
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+    // GPU performance comparisons.  Allocate and free a 1MB block 2000 times
+    GpuTimer gpu_timer;
+
+    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // Kernel-only
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        cub::EmptyKernel<void><<<1, 32>>>();
+    }
+    gpu_timer.Stop();
+    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // CUDA
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    // CUB
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+
+#endif
+
+    printf("Success\n");
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/test/test_block_histogram.cu b/thrust/dependencies/cub/test/test_block_histogram.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b76466fc67efe509213c4e4edad933c4af32a691
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_block_histogram.cu
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <string>
+#include <typeinfo>
+
+#include <cub/block/block_histogram.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * BlockHistogram test kernel.
+ */
+template <
+    int                     BINS,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm ALGORITHM,
+    typename                T,
+    typename                HistoCounter>
+__global__ void BlockHistogramKernel(
+    T                       *d_samples,
+    HistoCounter            *d_histogram)
+{
+    // Parameterize BlockHistogram type for our thread block
+    typedef BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM> BlockHistogram;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockHistogram::TempStorage temp_storage;
+
+    // Per-thread tile data
+    T data[ITEMS_PER_THREAD];
+    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);
+
+    // Test histo (writing directly to histogram buffer in global)
+    BlockHistogram(temp_storage).Histogram(data, d_histogram);
+}
+
+
+/**
+ * Initialize problem (and solution)
+ */
+template <
+    int             BINS,
+    typename        SampleT>
+void Initialize(
+    GenMode         gen_mode,
+    SampleT         *h_samples,
+    int             *h_histograms_linear,
+    int             num_samples)
+{
+    // Init bins
+    for (int bin = 0; bin < BINS; ++bin)
+    {
+        h_histograms_linear[bin] = 0;
+    }
+
+    if (g_verbose) printf("Samples: \n");
+
+    // Initialize interleaved channel samples and histogram them correspondingly
+    for (int i = 0; i < num_samples; ++i)
+    {
+        InitValue(gen_mode, h_samples[i], i);
+        h_samples[i] %= BINS;
+
+        if (g_verbose) std::cout << CoutCast(h_samples[i]) << ", ";
+
+        h_histograms_linear[h_samples[i]]++;
+    }
+
+    if (g_verbose) printf("\n\n");
+}
+
+
+/**
+ * Test BlockHistogram
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm     ALGORITHM>
+void Test(
+    GenMode                     gen_mode)
+{
+    int num_samples = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    printf("cub::BlockHistogram %s %d %s samples (%dB), %d bins, %d threads, gen-mode %s\n",
+        (ALGORITHM == BLOCK_HISTO_SORT) ? "BLOCK_HISTO_SORT" : "BLOCK_HISTO_ATOMIC",
+        num_samples,
+        typeid(SampleT).name(),
+        (int) sizeof(SampleT),
+        BINS,
+        BLOCK_THREADS,
+        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
+    fflush(stdout);
+
+    // Allocate host arrays
+    SampleT         *h_samples          = new SampleT[num_samples];
+    int   *h_reference = new int[BINS];
+
+    // Initialize problem
+    Initialize<BINS>(gen_mode, h_samples, h_reference, num_samples);
+
+    // Allocate problem device arrays
+    SampleT         *d_samples = NULL;
+    int             *d_histogram = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples,             sizeof(SampleT) * num_samples));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram,   sizeof(int) * BINS));
+
+    // Initialize/clear device arrays
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * num_samples, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_histogram, 0, sizeof(int) * BINS));
+
+    // Run kernel
+    BlockHistogramKernel<BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<1, BLOCK_THREADS>>>(
+        d_samples,
+        d_histogram);
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults((int*) h_reference, d_histogram, BINS, g_verbose, g_verbose);
+    printf("\t%s\n\n", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (h_samples) delete[] h_samples;
+    if (h_reference) delete[] h_reference;
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+    if (d_histogram) CubDebugExit(g_allocator.DeviceFree(d_histogram));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Test different sample distributions
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm     ALGORITHM>
+void Test()
+{
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(UNIFORM);
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(INTEGER_SEED);
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(RANDOM);
+}
+
+
+/**
+ * Test different ALGORITHM
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD>
+void Test()
+{
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_SORT>();
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_ATOMIC>();
+}
+
+
+/**
+ * Test different ITEMS_PER_THREAD
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS>
+void Test()
+{
+    Test<SampleT, BINS, BLOCK_THREADS, 1>();
+    Test<SampleT, BINS, BLOCK_THREADS, 5>();
+}
+
+
+/**
+ * Test different BLOCK_THREADS
+ */
+template <
+    typename                    SampleT,
+    int                         BINS>
+void Test()
+{
+    Test<SampleT, BINS, 32>();
+    Test<SampleT, BINS, 96>();
+    Test<SampleT, BINS, 128>();
+}
+
+
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<total input samples across all channels> "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_SORT>(RANDOM);
+    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_ATOMIC>(RANDOM);
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        Test<unsigned char, 32>();
+        Test<unsigned char, 256>();
+        Test<unsigned short, 1024>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_block_load_store.cu b/thrust/dependencies/cub/test/test_block_load_store.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e35491a2e9855d9c4da6b2734a35040058a3dfc9
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_block_load_store.cu
@@ -0,0 +1,549 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockLoad and BlockStore utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <stdio.h>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/**
+ * Test load/store kernel.
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    typename            InputIteratorT,
+    typename            OutputIteratorT>
+__launch_bounds__ (BLOCK_THREADS, 1)
+__global__ void Kernel(
+    InputIteratorT    d_in,
+    OutputIteratorT    d_out_unguarded,
+    OutputIteratorT    d_out_guarded,
+    int               num_items)
+{
+    enum
+    {
+        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD
+    };
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Threadblock load/store abstraction types
+    typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
+    typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
+
+    // Shared memory type for this thread block
+    union TempStorage
+    {
+        typename BlockLoad::TempStorage     load;
+        typename BlockStore::TempStorage    store;
+    };
+
+    // Allocate temp storage in shared memory
+    __shared__ TempStorage temp_storage;
+
+    // Threadblock work bounds
+    int block_offset = blockIdx.x * TILE_SIZE;
+    int guarded_elements = num_items - block_offset;
+
+    // Tile of items
+    OutputT data[ITEMS_PER_THREAD];
+
+    // Load data
+    BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
+
+    __syncthreads();
+
+    // Store data
+    BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
+
+    __syncthreads();
+
+    // reset data
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        data[ITEM] = OutputT();
+
+    __syncthreads();
+
+    // Load data
+    BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
+
+    __syncthreads();
+
+    // Store data
+    BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
+}
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Test load/store variants
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    typename            InputIteratorT,
+    typename            OutputIteratorT>
+void TestKernel(
+    T                   *h_in,
+    InputIteratorT      d_in,
+    OutputIteratorT      d_out_unguarded_itr,
+    OutputIteratorT      d_out_guarded_itr,
+    T                   *d_out_unguarded_ptr,
+    T                   *d_out_guarded_ptr,
+    int                 grid_size,
+    int                 guarded_elements)
+{
+    int compare;
+
+    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Test with discard output iterator
+    typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
+    DiscardOutputIterator<OffsetT> discard_itr;
+
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
+        <<<grid_size, BLOCK_THREADS>>>(
+            d_in,
+            discard_itr,
+            discard_itr,
+            guarded_elements);
+
+    // Test with regular output iterator
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
+        <<<grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out_unguarded_itr,
+            d_out_guarded_itr,
+            guarded_elements);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results
+    compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
+    printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check results
+    compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose);
+    printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Test native pointer.  Specialized for sufficient resources
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM>
+void TestNative(
+    int                 grid_size,
+    float               fraction_valid,
+    Int2Type<true>      /*sufficient_resources*/)
+{
+    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+    int guarded_elements = int(fraction_valid * float(unguarded_elements));
+
+    // Allocate host arrays
+    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
+
+    // Allocate device arrays
+    T *d_in = NULL;
+    T *d_out_unguarded = NULL;
+    T *d_out_guarded = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
+    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
+    CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
+
+    // Initialize problem on host and device
+    for (int i = 0; i < unguarded_elements; ++i)
+    {
+        InitValue(INTEGER_SEED, h_in[i], i);
+    }
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
+
+    printf("TestNative "
+        "grid_size(%d) "
+        "guarded_elements(%d) "
+        "unguarded_elements(%d) "
+        "BLOCK_THREADS(%d) "
+        "ITEMS_PER_THREAD(%d) "
+        "LOAD_ALGORITHM(%d) "
+        "STORE_ALGORITHM(%d) "
+        "sizeof(T)(%d)\n",
+            grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T));
+
+    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
+        h_in,
+        (T const *) d_in,   // Test const
+        d_out_unguarded,
+        d_out_guarded,
+        d_out_unguarded,
+        d_out_guarded,
+        grid_size,
+        guarded_elements);
+
+    // Cleanup
+    if (h_in) free(h_in);
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
+    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
+}
+
+
+/**
+ * Test native pointer.  Specialized for insufficient resources
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM>
+void TestNative(
+    int                 /*grid_size*/,
+    float               /*fraction_valid*/,
+    Int2Type<false>     /*sufficient_resources*/)
+{}
+
+
+/**
+ * Test iterator.  Specialized for sufficient resources.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    CacheLoadModifier   LOAD_MODIFIER,
+    CacheStoreModifier  STORE_MODIFIER>
+void TestIterator(
+    int                 grid_size,
+    float               fraction_valid,
+    Int2Type<true>      /*sufficient_resources*/)
+{
+    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+    int guarded_elements = int(fraction_valid * float(unguarded_elements));
+
+    // Allocate host arrays
+    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
+
+    // Allocate device arrays
+    T *d_in = NULL;
+    T *d_out_unguarded = NULL;
+    T *d_out_guarded = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
+    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
+    CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
+
+    // Initialize problem on host and device
+    for (int i = 0; i < unguarded_elements; ++i)
+    {
+        InitValue(INTEGER_SEED, h_in[i], i);
+    }
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
+
+    printf("TestIterator "
+        "grid_size(%d) "
+        "guarded_elements(%d) "
+        "unguarded_elements(%d) "
+        "BLOCK_THREADS(%d) "
+        "ITEMS_PER_THREAD(%d) "
+        "LOAD_ALGORITHM(%d) "
+        "STORE_ALGORITHM(%d) "
+        "LOAD_MODIFIER(%d) "
+        "STORE_MODIFIER(%d) "
+        "sizeof(T)(%d)\n",
+            grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T));
+
+    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
+        h_in,
+        CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in),
+        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded),
+        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded),
+        d_out_unguarded,
+        d_out_guarded,
+        grid_size,
+        guarded_elements);
+
+    // Cleanup
+    if (h_in) free(h_in);
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
+    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
+}
+
+/**
+ * Test iterator.  Specialized for insufficient resources.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    CacheLoadModifier   LOAD_MODIFIER,
+    CacheStoreModifier  STORE_MODIFIER>
+void TestIterator(
+    int                 /*grid_size*/,
+    float               /*fraction_valid*/,
+    Int2Type<false>     /*sufficient_resources*/)
+{}
+
+
+/**
+ * Evaluate different pointer access types
+ */
+template <
+    typename                T,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockLoadAlgorithm      LOAD_ALGORITHM,
+    BlockStoreAlgorithm     STORE_ALGORITHM>
+void TestPointerType(
+    int             grid_size,
+    float           fraction_valid)
+{
+    // Threadblock load/store abstraction types
+    typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
+    typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
+
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)   <= 1024 * 16;
+    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage)  <= 1024 * 16;
+    static const bool sufficient_threads    = BLOCK_THREADS <= 512;
+#else
+    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)   <= 1024 * 48;
+    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage)  <= 1024 * 48;
+    static const bool sufficient_threads    = BLOCK_THREADS <= 1024;
+#endif
+
+    static const bool sufficient_resources  = sufficient_load_smem && sufficient_store_smem && sufficient_threads;
+
+    TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
+    TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
+}
+
+
+/**
+ * Evaluate different time-slicing strategies
+ */
+template <
+    typename                T,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockLoadAlgorithm      LOAD_ALGORITHM,
+    BlockStoreAlgorithm     STORE_ALGORITHM>
+void TestSlicedStrategy(
+    int             grid_size,
+    float           fraction_valid)
+{
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, true>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, false>(grid_size, fraction_valid);
+}
+
+
+
+/**
+ * Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32)
+ */
+template <
+    typename        T,
+    int             BLOCK_THREADS,
+    int             ITEMS_PER_THREAD>
+void TestStrategy(
+    int             grid_size,
+    float           fraction_valid,
+    Int2Type<false> /*is_warp_multiple*/)
+{
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
+}
+
+
+/**
+ * Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32)
+ */
+template <
+    typename        T,
+    int             BLOCK_THREADS,
+    int             ITEMS_PER_THREAD>
+void TestStrategy(
+    int             grid_size,
+    float           fraction_valid,
+    Int2Type<true>  /*is_warp_multiple*/)
+{
+    TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
+}
+
+
+/**
+ * Evaluate different register blocking
+ */
+template <
+    typename T,
+    int BLOCK_THREADS>
+void TestItemsPerThread(
+    int grid_size,
+    float fraction_valid)
+{
+    Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple;
+
+    TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple);
+    TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple);
+    TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple);
+    TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple);
+}
+
+
+/**
+ * Evaluate different thread block sizes
+ */
+template <typename T>
+void TestThreads(
+    int grid_size,
+    float fraction_valid)
+{
+    TestItemsPerThread<T, 15>(grid_size, fraction_valid);
+    TestItemsPerThread<T, 32>(grid_size, fraction_valid);
+    TestItemsPerThread<T, 72>(grid_size, fraction_valid);
+    TestItemsPerThread<T, 96>(grid_size, fraction_valid);
+    TestItemsPerThread<T, 128>(grid_size, fraction_valid);
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    TestNative<     int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>());
+    TestIterator<   int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>());
+
+#else
+
+    // Compile/run thorough tests
+    TestThreads<char>(2, 0.8f);
+    TestThreads<int>(2, 0.8f);
+    TestThreads<long>(2, 0.8f);
+    TestThreads<long2>(2, 0.8f);
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        TestThreads<double2>(2, 0.8f);
+    TestThreads<TestFoo>(2, 0.8f);
+    TestThreads<TestBar>(2, 0.8f);
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_block_radix_sort.cu b/thrust/dependencies/cub/test/test_block_radix_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6929dcdf5bf5946779c9cea6d2cea726514b4499
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_block_radix_sort.cu
@@ -0,0 +1,721 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockRadixSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+#include <iostream>
+
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/// Specialized descending, blocked -> blocked
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     &stop,
+    Int2Type<true>              is_descending,
+    Int2Type<true>              is_blocked_output)
+{
+    BlockRadixSort(temp_storage).SortDescending(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectBlocked(threadIdx.x, d_keys, keys);
+    StoreDirectBlocked(threadIdx.x, d_values, values);
+}
+
+/// Specialized descending, blocked -> striped
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     &stop,
+    Int2Type<true>              is_descending,
+    Int2Type<false>             is_blocked_output)
+{
+    BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
+}
+
+/// Specialized ascending, blocked -> blocked
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     &stop,
+    Int2Type<false>             is_descending,
+    Int2Type<true>              is_blocked_output)
+{
+    BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectBlocked(threadIdx.x, d_keys, keys);
+    StoreDirectBlocked(threadIdx.x, d_values, values);
+}
+
+/// Specialized ascending, blocked -> striped
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     &stop,
+    Int2Type<false>             is_descending,
+    Int2Type<false>             is_blocked_output)
+{
+    BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
+}
+
+
+
+/**
+ * BlockRadixSort kernel
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    int                 RADIX_BITS,
+    bool                MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm  INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig SMEM_CONFIG,
+    int                 DESCENDING,
+    int                 BLOCKED_OUTPUT,
+    typename            Key,
+    typename            Value>
+__launch_bounds__ (BLOCK_THREADS, 1)
+__global__ void Kernel(
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     *d_elapsed)
+{
+    // Threadblock load/store abstraction types
+    typedef BlockRadixSort<
+            Key,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            Value,
+            RADIX_BITS,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG>
+        BlockRadixSortT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockRadixSortT::TempStorage temp_storage;
+
+    // Items per thread
+    Key     keys[ITEMS_PER_THREAD];
+    Value   values[ITEMS_PER_THREAD];
+
+    LoadDirectBlocked(threadIdx.x, d_keys, keys);
+    LoadDirectBlocked(threadIdx.x, d_values, values);
+
+    // Start cycle timer
+    clock_t stop;
+    clock_t start = clock();
+
+    TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
+        temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());
+
+    // Store time
+    if (threadIdx.x == 0)
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Simple key-value pairing
+ */
+template <
+    typename Key,
+    typename Value,
+    bool IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
+struct Pair
+{
+    Key     key;
+    Value   value;
+
+    bool operator<(const Pair &b) const
+    {
+        return (key < b.key);
+    }
+};
+
+/**
+ * Simple key-value pairing (specialized for floating point types)
+ */
+template <typename Key, typename Value>
+struct Pair<Key, Value, true>
+{
+    Key     key;
+    Value   value;
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key)
+            return true;
+
+        if (key > b.key)
+            return false;
+
+        // Key in unsigned bits
+        typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+
+        // Return true if key is negative zero and b.key is positive zero
+        UnsignedBits key_bits   = SafeBitCast<UnsignedBits>(key);
+        UnsignedBits b_key_bits = SafeBitCast<UnsignedBits>(b.key);
+        UnsignedBits HIGH_BIT   = Traits<Key>::HIGH_BIT;
+
+        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
+    }
+};
+
+
+/**
+ * Initialize key-value sorting problem.
+ */
+template <bool DESCENDING, typename Key, typename Value>
+void Initialize(
+    GenMode         gen_mode,
+    Key             *h_keys,
+    Value           *h_values,
+    Key             *h_reference_keys,
+    Value           *h_reference_values,
+    int             num_items,
+    int             entropy_reduction,
+    int             begin_bit,
+    int             end_bit)
+{
+    (void)entropy_reduction; // unused
+
+    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_keys[i], i);
+
+        RandomBits(h_values[i]);
+
+        // Mask off unwanted portions
+        int num_bits = end_bit - begin_bit;
+        if ((begin_bit > 0) || (end_bit < static_cast<int>(sizeof(Key) * 8)))
+        {
+            unsigned long long base = 0;
+            memcpy(&base, &h_keys[i], sizeof(Key));
+            base &= ((1ull << num_bits) - 1) << begin_bit;
+            memcpy(&h_keys[i], &base, sizeof(Key));
+        }
+
+        h_pairs[i].key    = h_keys[i];
+        h_pairs[i].value  = h_values[i];
+    }
+
+    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);
+    std::stable_sort(h_pairs, h_pairs + num_items);
+    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_reference_keys[i]     = h_pairs[i].key;
+        h_reference_values[i]   = h_pairs[i].value;
+    }
+
+    delete[] h_pairs;
+}
+
+
+
+
+/**
+ * Test BlockRadixSort kernel
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestDriver(
+    GenMode                 gen_mode,
+    int                     entropy_reduction,
+    int                     begin_bit,
+    int                     end_bit)
+{
+    enum
+    {
+        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
+        KEYS_ONLY = Equals<Value, NullType>::VALUE,
+    };
+
+    // Allocate host arrays
+    Key     *h_keys             = new Key[TILE_SIZE];
+    Key     *h_reference_keys   = new Key[TILE_SIZE];
+    Value   *h_values           = new Value[TILE_SIZE];
+    Value   *h_reference_values = new Value[TILE_SIZE];
+
+    // Allocate device arrays
+    Key     *d_keys     = NULL;
+    Value   *d_values   = NULL;
+    clock_t *d_elapsed  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(Key) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+
+    // Initialize problem and solution on host
+    Initialize<DESCENDING>(gen_mode, h_keys, h_values, h_reference_keys, h_reference_values,
+        TILE_SIZE, entropy_reduction, begin_bit, end_bit);
+
+    // Copy problem to device
+    CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(Key) * TILE_SIZE, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+    printf("%s "
+        "BLOCK_THREADS(%d) "
+        "ITEMS_PER_THREAD(%d) "
+        "RADIX_BITS(%d) "
+        "MEMOIZE_OUTER_SCAN(%d) "
+        "INNER_SCAN_ALGORITHM(%d) "
+        "SMEM_CONFIG(%d) "
+        "DESCENDING(%d) "
+        "BLOCKED_OUTPUT(%d) "
+        "sizeof(Key)(%d) "
+        "sizeof(Value)(%d) "
+        "gen_mode(%d), "
+        "entropy_reduction(%d) "
+        "begin_bit(%d) "
+        "end_bit(%d), "
+        "samples(%d)\n",
+            ((KEYS_ONLY) ? "Keys-only" : "Key-value"),
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            RADIX_BITS,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            DESCENDING,
+            BLOCKED_OUTPUT,
+            (int) sizeof(Key),
+            (int) sizeof(Value),
+            gen_mode,
+            entropy_reduction,
+            begin_bit,
+            end_bit,
+            g_num_rand_samples);
+
+    // Set shared memory config
+    cudaDeviceSetSharedMemConfig(SMEM_CONFIG);
+
+    // Run kernel
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT><<<1, BLOCK_THREADS>>>(
+        d_keys, d_values, begin_bit, end_bit, d_elapsed);
+
+    // Flush kernel output / errors
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check keys results
+    printf("\tKeys: ");
+    int compare = CompareDeviceResults(h_reference_keys, d_keys, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check value results
+    if (!KEYS_ONLY)
+    {
+        printf("\tValues: ");
+        int compare = CompareDeviceResults(h_reference_values, d_values, TILE_SIZE, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+    printf("\n");
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+    printf("\n");
+
+    // Cleanup
+    if (h_keys)             delete[] h_keys;
+    if (h_reference_keys)   delete[] h_reference_keys;
+    if (h_values)           delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+    if (d_keys)             CubDebugExit(g_allocator.DeviceFree(d_keys));
+    if (d_values)           CubDebugExit(g_allocator.DeviceFree(d_values));
+    if (d_elapsed)          CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test driver (valid tile size <= MAX_SMEM_BYTES)
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestValid(Int2Type<true> /*fits_smem_capacity*/)
+{
+    // Iterate begin_bit
+    for (int begin_bit = 0; begin_bit <= 1; begin_bit++)
+    {
+        // Iterate end bit
+        for (int end_bit = begin_bit + 1;
+             end_bit <= static_cast<int>(sizeof(Key) * 8);
+             end_bit = end_bit * 2 + begin_bit)
+        {
+            // Uniform key distribution
+            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                UNIFORM, 0, begin_bit, end_bit);
+
+            // Sequential key distribution
+            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                INTEGER_SEED, 0, begin_bit, end_bit);
+
+            // Iterate random with entropy_reduction
+            for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3)
+            {
+                TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                    RANDOM, entropy_reduction, begin_bit, end_bit);
+            }
+        }
+    }
+}
+
+
+/**
+ * Test driver (invalid tile size)
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestValid(Int2Type<false> fits_smem_capacity)
+{}
+
+
+/**
+ * Test ascending/descending and to-blocked/to-striped
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    typename                Key,
+    typename                Value>
+void Test()
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;
+
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    Int2Type<sizeof(typename BlockRadixSortT::TempStorage) <= 16 * 1024> fits_smem_capacity;
+#else
+    Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity;
+#endif
+
+    // Sort-ascending, to-striped
+    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);
+
+    // Sort-descending, to-blocked
+    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);
+
+    // Not necessary
+//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
+//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
+}
+
+
+/**
+ * Test value type and smem config
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    typename                Key>
+void TestKeys()
+{
+    // Test keys-only sorting with both smem configs
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, NullType>();    // Keys-only (4-byte smem bank config)
+#if !defined(SM100) && !defined(SM110) && !defined(SM130) && !defined(SM200)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeEightByte, Key, NullType>();   // Keys-only (8-byte smem bank config)
+#endif
+}
+
+
+/**
+ * Test value type and smem config
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    typename                Key>
+void TestKeysAndPairs()
+{
+    // Test pairs sorting with only 4-byte configs
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>();        // With small-values
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>();         // With same-values
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>();     // With large values
+}
+
+
+/**
+ * Test key type
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM>
+void Test()
+{
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef TEST_KEYS_ONLY
+
+    // Test unsigned types with keys-only
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned char>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned short>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned int>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned long>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned long long>();
+
+#else
+
+    // Test signed and fp types with paired values
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, char>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, short>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, int>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, long>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, long long>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, float>();
+    if (ptx_version > 120)
+    {
+        // Don't check doubles on PTX120 or below because they're down-converted
+        TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, double>();
+    }
+
+#endif
+}
+
+
+/**
+ * Test inner scan algorithm
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, BLOCK_SCAN_RAKING>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, BLOCK_SCAN_WARP_SCANS>();
+}
+
+
+/**
+ * Test outer scan algorithm
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, true>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, false>();
+}
+
+
+/**
+ * Test radix bits
+ */
+template <
+    int BLOCK_THREADS,
+    int ITEMS_PER_THREAD>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 5>();
+}
+
+
+/**
+ * Test items per thread
+ */
+template <int BLOCK_THREADS>
+void Test()
+{
+    Test<BLOCK_THREADS, 1>();
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    // Open64 compiler can't handle the number of test cases
+#else
+    Test<BLOCK_THREADS, 4>();
+#endif
+    Test<BLOCK_THREADS, 11>();
+}
+
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    {
+        typedef float T;
+        TestDriver<32, 4, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(INTEGER_SEED, 0, 0, sizeof(T) * 8);
+    }
+/*
+    // Compile/run quick tests
+    typedef unsigned int T;
+    TestDriver<64, 17, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+    TestDriver<96, 8, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+    TestDriver<128, 2, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+*/
+
+#else
+
+    // Compile/run thorough tests
+    Test<32>();
+    Test<64>();
+    Test<160>();
+
+
+#endif  // QUICK_TEST
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_block_reduce.cu b/thrust/dependencies/cub/test/test_block_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2b439b40617d3a221882aa3715d5a90df3959e44
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_block_reduce.cu
@@ -0,0 +1,823 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cuda_runtime_api.h>
+#include <typeinfo>
+
+#include <cub/block/block_reduce.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/util_debug.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;
+int                     g_repeat        = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/// Generic reduction (full, 1)
+template <typename BlockReduceT, typename T, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[1], ReductionOp &reduction_op)
+{
+    return block_reduce.Reduce(data[0], reduction_op);
+}
+
+/// Generic reduction (full, ITEMS_PER_THREAD)
+template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], ReductionOp &reduction_op)
+{
+    return block_reduce.Reduce(data, reduction_op);
+}
+
+/// Generic reduction (partial, 1)
+template <typename BlockReduceT, typename T, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T &data, ReductionOp &reduction_op, int valid_threads)
+{
+    return block_reduce.Reduce(data, reduction_op, valid_threads);
+}
+
+/// Sum reduction (full, 1)
+template <typename BlockReduceT, typename T>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[1], Sum &reduction_op)
+{
+    return block_reduce.Sum(data[0]);
+}
+
+/// Sum reduction (full, ITEMS_PER_THREAD)
+template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op)
+{
+    return block_reduce.Sum(data);
+}
+
+/// Sum reduction (partial, 1)
+template <typename BlockReduceT, typename T>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads)
+{
+    return block_reduce.Sum(data, valid_threads);
+}
+
+
+/**
+ * Test full-tile reduction kernel (where num_items is an even
+ * multiple of BLOCK_THREADS)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void FullTileReduceKernel(
+    T                       *d_in,
+    T                       *d_out,
+    ReductionOp             reduction_op,
+    int                     tiles,
+    clock_t                 *d_elapsed)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Cooperative thread block reduction utility type (returns aggregate in thread 0)
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data
+    T data[ITEMS_PER_THREAD];
+
+    // Load first tile of data
+    int block_offset = 0;
+
+    if (block_offset < TILE_SIZE * tiles)
+    {
+        LoadDirectBlocked(linear_tid, d_in + block_offset, data);
+        block_offset += TILE_SIZE;
+
+        // Start cycle timer
+        clock_t start = clock();
+
+        // Cooperative reduce first tile
+        BlockReduceT block_reduce(temp_storage) ;
+        T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
+
+        // Stop cycle timer
+ #if CUB_PTX_ARCH == 100
+        // Bug: recording stop clock causes mis-write of running prefix value
+        clock_t stop = 0;
+#else
+        clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+        clock_t elapsed = (start > stop) ? start - stop : stop - start;
+
+        // Loop over input tiles
+        while (block_offset < TILE_SIZE * tiles)
+        {
+            // TestBarrier between thread block reductions
+            __syncthreads();
+    
+            // Load tile of data
+            LoadDirectBlocked(linear_tid, d_in + block_offset, data);
+            block_offset += TILE_SIZE;
+
+            // Start cycle timer
+            clock_t start = clock();
+
+            // Cooperatively reduce the tile's aggregate
+            BlockReduceT block_reduce(temp_storage) ;
+            T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
+
+            // Stop cycle timer
+#if CUB_PTX_ARCH == 100
+            // Bug: recording stop clock causes mis-write of running prefix value
+            clock_t stop = 0;
+#else
+            clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+            elapsed += (start > stop) ? start - stop : stop - start;
+
+            // Reduce thread block aggregate
+            block_aggregate = reduction_op(block_aggregate, tile_aggregate);
+        }
+
+        // Store data
+        if (linear_tid == 0)
+        {
+            d_out[0] = block_aggregate;
+            *d_elapsed = elapsed;
+        }
+    }
+}
+
+
+
+/**
+ * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void PartialTileReduceKernel(
+    T                       *d_in,
+    T                       *d_out,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    clock_t                 *d_elapsed)
+{
+    // Cooperative thread block reduction utility type (returns aggregate only in thread-0)
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data
+    T partial;
+
+    // Load partial tile data
+    if (linear_tid < num_items)
+    {
+        partial = d_in[linear_tid];
+    }
+
+    // Start cycle timer
+    clock_t start = clock();
+
+    // Cooperatively reduce the tile's aggregate
+    BlockReduceT block_reduce(temp_storage) ;
+    T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);
+
+    // Stop cycle timer
+#if CUB_PTX_ARCH == 100
+    // Bug: recording stop clock causes mis-write of running prefix value
+    clock_t stop = 0;
+#else
+    clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+
+    clock_t elapsed = (start > stop) ? start - stop : stop - start;
+
+    // Store data
+    if (linear_tid == 0)
+    {
+        d_out[0] = tile_aggregate;
+        *d_elapsed = elapsed;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem (and solution)
+ */
+template <
+    typename    T,
+    typename    ReductionOp>
+void Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           h_reference[1],
+    ReductionOp reduction_op,
+    int         num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+        if (i == 0)
+            h_reference[0] = h_in[0];
+        else
+            h_reference[0] = reduction_op(h_reference[0], h_in[i]);
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n");
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Full tile test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Test full-tile reduction.  (Specialized for sufficient resources)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op,
+    Int2Type<true>          /*sufficient_resources*/)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    int num_items = TILE_SIZE * tiles;
+
+    // Allocate host arrays
+    T *h_in = new T[num_items];
+    T h_reference[1];
+
+    // Initialize problem
+    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);
+
+    // Initialize/clear device arrays
+    T       *d_in = NULL;
+    T       *d_out = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));
+
+    // Test multi-tile (unguarded)
+    printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n",
+        Equals<ReductionOp, Sum>::VALUE ? "Sum" : "Max",
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
+        gen_mode,
+        num_items,
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        ITEMS_PER_THREAD,
+        tiles,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+    FullTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD><<<1, block_dims>>>(
+        d_in,
+        d_out,
+        reduction_op,
+        tiles,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tReduction results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test full-tile reduction.  (Specialized for insufficient resources)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op,
+    Int2Type<false>         sufficient_resources)
+{}
+
+
+/**
+ * Test full-tile reduction.
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
+
+    enum 
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem       = (sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024),
+        sufficient_threads    = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512),
+#else
+        sufficient_smem       = (sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024),
+        sufficient_threads    = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024),
+#endif
+    };
+
+    TestFullTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
+}
+
+
+/**
+ * Run battery of tests for different thread block dimensions
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);
+}
+
+/**
+ * Run battery of tests for different thread items
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, T>(gen_mode, tiles, reduction_op);
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 4, T>(gen_mode, tiles, reduction_op);
+}
+
+
+/**
+ * Run battery of full-tile tests for different numbers of tiles
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    ReductionOp             reduction_op)
+{
+    for (int tiles = 1; tiles < 3; tiles++)
+    {
+        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(gen_mode, tiles, reduction_op);
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Partial-tile test generation
+//---------------------------------------------------------------------
+
+/**
+ * Test partial-tile reduction.  (Specialized for sufficient resources)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    Int2Type<true>          /*sufficient_resources*/)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS;
+
+    // Allocate host arrays
+    T *h_in = new T[num_items];
+    T h_reference[1];
+
+    // Initialize problem
+    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);
+
+    // Initialize/clear device arrays
+    T       *d_in = NULL;
+    T       *d_out = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));
+
+    printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n",
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
+        gen_mode,
+        num_items,
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+    PartialTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z><<<1, block_dims>>>(
+        d_in,
+        d_out,
+        num_items,
+        reduction_op,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tReduction results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+
+/**
+ * Test partial-tile reduction (specialized for insufficient resources)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    Int2Type<false>         sufficient_resources)
+{}
+
+
+/**
+ *  Run battery of partial-tile tests for different numbers of effective threads and thread dimensions
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
+
+    enum 
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 16 * 1024,
+        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 512,
+#else
+        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 48 * 1024,
+        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 1024,
+#endif
+    };
+
+    TestPartialTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, T>(gen_mode, num_items, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
+}
+
+
+
+/**
+ *  Run battery of partial-tile tests for different numbers of effective threads and thread dimensions
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    ReductionOp             reduction_op)
+{
+    for (
+        int num_items = 1;
+        num_items < BLOCK_THREADS;
+        num_items += CUB_MAX(1, BLOCK_THREADS / 5))
+    {
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, 1, 1, T>(gen_mode, num_items, reduction_op);
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, 2, 2, T>(gen_mode, num_items, reduction_op);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Run battery of full-tile tests for different gen modes
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void Test(
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
+    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
+
+    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
+    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
+
+    if (Traits<T>::CATEGORY != FLOATING_POINT)
+    {
+        // Don't test randomly-generated floats b/c of stability
+        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
+    }
+}
+
+
+/**
+ * Run battery of tests for different block-reduction algorithmic variants
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        T,
+    typename        ReductionOp>
+void Test(
+    ReductionOp     reduction_op)
+{
+  (void)reduction_op;
+#ifdef TEST_RAKING
+    Test<BLOCK_REDUCE_RAKING, BLOCK_THREADS, T>(reduction_op);
+    Test<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, BLOCK_THREADS, T>(reduction_op);
+#endif
+#ifdef TEST_WARP_REDUCTIONS
+    Test<BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_THREADS, T>(reduction_op);
+#endif
+}
+
+
+/**
+ * Run battery of tests for different block sizes
+ */
+template <
+    typename        T,
+    typename        ReductionOp>
+void Test(
+    ReductionOp     reduction_op)
+{
+    Test<7,   T>(reduction_op);
+    Test<32,  T>(reduction_op);
+    Test<63,  T>(reduction_op);
+    Test<97,  T>(reduction_op);
+    Test<128, T>(reduction_op);
+    Test<238, T>(reduction_op);
+}
+
+
+/**
+ * Run battery of tests for different block sizes
+ */
+template <typename T>
+void Test()
+{
+    Test<T>(Sum());
+    Test<T>(Max());
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+
+
+    printf("\n full tile ------------------------\n\n");
+
+    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 4, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 4, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 4, int>(RANDOM, 1, Sum());
+
+    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 1, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 1, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 1, int>(RANDOM, 1, Sum());
+
+    printf("\n partial tile ------------------------\n\n");
+
+    TestPartialTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, int>(RANDOM, 7, Sum());
+    TestPartialTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, int>(RANDOM, 7, Sum());
+    TestPartialTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, int>(RANDOM, 7, Sum());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // primitives
+        Test<char>();
+        Test<short>();
+        Test<int>();
+        Test<long long>();
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            Test<double>();
+
+        Test<float>();
+
+        // vector types
+        Test<char2>();
+        Test<short2>();
+        Test<int2>();
+        Test<longlong2>();
+
+        Test<char4>();
+        Test<short4>();
+        Test<int4>();
+        Test<longlong4>();
+
+        // Complex types
+        Test<TestFoo>();
+        Test<TestBar>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
diff --git a/thrust/dependencies/cub/test/test_block_scan.cu b/thrust/dependencies/cub/test/test_block_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d3c340d2b59b032bfeb3a647bb43d0cae872e79c
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_block_scan.cu
@@ -0,0 +1,932 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+#include <limits>
+#include <typeinfo>
+
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;
+int                     g_repeat        = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/**
+ * Primitive variant to test
+ */
+enum TestMode
+{
+    BASIC,
+    AGGREGATE,
+    PREFIX,
+};
+
+
+/**
+ * Scan mode to test
+ */
+enum ScanMode
+{
+    EXCLUSIVE,
+    INCLUSIVE
+};
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op;
+
+    WrapperFunctor(OpT op) : op(op) {}
+
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return op(a, b);
+    }
+};
+
+
+/**
+ * Stateful prefix functor
+ */
+template <
+    typename T,
+    typename ScanOpT>
+struct BlockPrefixCallbackOp
+{
+    int     linear_tid;
+    T       prefix;
+    ScanOpT  scan_op;
+
+    __device__ __forceinline__
+    BlockPrefixCallbackOp(int linear_tid, T prefix, ScanOpT scan_op) :
+        linear_tid(linear_tid),
+        prefix(prefix),
+        scan_op(scan_op)
+    {}
+
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+        // For testing purposes
+        T retval = (linear_tid == 0) ? prefix  : T();
+        prefix = scan_op(prefix, block_aggregate);
+        return retval;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Exclusive scan
+//---------------------------------------------------------------------
+
+/// Exclusive scan (BASIC, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op);
+}
+
+/// Exclusive scan (BASIC, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, initial_value, scan_op);
+}
+
+/// Exclusive scan (AGGREGATE, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op, block_aggregate);
+}
+
+/// Exclusive scan (AGGREGATE, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, initial_value, scan_op, block_aggregate);
+}
+
+/// Exclusive scan (PREFIX, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], scan_op, prefix_op);
+}
+
+/// Exclusive scan (PREFIX, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, scan_op, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Exclusive sum
+//---------------------------------------------------------------------
+
+/// Exclusive sum (BASIC, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0]);
+}
+
+/// Exclusive sum (BASIC, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data);
+}
+
+/// Exclusive sum (AGGREGATE, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0], block_aggregate);
+}
+
+/// Exclusive sum (AGGREGATE, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data, block_aggregate);
+}
+
+/// Exclusive sum (PREFIX, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0], prefix_op);
+}
+
+/// Exclusive sum (PREFIX, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Inclusive scan
+//---------------------------------------------------------------------
+
+/// Inclusive scan (BASIC, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op);
+}
+
+/// Inclusive scan (BASIC, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op);
+}
+
+/// Inclusive scan (AGGREGATE, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op, block_aggregate);
+}
+
+/// Inclusive scan (AGGREGATE, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op, block_aggregate);
+}
+
+/// Inclusive scan (PREFIX, 1)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op, prefix_op);
+}
+
+/// Inclusive scan (PREFIX, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Inclusive sum
+//---------------------------------------------------------------------
+
+/// Inclusive sum (BASIC, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0]);
+}
+
+/// Inclusive sum (BASIC, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data);
+}
+
+/// Inclusive sum (AGGREGATE, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0], block_aggregate);
+}
+
+/// Inclusive sum (AGGREGATE, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data, block_aggregate);
+}
+
+/// Inclusive sum (PREFIX, 1)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0], prefix_op);
+}
+
+/// Inclusive sum (PREFIX, ITEMS_PER_THREAD)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data, prefix_op);
+}
+
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * BlockScan test kernel.
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            T,
+    typename            ScanOpT>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void BlockScanKernel(
+    T                   *d_in,
+    T                   *d_out,
+    T                   *d_aggregate,
+    ScanOpT              scan_op,
+    T                   initial_value,
+    clock_t             *d_elapsed)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Parameterize BlockScan type for our thread block
+    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockScanT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data
+    T data[ITEMS_PER_THREAD];
+    LoadDirectBlocked(linear_tid, d_in, data);
+
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test scan
+    T                                   block_aggregate;
+    BlockScanT                          block_scan(temp_storage);
+    BlockPrefixCallbackOp<T, ScanOpT>   prefix_op(linear_tid, initial_value, scan_op);
+
+    DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
+        Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());
+
+    // Stop cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Store output
+    StoreDirectBlocked(linear_tid, d_out, data);
+
+    // Store block_aggregate
+    if (TEST_MODE != BASIC)
+        d_aggregate[linear_tid] = block_aggregate;
+
+    // Store prefix
+    if (TEST_MODE == PREFIX)
+    {
+        if (linear_tid == 0)
+            d_out[TILE_SIZE] = prefix_op.prefix;
+    }
+
+    // Store time
+    if (linear_tid == 0)
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize exclusive-scan problem (and solution)
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         num_items,
+    ScanOpT     scan_op,
+    T           initial_value,
+    Int2Type<EXCLUSIVE>)
+{
+    InitValue(gen_mode, h_in[0], 0);
+
+    T block_aggregate   = h_in[0];
+    h_reference[0]      = initial_value;
+    T inclusive         = scan_op(initial_value, h_in[0]);
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+        h_reference[i] = inclusive;
+        inclusive = scan_op(inclusive, h_in[i]);
+        block_aggregate = scan_op(block_aggregate, h_in[i]);
+    }
+
+    return block_aggregate;
+}
+
+
+/**
+ * Initialize inclusive-scan problem (and solution)
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         num_items,
+    ScanOpT      scan_op,
+    T           initial_value,
+    Int2Type<INCLUSIVE>)
+{
+    InitValue(gen_mode, h_in[0], 0);
+
+    T block_aggregate   = h_in[0];
+    T inclusive         = scan_op(initial_value, h_in[0]);
+    h_reference[0]      = inclusive;
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+        inclusive = scan_op(inclusive, h_in[i]);
+        block_aggregate = scan_op(block_aggregate, h_in[i]);
+        h_reference[i] = inclusive;
+    }
+
+    return block_aggregate;
+}
+
+
+/**
+ * Test thread block scan.  (Specialized for sufficient resources)
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             gen_mode,
+    ScanOpT             scan_op,
+    T                   initial_value,
+    Int2Type<true>      /*sufficient_resources*/)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays
+    T *h_in = new T[TILE_SIZE];
+    T *h_reference = new T[TILE_SIZE];
+    T *h_aggregate = new T[BLOCK_THREADS];
+
+    // Initialize problem
+    T block_aggregate = Initialize(
+        gen_mode,
+        h_in,
+        h_reference,
+        TILE_SIZE,
+        scan_op,
+        initial_value,
+        Int2Type<SCAN_MODE>());
+
+    // Test reference block_aggregate is returned in all threads
+    for (int i = 0; i < BLOCK_THREADS; ++i)
+    {
+        h_aggregate[i] = block_aggregate;
+    }
+
+    // Run kernel
+    printf("Test-mode %d, gen-mode %d, policy %d, %s %s BlockScan, %d (%d,%d,%d) thread block threads, %d items per thread, %d tile size, %s (%d bytes) elements:\n",
+        TEST_MODE, gen_mode, ALGORITHM,
+        (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(),
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        ITEMS_PER_THREAD,  TILE_SIZE,
+        typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Initialize/clear device arrays
+    T       *d_in = NULL;
+    T       *d_out = NULL;
+    T       *d_aggregate = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1)));
+    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+        {
+            std::cout << CoutCast(h_in[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Run block_aggregate/prefix kernel
+    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+    BlockScanKernel<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM><<<1, block_dims>>>(
+        d_in,
+        d_out,
+        d_aggregate,
+        scan_op,
+        initial_value,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tScan results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (TEST_MODE == AGGREGATE)
+    {
+        // Copy out and display block_aggregate
+        printf("\tScan block aggregate: ");
+        compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    if (TEST_MODE == PREFIX)
+    {
+        // Copy out and display updated prefix
+        printf("\tScan running total: ");
+        T running_total = scan_op(initial_value, block_aggregate);
+        compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_aggregate) delete[] h_aggregate;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test thread block scan.  (Specialized for insufficient resources)
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             /*gen_mode*/,
+    ScanOpT             /*scan_op*/,
+    T                   /*initial_value*/,
+    Int2Type<false>     /*sufficient_resources*/)
+{}
+
+
+/**
+ * Test thread block scan.
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             gen_mode,
+    ScanOpT             scan_op,
+    T                   initial_value)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
+
+    enum
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),
+        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 512),
+#else
+        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),
+        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 1024),
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+        // Accommodate ptxas crash bug (access violation) on Windows
+        special_skip            = ((TEST_ARCH <= 130) && (Equals<T, TestBar>::VALUE) && (BLOCK_DIM_Z > 1)),
+#else
+        special_skip            = false,
+#endif
+        sufficient_resources    = (sufficient_smem && sufficient_threads && !special_skip),
+    };
+
+    Test<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(
+        gen_mode, scan_op, initial_value, Int2Type<sufficient_resources>());
+}
+
+
+
+/**
+ * Run test for different thread block dimensions
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+    Test<BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);
+    Test<BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);
+}
+
+
+/**
+ * Run test for different policy types
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    ScanMode    SCAN_MODE,
+    TestMode    TEST_MODE,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+  (void)gen_mode;
+  (void)scan_op;
+  (void)initial_value;
+#ifdef TEST_RAKING
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING>(gen_mode, scan_op, initial_value);
+#endif
+#ifdef TEST_RAKING_MEMOIZE
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING_MEMOIZE>(gen_mode, scan_op, initial_value);
+#endif
+#ifdef TEST_WARP_SCANS
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_WARP_SCANS>(gen_mode, scan_op, initial_value);
+#endif
+}
+
+
+/**
+ * Run tests for different primitive variants
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           identity,
+    T           initial_value)
+{
+    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, scan_op, identity);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, scan_op, identity);
+
+    // Exclusive (non-specialized, so we can use initial-value)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+
+    // Inclusive
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, BASIC>(gen_mode, scan_op, identity);      // This scan doesn't take an initial value
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);  // This scan doesn't take an initial value
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, PREFIX>(gen_mode, scan_op, initial_value);
+}
+
+
+/**
+ * Run tests for different problem-generation options
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    ScanOpT     scan_op,
+    T           identity,
+    T           initial_value)
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(UNIFORM, scan_op, identity, initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(INTEGER_SEED, scan_op, identity, initial_value);
+
+    // Don't test randomly-generated floats b/c of stability
+    if (Traits<T>::CATEGORY != FLOATING_POINT)
+        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(RANDOM, scan_op, identity, initial_value);
+}
+
+
+/**
+ * Run tests for different data types and scan ops
+ */
+template <
+    int BLOCK_THREADS,
+    int ITEMS_PER_THREAD>
+void Test()
+{
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // primitive
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned char) 0, (unsigned char) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned short) 0, (unsigned short) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned int) 0, (unsigned int) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned long long) 0, (unsigned long long) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (float) 0, (float) 99);
+
+    // primitive (alternative scan op)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<char>::min(), (char) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<short>::min(), (short) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<int>::min(), (int) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<long long>::min(), (long long) 99);
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<double>::max() * -1, (double) 99);
+
+    // vec-1
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar1(0), make_uchar1(17));
+
+    // vec-2
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar2(0, 0), make_uchar2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ushort2(0, 0), make_ushort2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uint2(0, 0), make_uint2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21));
+
+    // vec-4
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85));
+
+    // complex
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestBar(0, 0), TestBar(17, 21));
+
+}
+
+
+/**
+ * Run tests for different items per thread
+ */
+template <int BLOCK_THREADS>
+void Test()
+{
+    Test<BLOCK_THREADS, 1>();
+    Test<BLOCK_THREADS, 2>();
+    Test<BLOCK_THREADS, 9>();
+}
+
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
+
+    // Compile/run quick tests
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), int(0));
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING_MEMOIZE>(UNIFORM, Sum(), int(0));
+
+    Test<128, 1, 1, 2, INCLUSIVE, PREFIX, BLOCK_SCAN_RAKING>(INTEGER_SEED, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), make_longlong4(17, 21, 32, 85));
+
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Run tests for different thread block sizes
+        Test<17>();
+        Test<32>();
+        Test<62>();
+        Test<65>();
+//            Test<96>();             // TODO: file bug for UNREACHABLE error for Test<96, 9, BASIC, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), NullType(), make_ulonglong2(17, 21));
+        Test<128>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_histogram.cu b/thrust/dependencies/cub/test/test_device_histogram.cu
new file mode 100644
index 0000000000000000000000000000000000000000..326856b10fd34a2a632ccb947c05e1a4572db3ea
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_histogram.cu
@@ -0,0 +1,1692 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <algorithm>
+#include <typeinfo>
+
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+    #include <npp.h>
+#endif
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_histogram.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    NPP,        // NPP method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+bool                    g_verbose_input     = false;
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+
+
+//---------------------------------------------------------------------
+// Dispatch to NPP histogram
+//---------------------------------------------------------------------
+
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+
+/**
+ * Dispatch to single-channel 8b NPP histo-even
+ */
+template <typename CounterT, typename LevelT, typename OffsetT>
+//CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<1>             num_channels,
+    Int2Type<1>             num_active_channels,
+    Int2Type<NPP>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef unsigned char SampleT;
+
+    cudaError_t error = cudaSuccess;
+    NppiSize oSizeROI = {
+        num_row_pixels,
+        num_rows
+    };
+
+    if (d_temp_storage_bytes == NULL)
+    {
+        int nDeviceBufferSize;
+        nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
+        temp_storage_bytes = nDeviceBufferSize;
+    }
+    else
+    {
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            // compute the histogram
+            nppiHistogramEven_8u_C1R(
+                d_samples,
+                row_stride_bytes,
+                oSizeROI,
+                d_histogram[0],
+                num_levels[0],
+                lower_level[0],
+                upper_level[0],
+                (Npp8u*) d_temp_storage);
+        }
+    }
+
+    return error;
+}
+
+
+/**
+ * Dispatch to 3/4 8b NPP histo-even
+ */
+template <typename CounterT, typename LevelT, typename OffsetT>
+//CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<4>          num_channels,
+    Int2Type<3>   num_active_channels,
+    Int2Type<NPP>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef unsigned char SampleT;
+
+    cudaError_t error = cudaSuccess;
+    NppiSize oSizeROI = {
+        num_row_pixels,
+        num_rows
+    };
+
+    if (d_temp_storage_bytes == NULL)
+    {
+        int nDeviceBufferSize;
+        nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
+        temp_storage_bytes = nDeviceBufferSize;
+    }
+    else
+    {
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            // compute the histogram
+            nppiHistogramEven_8u_AC4R(
+                d_samples,
+                row_stride_bytes,
+                oSizeROI,
+                d_histogram,
+                num_levels,
+                lower_level,
+                upper_level,
+                (Npp8u*) d_temp_storage);
+        }
+    }
+
+    return error;
+}
+
+
+#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
+
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceHistogram entrypoints
+//---------------------------------------------------------------------
+
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
+struct Dispatch;
+
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
+struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
+{
+    /**
+     * Dispatch to CUB multi histogram-range entrypoint
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,
+        size_t                  */*d_temp_storage_bytes*/,
+        cudaError_t             */*d_cdp_error*/,
+
+        void*               d_temp_storage,
+        size_t&             temp_storage_bytes,
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 *num_levels,                                ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              *(&d_levels)[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,
+        bool                debug_synchronous)
+    {
+        cudaError_t error = cudaSuccess;
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram,
+                num_levels,
+                d_levels,
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes,
+                stream,
+                debug_synchronous);
+        }
+        return error;
+    }
+
+
+    /**
+     * Dispatch to CUB multi histogram-even entrypoint
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,
+        size_t                  */*d_temp_storage_bytes*/,
+        cudaError_t             */*d_cdp_error*/,
+
+        void*               d_temp_storage,
+        size_t&             temp_storage_bytes,
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 *num_levels,            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              *lower_level,           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              *upper_level,           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,
+        bool                debug_synchronous)
+    {
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        cudaError_t error = cudaSuccess;
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram,
+                num_levels,
+                lower_level,
+                upper_level,
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes,
+                stream,
+                debug_synchronous);
+        }
+        return error;
+    }
+
+};
+
+
+template <>
+struct Dispatch<1, 1, CUB>
+{
+
+    /**
+     * Dispatch to CUB single histogram-range entrypoint
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,
+        size_t                  */*d_temp_storage_bytes*/,
+        cudaError_t             */*d_cdp_error*/,
+
+        void*               d_temp_storage,
+        size_t&             temp_storage_bytes,
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           (&d_histogram)[1],                      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 *num_levels,                            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              (&d_levels)[1],                         ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,
+        bool                debug_synchronous)
+    {
+        cudaError_t error = cudaSuccess;
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            error = DeviceHistogram::HistogramRange(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram[0],
+                num_levels[0],
+                d_levels[0],
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes,
+                stream,
+                debug_synchronous);
+        }
+        return error;
+    }
+
+
+    /**
+     * Dispatch to CUB single histogram-even entrypoint
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,
+        size_t                  */*d_temp_storage_bytes*/,
+        cudaError_t             */*d_cdp_error*/,
+
+        void*               d_temp_storage,
+        size_t&             temp_storage_bytes,
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           (&d_histogram)[1],                      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 *num_levels,                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              *lower_level,                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              *upper_level,                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,
+        bool                debug_synchronous)
+    {
+        cudaError_t error = cudaSuccess;
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            error = DeviceHistogram::HistogramEven(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram[0],
+                num_levels[0],
+                lower_level[0],
+                upper_level[0],
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes,
+                stream,
+                debug_synchronous);
+        }
+        return error;
+    }
+
+};
+
+
+
+//---------------------------------------------------------------------
+// CUDA nested-parallelism test kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceHistogram
+ * /
+template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
+__global__ void CnpDispatchKernel(
+    Int2Type<ALGORITHM> algorithm,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    SampleT             *d_samples,
+    SampleIteratorT      d_sample_itr,
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
+    int                 num_samples,
+    bool                debug_synchronous)
+{
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/ **
+ * Dispatch to CDP kernel
+ * /
+template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
+cudaError_t Dispatch(
+    Int2Type<ALGORITHM> algorithm,
+    Int2Type<true>      use_cdp,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleT             *d_samples,
+    SampleIteratorT      d_sample_itr,
+    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
+    int                 num_samples,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+*/
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+// Searches for bin given a list of bin-boundary levels
+template <typename LevelT>
+struct SearchTransform
+{
+    LevelT          *levels;      // Pointer to levels array
+    int             num_levels;   // Number of levels in array
+
+    // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
+        if (bin < 0)
+        {
+            // Sample out of range
+            return num_levels;
+        }
+        return bin;
+    }
+};
+
+
+// Scales samples to evenly-spaced bins
+template <typename LevelT>
+struct ScaleTransform
+{
+    int    num_levels;  // Number of levels in array
+    LevelT max;         // Max sample level (exclusive)
+    LevelT min;         // Min sample level (inclusive)
+    LevelT scale;       // Bin scaling factor
+
+    void Init(
+        int    num_levels,  // Number of levels in array
+        LevelT max,         // Max sample level (exclusive)
+        LevelT min,         // Min sample level (inclusive)
+        LevelT scale)       // Bin scaling factor
+    {
+        this->num_levels = num_levels;
+        this->max = max;
+        this->min = min;
+        this->scale = scale;
+    }
+
+    // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        if ((sample < min) || (sample >= max))
+        {
+            // Sample out of range
+            return num_levels;
+        }
+
+        return (int) ((((LevelT) sample) - min) / scale);
+    }
+};
+
+// Scales samples to evenly-spaced bins
+template <>
+struct ScaleTransform<float>
+{
+    int   num_levels;  // Number of levels in array
+    float max;         // Max sample level (exclusive)
+    float min;         // Min sample level (inclusive)
+    float scale;       // Bin scaling factor
+
+    void Init(
+        int    num_levels,  // Number of levels in array
+        float max,         // Max sample level (exclusive)
+        float min,         // Min sample level (inclusive)
+        float scale)       // Bin scaling factor
+    {
+        this->num_levels = num_levels;
+        this->max = max;
+        this->min = min;
+        this->scale = 1.0f / scale;
+    }
+
+    // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        if ((sample < min) || (sample >= max))
+        {
+            // Sample out of range
+            return num_levels;
+        }
+
+        return (int) ((((float) sample) - min) * scale);
+    }
+};
+
+
+/**
+ * Generate sample
+ */
+template <typename T, typename LevelT>
+void Sample(T &datum, LevelT max_level, int entropy_reduction)
+{
+    unsigned int max = (unsigned int) -1;
+    unsigned int bits;
+    RandomBits(bits, entropy_reduction);
+    float fraction = (float(bits) / max);
+
+    datum = (T) (fraction * max_level);
+}
+
+
+/**
+ * Initialize histogram samples
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        LevelT,
+    typename        SampleT,
+    typename        OffsetT>
+void InitializeSamples(
+    LevelT          max_level,
+    int             entropy_reduction,
+    SampleT         *h_samples,
+    OffsetT         num_row_pixels,         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    // Initialize samples
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Sample offset
+                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
+
+                // Init sample value
+                Sample(h_samples[offset], max_level, entropy_reduction);
+                if (g_verbose_input)
+                {
+                    if (channel > 0) printf(", ");
+                    std::cout << CoutCast(h_samples[offset]);
+                }
+            }
+        }
+    }
+}
+
+
+/**
+ * Initialize histogram solutions
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        SampleIteratorT,
+    typename        TransformOp,
+    typename        OffsetT>
+void InitializeBins(
+    SampleIteratorT h_samples,
+    int             num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    TransformOp     transform_op[NUM_ACTIVE_CHANNELS],      ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    CounterT        *h_histogram[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    OffsetT         num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    // Init bins
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
+        {
+            h_histogram[CHANNEL][bin] = 0;
+        }
+    }
+
+    // Initialize samples
+    if (g_verbose_input) printf("Samples: \n");
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            if (g_verbose_input) printf("[");
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Sample offset
+                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
+
+                // Update sample bin
+                int bin = transform_op[channel](h_samples[offset]);
+                if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
+                if ((bin >= 0) && (bin < num_levels[channel] - 1))
+                {
+                    // valid bin
+                    h_histogram[channel][bin]++;
+                }
+            }
+            if (g_verbose_input) printf("]");
+        }
+        if (g_verbose_input) printf("\n\n");
+    }
+}
+
+
+
+/**
+ * Test histogram-even
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT,
+    typename        SampleIteratorT>
+void TestEven(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    SampleIteratorT h_samples,
+    SampleIteratorT d_samples)
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
+        (IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
+        (int) (num_row_pixels * num_rows),
+        (int) num_rows,
+        (int) num_row_pixels,
+        (int) row_stride_bytes,
+        (int) total_samples,
+        (int) sizeof(SampleT),
+        typeid(SampleT).name(),
+        entropy_reduction,
+        typeid(CounterT).name(),
+        NUM_ACTIVE_CHANNELS,
+        NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+        std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
+    fflush(stdout);
+
+    // Allocate and initialize host and device data
+
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int bins = num_levels[channel] - 1;
+        h_histogram[channel] = new CounterT[bins];
+
+        transform_op[channel].Init(
+            num_levels[channel],
+            upper_level[channel],
+            lower_level[channel],
+            ((upper_level[channel] - lower_level[channel]) / bins));
+    }
+
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+
+    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 8;
+    char*   canary_zone     = new char[canary_bytes];
+
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Check canary zones
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, false);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+    }
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even (native pointer input)
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenNative(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    // Allocate and initialize host sample data
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+    SampleT*                    h_samples = new Foo[total_samples];
+
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+    SampleT* d_samples = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
+
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        h_samples, d_samples);
+
+    // Cleanup
+    if (h_samples) delete[] h_samples;
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+}
+
+
+/**
+ * Test histogram-even (native pointer input)
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenIterator(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    SampleT sample = (SampleT) lower_level[0];
+    ConstantInputIterator<SampleT> sample_itr(sample);
+
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        sample_itr, sample_itr);
+
+}
+
+
+/**
+ * Test histogram-range
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
+        (int) (num_row_pixels * num_rows),
+        (int) num_rows,
+        (int) num_row_pixels,
+        (int) row_stride_bytes,
+        (int) total_samples,
+        (int) sizeof(SampleT),
+        typeid(SampleT).name(),
+        entropy_reduction,
+        typeid(CounterT).name(),
+        NUM_ACTIVE_CHANNELS,
+        NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
+        std::cout << levels[channel][0];
+        for (int level = 1; level < num_levels[channel]; ++level)
+            std::cout << ", " << levels[channel][level];
+        printf("]\n");
+    }
+    fflush(stdout);
+
+    // Allocate and initialize host and device data
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+    SampleT*                    h_samples = new Foo[total_samples];
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        transform_op[channel].levels = levels[channel];
+        transform_op[channel].num_levels = num_levels[channel];
+
+        int bins = num_levels[channel] - 1;
+        h_histogram[channel] = new CounterT[bins];
+    }
+
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+    SampleT*        d_samples = NULL;
+    LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
+    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
+        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
+
+        int bins = num_levels[channel] - 1;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 9;
+    char*   canary_zone     = new char[canary_bytes];
+
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Check canary zones
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, false);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (h_samples) delete[] h_samples;
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+
+        if (d_levels[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
+    }
+
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEven(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    LevelT lower_level[NUM_ACTIVE_CHANNELS];
+    LevelT upper_level[NUM_ACTIVE_CHANNELS];
+
+    // Find smallest level increment
+    int max_bins = max_num_levels - 1;
+    LevelT min_level_increment = max_level / max_bins;
+
+    // Set upper and lower levels for each channel
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int num_bins = num_levels[channel] - 1;
+        lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
+        upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
+    }
+
+    // Test pointer-based samples
+    TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Test iterator-based samples (CUB-only)
+    TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+}
+
+
+
+/**
+ * Test histogram-range
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Find smallest level increment
+    int max_bins = max_num_levels - 1;
+    LevelT min_level_increment = max_level / max_bins;
+
+    LevelT* levels[NUM_ACTIVE_CHANNELS];
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        levels[channel] = new LevelT[num_levels[channel]];
+
+        int num_bins = num_levels[channel] - 1;
+        LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;
+
+        for (int level = 0; level < num_levels[channel]; ++level)
+            levels[channel][level] = lower_level + (level * min_level_increment);
+    }
+
+    TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+        delete[] levels[channel];
+
+}
+
+
+
+/**
+ * Test different entrypoints
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+
+    TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+/**
+ * Test different number of levels
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    int num_levels[NUM_ACTIVE_CHANNELS];
+
+// Unnecessary testing
+//    // All the same level
+//    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+//    {
+//        num_levels[channel] = max_num_levels;
+//    }
+//    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+//        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+
+    // All different levels
+    num_levels[0] = max_num_levels;
+    for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
+    }
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+
+/**
+ * Test different entropy-levels
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, 0,   max_level, max_num_levels);
+
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, -1,  max_level, max_num_levels);
+
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, 5,   max_level, max_num_levels);
+}
+
+
+/**
+ * Test different row strides
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
+
+    // No padding
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
+
+    // 13 samples padding
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
+}
+
+
+/**
+ * Test different problem sizes
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // 0 row/col images
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(0), max_level, max_num_levels);
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(0), OffsetT(0), max_level, max_num_levels);
+
+    // 1080 image
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
+
+    // Sample different aspect ratios sizes
+    for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
+    {
+        for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
+        {
+            Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+                cols, rows, max_level, max_num_levels);
+        }
+    }
+
+    // Randomly select linear problem size between 1:10,000,000
+    unsigned int max_int = (unsigned int) -1;
+    for (int i = 0; i < 4; ++i)
+    {
+        unsigned int num_items;
+        RandomBits(num_items);
+        num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
+        num_items = CUB_MAX(1, num_items);
+
+        Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+            OffsetT(num_items), 1, max_level, max_num_levels);
+    }
+}
+
+
+
+/**
+ * Test different channel interleavings (valid specialiation)
+ */
+template <
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestChannels(
+    LevelT          max_level,
+    int             max_num_levels,
+    Int2Type<true>  /*is_valid_tag*/)
+{
+    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+    Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+    Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+}
+
+
+/**
+ * Test different channel interleavings (invalid specialiation)
+ */
+template <
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestChannels(
+    LevelT          /*max_level*/,
+    int             /*max_num_levels*/,
+    Int2Type<false> /*is_valid_tag*/)
+{}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_row_pixels = -1;
+    int entropy_reduction = 0;
+    int num_rows = 1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+    args.GetCmdLineArgument("n", num_row_pixels);
+
+    int row_stride_pixels = num_row_pixels;
+
+    args.GetCmdLineArgument("rows", num_rows);
+    args.GetCmdLineArgument("stride", row_stride_pixels);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+    bool compare_npp = args.CheckCmdLineFlag("npp");
+#endif
+
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<pixels per row>] "
+            "[--rows=<number of rows>] "
+            "[--stride=<row stride in pixels>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--entropy=<entropy-reduction factor (default 0)>]"
+            "[--v] "
+            "[--cdp]"
+            "[--npp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    if (num_row_pixels < 0)
+    {
+        num_row_pixels      = 1920 * 1080;
+        row_stride_pixels   = num_row_pixels;
+    }
+
+#if defined(QUICKER_TEST)
+
+    // Compile/run quick tests
+    {
+        // HistogramEven: unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        // The NPP path doesn't compile as of 2020-06:
+        // No Dispatch<int, int, NPP> specialization defined.
+//        if (compare_npp)
+//            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramRange: signed char 256 bins
+        typedef signed char         SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    {
+        // HistogramEven: unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        // The NPP path doesn't compile as of 2020-06:
+        // No Dispatch<int, int, NPP> specialization defined.
+//        if (compare_npp)
+//            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 4/4 multichannel Unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[4]       = {257, 257, 257, 257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 3/4 multichannel Unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[3]       = {257, 257, 257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        // The NPP path doesn't compile as of 2020-06:
+        // No Dispatch<int, int, NPP> specialization defined.
+//        if (compare_npp)
+//            TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: short [0,1024] 256 bins
+        typedef unsigned short      SampleT;
+        typedef unsigned short      LevelT;
+
+        LevelT  max_level           = 1024;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: float [0,1.0] 256 bins
+        typedef float               SampleT;
+        typedef float               LevelT;
+
+        LevelT  max_level           = 1.0;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
+        typedef float               SampleT;
+        typedef float               LevelT;
+
+         LevelT  max_level           = 1.0;
+         int     num_levels[3]       = {257, 257, 257};
+         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+         TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramRange: signed char 256 bins
+        typedef signed char         SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[3]       = {257, 129, 65};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+    {
+        // HistogramEven: double [0,1.0] 64 bins
+        typedef double              SampleT;
+        typedef double              LevelT;
+
+        LevelT  max_level           = 1.0;
+        int     num_levels[1]       = {65};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: short [0,1024] 512 bins
+        typedef unsigned short      SampleT;
+        typedef unsigned short      LevelT;
+
+        LevelT  max_level           = 1024;
+        int     num_levels[1]       = {513};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        TestChannels <unsigned char,    int, int,   int>(256,   256 + 1, Int2Type<true>());
+        TestChannels <signed char,      int, int,   int>(256,   256 + 1, Int2Type<true>());
+        TestChannels <unsigned short,   int, int,   int>(128,   128 + 1, Int2Type<true>());
+        TestChannels <unsigned short,   int, int,   int>(8192,  8192 + 1, Int2Type<true>());
+        TestChannels <float,            int, float, int>(1.0,   256 + 1, Int2Type<true>());
+
+		// Test down-conversion of size_t offsets to int
+        TestChannels <unsigned char,    int, int,   long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
+    }
+
+#endif
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/test/test_device_radix_sort.cu b/thrust/dependencies/cub/test/test_device_radix_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d73c78bd27cde48992e33a291dc9e7abc3a57e05
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_radix_sort.cu
@@ -0,0 +1,1316 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceRadixSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+#include <typeinfo>
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    #include <cuda_fp16.h>
+#endif
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_segmented_radix_sort.cuh>
+
+#include "test_util.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/reverse.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,                        // CUB method (allows overwriting of input)
+    CUB_NO_OVERWRITE,           // CUB method (disallows overwriting of input)
+
+    CUB_SEGMENTED,              // CUB method (allows overwriting of input)
+    CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
+
+    THRUST,                     // Thrust method
+    CDP,                        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceRadixSort entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to CUB sorting entrypoint (specialized for ascending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>         /*is_descending*/,
+    Int2Type<CUB>           /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+}
+
+/**
+ * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>             /*is_descending*/,
+    Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
+    int                         */*d_selector*/,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    KeyT      const *const_keys_itr     = d_keys.Current();
+    ValueT    const *const_values_itr   = d_values.Current();
+
+    cudaError_t retval = DeviceRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+
+    d_keys.selector ^= 1;
+    d_values.selector ^= 1;
+    return retval;
+}
+
+/**
+ * Dispatch to CUB sorting entrypoint (specialized for descending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>          /*is_descending*/,
+    Int2Type<CUB>           /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+}
+
+
+/**
+ * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>              /*is_descending*/,
+    Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
+    int                         */*d_selector*/,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    KeyT      const *const_keys_itr     = d_keys.Current();
+    ValueT    const *const_values_itr   = d_values.Current();
+
+    cudaError_t retval = DeviceRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+
+    d_keys.selector ^= 1;
+    d_values.selector ^= 1;
+    return retval;
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceRadixSort entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>         /*is_descending*/,
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceSegmentedRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+}
+
+/**
+ * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>                         /*is_descending*/,
+    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
+    int                                     */*d_selector*/,
+    size_t                                  */*d_temp_storage_bytes*/,
+    cudaError_t                             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    KeyT      const *const_keys_itr     = d_keys.Current();
+    ValueT    const *const_values_itr   = d_values.Current();
+
+    cudaError_t retval = DeviceSegmentedRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+
+    d_keys.selector ^= 1;
+    d_values.selector ^= 1;
+    return retval;
+}
+
+
+/**
+ * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>          /*is_descending*/,
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceSegmentedRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+}
+
+/**
+ * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>                          /*is_descending*/,
+    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
+    int                                     */*d_selector*/,
+    size_t                                  */*d_temp_storage_bytes*/,
+    cudaError_t                             */*d_cdp_error*/,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    KeyT      const *const_keys_itr     = d_keys.Current();
+    ValueT    const *const_values_itr   = d_values.Current();
+
+    cudaError_t retval = DeviceSegmentedRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+
+    d_keys.selector ^= 1;
+    d_values.selector ^= 1;
+    return retval;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch keys-only to Thrust sorting entrypoint
+ */
+template <int IS_DESCENDING, typename KeyT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> /*is_descending*/,
+    Int2Type<THRUST>        /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<NullType>  &/*d_values*/,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     /*begin_bit*/,
+    int                     /*end_bit*/,
+    cudaStream_t            /*stream*/,
+    bool                    /*debug_synchronous*/)
+{
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<KeyT> d_keys_wrapper(d_keys.Current());
+
+        if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+        thrust::sort(d_keys_wrapper, d_keys_wrapper + num_items);
+        if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch key-value pairs to Thrust sorting entrypoint
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> /*is_descending*/,
+    Int2Type<THRUST>        /*dispatch_to*/,
+    int                     */*d_selector*/,
+    size_t                  */*d_temp_storage_bytes*/,
+    cudaError_t             */*d_cdp_error*/,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     /*num_segments*/,
+    const int               */*d_segment_offsets*/,
+    int                     /*begin_bit*/,
+    int                     /*end_bit*/,
+    cudaStream_t            /*stream*/,
+    bool                    /*debug_synchronous*/)
+{
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<KeyT>     d_keys_wrapper(d_keys.Current());
+        thrust::device_ptr<ValueT>   d_values_wrapper(d_values.Current());
+
+        if (IS_DESCENDING) {
+            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
+        }
+
+        thrust::sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper);
+
+        if (IS_DESCENDING) {
+            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceRadixSort
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+__global__ void CnpDispatchKernel(
+    Int2Type<IS_DESCENDING> is_descending,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  temp_storage_bytes,
+    DoubleBuffer<KeyT>      d_keys,
+    DoubleBuffer<ValueT>    d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    bool                    debug_synchronous)
+{
+#ifndef CUB_CDP
+  (void)is_descending;
+  (void)d_selector;
+  (void)d_temp_storage_bytes;
+  (void)d_cdp_error;
+  (void)d_temp_storage;
+  (void)temp_storage_bytes;
+  (void)d_keys;
+  (void)d_values;
+  (void)num_items;
+  (void)num_segments;
+  (void)d_segment_offsets;
+  (void)begin_bit;
+  (void)end_bit;
+  (void)debug_synchronous;
+    *d_cdp_error            = cudaErrorNotSupported;
+#else
+    *d_cdp_error            = Dispatch(
+                                is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+                                d_temp_storage, temp_storage_bytes, d_keys, d_values,
+                                num_items, num_segments, d_segment_offsets,
+                                begin_bit, end_bit, 0, debug_synchronous);
+    *d_temp_storage_bytes   = temp_storage_bytes;
+    *d_selector             = d_keys.selector;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> is_descending,
+    Int2Type<CDP>           dispatch_to,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(
+        is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, debug_synchronous);
+
+    // Copy out selector
+    CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
+    d_values.selector = d_keys.selector;
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Simple key-value pairing
+ */
+template <
+    typename KeyT,
+    typename ValueT,
+    bool IS_FLOAT = (Traits<KeyT>::CATEGORY == FLOATING_POINT)>
+struct Pair
+{
+    KeyT     key;
+    ValueT   value;
+
+    bool operator<(const Pair &b) const
+    {
+        return (key < b.key);
+    }
+};
+
+
+/**
+ * Simple key-value pairing (specialized for bool types)
+ */
+template <typename ValueT>
+struct Pair<bool, ValueT, false>
+{
+    bool     key;
+    ValueT   value;
+
+    bool operator<(const Pair &b) const
+    {
+        return (!key && b.key);
+    }
+};
+
+
+/**
+ * Simple key-value pairing (specialized for floating point types)
+ */
+template <typename KeyT, typename ValueT>
+struct Pair<KeyT, ValueT, true>
+{
+    KeyT     key;
+    ValueT   value;
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key)
+            return true;
+
+        if (key > b.key)
+            return false;
+
+        // KeyT in unsigned bits
+        typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+        // Return true if key is negative zero and b.key is positive zero
+        UnsignedBits key_bits   = SafeBitCast<UnsignedBits>(key);
+        UnsignedBits b_key_bits = SafeBitCast<UnsignedBits>(b.key);
+        UnsignedBits HIGH_BIT   = Traits<KeyT>::HIGH_BIT;
+
+        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
+    }
+};
+
+
+/**
+ * Initialize key data
+ */
+template <typename KeyT>
+void InitializeKeyBits(
+    GenMode         gen_mode,
+    KeyT            *h_keys,
+    int             num_items,
+    int             /*entropy_reduction*/)
+{
+    for (int i = 0; i < num_items; ++i)
+        InitValue(gen_mode, h_keys[i], i);
+}
+
+
+/**
+ * Initialize solution
+ */
+template <bool IS_DESCENDING, typename KeyT>
+void InitializeSolution(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit,
+    int     *&h_reference_ranks,
+    KeyT    *&h_reference_keys)
+{
+    typedef Pair<KeyT, int> PairT;
+
+    PairT *h_pairs = new PairT[num_items];
+
+    int num_bits = end_bit - begin_bit;
+    for (int i = 0; i < num_items; ++i)
+    {
+
+        // Mask off unwanted portions
+        if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
+        {
+            unsigned long long base = 0;
+            memcpy(&base, &h_keys[i], sizeof(KeyT));
+            base &= ((1ull << num_bits) - 1) << begin_bit;
+            memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
+        }
+        else
+        {
+            h_pairs[i].key = h_keys[i];
+        }
+
+        h_pairs[i].value = i;
+    }
+
+    printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
+
+    for (int i = 0; i < num_segments; ++i)
+    {
+        if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+        std::stable_sort(               h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+        if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+    }
+
+    printf(" Done.\n"); fflush(stdout);
+
+    h_reference_ranks  = new int[num_items];
+    h_reference_keys   = new KeyT[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_reference_ranks[i]    = h_pairs[i].value;
+        h_reference_keys[i]     = h_keys[h_pairs[i].value];
+    }
+
+    if (h_pairs) delete[] h_pairs;
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Test DeviceRadixSort
+ */
+template <
+    Backend     BACKEND,
+    bool        IS_DESCENDING,
+    typename    KeyT,
+    typename    ValueT>
+void Test(
+    KeyT        *h_keys,
+    ValueT      *h_values,
+    int         num_items,
+    int         num_segments,
+    int         *h_segment_offsets,
+    int         begin_bit,
+    int         end_bit,
+    KeyT        *h_reference_keys,
+    ValueT      *h_reference_values)
+{
+    // Key alias type
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    typedef typename If<Equals<KeyT, half_t>::VALUE, __half, KeyT>::Type KeyAliasT;
+#else
+    typedef KeyT KeyAliasT;
+#endif
+
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n",
+        (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (KEYS_ONLY) ? "keys-only" : "key-value",
+        num_items, num_segments,
+        (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(),
+        IS_DESCENDING, begin_bit, end_bit);
+    fflush(stdout);
+
+    if (g_verbose)
+    {
+        printf("Input keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+    }
+
+    // Allocate device arrays
+    DoubleBuffer<KeyAliasT> d_keys;
+    DoubleBuffer<ValueT>    d_values;
+    int                     *d_selector;
+    int                     *d_segment_offsets;
+    size_t                  *d_temp_storage_bytes;
+    cudaError_t             *d_cdp_error;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
+    if (!KEYS_ONLY)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
+    }
+
+    // Allocate temporary storage (and make it un-aligned)
+    size_t  temp_storage_bytes  = 0;
+    void    *d_temp_storage     = NULL;
+    CubDebugExit(Dispatch(
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, 0, true));
+
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
+    void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
+
+    // Initialize/clear device arrays
+    d_keys.selector = 0;
+    CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
+    if (!KEYS_ONLY)
+    {
+        d_values.selector = 0;
+        CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
+    }
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, 0, true));
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    printf("Warmup done.  Checking results:\n"); fflush(stdout);
+    int compare = CompareDeviceResults(h_reference_keys, reinterpret_cast<KeyT*>(d_keys.Current()), num_items, true, g_verbose);
+    printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
+    if (!KEYS_ONLY)
+    {
+        int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
+        compare |= values_compare;
+        printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+    if (BACKEND == CUB_NO_OVERWRITE)
+    {
+        // Check that input isn't overwritten
+        int input_compare = CompareDeviceResults(h_keys, reinterpret_cast<KeyT*>(d_keys.d_buffers[0]), num_items, true, g_verbose);
+        compare |= input_compare;
+        printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Performance
+    if (g_timing_iterations)
+        printf("\nPerforming timing iterations:\n"); fflush(stdout);
+
+    GpuTimer gpu_timer;
+    float elapsed_millis = 0.0f;
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Initialize/clear device arrays
+        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
+        if (!KEYS_ONLY)
+        {
+            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
+        }
+
+        gpu_timer.Start();
+        CubDebugExit(Dispatch(
+            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+            num_items, num_segments, d_segment_offsets,
+            begin_bit, end_bit, 0, false));
+        gpu_timer.Stop();
+        elapsed_millis += gpu_timer.ElapsedMillis();
+    }
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = (KEYS_ONLY) ?
+            giga_rate * sizeof(KeyT) * 2 :
+            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
+        printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
+    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
+    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
+    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Test backend
+ */
+template <bool IS_DESCENDING, typename KeyT, typename ValueT>
+void TestBackend(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit,
+    KeyT    *h_reference_keys,
+    int     *h_reference_ranks)
+{
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    ValueT *h_values             = NULL;
+    ValueT *h_reference_values   = NULL;
+
+    if (!KEYS_ONLY)
+    {
+        h_values            = new ValueT[num_items];
+        h_reference_values  = new ValueT[num_items];
+
+        for (int i = 0; i < num_items; ++i)
+        {
+            InitValue(INTEGER_SEED, h_values[i], i);
+            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
+        }
+    }
+
+#ifdef SEGMENTED_SORT
+    // Test multi-segment implementations
+    Test<CUB_SEGMENTED, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+    Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+#else   // SEGMENTED_SORT
+    if (num_segments == 1)
+    {
+        // Test single-segment implementations
+        Test<CUB, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+        Test<CUB_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+    #ifdef CUB_CDP
+        Test<CDP, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+    #endif
+    }
+#endif  // SEGMENTED_SORT
+
+    if (h_values) delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+}
+
+
+
+
+/**
+ * Test value type
+ */
+template <bool IS_DESCENDING, typename KeyT>
+void TestValueTypes(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit)
+{
+    // Initialize the solution
+
+    int *h_reference_ranks = NULL;
+    KeyT *h_reference_keys = NULL;
+    InitializeSolution<IS_DESCENDING>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+
+    // Test keys-only
+    TestBackend<IS_DESCENDING, KeyT, NullType>          (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Test with 8b value
+    TestBackend<IS_DESCENDING, KeyT, unsigned char>     (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Test with 32b value
+    TestBackend<IS_DESCENDING, KeyT, unsigned int>      (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Test with 64b value
+    TestBackend<IS_DESCENDING, KeyT, unsigned long long>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Test with non-trivially-constructable value
+    TestBackend<IS_DESCENDING, KeyT, TestBar>           (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Cleanup
+    if (h_reference_ranks) delete[] h_reference_ranks;
+    if (h_reference_keys) delete[] h_reference_keys;
+}
+
+
+
+/**
+ * Test ascending/descending
+ */
+template <typename KeyT>
+void TestDirection(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit)
+{
+    TestValueTypes<true>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
+    TestValueTypes<false>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
+}
+
+
+/**
+ * Test different bit ranges
+ */
+template <typename KeyT>
+void TestBits(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets)
+{
+    // Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way)
+    if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER) && (!Equals<KeyT, bool>::VALUE))
+    {
+        // Partial bits
+        int begin_bit = 1;
+        int end_bit = (sizeof(KeyT) * 8) - 1;
+        printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
+        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
+
+        // Across subword boundaries
+        int mid_bit = sizeof(KeyT) * 4;
+        printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
+        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, mid_bit - 1, mid_bit + 1);
+    }
+
+    printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
+    TestDirection(h_keys, num_items, num_segments, h_segment_offsets, 0, sizeof(KeyT) * 8);
+}
+
+
+/**
+ * Test different segment compositions
+ */
+template <typename KeyT>
+void TestSegments(
+    KeyT    *h_keys,
+    int     num_items,
+    int     max_segments)
+{
+    int *h_segment_offsets = new int[max_segments + 1];
+
+#ifdef SEGMENTED_SORT
+    for (int num_segments = max_segments; num_segments > 1; num_segments = (num_segments + 32 - 1) / 32)
+    {
+        if (num_items / num_segments < 128 * 1000) {
+            // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+            InitializeSegments(num_items, num_segments, h_segment_offsets);
+            TestBits(h_keys, num_items, num_segments, h_segment_offsets);
+        }
+    }
+#else
+    // Test single segment
+    if (num_items < 128 * 1000) {
+        // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+        InitializeSegments(num_items, 1, h_segment_offsets);
+        TestBits(h_keys, num_items, 1, h_segment_offsets);
+    }
+#endif
+    if (h_segment_offsets) delete[] h_segment_offsets;
+}
+
+
+/**
+ * Test different (sub)lengths and number of segments
+ */
+template <typename KeyT>
+void TestSizes(
+    KeyT    *h_keys,
+    int     max_items,
+    int     max_segments)
+{
+    for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32)
+    {
+        TestSegments(h_keys, num_items, max_segments);
+    }
+    TestSegments(h_keys, 1, max_segments);
+    TestSegments(h_keys, 0, max_segments);
+}
+
+
+/**
+ * Test key sampling distributions
+ */
+template <typename KeyT>
+void TestGen(
+    int             max_items,
+    int             max_segments)
+{
+    if (max_items < 0)
+        max_items = 9000003;
+
+    if (max_segments < 0)
+        max_segments = 5003;
+
+    KeyT *h_keys = new KeyT[max_items];
+
+    for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 3)
+    {
+        printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
+        InitializeKeyBits(RANDOM, h_keys, max_items, entropy_reduction);
+        TestSizes(h_keys, max_items, max_segments);
+    }
+
+    printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(UNIFORM, h_keys, max_items, 0);
+    TestSizes(h_keys, max_items, max_segments);
+
+    printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(INTEGER_SEED, h_keys, max_items, 0);
+    TestSizes(h_keys, max_items, max_segments);
+
+    if (h_keys) delete[] h_keys;
+}
+
+
+//---------------------------------------------------------------------
+// Simple test
+//---------------------------------------------------------------------
+
+template <
+    Backend     BACKEND,
+    typename    KeyT,
+    typename    ValueT,
+    bool        IS_DESCENDING>
+void Test(
+    int         num_items,
+    int         num_segments,
+    GenMode     gen_mode,
+    int         entropy_reduction,
+    int         begin_bit,
+    int         end_bit)
+{
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    KeyT    *h_keys             = new KeyT[num_items];
+    int     *h_reference_ranks  = NULL;
+    KeyT    *h_reference_keys   = NULL;
+    ValueT  *h_values           = NULL;
+    ValueT  *h_reference_values = NULL;
+    int     *h_segment_offsets  = new int[num_segments + 1];
+
+    if (end_bit < 0)
+        end_bit = sizeof(KeyT) * 8;
+
+    InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
+    InitializeSegments(num_items, num_segments, h_segment_offsets);
+    InitializeSolution<IS_DESCENDING>(
+        h_keys, num_items, num_segments, h_segment_offsets,
+        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+
+    if (!KEYS_ONLY)
+    {
+        h_values            = new ValueT[num_items];
+        h_reference_values  = new ValueT[num_items];
+
+        for (int i = 0; i < num_items; ++i)
+        {
+            InitValue(INTEGER_SEED, h_values[i], i);
+            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
+        }
+    }
+    if (h_reference_ranks) delete[] h_reference_ranks;
+
+    printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
+    Test<BACKEND, IS_DESCENDING>(
+        h_keys, h_values,
+        num_items, num_segments, h_segment_offsets,
+        begin_bit, end_bit, h_reference_keys, h_reference_values);
+
+    if (h_keys)             delete[] h_keys;
+    if (h_reference_keys)   delete[] h_reference_keys;
+    if (h_values)           delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int bits = -1;
+    int num_items = -1;
+    int num_segments = -1;
+    int entropy_reduction = 0;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("s", num_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("bits", bits);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--bits=<valid key bits>]"
+            "[--n=<input items> "
+            "[--s=<num segments> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--entropy=<entropy-reduction factor (default 0)>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    enum {
+        IS_DESCENDING   = false
+    };
+
+    // Compile/run basic CUB test
+    if (num_items < 0)      num_items       = 48000000;
+    if (num_segments < 0)   num_segments    = 5000;
+
+    Test<CUB_SEGMENTED, unsigned int,       NullType, IS_DESCENDING>(num_items, num_segments, RANDOM, entropy_reduction, 0, bits);
+
+    printf("\n-------------------------------\n");
+
+    Test<CUB,           unsigned char,      NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned int,       NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned long long, NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    printf("\n-------------------------------\n");
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    Test<CUB,           half_t,             NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+#endif
+    Test<CUB,           float,              NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           double,             NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    printf("\n-------------------------------\n");
+
+    Test<CUB,           unsigned char,      unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned int,       unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned long long, unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0)      num_items       = 48000000;
+    if (num_segments < 0)   num_segments    = 5000;
+
+    // Compare CUB and thrust on 32b keys-only
+    Test<CUB, unsigned int, NullType, false> (                      num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned int, NullType, false> (                   num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    // Compare CUB and thrust on 64b keys-only
+    Test<CUB, unsigned long long, NullType, false> (                num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned long long, NullType, false> (             num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+
+    // Compare CUB and thrust on 32b key-value pairs
+    Test<CUB, unsigned int, unsigned int, false> (                  num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned int, unsigned int, false> (               num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    // Compare CUB and thrust on 64b key + 32b value pairs
+    Test<CUB, unsigned long long, unsigned int, false> (      num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned long long, unsigned int, false> (   num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        TestGen<bool>                 (num_items, num_segments);
+
+        TestGen<char>                 (num_items, num_segments);
+        TestGen<signed char>          (num_items, num_segments);
+        TestGen<unsigned char>        (num_items, num_segments);
+
+        TestGen<short>                (num_items, num_segments);
+        TestGen<unsigned short>       (num_items, num_segments);
+
+        TestGen<int>                  (num_items, num_segments);
+        TestGen<unsigned int>         (num_items, num_segments);
+
+        TestGen<long>                 (num_items, num_segments);
+        TestGen<unsigned long>        (num_items, num_segments);
+
+        TestGen<long long>            (num_items, num_segments);
+        TestGen<unsigned long long>   (num_items, num_segments);
+
+#if (__CUDACC_VER_MAJOR__ >= 9)
+        TestGen<half_t>                (num_items, num_segments);
+#endif
+        TestGen<float>                (num_items, num_segments);
+
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            TestGen<double>           (num_items, num_segments);
+
+    }
+
+#endif
+
+    return 0;
+}
+
diff --git a/thrust/dependencies/cub/test/test_device_reduce.cu b/thrust/dependencies/cub/test/test_device_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..262dab302bad6e7bd136be026e2eb555bfafc790
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_reduce.cu
@@ -0,0 +1,1360 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_segmented_reduce.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+int                     g_ptx_version;
+int                     g_sm_count;
+double                  g_device_giga_bandwidth;
+bool                    g_verbose           = false;
+bool                    g_verbose_input     = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+// Dispatch types
+enum Backend
+{
+    CUB,            // CUB method
+    CUB_SEGMENTED,  // CUB segmented method
+    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+    THRUST,         // Thrust method
+};
+
+
+// Custom max functor
+struct CustomMax
+{
+    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
+    template <typename OutputT>
+    __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to reduce entrypoint (custom-max)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Max-identity
+    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items, reduction_op, identity,
+            stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+/**
+ * Dispatch to sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    cub::Sum            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+/**
+ * Dispatch to min entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    cub::Min            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+/**
+ * Dispatch to max entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    cub::Max            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+/**
+ * Dispatch to argmin entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    cub::ArgMin         /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+/**
+ * Dispatch to argmax entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    cub::ArgMax         /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSegmentedReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to reduce entrypoint (custom-max)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Max-identity
+    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+/**
+ * Dispatch to sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Sum            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+/**
+ * Dispatch to min entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Min            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+/**
+ * Dispatch to max entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Max            /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+/**
+ * Dispatch to argmin entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMin         /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+/**
+ * Dispatch to argmax entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 /*num_items*/,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMax         /*reduction_op*/,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to device reduction directly
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
+            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
+            stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to reduction entrypoint (min or max specialization)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    ReductionOpT         reduction_op,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        OutputT init;
+        CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));
+
+        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
+        OutputT retval;
+        for (int i = 0; i < timing_iterations; ++i)
+        {
+            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
+        }
+
+        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
+            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+/**
+ * Dispatch to reduction entrypoint (sum specialization)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    int                 timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 /*max_segments*/,
+    OffsetIteratorT     /*d_segment_offsets*/,
+    Sum                 /*reduction_op*/,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
+        OutputT retval;
+        for (int i = 0; i < timing_iterations; ++i)
+        {
+            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
+        }
+
+        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
+            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA nested-parallelism test kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceReduce
+ */
+template <
+    typename            InputIteratorT,
+    typename            OutputIteratorT,
+    typename            OffsetIteratorT,
+    typename            ReductionOpT>
+__global__ void CnpDispatchKernel(
+    int                 timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    bool                debug_synchronous)
+{
+#ifndef CUB_CDP
+    (void)timing_iterations;
+    (void)d_temp_storage_bytes;
+    (void)d_cdp_error;
+    (void)d_temp_storage;
+    (void)temp_storage_bytes;
+    (void)d_in;
+    (void)d_out;
+    (void)num_items;
+    (void)max_segments;
+    (void)d_segment_offsets;
+    (void)reduction_op;
+    (void)debug_synchronous;
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CUB_CDP kernel
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_CDP>       dispatch_to,
+    int                 timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+/// Initialize problem
+template <typename InputT>
+void Initialize(
+    GenMode         gen_mode,
+    InputT          *h_in,
+    int             num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+    }
+
+    if (g_verbose_input)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/// Solve problem (max/custom-max functor)
+template <typename ReductionOpT, typename InputT, typename _OutputT>
+struct Solution
+{
+    typedef _OutputT OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        ReductionOpT reduction_op)
+    {
+        for (int i = 0; i < num_segments; ++i)
+        {
+            OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
+                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
+            h_reference[i] = aggregate;
+        }
+    }
+};
+
+/// Solve problem (min functor)
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Min, InputT, _OutputT>
+{
+    typedef _OutputT OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::Min reduction_op)
+    {
+        for (int i = 0; i < num_segments; ++i)
+        {
+            OutputT aggregate = Traits<InputT>::Max();    // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
+                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
+            h_reference[i] = aggregate;
+        }
+    }
+};
+
+
+/// Solve problem (sum functor)
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Sum, InputT, _OutputT>
+{
+    typedef _OutputT OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::Sum reduction_op)
+    {
+        for (int i = 0; i < num_segments; ++i)
+        {
+            OutputT aggregate;
+            InitValue(INTEGER_SEED, aggregate, 0);
+            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
+                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
+            h_reference[i] = aggregate;
+        }
+    }
+};
+
+/// Solve problem (argmin functor)
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMin, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::ArgMin reduction_op)
+    {
+        for (int i = 0; i < num_segments; ++i)
+        {
+            OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
+            {
+                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
+                aggregate = reduction_op(aggregate, item);
+            }
+            h_reference[i] = aggregate;
+        }
+    }
+};
+
+
+/// Solve problem (argmax functor)
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMax, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::ArgMax reduction_op)
+    {
+        for (int i = 0; i < num_segments; ++i)
+        {
+            OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
+            {
+                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
+                aggregate = reduction_op(aggregate, item);
+            }
+            h_reference[i] = aggregate;
+        }
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+/// Test DeviceReduce for a given problem input
+template <
+    typename                BackendT,
+    typename                DeviceInputIteratorT,
+    typename                DeviceOutputIteratorT,
+    typename                HostReferenceIteratorT,
+    typename                OffsetT,
+    typename                OffsetIteratorT,
+    typename                ReductionOpT>
+void Test(
+    BackendT                backend,
+    DeviceInputIteratorT    d_in,
+    DeviceOutputIteratorT   d_out,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    OffsetIteratorT         d_segment_offsets,
+    ReductionOpT            reduction_op,
+    HostReferenceIteratorT  h_reference)
+{
+    // Input data types
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;
+
+    // Allocate CUB_CDP device arrays for temp storage size and error
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Inquire temp device storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_offsets,
+        reduction_op, 0, true));
+
+    // Allocate temp device storage
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_offsets,
+        reduction_op, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    if (g_timing_iterations > 0)
+    {
+        GpuTimer gpu_timer;
+        gpu_timer.Start();
+
+        CubDebugExit(Dispatch(backend, g_timing_iterations,
+            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items, num_segments, d_segment_offsets,
+            reduction_op, 0, false));
+
+        gpu_timer.Stop();
+        float elapsed_millis = gpu_timer.ElapsedMillis();
+
+        // Display performance
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(InputT);
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
+            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+
+    }
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+
+/// Test DeviceReduce
+template <
+    Backend                 BACKEND,
+    typename                OutputValueT,
+    typename                HostInputIteratorT,
+    typename                DeviceInputIteratorT,
+    typename                OffsetT,
+    typename                OffsetIteratorT,
+    typename                ReductionOpT>
+void SolveAndTest(
+    HostInputIteratorT      h_in,
+    DeviceInputIteratorT    d_in,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    OffsetIteratorT         h_segment_offsets,
+    OffsetIteratorT         d_segment_offsets,
+    ReductionOpT            reduction_op)
+{
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputValueT;
+    typedef Solution<ReductionOpT, InputValueT, OutputValueT>                   SolutionT;
+    typedef typename SolutionT::OutputT                                         OutputT;
+
+    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
+        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
+        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
+    fflush(stdout);
+
+    // Allocate and solve solution
+    OutputT *h_reference = new OutputT[num_segments];
+    SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);
+
+//    // Run with discard iterator
+//    DiscardOutputIterator<OffsetT> discard_itr;
+//    Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);
+
+    // Run with output data (cleared for sanity-check)
+    OutputT *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
+    Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);
+
+    // Cleanup
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (h_reference) delete[] h_reference;
+}
+
+
+/// Test specific problem type
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        OffsetT,
+    typename        ReductionOpT>
+void TestProblem(
+    OffsetT         num_items,
+    OffsetT         num_segments,
+    GenMode         gen_mode,
+    ReductionOpT    reduction_op)
+{
+    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+    fflush(stdout);
+
+    // Initialize value data
+    InputT* h_in = new InputT[num_items];
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize segment data
+    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
+    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+
+    // Initialize device data
+    OffsetT *d_segment_offsets      = NULL;
+    InputT  *d_in                   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
+    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);
+
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    if (h_in)               delete[] h_in;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/// Test different operators
+template <
+    Backend             BACKEND,
+    typename            OutputT,
+    typename            HostInputIteratorT,
+    typename            DeviceInputIteratorT,
+    typename            OffsetT,
+    typename            OffsetIteratorT>
+void TestByOp(
+    HostInputIteratorT      h_in,
+    DeviceInputIteratorT    d_in,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    OffsetIteratorT         h_segment_offsets,
+    OffsetIteratorT         d_segment_offsets)
+{
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());
+}
+
+
+/// Test different backends
+template <
+    typename    InputT,
+    typename    OutputT,
+    typename    OffsetT>
+void TestByBackend(
+    OffsetT     num_items,
+    OffsetT     max_segments,
+    GenMode     gen_mode)
+{
+    // Initialize host data
+    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
+        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+
+    InputT  *h_in               = new InputT[num_items];
+    OffsetT *h_segment_offsets  = new OffsetT[max_segments + 1];
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize device data
+    InputT  *d_in               = NULL;
+    OffsetT *d_segment_offsets  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    //
+    // Test single-segment implementations
+    //
+
+    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
+
+    // Page-aligned-input tests
+    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);                 // Host-dispatch
+#ifdef CUB_CDP
+    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);             // Device-dispatch
+#endif
+
+    // Non-page-aligned-input tests
+    if (num_items > 1)
+    {
+        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
+        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
+    }
+
+    //
+    // Test segmented implementation
+    //
+
+    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+    int max_items_per_segment = 128000;
+
+    for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
+        num_segments < max_segments;
+        num_segments = (num_segments * 32) + 1)
+    {
+        // Test with segment pointer
+        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+        TestByOp<CUB_SEGMENTED, OutputT>(
+            h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);
+
+        // Test with segment iterator
+        typedef CastOp<OffsetT> IdentityOpT;
+        IdentityOpT identity_op;
+        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
+            h_segment_offsets,
+            identity_op);
+       TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
+            d_segment_offsets,
+            identity_op);
+
+        TestByOp<CUB_SEGMENTED, OutputT>(
+            h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
+    }
+
+    if (h_in)               delete[] h_in;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+}
+
+
+/// Test different input-generation modes
+template <
+    typename InputT,
+    typename OutputT,
+    typename OffsetT>
+void TestByGenMode(
+    OffsetT num_items,
+    OffsetT max_segments)
+{
+    //
+    // Test pointer support using different input-generation modes
+    //
+
+    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
+    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
+    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);
+
+    //
+    // Test iterator support using a constant-iterator and SUM
+    //
+
+    InputT val;
+    InitValue(UNIFORM, val, 0);
+    ConstantInputIterator<InputT, OffsetT> h_in(val);
+
+    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
+    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
+
+    SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
+#ifdef CUB_CDP
+    SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
+#endif
+
+    if (h_segment_offsets) delete[] h_segment_offsets;
+}
+
+
+/// Test different problem sizes
+template <
+    typename InputT,
+    typename OutputT,
+    typename OffsetT>
+struct TestBySize
+{
+    OffsetT max_items;
+    OffsetT max_segments;
+
+    TestBySize(OffsetT max_items, OffsetT max_segments) :
+        max_items(max_items),
+        max_segments(max_segments)
+    {}
+
+    template <typename ActivePolicyT>
+    cudaError_t Invoke()
+    {
+        //
+        // Black-box testing on all backends
+        //
+
+        // Test 0, 1, many
+        TestByGenMode<InputT, OutputT>(0,           max_segments);
+        TestByGenMode<InputT, OutputT>(1,           max_segments);
+        TestByGenMode<InputT, OutputT>(max_items,   max_segments);
+
+        // Test random problem sizes from a log-distribution [8, max_items-ish)
+        int     num_iterations = 8;
+        double  max_exp = log(double(max_items)) / log(double(2.0));
+        for (int i = 0; i < num_iterations; ++i)
+        {
+            OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
+            TestByGenMode<InputT, OutputT>(num_items, max_segments);
+        }
+
+        //
+        // White-box testing of single-segment problems around specific sizes
+        //
+
+        // Tile-boundaries: multiple blocks, one tile per block
+        OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
+        TestProblem<CUB, InputT, OutputT>(tile_size * 4,  1,      RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1,   RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1,   RANDOM, Sum());
+
+        // Tile-boundaries: multiple blocks, multiple tiles per block
+        OffsetT sm_occupancy = 32;
+        OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
+        TestProblem<CUB, InputT, OutputT>(occupancy,  1,      RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1,   RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1,   RANDOM, Sum());
+
+        return cudaSuccess;
+    }
+};
+
+
+/// Test problem type
+template <
+    typename    InputT,
+    typename    OutputT,
+    typename    OffsetT>
+void TestType(
+    OffsetT     max_items,
+    OffsetT     max_segments)
+{
+    typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;
+
+    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
+
+    MaxPolicyT::Invoke(g_ptx_version, dispatch);
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    typedef int OffsetT;
+
+    OffsetT max_items       = 27000000;
+    OffsetT max_segments    = 34000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+    args.GetCmdLineArgument("n", max_items);
+    args.GetCmdLineArgument("s", max_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--s=<num segments> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+
+    // Get ptx version
+    CubDebugExit(PtxVersion(g_ptx_version));
+
+    // Get SM count
+    g_sm_count = args.deviceProp.multiProcessorCount;
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic test
+
+
+    TestProblem<CUB, char, int>(            max_items, 1, RANDOM_BIT, Sum());
+    TestProblem<CUB, short, int>(           max_items, 1, RANDOM_BIT, Sum());
+
+    printf("\n-------------------------------\n");
+
+    TestProblem<CUB, int, int>(             max_items, 1, RANDOM_BIT, Sum());
+    TestProblem<CUB, long long, long long>( max_items, 1, RANDOM_BIT, Sum());
+
+    printf("\n-------------------------------\n");
+
+    TestProblem<CUB, float, float>( max_items, 1, RANDOM_BIT, Sum());
+    TestProblem<CUB, double, double>( max_items, 1, RANDOM_BIT, Sum());
+
+    printf("\n-------------------------------\n");
+
+    TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM_BIT, Sum());
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick comparison tests
+
+    TestProblem<CUB, char, char>(         max_items * 4, 1, UNIFORM, Sum());
+    TestProblem<THRUST, char, char>(      max_items * 4, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, short, short>(        max_items * 2, 1, UNIFORM, Sum());
+    TestProblem<THRUST, short, short>(     max_items * 2, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, int, int>(          max_items,     1, UNIFORM, Sum());
+    TestProblem<THRUST, int, int>(       max_items,     1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, long long, long long>(    max_items / 2, 1, UNIFORM, Sum());
+    TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, TestFoo, TestFoo>(      max_items / 4, 1, UNIFORM, Max());
+    TestProblem<THRUST, TestFoo, TestFoo>(   max_items / 4, 1, UNIFORM, Max());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        TestType<char, char>(max_items, max_segments);
+
+        TestType<unsigned char, unsigned char>(max_items, max_segments);
+
+        TestType<char, int>(max_items, max_segments);
+
+        TestType<short, short>(max_items, max_segments);
+        TestType<int, int>(max_items, max_segments);
+        TestType<long, long>(max_items, max_segments);
+        TestType<long long, long long>(max_items, max_segments);
+
+        TestType<uchar2, uchar2>(max_items, max_segments);
+        TestType<uint2, uint2>(max_items, max_segments);
+        TestType<ulonglong2, ulonglong2>(max_items, max_segments);
+        TestType<ulonglong4, ulonglong4>(max_items, max_segments);
+
+        TestType<TestFoo, TestFoo>(max_items, max_segments);
+        TestType<TestBar, TestBar>(max_items, max_segments);
+    }
+
+#endif
+
+
+    printf("\n");
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_reduce_by_key.cu b/thrust/dependencies/cub/test/test_device_reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ff24960a32ff9b2b96c722c1b9cd93bac5201b48
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_reduce_by_key.cu
@@ -0,0 +1,853 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce::ReduceByKey utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/thread/thread_operators.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to reduce-by-key entrypoint
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                  /*equality_op*/,
+    ReductionOpT                 reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceReduce::ReduceByKey(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_num_runs,
+            reduction_op,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to reduce-by-key entrypoint
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 /*equality_op*/,
+    ReductionOpT                /*reduction_op*/,
+    OffsetT                     num_items,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The input keys type
+    typedef typename std::iterator_traits<KeyInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<KeyOutputIteratorT>::value_type, void>::VALUE),   // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeyInputIteratorT>::value_type,                                           // ... then the input iterator's value type,
+        typename std::iterator_traits<KeyOutputIteratorT>::value_type>::Type KeyOutputT;                        // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValueInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<ValueOutputIteratorT>::value_type, void>::VALUE), // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValueInputIteratorT>::value_type,                                         // ... then the input iterator's value type,
+        typename std::iterator_traits<ValueOutputIteratorT>::value_type>::Type ValueOuputT;                     // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
+        thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);
+
+        thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
+        thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);
+
+        thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_ends = thrust::reduce_by_key(
+                d_keys_in_wrapper,
+                d_keys_in_wrapper + num_items,
+                d_values_in_wrapper,
+                d_keys_out_wrapper,
+                d_values_out_wrapper);
+        }
+
+        OffsetT num_segments = OffsetT(d_out_ends.first - d_keys_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+__global__ void CnpDispatchKernel(
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void                        *d_temp_storage,
+    size_t                      temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
+
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences
+
+        int repeat;
+
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve problem.  Returns total number of segments identified
+ */
+template <
+    typename        KeyInputIteratorT,
+    typename        ValueInputIteratorT,
+    typename        KeyT,
+    typename        ValueT,
+    typename        EqualityOpT,
+    typename        ReductionOpT>
+int Solve(
+    KeyInputIteratorT       h_keys_in,
+    KeyT                    *h_keys_reference,
+    ValueInputIteratorT     h_values_in,
+    ValueT                  *h_values_reference,
+    EqualityOpT             equality_op,
+    ReductionOpT            reduction_op,
+    int                     num_items)
+{
+    // First item
+    KeyT previous        = h_keys_in[0];
+    ValueT aggregate     = h_values_in[0];
+    int num_segments    = 0;
+
+    // Subsequent items
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (!equality_op(previous, h_keys_in[i]))
+        {
+            h_keys_reference[num_segments] = previous;
+            h_values_reference[num_segments] = aggregate;
+            num_segments++;
+            aggregate = h_values_in[i];
+        }
+        else
+        {
+            aggregate = reduction_op(aggregate, h_values_in[i]);
+        }
+        previous = h_keys_in[i];
+    }
+
+    h_keys_reference[num_segments] = previous;
+    h_values_reference[num_segments] = aggregate;
+    num_segments++;
+
+    return num_segments;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceKeyInputIteratorT,
+    typename            DeviceValueInputIteratorT,
+    typename            KeyT,
+    typename            ValueT,
+    typename            EqualityOpT,
+    typename            ReductionOpT>
+void Test(
+    DeviceKeyInputIteratorT     d_keys_in,
+    DeviceValueInputIteratorT   d_values_in,
+    KeyT*                       h_keys_reference,
+    ValueT*                     h_values_reference,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    int                         num_segments,
+    int                         num_items)
+{
+    // Allocate device output arrays and number of segments
+    KeyT*   d_keys_out             = NULL;
+    ValueT* d_values_out           = NULL;
+    int*    d_num_runs         = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
+    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
+    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
+    printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
+    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
+
+    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
+    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis  = elapsed_millis / g_timing_iterations;
+        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        int     bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
+        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
+    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
+    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare1 | compare2 | compare3);
+}
+
+
+/**
+ * Test DeviceSelect on pointer type
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT* h_values_in        = new ValueT[num_items];
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+        InitValue(INTEGER_SEED, h_values_in[i], 1);
+
+    // Initialize problem and solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT     *d_keys_in = NULL;
+    ValueT   *d_values_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_values_in) delete[] h_values_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+    if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
+}
+
+
+/**
+ * Test on iterator type
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestIterator(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT one_val;
+    InitValue(INTEGER_SEED, one_val, 1);
+    ConstantInputIterator<ValueT, int> h_values_in(one_val);
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    // Initialize problem and solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT     *d_keys_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+}
+
+
+/**
+ * Test different gen modes
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op,
+    int             max_segment)
+{
+    // 0 key-bit entropy reduction rounds
+    TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
+
+    if (max_segment > 1)
+    {
+        // 2 key-bit entropy reduction rounds
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
+
+        // 7 key-bit entropy reduction rounds
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
+    }
+}
+
+
+/**
+ * Test different avg segment lengths modes
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);
+
+    // Evaluate different max-segment lengths
+    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11)
+    {
+        Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
+    }
+}
+
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestDispatch(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    Test<CUB, KeyT, ValueT>(num_items, reduction_op);
+#ifdef CUB_CDP
+    Test<CDP, KeyT, ValueT>(num_items, reduction_op);
+#endif
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestSize(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    if (num_items < 0)
+    {
+        TestDispatch<KeyT, ValueT>(1,        reduction_op);
+        TestDispatch<KeyT, ValueT>(100,      reduction_op);
+        TestDispatch<KeyT, ValueT>(10000,    reduction_op);
+        TestDispatch<KeyT, ValueT>(1000000,  reduction_op);
+    }
+    else
+    {
+        TestDispatch<KeyT, ValueT>(num_items, reduction_op);
+    }
+
+}
+
+
+template <
+    typename        KeyT,
+    typename        ValueT>
+void TestOp(
+    int             num_items)
+{
+    TestSize<KeyT, ValueT>(num_items, cub::Sum());
+    TestSize<KeyT, ValueT>(num_items, cub::Max());
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--entropy=<segment length bit entropy reduction rounds>]"
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    printf("\n");
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("---- RLE int ---- \n");
+    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- RLE long long ---- \n");
+    TestIterator<CUB, long long, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- int ---- \n");
+    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<THRUST, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- float ---- \n");
+    TestPointer<CUB, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<THRUST, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+    {
+        printf("---- double ---- \n");
+        TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+        TestPointer<THRUST, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+    }
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+
+        // Test different input types
+        TestOp<int, char>(num_items);
+        TestOp<int, short>(num_items);
+        TestOp<int, int>(num_items);
+        TestOp<int, long>(num_items);
+        TestOp<int, long long>(num_items);
+        TestOp<int, float>(num_items);
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            TestOp<int, double>(num_items);
+
+        TestOp<int, uchar2>(num_items);
+        TestOp<int, uint2>(num_items);
+        TestOp<int, uint3>(num_items);
+        TestOp<int, uint4>(num_items);
+        TestOp<int, ulonglong4>(num_items);
+        TestOp<int, TestFoo>(num_items);
+        TestOp<int, TestBar>(num_items);
+
+        TestOp<char, int>(num_items);
+        TestOp<long long, int>(num_items);
+        TestOp<TestFoo, int>(num_items);
+        TestOp<TestBar, int>(num_items);
+
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_run_length_encode.cu b/thrust/dependencies/cub/test/test_device_run_length_encode.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9d961fbb9d99f91a2307469a6df8ff3c9ac0d82b
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_run_length_encode.cu
@@ -0,0 +1,890 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce::RunLengthEncode utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/thread/thread_operators.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+// Operation types
+enum RleMethod
+{
+    RLE,                // Run length encode
+    NON_TRIVIAL,
+    CSR,
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to run-length encode entrypoint
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<RLE>               /*method*/,
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      /*d_offsets_out*/,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               /*equality_op*/,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceRunLengthEncode::Encode(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to non-trivial runs entrypoint
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<NON_TRIVIAL>       /*method*/,
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       /*d_unique_out*/,
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               /*equality_op*/,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to run-length encode entrypoint
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+cudaError_t Dispatch(
+    Int2Type<RLE>               /*method*/,
+    Int2Type<THRUST>            /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      /*d_offsets_out*/,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               /*equality_op*/,
+    OffsetT                     num_items,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT;                          // ... else the output iterator's value type
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
+        thrust::device_ptr<UniqueT>     d_unique_out_wrapper(d_unique_out);
+        thrust::device_ptr<LengthT>     d_lengths_out_wrapper(d_lengths_out);
+
+        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;
+
+        LengthT one_val;
+        InitValue(INTEGER_SEED, one_val, 1);
+        thrust::constant_iterator<LengthT> constant_one(one_val);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_ends = thrust::reduce_by_key(
+                d_in_wrapper,
+                d_in_wrapper + num_items,
+                constant_one,
+                d_unique_out_wrapper,
+                d_lengths_out_wrapper);
+        }
+
+        OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceRunLengthEncode
+ */
+template <
+    int                         RLE_METHOD,
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    EqualityOp,
+    typename                    OffsetT>
+__global__ void CnpDispatchKernel(
+    Int2Type<RLE_METHOD>            method,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               equality_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
+
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <
+    int                         RLE_METHOD,
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    EqualityOp,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<RLE_METHOD>        method,
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    EqualityOp                  equality_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences for the current run
+        int repeat;
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve problem.  Returns total number of segments identified
+ */
+template <
+    RleMethod       RLE_METHOD,
+    typename        InputIteratorT,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT,
+    typename        EqualityOp>
+int Solve(
+    InputIteratorT  h_in,
+    T               *h_unique_reference,
+    OffsetT         *h_offsets_reference,
+    LengthT         *h_lengths_reference,
+    EqualityOp      equality_op,
+    int             num_items)
+{
+    if (num_items == 0) 
+        return 0;
+
+    // First item
+    T       previous        = h_in[0];
+    LengthT  length          = 1;
+    int     num_runs        = 0;
+    int     run_begin       = 0;
+
+    // Subsequent items
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (!equality_op(previous, h_in[i]))
+        {
+            if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
+            {
+                h_unique_reference[num_runs]      = previous;
+                h_offsets_reference[num_runs]     = run_begin;
+                h_lengths_reference[num_runs]     = length;
+                num_runs++;
+            }
+            length = 1;
+            run_begin = i;
+        }
+        else
+        {
+            length++;
+        }
+        previous = h_in[i];
+    }
+
+    if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
+    {
+        h_unique_reference[num_runs]    = previous;
+        h_offsets_reference[num_runs]   = run_begin;
+        h_lengths_reference[num_runs]   = length;
+        num_runs++;
+    }
+
+    return num_runs;
+}
+
+
+
+/**
+ * Test DeviceRunLengthEncode for a given problem input
+ */
+template <
+    RleMethod           RLE_METHOD,
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            T,
+    typename            OffsetT,
+    typename            LengthT,
+    typename            EqualityOp>
+void Test(
+    DeviceInputIteratorT d_in,
+    T                   *h_unique_reference,
+    OffsetT             *h_offsets_reference,
+    LengthT             *h_lengths_reference,
+    EqualityOp          equality_op,
+    int                 num_runs,
+    int                 num_items)
+{
+    // Allocate device output arrays and number of segments
+    T*          d_unique_out       = NULL;
+    LengthT*    d_offsets_out      = NULL;
+    OffsetT*    d_lengths_out      = NULL;
+    int*        d_num_runs         = NULL;
+
+    if (RLE_METHOD == RLE)
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
+    if (RLE_METHOD == NON_TRIVIAL)
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t*          d_temp_storage_bytes = NULL;
+    cudaError_t*     d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void*           d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    if (RLE_METHOD == RLE)
+        CubDebugExit(cudaMemset(d_unique_out,   0, sizeof(T) * num_items));
+    if (RLE_METHOD == NON_TRIVIAL)
+        CubDebugExit(cudaMemset(d_offsets_out,  0, sizeof(OffsetT) * num_items));
+    CubDebugExit(cudaMemset(d_lengths_out,  0, sizeof(LengthT) * num_items));
+    CubDebugExit(cudaMemset(d_num_runs,     0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare0 = 0;
+    int compare1 = 0;
+    int compare2 = 0;
+    int compare3 = 0;
+
+    if (RLE_METHOD == RLE)
+    {
+        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
+        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
+    }
+
+    if (RLE_METHOD != RLE)
+    {
+        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
+        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
+    }
+
+    if (RLE_METHOD != CSR)
+    {
+        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
+        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
+    }
+
+    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
+    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
+        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
+    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
+    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
+    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
+}
+
+
+/**
+ * Test DeviceRunLengthEncode on pointer type
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Allocate host arrays
+    T*      h_in                    = new T[num_items];
+    T*      h_unique_reference      = new T[num_items];
+    OffsetT* h_offsets_reference     = new OffsetT[num_items];
+    LengthT* h_lengths_reference     = new LengthT[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);
+
+    // Initialize problem and solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_in, num_items, max_segment);
+
+    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
+
+    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
+        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_runs, float(num_items) / num_runs,
+        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    T* d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_unique_reference) delete[] h_unique_reference;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test on iterator type
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestIterator(
+    int             num_items,
+    Int2Type<true>  /*is_primitive*/)
+{
+    // Allocate host arrays
+    T* h_unique_reference       = new T[num_items];
+    OffsetT* h_offsets_reference = new OffsetT[num_items];
+    LengthT* h_lengths_reference = new LengthT[num_items];
+
+    T one_val;
+    InitValue(INTEGER_SEED, one_val, 1);
+    ConstantInputIterator<T, int> h_in(one_val);
+
+    // Initialize problem and solution
+    Equality equality_op;
+    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
+
+    printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
+        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_runs, float(num_items) / num_runs,
+        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
+    fflush(stdout);
+
+    // Run Test
+    Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
+
+    // Cleanup
+    if (h_unique_reference) delete[] h_unique_reference;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+}
+
+
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestIterator(
+    int             /*num_items*/,
+    Int2Type<false> /*is_primitive*/)
+{}
+
+
+/**
+ * Test different gen modes
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void Test(
+    int             num_items)
+{
+    // Test iterator (one run)
+    TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());
+
+    // num_items runs
+    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);
+
+    // Evaluate different run lengths
+    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 3)
+    {
+        // Uniform selection run length
+        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, max_segment);
+
+        // Reduced-entropy run length
+        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, max_segment);
+    }
+}
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestDispatch(
+    int             num_items)
+{
+    Test<RLE,           CUB, T, OffsetT, LengthT>(num_items);
+    Test<NON_TRIVIAL,   CUB, T, OffsetT, LengthT>(num_items);
+
+#ifdef CUB_CDP
+    Test<RLE,           CDP, T, OffsetT, LengthT>(num_items);
+    Test<NON_TRIVIAL,   CDP, T, OffsetT, LengthT>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestSize(
+    int             num_items)
+{
+    if (num_items < 0)
+    {
+        TestDispatch<T, OffsetT, LengthT>(0);
+        TestDispatch<T, OffsetT, LengthT>(1);
+        TestDispatch<T, OffsetT, LengthT>(100);
+        TestDispatch<T, OffsetT, LengthT>(10000);
+        TestDispatch<T, OffsetT, LengthT>(1000000);
+
+        // Randomly select problem size between 1:10,000,000
+        unsigned int max_int = (unsigned int) -1;
+        for (int i = 0; i < 10; ++i)
+        {
+            unsigned int num_items;
+            RandomBits(num_items);
+            num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
+            num_items = CUB_MAX(1, num_items);
+            TestDispatch<T, OffsetT, LengthT>(num_items);
+        }
+    }
+    else
+    {
+        TestDispatch<T, OffsetT, LengthT>(num_items);
+    }
+
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int max_segment              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", max_segment);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--entropy=<segment length bit entropy reduction rounds>]"
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    printf("\n");
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestPointer<NON_TRIVIAL,    CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestIterator<RLE,           CUB, float, int, int>(  num_items, Int2Type<Traits<float>::PRIMITIVE>());
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestPointer<RLE,            THRUST, int, int, int>(    num_items, entropy_reduction, max_segment);
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        TestSize<char,          int, int>(num_items);
+        TestSize<short,         int, int>(num_items);
+        TestSize<int,           int, int>(num_items);
+        TestSize<long,          int, int>(num_items);
+        TestSize<long long,     int, int>(num_items);
+        TestSize<float,         int, int>(num_items);
+        TestSize<double,        int, int>(num_items);
+
+        TestSize<uchar2,        int, int>(num_items);
+        TestSize<uint2,         int, int>(num_items);
+        TestSize<uint3,         int, int>(num_items);
+        TestSize<uint4,         int, int>(num_items);
+        TestSize<ulonglong4,    int, int>(num_items);
+        TestSize<TestFoo,       int, int>(num_items);
+        TestSize<TestBar,       int, int>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_scan.cu b/thrust/dependencies/cub/test/test_device_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..998f6b12f5da9828fd9ea783ac23ac4fcef95c6f
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_scan.cu
@@ -0,0 +1,1039 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/device/device_scan.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+double                  g_device_giga_bandwidth;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op;
+
+    WrapperFunctor(OpT op) : op(op) {}
+
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return op(a, b);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceScan entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to exclusive scan entrypoint
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    IsPrimitiveT        /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to exclusive sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    Int2Type<true>      /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 /*scan_op*/,
+    InitialValueT       /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to inclusive scan entrypoint
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    IsPrimitiveT        /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    NullType            /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to inclusive sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       /*dispatch_to*/,
+    Int2Type<true>      /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 /*scan_op*/,
+    NullType            /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to exclusive scan entrypoint
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    IsPrimitiveT        /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to exclusive sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    Int2Type<true>      /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 /*scan_op*/,
+    InitialValueT       /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to inclusive scan entrypoint
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    IsPrimitiveT        /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    NullType            /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, scan_op);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to inclusive sum entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    /*dispatch_to*/,
+    Int2Type<true>      /*is_primitive*/,
+    int                 timing_timing_iterations,
+    size_t              */*d_temp_storage_bytes*/,
+    cudaError_t         */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 /*scan_op*/,
+    NullType            /*initial_value*/,
+    OffsetT             num_items,
+    cudaStream_t        /*stream*/,
+    bool                /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceScan
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+__global__ void CnpDispatchKernel(
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    bool                debug_synchronous)
+{
+#ifndef CUB_CDP
+    (void)is_primitive;
+    (void)timing_timing_iterations;
+    (void)d_temp_storage_bytes;
+    (void)d_cdp_error;
+    (void)d_temp_storage;
+    (void)temp_storage_bytes;
+    (void)d_in;
+    (void)d_out;
+    (void)scan_op;
+    (void)initial_value;
+    (void)num_items;
+    (void)debug_synchronous;
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(
+        Int2Type<CUB>(),
+        is_primitive,
+        timing_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,
+        debug_synchronous);
+
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<CDP>       dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(
+        is_primitive,
+        timing_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    GenMode      gen_mode,
+    T            *h_in,
+    int          num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+/**
+ * Solve exclusive-scan problem
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    OutputT         initial_value)
+{
+    if (num_items > 0)
+    {
+        OutputT val         = h_in[0];
+        h_reference[0]      = initial_value;
+        OutputT inclusive   = scan_op(initial_value, val);
+
+        for (int i = 1; i < num_items; ++i)
+        {
+            val = h_in[i];
+            h_reference[i] = inclusive;
+            inclusive = scan_op(inclusive, val);
+        }
+    }
+}
+
+
+/**
+ * Solve inclusive-scan problem
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    NullType)
+{
+    if (num_items > 0)
+    {
+        OutputT inclusive   = h_in[0];
+        h_reference[0]      = inclusive;
+
+        for (int i = 1; i < num_items; ++i)
+        {
+            OutputT val = h_in[i];
+            inclusive = scan_op(inclusive, val);
+            h_reference[i] = inclusive;
+        }
+    }
+}
+
+
+/**
+ * Test DeviceScan for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT>
+void Test(
+    DeviceInputIteratorT    d_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value)
+{
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;
+
+    // Allocate device output array
+    OutputT *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,   sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,
+        true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,
+        true));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        g_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,
+        false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
+            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Test DeviceScan on pointer type
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestPointer(
+    int             num_items,
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
+        (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan",
+        num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT),
+        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
+    fflush(stdout);
+
+    // Allocate host arrays
+    InputT*     h_in        = new InputT[num_items];
+    OutputT*    h_reference = new OutputT[num_items];
+
+    // Initialize problem and solution
+    Initialize(gen_mode, h_in, num_items);
+    Solve(h_in, h_reference, num_items, scan_op, initial_value);
+
+    // Allocate problem device arrays
+    InputT *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test DeviceScan on iterator type
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestIterator(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
+        (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan",
+        num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT));
+    fflush(stdout);
+
+    // Use a constant iterator as the input
+    InputT val = InputT();
+    ConstantInputIterator<InputT, int> h_in(val);
+
+    // Allocate host arrays
+    OutputT*  h_reference = new OutputT[num_items];
+
+    // Initialize problem and solution
+    Solve(h_in, h_reference, num_items, scan_op, initial_value);
+
+    // Run Test
+    Test<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);
+
+    // Cleanup
+    if (h_reference) delete[] h_reference;
+}
+
+
+/**
+ * Test different gen modes
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Test(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, UNIFORM, scan_op, initial_value);
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, RANDOM,  scan_op, initial_value);
+    TestIterator<BACKEND, InputT, OutputT>( num_items, scan_op, initial_value);
+}
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Test(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    Test<CUB, InputT, OutputT>(num_items, scan_op, initial_value);
+#ifdef CUB_CDP
+    Test<CDP, InputT, OutputT>(num_items, scan_op, initial_value);
+#endif
+}
+
+
+/**
+ * Test different operators
+ */
+template <typename InputT, typename OutputT>
+void TestOp(
+    int             num_items,
+    OutputT         identity,
+    OutputT         initial_value)
+{
+    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
+    Test<InputT, OutputT>(num_items, cub::Sum(), identity);
+    Test<InputT, OutputT>(num_items, cub::Max(), identity);
+
+    // Exclusive (non-specialized, so we can test initial-value)
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Sum>(cub::Sum()), initial_value);
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Max>(cub::Max()), initial_value);
+
+    // Inclusive (no initial value)
+    Test<InputT, OutputT>(num_items, cub::Sum(), NullType());
+    Test<InputT, OutputT>(num_items, cub::Max(), NullType());
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <
+    typename InputT,
+    typename OutputT>
+void TestSize(
+    int     num_items,
+    OutputT identity,
+    OutputT initial_value)
+{
+    if (num_items < 0)
+    {
+        TestOp<InputT>(0,        identity, initial_value);
+        TestOp<InputT>(1,        identity, initial_value);
+        TestOp<InputT>(100,      identity, initial_value);
+        TestOp<InputT>(10000,    identity, initial_value);
+        TestOp<InputT>(1000000,  identity, initial_value);
+
+        // Randomly select problem size between 1:10,000,000
+        unsigned int max_int = (unsigned int) -1;
+        for (int i = 0; i < 10; ++i)
+        {
+            unsigned int num_items;
+            RandomBits(num_items);
+            num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
+            num_items = CUB_MAX(1, num_items);
+            TestOp<InputT>(num_items,  identity, initial_value);
+        }
+    }
+    else
+    {
+        TestOp<InputT>(num_items, identity, initial_value);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items = -1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, char, int>(            num_items    , RANDOM_BIT, Sum(), (int) (0));
+    TestPointer<CUB, short, int>(           num_items    , RANDOM_BIT, Sum(), (int) (0));
+
+    printf("----------------------------\n");
+
+    TestPointer<CUB, int, int>(             num_items    , RANDOM_BIT, Sum(), (int) (0));
+    TestPointer<CUB, long long, long long>( num_items    , RANDOM_BIT, Sum(), (long long) (0));
+
+    printf("----------------------------\n");
+
+    TestPointer<CUB, float, float>(         num_items    , RANDOM_BIT, Sum(), (float) (0));
+    TestPointer<CUB, double, double>(       num_items    , RANDOM_BIT, Sum(), (double) (0));
+
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, char, char>(        num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
+    TestPointer<THRUST, char, char>(     num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, short, short>(       num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
+    TestPointer<THRUST, short, short>(    num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, int, int>(         num_items    , UNIFORM, Sum(), (int) (0));
+    TestPointer<THRUST, int, int>(      num_items    , UNIFORM, Sum(), (int) (0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, long long, long long>(   num_items / 2, UNIFORM, Sum(), (long long) (0));
+    TestPointer<THRUST, long long, long long>(num_items / 2, UNIFORM, Sum(), (long long) (0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, TestBar, TestBar>(     num_items / 4, UNIFORM, Sum(), TestBar());
+    TestPointer<THRUST, TestBar, TestBar>(  num_items / 4, UNIFORM, Sum(), TestBar());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input+output data types
+        TestSize<unsigned char>(num_items,      (int) 0, (int) 99);
+
+        // Test same intput+output data types
+        TestSize<unsigned char>(num_items,      (unsigned char) 0,      (unsigned char) 99);
+        TestSize<char>(num_items,               (char) 0,               (char) 99);
+        TestSize<unsigned short>(num_items,     (unsigned short) 0,     (unsigned short)99);
+        TestSize<unsigned int>(num_items,       (unsigned int) 0,       (unsigned int) 99);
+        TestSize<unsigned long long>(num_items, (unsigned long long) 0, (unsigned long long) 99);
+
+        TestSize<uchar2>(num_items,     make_uchar2(0, 0),              make_uchar2(17, 21));
+        TestSize<char2>(num_items,      make_char2(0, 0),               make_char2(17, 21));
+        TestSize<ushort2>(num_items,    make_ushort2(0, 0),             make_ushort2(17, 21));
+        TestSize<uint2>(num_items,      make_uint2(0, 0),               make_uint2(17, 21));
+        TestSize<ulonglong2>(num_items, make_ulonglong2(0, 0),          make_ulonglong2(17, 21));
+        TestSize<uchar4>(num_items,     make_uchar4(0, 0, 0, 0),        make_uchar4(17, 21, 32, 85));
+        TestSize<char4>(num_items,      make_char4(0, 0, 0, 0),         make_char4(17, 21, 32, 85));
+
+        TestSize<ushort4>(num_items,    make_ushort4(0, 0, 0, 0),       make_ushort4(17, 21, 32, 85));
+        TestSize<uint4>(num_items,      make_uint4(0, 0, 0, 0),         make_uint4(17, 21, 32, 85));
+        TestSize<ulonglong4>(num_items, make_ulonglong4(0, 0, 0, 0),    make_ulonglong4(17, 21, 32, 85));
+
+        TestSize<TestFoo>(num_items,
+            TestFoo::MakeTestFoo(0, 0, 0, 0),
+            TestFoo::MakeTestFoo(1ll << 63, 1 << 31, short(1 << 15), char(1 << 7)));
+
+        TestSize<TestBar>(num_items,
+            TestBar(0, 0),
+            TestBar(1ll << 63, 1 << 31));
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_select_if.cu b/thrust/dependencies/cub/test/test_device_select_if.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4488276ca176932ac9d0192139a09dccfa42cb8
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_select_if.cu
@@ -0,0 +1,1052 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::If and DevicePartition::If utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/copy.h>
+#include <thrust/partition.h>
+#include <thrust/iterator/reverse_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/device/device_partition.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;
+int                     g_timing_iterations     = 0;
+int                     g_repeat                = 0;
+float                   g_device_giga_bandwidth;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+// Selection functor type
+template <typename T>
+struct LessThan
+{
+    T compare;
+
+    __host__ __device__ __forceinline__
+    LessThan(T compare) : compare(compare) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const T &a) const {
+        return (a < compare);
+    }
+};
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to select if entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<false>             /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to partition if entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<true>              /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to select flagged entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<false>             /*partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to partition flagged entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<true>              /*partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to select if entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<false>             /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<OutputT>         d_out_wrapper_end;
+        thrust::device_ptr<InputT>          d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT>         d_out_wrapper(d_out);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to partition if entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<true>              /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
+
+        thrust::device_ptr<InputT>       d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT>       d_out_wrapper(d_out);
+
+        ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::partition_copy(
+                d_in_wrapper,
+                d_in_wrapper + num_items,
+                d_out_wrapper,
+                d_out_unselected,
+                select_op);
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to select flagged entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<false>             /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The flag type
+    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<OutputT>     d_out_wrapper_end;
+        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT>     d_out_wrapper(d_out);
+        thrust::device_ptr<FlagT>       d_flags_wrapper(d_flags);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, CastOp<bool>());
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to partition flagged entrypoint
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<true>              /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The flag type
+    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
+
+        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);
+        ReverseOutputIteratorT      d_out_unselected(d_out_wrapper + num_items);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::partition_copy(
+                d_in_wrapper,
+                d_in_wrapper + num_items,
+                d_flags_wrapper,
+                d_out_wrapper,
+                d_out_unselected,
+                CastOp<bool>());
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
+__global__ void CnpDispatchKernel(
+    IsFlaggedTag                is_flagged,
+    IsPartitionTag              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    (void)is_flagged;
+    (void)is_partition;
+    (void)timing_timing_iterations;
+    (void)d_temp_storage_bytes;
+    (void)d_temp_storage;
+    (void)temp_storage_bytes;
+    (void)d_in;
+    (void)d_flags;
+    (void)d_out;
+    (void)d_num_selected_out;
+    (void)num_items;
+    (void)select_op;
+    (void)debug_synchronous;
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    IsFlaggedTag                is_flagged,
+    IsPartitionTag              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    T*  h_in,
+    int num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        // Initialize each item to a randomly selected value from [0..126]
+        unsigned int value;
+        RandomBits(value, 0, 0, 7);
+        if (value == 127)
+            value = 126;
+        InitValue(INTEGER_SEED, h_in[i], value);
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve selection problem (and set corresponding flags)
+ */
+template <
+    typename        InputIteratorT,
+    typename        FlagIteratorT,
+    typename        SelectOpT,
+    typename        T>
+int Solve(
+    InputIteratorT  h_in,
+    SelectOpT       select_op,
+    T*              h_reference,
+    FlagIteratorT   h_flags,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        if ((h_flags[i] = select_op(h_in[i])))
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+        else
+        {
+            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
+        }
+    }
+
+    return num_selected;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    bool                IS_FLAGGED,
+    bool                IS_PARTITION,
+    typename            DeviceInputIteratorT,
+    typename            FlagT,
+    typename            SelectOpT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT    d_in,
+    FlagT*                  h_flags,
+    SelectOpT               select_op,
+    T*                      h_reference,
+    int                     num_selected,
+    int                     num_items)
+{
+    // Allocate device flags, output, and num-selected
+    FlagT*      d_flags = NULL;
+    T*          d_out = NULL;
+    int*        d_num_selected_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t*         d_temp_storage_bytes = NULL;
+    cudaError_t*    d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Copy flags and clear device output array
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare1 = (IS_PARTITION) ?
+        CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
+        CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis          = elapsed_millis / g_timing_iterations;
+        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
+        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
+        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
+        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
+
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test on pointer type
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_in        = new T[num_items];
+    FlagT*  h_flags     = new FlagT[num_items];
+    T*      h_reference = new T[num_items];
+
+    // Initialize input
+    Initialize(h_in, num_items);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    T *d_in = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test on iterator type
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestIterator(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_reference = new T[num_items];
+    FlagT*  h_flags = new FlagT[num_items];
+
+    // Use counting iterator as the input
+    CountingInputIterator<T, int> h_in(0);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+}
+
+
+/**
+ * Test different selection ratios
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void Test(
+    int             num_items)
+{
+    for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f)
+    {
+        TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
+    }
+}
+
+
+/**
+ * Test (select vs. partition) and (flagged vs. functor)
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestMethod(
+    int             num_items)
+{
+    // Functor
+    Test<BACKEND, false, false, T>(num_items);
+    Test<BACKEND, false, true, T>(num_items);
+
+    // Flagged
+    Test<BACKEND, true, false, T>(num_items);
+    Test<BACKEND, true, true, T>(num_items);
+}
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        T>
+void TestOp(
+    int             num_items)
+{
+    TestMethod<CUB, T>(num_items);
+#ifdef CUB_CDP
+    TestMethod<CDP, T>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <typename T>
+void Test(
+    int             num_items)
+{
+    if (num_items < 0)
+    {
+        TestOp<T>(0);
+        TestOp<T>(1);
+        TestOp<T>(100);
+        TestOp<T>(10000);
+        TestOp<T>(1000000);
+    }
+    else
+    {
+        TestOp<T>(num_items);
+    }
+}
+
+/**
+ * Test select/partition on pointer types
+ */
+template <typename T>
+void ComparePointer(
+    int             num_items,
+    float           select_ratio)
+{
+    printf("-- Select-if ----------------------------\n");
+    TestPointer<CUB, false, false, T>(num_items, select_ratio);
+    TestPointer<THRUST, false, false, T>(num_items, select_ratio);
+
+    printf("-- Partition-if ----------------------------\n");
+    TestPointer<CUB, false, true, T>(num_items, select_ratio);
+    TestPointer<THRUST, false, true, T>(num_items, select_ratio);
+
+    printf("-- Select-flagged ----------------------------\n");
+    TestPointer<CUB, true, false, T>(num_items, select_ratio);
+    TestPointer<THRUST, true, false, T>(num_items, select_ratio);
+
+    printf("-- Partition-flagged ----------------------------\n");
+    TestPointer<CUB, true, true, T>(num_items, select_ratio);
+    TestPointer<THRUST, true, true, T>(num_items, select_ratio);
+
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    float select_ratio      = 0.5;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("ratio", select_ratio);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--ratio=<selection ratio, default 0.5>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Select-if ----------------------------\n");
+    TestPointer<CUB, false, false, int>(num_items, select_ratio);
+
+    printf("-- Partition-if ----------------------------\n");
+    TestPointer<CUB, false, true, int>(num_items, select_ratio);
+
+    printf("-- Select-flagged ----------------------------\n");
+    TestPointer<CUB, true, false, int>(num_items, select_ratio);
+
+    printf("-- Partition-flagged ----------------------------\n");
+    TestPointer<CUB, true, true, int>(num_items, select_ratio);
+
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Iterator ----------------------------\n");
+    TestIterator<CUB, false, false, int>(num_items, select_ratio);
+
+    ComparePointer<char>(       num_items * ((sm_version <= 130) ? 1 : 4),  select_ratio);
+    ComparePointer<short>(      num_items * ((sm_version <= 130) ? 1 : 2),  select_ratio);
+    ComparePointer<int>(        num_items,                                  select_ratio);
+    ComparePointer<long long>(  num_items / 2,                              select_ratio);
+    ComparePointer<TestFoo>(    num_items / 4,                              select_ratio);
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        Test<unsigned char>(num_items);
+        Test<unsigned short>(num_items);
+        Test<unsigned int>(num_items);
+        Test<unsigned long long>(num_items);
+
+        Test<uchar2>(num_items);
+        Test<ushort2>(num_items);
+        Test<uint2>(num_items);
+        Test<ulonglong2>(num_items);
+
+        Test<uchar4>(num_items);
+        Test<ushort4>(num_items);
+        Test<uint4>(num_items);
+        Test<ulonglong4>(num_items);
+
+        Test<TestFoo>(num_items);
+        Test<TestBar>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_device_select_unique.cu b/thrust/dependencies/cub/test/test_device_select_unique.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1c13fba15de7b60a9048f70827c70c655621b48b
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_device_select_unique.cu
@@ -0,0 +1,661 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::Unique utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/unique.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/unique.h>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;
+int                     g_timing_iterations     = 0;
+int                     g_repeat                = 0;
+float                   g_device_giga_bandwidth;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                /*stream*/,
+    bool                        /*debug_synchronous*/)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<OutputT> d_out_wrapper_end;
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::unique_copy(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__global__ void CnpDispatchKernel(
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    (void)timing_timing_iterations;
+    (void)d_temp_storage_bytes;
+    (void)d_cdp_error;
+    (void)d_temp_storage;
+    (void)temp_storage_bytes;
+    (void)d_in;
+    (void)d_out;
+    (void)d_num_selected_out;
+    (void)num_items;
+    (void)debug_synchronous;
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences for the current run
+        int repeat;
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+template <
+    typename        InputIteratorT,
+    typename        T>
+int Solve(
+    InputIteratorT  h_in,
+    T               *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    if (num_items > 0)
+    {
+        h_reference[num_selected] = h_in[0];
+        num_selected++;
+    }
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (h_in[i] != h_in[i - 1])
+        {
+            h_reference[num_selected] = h_in[i];
+            num_selected++;
+        }
+    }
+
+    return num_selected;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT d_in,
+    T                   *h_reference,
+    int                 num_selected,
+    int                 num_items)
+{
+    // Allocate device output array and num selected
+    T       *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare1 = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis        = elapsed_millis / g_timing_iterations;
+        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth    = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test DeviceSelect on pointer type
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Allocate host arrays
+    T*  h_in        = new T[num_items];
+    T*  h_reference = new T[num_items];
+
+    // Initialize problem and solution
+    Initialize(entropy_reduction, h_in, num_items, max_segment);
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, float(num_items) / num_selected,
+        typeid(T).name(),
+        (int) sizeof(T),
+        entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    T *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_in, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test DeviceSelect on iterator type
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestIterator(
+    int             num_items)
+{
+    // Use a counting iterator as the input
+    CountingInputIterator<T, int> h_in(0);
+
+    // Allocate host arrays
+    T*  h_reference = new T[num_items];
+
+    // Initialize problem and solution
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, float(num_items) / num_selected,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND>(h_in, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_reference) delete[] h_reference;
+}
+
+
+/**
+ * Test different gen modes
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void Test(
+    int             num_items)
+{
+    for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11)
+    {
+        TestPointer<BACKEND, T>(num_items, 0, max_segment);
+        TestPointer<BACKEND, T>(num_items, 2, max_segment);
+        TestPointer<BACKEND, T>(num_items, 7, max_segment);
+    }
+}
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        T>
+void TestOp(
+    int             num_items)
+{
+    Test<CUB, T>(num_items);
+#ifdef CUB_CDP
+    Test<CDP, T>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <typename T>
+void Test(
+    int             num_items)
+{
+    if (num_items < 0)
+    {
+        TestOp<T>(0);
+        TestOp<T>(1);
+        TestOp<T>(100);
+        TestOp<T>(10000);
+        TestOp<T>(1000000);
+    }
+    else
+    {
+        TestOp<T>(num_items);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--entropy=<segment length bit entropy reduction rounds>]"
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Iterator ----------------------------\n");
+    TestIterator<CUB, int>(        num_items);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, char>(        num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
+    TestPointer<THRUST, char>(     num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, short>(       num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
+    TestPointer<THRUST, short>(    num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
+    TestPointer<THRUST, int>(      num_items,                                 entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, long long>(   num_items / 2,                             entropy_reduction, maxseg);
+    TestPointer<THRUST, long long>(num_items / 2,                             entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, TestFoo>(     num_items / 4,                             entropy_reduction, maxseg);
+    TestPointer<THRUST, TestFoo>(  num_items / 4,                             entropy_reduction, maxseg);
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        Test<unsigned char>(num_items);
+        Test<unsigned short>(num_items);
+        Test<unsigned int>(num_items);
+        Test<unsigned long long>(num_items);
+
+        Test<uchar2>(num_items);
+        Test<ushort2>(num_items);
+        Test<uint2>(num_items);
+        Test<ulonglong2>(num_items);
+
+        Test<uchar4>(num_items);
+        Test<ushort4>(num_items);
+        Test<uint4>(num_items);
+        Test<ulonglong4>(num_items);
+
+        Test<TestFoo>(num_items);
+        Test<TestBar>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_grid_barrier.cu b/thrust/dependencies/cub/test/test_grid_barrier.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e6e3b8125c8b6ab0f3eab55346cedabd1d7398f9
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_grid_barrier.cu
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for software global barrier throughput
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/grid/grid_barrier.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Kernel that iterates through the specified number of software global barriers
+ */
+__global__ void Kernel(
+    GridBarrier global_barrier,
+    int iterations)
+{
+    for (int i = 0; i < iterations; i++)
+    {
+        global_barrier.Sync();
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    cudaError_t retval = cudaSuccess;
+
+    // Defaults
+    int iterations = 10000;
+    int block_size = 128;
+    int grid_size = -1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Get args
+    args.GetCmdLineArgument("i", iterations);
+    args.GetCmdLineArgument("grid-size", grid_size);
+    args.GetCmdLineArgument("block-size", block_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>]"
+            "[--i=<iterations>]"
+            "[--grid-size<grid-size>]"
+            "[--block-size<block-size>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Get SM properties
+    int sm_count, max_block_threads, max_sm_occupancy;
+    CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal));
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel<void>, 32));
+
+    // Compute grid size and occupancy
+    int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy);
+
+    if (grid_size == -1)
+    {
+        grid_size = occupancy * sm_count;
+    }
+    else
+    {
+        occupancy = grid_size / sm_count;
+    }
+
+    printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
+        grid_size, block_size, occupancy);
+    fflush(stdout);
+
+    // Init global barrier
+    GridBarrierLifetime global_barrier;
+    global_barrier.Setup(grid_size);
+
+    // Time kernel
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
+    gpu_timer.Stop();
+
+    retval = CubDebug(cudaThreadSynchronize());
+
+    // Output timing results
+    float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
+    printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
+        iterations,
+        gpu_timer.ElapsedMillis(),
+        avg_elapsed);
+
+    return retval;
+}
diff --git a/thrust/dependencies/cub/test/test_iterator.cu b/thrust/dependencies/cub/test/test_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..53fc5d1f722b0bd712adbd1de40edacaa1a775d3
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_iterator.cu
@@ -0,0 +1,805 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of iterator utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/iterator/arg_index_input_iterator.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/tex_obj_input_iterator.cuh>
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include <cub/util_type.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/copy.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+template <typename T>
+struct TransformOp
+{
+    // Increment transform
+    __host__ __device__ __forceinline__ T operator()(T input) const
+    {
+        T addend;
+        InitValue(INTEGER_SEED, addend, 1);
+        return input + addend;
+    }
+};
+
+struct SelectOp
+{
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(T input)
+    {
+        return true;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test random access input iterator
+ */
+template <
+    typename InputIteratorT,
+    typename T>
+__global__ void Kernel(
+    InputIteratorT    d_in,
+    T                 *d_out,
+    InputIteratorT    *d_itrs)
+{
+    d_out[0] = *d_in;               // Value at offset 0
+    d_out[1] = d_in[100];           // Value at offset 100
+    d_out[2] = *(d_in + 1000);      // Value at offset 1000
+    d_out[3] = *(d_in + 10000);     // Value at offset 10000
+
+    d_in++;
+    d_out[4] = d_in[0];             // Value at offset 1
+
+    d_in += 20;
+    d_out[5] = d_in[0];             // Value at offset 21
+    d_itrs[0] = d_in;               // Iterator at offset 21
+
+    d_in -= 10;
+    d_out[6] = d_in[0];             // Value at offset 11;
+
+    d_in -= 11;
+    d_out[7] = d_in[0];             // Value at offset 0
+    d_itrs[1] = d_in;               // Iterator at offset 0
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Run iterator test on device
+ */
+template <
+    typename        InputIteratorT,
+    typename        T,
+    int             TEST_VALUES>
+void Test(
+    InputIteratorT  d_in,
+    T               (&h_reference)[TEST_VALUES])
+{
+    // Allocate device arrays
+    T                 *d_out    = NULL;
+    InputIteratorT    *d_itrs   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,     sizeof(T) * TEST_VALUES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs,    sizeof(InputIteratorT) * 2));
+
+    int compare;
+
+    // Run unguarded kernel
+    Kernel<<<1, 1>>>(d_in, d_out, d_itrs);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results
+    compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 21
+    InputIteratorT h_itr = d_in + 21;
+    compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 0
+    compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_itrs) CubDebugExit(g_allocator.DeviceFree(d_itrs));
+}
+
+
+/**
+ * Test constant iterator
+ */
+template <typename T>
+void TestConstant(T base)
+{
+    printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (unsigned long long) (base)); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    T h_reference[8] = {base, base, base, base, base, base, base, base};
+    ConstantInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    int copy_items  = 100;
+    T   *h_copy     = new T[copy_items];
+    T   *d_copy     = NULL;
+
+    for (int i = 0; i < copy_items; ++i)
+        h_copy[i] = d_itr[i];
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (h_copy) delete[] h_copy;
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+}
+
+
+/**
+ * Test counting iterator
+ */
+template <typename T>
+void TestCounting(T base)
+{
+    printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = base + 0;          // Value at offset 0
+    h_reference[1] = base + 100;        // Value at offset 100
+    h_reference[2] = base + 1000;       // Value at offset 1000
+    h_reference[3] = base + 10000;      // Value at offset 10000
+    h_reference[4] = base + 1;          // Value at offset 1
+    h_reference[5] = base + 21;         // Value at offset 21
+    h_reference[6] = base + 11;         // Value at offset 11
+    h_reference[7] = base + 0;          // Value at offset 0;
+
+    CountingInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    unsigned long long  max_items   = ((1ull << ((sizeof(T) * 8) - 1)) - 1);
+    size_t  copy_items              = (size_t) CUB_MIN(max_items - base, 100);     // potential issue with differencing overflows when T is a smaller type than can handle the offset
+    T                   *h_copy     = new T[copy_items];
+    T                   *d_copy     = NULL;
+
+    for (unsigned long long i = 0; i < copy_items; ++i)
+        h_copy[i] = d_itr[i];
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (h_copy) delete[] h_copy;
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+}
+
+
+/**
+ * Test modified iterator
+ */
+template <typename T, typename CastT>
+void TestModified()
+{
+    printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES = 11000;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0;
+
+    Test(CacheModifiedInputIterator<LOAD_DEFAULT, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CA, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CG, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CS, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CV, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_LDG, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_VOLATILE, T>((CastT*) d_data), h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+
+    CacheModifiedInputIterator<LOAD_CG, T> d_in_itr((CastT*) d_data);
+    CacheModifiedOutputIterator<STORE_CG, T> d_out_itr((CastT*) d_copy);
+
+    thrust::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp());
+
+    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+
+/**
+ * Test transform iterator
+ */
+template <typename T, typename CastT>
+void TestTransform()
+{
+    printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES = 11000;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        InitValue(INTEGER_SEED, h_data[i], i);
+    }
+
+    // Allocate device arrays
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = op(h_data[0]);          // Value at offset 0
+    h_reference[1] = op(h_data[100]);        // Value at offset 100
+    h_reference[2] = op(h_data[1000]);       // Value at offset 1000
+    h_reference[3] = op(h_data[10000]);      // Value at offset 10000
+    h_reference[4] = op(h_data[1]);          // Value at offset 1
+    h_reference[5] = op(h_data[21]);         // Value at offset 21
+    h_reference[6] = op(h_data[11]);         // Value at offset 11
+    h_reference[7] = op(h_data[0]);          // Value at offset 0;
+
+    TransformInputIterator<T, TransformOp<T>, CastT*> d_itr((CastT*) d_data, op);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *h_copy = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+        h_copy[i] = op(h_data[i]);
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    thrust::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_copy) delete[] h_copy;
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+
+/**
+ * Test tex-obj texture iterator
+ */
+template <typename T, typename CastT>
+void TestTexObj()
+{
+    printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int TEST_VALUES          = 11000;
+    const unsigned int DUMMY_OFFSET         = 500;
+    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0;
+
+    // Create and bind obj-based test iterator
+    TexObjInputIterator<T> d_obj_itr;
+    CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    Test(d_obj_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
+    thrust::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    CubDebugExit(d_obj_itr.UnbindTexture());
+
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
+}
+
+
+#if CUDART_VERSION >= 5050
+
+/**
+ * Test tex-ref texture iterator
+ */
+template <typename T, typename CastT>
+void TestTexRef()
+{
+    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES                   = 11000;
+    constexpr unsigned int DUMMY_OFFSET         = 500;
+    constexpr unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0;
+
+    // Create and bind ref-based test iterator
+    TexRefInputIterator<T, __LINE__> d_ref_itr;
+    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Create and bind dummy iterator of same type to check with interferance
+    TexRefInputIterator<T, __LINE__> d_ref_itr2;
+    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+
+    Test(d_ref_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
+    thrust::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    CubDebugExit(d_ref_itr.UnbindTexture());
+    CubDebugExit(d_ref_itr2.UnbindTexture());
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
+}
+
+
+/**
+ * Test texture transform iterator
+ */
+template <typename T, typename CastT>
+void TestTexTransform()
+{
+    printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES = 11000;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        InitValue(INTEGER_SEED, h_data[i], i);
+    }
+
+    // Allocate device arrays
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = op(h_data[0]);          // Value at offset 0
+    h_reference[1] = op(h_data[100]);        // Value at offset 100
+    h_reference[2] = op(h_data[1000]);       // Value at offset 1000
+    h_reference[3] = op(h_data[10000]);      // Value at offset 10000
+    h_reference[4] = op(h_data[1]);          // Value at offset 1
+    h_reference[5] = op(h_data[21]);         // Value at offset 21
+    h_reference[6] = op(h_data[11]);         // Value at offset 11
+    h_reference[7] = op(h_data[0]);          // Value at offset 0;
+
+    // Create and bind texture iterator
+    typedef TexRefInputIterator<T, __LINE__> TextureIterator;
+
+    TextureIterator d_tex_itr;
+    CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Create transform iterator
+    TransformInputIterator<T, TransformOp<T>, TextureIterator> xform_itr(d_tex_itr, op);
+
+    Test(xform_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *h_copy = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+        h_copy[i] = op(h_data[i]);
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    thrust::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_copy) delete[] h_copy;
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    CubDebugExit(d_tex_itr.UnbindTexture());
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+#endif  // CUDART_VERSION
+
+
+
+
+/**
+ * Run non-integer tests
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<false> /* is_integer */)
+{
+    TestModified<T, CastT>();
+    TestTransform<T, CastT>();
+
+#if CUB_CDP
+    // Test tex-obj iterators if CUDA dynamic parallelism enabled
+    TestTexObj<T, CastT>(type_string);
+#endif  // CUB_CDP
+
+#if CUDART_VERSION >= 5050
+    // Test tex-ref iterators for CUDA 5.5
+    TestTexRef<T, CastT>();
+    TestTexTransform<T, CastT>();
+#endif  // CUDART_VERSION
+}
+
+/**
+ * Run integer tests
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<true> /* is_integer */)
+{
+    TestConstant<T>(0);
+    TestConstant<T>(99);
+
+    TestCounting<T>(0);
+    TestCounting<T>(99);
+
+    // Run non-integer tests
+    Test<T, CastT>(Int2Type<false>());
+}
+
+/**
+ * Run tests
+ */
+template <typename T>
+void Test()
+{
+    enum {
+        IS_INTEGER = (Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER)
+    };
+
+    // Test non-const type
+    Test<T, T>(Int2Type<IS_INTEGER>());
+
+    // Test non-const type
+    Test<T, const T>(Int2Type<IS_INTEGER>());
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // Evaluate different data types
+    Test<char>();
+    Test<short>();
+    Test<int>();
+    Test<long>();
+    Test<long long>();
+    Test<float>();
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<double>();
+
+    Test<char2>();
+    Test<short2>();
+    Test<int2>();
+    Test<long2>();
+    Test<longlong2>();
+    Test<float2>();
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<double2>();
+
+    Test<char3>();
+    Test<short3>();
+    Test<int3>();
+    Test<long3>();
+    Test<longlong3>();
+    Test<float3>();
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<double3>();
+
+    Test<char4>();
+    Test<short4>();
+    Test<int4>();
+    Test<long4>();
+    Test<longlong4>();
+    Test<float4>();
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<double4>();
+
+    Test<TestFoo>();
+    Test<TestBar>();
+
+    printf("\nTest complete\n"); fflush(stdout);
+
+    return 0;
+}
+
+
+
diff --git a/thrust/dependencies/cub/test/test_util.h b/thrust/dependencies/cub/test/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2fbd17cc3b9e9de3a37a0ff21e36aa2fdcdff14
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_util.h
@@ -0,0 +1,1648 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+#pragma once
+
+#if defined(_WIN32) || defined(_WIN64)
+    #include <windows.h>
+    #undef small            // Windows is terrible for polluting macro namespace
+#else
+    #include <sys/resource.h>
+#endif
+
+#include <cuda_runtime.h>
+
+#include <stdio.h>
+#include <float.h>
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include "mersenne.h"
+#include "half.h"
+
+#include "cub/util_debug.cuh"
+#include "cub/util_device.cuh"
+#include "cub/util_type.cuh"
+#include "cub/util_macro.cuh"
+#include "cub/iterator/discard_output_iterator.cuh"
+
+/******************************************************************************
+ * Type conversion macros
+ ******************************************************************************/
+
+/**
+ * Return a value of type `T` with the same bitwise representation of `in`.
+ * Types `T` and `U` must be the same size.
+ */
+template <typename T, typename U>
+T SafeBitCast(const U& in)
+{
+  static_assert(sizeof(T) == sizeof(U), "Types must be same size.");
+  T out;
+  memcpy(&out, &in, sizeof(T));
+  return out;
+}
+
+/******************************************************************************
+ * Assertion macros
+ ******************************************************************************/
+
+/**
+ * Assert equals
+ */
+#define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);}
+
+
+/******************************************************************************
+ * Command-line parsing functionality
+ ******************************************************************************/
+
+/**
+ * Utility for parsing command line arguments
+ */
+struct CommandLineArgs
+{
+
+    std::vector<std::string>    keys;
+    std::vector<std::string>    values;
+    std::vector<std::string>    args;
+    cudaDeviceProp              deviceProp;
+    float                       device_giga_bandwidth;
+    size_t                      device_free_physmem;
+    size_t                      device_total_physmem;
+
+    /**
+     * Constructor
+     */
+    CommandLineArgs(int argc, char **argv) :
+        keys(10),
+        values(10)
+    {
+        using namespace std;
+
+        // Initialize mersenne generator
+        unsigned int mersenne_init[4]=  {0x123, 0x234, 0x345, 0x456};
+        mersenne::init_by_array(mersenne_init, 4);
+
+        for (int i = 1; i < argc; i++)
+        {
+            string arg = argv[i];
+
+            if ((arg[0] != '-') || (arg[1] != '-'))
+            {
+                args.push_back(arg);
+                continue;
+            }
+
+            string::size_type pos;
+            string key, val;
+            if ((pos = arg.find('=')) == string::npos) {
+                key = string(arg, 2, arg.length() - 2);
+                val = "";
+            } else {
+                key = string(arg, 2, pos - 2);
+                val = string(arg, pos + 1, arg.length() - 1);
+            }
+
+            keys.push_back(key);
+            values.push_back(val);
+        }
+    }
+
+
+    /**
+     * Checks whether a flag "--<flag>" is present in the commandline
+     */
+    bool CheckCmdLineFlag(const char* arg_name)
+    {
+        using namespace std;
+
+        for (int i = 0; i < int(keys.size()); ++i)
+        {
+            if (keys[i] == string(arg_name))
+                return true;
+        }
+        return false;
+    }
+
+
+    /**
+     * Returns number of naked (non-flag and non-key-value) commandline parameters
+     */
+    template <typename T>
+    int NumNakedArgs()
+    {
+        return args.size();
+    }
+
+
+    /**
+     * Returns the commandline parameter for a given index (not including flags)
+     */
+    template <typename T>
+    void GetCmdLineArgument(int index, T &val)
+    {
+        using namespace std;
+        if (index < args.size()) {
+            istringstream str_stream(args[index]);
+            str_stream >> val;
+        }
+    }
+
+    /**
+     * Returns the value specified for a given commandline parameter --<flag>=<value>
+     */
+    template <typename T>
+    void GetCmdLineArgument(const char *arg_name, T &val)
+    {
+        using namespace std;
+
+        for (int i = 0; i < int(keys.size()); ++i)
+        {
+            if (keys[i] == string(arg_name))
+            {
+                istringstream str_stream(values[i]);
+                str_stream >> val;
+            }
+        }
+    }
+
+
+    /**
+     * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*
+     */
+    template <typename T>
+    void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
+    {
+        using namespace std;
+
+        if (CheckCmdLineFlag(arg_name))
+        {
+            // Clear any default values
+            vals.clear();
+
+            // Recover from multi-value string
+            for (int i = 0; i < keys.size(); ++i)
+            {
+                if (keys[i] == string(arg_name))
+                {
+                    string val_string(values[i]);
+                    istringstream str_stream(val_string);
+                    string::size_type old_pos = 0;
+                    string::size_type new_pos = 0;
+
+                    // Iterate comma-separated values
+                    T val;
+                    while ((new_pos = val_string.find(',', old_pos)) != string::npos)
+                    {
+                        if (new_pos != old_pos)
+                        {
+                            str_stream.width(new_pos - old_pos);
+                            str_stream >> val;
+                            vals.push_back(val);
+                        }
+
+                        // skip over comma
+                        str_stream.ignore(1);
+                        old_pos = new_pos + 1;
+                    }
+
+                    // Read last value
+                    str_stream >> val;
+                    vals.push_back(val);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * The number of pairs parsed
+     */
+    int ParsedArgc()
+    {
+        return (int) keys.size();
+    }
+
+    /**
+     * Initialize device
+     */
+    cudaError_t DeviceInit(int dev = -1)
+    {
+        cudaError_t error = cudaSuccess;
+
+        do
+        {
+            int deviceCount;
+            error = CubDebug(cudaGetDeviceCount(&deviceCount));
+            if (error) break;
+
+            if (deviceCount == 0) {
+                fprintf(stderr, "No devices supporting CUDA.\n");
+                exit(1);
+            }
+            if (dev < 0)
+            {
+                GetCmdLineArgument("device", dev);
+            }
+            if ((dev > deviceCount - 1) || (dev < 0))
+            {
+                dev = 0;
+            }
+
+            error = CubDebug(cudaSetDevice(dev));
+            if (error) break;
+
+            CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
+
+            int ptx_version = 0;
+            error = CubDebug(cub::PtxVersion(ptx_version));
+            if (error) break;
+
+            error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
+            if (error) break;
+
+            if (deviceProp.major < 1) {
+                fprintf(stderr, "Device does not support CUDA.\n");
+                exit(1);
+            }
+
+            device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
+
+            if (!CheckCmdLineFlag("quiet"))
+            {
+                printf(
+                        "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
+                        "%lld free / %lld total MB physmem, "
+                        "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
+                    dev,
+                    deviceProp.name,
+                    ptx_version,
+                    deviceProp.major * 100 + deviceProp.minor * 10,
+                    deviceProp.multiProcessorCount,
+                    (unsigned long long) device_free_physmem / 1024 / 1024,
+                    (unsigned long long) device_total_physmem / 1024 / 1024,
+                    device_giga_bandwidth,
+                    deviceProp.memoryClockRate,
+                    (deviceProp.ECCEnabled) ? "on" : "off");
+                fflush(stdout);
+            }
+
+        } while (0);
+
+        return error;
+    }
+};
+
+/******************************************************************************
+ * Random bits generator
+ ******************************************************************************/
+
+int g_num_rand_samples = 0;
+
+
+template <typename T>
+bool IsNaN(T /* val */) { return false; }
+
+template<>
+__noinline__ bool IsNaN<float>(float val)
+{
+  return std::isnan(val);
+}
+
+template<>
+__noinline__ bool IsNaN<float1>(float1 val)
+{
+    return (IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<float2>(float2 val)
+{
+    return (IsNaN(val.y) || IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<float3>(float3 val)
+{
+    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<float4>(float4 val)
+{
+    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
+}
+
+template<>
+__noinline__ bool IsNaN<double>(double val)
+{
+  return std::isnan(val);
+}
+
+template<>
+__noinline__ bool IsNaN<double1>(double1 val)
+{
+    return (IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<double2>(double2 val)
+{
+    return (IsNaN(val.y) || IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<double3>(double3 val)
+{
+    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
+}
+
+template<>
+__noinline__ bool IsNaN<double4>(double4 val)
+{
+    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
+}
+
+
+template<>
+__noinline__ bool IsNaN<half_t>(half_t val)
+{
+    const auto bits = SafeBitCast<unsigned short>(val);
+
+    // commented bit is always true, leaving for documentation:
+    return (((bits >= 0x7C01) && (bits <= 0x7FFF)) ||
+        ((bits >= 0xFC01) /*&& (bits <= 0xFFFFFFFF)*/));
+}
+
+
+
+/**
+ * Generates random keys.
+ *
+ * We always take the second-order byte from rand() because the higher-order
+ * bits returned by rand() are commonly considered more uniformly distributed
+ * than the lower-order bits.
+ *
+ * We can decrease the entropy level of keys by adopting the technique
+ * of Thearling and Smith in which keys are computed from the bitwise AND of
+ * multiple random samples:
+ *
+ * entropy_reduction    | Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1                   | 0
+ * 0                    | 32
+ * 1                    | 25.95 (81%)
+ * 2                    | 17.41 (54%)
+ * 3                    | 10.78 (34%)
+ * 4                    | 6.42 (20%)
+ * ...                  | ...
+ *
+ */
+template <typename K>
+void RandomBits(
+    K &key,
+    int entropy_reduction = 0,
+    int begin_bit = 0,
+    int end_bit = sizeof(K) * 8)
+{
+    const int NUM_BYTES = sizeof(K);
+    const int WORD_BYTES = sizeof(unsigned int);
+    const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES;
+
+    unsigned int word_buff[NUM_WORDS];
+
+    if (entropy_reduction == -1)
+    {
+        memset((void *) &key, 0, sizeof(key));
+        return;
+    }
+
+    if (end_bit < 0)
+        end_bit = sizeof(K) * 8;
+
+    while (true) 
+    {
+        // Generate random word_buff
+        for (int j = 0; j < NUM_WORDS; j++)
+        {
+            int current_bit = j * WORD_BYTES * 8;
+
+            unsigned int word = 0xffffffff;
+            word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit);
+            word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit);
+
+            for (int i = 0; i <= entropy_reduction; i++)
+            {
+                // Grab some of the higher bits from rand (better entropy, supposedly)
+                word &= mersenne::genrand_int32();
+                g_num_rand_samples++;                
+            }
+
+            word_buff[j] = word;
+        }
+
+        memcpy(&key, word_buff, sizeof(K));
+
+        K copy = key;
+        if (!IsNaN(copy))
+            break;          // avoids NaNs when generating random floating point numbers
+    }
+}
+
+/// Randomly select number between [0:max)
+template <typename T>
+T RandomValue(T max)
+{
+    unsigned int bits;
+    unsigned int max_int = (unsigned int) -1;
+    do {
+        RandomBits(bits);
+    } while (bits == max_int);
+
+    return (T) ((double(bits) / double(max_int)) * double(max));
+}
+
+
+/******************************************************************************
+ * Console printing utilities
+ ******************************************************************************/
+
+/**
+ * Helper for casting character types to integers for cout printing
+ */
+template <typename T>
+T CoutCast(T val) { return val; }
+
+int CoutCast(char val) { return val; }
+
+int CoutCast(unsigned char val) { return val; }
+
+int CoutCast(signed char val) { return val; }
+
+
+
+/******************************************************************************
+ * Test value initialization utilities
+ ******************************************************************************/
+
+/**
+ * Test problem generation options
+ */
+enum GenMode
+{
+    UNIFORM,            // Assign to '2', regardless of integer seed
+    INTEGER_SEED,       // Assign to integer seed
+    RANDOM,             // Assign to random, regardless of integer seed
+    RANDOM_BIT,         // Assign to randomly chosen 0 or 1, regardless of integer seed
+};
+
+/**
+ * Initialize value
+ */
+template <typename T>
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)
+{
+    switch (gen_mode)
+    {
+#if (CUB_PTX_ARCH == 0)
+    case RANDOM:
+        RandomBits(value);
+        break;
+    case RANDOM_BIT:
+        char c;
+        RandomBits(c, 0, 0, 1);
+        value = (c > 0) ? (T) 1 : (T) -1;
+        break;
+#endif
+     case UNIFORM:
+        value = 2;
+        break;
+    case INTEGER_SEED:
+    default:
+         value = (T) index;
+        break;
+    }
+}
+
+
+/**
+ * Initialize value (bool)
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0)
+{
+    switch (gen_mode)
+    {
+#if (CUB_PTX_ARCH == 0)
+    case RANDOM:
+    case RANDOM_BIT:
+        char c;
+        RandomBits(c, 0, 0, 1);
+        value = (c > 0);
+        break;
+#endif
+     case UNIFORM:
+        value = true;
+        break;
+    case INTEGER_SEED:
+    default:
+        value = (index > 0);
+        break;
+    }
+}
+
+
+/**
+ * cub::NullType test initialization
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode /* gen_mode */,
+						   cub::NullType &/* value */,
+						   int /* index */ = 0)
+{}
+
+
+/**
+ * cub::KeyValuePair<OffsetT, ValueT>test initialization
+ */
+template <typename KeyT, typename ValueT>
+__host__ __device__ __forceinline__ void InitValue(
+    GenMode                             gen_mode,
+    cub::KeyValuePair<KeyT, ValueT>&    value,
+    int                                 index = 0)
+{
+    InitValue(gen_mode, value.value, index);
+
+    // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3
+    RandomBits(value.key, 3);
+    value.key = (value.key & 0x1);
+}
+
+
+
+/******************************************************************************
+ * Comparison and ostream operators
+ ******************************************************************************/
+
+/**
+ * KeyValuePair ostream operator
+ */
+template <typename Key, typename Value>
+std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val)
+{
+    os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')';
+    return os;
+}
+
+
+/******************************************************************************
+ * Comparison and ostream operators for CUDA vector types
+ ******************************************************************************/
+
+/**
+ * Vector1 overloads
+ */
+#define CUB_VEC_OVERLOAD_1(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '(' << CoutCast(val.x) << ')';                \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x);                                \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x);                                \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+    }                                                       \
+    /* Max */                                               \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x > b.x);                                 \
+    }                                                       \
+    /* Min */                                               \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x < b.x);                                 \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                       \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(a.x + b.x);                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace std */
+
+
+
+/**
+ * Vector2 overloads
+ */
+#define CUB_VEC_OVERLOAD_2(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y);                                   \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y);                                   \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+    }                                                       \
+    /* Max */                                               \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        return a.y > b.y;                                               \
+    }                                                       \
+    /* Min */                                               \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        return a.y < b.y;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                         \
+        T b)                                         \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+
+
+/**
+ * Vector3 overloads
+ */
+#define CUB_VEC_OVERLOAD_3(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z);                                   \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z);                                   \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+    }                                                       \
+    /* Max */                                               \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        return a.z > b.z;                                               \
+    }                                                       \
+    /* Min */                                               \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        return a.z < b.z;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+
+/**
+ * Vector4 overloads
+ */
+#define CUB_VEC_OVERLOAD_4(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ','                       \
+            << CoutCast(val.w) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z) ||                                 \
+            (a.w != b.w);                                   \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z) &&                                 \
+            (a.w == b.w);                                   \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+        InitValue(gen_mode, value.w, index);                \
+    }                                                       \
+    /* Max */                                               \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        if (a.z > b.z) return true; else if (b.z > a.z) return false;   \
+        return a.w > b.w;                                               \
+    }                                                       \
+    /* Min */                                               \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        if (a.z < b.z) return true; else if (b.z < a.z) return false;   \
+        return a.w < b.w;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z,                                      \
+            a.w + b.w);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+/**
+ * All vector overloads
+ */
+#define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT)                    \
+    CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT)                   \
+    CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT)                   \
+    CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT)                   \
+    CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
+
+/**
+ * Define for types
+ */
+CUB_VEC_OVERLOAD(char, char)
+CUB_VEC_OVERLOAD(short, short)
+CUB_VEC_OVERLOAD(int, int)
+CUB_VEC_OVERLOAD(long, long)
+CUB_VEC_OVERLOAD(longlong, long long)
+CUB_VEC_OVERLOAD(uchar, unsigned char)
+CUB_VEC_OVERLOAD(ushort, unsigned short)
+CUB_VEC_OVERLOAD(uint, unsigned int)
+CUB_VEC_OVERLOAD(ulong, unsigned long)
+CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
+CUB_VEC_OVERLOAD(float, float)
+CUB_VEC_OVERLOAD(double, double)
+
+
+//---------------------------------------------------------------------
+// Complex data type TestFoo
+//---------------------------------------------------------------------
+
+/**
+ * TestFoo complex data type
+ */
+struct TestFoo
+{
+    long long   x;
+    int         y;
+    short       z;
+    char        w;
+
+    // Factory
+    static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
+    {
+        TestFoo retval = {x, y, z, w};
+        return retval;
+    }
+
+    // Assignment from int operator
+    __host__ __device__ __forceinline__ TestFoo& operator =(int b)
+    {
+        x = b;
+        y = b;
+        z = b;
+        w = b;
+        return *this;
+    }
+
+    // Summation operator
+    __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
+    {
+        return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w);
+    }
+
+    // Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
+    {
+        return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w);
+    }
+
+    // Equality operator
+    __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
+    {
+        return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w);
+    }
+
+    // Less than operator
+    __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
+    {
+        if (x < b.x) return true; else if (b.x < x) return false;
+        if (y < b.y) return true; else if (b.y < y) return false;
+        if (z < b.z) return true; else if (b.z < z) return false;
+        return w < b.w;
+    }
+
+    // Greater than operator
+    __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
+    {
+        if (x > b.x) return true; else if (b.x > x) return false;
+        if (y > b.y) return true; else if (b.y > y) return false;
+        if (z > b.z) return true; else if (b.z > z) return false;
+        return w > b.w;
+    }
+
+};
+
+/**
+ * TestFoo ostream operator
+ */
+std::ostream& operator<<(std::ostream& os, const TestFoo& val)
+{
+    os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
+    return os;
+}
+
+/**
+ * TestFoo test initialization
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+    InitValue(gen_mode, value.z, index);
+    InitValue(gen_mode, value.w, index);
+}
+
+
+/// numeric_limits<TestFoo> specialization
+namespace cub {
+template<>
+struct NumericTraits<TestFoo>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    static TestFoo Max()
+    {
+        return TestFoo::MakeTestFoo(
+            NumericTraits<long long>::Max(),
+            NumericTraits<int>::Max(),
+            NumericTraits<short>::Max(),
+            NumericTraits<char>::Max());
+    }
+
+    static TestFoo Lowest()
+    {
+        return TestFoo::MakeTestFoo(
+            NumericTraits<long long>::Lowest(),
+            NumericTraits<int>::Lowest(),
+            NumericTraits<short>::Lowest(),
+            NumericTraits<char>::Lowest());
+    }
+};
+} // namespace cub
+
+
+//---------------------------------------------------------------------
+// Complex data type TestBar (with optimizations for fence-free warp-synchrony)
+//---------------------------------------------------------------------
+
+/**
+ * TestBar complex data type
+ */
+struct TestBar
+{
+    long long       x;
+    int             y;
+
+    // Constructor
+    __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
+    {}
+
+    // Constructor
+    __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
+    {}
+
+    // Constructor
+    __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
+    {}
+
+    // Assignment from int operator
+    __host__ __device__ __forceinline__ TestBar& operator =(int b)
+    {
+        x = b;
+        y = b;
+        return *this;
+    }
+
+    // Summation operator
+    __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
+    {
+        return TestBar(x + b.x, y + b.y);
+    }
+
+    // Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
+    {
+        return (x != b.x) || (y != b.y);
+    }
+
+    // Equality operator
+    __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
+    {
+        return (x == b.x) && (y == b.y);
+    }
+
+    // Less than operator
+    __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
+    {
+        if (x < b.x) return true; else if (b.x < x) return false;
+        return y < b.y;
+    }
+
+    // Greater than operator
+    __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
+    {
+        if (x > b.x) return true; else if (b.x > x) return false;
+        return y > b.y;
+    }
+
+};
+
+
+/**
+ * TestBar ostream operator
+ */
+std::ostream& operator<<(std::ostream& os, const TestBar& val)
+{
+    os << '(' << val.x << ',' << val.y << ')';
+    return os;
+}
+
+/**
+ * TestBar test initialization
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+}
+
+/// numeric_limits<TestBar> specialization
+namespace cub {
+template<>
+struct NumericTraits<TestBar>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    static TestBar Max()
+    {
+        return TestBar(
+            NumericTraits<long long>::Max(),
+            NumericTraits<int>::Max());
+    }
+
+    static TestBar Lowest()
+    {
+        return TestBar(
+            NumericTraits<long long>::Lowest(),
+            NumericTraits<int>::Lowest());
+    }
+};
+} // namespace cub
+
+
+/******************************************************************************
+ * Helper routines for list comparison and display
+ ******************************************************************************/
+
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename S, typename T, typename OffsetT>
+int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                << CoutCast(computed[i]) << " != "
+                << CoutCast(reference[i]);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename OffsetT>
+int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            float difference = std::abs(computed[i]-reference[i]);
+            float fraction = difference / std::abs(reference[i]);
+
+            if (fraction > 0.0001)
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << "(computed) " << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename OffsetT>
+int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true)
+{
+    return 0;
+}
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename OffsetT>
+int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            double difference = std::abs(computed[i]-reference[i]);
+            double fraction = difference / std::abs(reference[i]);
+
+            if (fraction > 0.0001)
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Verify the contents of a device array match those
+ * of a host array
+ */
+int CompareDeviceResults(
+    cub::NullType */* h_reference */,
+    cub::NullType */* d_data */,
+    size_t /* num_items */,
+    bool /* verbose */ = true,
+    bool /* display_data */ = false)
+{
+    return 0;
+}
+
+/**
+ * Verify the contents of a device array match those
+ * of a host array
+ */
+template <typename S, typename OffsetT>
+int CompareDeviceResults(
+    S *h_reference,
+    cub::DiscardOutputIterator<OffsetT> d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    return 0;
+}
+
+/**
+ * Verify the contents of a device array match those
+ * of a host array
+ */
+template <typename S, typename T>
+int CompareDeviceResults(
+    S *h_reference,
+    T *d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    // Allocate array on host
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+
+    // Copy data back
+    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
+
+    // Display data
+    if (display_data)
+    {
+        printf("Reference:\n");
+        for (int i = 0; i < int(num_items); i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (int i = 0; i < int(num_items); i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check
+    int retval = CompareResults(h_data, h_reference, num_items, verbose);
+
+    // Cleanup
+    if (h_data) free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Verify the contents of a device array match those
+ * of a device array
+ */
+template <typename T>
+int CompareDeviceDeviceResults(
+    T *d_reference,
+    T *d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    // Allocate array on host
+    T *h_reference = (T*) malloc(num_items * sizeof(T));
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+
+    // Copy data back
+    cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
+
+    // Display data
+    if (display_data) {
+        printf("Reference:\n");
+        for (int i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (int i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check
+    int retval = CompareResults(h_data, h_reference, num_items, verbose);
+
+    // Cleanup
+    if (h_reference) free(h_reference);
+    if (h_data) free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Print the contents of a host array
+ */
+void DisplayResults(
+    cub::NullType   */* h_data */,
+    size_t          /* num_items */)
+{}
+
+
+/**
+ * Print the contents of a host array
+ */
+template <typename InputIteratorT>
+void DisplayResults(
+    InputIteratorT h_data,
+    size_t num_items)
+{
+    // Display data
+    for (int i = 0; i < int(num_items); i++)
+    {
+        std::cout << CoutCast(h_data[i]) << ", ";
+    }
+    printf("\n");
+}
+
+
+/**
+ * Print the contents of a device array
+ */
+template <typename T>
+void DisplayDeviceResults(
+    T *d_data,
+    size_t num_items)
+{
+    // Allocate array on host
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+
+    // Copy data back
+    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
+
+    DisplayResults(h_data, num_items);
+
+    // Cleanup
+    if (h_data) free(h_data);
+}
+
+
+/******************************************************************************
+ * Segment descriptor generation
+ ******************************************************************************/
+
+/**
+ * Initialize segments
+ */
+void InitializeSegments(
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    bool    verbose = false)
+{
+    if (num_segments <= 0)
+        return;
+
+    unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
+    int offset = 0;
+    for (int i = 0; i < num_segments; ++i)
+    {
+        h_segment_offsets[i] = offset;
+
+        unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1);
+        offset += segment_length;
+        offset = CUB_MIN(offset, num_items);
+    }
+    h_segment_offsets[num_segments] = num_items;
+
+    if (verbose)
+    {
+        printf("Segment offsets: ");
+        DisplayResults(h_segment_offsets, num_segments + 1);
+    }
+}
+
+
+/******************************************************************************
+ * Timing
+ ******************************************************************************/
+
+
+struct CpuTimer
+{
+#if defined(_WIN32) || defined(_WIN64)
+
+    LARGE_INTEGER ll_freq;
+    LARGE_INTEGER ll_start;
+    LARGE_INTEGER ll_stop;
+
+    CpuTimer()
+    {
+        QueryPerformanceFrequency(&ll_freq);
+    }
+
+    void Start()
+    {
+        QueryPerformanceCounter(&ll_start);
+    }
+
+    void Stop()
+    {
+        QueryPerformanceCounter(&ll_stop);
+    }
+
+    float ElapsedMillis()
+    {
+        double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);
+        double stop  = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
+
+        return float((stop - start) * 1000);
+    }
+
+#else
+
+    rusage start;
+    rusage stop;
+
+    void Start()
+    {
+        getrusage(RUSAGE_SELF, &start);
+    }
+
+    void Stop()
+    {
+        getrusage(RUSAGE_SELF, &stop);
+    }
+
+    float ElapsedMillis()
+    {
+        float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
+        float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
+
+        return (sec * 1000) + (usec / 1000);
+    }
+
+#endif
+};
+
+struct GpuTimer
+{
+    cudaEvent_t start;
+    cudaEvent_t stop;
+
+    GpuTimer()
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~GpuTimer()
+    {
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+
+    void Start()
+    {
+        cudaEventRecord(start, 0);
+    }
+
+    void Stop()
+    {
+        cudaEventRecord(stop, 0);
+    }
+
+    float ElapsedMillis()
+    {
+        float elapsed;
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed;
+    }
+};
diff --git a/thrust/dependencies/cub/test/test_warp_reduce.cu b/thrust/dependencies/cub/test/test_warp_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..673219aa4191ffb45e2580cefbc2d318b6305190
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_warp_reduce.cu
@@ -0,0 +1,840 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of WarpReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/warp/warp_reduce.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;
+int                     g_repeat        = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
+ */
+template<
+    typename    OpT,
+    int         LOGICAL_WARP_THREADS>
+struct WrapperFunctor
+{
+    OpT op;
+    int num_valid;
+
+    inline __host__ __device__ WrapperFunctor(OpT op, int num_valid) : op(op), num_valid(num_valid) {}
+
+    template <typename T>
+    inline __host__ __device__ T operator()(const T &a, const T &b) const
+    {
+#if CUB_PTX_ARCH != 0
+        if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid)
+            cub::ThreadTrap();
+#endif
+
+        return op(a, b);
+    }
+
+};
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Generic reduction
+ */
+template <
+    typename    T,
+    typename    ReductionOp,
+    typename    WarpReduce,
+    bool        PRIMITIVE = Traits<T>::PRIMITIVE>
+struct DeviceTest
+{
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).Reduce(data, reduction_op);
+    }
+
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        ReductionOp                         &reduction_op,
+        const int                           &valid_warp_threads)
+    {
+        return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads);
+    }
+
+    template <typename FlagT>
+    static __device__ __forceinline__ T HeadSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op);
+    }
+
+    template <typename FlagT>
+    static __device__ __forceinline__ T TailSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op);
+    }
+
+};
+
+
+/**
+ * Summation
+ */
+template <
+    typename    T,
+    typename    WarpReduce>
+struct DeviceTest<T, Sum, WarpReduce, true>
+{
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).Sum(data);
+    }
+
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        Sum                              &reduction_op,
+        const int                           &valid_warp_threads)
+    {
+        return WarpReduce(temp_storage).Sum(data, valid_warp_threads);
+    }
+
+    template <typename FlagT>
+    static __device__ __forceinline__ T HeadSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).HeadSegmentedSum(data, flag);
+    }
+
+    template <typename FlagT>
+    static __device__ __forceinline__ T TailSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).TailSegmentedSum(data, flag);
+    }
+
+};
+
+
+/**
+ * Full-tile warp reduction kernel
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+__global__ void FullWarpReduceKernel(
+    T               *d_in,
+    T               *d_out,
+    ReductionOp     reduction_op,
+    clock_t         *d_elapsed)
+{
+    // Cooperative warp-reduce utility type (1 warp)
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T input = d_in[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
+        temp_storage[warp_id], input, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
+        output :
+        input;
+}
+
+/**
+ * Partially-full warp reduction kernel
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+__global__ void PartialWarpReduceKernel(
+    T           *d_in,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed,
+    int         valid_warp_threads)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T input = d_in[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test partial-warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
+        temp_storage[warp_id], input, reduction_op, valid_warp_threads);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
+        output :
+        input;
+}
+
+
+/**
+ * Head-based segmented warp reduction test kernel
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    FlagT,
+    typename    ReductionOp>
+__global__ void WarpHeadSegmentedReduceKernel(
+    T           *d_in,
+    FlagT        *d_head_flags,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T       input       = d_in[threadIdx.x];
+    FlagT   head_flag   = d_head_flags[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test segmented warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::HeadSegmentedReduce(
+        temp_storage[warp_id], input, head_flag, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
+        output :
+        input;
+}
+
+
+/**
+ * Tail-based segmented warp reduction test kernel
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    FlagT,
+    typename    ReductionOp>
+__global__ void WarpTailSegmentedReduceKernel(
+    T           *d_in,
+    FlagT       *d_tail_flags,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T       input       = d_in[threadIdx.x];
+    FlagT    tail_flag   = d_tail_flags[threadIdx.x];
+    FlagT    head_flag   = (threadIdx.x == 0) ?
+                            0 :
+                            d_tail_flags[threadIdx.x - 1];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test segmented warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::TailSegmentedReduce(
+        temp_storage[warp_id], input, tail_flag, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
+        output :
+        input;
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize reduction problem (and solution)
+ */
+template <
+    typename    T,
+    typename    ReductionOp>
+void Initialize(
+    GenMode     gen_mode,
+    int         flag_entropy,
+    T           *h_in,
+    int         *h_flags,
+    int         warps,
+    int         warp_threads,
+    int         valid_warp_threads,
+    ReductionOp reduction_op,
+    T           *h_head_out,
+    T           *h_tail_out)
+{
+    for (int i = 0; i < warps * warp_threads; ++i)
+    {
+        // Sample a value for this item
+        InitValue(gen_mode, h_in[i], i);
+        h_head_out[i] = h_in[i];
+        h_tail_out[i] = h_in[i];
+
+        // Sample whether or not this item will be a segment head
+        char bits;
+        RandomBits(bits, flag_entropy);
+        h_flags[i] = bits & 0x1;
+    }
+
+    // Accumulate segments (lane 0 of each warp is implicitly a segment head)
+    for (int warp = 0; warp < warps; ++warp)
+    {
+        int warp_offset  = warp * warp_threads;
+        int item_offset = warp_offset + valid_warp_threads - 1;
+
+        // Last item in warp
+        T head_aggregate = h_in[item_offset];
+        T tail_aggregate = h_in[item_offset];
+
+        if (h_flags[item_offset])
+            h_head_out[item_offset] = head_aggregate;
+        item_offset--;
+
+        // Work backwards
+        while (item_offset >= warp_offset)
+        {
+            if (h_flags[item_offset + 1])
+            {
+                head_aggregate = h_in[item_offset];
+            }
+            else
+            {
+                head_aggregate = reduction_op(head_aggregate, h_in[item_offset]);
+            }
+
+            if (h_flags[item_offset])
+            {
+                h_head_out[item_offset] = head_aggregate;
+                h_tail_out[item_offset + 1] = tail_aggregate;
+                tail_aggregate = h_in[item_offset];
+            }
+            else
+            {
+                tail_aggregate = reduction_op(tail_aggregate, h_in[item_offset]);
+            }
+
+            item_offset--;
+        }
+
+        // Record last segment head_aggregate to head offset
+        h_head_out[warp_offset] = head_aggregate;
+        h_tail_out[warp_offset] = tail_aggregate;
+    }
+}
+
+
+/**
+ * Test warp reduction
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void TestReduce(
+    GenMode     gen_mode,
+    ReductionOp reduction_op,
+    int         valid_warp_threads = LOGICAL_WARP_THREADS)
+{
+    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
+
+    // Allocate host arrays
+    T   *h_in           = new T[BLOCK_THREADS];
+    int *h_flags        = new int[BLOCK_THREADS];
+    T   *h_out          = new T[BLOCK_THREADS];
+    T   *h_tail_out     = new T[BLOCK_THREADS];
+
+    // Initialize problem
+    Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out);
+
+    // Initialize/clear device arrays
+    T *d_in = NULL;
+    T *d_out = NULL;
+    clock_t *d_elapsed = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * BLOCK_THREADS));
+
+    if (g_verbose)
+    {
+        printf("Data:\n");
+        for (int i = 0; i < WARPS; ++i)
+            DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), valid_warp_threads);
+    }
+
+    // Run kernel
+    printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n",
+        gen_mode,
+        WARPS,
+        LOGICAL_WARP_THREADS,
+        valid_warp_threads,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    if (valid_warp_threads == LOGICAL_WARP_THREADS)
+    {
+        // Run full-warp kernel
+        FullWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            reduction_op,
+            d_elapsed);
+    }
+    else
+    {
+        // Run partial-warp kernel
+        PartialWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            reduction_op,
+            d_elapsed,
+            valid_warp_threads);
+    }
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tReduction results: ");
+    int compare = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_flags) delete[] h_flags;
+    if (h_out) delete[] h_out;
+    if (h_tail_out) delete[] h_tail_out;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test warp segmented reduction
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void TestSegmentedReduce(
+    GenMode     gen_mode,
+    int         flag_entropy,
+    ReductionOp reduction_op)
+{
+    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
+
+    // Allocate host arrays
+    int compare;
+    T   *h_in           = new T[BLOCK_THREADS];
+    int *h_flags        = new int[BLOCK_THREADS];
+    T   *h_head_out     = new T[BLOCK_THREADS];
+    T   *h_tail_out     = new T[BLOCK_THREADS];
+
+    // Initialize problem
+    Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out);
+
+    // Initialize/clear device arrays
+    T           *d_in = NULL;
+    int         *d_flags = NULL;
+    T           *d_head_out = NULL;
+    T           *d_tail_out = NULL;
+    clock_t     *d_elapsed = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(int) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_head_out, 0, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(cudaMemset(d_tail_out, 0, sizeof(T) * BLOCK_THREADS));
+
+    if (g_verbose)
+    {
+        printf("Data:\n");
+        for (int i = 0; i < WARPS; ++i)
+            DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
+
+        printf("\nFlags:\n");
+        for (int i = 0; i < WARPS; ++i)
+            DisplayResults(h_flags + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
+    }
+
+    printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n",
+        gen_mode,
+        flag_entropy,
+        WARPS,
+        LOGICAL_WARP_THREADS,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Run head-based kernel
+    WarpHeadSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+        d_in,
+        d_flags,
+        d_head_out,
+        reduction_op,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tHead-based segmented reduction results: ");
+    compare = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Run tail-based kernel
+    WarpTailSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+        d_in,
+        d_flags,
+        d_tail_out,
+        reduction_op,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tTail-based segmented reduction results: ");
+    compare = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_flags) delete[] h_flags;
+    if (h_head_out) delete[] h_head_out;
+    if (h_tail_out) delete[] h_tail_out;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+    if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out));
+    if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Run battery of tests for different full and partial tile sizes
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void Test(
+    GenMode     gen_mode,
+    ReductionOp reduction_op)
+{
+    // Partial tiles
+    for (
+        int valid_warp_threads = 1;
+        valid_warp_threads < LOGICAL_WARP_THREADS;
+        valid_warp_threads += CUB_MAX(1, LOGICAL_WARP_THREADS / 5))
+    {
+        // Without wrapper (to test non-excepting PTX POD-op specializations)
+        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, valid_warp_threads);
+
+        // With wrapper to ensure no ops called on OOB lanes
+        WrapperFunctor<ReductionOp, LOGICAL_WARP_THREADS> wrapped_op(reduction_op, valid_warp_threads);
+        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, wrapped_op, valid_warp_threads);
+    }
+
+    // Full tile
+    TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, LOGICAL_WARP_THREADS);
+
+    // Segmented reduction with different head flags
+    for (int flag_entropy = 0; flag_entropy < 10; ++flag_entropy)
+    {
+        TestSegmentedReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, flag_entropy, reduction_op);
+    }
+}
+
+
+/**
+ * Run battery of tests for different data types and reduce ops
+ */
+template <
+    int WARPS,
+    int LOGICAL_WARP_THREADS>
+void Test(GenMode gen_mode)
+{
+    // primitive
+    Test<WARPS, LOGICAL_WARP_THREADS, char>(                gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, short>(               gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, int>(                 gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, long long>(           gen_mode, Sum());
+
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(       gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(      gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(        gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(  gen_mode, Sum());
+
+    if (gen_mode != RANDOM)
+    {
+        Test<WARPS, LOGICAL_WARP_THREADS, float>(           gen_mode, Sum());
+        Test<WARPS, LOGICAL_WARP_THREADS, double>(          gen_mode, Sum());
+    }
+
+    // primitive (alternative reduce op)
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(       gen_mode, Max());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(      gen_mode, Max());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(        gen_mode, Max());
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(  gen_mode, Max());
+
+    // vec-1
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar1>(              gen_mode, Sum());
+
+    // vec-2
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar2>(              gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, ushort2>(             gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, uint2>(               gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong2>(          gen_mode, Sum());
+
+    // vec-4
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar4>(              gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, ushort4>(             gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, uint4>(               gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong4>(          gen_mode, Sum());
+
+    // complex
+    Test<WARPS, LOGICAL_WARP_THREADS, TestFoo>(             gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, TestBar>(             gen_mode, Sum());
+}
+
+
+/**
+ * Run battery of tests for different problem generation options
+ */
+template <
+    int WARPS,
+    int LOGICAL_WARP_THREADS>
+void Test()
+{
+    Test<WARPS, LOGICAL_WARP_THREADS>(UNIFORM);
+    Test<WARPS, LOGICAL_WARP_THREADS>(INTEGER_SEED);
+    Test<WARPS, LOGICAL_WARP_THREADS>(RANDOM);
+}
+
+
+/**
+ * Run battery of tests for different number of active warps
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test()
+{
+    Test<1, LOGICAL_WARP_THREADS>();
+
+    // Only power-of-two subwarps can be tiled
+    if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
+        Test<2, LOGICAL_WARP_THREADS>();
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    TestReduce<1, 32, int>(UNIFORM, Sum());
+
+    TestReduce<1, 32, double>(UNIFORM, Sum());
+    TestReduce<2, 16, TestBar>(UNIFORM, Sum());
+    TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test logical warp sizes
+        Test<32>();
+        Test<16>();
+        Test<9>();
+        Test<7>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/thrust/dependencies/cub/test/test_warp_scan.cu b/thrust/dependencies/cub/test/test_warp_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a7307323068cb3960660a7f25069763c47f7ef7d
--- /dev/null
+++ b/thrust/dependencies/cub/test/test_warp_scan.cu
@@ -0,0 +1,661 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of WarpScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/warp/warp_scan.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+static const int        NUM_WARPS       = 2;
+
+
+bool                    g_verbose       = false;
+int                     g_repeat        = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/**
+ * Primitive variant to test
+ */
+enum TestMode
+{
+    BASIC,
+    AGGREGATE,
+};
+
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op;
+
+    WrapperFunctor(OpT op) : op(op) {}
+
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return op(a, b);
+    }
+};
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/// Exclusive scan basic
+template <typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    T                               &initial_value,
+    ScanOpT                         &scan_op,
+    T                               &aggregate,
+    Int2Type<BASIC>                 test_mode,
+    IsPrimitiveT                    is_primitive)
+{
+    // Test basic warp scan
+    warp_scan.ExclusiveScan(data, data, initial_value, scan_op);
+}
+
+/// Exclusive scan aggregate
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    ScanOpT,
+    typename    IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    T                               &initial_value,
+    ScanOpT                         &scan_op,
+    T                               &aggregate,
+    Int2Type<AGGREGATE>             test_mode,
+    IsPrimitiveT                    is_primitive)
+{
+    // Test with cumulative aggregate
+    warp_scan.ExclusiveScan(data, data, initial_value, scan_op, aggregate);
+}
+
+
+/// Exclusive sum basic
+template <
+    typename    WarpScanT,
+    typename    T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    T                               &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<BASIC>                 test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test basic warp scan
+    warp_scan.ExclusiveSum(data, data);
+}
+
+
+/// Exclusive sum aggregate
+template <
+    typename    WarpScanT,
+    typename    T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    T                               &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<AGGREGATE>             test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test with cumulative aggregate
+    warp_scan.ExclusiveSum(data, data, aggregate);
+}
+
+
+/// Inclusive scan basic
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    ScanOpT,
+    typename    IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    ScanOpT                         &scan_op,
+    T                               &aggregate,
+    Int2Type<BASIC>                 test_mode,
+    IsPrimitiveT                    is_primitive)
+{
+    // Test basic warp scan
+    warp_scan.InclusiveScan(data, data, scan_op);
+}
+
+/// Inclusive scan aggregate
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    ScanOpT,
+    typename    IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    ScanOpT                         &scan_op,
+    T                               &aggregate,
+    Int2Type<AGGREGATE>             test_mode,
+    IsPrimitiveT                    is_primitive)
+{
+    // Test with cumulative aggregate
+    warp_scan.InclusiveScan(data, data, scan_op, aggregate);
+}
+
+/// Inclusive sum basic
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    InitialValueT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<BASIC>                 test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test basic warp scan
+    warp_scan.InclusiveSum(data, data);
+}
+
+/// Inclusive sum aggregate
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    InitialValueT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<AGGREGATE>             test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test with cumulative aggregate
+    warp_scan.InclusiveSum(data, data, aggregate);
+}
+
+
+/**
+ * WarpScan test kernel
+ */
+template <
+    int         LOGICAL_WARP_THREADS,
+    TestMode    TEST_MODE,
+    typename    T,
+    typename    ScanOpT,
+    typename    InitialValueT>
+__global__ void WarpScanKernel(
+    T               *d_in,
+    T               *d_out,
+    T               *d_aggregate,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value,
+    clock_t         *d_elapsed)
+{
+    // Cooperative warp-scan utility type (1 warp)
+    typedef WarpScan<T, LOGICAL_WARP_THREADS> WarpScanT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename WarpScanT::TempStorage temp_storage[NUM_WARPS];
+
+    // Get warp index
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+
+    // Per-thread tile data
+    T data = d_in[threadIdx.x];
+
+    // Start cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    T aggregate;
+
+    // Test scan
+    WarpScanT warp_scan(temp_storage[warp_id]);
+    DeviceTest(
+        warp_scan,
+        data,
+        initial_value,
+        scan_op,
+        aggregate,
+        Int2Type<TEST_MODE>(),
+        Int2Type<Traits<T>::PRIMITIVE>());
+
+    // Stop cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Store data
+    d_out[threadIdx.x] = data;
+
+    if (TEST_MODE != BASIC)
+    {
+        // Store aggregate
+        d_aggregate[threadIdx.x] = aggregate;
+    }
+
+    // Store time
+    if (threadIdx.x == 0)
+    {
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize exclusive-scan problem (and solution)
+ */
+template <
+    typename        T,
+    typename        ScanOpT>
+void Initialize(
+    GenMode         gen_mode,
+    T               *h_in,
+    T               *h_reference,
+    int             logical_warp_items,
+    ScanOpT         scan_op,
+    T               initial_value,
+    T               warp_aggregates[NUM_WARPS])
+{
+    for (int w = 0; w < NUM_WARPS; ++w)
+    {
+        int base_idx = (w * logical_warp_items);
+        int i = base_idx;
+
+        InitValue(gen_mode, h_in[i], i);
+
+        T warp_aggregate   = h_in[i];
+        h_reference[i]      = initial_value;
+        T inclusive         = scan_op(initial_value, h_in[i]);
+
+        for (i = i + 1; i < base_idx + logical_warp_items; ++i)
+        {
+            InitValue(gen_mode, h_in[i], i);
+            h_reference[i] = inclusive;
+            inclusive = scan_op(inclusive, h_in[i]);
+            warp_aggregate = scan_op(warp_aggregate, h_in[i]);
+        }
+
+        warp_aggregates[w] = warp_aggregate;
+    }
+
+}
+
+
+/**
+ * Initialize inclusive-scan problem (and solution)
+ */
+template <
+    typename    T,
+    typename    ScanOpT>
+void Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         logical_warp_items,
+    ScanOpT     scan_op,
+    NullType,
+    T           warp_aggregates[NUM_WARPS])
+{
+    for (int w = 0; w < NUM_WARPS; ++w)
+    {
+        int base_idx = (w * logical_warp_items);
+        int i = base_idx;
+
+        InitValue(gen_mode, h_in[i], i);
+
+        T warp_aggregate    = h_in[i];
+        T inclusive         = h_in[i];
+        h_reference[i]      = inclusive;
+
+        for (i = i + 1; i < base_idx + logical_warp_items; ++i)
+        {
+            InitValue(gen_mode, h_in[i], i);
+            inclusive = scan_op(inclusive, h_in[i]);
+            warp_aggregate = scan_op(warp_aggregate, h_in[i]);
+            h_reference[i] = inclusive;
+        }
+
+        warp_aggregates[w] = warp_aggregate;
+    }
+}
+
+
+/**
+ * Test warp scan
+ */
+template <
+    int             LOGICAL_WARP_THREADS,
+    TestMode        TEST_MODE,
+    typename        T,
+    typename        ScanOpT,
+    typename        InitialValueT>        // NullType implies inclusive-scan, otherwise inclusive scan
+void Test(
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    enum {
+        TOTAL_ITEMS = LOGICAL_WARP_THREADS * NUM_WARPS,
+    };
+
+    // Allocate host arrays
+    T *h_in = new T[TOTAL_ITEMS];
+    T *h_reference = new T[TOTAL_ITEMS];
+    T *h_aggregate = new T[TOTAL_ITEMS];
+
+    // Initialize problem
+    T aggregates[NUM_WARPS];
+
+    Initialize(
+        gen_mode,
+        h_in,
+        h_reference,
+        LOGICAL_WARP_THREADS,
+        scan_op,
+        initial_value,
+        aggregates);
+
+    if (g_verbose)
+    {
+        printf("Input: \n");
+        DisplayResults(h_in, TOTAL_ITEMS);
+        printf("\n");
+    }
+
+    for (int w = 0; w < NUM_WARPS; ++w)
+    {
+        for (int i = 0; i < LOGICAL_WARP_THREADS; ++i)
+        {
+            h_aggregate[(w * LOGICAL_WARP_THREADS) + i] = aggregates[w];
+        }
+    }
+
+    // Initialize/clear device arrays
+    T *d_in = NULL;
+    T *d_out = NULL;
+    T *d_aggregate = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TOTAL_ITEMS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TOTAL_ITEMS + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * TOTAL_ITEMS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TOTAL_ITEMS, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TOTAL_ITEMS + 1)));
+    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * TOTAL_ITEMS));
+
+    // Run kernel
+    printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n",
+        TEST_MODE, typeid(TEST_MODE).name(),
+        gen_mode, typeid(gen_mode).name(),
+        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
+        LOGICAL_WARP_THREADS,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Run aggregate/prefix kernel
+    WarpScanKernel<LOGICAL_WARP_THREADS, TEST_MODE><<<1, TOTAL_ITEMS>>>(
+        d_in,
+        d_out,
+        d_aggregate,
+        scan_op,
+        initial_value,
+        d_elapsed);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tScan results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TOTAL_ITEMS, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Copy out and display aggregate
+    if (TEST_MODE == AGGREGATE)
+    {
+        printf("\tScan aggregate: ");
+        compare = CompareDeviceResults(h_aggregate, d_aggregate, TOTAL_ITEMS, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_aggregate) delete[] h_aggregate;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Run battery of tests for different primitive variants
+ */
+template <
+    int         LOGICAL_WARP_THREADS,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+    // Exclusive
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, T());
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, T());
+
+    // Exclusive (non-specialized, so we can use initial-value)
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+
+    // Inclusive
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, NullType());
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, NullType());
+}
+
+
+/**
+ * Run battery of tests for different data types and scan ops
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test(GenMode gen_mode)
+{
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // primitive
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (char) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (short) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (int) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long long) 99);
+    if (gen_mode != RANDOM) {
+        // Only test numerically stable inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (float) 99);
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (double) 99);
+    }
+
+    // primitive (alternative scan op)
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned char) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned short) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned int) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned long long) 99);
+
+    // vec-2
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uchar2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ushort2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uint2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulong2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulonglong2(17, 21));
+    if (gen_mode != RANDOM) {
+        // Only test numerically stable inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float2(17, 21));
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double2(17, 21));
+    }
+
+    // vec-4
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_char4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_short4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_int4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_long4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_longlong4(17, 21, 32, 85));
+    if (gen_mode != RANDOM) {
+        // Only test numerically stable inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float4(17, 21, 32, 85));
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double4(17, 21, 32, 85));
+    }
+
+    // complex
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestBar(17, 21));
+
+}
+
+
+/**
+ * Run battery of tests for different problem generation options
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test()
+{
+    Test<LOGICAL_WARP_THREADS>(UNIFORM);
+    Test<LOGICAL_WARP_THREADS>(INTEGER_SEED);
+    Test<LOGICAL_WARP_THREADS>(RANDOM);
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    Test<32, AGGREGATE, int>(UNIFORM, Sum(), (int) 0);
+    Test<32, AGGREGATE, float>(UNIFORM, Sum(), (float) 0);
+    Test<32, AGGREGATE, long long>(UNIFORM, Sum(), (long long) 0);
+    Test<32, AGGREGATE, double>(UNIFORM, Sum(), (double) 0);
+
+    typedef KeyValuePair<int, float> T;
+    cub::Sum sum_op;
+    Test<32, AGGREGATE, T>(UNIFORM, ReduceBySegmentOp<cub::Sum>(sum_op), T());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test logical warp sizes
+        Test<32>();
+        Test<16>();
+        Test<9>();
+        Test<2>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/thrust/dependencies/cub/tune/.gitignore b/thrust/dependencies/cub/tune/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5e56e040ec0902e58df8573adaec65c5da6e9304
--- /dev/null
+++ b/thrust/dependencies/cub/tune/.gitignore
@@ -0,0 +1 @@
+/bin
diff --git a/thrust/dependencies/cub/tune/Makefile b/thrust/dependencies/cub/tune/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..926b340fe4af77d77663281c5874e11fe3a41be4
--- /dev/null
+++ b/thrust/dependencies/cub/tune/Makefile
@@ -0,0 +1,192 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+ 
+#-------------------------------------------------------------------------------
+# Build script for project
+#-------------------------------------------------------------------------------
+
+NVCC = "$(shell which nvcc)"
+NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+#-------------------------------------------------------------------------------
+# Libs
+#-------------------------------------------------------------------------------
+
+
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC = -I. -I.. -I../test
+
+#-------------------------------------------------------------------------------
+# Libs
+#-------------------------------------------------------------------------------
+
+LIBS += -lcudart 
+
+#-------------------------------------------------------------------------------
+# Defines
+#-------------------------------------------------------------------------------
+
+DEFINES = 
+
+#-------------------------------------------------------------------------------
+# SM Arch
+#-------------------------------------------------------------------------------
+
+ifdef sm
+	SM_ARCH = $(sm)
+else 
+    SM_ARCH = 200
+endif
+
+# Only one arch per tuning binary
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_35
+    SM_ARCH = 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_30
+    SM_ARCH = 300
+endif
+ifeq (200, $(findstring 200, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_20
+    SM_ARCH = 200
+endif
+ifeq (130, $(findstring 130, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_13
+    SM_ARCH = 130
+endif
+ifeq (110, $(findstring 110, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_11 
+    SM_ARCH = 110
+endif
+ifeq (100, $(findstring 100, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_10 
+    SM_ARCH = 100
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler Flags
+#-------------------------------------------------------------------------------
+
+NVCCFLAGS = -Xptxas -v -Xcudafe -\#
+
+# Help the compiler/linker work with huge numbers of kernels on Windows
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+endif
+
+# 32/64-bit (32-bit device pointers by default) 
+ifeq ($(force32), 1)
+	CPU_ARCH = -m32
+	CPU_ARCH_SUFFIX = i386
+else
+	CPU_ARCH = -m64
+	CPU_ARCH_SUFFIX = x86_64
+endif
+
+# CUDA ABI enable/disable (enabled by default) 
+ifneq ($(abi), 0)
+	ABI_SUFFIX = abi
+else 
+	NVCCFLAGS += -Xptxas -abi=no
+	ABI_SUFFIX = noabi
+endif
+
+# NVVM/Open64 middle-end compiler (nvvm by default)
+ifeq ($(open64), 1)
+	NVCCFLAGS += -open64
+	PTX_SUFFIX = open64
+else 
+	PTX_SUFFIX = nvvm
+endif
+
+# Verbose toolchain output from nvcc
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+# Keep intermediate compilation artifacts
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+# Data type size to compile a schmoo binary for
+ifdef tunesize
+    TUNE_SIZE = $(tunesize)
+else 
+	TUNE_SIZE = 4
+endif
+
+
+SUFFIX = $(TUNE_SIZE)B_sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =	 ./Makefile \
+		../test/test_util.h \
+		$(call rwildcard,../cub/,*.cuh)
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+
+#-------------------------------------------------------------------------------
+# make tune_device_reduce
+#-------------------------------------------------------------------------------
+
+tune_device_reduce: bin/tune_device_reduce_$(SUFFIX)
+
+bin/tune_device_reduce_$(SUFFIX) : tune_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/tune_device_reduce_$(SUFFIX) tune_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE)
+
diff --git a/thrust/dependencies/cub/tune/tune_device_reduce.cu b/thrust/dependencies/cub/tune/tune_device_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ec0cf57bbc7328857d425480fd271a094a974346
--- /dev/null
+++ b/thrust/dependencies/cub/tune/tune_device_reduce.cu
@@ -0,0 +1,763 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Evaluates different tuning configurations of DeviceReduce.
+ *
+ * The best way to use this program:
+ * (1) Find the best all-around single-block tune for a given arch.
+ *     For example, 100 samples [1 ..512], 100 timing iterations per config per sample:
+ *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
+ * (2) Update the single tune in device_reduce.cuh
+ * (3) Find the best all-around multi-block tune for a given arch.
+ *     For example, 100 samples [single-block tile-size ..  50,331,648], 100 timing iterations per config per sample:
+ *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
+ * (4) Update the multi-block tune in device_reduce.cuh
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <vector>
+#include <algorithm>
+#include <stdio.h>
+#include <cub/cub.cuh>
+#include "../test/test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+#ifndef TUNE_ARCH
+#define TUNE_ARCH 100
+#endif
+
+int     g_max_items         = 48 * 1024 * 1024;
+int     g_samples           = 100;
+int     g_timing_iterations        = 2;
+bool    g_verbose           = false;
+bool    g_single            = false;
+bool    g_verify            = true;
+CachingDeviceAllocator  g_allocator;
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    GenMode         gen_mode,
+    T               *h_in,
+    int             num_items)
+{
+    for (int i = 0; i < num_items; ++i)
+    {
+        InitValue(gen_mode, h_in[i], i);
+    }
+}
+
+/**
+ * Sequential reduction
+ */
+template <typename T, typename ReductionOp>
+T Reduce(
+    T               *h_in,
+    ReductionOp     reduction_op,
+    int             num_items)
+{
+    T retval = h_in[0];
+    for (int i = 1; i < num_items; ++i)
+        retval = reduction_op(retval, h_in[i]);
+
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Full tile test generation
+//---------------------------------------------------------------------
+
+
+
+/**
+ * Wrapper structure for generating and running different tuning configurations
+ */
+template <
+    typename T,
+    typename OffsetT,
+    typename ReductionOp>
+struct Schmoo
+{
+    //---------------------------------------------------------------------
+    // Types
+    //---------------------------------------------------------------------
+
+    /// Pairing of kernel function pointer and corresponding dispatch params
+    template <typename KernelPtr>
+    struct DispatchTuple
+    {
+        KernelPtr                           kernel_ptr;
+        DeviceReduce::KernelDispachParams   params;
+
+        float                               avg_throughput;
+        float                               best_avg_throughput;
+        OffsetT                              best_size;
+        float                               hmean_speedup;
+
+
+        DispatchTuple() :
+            kernel_ptr(0),
+            params(DeviceReduce::KernelDispachParams()),
+            avg_throughput(0.0),
+            best_avg_throughput(0.0),
+            hmean_speedup(0.0),
+            best_size(0)
+        {}
+    };
+
+    /**
+     * Comparison operator for DispatchTuple.avg_throughput
+     */
+    template <typename Tuple>
+    static bool MinSpeedup(const Tuple &a, const Tuple &b)
+    {
+        float delta = a.hmean_speedup - b.hmean_speedup;
+
+        return ((delta < 0.02) && (delta > -0.02)) ?
+            (a.best_avg_throughput < b.best_avg_throughput) :       // Negligible average performance differences: defer to best performance
+            (a.hmean_speedup < b.hmean_speedup);
+    }
+
+
+
+    /// Multi-block reduction kernel type and dispatch tuple type
+    typedef void (*MultiBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, GridEvenShare<OffsetT>, GridQueue<OffsetT>, ReductionOp);
+    typedef DispatchTuple<MultiBlockDeviceReduceKernelPtr> MultiDispatchTuple;
+
+    /// Single-block reduction kernel type and dispatch tuple type
+    typedef void (*SingleBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, ReductionOp);
+    typedef DispatchTuple<SingleBlockDeviceReduceKernelPtr> SingleDispatchTuple;
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    vector<MultiDispatchTuple> multi_kernels;       // List of generated multi-block kernels
+    vector<SingleDispatchTuple> single_kernels;     // List of generated single-block kernels
+
+
+    //---------------------------------------------------------------------
+    // Kernel enumeration methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Must have smem that fits in the SM
+     * Must have vector load length that divides items per thread
+     */
+    template <typename TilesReducePolicy, typename ReductionOp>
+    struct SmemSize
+    {
+        enum
+        {
+            BYTES = sizeof(typename BlockReduceTiles<TilesReducePolicy, T*, OffsetT, ReductionOp>::TempStorage),
+            IS_OK = ((BYTES < ArchProps<TUNE_ARCH>::SMEM_BYTES) &&
+                     (TilesReducePolicy::ITEMS_PER_THREAD % TilesReducePolicy::VECTOR_LOAD_LENGTH == 0))
+        };
+    };
+
+
+    /**
+     * Specialization that allows kernel generation with the specified TilesReducePolicy
+     */
+    template <
+        typename    TilesReducePolicy,
+        bool        IsOk = SmemSize<TilesReducePolicy, ReductionOp>::IS_OK>
+    struct Ok
+    {
+        /// Enumerate multi-block kernel and add to the list
+        template <typename KernelsVector>
+        static void GenerateMulti(
+            KernelsVector &multi_kernels,
+            int subscription_factor)
+        {
+            MultiDispatchTuple tuple;
+            tuple.params.template Init<TilesReducePolicy>(subscription_factor);
+            tuple.kernel_ptr = ReducePrivatizedKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
+            multi_kernels.push_back(tuple);
+        }
+
+
+        /// Enumerate single-block kernel and add to the list
+        template <typename KernelsVector>
+        static void GenerateSingle(KernelsVector &single_kernels)
+        {
+            SingleDispatchTuple tuple;
+            tuple.params.template Init<TilesReducePolicy>();
+            tuple.kernel_ptr = ReduceSingleKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
+            single_kernels.push_back(tuple);
+        }
+    };
+
+    /**
+     * Specialization that rejects kernel generation with the specified TilesReducePolicy
+     */
+    template <typename TilesReducePolicy>
+    struct Ok<TilesReducePolicy, false>
+    {
+        template <typename KernelsVector>
+        static void GenerateMulti(KernelsVector &multi_kernels, int subscription_factor) {}
+
+        template <typename KernelsVector>
+        static void GenerateSingle(KernelsVector &single_kernels) {}
+    };
+
+
+    /// Enumerate block-scheduling variations
+    template <
+        int                     BLOCK_THREADS,
+        int                     ITEMS_PER_THREAD,
+        int                     VECTOR_LOAD_LENGTH,
+        BlockReduceAlgorithm    BLOCK_ALGORITHM,
+        CacheLoadModifier      LOAD_MODIFIER>
+    void Enumerate()
+    {
+        // Multi-block kernels
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 1);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 2);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 4);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 8);
+#if TUNE_ARCH >= 200
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_DYNAMIC> >::GenerateMulti(multi_kernels, 1);
+#endif
+
+        // Single-block kernels
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateSingle(single_kernels);
+    }
+
+
+    /// Enumerate load modifier variations
+    template <
+        int                     BLOCK_THREADS,
+        int                     ITEMS_PER_THREAD,
+        int                     VECTOR_LOAD_LENGTH,
+        BlockReduceAlgorithm    BLOCK_ALGORITHM>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_DEFAULT>();
+#if TUNE_ARCH >= 350
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_LDG>();
+#endif
+    }
+
+
+    /// Enumerate block algorithms
+    template <
+        int BLOCK_THREADS,
+        int ITEMS_PER_THREAD,
+        int VECTOR_LOAD_LENGTH>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_RAKING>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    }
+
+
+    /// Enumerate vectorization variations
+    template <
+        int BLOCK_THREADS,
+        int ITEMS_PER_THREAD>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 4>();
+    }
+
+
+    /// Enumerate thread-granularity variations
+    template <int BLOCK_THREADS>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, 7>();
+        Enumerate<BLOCK_THREADS, 8>();
+        Enumerate<BLOCK_THREADS, 9>();
+
+        Enumerate<BLOCK_THREADS, 11>();
+        Enumerate<BLOCK_THREADS, 12>();
+        Enumerate<BLOCK_THREADS, 13>();
+
+        Enumerate<BLOCK_THREADS, 15>();
+        Enumerate<BLOCK_THREADS, 16>();
+        Enumerate<BLOCK_THREADS, 17>();
+
+        Enumerate<BLOCK_THREADS, 19>();
+        Enumerate<BLOCK_THREADS, 20>();
+        Enumerate<BLOCK_THREADS, 21>();
+
+        Enumerate<BLOCK_THREADS, 23>();
+        Enumerate<BLOCK_THREADS, 24>();
+        Enumerate<BLOCK_THREADS, 25>();
+    }
+
+
+    /// Enumerate block size variations
+    void Enumerate()
+    {
+        printf("\nEnumerating kernels\n"); fflush(stdout);
+
+        Enumerate<32>();
+        Enumerate<64>();
+        Enumerate<96>();
+        Enumerate<128>();
+        Enumerate<160>();
+        Enumerate<192>();
+        Enumerate<256>();
+        Enumerate<512>();
+    }
+
+
+    //---------------------------------------------------------------------
+    // Test methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Test a configuration
+     */
+    void TestConfiguration(
+        MultiDispatchTuple      &multi_dispatch,
+        SingleDispatchTuple     &single_dispatch,
+        T*                      d_in,
+        T*                      d_out,
+        T*                      h_reference,
+        OffsetT                  num_items,
+        ReductionOp             reduction_op)
+    {
+        // Clear output
+        if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
+
+        // Allocate temporary storage
+        void            *d_temp_storage = NULL;
+        size_t          temp_storage_bytes = 0;
+        CubDebugExit(DeviceReduce::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            multi_dispatch.kernel_ptr,
+            single_dispatch.kernel_ptr,
+            FillAndResetDrainKernel<OffsetT>,
+            multi_dispatch.params,
+            single_dispatch.params,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Warmup/correctness iteration
+        CubDebugExit(DeviceReduce::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            multi_dispatch.kernel_ptr,
+            single_dispatch.kernel_ptr,
+            FillAndResetDrainKernel<OffsetT>,
+            multi_dispatch.params,
+            single_dispatch.params,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op));
+
+        if (g_verify) CubDebugExit(cudaDeviceSynchronize());
+
+        // Copy out and display results
+        int compare = (g_verify) ?
+            CompareDeviceResults(h_reference, d_out, 1, true, false) :
+            0;
+
+        // Performance
+        GpuTimer gpu_timer;
+        float elapsed_millis = 0.0;
+        for (int i = 0; i < g_timing_iterations; i++)
+        {
+            gpu_timer.Start();
+
+            CubDebugExit(DeviceReduce::Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                multi_dispatch.kernel_ptr,
+                single_dispatch.kernel_ptr,
+                FillAndResetDrainKernel<OffsetT>,
+                multi_dispatch.params,
+                single_dispatch.params,
+                d_in,
+                d_out,
+                num_items,
+                reduction_op));
+
+            gpu_timer.Stop();
+            elapsed_millis += gpu_timer.ElapsedMillis();
+        }
+
+        // Mooch
+        CubDebugExit(cudaDeviceSynchronize());
+
+        float avg_elapsed = elapsed_millis / g_timing_iterations;
+        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
+        float avg_bandwidth = avg_throughput * sizeof(T);
+
+        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
+        if (avg_throughput > multi_dispatch.best_avg_throughput)
+        {
+            multi_dispatch.best_avg_throughput = avg_throughput;
+            multi_dispatch.best_size = num_items;
+        }
+
+        single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
+        if (avg_throughput > single_dispatch.best_avg_throughput)
+        {
+            single_dispatch.best_avg_throughput = avg_throughput;
+            single_dispatch.best_size = num_items;
+        }
+
+        if (g_verbose)
+        {
+            printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
+            multi_dispatch.params.Print();
+            printf(" ), single_dispatch( ");
+            single_dispatch.params.Print();
+            printf(" )\n");
+            fflush(stdout);
+        }
+
+        AssertEquals(0, compare);
+
+        // Cleanup temporaries
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    }
+
+
+    /**
+     * Evaluate multi-block configurations
+     */
+    void TestMulti(
+        T*                      h_in,
+        T*                      d_in,
+        T*                      d_out,
+        ReductionOp             reduction_op)
+    {
+        // Simple single kernel tuple for use with multi kernel sweep
+        typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
+        SingleDispatchTuple simple_single_tuple;
+        simple_single_tuple.params.template Init<SimpleSinglePolicy>();
+        simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;
+
+        double max_exponent      = log2(double(g_max_items));
+        double min_exponent      = log2(double(simple_single_tuple.params.tile_size));
+        unsigned int max_int     = (unsigned int) -1;
+
+        for (int sample = 0; sample < g_samples; ++sample)
+        {
+            printf("\nMulti-block sample %d, ", sample);
+
+            int num_items;
+            if (sample == 0)
+            {
+                // First sample: use max items
+                num_items = g_max_items;
+                printf("num_items: %d", num_items); fflush(stdout);
+            }
+            else
+            {
+                // Sample a problem size from [2^g_min_exponent, g_max_items].  First 2/3 of the samples are log-distributed, the other 1/3 are uniformly-distributed.
+                unsigned int bits;
+                RandomBits(bits);
+                double scale = double(bits) / max_int;
+
+                if (sample < g_samples / 2)
+                {
+                    // log bias
+                    double exponent = ((max_exponent - min_exponent) * scale) + min_exponent;
+                    num_items = pow(2.0, exponent);
+                    num_items = CUB_MIN(num_items, g_max_items);
+                    printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
+                }
+                else
+                {
+                    // uniform bias
+                    num_items = CUB_MAX(pow(2.0, min_exponent), scale * g_max_items);
+                    num_items = CUB_MIN(num_items, g_max_items);
+                    printf("num_items: %d (%.2f * %d)", num_items, scale, g_max_items); fflush(stdout);
+                }
+            }
+            if (g_verbose)
+                printf("\n");
+            else
+                printf(", ");
+
+            // Compute reference
+            T h_reference = Reduce(h_in, reduction_op, num_items);
+
+            // Run test on each multi-kernel configuration
+            float best_avg_throughput = 0.0;
+            for (int j = 0; j < multi_kernels.size(); ++j)
+            {
+                multi_kernels[j].avg_throughput = 0.0;
+
+                TestConfiguration(multi_kernels[j], simple_single_tuple, d_in, d_out, &h_reference, num_items, reduction_op);
+
+                best_avg_throughput = CUB_MAX(best_avg_throughput, multi_kernels[j].avg_throughput);
+            }
+
+            // Print best throughput for this problem size
+            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
+
+            // Accumulate speedup (inverse for harmonic mean)
+            for (int j = 0; j < multi_kernels.size(); ++j)
+                multi_kernels[j].hmean_speedup += best_avg_throughput / multi_kernels[j].avg_throughput;
+        }
+
+        // Find max overall throughput and compute hmean speedups
+        float overall_max_throughput = 0.0;
+        for (int j = 0; j < multi_kernels.size(); ++j)
+        {
+            overall_max_throughput = CUB_MAX(overall_max_throughput, multi_kernels[j].best_avg_throughput);
+            multi_kernels[j].hmean_speedup = float(g_samples) / multi_kernels[j].hmean_speedup;
+        }
+
+        // Sort by cumulative speedup
+        sort(multi_kernels.begin(), multi_kernels.end(), MinSpeedup<MultiDispatchTuple>);
+
+        // Print ranked multi configurations
+        printf("\nRanked multi_kernels:\n");
+        for (int j = 0; j < multi_kernels.size(); ++j)
+        {
+            printf("\t (%d) params( ", multi_kernels.size() - j);
+            multi_kernels[j].params.Print();
+            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
+                multi_kernels[j].hmean_speedup,
+                multi_kernels[j].best_avg_throughput,
+                (int) multi_kernels[j].best_size,
+                multi_kernels[j].best_avg_throughput * sizeof(T),
+                multi_kernels[j].best_avg_throughput / overall_max_throughput);
+        }
+
+        printf("\nMax multi-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
+    }
+
+
+    /**
+     * Evaluate single-block configurations
+     */
+    void TestSingle(
+        T*                      h_in,
+        T*                      d_in,
+        T*                      d_out,
+        ReductionOp             reduction_op)
+     {
+        // Construct a NULL-ptr multi-kernel tuple that forces a single-kernel pass
+        MultiDispatchTuple multi_tuple;
+
+        double max_exponent     = log2(double(g_max_items));
+        unsigned int max_int    = (unsigned int) -1;
+
+        for (int sample = 0; sample < g_samples; ++sample)
+        {
+            printf("\nSingle-block sample %d, ", sample);
+
+            int num_items;
+            if (sample == 0)
+            {
+                // First sample: use max items
+                num_items = g_max_items;
+                printf("num_items: %d", num_items); fflush(stdout);
+            }
+            else
+            {
+                // Sample a problem size from [2, g_max_items], log-distributed
+                unsigned int bits;
+                RandomBits(bits);
+                double scale = double(bits) / max_int;
+                double exponent = ((max_exponent - 1) * scale) + 1;
+                num_items = pow(2.0, exponent);
+                printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
+            }
+
+            if (g_verbose)
+                printf("\n");
+            else
+                printf(", ");
+
+            // Compute reference
+            T h_reference = Reduce(h_in, reduction_op, num_items);
+
+            // Run test on each single-kernel configuration (pick first multi-config to use, which shouldn't be
+            float best_avg_throughput = 0.0;
+            for (int j = 0; j < single_kernels.size(); ++j)
+            {
+                single_kernels[j].avg_throughput = 0.0;
+
+                TestConfiguration(multi_tuple, single_kernels[j], d_in, d_out, &h_reference, num_items, reduction_op);
+
+                best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
+            }
+
+            // Print best throughput for this problem size
+            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
+
+            // Accumulate speedup (inverse for harmonic mean)
+            for (int j = 0; j < single_kernels.size(); ++j)
+                single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
+        }
+
+        // Find max overall throughput and compute hmean speedups
+        float overall_max_throughput = 0.0;
+        for (int j = 0; j < single_kernels.size(); ++j)
+        {
+            overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
+            single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
+        }
+
+        // Sort by cumulative speedup
+        sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);
+
+        // Print ranked single configurations
+        printf("\nRanked single_kernels:\n");
+        for (int j = 0; j < single_kernels.size(); ++j)
+        {
+            printf("\t (%d) params( ", single_kernels.size() - j);
+            single_kernels[j].params.Print();
+            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
+                single_kernels[j].hmean_speedup,
+                single_kernels[j].best_avg_throughput,
+                (int) single_kernels[j].best_size,
+                single_kernels[j].best_avg_throughput * sizeof(T),
+                single_kernels[j].best_avg_throughput / overall_max_throughput);
+        }
+
+        printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
+    }
+
+};
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    args.GetCmdLineArgument("n", g_max_items);
+    args.GetCmdLineArgument("s", g_samples);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_single = args.CheckCmdLineFlag("single");
+    g_verify = !args.CheckCmdLineFlag("noverify");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--n=<max items>]"
+            "[--s=<samples>]"
+            "[--i=<timing iterations>]"
+            "[--single]"
+            "[--v]"
+            "[--noverify]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#if (TUNE_SIZE == 1)
+    typedef unsigned char T;
+#elif (TUNE_SIZE == 2)
+    typedef unsigned short T;
+#elif (TUNE_SIZE == 4)
+    typedef unsigned int T;
+#elif (TUNE_SIZE == 8)
+    typedef unsigned long long T;
+#else
+    // Default
+    typedef unsigned int T;
+#endif
+
+    typedef unsigned int OffsetT;
+    Sum reduction_op;
+
+    // Enumerate kernels
+    Schmoo<T, OffsetT, Sum > schmoo;
+    schmoo.Enumerate();
+
+    // Allocate host arrays
+    T *h_in = new T[g_max_items];
+
+    // Initialize problem
+    Initialize(UNIFORM, h_in, g_max_items);
+
+    // Initialize device arrays
+    T *d_in = NULL;
+    T *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));
+
+    // Test kernels
+    if (g_single)
+        schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
+    else
+        schmoo.TestMulti(h_in, d_in, d_out, reduction_op);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+
+    return 0;
+}
+
+
+
diff --git a/thrust/doc/thrust.dox b/thrust/doc/thrust.dox
new file mode 100644
index 0000000000000000000000000000000000000000..95ec1a4806e81b2094444b0c4adcf79567e0a040
--- /dev/null
+++ b/thrust/doc/thrust.dox
@@ -0,0 +1,2458 @@
+# Doxyfile 1.8.13
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = thrust
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    = .
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = YES
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the later case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 0.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = YES
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = thrust \
+                         examples
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = examples
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */detail/*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           = examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse-libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = NO
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = NO
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = NO
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sf.net) file that captures the
+# structure of the code including all documentation. Note that this feature is
+# still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = NO
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = THRUST_NOEXCEPT=noexcept \
+                         "THRUST_DEFAULT={}" \
+                         "THRUST_NODISCARD=[[nodiscard]]" \
+                         "THRUST_MR_DEFAULT_ALIGNMENT=alignof(max_align_t)" \
+                         "THRUST_FINAL=final" \
+                         "THRUST_OVERRIDE=" \
+                         "cuda_cub=system::cuda"
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/thrust/doc/thrust_logo.png b/thrust/doc/thrust_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..123794b6a93ac7503662a5c7090a99b3c0385b99
Binary files /dev/null and b/thrust/doc/thrust_logo.png differ
diff --git a/thrust/doc/thrust_logo.svg b/thrust/doc/thrust_logo.svg
new file mode 100644
index 0000000000000000000000000000000000000000..4fd82acaf317273bd52d669b1aeb8cf4c456201a
--- /dev/null
+++ b/thrust/doc/thrust_logo.svg
@@ -0,0 +1,272 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:xlink="http://www.w3.org/1999/xlink"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="1052.3622"
+   height="744.09448"
+   id="svg2"
+   sodipodi:version="0.32"
+   inkscape:version="0.46"
+   version="1.0"
+   sodipodi:docname="thrust_logo.svg"
+   inkscape:output_extension="org.inkscape.output.svg.inkscape"
+   inkscape:export-filename="/home/nathan/Desktop/Old/logos/thrust3svg.jpg.png"
+   inkscape:export-xdpi="90"
+   inkscape:export-ydpi="90">
+  <defs
+     id="defs4">
+    <linearGradient
+       id="linearGradient5922">
+      <stop
+         style="stop-color:#b3b3b3;stop-opacity:1;"
+         offset="0"
+         id="stop5924" />
+      <stop
+         style="stop-color:#b3b3b3;stop-opacity:0;"
+         offset="1"
+         id="stop5926" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient5886">
+      <stop
+         id="stop5888"
+         offset="0"
+         style="stop-color:#666666;stop-opacity:1;" />
+      <stop
+         style="stop-color:#e3e3e3;stop-opacity:1;"
+         offset="0.47389936"
+         id="stop5890" />
+      <stop
+         id="stop5892"
+         offset="1"
+         style="stop-color:#666666;stop-opacity:1;" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient5840">
+      <stop
+         id="stop5842"
+         offset="0"
+         style="stop-color:#1a1a1a;stop-opacity:1;" />
+      <stop
+         style="stop-color:#cbcbcb;stop-opacity:1;"
+         offset="0.42692322"
+         id="stop5844" />
+      <stop
+         id="stop5846"
+         offset="1"
+         style="stop-color:#252525;stop-opacity:1;" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient5795">
+      <stop
+         style="stop-color:#666666;stop-opacity:1;"
+         offset="0"
+         id="stop5797" />
+      <stop
+         id="stop5805"
+         offset="0.36170211"
+         style="stop-color:#e3e3e3;stop-opacity:1;" />
+      <stop
+         style="stop-color:#666666;stop-opacity:1;"
+         offset="1"
+         id="stop5799" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient5773">
+      <stop
+         style="stop-color:#3b3b3b;stop-opacity:1;"
+         offset="0"
+         id="stop5775" />
+      <stop
+         id="stop5781"
+         offset="0.4955157"
+         style="stop-color:#ececec;stop-opacity:0.49803922;" />
+      <stop
+         style="stop-color:#000000;stop-opacity:0;"
+         offset="1"
+         id="stop5777" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient5743">
+      <stop
+         style="stop-color:#626161;stop-opacity:1;"
+         offset="0"
+         id="stop5745" />
+      <stop
+         id="stop5753"
+         offset="0.44680852"
+         style="stop-color:#161882;stop-opacity:0.49803922;" />
+      <stop
+         style="stop-color:#00bb00;stop-opacity:0;"
+         offset="1"
+         id="stop5747" />
+    </linearGradient>
+    <linearGradient
+       id="linearGradient3213">
+      <stop
+         style="stop-color:#000000;stop-opacity:1;"
+         offset="0"
+         id="stop3215" />
+      <stop
+         style="stop-color:#a7a7a7;stop-opacity:0;"
+         offset="1"
+         id="stop3217" />
+    </linearGradient>
+    <inkscape:perspective
+       sodipodi:type="inkscape:persp3d"
+       inkscape:vp_x="0 : 526.18109 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_z="744.09448 : 526.18109 : 1"
+       inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
+       id="perspective10" />
+    <linearGradient
+       inkscape:collect="always"
+       xlink:href="#linearGradient5795"
+       id="linearGradient5810"
+       gradientUnits="userSpaceOnUse"
+       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
+       x1="771.13623"
+       y1="-287.25806"
+       x2="1120.5692"
+       y2="201.83484" />
+    <linearGradient
+       inkscape:collect="always"
+       xlink:href="#linearGradient5795"
+       id="linearGradient5824"
+       gradientUnits="userSpaceOnUse"
+       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1227.724,586.99847)"
+       x1="771.13623"
+       y1="-287.25806"
+       x2="663.33466"
+       y2="-144.52788" />
+    <linearGradient
+       inkscape:collect="always"
+       xlink:href="#linearGradient5840"
+       id="linearGradient5838"
+       gradientUnits="userSpaceOnUse"
+       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
+       x1="771.13623"
+       y1="-287.25806"
+       x2="1137.2974"
+       y2="174.0116" />
+  </defs>
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     gridtolerance="10000"
+     guidetolerance="10"
+     objecttolerance="10"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1"
+     inkscape:cx="513.86573"
+     inkscape:cy="372.04724"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1920"
+     inkscape:window-height="1125"
+     inkscape:window-x="0"
+     inkscape:window-y="25" />
+  <metadata
+     id="metadata7">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1">
+    <g
+       id="g3189"
+       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
+       inkscape:export-xdpi="47.029999"
+       inkscape:export-ydpi="47.029999">
+      <path
+         d="M 256.90625,209.34375 C 245.27561,209.38319 234.38709,213.94209 226.03125,221.0625 C 216.48171,229.20011 209.59283,242.94767 214.65625,256.65625 L 288.125,455.5625 C 291.48237,464.65215 295.87551,473.99003 303.21875,481.625 C 310.56199,489.25997 321.45303,494.71875 334.15625,494.71875 L 805.34375,494.71875 C 817.97624,494.71876 828.98878,489.54948 836.625,481.90625 C 844.26122,474.26302 848.88495,464.56763 851.65625,454.6875 L 889.5,319.75 C 893.24724,306.39046 886.23452,293.51892 877,286.21875 C 867.76548,278.91858 856.12028,274.84557 844.4375,273.5625 L 261.9375,209.59375 C 260.25138,209.40857 258.56777,209.33812 256.90625,209.34375 z"
+         inkscape:href="#rect2474"
+         id="path3265"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1"
+         xlink:href="#rect2474"
+         inkscape:original="M 258.6875 221.03125 C 239.30554 218.90262 217.29031 236.04476 223.4375 252.6875 L 296.90625 451.59375 C 303.05344 468.2365 312.62987 483.21875 332.15625 483.21875 L 803.34375 483.21875 C 822.87016 483.21876 833.82448 468.59699 838.59375 451.59375 L 876.4375 316.65625 C 881.20677 299.65302 860.56946 287.12863 841.1875 285 L 258.6875 221.03125 z "
+         inkscape:radius="11.495221"
+         sodipodi:type="inkscape:offset" />
+      <path
+         sodipodi:nodetypes="czzzzzzzz"
+         id="rect2474"
+         d="M 841.1984,285.00037 L 258.69824,221.02711 C 239.31628,218.89848 217.30488,236.03474 223.45207,252.67748 L 296.91964,451.58125 C 303.06684,468.22399 312.63943,483.23161 332.16581,483.23161 L 803.35147,483.23161 C 822.87785,483.23161 833.82838,468.58449 838.59765,451.58125 L 876.44458,316.65074 C 881.21385,299.6475 860.58036,287.129 841.1984,285.00037 z"
+         style="fill:#66b366;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1" />
+    </g>
+    <g
+       id="g3251"
+       transform="matrix(0.913744,0,0,0.3451662,176.2736,220.85042)"
+       style="opacity:1"
+       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
+       inkscape:export-xdpi="47.029999"
+       inkscape:export-ydpi="47.029999">
+      <g
+         id="g3253"
+         transform="matrix(2.0484578,-1.263301,0.1197948,2.5356515,-182.46458,-362.9203)">
+        <path
+           sodipodi:type="inkscape:offset"
+           inkscape:radius="5.4485359"
+           inkscape:original="M 291.6875 279 C 206.19469 277.76693 90.813927 330.28055 44.5625 378.59375 C 119.00866 442.66663 390.60576 547.17687 393.5 375.5625 C 394.67595 305.83429 350.18258 279.84368 291.6875 279 z "
+           xlink:href="#path3255"
+           style="fill:#666666;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+           id="path3263"
+           inkscape:href="#path3255"
+           d="M 291.78125,273.5625 C 247.88427,272.92937 197.14434,285.95647 151.3125,305.1875 C 105.48066,324.41853 64.633863,349.73338 40.625,374.8125 C 39.587603,375.89202 39.04008,377.35083 39.111013,378.84633 C 39.181946,380.34183 39.865085,381.74226 41,382.71875 C 79.595929,415.93675 166.14169,457.95278 244.96875,470.84375 C 284.38228,477.28923 321.94436,476.49105 350.625,462.34375 C 379.30564,448.19645 398.18956,420.0057 398.9375,375.65625 C 399.5452,339.62233 388.08647,313.71403 368.46875,297.28125 C 348.85103,280.84847 321.81559,273.99569 291.78125,273.5625 z" />
+        <path
+           style="fill:#ffee00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+           d="M 393.50906,375.56396 C 396.40371,203.9253 122.46857,297.21173 44.57143,378.58133 C 119.01759,442.65421 390.61482,547.17833 393.50906,375.56396 z"
+           id="path3255"
+           sodipodi:nodetypes="ccz" />
+        <path
+           style="fill:#ffb500;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+           d="M 385.4286,375.1448 C 388.01423,252.50309 143.32293,319.15945 73.741661,377.30082 C 140.24036,423.0831 382.84333,497.76917 385.4286,375.1448 z"
+           id="path3257"
+           sodipodi:nodetypes="ccz" />
+        <path
+           style="fill:#ff6c00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+           d="M 382.08135,375.00509 C 383.96651,268.69569 205.56124,326.47536 154.8293,376.87398 C 203.31374,416.55939 380.19638,481.29945 382.08135,375.00509 z"
+           id="path3259"
+           sodipodi:nodetypes="ccz" />
+        <path
+           style="fill:#e42800;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+           d="M 378.29864,374.84209 C 379.58638,287.58705 257.71919,335.01058 223.06461,376.37601 C 256.18393,408.9484 377.01103,462.08477 378.29864,374.84209 z"
+           id="path3261"
+           sodipodi:nodetypes="ccz" />
+      </g>
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1.99999785;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic"
+       x="352.8208"
+       y="466.72366"
+       id="text3247"
+       transform="matrix(1.0688669,0,-0.2132749,0.9355701,0,0)"
+       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
+       inkscape:export-xdpi="47.029999"
+       inkscape:export-ydpi="47.029999"><tspan
+         sodipodi:role="line"
+         id="tspan3249"
+         x="352.8208"
+         y="466.72366"
+         style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;fill:#ffffff;stroke:#000000;stroke-width:1.99999785;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic">Thrust</tspan></text>
+  </g>
+</svg>
diff --git a/thrust/examples/CMakeLists.txt b/thrust/examples/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b86d8a18b95e962d5e1d3fdbfa21ca09dbedff1d
--- /dev/null
+++ b/thrust/examples/CMakeLists.txt
@@ -0,0 +1,152 @@
+# Setup FileCheck if requested and available:
+option(THRUST_ENABLE_EXAMPLE_FILECHECK
+  "Check example output with the LLVM FileCheck utility."
+  OFF
+)
+set(filecheck_data_path "${Thrust_SOURCE_DIR}/internal/test")
+
+if (THRUST_ENABLE_EXAMPLE_FILECHECK)
+  # TODO this should go into a find module
+  find_program(THRUST_FILECHECK_EXECUTABLE
+    DOC "Path to the LLVM FileCheck utility."
+    NAMES
+      FileCheck
+      FileCheck-3.9
+      FileCheck-4.0
+      FileCheck-5.0
+      FileCheck-6.0
+      FileCheck-7
+      FileCheck-8
+      FileCheck-9
+  )
+
+  if (NOT THRUST_FILECHECK_EXECUTABLE)
+    message(FATAL_ERROR
+      "Could not find the LLVM FileCheck utility. Set THRUST_FILECHECK_EXECUTABLE manually, "
+      "or disable THRUST_ENABLE_EXAMPLE_FILECHECK."
+    )
+  endif()
+
+  execute_process(
+    COMMAND "${THRUST_FILECHECK_EXECUTABLE}" "${filecheck_data_path}/thrust.sanity.filecheck"
+    INPUT_FILE "${Thrust_SOURCE_DIR}/cmake/sanity"
+    RESULT_VARIABLE exit_code
+  )
+
+  if (0 EQUAL exit_code)
+    message(STATUS "FileCheck enabled: ${THRUST_FILECHECK_EXECUTABLE}")
+  else()
+    message(FATAL_ERROR
+      "The current THRUST_FILECHECK_EXECUTABLE ('${THRUST_FILECHECK_EXECUTABLE}') "
+      "does not seem to be a valid FileCheck executable."
+    )
+  endif()
+endif()
+
+# Create meta targets that build all examples for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.examples)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_EXAMPLES_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+## thrust_add_example
+#
+# Add an example executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the example
+#   target. Useful for post-processing target information per-backend.
+# example_name: The name of the example minus "<config_prefix>.example." For
+#   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
+#   would be "cuda.copy".
+# example_src: The source file that implements the example.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_example target_name_var example_name example_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_example_src "${example_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_example_src "${example_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(example_target ${config_prefix}.example.${example_name})
+  set(${target_name_var} ${example_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_meta_target ${config_prefix}.examples)
+  set(example_meta_target thrust.all.example.${example_name})
+
+  add_executable(${example_target} "${real_example_src}")
+  target_link_libraries(${example_target} ${thrust_target})
+  target_include_directories(${example_target} PRIVATE "${Thrust_SOURCE_DIR}/examples")
+  thrust_clone_target_properties(${example_target} ${thrust_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${example_target})
+
+  # Meta target that builds examples with this name for all configurations:
+  if (NOT TARGET ${example_meta_target})
+    add_custom_target(${example_meta_target})
+  endif()
+  add_dependencies(${example_meta_target} ${example_target})
+
+  if ("CUDA" STREQUAL "${config_device}" AND
+      THRUST_ENABLE_EXAMPLES_WITH_RDC)
+    thrust_enable_rdc_for_cuda_target(${example_target})
+  endif()
+
+  # Get the name of FileCheck input by stripping out the config name.
+  # (e.g. "thrust.cpp.cuda.cpp14.example.xxx" -> "thrust.example.xxx.filecheck")
+  string(REPLACE "${config_prefix}" "thrust"
+    filecheck_reference_file
+    "${example_target}.filecheck"
+  )
+
+  add_test(NAME ${example_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DEXAMPLE_EXECUTABLE=$<TARGET_FILE:${example_target}>"
+    "-DFILECHECK_ENABLED=${THRUST_ENABLE_EXAMPLE_FILECHECK}"
+    "-DFILECHECK_EXECUTABLE=${THRUST_FILECHECK_EXECUTABLE}"
+    "-DREFERENCE_FILE=${filecheck_data_path}/${filecheck_reference_file}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunExample.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+endfunction()
+
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
+
+add_subdirectory(cmake)
+add_subdirectory(cuda)
diff --git a/thrust/examples/README b/thrust/examples/README
new file mode 100644
index 0000000000000000000000000000000000000000..4188534fe33c9ec833798bc1dae87312076acb72
--- /dev/null
+++ b/thrust/examples/README
@@ -0,0 +1,11 @@
+Once Thrust has been installed, these example programs can be compiled
+directly with nvcc.  For example, the following command will compile the
+norm example.
+  $ nvcc norm.cu -o norm
+
+These examples are also available online:
+  https://github.com/thrust/thrust/tree/master/examples
+
+For additional information refer to the Quick Start Guide:
+  https://github.com/thrust/thrust/wiki/Quick-Start-Guide
+
diff --git a/thrust/examples/arbitrary_transformation.cu b/thrust/examples/arbitrary_transformation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be22c2e5a1423fa88888e89bd47c073398f7d27c
--- /dev/null
+++ b/thrust/examples/arbitrary_transformation.cu
@@ -0,0 +1,105 @@
+#include <thrust/for_each.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <iostream>
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#include <thrust/zip_function.h>
+#endif // >= C++11
+
+// This example shows how to implement an arbitrary transformation of
+// the form output[i] = F(first[i], second[i], third[i], ... ).
+// In this example, we use a function with 3 inputs and 1 output.
+//
+// Iterators for all four vectors (3 inputs + 1 output) are "zipped"
+// into a single sequence of tuples with the zip_iterator.
+//  
+// The arbitrary_functor receives a tuple that contains four elements,
+// which are references to values in each of the four sequences. When we
+// access the tuple 't' with the get() function,
+//      get<0>(t) returns a reference to A[i],
+//      get<1>(t) returns a reference to B[i],
+//      get<2>(t) returns a reference to C[i],
+//      get<3>(t) returns a reference to D[i].
+//
+// In this example, we can implement the transformation,
+//      D[i] = A[i] + B[i] * C[i];
+// by invoking arbitrary_functor() on each of the tuples using for_each.
+//
+// If we are using a functor that is not designed for zip iterators by taking a
+// tuple instead of individual arguments we can adapt this function using the
+// zip_function adaptor (C++11 only).
+//
+// Note that we could extend this example to implement functions with an
+// arbitrary number of input arguments by zipping more sequence together.
+// With the same approach we can have multiple *output* sequences, if we 
+// wanted to implement something like
+//      D[i] = A[i] + B[i] * C[i];
+//      E[i] = A[i] + B[i] + C[i];
+//
+// The possibilities are endless! :)
+
+struct arbitrary_functor1
+{
+    template <typename Tuple>
+    __host__ __device__
+    void operator()(Tuple t)
+    {
+        // D[i] = A[i] + B[i] * C[i];
+        thrust::get<3>(t) = thrust::get<0>(t) + thrust::get<1>(t) * thrust::get<2>(t);
+    }
+};
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+struct arbitrary_functor2
+{
+    __host__ __device__
+    void operator()(const float& a, const float& b, const float& c, float& d)
+    {
+        // D[i] = A[i] + B[i] * C[i];
+        d = a + b * c;
+    }
+};
+#endif // >= C++11
+
+int main(void)
+{
+    // allocate storage
+    thrust::device_vector<float> A(5);
+    thrust::device_vector<float> B(5);
+    thrust::device_vector<float> C(5);
+    thrust::device_vector<float> D1(5);
+
+    // initialize input vectors
+    A[0] = 3;  B[0] = 6;  C[0] = 2; 
+    A[1] = 4;  B[1] = 7;  C[1] = 5; 
+    A[2] = 0;  B[2] = 2;  C[2] = 7; 
+    A[3] = 8;  B[3] = 1;  C[3] = 4; 
+    A[4] = 2;  B[4] = 8;  C[4] = 3; 
+
+    // apply the transformation
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D1.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D1.end())),
+                     arbitrary_functor1());
+
+    // print the output
+    std::cout << "Tuple functor" << std::endl;
+    for(int i = 0; i < 5; i++)
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i] << std::endl;
+
+    // apply the transformation using zip_function
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+    thrust::device_vector<float> D2(5);
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D2.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D2.end())),
+                     thrust::make_zip_function(arbitrary_functor2()));
+
+    // print the output
+    std::cout << "N-ary functor" << std::endl;
+    for(int i = 0; i < 5; i++)
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i] << std::endl;
+#endif // >= C++11
+}
+
diff --git a/thrust/examples/basic_vector.cu b/thrust/examples/basic_vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..499153770d75f86fe4ad78aa0d5dc4d36d1f1aac
--- /dev/null
+++ b/thrust/examples/basic_vector.cu
@@ -0,0 +1,42 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#include <iostream>
+
+int main(void)
+{
+    // H has storage for 4 integers
+    thrust::host_vector<int> H(4);
+
+    // initialize individual elements
+    H[0] = 14;
+    H[1] = 20;
+    H[2] = 38;
+    H[3] = 46;
+    
+    // H.size() returns the size of vector H
+    std::cout << "H has size " << H.size() << std::endl;
+
+    // print contents of H
+    for(size_t i = 0; i < H.size(); i++)
+        std::cout << "H[" << i << "] = " << H[i] << std::endl;
+
+    // resize H
+    H.resize(2);
+    
+    std::cout << "H now has size " << H.size() << std::endl;
+
+    // Copy host_vector H to device_vector D
+    thrust::device_vector<int> D = H;
+    
+    // elements of D can be modified
+    D[0] = 99;
+    D[1] = 88;
+    
+    // print contents of D
+    for(size_t i = 0; i < D.size(); i++)
+        std::cout << "D[" << i << "] = " << D[i] << std::endl;
+
+    // H and D are automatically deleted when the function returns
+    return 0;
+}
diff --git a/thrust/examples/bounding_box.cu b/thrust/examples/bounding_box.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cca71a45e416c8c7e0037a0096bee019d18cbb50
--- /dev/null
+++ b/thrust/examples/bounding_box.cu
@@ -0,0 +1,100 @@
+#include <thrust/transform_reduce.h>
+#include <thrust/device_vector.h>
+#include <thrust/pair.h>
+#include <thrust/random.h>
+#include <thrust/extrema.h>
+
+// This example shows how to compute a bounding box
+// for a set of points in two dimensions.
+
+struct point2d
+{
+  float x, y;
+  
+  __host__ __device__
+  point2d() : x(0), y(0) {}
+  
+  __host__ __device__
+  point2d(float _x, float _y) : x(_x), y(_y) {}
+};
+
+// bounding box type
+struct bbox
+{
+  // construct an empty box
+  __host__ __device__
+  bbox() {}
+
+  // construct a box from a single point
+  __host__ __device__
+  bbox(const point2d &point)
+    : lower_left(point), upper_right(point)
+  {}
+
+  // construct a box from a single point
+  __host__ __device__
+  bbox& operator=(const point2d &point)
+  {
+    lower_left = point;
+    upper_right = point;
+    return *this;
+  }
+
+  // construct a box from a pair of points
+  __host__ __device__
+  bbox(const point2d &ll, const point2d &ur)
+    : lower_left(ll), upper_right(ur)
+  {}
+
+  point2d lower_left, upper_right;
+};
+
+// reduce a pair of bounding boxes (a,b) to a bounding box containing a and b
+struct bbox_reduction : public thrust::binary_function<bbox,bbox,bbox>
+{
+  __host__ __device__
+  bbox operator()(bbox a, bbox b)
+  {
+    // lower left corner
+    point2d ll(thrust::min(a.lower_left.x, b.lower_left.x), thrust::min(a.lower_left.y, b.lower_left.y));
+    
+    // upper right corner
+    point2d ur(thrust::max(a.upper_right.x, b.upper_right.x), thrust::max(a.upper_right.y, b.upper_right.y));
+    
+    return bbox(ll, ur);
+  }
+};
+
+int main(void)
+{
+  const size_t N = 40;
+  thrust::default_random_engine rng;
+  thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
+  
+  // allocate storage for points
+  thrust::device_vector<point2d> points(N);
+  
+  // generate some random points in the unit square
+  for(size_t i = 0; i < N; i++)
+  {
+      float x = u01(rng);
+      float y = u01(rng);
+      points[i] = point2d(x,y);
+  }
+  
+  // initial bounding box contains first point
+  bbox init = bbox(points[0], points[0]);
+  
+  // binary reduction operation
+  bbox_reduction binary_op;
+  
+  // compute the bounding box for the point set
+  bbox result = thrust::reduce(points.begin(), points.end(), init, binary_op);
+  
+  // print output
+  std::cout << "bounding box " << std::fixed;
+  std::cout << "(" << result.lower_left.x  << "," << result.lower_left.y  << ") ";
+  std::cout << "(" << result.upper_right.x << "," << result.upper_right.y << ")" << std::endl;
+  
+  return 0;
+}
diff --git a/thrust/examples/bucket_sort2d.cu b/thrust/examples/bucket_sort2d.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9e3bb27209bf01f55a6b1bc333395e116b4d7f0a
--- /dev/null
+++ b/thrust/examples/bucket_sort2d.cu
@@ -0,0 +1,113 @@
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <thrust/binary_search.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+
+#include <iostream>
+#include <iomanip>
+
+// define a 2d float vector
+typedef thrust::tuple<float,float> vec2;
+
+// return a random vec2 in [0,1)^2
+vec2 make_random_vec2(void)
+{
+  static thrust::default_random_engine rng;
+  static thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
+  float x = u01(rng);
+  float y = u01(rng);
+  return vec2(x,y);
+}
+
+// hash a point in the unit square to the index of
+// the grid bucket that contains it
+struct point_to_bucket_index : public thrust::unary_function<vec2,unsigned int>
+{
+  unsigned int width;  // buckets in the x dimension (grid spacing = 1/width)
+  unsigned int height; // buckets in the y dimension (grid spacing = 1/height)
+
+  __host__ __device__
+  point_to_bucket_index(unsigned int width, unsigned int height)
+    : width(width), height(height) {}
+
+  __host__ __device__
+  unsigned int operator()(const vec2& v) const
+  {
+    // find the raster indices of p's bucket
+    unsigned int x = static_cast<unsigned int>(thrust::get<0>(v) * width);
+    unsigned int y = static_cast<unsigned int>(thrust::get<1>(v) * height);
+
+    // return the bucket's linear index
+    return y * width + x;
+  }
+
+};
+
+int main(void)
+{
+  const size_t N = 1000000;
+
+  // allocate some random points in the unit square on the host
+  thrust::host_vector<vec2> h_points(N);
+  thrust::generate(h_points.begin(), h_points.end(), make_random_vec2);
+
+  // transfer to device
+  thrust::device_vector<vec2> points = h_points;
+
+  // allocate storage for a 2D grid
+  // of dimensions w x h
+  unsigned int w = 200, h = 100;
+
+  // the grid data structure keeps a range per grid bucket:
+  // each bucket_begin[i] indexes the first element of bucket i's list of points
+  // each bucket_end[i] indexes one past the last element of bucket i's list of points
+  thrust::device_vector<unsigned int> bucket_begin(w*h);
+  thrust::device_vector<unsigned int> bucket_end(w*h);
+
+  // allocate storage for each point's bucket index
+  thrust::device_vector<unsigned int> bucket_indices(N);
+
+  // transform the points to their bucket indices
+  thrust::transform(points.begin(),
+                    points.end(),
+                    bucket_indices.begin(),
+                    point_to_bucket_index(w,h));
+
+  // sort the points by their bucket index
+  thrust::sort_by_key(bucket_indices.begin(),
+                      bucket_indices.end(),
+                      points.begin());
+
+  // find the beginning of each bucket's list of points
+  thrust::counting_iterator<unsigned int> search_begin(0);
+  thrust::lower_bound(bucket_indices.begin(),
+                      bucket_indices.end(),
+                      search_begin,
+                      search_begin + w*h,
+                      bucket_begin.begin());
+
+  // find the end of each bucket's list of points
+  thrust::upper_bound(bucket_indices.begin(),
+                      bucket_indices.end(),
+                      search_begin,
+                      search_begin + w*h,
+                      bucket_end.begin());
+
+  // write out bucket (150, 50)'s list of points
+  unsigned int bucket_idx = 50 * w + 150;
+  std::cout << "bucket (150, 50)'s list of points:" << std::endl;
+  std::cout << std::fixed << std::setprecision(6);
+  for(unsigned int point_idx = bucket_begin[bucket_idx];
+      point_idx != bucket_end[bucket_idx];
+      ++point_idx)
+  {
+    vec2 p = points[point_idx];
+    std::cout << "(" << thrust::get<0>(p) << "," << thrust::get<1>(p) << ")" << std::endl;
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/cmake/CMakeLists.txt b/thrust/examples/cmake/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a193994f460f1a976267443b7e1f79525e0dcabb
--- /dev/null
+++ b/thrust/examples/cmake/CMakeLists.txt
@@ -0,0 +1,16 @@
+thrust_update_system_found_flags()
+
+if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND)
+  # Do a basic check of the cmake/ThrustAddSubdir.cmake mechanism:
+  add_test(
+    NAME thrust.example.cmake.add_subdir
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir"
+      -D "THRUST_DIR=${Thrust_SOURCE_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+  )
+endif()
diff --git a/thrust/examples/cmake/add_subdir/CMakeLists.txt b/thrust/examples/cmake/add_subdir/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b66143fddc8e944f788f4c249c78c5d0f37c68d2
--- /dev/null
+++ b/thrust/examples/cmake/add_subdir/CMakeLists.txt
@@ -0,0 +1,91 @@
+# This example demonstrates / tests adding thrust via a CMake add_subdirectory
+# call from a parent project.
+#
+# The variables THRUST_REQUIRED_SYSTEMS and THRUST_OPTIONAL_SYSTEMS must be
+# set prior to add_subdirectory(thrust), and afterwards the thrust_create_target
+# function may be used to create targets with the desired systems. See
+# thrust/thrust/cmake/README.md for more details on thrust_create_target.
+
+cmake_minimum_required(VERSION 3.15)
+
+# Silence warnings about empty CUDA_ARCHITECTURES properties on example targets:
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(ThrustAddSubDirExample CXX)
+
+# Add required Thrust systems to THRUST_REQUIRED_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# An error is emitted if the system is not found.
+set(THRUST_REQUIRED_SYSTEMS CPP)
+
+# Add optional Thrust systems to THRUST_OPTIONAL_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# No error is emitted if not found.
+set(THRUST_OPTIONAL_SYSTEMS CUDA)
+
+# Use your project's checkout of Thrust here, for most cases
+# `add_subdirectory(thrust)` will be sufficient.
+add_subdirectory("${THRUST_DIR}" thrust)
+
+# Create a thrust target that only uses the serial CPP backend.
+# See thrust/thrust/cmake/README.md for details and additional options:
+thrust_create_target(ThrustCPP HOST CPP DEVICE CPP)
+
+# Create an executable that uses the CPP-only thrust target:
+add_executable(ExecWithCPP dummy.cpp)
+target_link_libraries(ExecWithCPP ThrustCPP)
+
+# To test for optional systems, first call thrust_update_system_found_flags to
+# set the THRUST_${system}_FOUND flags in current scope.
+# Required due to CMake scoping rules.
+thrust_update_system_found_flags()
+
+# Create and use a Thrust target configured to use CUDA acceleration if CUDA
+# is available:
+if (THRUST_CUDA_FOUND)
+  enable_language(CUDA)
+  thrust_create_target(ThrustCUDA HOST CPP DEVICE CUDA)
+  add_executable(ExecWithCUDA dummy.cu)
+  target_link_libraries(ExecWithCUDA ThrustCUDA)
+endif()
+
+#
+# Validation
+#
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+assert_boolean(THRUST_CPP_FOUND TRUE)
+assert_boolean(THRUST_CUDA_FOUND TRUE)
+assert_boolean(THRUST_OMP_FOUND FALSE)
+assert_boolean(THRUST_TBB_FOUND FALSE)
+
+assert_target(ThrustCPP)
+assert_target(ThrustCUDA)
+assert_target(ExecWithCPP)
+assert_target(ExecWithCUDA)
+
+thrust_debug_target(ThrustCPP "")
+thrust_debug_target(ThrustCUDA "")
+thrust_debug_target(ExecWithCPP "")
+thrust_debug_target(ExecWithCUDA "")
diff --git a/thrust/examples/cmake/add_subdir/dummy.cpp b/thrust/examples/cmake/add_subdir/dummy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad7b9435fff3274ee11551d0191c70cf581fcb86
--- /dev/null
+++ b/thrust/examples/cmake/add_subdir/dummy.cpp
@@ -0,0 +1,32 @@
+#include <thrust/detail/config.h>
+
+#include <iostream>
+
+int main()
+{
+  std::cout << "Hello from Thrust version " << THRUST_VERSION << ":\n"
+
+            << "Host system: "
+#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
+            << "CPP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP
+            << "OMP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB
+            << "TBB\n"
+#else
+            << "Unknown\n"
+#endif
+
+            << "Device system: "
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
+            << "CPP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+            << "CUDA\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
+            << "OMP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
+            << "TBB\n";
+#else
+            << "Unknown\n";
+#endif
+}
diff --git a/thrust/examples/cmake/add_subdir/dummy.cu b/thrust/examples/cmake/add_subdir/dummy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b5645fc3d3b1b83b3498428c48d253862dc4340a
--- /dev/null
+++ b/thrust/examples/cmake/add_subdir/dummy.cu
@@ -0,0 +1 @@
+#include "dummy.cpp"
diff --git a/thrust/examples/constant_iterator.cu b/thrust/examples/constant_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7e579f93d6602b9af0aa9055975e160a83becabe
--- /dev/null
+++ b/thrust/examples/constant_iterator.cu
@@ -0,0 +1,29 @@
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h> 
+#include <iterator>
+#include <iostream>
+
+int main(void)
+{
+    thrust::device_vector<int> data(4);
+    data[0] = 3;
+    data[1] = 7;
+    data[2] = 2;
+    data[3] = 5;
+
+    // add 10 to all values in data
+    thrust::transform(data.begin(), data.end(),
+                      thrust::constant_iterator<int>(10),
+                      data.begin(),
+                      thrust::plus<int>());
+
+    // data is now [13, 17, 12, 15]
+
+    // print result
+    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, "\n"));
+
+    return 0;
+}
diff --git a/thrust/examples/counting_iterator.cu b/thrust/examples/counting_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e090e9e5e839c3273c4d1d74b862c4ad76f819bd
--- /dev/null
+++ b/thrust/examples/counting_iterator.cu
@@ -0,0 +1,44 @@
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/copy.h>
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <iterator>
+#include <iostream>
+
+int main(void)
+{
+    // this example computes indices for all the nonzero values in a sequence
+
+    // sequence of zero and nonzero values
+    thrust::device_vector<int> stencil(8);
+    stencil[0] = 0;
+    stencil[1] = 1;
+    stencil[2] = 1;
+    stencil[3] = 0;
+    stencil[4] = 0;
+    stencil[5] = 1;
+    stencil[6] = 0;
+    stencil[7] = 1;
+
+    // storage for the nonzero indices
+    thrust::device_vector<int> indices(8);
+    
+    // counting iterators define a sequence [0, 8)
+    thrust::counting_iterator<int> first(0);
+    thrust::counting_iterator<int> last = first + 8;
+
+    // compute indices of nonzero elements 
+    typedef thrust::device_vector<int>::iterator IndexIterator;
+
+    IndexIterator indices_end = thrust::copy_if(first, last,
+                                                stencil.begin(),
+                                                indices.begin(),
+                                                thrust::identity<int>());
+    // indices now contains [1,2,5,7]
+
+    // print result
+    std::cout << "found " << (indices_end - indices.begin()) << " nonzero values at indices:\n";
+    thrust::copy(indices.begin(), indices_end, std::ostream_iterator<int>(std::cout, "\n"));
+
+    return 0;
+}
diff --git a/thrust/examples/cpp_integration/README b/thrust/examples/cpp_integration/README
new file mode 100644
index 0000000000000000000000000000000000000000..c9dc09e4df64976dfc621fee9dfd8f65a6be35b0
--- /dev/null
+++ b/thrust/examples/cpp_integration/README
@@ -0,0 +1,21 @@
+This example shows how to link a Thrust program contained in 
+a .cu file with a C++ program contained in a .cpp file.  Note
+that device_vector only appears in the .cu file while host_vector
+appears in both.  This relects the fact that algorithms on device
+vectors are only available when the contents of the program are
+located in a .cu file and compiled with the nvcc compiler.
+
+On a Linux system where Thrust is installed in the default location
+we can use the following procedure to compile the two parts of the
+program and link them together.
+
+  $ nvcc -O2 -c device.cu
+  $ g++  -O2 -c host.cpp   -I/usr/local/cuda/include/
+  $ nvcc -o tester device.o host.o
+
+Alternatively, we can use g++ to perform final linking step.
+
+  $ nvcc -O2 -c device.cu
+  $ g++  -O2 -c host.cpp   -I/usr/local/cuda/include/
+  $ g++ -o tester device.o host.o -L/usr/local/cuda/lib64 -lcudart
+
diff --git a/thrust/examples/cpp_integration/device.cu b/thrust/examples/cpp_integration/device.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fb42542b2e1b9578b1476b979ca7d6df3689aec9
--- /dev/null
+++ b/thrust/examples/cpp_integration/device.cu
@@ -0,0 +1,18 @@
+#include <thrust/sort.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+
+#include "device.h"
+
+void sort_on_device(thrust::host_vector<int>& h_vec)
+{
+    // transfer data to the device
+    thrust::device_vector<int> d_vec = h_vec;
+
+    // sort data on the device
+    thrust::sort(d_vec.begin(), d_vec.end());
+    
+    // transfer data back to host
+    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
+}
+
diff --git a/thrust/examples/cpp_integration/device.h b/thrust/examples/cpp_integration/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..e398edf3361d35225e1ba829baab9c61ada55367
--- /dev/null
+++ b/thrust/examples/cpp_integration/device.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <thrust/host_vector.h>
+
+// function prototype
+void sort_on_device(thrust::host_vector<int>& V);
+
diff --git a/thrust/examples/cpp_integration/host.cpp b/thrust/examples/cpp_integration/host.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..009f3fa87dd6e318c97a4749a392a57f01814bd6
--- /dev/null
+++ b/thrust/examples/cpp_integration/host.cpp
@@ -0,0 +1,27 @@
+#include <thrust/host_vector.h>
+#include <thrust/random.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <cstdlib>
+#include <iostream>
+#include <iterator>
+
+// defines the function prototype
+#include "device.h"
+
+int main(void)
+{
+    // generate 20 random numbers on the host
+    thrust::host_vector<int> h_vec(20);
+    thrust::default_random_engine rng;
+    thrust::generate(h_vec.begin(), h_vec.end(), rng);
+
+    // interface to CUDA code
+    sort_on_device(h_vec);
+
+    // print sorted array
+    thrust::copy(h_vec.begin(), h_vec.end(), std::ostream_iterator<int>(std::cout, "\n"));
+
+    return 0;
+}
+
diff --git a/thrust/examples/cuda/CMakeLists.txt b/thrust/examples/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bd72c58c010125466607327c820d9d22d1978ee2
--- /dev/null
+++ b/thrust/examples/cuda/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    string(PREPEND example_name "cuda.")
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/thrust/examples/cuda/async_reduce.cu b/thrust/examples/cuda/async_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..845fe882dec5bda88bbd267ff3b92dcfbf23b360
--- /dev/null
+++ b/thrust/examples/cuda/async_reduce.cu
@@ -0,0 +1,78 @@
+#include <thrust/detail/config.h>
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/system/cuda/execution_policy.h>
+#include <cassert>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <future>
+#endif
+
+// This example demonstrates two ways to achieve algorithm invocations that are asynchronous with
+// the calling thread.
+//
+// The first method wraps a call to thrust::reduce inside a __global__ function. Since __global__ function
+// launches are asynchronous with the launching thread, this achieves asynchrony. The result of the reduction
+// is stored to a pointer to CUDA global memory. The calling thread waits for the result of the reduction to 
+// be ready by synchronizing with the CUDA stream on which the __global__ function is launched.
+//
+// The second method uses the C++11 library function, std::async, to create concurrency. The lambda function
+// given to std::async returns the result of thrust::reduce to a std::future. The calling thread can use the
+// std::future to wait for the result of the reduction. This method requires a compiler which supports
+// C++11-capable language and library constructs.
+
+template<typename Iterator, typename T, typename BinaryOperation, typename Pointer>
+__global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result)
+{
+  *result = thrust::reduce(thrust::cuda::par, first, last, init, binary_op);
+}
+
+int main()
+{
+  size_t n = 1 << 20;
+  thrust::device_vector<unsigned int> data(n, 1);
+  thrust::device_vector<unsigned int> result(1, 0);
+
+  // method 1: call thrust::reduce from an asynchronous CUDA kernel launch
+
+  // create a CUDA stream 
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  // launch a CUDA kernel with only 1 thread on our stream
+  reduce_kernel<<<1,1,0,s>>>(data.begin(), data.end(), 0, thrust::plus<int>(), result.data());
+
+  // wait for the stream to finish
+  cudaStreamSynchronize(s);
+
+  // our result should be ready
+  assert(result[0] == n);
+
+  cudaStreamDestroy(s);
+
+  // reset the result
+  result[0] = 0;
+
+#if THRUST_CPP_DIALECT >= 2011
+  // method 2: use std::async to create asynchrony
+
+  // copy all the algorithm parameters
+  auto begin        = data.begin();
+  auto end          = data.end();
+  unsigned int init = 0;
+  auto binary_op    = thrust::plus<unsigned int>();
+
+  // std::async captures the algorithm parameters by value
+  // use std::launch::async to ensure the creation of a new thread
+  std::future<unsigned int> future_result = std::async(std::launch::async, [=]
+  {
+    return thrust::reduce(begin, end, init, binary_op);
+  });
+
+  // wait on the result and check that it is correct
+  assert(future_result.get() == n);
+#endif
+
+  return 0;
+}
+
diff --git a/thrust/examples/cuda/custom_temporary_allocation.cu b/thrust/examples/cuda/custom_temporary_allocation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7bba0fa9e997f50a5296ebab4d2dd016def2d4fc
--- /dev/null
+++ b/thrust/examples/cuda/custom_temporary_allocation.cu
@@ -0,0 +1,184 @@
+#include <thrust/system/cuda/vector.h>
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <thrust/pair.h>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <map>
+#include <cassert>
+
+// This example demonstrates how to control how Thrust allocates temporary
+// storage during algorithms such as thrust::sort. The idea will be to create a
+// simple cache of allocations to search when temporary storage is requested.
+// If a hit is found in the cache, we quickly return the cached allocation
+// instead of resorting to the more expensive thrust::cuda::malloc.
+
+// Note: Thrust now has its own caching allocator layer; if you just need a
+// caching allocator, you ought to use that. This example is still useful
+// as a demonstration of how to use a Thrust custom allocator.
+
+// Note: this implementation cached_allocator is not thread-safe. If multiple
+// (host) threads use the same cached_allocator then they should gain exclusive
+// access to the allocator before accessing its methods.
+
+struct not_my_pointer
+{
+  not_my_pointer(void* p)
+    : message()
+  {
+    std::stringstream s;
+    s << "Pointer `" << p << "` was not allocated by this allocator.";
+    message = s.str();
+  }
+
+  virtual ~not_my_pointer() {}
+
+  virtual const char* what() const
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+// A simple allocator for caching cudaMalloc allocations.
+struct cached_allocator
+{
+  typedef char value_type;
+
+  cached_allocator() {}
+
+  ~cached_allocator()
+  {
+    free_all();
+  }
+
+  char *allocate(std::ptrdiff_t num_bytes)
+  {
+    std::cout << "cached_allocator::allocate(): num_bytes == "
+              << num_bytes
+              << std::endl;
+
+    char *result = 0;
+
+    // Search the cache for a free block.
+    free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
+
+    if (free_block != free_blocks.end())
+    {
+      std::cout << "cached_allocator::allocate(): found a free block"
+                << std::endl;
+
+      result = free_block->second;
+
+      // Erase from the `free_blocks` map.
+      free_blocks.erase(free_block);
+    }
+    else
+    {
+      // No allocation of the right size exists, so create a new one with
+      // `thrust::cuda::malloc`.
+      try
+      {
+        std::cout << "cached_allocator::allocate(): allocating new block"
+                  << std::endl;
+
+        // Allocate memory and convert the resulting `thrust::cuda::pointer` to
+        // a raw pointer.
+        result = thrust::cuda::malloc<char>(num_bytes).get();
+      }
+      catch (std::runtime_error&)
+      {
+        throw;
+      }
+    }
+
+    // Insert the allocated pointer into the `allocated_blocks` map.
+    allocated_blocks.insert(std::make_pair(result, num_bytes));
+
+    return result;
+  }
+
+  void deallocate(char *ptr, size_t)
+  {
+    std::cout << "cached_allocator::deallocate(): ptr == "
+              << reinterpret_cast<void*>(ptr) << std::endl;
+
+    // Erase the allocated block from the allocated blocks map.
+    allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
+
+    if (iter == allocated_blocks.end())
+      throw not_my_pointer(reinterpret_cast<void*>(ptr));
+
+    std::ptrdiff_t num_bytes = iter->second;
+    allocated_blocks.erase(iter);
+
+    // Insert the block into the free blocks map.
+    free_blocks.insert(std::make_pair(num_bytes, ptr));
+  }
+
+private:
+  typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
+  typedef std::map<char*, std::ptrdiff_t>      allocated_blocks_type;
+
+  free_blocks_type      free_blocks;
+  allocated_blocks_type allocated_blocks;
+
+  void free_all()
+  {
+    std::cout << "cached_allocator::free_all()" << std::endl;
+
+    // Deallocate all outstanding blocks in both lists.
+    for ( free_blocks_type::iterator i = free_blocks.begin()
+        ; i != free_blocks.end()
+        ; ++i)
+    {
+      // Transform the pointer to cuda::pointer before calling cuda::free.
+      thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
+    }
+
+    for( allocated_blocks_type::iterator i = allocated_blocks.begin()
+       ; i != allocated_blocks.end()
+       ; ++i)
+    {
+      // Transform the pointer to cuda::pointer before calling cuda::free.
+      thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
+    }
+  }
+};
+
+int main()
+{
+  std::size_t num_elements = 32768;
+
+  thrust::host_vector<int> h_input(num_elements);
+
+  // Generate random input.
+  thrust::generate(h_input.begin(), h_input.end(), rand);
+
+  thrust::cuda::vector<int> d_input = h_input;
+  thrust::cuda::vector<int> d_result(num_elements);
+
+  std::size_t num_trials = 5;
+
+  cached_allocator alloc;
+
+  for (std::size_t i = 0; i < num_trials; ++i)
+  {
+    d_result = d_input;
+
+    // Pass alloc through cuda::par as the first parameter to sort
+    // to cause allocations to be handled by alloc during sort.
+    thrust::sort(thrust::cuda::par(alloc), d_result.begin(), d_result.end());
+
+    // Ensure the result is sorted.
+    assert(thrust::is_sorted(d_result.begin(), d_result.end()));
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/cuda/global_device_vector.cu b/thrust/examples/cuda/global_device_vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a995667962e31400bd2aff2215876cf51dd18519
--- /dev/null
+++ b/thrust/examples/cuda/global_device_vector.cu
@@ -0,0 +1,46 @@
+#include <thrust/detail/config.h>
+#include <thrust/device_vector.h>
+
+// If you create a global `thrust::device_vector` with the default allocator,
+// you'll get an error during program termination when the memory of the vector
+// is freed, as the CUDA runtime cannot be used during program termination.
+//
+// To get around this, you can create your own allocator which ignores
+// deallocation failures that occur because the CUDA runtime is shut down.
+
+extern "C" cudaError_t cudaFreeIgnoreShutdown(void* ptr) {
+  cudaError_t const err = cudaFree(ptr);
+  if (cudaSuccess == err || cudaErrorCudartUnloading == err)
+    return cudaSuccess;
+  return err; 
+}
+
+typedef thrust::system::cuda::detail::cuda_memory_resource<
+  cudaMalloc, 
+  cudaFreeIgnoreShutdown,
+  thrust::cuda::pointer<void>
+> device_ignore_shutdown_memory_resource;
+
+#if THRUST_CPP_DIALECT >= 2011
+  template <typename T>
+  using device_ignore_shutdown_allocator = 
+    thrust::mr::stateless_resource_allocator<
+      T,
+      thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
+    >;
+    
+  thrust::device_vector<double, device_ignore_shutdown_allocator<double>> d;
+#else
+  thrust::device_vector<
+    double, 
+    thrust::mr::stateless_resource_allocator<
+      double,
+      thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
+    > 
+  > d;
+#endif
+
+int main() {
+  d.resize(25);
+}
+
diff --git a/thrust/examples/cuda/range_view.cu b/thrust/examples/cuda/range_view.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e863a6199a80dd419623b34f473c5ef9a312ed63
--- /dev/null
+++ b/thrust/examples/cuda/range_view.cu
@@ -0,0 +1,237 @@
+#include <thrust/device_vector.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/execution_policy.h>
+#include <iostream>
+
+
+// This example demonstrates the use of a view: a non-owning wrapper for an
+// iterator range which presents a container-like interface to the user.
+//
+// For example, a view of a device_vector's data can be helpful when we wish to
+// access that data from a device function. Even though device_vectors are not
+// accessible from device functions, the range_view class allows us to access
+// and manipulate its data as if we were manipulating a real container.
+
+template<class Iterator>
+class range_view
+{
+public:
+  typedef Iterator iterator;
+  typedef typename thrust::iterator_traits<iterator>::value_type value_type;
+  typedef typename thrust::iterator_traits<iterator>::pointer pointer;
+  typedef typename thrust::iterator_traits<iterator>::difference_type difference_type;
+  typedef typename thrust::iterator_traits<iterator>::reference reference;
+
+private:
+  const iterator first;
+  const iterator last;
+
+
+public:
+  __host__ __device__
+  range_view(Iterator first, Iterator last)
+      : first(first), last(last) {}
+  __host__ __device__
+  ~range_view() {}
+
+  __host__ __device__
+  difference_type size() const { return thrust::distance(first, last); }
+
+
+  __host__ __device__
+  reference operator[](difference_type n)
+  {
+    return *(first + n);
+  }
+  __host__ __device__
+  const reference operator[](difference_type n) const
+  {
+    return *(first + n);
+  }
+
+  __host__ __device__
+  iterator begin() 
+  {
+    return first;
+  }
+  __host__ __device__
+  const iterator cbegin() const
+  {
+    return first;
+  }
+  __host__ __device__
+  iterator end() 
+  {
+    return last;
+  }
+  __host__ __device__
+  const iterator cend() const
+  {
+    return last;
+  }
+
+
+  __host__ __device__
+  thrust::reverse_iterator<iterator> rbegin()
+  {
+    return thrust::reverse_iterator<iterator>(end());
+  }
+  __host__ __device__
+  const thrust::reverse_iterator<const iterator> crbegin() const 
+  {
+    return thrust::reverse_iterator<const iterator>(cend());
+  }
+  __host__ __device__
+  thrust::reverse_iterator<iterator> rend()
+  {
+    return thrust::reverse_iterator<iterator>(begin());
+  }
+  __host__ __device__
+  const thrust::reverse_iterator<const iterator> crend() const 
+  {
+    return thrust::reverse_iterator<const iterator>(cbegin());
+  }
+  __host__ __device__
+  reference front() 
+  {
+    return *begin();
+  }
+  __host__ __device__
+  const reference front()  const
+  {
+    return *cbegin();
+  }
+
+  __host__ __device__
+  reference back() 
+  {
+    return *end();
+  }
+  __host__ __device__
+  const reference back()  const
+  {
+    return *cend();
+  }
+
+  __host__ __device__
+  bool empty() const 
+  {
+    return size() == 0;
+  }
+
+};
+
+// This helper function creates a range_view from iterator and the number of
+// elements
+template <class Iterator, class Size>
+range_view<Iterator>
+__host__ __device__
+make_range_view(Iterator first, Size n)
+{
+  return range_view<Iterator>(first, first+n);
+}
+
+// This helper function creates a range_view from a pair of iterators
+template <class Iterator>
+range_view<Iterator>
+__host__ __device__
+make_range_view(Iterator first, Iterator last)
+{
+  return range_view<Iterator>(first, last);
+}
+
+// This helper function creates a range_view from a Vector
+template <class Vector>
+range_view<typename Vector::iterator>
+__host__
+make_range_view(Vector& v)
+{
+  return range_view<typename Vector::iterator>(v.begin(), v.end());
+}
+
+
+// This saxpy functor stores view of X, Y, Z array, and accesses them in
+// vector-like way
+template<class View1, class View2, class View3>
+struct saxpy_functor : public thrust::unary_function<int,void>
+{
+  const float a;
+  View1 x;
+  View2 y;
+  View3 z;
+
+  __host__ __device__
+  saxpy_functor(float _a, View1 _x, View2 _y, View3 _z)
+      : a(_a), x(_x), y(_y), z(_z)
+  {
+  }
+
+  __host__ __device__ 
+  void operator()(int i) 
+  {
+    z[i] = a * x[i] + y[i];
+  }
+};
+
+// saxpy function, which can either be called form host or device
+// The views are passed by value
+template<class View1, class View2, class View3>
+__host__ __device__
+void saxpy(float A, View1 X, View2 Y, View3 Z)
+{
+  // Z = A * X + Y
+  const int size = X.size();
+  thrust::for_each(thrust::device,
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(size),
+      saxpy_functor<View1,View2,View3>(A,X,Y,Z));
+}
+
+struct f1 : public thrust::unary_function<float,float>
+{
+  __host__ __device__
+  float operator()(float x) const
+  {
+    return x*3;
+  }
+};
+
+int main()
+{
+  using std::cout;
+  using std::endl;
+
+  // initialize host arrays
+  float x[4] = {1.0, 1.0, 1.0, 1.0};
+  float y[4] = {1.0, 2.0, 3.0, 4.0};
+  float z[4] = {0.0};
+
+  thrust::device_vector<float> X(x, x + 4);
+  thrust::device_vector<float> Y(y, y + 4);
+  thrust::device_vector<float> Z(z, z + 4);
+
+  saxpy(
+      2.0, 
+
+      // make a range view of a pair of transform_iterators
+      make_range_view(thrust::make_transform_iterator(X.cbegin(), f1()),
+                      thrust::make_transform_iterator(X.cend(), f1())),
+
+      // range view of normal_iterators
+      make_range_view(Y.begin(), thrust::distance(Y.begin(), Y.end())),
+
+      // range view of naked pointers
+      make_range_view(Z.data().get(), 4));
+
+  // print values from original device_vector<float> Z 
+  // to ensure that range view was mapped to this vector
+  for (int i = 0, n = Z.size(); i < n; ++i)
+  {
+    cout << "z[" << i << "]= " << Z[i] << endl;
+  }
+
+
+  return 0;
+}
+
diff --git a/thrust/examples/cuda/unwrap_pointer.cu b/thrust/examples/cuda/unwrap_pointer.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a3039826237cd64ae919255b7ff2d26c5a2fc098
--- /dev/null
+++ b/thrust/examples/cuda/unwrap_pointer.cu
@@ -0,0 +1,30 @@
+#include <thrust/device_ptr.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <thrust/device_vector.h>
+#include <cuda.h>
+
+int main(void)
+{
+    size_t N = 10;
+
+    // create a device_ptr 
+    thrust::device_ptr<int> dev_ptr = thrust::device_malloc<int>(N);
+     
+    // extract raw pointer from device_ptr
+    int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);
+
+    // use raw_ptr in CUDA API functions
+    cudaMemset(raw_ptr, 0, N * sizeof(int));
+
+    // free memory
+    thrust::device_free(dev_ptr);
+    
+    // we can use the same approach for device_vector
+    thrust::device_vector<int> d_vec(N);
+
+    // note: d_vec.data() returns a device_ptr
+    raw_ptr = thrust::raw_pointer_cast(d_vec.data());
+
+    return 0;
+}
diff --git a/thrust/examples/cuda/wrap_pointer.cu b/thrust/examples/cuda/wrap_pointer.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd665fcb55b83f25d7762830ea211f00a900b558
--- /dev/null
+++ b/thrust/examples/cuda/wrap_pointer.cu
@@ -0,0 +1,26 @@
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+#include <cuda.h>
+
+int main(void)
+{
+    size_t N = 10;
+
+    // obtain raw pointer to device memory
+    int * raw_ptr;
+    cudaMalloc((void **) &raw_ptr, N * sizeof(int));
+
+    // wrap raw pointer with a device_ptr 
+    thrust::device_ptr<int> dev_ptr = thrust::device_pointer_cast(raw_ptr);
+
+    // use device_ptr in Thrust algorithms
+    thrust::fill(dev_ptr, dev_ptr + N, (int) 0);
+
+    // access device memory transparently through device_ptr
+    dev_ptr[0] = 1;
+
+    // free memory
+    cudaFree(raw_ptr);
+
+    return 0;
+}
diff --git a/thrust/examples/device_ptr.cu b/thrust/examples/device_ptr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0074a025024a9bee8e706a12e15517a168d424db
--- /dev/null
+++ b/thrust/examples/device_ptr.cu
@@ -0,0 +1,47 @@
+#include <thrust/device_ptr.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+#include <thrust/sequence.h>
+#include <thrust/reduce.h>
+
+#include <cassert>
+#include <iostream>
+
+int main(void)
+{
+  // allocate memory buffer to store 10 integers on the device
+  thrust::device_ptr<int> d_ptr = thrust::device_malloc<int>(10);
+
+  // device_ptr supports pointer arithmetic 
+  thrust::device_ptr<int> first = d_ptr;
+  thrust::device_ptr<int> last  = d_ptr + 10;
+  std::cout << "device array contains " << (last - first) << " values\n";
+  
+  // algorithms work as expected
+  thrust::sequence(first, last);
+  std::cout << "sum of values is " << thrust::reduce(first, last) << "\n";
+  
+  // device memory can be read and written transparently
+  d_ptr[0] = 10;
+  d_ptr[1] = 11;
+  d_ptr[2] = d_ptr[0] + d_ptr[1];
+
+  // device_ptr can be converted to a "raw" pointer for use in other APIs and kernels, etc.
+  int * raw_ptr = thrust::raw_pointer_cast(d_ptr);
+
+  // note: raw_ptr cannot necessarily be accessed by the host!
+
+  // conversely, raw pointers can be wrapped
+  thrust::device_ptr<int> wrapped_ptr = thrust::device_pointer_cast(raw_ptr);
+
+  // back to where we started
+  assert(wrapped_ptr == d_ptr);
+  (void)wrapped_ptr; // for when NDEBUG is defined
+
+  // deallocate device memory
+  thrust::device_free(d_ptr);
+
+  return 0;
+}
+
diff --git a/thrust/examples/discrete_voronoi.cu b/thrust/examples/discrete_voronoi.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bfbf2242d9685e8b385ccf81c1ec276755b4b470
--- /dev/null
+++ b/thrust/examples/discrete_voronoi.cu
@@ -0,0 +1,246 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/tuple.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "include/timer.h"
+
+// Compute an approximate Voronoi Diagram with a Jump Flooding Algorithm (JFA)
+//
+// References
+//   http://en.wikipedia.org/wiki/Voronoi_diagram
+//   http://www.comp.nus.edu.sg/~tants/jfa.html
+//   http://www.utdallas.edu/~guodongrong/Papers/Dissertation.pdf
+//
+// Thanks to David Coeurjolly for contributing this example
+
+
+
+// minFunctor
+// Tuple  = <seeds,seeds + k,seeds + m*k, seeds - k, 
+//           seeds - m*k, seeds+ k+m*k,seeds + k-m*k,
+//           seeds- k+m*k,seeds - k+m*k, i>
+struct minFunctor
+{
+  int m, n, k;
+  
+  __host__ __device__
+  minFunctor(int m, int n, int k)
+    : m(m), n(n), k(k) {}
+
+
+  //To decide I have to change my current Voronoi site
+  __host__ __device__
+      int minVoro(int x_i, int y_i, int p, int q)
+      {    
+          if (q == m*n)
+              return p;
+
+          // coordinates of points p and q
+          int y_q =  q / m;
+          int x_q =  q - y_q * m;
+          int y_p =  p / m;
+          int x_p =  p - y_p * m;
+        
+          // squared distances
+          int d_iq = (x_i-x_q) * (x_i-x_q) + (y_i-y_q) * (y_i-y_q);
+          int d_ip = (x_i-x_p) * (x_i-x_p) + (y_i-y_p) * (y_i-y_p);
+
+          if (d_iq < d_ip)
+              return q;  // q is closer
+          else
+              return p;
+      }
+
+  //For each point p+{-k,0,k}, we keep the Site with minimum distance
+  template <typename Tuple>
+  __host__ __device__
+  int operator()(const Tuple &t)
+  {
+      //Current point and site
+      int i = thrust::get<9>(t);
+      int v = thrust::get<0>(t);
+
+      //Current point coordinates
+      int y = i / m;    
+      int x = i - y * m;
+
+      if (x >= k)
+      {
+          v = minVoro(x, y, v, thrust::get<3>(t));
+
+          if (y >= k)
+              v = minVoro(x, y, v, thrust::get<8>(t));
+
+          if (y + k < n)
+              v = minVoro(x, y, v, thrust::get<7>(t));
+      }
+
+      if (x + k < m)
+      { 
+          v = minVoro(x, y, v, thrust::get<1>(t));
+
+          if (y >= k)
+              v = minVoro(x, y, v, thrust::get<6>(t));
+          if (y + k < n)
+              v = minVoro(x, y, v, thrust::get<5>(t));
+      }
+
+      if (y >= k)
+          v = minVoro(x, y, v, thrust::get<4>(t));
+      if (y + k < n)
+          v = minVoro(x, y, v, thrust::get<2>(t));
+
+      //global return
+      return v;
+  }
+};
+
+
+
+// print an M-by-N array
+template <typename T>
+void print(int m, int n, const thrust::device_vector<T>& d_data)
+{
+    thrust::host_vector<T> h_data = d_data;
+
+    for(int i = 0; i < m; i++)
+    {
+        for(int j = 0; j < n; j++)
+            std::cout << std::setw(4) << h_data[i * n + j] << " ";
+        std::cout << "\n";
+    }
+}
+
+
+void generate_random_sites(thrust::host_vector<int> &t, int Nb, int m, int n)
+{
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(0, m * n - 1);
+
+  for(int k = 0; k < Nb; k++)
+  {
+      int index = dist(rng);
+      t[index] = index + 1;
+  }
+}
+
+//Export the tab to PGM image format
+void vector_to_pgm(thrust::host_vector<int> &t, int m, int n, const char *out)
+{
+    assert(static_cast<int>(t.size()) == m * n &&
+           "Vector size does not match image dims.");
+
+    std::fstream f(out, std::fstream::out);
+    f << "P2\n";
+    f << m << " " << n << "\n";
+    f << "253\n";
+
+    //Hash function to map values to [0,255]
+    auto to_grey_level = [](int in_value) -> int
+    {
+        return (71 * in_value) % 253;
+    };
+
+    for (int value : t)
+    {
+      f << to_grey_level(value) << " ";
+    }
+    f << "\n";
+    f.close();
+}
+
+/************Main Jfa loop********************/
+// Perform a jump with step k
+void jfa(thrust::device_vector<int>& in,thrust::device_vector<int>& out, unsigned int k, int m, int n)
+{
+   thrust::transform(
+        thrust::make_zip_iterator(
+            thrust::make_tuple(in.begin(), 
+                               in.begin() + k, 
+                               in.begin() + m*k, 
+                               in.begin() - k, 
+                               in.begin() - m*k, 
+                               in.begin() + k+m*k,
+                               in.begin() + k-m*k,
+                               in.begin() - k+m*k,
+                               in.begin() - k-m*k,
+                               thrust::counting_iterator<int>(0))),
+        thrust::make_zip_iterator(
+            thrust::make_tuple(in.begin(), 
+				    		   in.begin() + k, 
+                               in.begin() + m*k, 
+                               in.begin() - k, 
+                               in.begin() - m*k, 
+                               in.begin() + k+m*k,
+                               in.begin() + k-m*k,
+                               in.begin() - k+m*k,
+                               in.begin() - k-m*k,
+                               thrust::counting_iterator<int>(0)))+ n*m,
+        out.begin(),
+        minFunctor(m,n,k));
+}
+/********************************************/
+
+void display_time(timer& t)
+{
+  std::cout << "  ( "<< 1e3 * t.elapsed() << "ms )" << std::endl;
+}
+
+int main(void)
+{
+  int m = 2048; // number of rows
+  int n = 2048; // number of columns  
+  int s = 1000; // number of sites
+  
+  timer t;
+ 
+  //Host vector to encode a 2D image
+  std::cout << "[Inititialize " << m << "x" << n << " Image]" << std::endl;
+  t.restart();
+  thrust::host_vector<int> seeds_host(m*n, m*n);
+  generate_random_sites(seeds_host,s,m,n);
+  display_time(t);
+  
+  std::cout<<"[Copy to Device]" << std::endl;
+  t.restart();
+  thrust::device_vector<int> seeds = seeds_host;
+  thrust::device_vector<int> temp(seeds);
+  display_time(t);
+
+  //JFA+1  : before entering the log(n) loop, we perform a jump with k=1
+  std::cout<<"[JFA stepping]" << std::endl;
+  t.restart();
+  jfa(seeds,temp,1,m,n);
+  seeds.swap(temp);
+ 
+  //JFA : main loop with k=n/2, n/4, ..., 1
+  for(int k = thrust::max(m,n) / 2; k > 0; k /= 2)
+  {
+    jfa(seeds,temp,k,m,n);
+    seeds.swap(temp);
+  }
+
+  display_time(t);
+  std::cout <<"  ( " <<  seeds.size() / (1e6 * t.elapsed()) << " MPixel/s ) " << std::endl;
+  
+  std::cout << "[Device to Host Copy]" << std::endl;
+  t.restart();
+  seeds_host = seeds;
+  display_time(t);
+  
+  std::cout << "[PGM Export]" << std::endl;
+  t.restart();
+  vector_to_pgm(seeds_host, m, n, "discrete_voronoi.pgm");
+  display_time(t);
+
+  return 0;
+}
+
diff --git a/thrust/examples/dot_products_with_zip.cu b/thrust/examples/dot_products_with_zip.cu
new file mode 100644
index 0000000000000000000000000000000000000000..81ff7ac1281ccbe4f86ebfd4a824f074e8c7ecdf
--- /dev/null
+++ b/thrust/examples/dot_products_with_zip.cu
@@ -0,0 +1,133 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/random.h>
+
+
+// This example shows how thrust::zip_iterator can be used to create a
+// 'virtual' array of structures.  In this case the structure is a 3d
+// vector type (Float3) whose (x,y,z) components will be stored in
+// three separate float arrays.  The zip_iterator "zips" these arrays
+// into a single virtual Float3 array.
+
+
+
+// We'll use a 3-tuple to store our 3d vector type
+typedef thrust::tuple<float,float,float> Float3;
+
+
+// This functor implements the dot product between 3d vectors
+struct DotProduct : public thrust::binary_function<Float3,Float3,float>
+{
+    __host__ __device__
+        float operator()(const Float3& a, const Float3& b) const
+        {
+            return thrust::get<0>(a) * thrust::get<0>(b) +    // x components
+                   thrust::get<1>(a) * thrust::get<1>(b) +    // y components
+                   thrust::get<2>(a) * thrust::get<2>(b);     // z components
+        }
+};
+
+
+
+// Return a host vector with random values in the range [0,1)
+thrust::host_vector<float> random_vector(const size_t N,
+                                         unsigned int seed = thrust::default_random_engine::default_seed)
+{
+    thrust::default_random_engine rng(seed);
+    thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
+    thrust::host_vector<float> temp(N);
+    for(size_t i = 0; i < N; i++) {
+        temp[i] = u01(rng);
+    }
+    return temp;
+}
+
+
+int main(void)
+{
+    // number of vectors
+    const size_t N = 1000;
+
+    // We'll store the components of the 3d vectors in separate arrays. One set of
+    // arrays will store the 'A' vectors and another set will store the 'B' vectors.
+
+    // This 'structure of arrays' (SoA) approach is usually more efficient than the
+    // 'array of structures' (AoS) approach.  The primary reason is that structures,
+    // like Float3, don't always obey the memory coalescing rules, so they are not
+    // efficiently transferred to and from memory.  Another reason to prefer SoA to
+    // AoS is that we don't aways want to process all members of the structure.  For
+    // example, if we only need to look at first element of the structure then it
+    // is wasteful to load the entire structure from memory.  With the SoA approach,
+    // we can chose which elements of the structure we wish to read.
+
+    thrust::device_vector<float> A0 = random_vector(N);  // x components of the 'A' vectors
+    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors
+    thrust::device_vector<float> A2 = random_vector(N);  // z components of the 'A' vectors
+
+    thrust::device_vector<float> B0 = random_vector(N);  // x components of the 'B' vectors
+    thrust::device_vector<float> B1 = random_vector(N);  // y components of the 'B' vectors
+    thrust::device_vector<float> B2 = random_vector(N);  // z components of the 'B' vectors
+
+    // Storage for result of each dot product
+    thrust::device_vector<float> result(N);
+
+
+    // We'll now illustrate two ways to use zip_iterator to compute the dot
+    // products.  The first method is verbose but shows how the parts fit together.
+    // The second method hides these details and is more concise.
+
+
+    // METHOD #1
+    // Defining a zip_iterator type can be a little cumbersome ...
+    typedef thrust::device_vector<float>::iterator                     FloatIterator;
+    typedef thrust::tuple<FloatIterator, FloatIterator, FloatIterator> FloatIteratorTuple;
+    typedef thrust::zip_iterator<FloatIteratorTuple>                   Float3Iterator;
+
+    // Now we'll create some zip_iterators for A and B
+    Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin()));
+    Float3Iterator A_last  = thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end()));
+    Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin()));
+
+    // Finally, we pass the zip_iterators into transform() as if they
+    // were 'normal' iterators for a device_vector<Float3>.
+    thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());
+
+
+    // METHOD #2
+    // Alternatively, we can avoid creating variables for X_first, X_last,
+    // and Y_first and invoke transform() directly.
+    thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())),
+                       thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end())),
+                       thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())),
+                       result.begin(),
+                       DotProduct() );
+
+
+
+    // Finally, we'll print a few results
+
+    // Example output
+    // (0.840188,0.45724,0.0860517) * (0.0587587,0.456151,0.322409) = 0.285683
+    // (0.394383,0.640368,0.180886) * (0.0138811,0.24875,0.0221609) = 0.168775
+    // (0.783099,0.717092,0.426423) * (0.622212,0.0699601,0.234811) = 0.63755
+    // (0.79844,0.460067,0.0470658) * (0.0391351,0.742097,0.354747) = 0.389358
+    std::cout << std::fixed;
+    for(size_t i = 0; i < 4; i++)
+    {
+        Float3 a = A_first[i];
+        Float3 b = B_first[i];
+        float dot = result[i];
+
+        std::cout << "(" << thrust::get<0>(a) << "," << thrust::get<1>(a) << "," << thrust::get<2>(a) << ")";
+        std::cout << " * ";
+        std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")";
+        std::cout << " = ";
+        std::cout << dot << std::endl;
+    }
+
+    return 0;
+}
+
diff --git a/thrust/examples/expand.cu b/thrust/examples/expand.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4547bcd130041f2f90ec7ad1c03882afe602f71c
--- /dev/null
+++ b/thrust/examples/expand.cu
@@ -0,0 +1,99 @@
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/gather.h>
+#include <thrust/scan.h>
+#include <thrust/fill.h>
+#include <thrust/copy.h>
+
+#include <iterator>
+#include <iostream>
+
+// This example demonstrates how to expand an input sequence by 
+// replicating each element a variable number of times. For example,
+//
+//   expand([2,2,2],[A,B,C]) -> [A,A,B,B,C,C]
+//   expand([3,0,1],[A,B,C]) -> [A,A,A,C]
+//   expand([1,3,2],[A,B,C]) -> [A,B,B,B,C,C]
+//
+// The element counts are assumed to be non-negative integers
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator expand(InputIterator1 first1,
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      OutputIterator output)
+{
+  typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
+  
+  difference_type input_size  = thrust::distance(first1, last1);
+  difference_type output_size = thrust::reduce(first1, last1);
+
+  // scan the counts to obtain output offsets for each input element
+  thrust::device_vector<difference_type> output_offsets(input_size, 0);
+  thrust::exclusive_scan(first1, last1, output_offsets.begin()); 
+
+  // scatter the nonzero counts into their corresponding output positions
+  thrust::device_vector<difference_type> output_indices(output_size, 0);
+  thrust::scatter_if
+    (thrust::counting_iterator<difference_type>(0),
+     thrust::counting_iterator<difference_type>(input_size),
+     output_offsets.begin(),
+     first1,
+     output_indices.begin());
+
+  // compute max-scan over the output indices, filling in the holes
+  thrust::inclusive_scan
+    (output_indices.begin(),
+     output_indices.end(),
+     output_indices.begin(),
+     thrust::maximum<difference_type>());
+
+  // gather input values according to index array (output = first2[output_indices])
+  OutputIterator output_end = output; thrust::advance(output_end, output_size);
+  thrust::gather(output_indices.begin(),
+                 output_indices.end(),
+                 first2,
+                 output);
+
+  // return output + output_size
+  thrust::advance(output, output_size);
+  return output;
+}
+
+template <typename Vector>
+void print(const std::string& s, const Vector& v)
+{
+  typedef typename Vector::value_type T;
+
+  std::cout << s;
+  thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
+  std::cout << std::endl;
+}
+
+int main(void)
+{
+  int counts[] = {3,5,2,0,1,3,4,2,4};
+  int values[] = {1,2,3,4,5,6,7,8,9};
+
+  size_t input_size  = sizeof(counts) / sizeof(int);
+  size_t output_size = thrust::reduce(counts, counts + input_size);
+
+  // copy inputs to device
+  thrust::device_vector<int> d_counts(counts, counts + input_size);
+  thrust::device_vector<int> d_values(values, values + input_size);
+  thrust::device_vector<int> d_output(output_size);
+
+  // expand values according to counts
+  expand(d_counts.begin(), d_counts.end(),
+         d_values.begin(),
+         d_output.begin());
+
+  std::cout << "Expanding values according to counts" << std::endl;
+  print(" counts ", d_counts);
+  print(" values ", d_values);
+  print(" output ", d_output);
+
+  return 0;
+}
diff --git a/thrust/examples/fill_copy_sequence.cu b/thrust/examples/fill_copy_sequence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..797f9686a6572f35b997cb8034a8e5811b974c66
--- /dev/null
+++ b/thrust/examples/fill_copy_sequence.cu
@@ -0,0 +1,31 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <thrust/fill.h>
+#include <thrust/sequence.h>
+
+#include <iostream>
+
+int main(void)
+{
+    // initialize all ten integers of a device_vector to 1
+    thrust::device_vector<int> D(10, 1);
+
+    // set the first seven elements of a vector to 9
+    thrust::fill(D.begin(), D.begin() + 7, 9);
+
+    // initialize a host_vector with the first five elements of D
+    thrust::host_vector<int> H(D.begin(), D.begin() + 5);
+
+    // set the elements of H to 0, 1, 2, 3, ...
+    thrust::sequence(H.begin(), H.end());
+
+    // copy all of H back to the beginning of D
+    thrust::copy(H.begin(), H.end(), D.begin());
+
+    // print D
+    for(size_t i = 0; i < D.size(); i++)
+        std::cout << "D[" << i << "] = " << D[i] << std::endl;
+
+    return 0;
+}
diff --git a/thrust/examples/histogram.cu b/thrust/examples/histogram.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fbcfd0aabbf431cf24cb88623d4b5bd08efe6745
--- /dev/null
+++ b/thrust/examples/histogram.cu
@@ -0,0 +1,190 @@
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+#include <thrust/copy.h>
+#include <thrust/random.h>
+#include <thrust/inner_product.h>
+#include <thrust/binary_search.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <iostream>
+#include <iomanip>
+#include <iterator>
+
+// This example illustrates several methods for computing a
+// histogram [1] with Thrust.  We consider standard "dense"
+// histograms, where some bins may have zero entries, as well
+// as "sparse" histograms, where only the nonzero bins are
+// stored.  For example, histograms for the data set
+//    [2 1 0 0 2 2 1 1 1 1 4]
+// which contains 2 zeros, 5 ones, and 3 twos and 1 four, is
+//    [2 5 3 0 1]
+// using the dense method and 
+//    [(0,2), (1,5), (2,3), (4,1)]
+// using the sparse method. Since there are no threes, the 
+// sparse histogram representation does not contain a bin
+// for that value.
+//
+// Note that we choose to store the sparse histogram in two
+// separate arrays, one array of keys and one array of bin counts,
+//    [0 1 2 4] - keys
+//    [2 5 3 1] - bin counts
+// This "structure of arrays" format is generally faster and
+// more convenient to process than the alternative "array
+// of structures" layout.
+//
+// The best histogramming methods depends on the application.
+// If the number of bins is relatively small compared to the 
+// input size, then the binary search-based dense histogram
+// method is probably best.  If the number of bins is comparable
+// to the input size, then the reduce_by_key-based sparse method 
+// ought to be faster.  When in doubt, try both and see which
+// is fastest.
+//
+// [1] http://en.wikipedia.org/wiki/Histogram
+
+
+// simple routine to print contents of a vector
+template <typename Vector>
+void print_vector(const std::string& name, const Vector& v)
+{
+  typedef typename Vector::value_type T;
+  std::cout << "  " << std::setw(20) << name << "  ";
+  thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
+  std::cout << std::endl;
+}
+
+// dense histogram using binary search
+template <typename Vector1, 
+          typename Vector2>
+void dense_histogram(const Vector1& input,
+                           Vector2& histogram)
+{
+  typedef typename Vector1::value_type ValueType; // input value type
+  typedef typename Vector2::value_type IndexType; // histogram index type
+
+  // copy input data (could be skipped if input is allowed to be modified)
+  thrust::device_vector<ValueType> data(input);
+    
+  // print the initial data
+  print_vector("initial data", data);
+
+  // sort data to bring equal elements together
+  thrust::sort(data.begin(), data.end());
+  
+  // print the sorted data
+  print_vector("sorted data", data);
+
+  // number of histogram bins is equal to the maximum value plus one
+  IndexType num_bins = data.back() + 1;
+
+  // resize histogram storage
+  histogram.resize(num_bins);
+  
+  // find the end of each bin of values
+  thrust::counting_iterator<IndexType> search_begin(0);
+  thrust::upper_bound(data.begin(), data.end(),
+                      search_begin, search_begin + num_bins,
+                      histogram.begin());
+  
+  // print the cumulative histogram
+  print_vector("cumulative histogram", histogram);
+
+  // compute the histogram by taking differences of the cumulative histogram
+  thrust::adjacent_difference(histogram.begin(), histogram.end(),
+                              histogram.begin());
+
+  // print the histogram
+  print_vector("histogram", histogram);
+}
+
+// sparse histogram using reduce_by_key
+template <typename Vector1,
+          typename Vector2,
+          typename Vector3>
+void sparse_histogram(const Vector1& input,
+                            Vector2& histogram_values,
+                            Vector3& histogram_counts)
+{
+  typedef typename Vector1::value_type ValueType; // input value type
+  typedef typename Vector3::value_type IndexType; // histogram index type
+
+  // copy input data (could be skipped if input is allowed to be modified)
+  thrust::device_vector<ValueType> data(input);
+    
+  // print the initial data
+  print_vector("initial data", data);
+
+  // sort data to bring equal elements together
+  thrust::sort(data.begin(), data.end());
+  
+  // print the sorted data
+  print_vector("sorted data", data);
+
+  // number of histogram bins is equal to number of unique values (assumes data.size() > 0)
+  IndexType num_bins = thrust::inner_product(data.begin(), data.end() - 1,
+                                             data.begin() + 1,
+                                             IndexType(1),
+                                             thrust::plus<IndexType>(),
+                                             thrust::not_equal_to<ValueType>());
+
+  // resize histogram storage
+  histogram_values.resize(num_bins);
+  histogram_counts.resize(num_bins);
+  
+  // compact find the end of each bin of values
+  thrust::reduce_by_key(data.begin(), data.end(),
+                        thrust::constant_iterator<IndexType>(1),
+                        histogram_values.begin(),
+                        histogram_counts.begin());
+  
+  // print the sparse histogram
+  print_vector("histogram values", histogram_values);
+  print_vector("histogram counts", histogram_counts);
+}
+
+int main(void)
+{
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(0, 9);
+
+  const int N = 40;
+  const int S = 4;
+
+  // generate random data on the host
+  thrust::host_vector<int> input(N);
+  for(int i = 0; i < N; i++)
+  {
+    int sum = 0;
+    for (int j = 0; j < S; j++)
+      sum += dist(rng);
+    input[i] = sum / S;
+  }
+
+  // demonstrate dense histogram method
+  {
+    std::cout << "Dense Histogram" << std::endl;
+    thrust::device_vector<int> histogram;
+    dense_histogram(input, histogram);
+  }
+  
+  // demonstrate sparse histogram method
+  {
+    std::cout << "Sparse Histogram" << std::endl;
+    thrust::device_vector<int> histogram_values;
+    thrust::device_vector<int> histogram_counts;
+    sparse_histogram(input, histogram_values, histogram_counts);
+  }
+
+  // Note: 
+  // A dense histogram can be converted to a sparse histogram
+  // using stream compaction (i.e. thrust::copy_if).
+  // A sparse histogram can be expanded into a dense histogram
+  // by initializing the dense histogram to zero (with thrust::fill)
+  // and then scattering the histogram counts (with thrust::scatter).
+
+  return 0;
+}
+
diff --git a/thrust/examples/include/timer.h b/thrust/examples/include/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c405195a17408a26b89285cada760d6a07f5d320
--- /dev/null
+++ b/thrust/examples/include/timer.h
@@ -0,0 +1,112 @@
+/*
+ *  Copyright 2008-2009 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// A simple timer class
+
+#ifdef __CUDACC__
+
+// use CUDA's high-resolution timers when possible
+#include <cuda_runtime_api.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#include <string>
+
+void cuda_safe_call(cudaError_t error, const std::string& message = "")
+{
+  if(error)
+    throw thrust::system_error(error, thrust::cuda_category(), message);
+}
+
+struct timer
+{
+  cudaEvent_t start;
+  cudaEvent_t end;
+
+  timer(void)
+  {
+    cuda_safe_call(cudaEventCreate(&start));
+    cuda_safe_call(cudaEventCreate(&end));
+    restart();
+  }
+
+  ~timer(void)
+  {
+    cuda_safe_call(cudaEventDestroy(start));
+    cuda_safe_call(cudaEventDestroy(end));
+  }
+
+  void restart(void)
+  {
+    cuda_safe_call(cudaEventRecord(start, 0));
+  }
+
+  double elapsed(void)
+  {
+    cuda_safe_call(cudaEventRecord(end, 0));
+    cuda_safe_call(cudaEventSynchronize(end));
+
+    float ms_elapsed;
+    cuda_safe_call(cudaEventElapsedTime(&ms_elapsed, start, end));
+    return ms_elapsed / 1e3;
+  }
+
+  double epsilon(void)
+  {
+    return 0.5e-6;
+  }
+};
+
+#else
+
+// fallback to clock()
+#include <ctime>
+
+struct timer
+{
+  clock_t start;
+  clock_t end;
+
+  timer(void)
+  {
+    restart();
+  }
+
+  ~timer(void)
+  {
+  }
+
+  void restart(void)
+  {
+    start = clock();
+  }
+
+  double elapsed(void)
+  {
+    end = clock();
+
+    return static_cast<double>(end - start) / static_cast<double>(CLOCKS_PER_SEC);
+  }
+
+  double epsilon(void)
+  {
+    return 1.0 / static_cast<double>(CLOCKS_PER_SEC);
+  }
+};
+
+#endif
+
diff --git a/thrust/examples/lambda.cu b/thrust/examples/lambda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..65b75f6275011e881c1797bbbaca6139caa34b4b
--- /dev/null
+++ b/thrust/examples/lambda.cu
@@ -0,0 +1,81 @@
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <iostream>
+
+// This example demonstrates the use of placeholders to implement
+// the SAXPY operation (i.e. Y[i] = a * X[i] + Y[i]).
+//
+// Placeholders enable developers to write concise inline expressions
+// instead of full functors for many simple operations.  For example,
+// the placeholder expression "_1 + _2" means to add the first argument,
+// represented by _1, to the second argument, represented by _2.
+// The names _1, _2, _3, _4 ... _10 represent the first ten arguments
+// to the function.
+// 
+// In this example, the placeholder expression "a * _1 + _2" is used
+// to implement the SAXPY operation.  Note that the placeholder 
+// implementation is considerably shorter and written inline. 
+
+
+// allows us to use "_1" instead of "thrust::placeholders::_1"
+using namespace thrust::placeholders;
+
+
+// implementing SAXPY with a functor is cumbersome and verbose
+struct saxpy_functor
+  : public thrust::binary_function<float, float, float>
+{
+  float a;
+  
+  saxpy_functor(float a) : a(a) {}
+
+  __host__ __device__
+  float operator()(float x, float y)
+  {
+    return a * x + y;
+  }
+};
+
+
+
+int main(void)
+{
+  // input data
+  float a = 2.0f;
+  float x[4] = {1, 2, 3, 4};
+  float y[4] = {1, 1, 1, 1};
+   
+  // SAXPY implemented with a functor (function object)
+  {
+    thrust::device_vector<float> X(x, x + 4);
+    thrust::device_vector<float> Y(y, y + 4);
+
+    thrust::transform(X.begin(), X.end(),  // input range #1
+                      Y.begin(),           // input range #2
+                      Y.begin(),           // output range
+                      saxpy_functor(a));   // functor
+    
+    std::cout << "SAXPY (functor method)" << std::endl;
+    for (size_t i = 0; i < 4; i++)
+      std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;
+  }
+
+  // SAXPY implemented with a placeholders
+  {
+    thrust::device_vector<float> X(x, x + 4);
+    thrust::device_vector<float> Y(y, y + 4);
+
+    thrust::transform(X.begin(), X.end(),  // input range #1
+                      Y.begin(),           // input range #2
+                      Y.begin(),           // output range
+                      a * _1 + _2);        // placeholder expression
+
+    std::cout << "SAXPY (placeholder method)" << std::endl;
+    for (size_t i = 0; i < 4; i++)
+      std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/lexicographical_sort.cu b/thrust/examples/lexicographical_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2f7d88bdf2c80eb646fa5387cea1c480637aba78
--- /dev/null
+++ b/thrust/examples/lexicographical_sort.cu
@@ -0,0 +1,92 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/gather.h>
+#include <thrust/random.h>
+#include <iostream>
+
+// This example shows how to perform a lexicographical sort on multiple keys.
+//
+// http://en.wikipedia.org/wiki/Lexicographical_order
+
+template <typename KeyVector, typename PermutationVector>
+void update_permutation(KeyVector& keys, PermutationVector& permutation)
+{
+    // temporary storage for keys
+    KeyVector temp(keys.size());
+
+    // permute the keys with the current reordering
+    thrust::gather(permutation.begin(), permutation.end(), keys.begin(), temp.begin());
+
+    // stable_sort the permuted keys and update the permutation
+    thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
+}
+
+
+template <typename KeyVector, typename PermutationVector>
+void apply_permutation(KeyVector& keys, PermutationVector& permutation)
+{
+    // copy keys to temporary vector
+    KeyVector temp(keys.begin(), keys.end());
+
+    // permute the keys
+    thrust::gather(permutation.begin(), permutation.end(), temp.begin(), keys.begin());
+}
+
+
+thrust::host_vector<int> random_vector(size_t N)
+{
+    thrust::host_vector<int> vec(N);
+    static thrust::default_random_engine rng;
+    static thrust::uniform_int_distribution<int> dist(0, 9);
+
+    for (size_t i = 0; i < N; i++)
+        vec[i] = dist(rng);
+
+    return vec;
+}
+
+
+int main(void)
+{
+    size_t N = 20;
+
+    // generate three arrays of random values
+    thrust::device_vector<int> upper  = random_vector(N);
+    thrust::device_vector<int> middle = random_vector(N);
+    thrust::device_vector<int> lower  = random_vector(N);
+    
+    std::cout << "Unsorted Keys" << std::endl;
+    for(size_t i = 0; i < N; i++)
+    {
+        std::cout << "(" << upper[i] << "," << middle[i] << "," << lower[i] << ")" << std::endl;
+    }
+
+    // initialize permutation to [0, 1, 2, ... ,N-1]
+    thrust::device_vector<int> permutation(N);
+    thrust::sequence(permutation.begin(), permutation.end());
+
+    // sort from least significant key to most significant keys
+    update_permutation(lower,  permutation);
+    update_permutation(middle, permutation);
+    update_permutation(upper,  permutation);
+
+    // Note: keys have not been modified
+    // Note: permutation now maps unsorted keys to sorted order
+  
+    // permute the key arrays by the final permuation
+    apply_permutation(lower,  permutation);
+    apply_permutation(middle, permutation);
+    apply_permutation(upper,  permutation);
+
+    std::cout << "Sorted Keys" << std::endl;
+    for(size_t i = 0; i < N; i++)
+    {
+        std::cout << "(" << upper[i] << "," << middle[i] << "," << lower[i] << ")" << std::endl;
+    }
+
+    return 0;
+}
+
diff --git a/thrust/examples/max_abs_diff.cu b/thrust/examples/max_abs_diff.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c9ae4d337f563a07d0880802588d12e9a77dc893
--- /dev/null
+++ b/thrust/examples/max_abs_diff.cu
@@ -0,0 +1,44 @@
+#include <thrust/inner_product.h>
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+
+#include <iostream>
+#include <cmath>
+
+// this example computes the maximum absolute difference 
+// between the elements of two vectors
+
+template <typename T>
+struct abs_diff : public thrust::binary_function<T,T,T>
+{
+    __host__ __device__
+    T operator()(const T& a, const T& b)
+    {
+        return fabsf(b - a);
+    }
+};
+
+
+int main(void)
+{
+    thrust::device_vector<float> d_a(4);
+    thrust::device_vector<float> d_b(4);
+
+    d_a[0] = 1.0;  d_b[0] = 2.0; 
+    d_a[1] = 2.0;  d_b[1] = 4.0;
+    d_a[2] = 3.0;  d_b[2] = 3.0;
+    d_a[3] = 4.0;  d_b[3] = 0.0;
+
+    // initial value of the reduction
+    float init = 0;
+
+    // binary operations
+    thrust::maximum<float> binary_op1;
+    abs_diff<float>        binary_op2;
+
+    float max_abs_diff = thrust::inner_product(d_a.begin(), d_a.end(), d_b.begin(), init, binary_op1, binary_op2); 
+
+    std::cout << "maximum absolute difference: " << max_abs_diff << std::endl;
+    return 0;
+}
+
diff --git a/thrust/examples/minimal_custom_backend.cu b/thrust/examples/minimal_custom_backend.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cb0235a487d1a2554fb54db166921584cd212a26
--- /dev/null
+++ b/thrust/examples/minimal_custom_backend.cu
@@ -0,0 +1,60 @@
+#include <thrust/device_vector.h>
+#include <thrust/for_each.h>
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+#include <iostream>
+
+// This example demonstrates how to build a minimal custom
+// Thrust backend by intercepting for_each's dispatch.
+
+// We begin by defining a "system", which distinguishes our novel
+// backend from other Thrust backends.
+// We'll derive my_system from thrust::device_execution_policy to inherit
+// the functionality of the default device backend.
+// Note that we pass the name of our system as a template parameter
+// to thrust::device_execution_policy.
+struct my_system : thrust::device_execution_policy<my_system> {};
+
+// Next, we'll create a novel version of for_each which only
+// applies to algorithm invocations executed with my_system.
+// Our version of for_each will print a message and then call
+// the regular device version of for_each.
+
+// The first parameter to our version for_each is my_system. This allows
+// Thrust to locate it when dispatching thrust::for_each.
+// The following parameters are as normal.
+template<typename Iterator, typename Function>
+  Iterator for_each(my_system, 
+                    Iterator first, Iterator last,
+                    Function f)
+{
+  // output a message
+  std::cout << "Hello, world from for_each(my_system)!" << std::endl;
+
+  // to call the normal device version of for_each, pass thrust::device as the first parameter.
+  return thrust::for_each(thrust::device, first, last, f);
+}
+
+int main()
+{
+  thrust::device_vector<int> vec(1);
+
+  // create an instance of our system
+  my_system sys;
+
+  // To invoke our version of for_each, pass sys as the first parameter
+  thrust::for_each(sys, vec.begin(), vec.end(), thrust::identity<int>());
+
+  // Other algorithms that Thrust implements with thrust::for_each will also
+  // cause our version of for_each to be invoked when we pass an instance of my_system as the first parameter.
+  // Even though we did not define a special version of transform, Thrust dispatches the version it knows
+  // for thrust::device_execution_policy, which my_system inherits.
+  thrust::transform(sys, vec.begin(), vec.end(), vec.begin(), thrust::identity<int>());
+
+  // Invocations without my_system are handled normally.
+  thrust::for_each(vec.begin(), vec.end(), thrust::identity<int>());
+
+  return 0;
+}
+
diff --git a/thrust/examples/minmax.cu b/thrust/examples/minmax.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3b4a53881356875c63c95751342dbecc945208eb
--- /dev/null
+++ b/thrust/examples/minmax.cu
@@ -0,0 +1,91 @@
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/functional.h>
+#include <thrust/extrema.h>
+#include <thrust/random.h>
+
+
+// compute minimum and maximum values in a single reduction
+
+// minmax_pair stores the minimum and maximum 
+// values that have been encountered so far
+template <typename T>
+struct minmax_pair
+{
+  T min_val;
+  T max_val;
+};
+
+// minmax_unary_op is a functor that takes in a value x and
+// returns a minmax_pair whose minimum and maximum values
+// are initialized to x.
+template <typename T>
+struct minmax_unary_op
+  : public thrust::unary_function< T, minmax_pair<T> >
+{
+  __host__ __device__
+  minmax_pair<T> operator()(const T& x) const
+  {
+    minmax_pair<T> result;
+    result.min_val = x;
+    result.max_val = x;
+    return result;
+  }
+};
+
+// minmax_binary_op is a functor that accepts two minmax_pair 
+// structs and returns a new minmax_pair whose minimum and 
+// maximum values are the min() and max() respectively of 
+// the minimums and maximums of the input pairs
+template <typename T>
+struct minmax_binary_op
+  : public thrust::binary_function< minmax_pair<T>, minmax_pair<T>, minmax_pair<T> >
+{
+  __host__ __device__
+  minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
+  {
+    minmax_pair<T> result;
+    result.min_val = thrust::min(x.min_val, y.min_val);
+    result.max_val = thrust::max(x.max_val, y.max_val);
+    return result;
+  }
+};
+
+
+int main(void)
+{
+  // input size
+  size_t N = 10;
+
+  // initialize random number generator
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(10, 99);
+
+  // initialize data on host
+  thrust::device_vector<int> data(N);
+  for (size_t i = 0; i < data.size(); i++)
+      data[i] = dist(rng);
+
+  // setup arguments
+  minmax_unary_op<int>  unary_op;
+  minmax_binary_op<int> binary_op;
+
+  // initialize reduction with the first value
+  minmax_pair<int> init = unary_op(data[0]);
+
+  // compute minimum and maximum values
+  minmax_pair<int> result = thrust::transform_reduce(data.begin(), data.end(), unary_op, init, binary_op);
+
+  // print results
+  std::cout << "[ ";
+  for(size_t i = 0; i < N; i++)
+    std::cout << data[i] << " ";
+  std::cout << "]" << std::endl;
+ 
+  std::cout << "minimum = " << result.min_val << std::endl;
+  std::cout << "maximum = " << result.max_val << std::endl;
+
+  return 0;
+}
+
diff --git a/thrust/examples/mode.cu b/thrust/examples/mode.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2069adec142d0dba90f4d13df771da623c4ba56f
--- /dev/null
+++ b/thrust/examples/mode.cu
@@ -0,0 +1,84 @@
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+#include <thrust/reduce.h>
+#include <thrust/inner_product.h>
+#include <thrust/extrema.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/random.h>
+
+#include <iostream>
+#include <iterator>
+
+// This example compute the mode [1] of a set of numbers.  If there
+// are multiple modes, one with the smallest value it returned.
+//
+// [1] http://en.wikipedia.org/wiki/Mode_(statistics)
+
+int main(void)
+{
+    const size_t N = 30;
+    const size_t M = 10;
+    thrust::default_random_engine rng;
+    thrust::uniform_int_distribution<int> dist(0, M - 1);
+
+    // generate random data on the host
+    thrust::host_vector<int> h_data(N);
+    for(size_t i = 0; i < N; i++)
+        h_data[i] = dist(rng);
+
+    // transfer data to device
+    thrust::device_vector<int> d_data(h_data);
+    
+    // print the initial data
+    std::cout << "initial data" << std::endl;
+    thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+
+    // sort data to bring equal elements together
+    thrust::sort(d_data.begin(), d_data.end());
+    
+    // print the sorted data
+    std::cout << "sorted data" << std::endl;
+    thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+
+    // count number of unique keys
+    size_t num_unique = thrust::inner_product(d_data.begin(), d_data.end() - 1,
+                                              d_data.begin() + 1,
+                                              0,
+                                              thrust::plus<int>(),
+                                              thrust::not_equal_to<int>()) + 1;
+
+    // count multiplicity of each key
+    thrust::device_vector<int> d_output_keys(num_unique);
+    thrust::device_vector<int> d_output_counts(num_unique);
+    thrust::reduce_by_key(d_data.begin(), d_data.end(),
+                          thrust::constant_iterator<int>(1),
+                          d_output_keys.begin(),
+                          d_output_counts.begin());
+    
+    // print the counts
+    std::cout << "values" << std::endl;
+    thrust::copy(d_output_keys.begin(), d_output_keys.end(), std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+
+    // print the counts
+    std::cout << "counts" << std::endl;
+    thrust::copy(d_output_counts.begin(), d_output_counts.end(), std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+
+    // find the index of the maximum count
+    thrust::device_vector<int>::iterator mode_iter;
+    mode_iter = thrust::max_element(d_output_counts.begin(), d_output_counts.end());
+
+    int mode = d_output_keys[mode_iter - d_output_counts.begin()];
+    int occurances = *mode_iter;
+    
+    std::cout << "Modal value " << mode << " occurs " << occurances << " times " << std::endl;
+    
+    return 0;
+}
+
diff --git a/thrust/examples/monte_carlo.cu b/thrust/examples/monte_carlo.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a11c4de89d48ca1c380fb3e9b32234323d2cec2
--- /dev/null
+++ b/thrust/examples/monte_carlo.cu
@@ -0,0 +1,80 @@
+#include <thrust/random.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/functional.h>
+#include <thrust/transform_reduce.h>
+
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+
+// we could vary M & N to find the perf sweet spot
+
+__host__ __device__
+unsigned int hash(unsigned int a)
+{
+    a = (a+0x7ed55d16) + (a<<12);
+    a = (a^0xc761c23c) ^ (a>>19);
+    a = (a+0x165667b1) + (a<<5);
+    a = (a+0xd3a2646c) ^ (a<<9);
+    a = (a+0xfd7046c5) + (a<<3);
+    a = (a^0xb55a4f09) ^ (a>>16);
+    return a;
+}
+
+struct estimate_pi : public thrust::unary_function<unsigned int,float>
+{
+  __host__ __device__
+  float operator()(unsigned int thread_id)
+  {
+    float sum = 0;
+    unsigned int N = 10000; // samples per thread
+
+    unsigned int seed = hash(thread_id);
+
+    // seed a random number generator
+    thrust::default_random_engine rng(seed);
+
+    // create a mapping from random numbers to [0,1)
+    thrust::uniform_real_distribution<float> u01(0,1);
+
+    // take N samples in a quarter circle
+    for(unsigned int i = 0; i < N; ++i)
+    {
+      // draw a sample from the unit square
+      float x = u01(rng);
+      float y = u01(rng);
+
+      // measure distance from the origin
+      float dist = sqrtf(x*x + y*y);
+
+      // add 1.0f if (u0,u1) is inside the quarter circle
+      if(dist <= 1.0f)
+        sum += 1.0f;
+    }
+
+    // multiply by 4 to get the area of the whole circle
+    sum *= 4.0f;
+
+    // divide by N
+    return sum / N;
+  }
+};
+
+int main(void)
+{
+  // use 30K independent seeds
+  int M = 30000;
+
+  float estimate = thrust::transform_reduce(thrust::counting_iterator<int>(0),
+                                            thrust::counting_iterator<int>(M),
+                                            estimate_pi(),
+                                            0.0f,
+                                            thrust::plus<float>());
+  estimate /= M;
+
+  std::cout << std::setprecision(3);
+  std::cout << "pi is approximately " << estimate << std::endl;
+
+  return 0;
+}
+
diff --git a/thrust/examples/monte_carlo_disjoint_sequences.cu b/thrust/examples/monte_carlo_disjoint_sequences.cu
new file mode 100644
index 0000000000000000000000000000000000000000..77b0d00869294b69139ba1ba1caa1b0cc37c1ef6
--- /dev/null
+++ b/thrust/examples/monte_carlo_disjoint_sequences.cu
@@ -0,0 +1,85 @@
+#include <thrust/random.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/functional.h>
+#include <thrust/transform_reduce.h>
+
+#include <iostream>
+#include <cmath>
+
+// The technique demonstrated in the example monte_carlo.cu 
+// assigns an independently seeded random number generator to each
+// of 30K threads, and uses a hashing scheme based on thread index to
+// seed each RNG. This technique, while simple, may be succeptible
+// to correlation among the streams of numbers generated by each RNG
+// because there is no guarantee that the streams are not disjoint.
+// This example demonstrates a slightly more sophisticated technique
+// which ensures that the subsequences generated in each thread are
+// disjoint. To achieve this, we use a single common stream
+// of random numbers, but partition it among threads to ensure no overlap
+// of substreams. The substreams are generated procedurally using
+// default_random_engine's discard(n) member function, which skips
+// past n states of the RNG. This function is accelerated and executes
+// in O(lg n) time.
+
+struct estimate_pi : public thrust::unary_function<unsigned int,float>
+{
+  __host__ __device__
+  float operator()(unsigned int thread_id)
+  {
+    float sum = 0;
+    unsigned int N = 5000; // samples per stream
+
+    // note that M * N <= default_random_engine::max,
+    // which is also the period of this particular RNG
+    // this ensures the substreams are disjoint
+
+    // create a random number generator
+    // note that each thread uses an RNG with the same seed
+    thrust::default_random_engine rng;
+
+    // jump past the numbers used by the subsequences before me
+    rng.discard(N * thread_id);
+
+    // create a mapping from random numbers to [0,1)
+    thrust::uniform_real_distribution<float> u01(0,1);
+
+    // take N samples in a quarter circle
+    for(unsigned int i = 0; i < N; ++i)
+    {
+      // draw a sample from the unit square
+      float x = u01(rng);
+      float y = u01(rng);
+
+      // measure distance from the origin
+      float dist = sqrtf(x*x + y*y);
+
+      // add 1.0f if (u0,u1) is inside the quarter circle
+      if(dist <= 1.0f)
+        sum += 1.0f;
+    }
+
+    // multiply by 4 to get the area of the whole circle
+    sum *= 4.0f;
+
+    // divide by N
+    return sum / N;
+  }
+};
+
+int main(void)
+{
+  // use 30K subsequences of random numbers
+  int M = 30000;
+
+  float estimate = thrust::transform_reduce(thrust::counting_iterator<int>(0),
+                                            thrust::counting_iterator<int>(M),
+                                            estimate_pi(),
+                                            0.0f,
+                                            thrust::plus<float>());
+  estimate /= M;
+
+  std::cout << "pi is around " << estimate << std::endl;
+
+  return 0;
+}
+
diff --git a/thrust/examples/mr_basic.cu b/thrust/examples/mr_basic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..733799425aa422b4345980385a3e97a00a32996a
--- /dev/null
+++ b/thrust/examples/mr_basic.cu
@@ -0,0 +1,82 @@
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/pool.h>
+#include <thrust/mr/disjoint_pool.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>
+
+#include <cassert>
+
+template<typename Vec>
+void do_stuff_with_vector(typename Vec::allocator_type alloc)
+{
+    Vec v1(alloc);
+    v1.push_back(1);
+    assert(v1.back() == 1);
+
+    Vec v2(alloc);
+    v2 = v1;
+
+    v1.swap(v2);
+
+    v1.clear();
+    v1.resize(2);
+    assert(v1.size() == 2);
+}
+
+int main()
+{
+    thrust::mr::new_delete_resource memres;
+
+    {
+        // no virtual calls will be issued
+        typedef thrust::mr::allocator<int, thrust::mr::new_delete_resource> Alloc;
+        Alloc alloc(&memres);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    {
+        // virtual calls will be issued - wrapping in a polymorphic wrapper
+        thrust::mr::polymorphic_adaptor_resource<void *> adaptor(&memres);
+        typedef thrust::mr::polymorphic_allocator<int, void *> Alloc;
+        Alloc alloc(&adaptor);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    {
+        // use the global device_ptr-flavored device memory resource
+        typedef thrust::device_ptr_memory_resource<thrust::device_memory_resource> Resource;
+        thrust::mr::polymorphic_adaptor_resource<thrust::device_ptr<void> > adaptor(
+            thrust::mr::get_global_resource<Resource>()
+        );
+        typedef thrust::mr::polymorphic_allocator<int, thrust::device_ptr<void> > Alloc;
+        Alloc alloc(&adaptor);
+
+        do_stuff_with_vector<thrust::device_vector<int, Alloc> >(alloc);
+    }
+
+    typedef thrust::mr::unsynchronized_pool_resource<
+        thrust::mr::new_delete_resource
+    > Pool;
+    Pool pool(&memres);
+    {
+        typedef thrust::mr::allocator<int, Pool> Alloc;
+        Alloc alloc(&pool);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    typedef thrust::mr::disjoint_unsynchronized_pool_resource<
+        thrust::mr::new_delete_resource,
+        thrust::mr::new_delete_resource
+    > DisjointPool;
+    DisjointPool disjoint_pool(&memres, &memres);
+    {
+        typedef thrust::mr::allocator<int, DisjointPool> Alloc;
+        Alloc alloc(&disjoint_pool);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+}
diff --git a/thrust/examples/norm.cu b/thrust/examples/norm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0892baaf9162b97db0848885b8f84916bd70758b
--- /dev/null
+++ b/thrust/examples/norm.cu
@@ -0,0 +1,50 @@
+#include <thrust/transform_reduce.h>
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <cmath>
+#include <iostream>
+
+//   This example computes the norm [1] of a vector.  The norm is 
+// computed by squaring all numbers in the vector, summing the 
+// squares, and taking the square root of the sum of squares.  In
+// Thrust this operation is efficiently implemented with the 
+// transform_reduce() algorith.  Specifically, we first transform
+// x -> x^2 and the compute a standard plus reduction.  Since there
+// is no built-in functor for squaring numbers, we define our own
+// square functor.
+//
+// [1] http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
+
+
+// square<T> computes the square of a number f(x) -> x*x
+template <typename T>
+struct square
+{
+    __host__ __device__
+        T operator()(const T& x) const { 
+            return x * x;
+        }
+};
+
+int main(void)
+{
+    // initialize host array
+    float x[4] = {1.0, 2.0, 3.0, 4.0};
+
+    // transfer to device
+    thrust::device_vector<float> d_x(x, x + 4);
+
+    // setup arguments
+    square<float>        unary_op;
+    thrust::plus<float> binary_op;
+    float init = 0;
+
+    // compute norm
+    float norm = std::sqrt( thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op) );
+
+    std::cout << "norm is " << norm << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/padded_grid_reduction.cu b/thrust/examples/padded_grid_reduction.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2467debca8f4f473d7e7dc6a05515e4d1576b580
--- /dev/null
+++ b/thrust/examples/padded_grid_reduction.cu
@@ -0,0 +1,118 @@
+#include <thrust/transform_reduce.h>
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/random.h>
+#include <thrust/extrema.h>
+#include <cmath>
+#include <iomanip>
+#include <float.h>
+
+// This example computes the minimum and maximum values
+// over a padded grid.  The padded values are not considered
+// during the reduction operation.
+
+
+// transform a tuple (int,value) into a tuple (bool,value,value)
+// where the bool is true for valid grid values and false for 
+// values in the padded region of the grid
+template <typename IndexType, typename ValueType>
+struct transform_tuple : 
+    public thrust::unary_function< thrust::tuple<IndexType,ValueType>, 
+                                   thrust::tuple<bool,ValueType,ValueType> >
+{
+  typedef typename thrust::tuple<IndexType,ValueType>      InputTuple;
+  typedef typename thrust::tuple<bool,ValueType,ValueType> OutputTuple;
+
+  IndexType n, N;
+
+  transform_tuple(IndexType n, IndexType N) : n(n), N(N) {}
+
+  __host__ __device__
+    OutputTuple operator()(const InputTuple& t) const
+    { 
+      bool is_valid = (thrust::get<0>(t) % N) < n;
+      return OutputTuple(is_valid, thrust::get<1>(t), thrust::get<1>(t));
+    }
+};
+
+
+// reduce two tuples (bool,value,value) into a single tuple such that output
+// contains the smallest and largest *valid* values.
+template <typename IndexType, typename ValueType>
+struct reduce_tuple :
+    public thrust::binary_function< thrust::tuple<bool,ValueType,ValueType>,
+                                    thrust::tuple<bool,ValueType,ValueType>,
+                                    thrust::tuple<bool,ValueType,ValueType> >
+{
+  typedef typename thrust::tuple<bool,ValueType,ValueType> Tuple;
+
+  __host__ __device__
+    Tuple operator()(const Tuple& t0, const Tuple& t1) const
+    { 
+      if(thrust::get<0>(t0) && thrust::get<0>(t1)) // both valid
+        return Tuple(true, 
+            thrust::min(thrust::get<1>(t0), thrust::get<1>(t1)),
+            thrust::max(thrust::get<2>(t0), thrust::get<2>(t1)));
+      else if (thrust::get<0>(t0))
+        return t0;
+      else if (thrust::get<0>(t1))
+        return t1;
+      else
+        return t1; // if neither is valid then it doesn't matter what we return
+    }
+};
+
+int main(void)
+{
+  int M = 10;  // number of rows
+  int n = 11;  // number of columns excluding padding
+  int N = 16;  // number of columns including padding
+
+  thrust::default_random_engine rng(12345);
+  thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
+
+  thrust::device_vector<float> data(M * N, -1);
+
+  // initialize valid values in grid
+  for(int i = 0; i < M; i++)
+    for(int j = 0; j < n; j++)
+      data[i * N + j] = dist(rng);
+
+  // print full grid
+  std::cout << "padded grid" << std::endl;
+  std::cout << std::fixed << std::setprecision(4);
+  for(int i = 0; i < M; i++)
+  {
+    std::cout << " ";
+    for(int j = 0; j < N; j++)
+    {
+      std::cout << data[i * N + j] << " ";
+    }   
+    std::cout << "\n";
+  }
+  std::cout << "\n";
+
+  // compute min & max over valid region of the 2d grid
+  typedef thrust::tuple<bool, float, float> result_type;
+
+  result_type                 init(true, FLT_MAX, -FLT_MAX); // initial value
+  transform_tuple<int,float>  unary_op(n, N);                // transformation operator
+  reduce_tuple<int,float>     binary_op;                     // reduction operator
+
+  result_type result = 
+    thrust::transform_reduce(
+        thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), data.begin())),
+        thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), data.begin())) + data.size(),
+        unary_op,
+        init,
+        binary_op);
+
+  std::cout << "minimum value: " << thrust::get<1>(result) << std::endl;
+  std::cout << "maximum value: " << thrust::get<2>(result) << std::endl;
+
+  return 0;
+}
+
diff --git a/thrust/examples/permutation_iterator.cu b/thrust/examples/permutation_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..793c8aa12538379621b557a8261ab4407e477e04
--- /dev/null
+++ b/thrust/examples/permutation_iterator.cu
@@ -0,0 +1,36 @@
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/device_vector.h>
+#include <iostream>
+
+// this example fuses a gather operation with a reduction for
+// greater efficiency than separate gather() and reduce() calls
+
+int main(void)
+{
+    // gather locations
+    thrust::device_vector<int> map(4);
+    map[0] = 3;
+    map[1] = 1;
+    map[2] = 0;
+    map[3] = 5;
+
+    // array to gather from
+    thrust::device_vector<int> source(6);
+    source[0] = 10;
+    source[1] = 20;
+    source[2] = 30;
+    source[3] = 40;
+    source[4] = 50;
+    source[5] = 60;
+
+    // fuse gather with reduction: 
+    //   sum = source[map[0]] + source[map[1]] + ...
+    int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()),
+                             thrust::make_permutation_iterator(source.begin(), map.end()));
+
+    // print sum
+    std::cout << "sum is " << sum << std::endl;
+
+    return 0;
+}
diff --git a/thrust/examples/raw_reference_cast.cu b/thrust/examples/raw_reference_cast.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ec9a9783f2316348c942902f4390c2beb7ced4de
--- /dev/null
+++ b/thrust/examples/raw_reference_cast.cu
@@ -0,0 +1,112 @@
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/fill.h>
+#include <iostream>
+
+// This example illustrates how to use the raw_reference_cast to convert
+// system-specific reference wrappers into native references.
+//
+// Using iterators in the manner described here is generally discouraged.
+// Users should only resort to this technique if there is no viable
+// implemention of a given operation in terms of Thrust algorithms.
+// For example this particular example is better solved with thrust::copy,
+// which is safer and potentially faster.  Only use this approach after all
+// safer alternatives have been exhausted.
+//
+// When a Thrust iterator is referenced (e.g. *iter) the result is not
+// a native or "raw" reference like int& or float&.  Instead,
+// the result is a type such as thrust::system::cuda::reference<int>
+// or thrust::system::tbb::reference<float>, depending on the system
+// to which the data belongs.  These reference wrappers are necessary
+// to make expressions like *iter1 = *iter2; work correctly when
+// iter1 and iter2 refer to data in different memory spaces on
+// heterogenous systems.
+//
+// The raw_reference_cast function essentially strips away the system-specific
+// meta-data so it should only be used when the code is guaranteed to be
+// executed within an appropriate context.
+
+
+__host__ __device__
+void assign_reference_to_reference(int& x, int& y)
+{
+  y = x;
+}
+
+__host__ __device__
+void assign_value_to_reference(int x, int& y)
+{
+  y = x;
+}
+
+template <typename InputIterator,
+          typename OutputIterator>
+struct copy_iterators
+{
+  InputIterator  input;
+  OutputIterator output;
+
+  copy_iterators(InputIterator input, OutputIterator output)
+    : input(input), output(output)
+  {}
+
+  __host__ __device__
+  void operator()(int i)
+  {
+    InputIterator  in  = input  + i;
+    OutputIterator out = output + i;
+
+    // invalid - reference<int> is not convertible to int&
+    // assign_reference_to_reference(*in, *out);
+   
+    // valid - reference<int> explicitly converted to int&
+    assign_reference_to_reference(thrust::raw_reference_cast(*in), thrust::raw_reference_cast(*out));
+
+    // valid - since reference<int> is convertible to int
+    assign_value_to_reference(*in, thrust::raw_reference_cast(*out));
+  }
+};
+
+template <typename Vector>
+void print(const std::string& name, const Vector& v)
+{
+  typedef typename Vector::value_type T;
+
+  std::cout << name << ": ";
+  thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));  
+  std::cout << "\n";
+}
+
+int main(void)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator           Iterator;
+  typedef thrust::device_system_tag  System;
+
+  size_t N = 5;
+
+  // allocate device memory
+  Vector A(N);
+  Vector B(N);
+
+  // initialize A and B
+  thrust::sequence(A.begin(), A.end());
+  thrust::fill(B.begin(), B.end(), 0);
+
+  std::cout << "Before A->B Copy" << std::endl;
+  print("A", A);
+  print("B", B);
+
+  // note: we must specify the System to ensure correct execution
+  thrust::for_each(thrust::counting_iterator<int,System>(0),
+                   thrust::counting_iterator<int,System>(N),
+                   copy_iterators<Iterator,Iterator>(A.begin(), B.begin()));
+  
+  std::cout << "After A->B Copy" << std::endl;
+  print("A", A);
+  print("B", B);
+ 
+  return 0;
+}
+
diff --git a/thrust/examples/remove_points2d.cu b/thrust/examples/remove_points2d.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0bca500a66ee65e0b2fd8c9104b4d22d1fa31de0
--- /dev/null
+++ b/thrust/examples/remove_points2d.cu
@@ -0,0 +1,69 @@
+#include <thrust/host_vector.h>
+#include <thrust/remove.h>
+#include <thrust/random.h>
+
+// This example generates random points in the 
+// unit square [0,1)x[0,1) and then removes all 
+// points where x^2 + y^2 > 1
+//
+// The x and y coordinates are stored in separate arrays
+// and a zip_iterator is used to combine them together
+
+template <typename T>
+struct is_outside_circle
+{
+    template <typename Tuple>
+    inline __host__ __device__
+    bool operator()(const Tuple& tuple) const
+    {
+        // unpack the tuple into x and y coordinates
+        const T x = thrust::get<0>(tuple);
+        const T y = thrust::get<1>(tuple);
+
+        if (x*x + y*y > 1)
+            return true;
+        else
+            return false;
+    }
+};
+
+int main(void)
+{
+    const size_t N = 20;
+
+    // generate random points in the unit square on the host
+    thrust::default_random_engine rng;
+    thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
+    thrust::host_vector<float> x(N);
+    thrust::host_vector<float> y(N);
+    for(size_t i = 0; i < N; i++)
+    {
+        x[i] = u01(rng);
+        y[i] = u01(rng);
+    }
+
+    // print the initial points
+    std::cout << std::fixed;
+    std::cout << "Generated " << N << " points" << std::endl;
+    for(size_t i = 0; i < N; i++)
+        std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
+    std::cout << std::endl;
+
+    // remove points where x^2 + y^2 > 1 and determine new array sizes
+    size_t new_size = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
+                                        thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
+                                        is_outside_circle<float>())
+                      - thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin()));
+
+    // resize the vectors (note: this does not free any memory)
+    x.resize(new_size);
+    y.resize(new_size);
+
+    // print the filtered points
+    std::cout << "After stream compaction, " << new_size << " points remain" << std::endl;
+    for(size_t i = 0; i < new_size; i++)
+        std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/repeated_range.cu b/thrust/examples/repeated_range.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a309b80a6d9ac014ed13d865c84075d5fb41ef20
--- /dev/null
+++ b/thrust/examples/repeated_range.cu
@@ -0,0 +1,90 @@
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/functional.h>
+#include <thrust/fill.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <iostream>
+
+// this example illustrates how to make repeated access to a range of values
+// examples:
+//   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3] 
+//   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
+//   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] 
+//   ...
+
+template <typename Iterator>
+class repeated_range
+{
+    public:
+
+    typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+    struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
+    {
+        difference_type repeats;
+
+        repeat_functor(difference_type repeats)
+            : repeats(repeats) {}
+
+        __host__ __device__
+        difference_type operator()(const difference_type& i) const
+        { 
+            return i / repeats;
+        }
+    };
+
+    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+    typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
+    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+    // type of the repeated_range iterator
+    typedef PermutationIterator iterator;
+
+    // construct repeated_range for the range [first,last)
+    repeated_range(Iterator first, Iterator last, difference_type repeats)
+        : first(first), last(last), repeats(repeats) {}
+   
+    iterator begin(void) const
+    {
+        return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
+    }
+
+    iterator end(void) const
+    {
+        return begin() + repeats * (last - first);
+    }
+    
+    protected:
+    Iterator first;
+    Iterator last;
+    difference_type repeats;
+};
+
+int main(void)
+{
+    thrust::device_vector<int> data(4);
+    data[0] = 10;
+    data[1] = 20;
+    data[2] = 30;
+    data[3] = 40;
+
+    // print the initial data
+    std::cout << "range        ";
+    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    typedef thrust::device_vector<int>::iterator Iterator;
+  
+    // create repeated_range with elements repeated twice
+    repeated_range<Iterator> twice(data.begin(), data.end(), 2);
+    std::cout << "repeated x2: ";
+    thrust::copy(twice.begin(), twice.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+    
+    // create repeated_range with elements repeated x3
+    repeated_range<Iterator> thrice(data.begin(), data.end(), 3);
+    std::cout << "repeated x3: ";
+    thrust::copy(thrice.begin(), thrice.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    return 0;
+}
diff --git a/thrust/examples/run_length_decoding.cu b/thrust/examples/run_length_decoding.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd73c30f5b4914bbd33c934de71e2906a1bb6a33
--- /dev/null
+++ b/thrust/examples/run_length_decoding.cu
@@ -0,0 +1,60 @@
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/copy.h>
+#include <thrust/gather.h>
+#include <thrust/binary_search.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <iostream>
+#include <iterator>
+
+// This example decodes a run-length code [1] for an array of characters.
+//
+// [1] http://en.wikipedia.org/wiki/Run-length_encoding
+
+
+int main(void)
+{
+    // allocate storage for compressed input and run lengths
+    thrust::device_vector<char> input(6);
+    thrust::device_vector<int>  lengths(6);
+    input[0] = 'a';  lengths[0] = 3;
+    input[1] = 'b';  lengths[1] = 5;
+    input[2] = 'c';  lengths[2] = 1;
+    input[3] = 'd';  lengths[3] = 2;
+    input[4] = 'e';  lengths[4] = 9;
+    input[5] = 'f';  lengths[5] = 2;
+    
+    // print the initial data
+    std::cout << "run-length encoded input:" << std::endl;
+    for(size_t i = 0; i < 6; i++)
+        std::cout << "(" << input[i] << "," << lengths[i] << ")";
+    std::cout << std::endl << std::endl;
+
+    // scan the lengths
+    thrust::inclusive_scan(lengths.begin(), lengths.end(), lengths.begin());
+    
+    // output size is sum of the run lengths
+    int N = lengths.back();
+
+    // compute input index for each output element
+    thrust::device_vector<int> indices(N);
+    thrust::lower_bound(lengths.begin(), lengths.end(),
+                        thrust::counting_iterator<int>(1),
+                        thrust::counting_iterator<int>(N + 1),
+                        indices.begin());
+
+    // gather input elements
+    thrust::device_vector<char> output(N);
+    thrust::gather(indices.begin(), indices.end(),
+                   input.begin(),
+                   output.begin());
+
+    // print the initial data
+    std::cout << "decoded output:" << std::endl;
+    thrust::copy(output.begin(), output.end(), std::ostream_iterator<char>(std::cout, ""));
+    std::cout << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/run_length_encoding.cu b/thrust/examples/run_length_encoding.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4019b9722f416af17de3afa051a20bda76907259
--- /dev/null
+++ b/thrust/examples/run_length_encoding.cu
@@ -0,0 +1,49 @@
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/reduce.h>
+
+#include <iostream>
+#include <iterator>
+
+// This example computes a run-length code [1] for an array of characters.
+//
+// [1] http://en.wikipedia.org/wiki/Run-length_encoding
+
+
+int main(void)
+{
+    // input data on the host
+    const char data[] = "aaabbbbbcddeeeeeeeeeff";
+
+    const size_t N = (sizeof(data) / sizeof(char)) - 1;
+
+    // copy input data to the device
+    thrust::device_vector<char> input(data, data + N);
+
+    // allocate storage for output data and run lengths
+    thrust::device_vector<char> output(N);
+    thrust::device_vector<int>  lengths(N);
+    
+    // print the initial data
+    std::cout << "input data:" << std::endl;
+    thrust::copy(input.begin(), input.end(), std::ostream_iterator<char>(std::cout, ""));
+    std::cout << std::endl << std::endl;
+
+    // compute run lengths
+    size_t num_runs = thrust::reduce_by_key
+                                    (input.begin(), input.end(),          // input key sequence
+                                     thrust::constant_iterator<int>(1),   // input value sequence
+                                     output.begin(),                      // output key sequence
+                                     lengths.begin()                      // output value sequence
+                                     ).first - output.begin();            // compute the output size
+    
+    // print the output
+    std::cout << "run-length encoded output:" << std::endl;
+    for(size_t i = 0; i < num_runs; i++)
+        std::cout << "(" << output[i] << "," << lengths[i] << ")";
+    std::cout << std::endl;
+    
+    return 0;
+}
+
diff --git a/thrust/examples/saxpy.cu b/thrust/examples/saxpy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bbc6b6156e887e54e0888092c7fb3adac8a7b284
--- /dev/null
+++ b/thrust/examples/saxpy.cu
@@ -0,0 +1,76 @@
+#include <thrust/transform.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/functional.h>
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+// This example illustrates how to implement the SAXPY
+// operation (Y[i] = a * X[i] + Y[i]) using Thrust. 
+// The saxpy_slow function demonstrates the most
+// straightforward implementation using a temporary
+// array and two separate transformations, one with
+// multiplies and one with plus.  The saxpy_fast function
+// implements the operation with a single transformation
+// and represents "best practice".
+
+struct saxpy_functor : public thrust::binary_function<float,float,float>
+{
+    const float a;
+
+    saxpy_functor(float _a) : a(_a) {}
+
+    __host__ __device__
+        float operator()(const float& x, const float& y) const { 
+            return a * x + y;
+        }
+};
+
+void saxpy_fast(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y)
+{
+    // Y <- A * X + Y
+    thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
+}
+
+void saxpy_slow(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y)
+{
+    thrust::device_vector<float> temp(X.size());
+   
+    // temp <- A
+    thrust::fill(temp.begin(), temp.end(), A);
+    
+    // temp <- A * X
+    thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies<float>());
+
+    // Y <- A * X + Y
+    thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus<float>());
+}
+
+int main(void)
+{
+    // initialize host arrays
+    float x[4] = {1.0, 1.0, 1.0, 1.0};
+    float y[4] = {1.0, 2.0, 3.0, 4.0};
+
+    {
+        // transfer to device
+        thrust::device_vector<float> X(x, x + 4);
+        thrust::device_vector<float> Y(y, y + 4);
+
+        // slow method
+        saxpy_slow(2.0, X, Y);
+    }
+
+    {
+        // transfer to device
+        thrust::device_vector<float> X(x, x + 4);
+        thrust::device_vector<float> Y(y, y + 4);
+
+        // fast method
+        saxpy_fast(2.0, X, Y);
+    }
+    
+    return 0;
+}
+
diff --git a/thrust/examples/scan_by_key.cu b/thrust/examples/scan_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f353da55692aec27cc1c6c2a29bff4c984fcd9d6
--- /dev/null
+++ b/thrust/examples/scan_by_key.cu
@@ -0,0 +1,93 @@
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <thrust/scan.h>
+#include <iostream>
+
+// BinaryPredicate for the head flag segment representation
+// equivalent to thrust::not2(thrust::project2nd<int,int>()));
+template <typename HeadFlagType>
+struct head_flag_predicate 
+    : public thrust::binary_function<HeadFlagType,HeadFlagType,bool>
+{
+    __host__ __device__
+    bool operator()(HeadFlagType, HeadFlagType right) const
+    {
+        return !right;
+    }
+};
+
+template <typename Vector>
+void print(const Vector& v)
+{
+  for(size_t i = 0; i < v.size(); i++)
+    std::cout << v[i] << " ";
+  std::cout << "\n";
+}
+
+int main(void)
+{
+    int keys[]   = {0,0,0,1,1,2,2,2,2,3,4,4,5,5,5};  // segments represented with keys
+    int flags[]  = {1,0,0,1,0,1,0,0,0,1,1,0,1,0,0};  // segments represented with head flags
+    int values[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};  // values corresponding to each key
+
+    int N = sizeof(keys) / sizeof(int); // number of elements
+
+    // copy input data to device
+    thrust::device_vector<int> d_keys  (keys,   keys   + N);
+    thrust::device_vector<int> d_flags (flags,  flags  + N);
+    thrust::device_vector<int> d_values(values, values + N);
+    
+    // allocate storage for output
+    thrust::device_vector<int> d_output(N);
+
+    // inclusive scan using keys
+    thrust::inclusive_scan_by_key
+      (d_keys.begin(), d_keys.end(),
+       d_values.begin(),
+       d_output.begin());
+   
+    std::cout << "Inclusive Segmented Scan w/ Key Sequence\n";
+    std::cout << " keys          : ";  print(d_keys);
+    std::cout << " input values  : ";  print(d_values);
+    std::cout << " output values : ";  print(d_output);
+    
+    // inclusive scan using head flags
+    thrust::inclusive_scan_by_key
+      (d_flags.begin(), d_flags.end(),
+       d_values.begin(), 
+       d_output.begin(),
+       head_flag_predicate<int>());
+    
+    std::cout << "\nInclusive Segmented Scan w/ Head Flag Sequence\n";
+    std::cout << " head flags    : ";  print(d_flags);
+    std::cout << " input values  : ";  print(d_values);
+    std::cout << " output values : ";  print(d_output);
+    
+    // exclusive scan using keys
+    thrust::exclusive_scan_by_key
+      (d_keys.begin(), d_keys.end(),
+       d_values.begin(),
+       d_output.begin());
+   
+    std::cout << "\nExclusive Segmented Scan w/ Key Sequence\n";
+    std::cout << " keys          : ";  print(d_keys);
+    std::cout << " input values  : ";  print(d_values);
+    std::cout << " output values : ";  print(d_output);
+    
+    // exclusive scan using head flags
+    thrust::exclusive_scan_by_key
+      (d_flags.begin(), d_flags.end(),
+       d_values.begin(), 
+       d_output.begin(),
+       0,
+       head_flag_predicate<int>());
+    
+    std::cout << "\nExclusive Segmented Scan w/ Head Flag Sequence\n";
+    std::cout << " head flags    : ";  print(d_flags);
+    std::cout << " input values  : ";  print(d_values);
+    std::cout << " output values : ";  print(d_output);
+
+
+    return 0;
+}
+
diff --git a/thrust/examples/scan_matrix_by_rows.cu b/thrust/examples/scan_matrix_by_rows.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1986e9868547cab513c1414b8d20590223653
--- /dev/null
+++ b/thrust/examples/scan_matrix_by_rows.cu
@@ -0,0 +1,73 @@
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <assert.h>
+
+// We have a matrix stored in a `thrust::device_vector`. We want to perform a
+// scan on each row of a matrix.
+
+__host__
+void scan_matrix_by_rows0(thrust::device_vector<int>& u, int n, int m) {
+  // Here, we launch a separate scan for each row in the matrix. This works,
+  // but each kernel only does a small amount of work. It would be better if we
+  // could launch one big kernel for the entire matrix.
+  for (int i = 0; i < n; ++i)
+    thrust::inclusive_scan(u.begin() + m * i, u.begin() + m * (i + 1),
+                           u.begin() + m * i);
+}
+
+// We can batch the operation using `thrust::inclusive_scan_by_key`, which
+// scans each group of consecutive equal keys. All we need to do is generate
+// the right key sequence. We want the keys for elements on the same row to
+// be identical.
+
+// So first, we define an unary function object which takes the index of an
+// element and returns the row that it belongs to.
+
+struct which_row : thrust::unary_function<int, int> {
+  int row_length;
+
+  __host__ __device__
+  which_row(int row_length_) : row_length(row_length_) {}
+
+  __host__ __device__
+  int operator()(int idx) const {
+    return idx / row_length;
+  }
+};
+
+__host__
+void scan_matrix_by_rows1(thrust::device_vector<int>& u, int n, int m) {
+  // This `thrust::counting_iterator` represents the index of the element.
+  thrust::counting_iterator<int> c_first(0);
+
+  // We construct a `thrust::transform_iterator` which applies the `which_row`
+  // function object to the index of each element.
+  thrust::transform_iterator<which_row, thrust::counting_iterator<int> >
+    t_first(c_first, which_row(m));
+
+  // Finally, we use our `thrust::transform_iterator` as the key sequence to
+  // `thrust::inclusive_scan_by_key`.
+  thrust::inclusive_scan_by_key(t_first, t_first + n * m, u.begin(), u.begin());
+}
+
+int main() {
+  int const n = 4;
+  int const m = 5;
+
+  thrust::device_vector<int> u0(n * m);
+  thrust::sequence(u0.begin(), u0.end());
+  scan_matrix_by_rows0(u0, n, m);
+
+  thrust::device_vector<int> u1(n * m);
+  thrust::sequence(u1.begin(), u1.end());
+  scan_matrix_by_rows1(u1, n, m);
+
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j)
+      assert(u0[j + m * i] == u1[j + m * i]);
+}
+
diff --git a/thrust/examples/set_operations.cu b/thrust/examples/set_operations.cu
new file mode 100644
index 0000000000000000000000000000000000000000..43e5bbd598443360204140106154cf3a0fdc3910
--- /dev/null
+++ b/thrust/examples/set_operations.cu
@@ -0,0 +1,156 @@
+#include <thrust/device_vector.h>
+#include <thrust/merge.h>
+#include <thrust/set_operations.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <iostream>
+
+// This example illustrates use of the set operation algorithms
+//  - merge
+//  - set_union
+//  - set_intersection
+//  - set_difference
+//  - set_symmetric_difference
+//
+// In this context a "set" is simply a sequence of sorted values,
+// allowing the standard set operations to be performed more efficiently
+// than on unsorted data.  Since the output of a set operation is a valid
+// set (i.e. a sorted sequence) it is possible to apply the set operations
+// in a nested fashion to compute arbitrary set expressions.
+//
+// Set operation usage notes:
+//   - The output set size is variable (except for thrust::merge),
+//     so the return value is important.
+//   - Generally one would conservatively allocate storage for the output
+//     and then resize or shrink an output container as necessary.
+//     Alternatively, one can compute the exact output size by
+//     outputting to a discard_iterator.  This approach is more computationally
+//     expensive (approximately 2x), but conserves memory capacity.
+//     Refer to the SetIntersectionSize function for implementation details.
+//   - Sets are allowed to have duplicate elements, which are carried
+//     through to the output in a algorithm-specific manner.  Refer
+//     to the full documentation for precise semantics.
+
+
+// helper routine
+template <typename String, typename Vector>
+void print(const String& s, const Vector& v)
+{
+  std::cout << s << " [";
+  for(size_t i = 0; i < v.size(); i++)
+    std::cout << " " << v[i];
+  std::cout << " ]\n";
+}
+
+template <typename Vector>
+void Merge(const Vector& A, const Vector& B)
+{
+  // merged output is always exactly A.size() + B.size()
+  Vector C(A.size() + B.size());
+
+  thrust::merge(A.begin(), A.end(), B.begin(), B.end(), C.begin());
+
+  print("Merge(A,B)", C);
+}
+
+template <typename Vector>
+void SetUnion(const Vector& A, const Vector& B)
+{
+  // union output is at most A.size() + B.size()
+  Vector C(A.size() + B.size());
+
+  // set_union returns an iterator C_end denoting the end of input
+  typename Vector::iterator C_end;
+  
+  C_end = thrust::set_union(A.begin(), A.end(), B.begin(), B.end(), C.begin());
+  
+  // shrink C to exactly fit output
+  C.erase(C_end, C.end());
+
+  print("Union(A,B)", C);
+}
+
+template <typename Vector>
+void SetIntersection(const Vector& A, const Vector& B)
+{
+  // intersection output is at most min(A.size(), B.size())
+  Vector C(thrust::min(A.size(), B.size()));
+
+  // set_union returns an iterator C_end denoting the end of input
+  typename Vector::iterator C_end;
+  
+  C_end = thrust::set_intersection(A.begin(), A.end(), B.begin(), B.end(), C.begin());
+  
+  // shrink C to exactly fit output
+  C.erase(C_end, C.end());
+
+  print("Intersection(A,B)", C);
+}
+
+template <typename Vector>
+void SetDifference(const Vector& A, const Vector& B)
+{
+  // difference output is at most A.size()
+  Vector C(A.size());
+
+  // set_union returns an iterator C_end denoting the end of input
+  typename Vector::iterator C_end;
+  
+  C_end = thrust::set_difference(A.begin(), A.end(), B.begin(), B.end(), C.begin());
+  
+  // shrink C to exactly fit output
+  C.erase(C_end, C.end());
+
+  print("Difference(A,B)", C);
+}
+
+template <typename Vector>
+void SetSymmetricDifference(const Vector& A, const Vector& B)
+{
+  // symmetric difference output is at most A.size() + B.size()
+  Vector C(A.size() + B.size());
+
+  // set_union returns an iterator C_end denoting the end of input
+  typename Vector::iterator C_end;
+  
+  C_end = thrust::set_symmetric_difference(A.begin(), A.end(), B.begin(), B.end(), C.begin());
+  
+  // shrink C to exactly fit output
+  C.erase(C_end, C.end());
+
+  print("SymmetricDifference(A,B)", C);
+}
+
+template <typename Vector>
+void SetIntersectionSize(const Vector& A, const Vector& B)
+{
+  // computes the exact size of the intersection without allocating output
+  thrust::discard_iterator<> C_begin, C_end;
+
+  C_end = thrust::set_intersection(A.begin(), A.end(), B.begin(), B.end(), C_begin);
+
+  std::cout << "SetIntersectionSize(A,B) " << (C_end - C_begin) << std::endl;
+}
+
+
+int main(void)
+{
+  int a[] = {0,2,4,5,6,8,9};
+  int b[] = {0,1,2,3,5,7,8};
+
+  thrust::device_vector<int> A(a, a + sizeof(a) / sizeof(int));
+  thrust::device_vector<int> B(b, b + sizeof(b) / sizeof(int));
+
+  print("Set A", A);
+  print("Set B", B);
+
+  Merge(A,B);
+  SetUnion(A,B);
+  SetIntersection(A,B);
+  SetDifference(A,B);
+  SetSymmetricDifference(A,B);
+
+  SetIntersectionSize(A,B);
+
+  return 0;
+}
+
diff --git a/thrust/examples/simple_moving_average.cu b/thrust/examples/simple_moving_average.cu
new file mode 100644
index 0000000000000000000000000000000000000000..523e8fb379fdcf4844a9e4d724909d160d5c0264
--- /dev/null
+++ b/thrust/examples/simple_moving_average.cu
@@ -0,0 +1,91 @@
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/random.h>
+#include <iostream>
+#include <iomanip>
+
+// Efficiently computes the simple moving average (SMA) [1] of a data series
+// using a parallel prefix-sum or "scan" operation.
+//
+// Note: additional numerical precision should be used in the cumulative summing
+// stage when computing the SMA of large data series.  The most straightforward 
+// remedy is to replace 'float' with 'double'.   Alternatively a Kahan or 
+// "compensated" summation algorithm could be applied [2].
+//
+// [1] http://en.wikipedia.org/wiki/Moving_average#Simple_moving_average
+// [2] http://en.wikipedia.org/wiki/Kahan_summation_algorithm
+
+
+// compute the difference of two positions in the cumumulative sum and
+// divide by the SMA window size w.
+template <typename T>
+struct minus_and_divide : public thrust::binary_function<T,T,T>
+{
+    T w;
+
+    minus_and_divide(T w) : w(w) {}
+
+    __host__ __device__
+    T operator()(const T& a, const T& b) const
+    {
+        return (a - b) / w;
+    }
+};
+
+template <typename InputVector, typename OutputVector>
+void simple_moving_average(const InputVector& data, size_t w, OutputVector& output)
+{
+    typedef typename InputVector::value_type T;
+
+    if (data.size() < w)
+        return;
+    
+    // allocate storage for cumulative sum
+    thrust::device_vector<T> temp(data.size() + 1);
+
+    // compute cumulative sum
+    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
+    temp[data.size()] = data.back() + temp[data.size() - 1];
+
+    // compute moving averages from cumulative sum
+    thrust::transform(temp.begin() + w, temp.end(), temp.begin(), output.begin(), minus_and_divide<T>(T(w)));
+}
+
+int main(void)
+{
+  // length of data series
+  size_t n = 30;
+
+  // window size of the moving average
+  size_t w = 4;
+
+  // generate random data series
+  thrust::device_vector<float> data(n);
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(0, 10);
+  for (size_t i = 0; i < n; i++)
+    data[i] = static_cast<float>(dist(rng));
+
+  // allocate storage for averages
+  thrust::device_vector<float> averages(data.size() - (w - 1));
+
+  // compute SMA using standard summation
+  simple_moving_average(data, w, averages);
+ 
+  // print data series
+  std::cout << "data series: [ ";
+  for (size_t i = 0; i < data.size(); i++)
+    std::cout << data[i] << " ";
+  std::cout << "]" << std::endl;
+
+  // print moving averages
+  std::cout << "simple moving averages (window = " << w << ")" << std::endl;
+  for (size_t i = 0; i < averages.size(); i++)
+    std::cout << "  [" << std::setw(2) << i << "," << std::setw(2) << (i + w) << ") = " << averages[i] << std::endl;
+
+  return 0;
+}
+
diff --git a/thrust/examples/sort.cu b/thrust/examples/sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..700fc5f3fb929cea0fab459a3ba0b87f24bf1272
--- /dev/null
+++ b/thrust/examples/sort.cu
@@ -0,0 +1,168 @@
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/random.h>
+#include <iostream>
+#include <iomanip>
+
+// Helper routines
+
+void initialize(thrust::device_vector<int>& v)
+{
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_int_distribution<int> dist(10, 99);
+  for(size_t i = 0; i < v.size(); i++)
+    v[i] = dist(rng);
+}
+
+void initialize(thrust::device_vector<float>& v)
+{
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_int_distribution<int> dist(2, 19);
+  for(size_t i = 0; i < v.size(); i++)
+    v[i] = dist(rng) / 2.0f;
+}
+
+void initialize(thrust::device_vector< thrust::pair<int,int> >& v)
+{
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_int_distribution<int> dist(0,9);
+  for(size_t i = 0; i < v.size(); i++)
+  {
+    int a = dist(rng);
+    int b = dist(rng);
+    v[i] = thrust::make_pair(a,b);
+  }
+}
+
+void initialize(thrust::device_vector<int>& v1, thrust::device_vector<int>& v2)
+{
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_int_distribution<int> dist(10, 99);
+  for(size_t i = 0; i < v1.size(); i++)
+  {
+    v1[i] = dist(rng);
+    v2[i] = i;
+  }
+}
+
+void print(const thrust::device_vector<int>& v)
+{
+  for(size_t i = 0; i < v.size(); i++)
+    std::cout << " " << v[i];
+  std::cout << "\n";
+}
+
+void print(const thrust::device_vector<float>& v)
+{
+  for(size_t i = 0; i < v.size(); i++)
+    std::cout << " " << std::fixed << std::setprecision(1) << v[i];
+  std::cout << "\n";
+}
+
+void print(const thrust::device_vector< thrust::pair<int,int> >& v)
+{
+  for(size_t i = 0; i < v.size(); i++)
+  {
+    thrust::pair<int,int> p = v[i];
+    std::cout << " (" << p.first << "," << p.second << ")";
+  }
+  std::cout << "\n";
+}
+
+void print(thrust::device_vector<int>& v1, thrust::device_vector<int> v2)
+{
+  for(size_t i = 0; i < v1.size(); i++)
+    std::cout << " (" << v1[i] << "," << std::setw(2) << v2[i] << ")";
+  std::cout << "\n";
+}
+
+
+// user-defined comparison operator that acts like less<int>,
+// except even numbers are considered to be smaller than odd numbers
+struct evens_before_odds
+{
+  __host__ __device__
+  bool operator()(int x, int y)
+  {
+    if (x % 2 == y % 2)
+      return x < y;
+    else if (x % 2)
+      return false;
+    else
+      return true;
+  }
+};
+
+
+int main(void)
+{
+  size_t N = 16;
+
+  std::cout << "sorting integers\n";
+  {
+    thrust::device_vector<int> keys(N);
+    initialize(keys);
+    print(keys);
+    thrust::sort(keys.begin(), keys.end());
+    print(keys);
+  }
+  
+  std::cout << "\nsorting integers (descending)\n";
+  {
+    thrust::device_vector<int> keys(N);
+    initialize(keys);
+    print(keys);
+    thrust::sort(keys.begin(), keys.end(), thrust::greater<int>());
+    print(keys);
+  }
+  
+  std::cout << "\nsorting integers (user-defined comparison)\n";
+  {
+    thrust::device_vector<int> keys(N);
+    initialize(keys);
+    print(keys);
+    thrust::sort(keys.begin(), keys.end(), evens_before_odds());
+    print(keys);
+  }
+
+  std::cout << "\nsorting floats\n";
+  {
+    thrust::device_vector<float> keys(N);
+    initialize(keys);
+    print(keys);
+    thrust::sort(keys.begin(), keys.end());
+    print(keys);
+  }
+  
+  std::cout << "\nsorting pairs\n";
+  {
+    thrust::device_vector< thrust::pair<int,int> > keys(N);
+    initialize(keys);
+    print(keys);
+    thrust::sort(keys.begin(), keys.end());
+    print(keys);
+  }
+  
+  std::cout << "\nkey-value sorting\n";
+  {
+    thrust::device_vector<int> keys(N);
+    thrust::device_vector<int> values(N);
+    initialize(keys, values);
+    print(keys, values);
+    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
+    print(keys, values);
+  }
+  
+  std::cout << "\nkey-value sorting (descending)\n";
+  {
+    thrust::device_vector<int> keys(N);
+    thrust::device_vector<int> values(N);
+    initialize(keys, values);
+    print(keys, values);
+    thrust::sort_by_key(keys.begin(), keys.end(), values.begin(), thrust::greater<int>());
+    print(keys, values);
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/sorting_aos_vs_soa.cu b/thrust/examples/sorting_aos_vs_soa.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1bf990982d44115f1b335371b8fe6480b285a49f
--- /dev/null
+++ b/thrust/examples/sorting_aos_vs_soa.cu
@@ -0,0 +1,91 @@
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/random.h>
+#include <assert.h>
+
+#include "include/timer.h"
+
+// This examples compares sorting performance using Array of Structures (AoS)
+// and Structure of Arrays (SoA) data layout.  Legacy applications will often
+// store data in C/C++ structs, such as MyStruct defined below.  Although 
+// Thrust can process array of structs, it is typically less efficient than
+// the equivalent structure of arrays layout.  In this particular example,
+// the optimized SoA approach is approximately *five times faster* than the
+// traditional AoS method.  Therefore, it is almost always worthwhile to
+// convert AoS data structures to SoA.
+
+struct MyStruct
+{
+  int key;
+  float value;
+
+  __host__ __device__
+    bool operator<(const MyStruct other) const
+    {
+      return key < other.key;
+    }
+};
+
+void initialize_keys(thrust::device_vector<int>& keys)
+{
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(0, 2147483647);
+
+  thrust::host_vector<int> h_keys(keys.size());
+
+  for(size_t i = 0; i < h_keys.size(); i++)
+    h_keys[i] = dist(rng);
+
+  keys = h_keys;
+}
+
+
+void initialize_keys(thrust::device_vector<MyStruct>& structures)
+{
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(0, 2147483647);
+
+  thrust::host_vector<MyStruct> h_structures(structures.size());
+
+  for(size_t i = 0; i < h_structures.size(); i++)
+    h_structures[i].key = dist(rng);
+
+  structures = h_structures;
+}
+
+int main(void)
+{
+  size_t N = 2 * 1024 * 1024;
+
+  // Sort Key-Value pairs using Array of Structures (AoS) storage 
+  {
+    thrust::device_vector<MyStruct> structures(N);
+
+    initialize_keys(structures);
+
+    timer t;
+
+    thrust::sort(structures.begin(), structures.end());
+    assert(thrust::is_sorted(structures.begin(), structures.end()));
+
+    std::cout << "AoS sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
+  }
+
+  // Sort Key-Value pairs using Structure of Arrays (SoA) storage 
+  {
+    thrust::device_vector<int>   keys(N);
+    thrust::device_vector<float> values(N);
+
+    initialize_keys(keys);
+
+    timer t;
+
+    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
+    assert(thrust::is_sorted(keys.begin(), keys.end()));
+
+    std::cout << "SoA sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/sparse_vector.cu b/thrust/examples/sparse_vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c7528cff2d1c111eb41c2152aa0a9746a2913eed
--- /dev/null
+++ b/thrust/examples/sparse_vector.cu
@@ -0,0 +1,108 @@
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/merge.h>
+#include <thrust/reduce.h>
+#include <thrust/inner_product.h>
+#include <cassert>
+#include <iostream>
+
+template <typename IndexVector,
+          typename ValueVector>
+void print_sparse_vector(const IndexVector& A_index,
+                         const ValueVector& A_value)
+{
+    // sanity test
+    assert(A_index.size() == A_value.size());
+
+    for(size_t i = 0; i < A_index.size(); i++)
+        std::cout << "(" << A_index[i] << "," << A_value[i] << ") ";
+    std::cout << std::endl;
+}
+
+template <typename IndexVector1,
+          typename ValueVector1,
+          typename IndexVector2,
+          typename ValueVector2,
+          typename IndexVector3,
+          typename ValueVector3>
+void sum_sparse_vectors(const IndexVector1& A_index,
+                        const ValueVector1& A_value,
+                        const IndexVector2& B_index,
+                        const ValueVector2& B_value,
+                              IndexVector3& C_index,
+                              ValueVector3& C_value)
+{
+    typedef typename IndexVector3::value_type  IndexType;
+    typedef typename ValueVector3::value_type  ValueType;
+
+    // sanity test
+    assert(A_index.size() == A_value.size());
+    assert(B_index.size() == B_value.size());
+
+    size_t A_size = A_index.size();
+    size_t B_size = B_index.size();
+
+    // allocate storage for the combined contents of sparse vectors A and B
+    IndexVector3 temp_index(A_size + B_size);
+    ValueVector3 temp_value(A_size + B_size);
+
+    // merge A and B by index
+    thrust::merge_by_key(A_index.begin(), A_index.end(),
+                         B_index.begin(), B_index.end(),
+                         A_value.begin(),
+                         B_value.begin(),
+                         temp_index.begin(),
+                         temp_value.begin());
+    
+    // compute number of unique indices
+    size_t C_size = thrust::inner_product(temp_index.begin(), temp_index.end() - 1,
+                                          temp_index.begin() + 1,
+                                          size_t(0),
+                                          thrust::plus<size_t>(),
+                                          thrust::not_equal_to<IndexType>()) + 1;
+
+    // allocate space for output
+    C_index.resize(C_size);
+    C_value.resize(C_size);
+
+    // sum values with the same index
+    thrust::reduce_by_key(temp_index.begin(), temp_index.end(),
+                          temp_value.begin(),
+                          C_index.begin(),
+                          C_value.begin(),
+                          thrust::equal_to<IndexType>(),
+                          thrust::plus<ValueType>());
+}
+
+int main(void)
+{
+    // initialize sparse vector A with 4 elements
+    thrust::device_vector<int>   A_index(4);
+    thrust::device_vector<float> A_value(4);
+    A_index[0] = 2;  A_value[0] = 10;
+    A_index[1] = 3;  A_value[1] = 60;
+    A_index[2] = 5;  A_value[2] = 20;
+    A_index[3] = 8;  A_value[3] = 40;
+    
+    // initialize sparse vector B with 6 elements
+    thrust::device_vector<int>   B_index(6);
+    thrust::device_vector<float> B_value(6);
+    B_index[0] = 1;  B_value[0] = 50;
+    B_index[1] = 2;  B_value[1] = 30;
+    B_index[2] = 4;  B_value[2] = 80;
+    B_index[3] = 5;  B_value[3] = 30;
+    B_index[4] = 7;  B_value[4] = 90;
+    B_index[5] = 8;  B_value[5] = 10;
+
+    // compute sparse vector C = A + B
+    thrust::device_vector<int>   C_index;
+    thrust::device_vector<float> C_value;
+    
+    sum_sparse_vectors(A_index, A_value, B_index, B_value, C_index, C_value);
+
+    std::cout << "Computing C = A + B for sparse vectors A and B" << std::endl;
+    std::cout << "A "; print_sparse_vector(A_index, A_value);
+    std::cout << "B "; print_sparse_vector(B_index, B_value);
+    std::cout << "C "; print_sparse_vector(C_index, C_value);
+}
+
diff --git a/thrust/examples/stream_compaction.cu b/thrust/examples/stream_compaction.cu
new file mode 100644
index 0000000000000000000000000000000000000000..95316b06aabfbb4bd17f7bbb300cf1665be1cff9
--- /dev/null
+++ b/thrust/examples/stream_compaction.cu
@@ -0,0 +1,77 @@
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/remove.h>
+#include <iostream>
+#include <iterator>
+#include <string>
+
+// this functor returns true if the argument is odd, and false otherwise
+template <typename T>
+struct is_odd : public thrust::unary_function<T,bool>
+{
+    __host__ __device__
+    bool operator()(T x)
+    {
+        return x % 2;
+    }
+};
+
+
+template <typename Iterator>
+void print_range(const std::string& name, Iterator first, Iterator last)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    std::cout << name << ": ";
+    thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));  
+    std::cout << "\n";
+}
+
+int main(void)
+{
+    // input size
+    size_t N = 10;
+
+    // define some types
+    typedef thrust::device_vector<int> Vector;
+    typedef Vector::iterator           Iterator;
+
+    // allocate storage for array
+    Vector values(N);
+
+    // initialize array to [0, 1, 2, ... ]
+    thrust::sequence(values.begin(), values.end());
+    
+    print_range("values", values.begin(), values.end());
+
+    // allocate output storage, here we conservatively assume all values will be copied
+    Vector output(values.size());
+
+    // copy odd numbers to separate array
+    Iterator output_end = thrust::copy_if(values.begin(), values.end(), output.begin(), is_odd<int>());
+
+    print_range("output", output.begin(), output_end);
+
+    // another approach is to count the number of values that will 
+    // be copied, and allocate an array of the right size
+    size_t N_odd = thrust::count_if(values.begin(), values.end(), is_odd<int>());
+    
+    Vector small_output(N_odd);
+    
+    thrust::copy_if(values.begin(), values.end(), small_output.begin(), is_odd<int>());
+    
+    print_range("small_output", small_output.begin(), small_output.end());
+
+    // we can also compact sequences with the remove functions, which do the opposite of copy
+    Iterator values_end = thrust::remove_if(values.begin(), values.end(), is_odd<int>());
+
+    // since the values after values_end are garbage, we'll resize the vector
+    values.resize(values_end - values.begin());
+
+    print_range("values", values.begin(), values.end());
+
+    return 0;
+}
+
diff --git a/thrust/examples/strided_range.cu b/thrust/examples/strided_range.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3457bc1caca137a652a726be690de1e7506326cc
--- /dev/null
+++ b/thrust/examples/strided_range.cu
@@ -0,0 +1,97 @@
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/functional.h>
+#include <thrust/fill.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <iostream>
+
+// this example illustrates how to make strided access to a range of values
+// examples:
+//   strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6] 
+//   strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
+//   strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
+//   ...
+
+template <typename Iterator>
+class strided_range
+{
+    public:
+
+    typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
+    {
+        difference_type stride;
+
+        stride_functor(difference_type stride)
+            : stride(stride) {}
+
+        __host__ __device__
+        difference_type operator()(const difference_type& i) const
+        { 
+            return stride * i;
+        }
+    };
+
+    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
+    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+    // type of the strided_range iterator
+    typedef PermutationIterator iterator;
+
+    // construct strided_range for the range [first,last)
+    strided_range(Iterator first, Iterator last, difference_type stride)
+        : first(first), last(last), stride(stride) {}
+   
+    iterator begin(void) const
+    {
+        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
+    }
+
+    iterator end(void) const
+    {
+        return begin() + ((last - first) + (stride - 1)) / stride;
+    }
+    
+    protected:
+    Iterator first;
+    Iterator last;
+    difference_type stride;
+};
+
+int main(void)
+{
+    thrust::device_vector<int> data(8);
+    data[0] = 10;
+    data[1] = 20;
+    data[2] = 30;
+    data[3] = 40;
+    data[4] = 50;
+    data[5] = 60;
+    data[6] = 70;
+    data[7] = 80;
+
+    // print the initial data
+    std::cout << "data: ";
+    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    typedef thrust::device_vector<int>::iterator Iterator;
+    
+    // create strided_range with indices [0,2,4,6]
+    strided_range<Iterator> evens(data.begin(), data.end(), 2);
+    std::cout << "sum of even indices: " << thrust::reduce(evens.begin(), evens.end()) << std::endl;
+    
+    // create strided_range with indices [1,3,5,7]
+    strided_range<Iterator> odds(data.begin() + 1, data.end(), 2);
+    std::cout << "sum of odd indices:  " << thrust::reduce(odds.begin(), odds.end()) << std::endl;
+
+    // set odd elements to 0 with fill()
+    std::cout << "setting odd indices to zero: ";
+    thrust::fill(odds.begin(), odds.end(), 0);
+    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    return 0;
+}
diff --git a/thrust/examples/sum.cu b/thrust/examples/sum.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8f841dd2a3255050476f318b3fa44a28bf586f51
--- /dev/null
+++ b/thrust/examples/sum.cu
@@ -0,0 +1,37 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/random.h>
+
+int my_rand(void)
+{
+  static thrust::default_random_engine rng;
+  static thrust::uniform_int_distribution<int> dist(0, 9999);
+  return dist(rng);
+}
+
+int main(void)
+{
+  // generate random data on the host
+  thrust::host_vector<int> h_vec(100);
+  thrust::generate(h_vec.begin(), h_vec.end(), my_rand);
+
+  // transfer to device and compute sum
+  thrust::device_vector<int> d_vec = h_vec;
+
+  // initial value of the reduction
+  int init = 0; 
+ 
+  // binary operation used to reduce values
+  thrust::plus<int> binary_op;
+
+  // compute sum on the device
+  int sum = thrust::reduce(d_vec.begin(), d_vec.end(), init, binary_op);
+
+  // print the sum
+  std::cout << "sum is " << sum << std::endl;
+
+  return 0;
+}
diff --git a/thrust/examples/sum_rows.cu b/thrust/examples/sum_rows.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4d8a2e11ff379b01c43d60c9d3227e6db2f9e58e
--- /dev/null
+++ b/thrust/examples/sum_rows.cu
@@ -0,0 +1,62 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/random.h>
+#include <iostream>
+
+// convert a linear index to a row index
+template <typename T>
+struct linear_index_to_row_index : public thrust::unary_function<T,T>
+{
+  T C; // number of columns
+  
+  __host__ __device__
+  linear_index_to_row_index(T C) : C(C) {}
+
+  __host__ __device__
+  T operator()(T i)
+  {
+    return i / C;
+  }
+};
+
+int main(void)
+{
+  int R = 5;     // number of rows
+  int C = 8;     // number of columns
+  thrust::default_random_engine rng;
+  thrust::uniform_int_distribution<int> dist(10, 99);
+
+  // initialize data
+  thrust::device_vector<int> array(R * C);
+  for (size_t i = 0; i < array.size(); i++)
+    array[i] = dist(rng);
+  
+  // allocate storage for row sums and indices
+  thrust::device_vector<int> row_sums(R);
+  thrust::device_vector<int> row_indices(R);
+  
+  // compute row sums by summing values with equal row indices
+  thrust::reduce_by_key
+    (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)),
+     thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C),
+     array.begin(),
+     row_indices.begin(),
+     row_sums.begin(),
+     thrust::equal_to<int>(),
+     thrust::plus<int>());
+
+  // print data 
+  for(int i = 0; i < R; i++)
+  {
+    std::cout << "[ ";
+    for(int j = 0; j < C; j++)
+      std::cout << array[i * C + j] << " ";
+    std::cout << "] = " << row_sums[i] << "\n";
+  }
+
+  return 0;
+}
+
diff --git a/thrust/examples/summary_statistics.cu b/thrust/examples/summary_statistics.cu
new file mode 100644
index 0000000000000000000000000000000000000000..38785e2b7777564528a7bccab78703b8fe56e764
--- /dev/null
+++ b/thrust/examples/summary_statistics.cu
@@ -0,0 +1,158 @@
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/functional.h>
+#include <thrust/extrema.h>
+#include <cmath>
+#include <limits>
+#include <iostream>
+
+// This example computes several statistical properties of a data
+// series in a single reduction.  The algorithm is described in detail here:
+// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+//
+// Thanks to Joseph Rhoads for contributing this example
+
+
+// structure used to accumulate the moments and other 
+// statistical properties encountered so far.
+template <typename T>
+struct summary_stats_data
+{
+    T n;
+    T min;
+    T max;
+    T mean;
+    T M2;
+    T M3;
+    T M4;
+    
+    // initialize to the identity element
+    void initialize()
+    {
+      n = mean = M2 = M3 = M4 = 0;
+      min = std::numeric_limits<T>::max();
+      max = std::numeric_limits<T>::min();
+    }
+
+    T variance()   { return M2 / (n - 1); }
+    T variance_n() { return M2 / n; }
+    T skewness()   { return std::sqrt(n) * M3 / std::pow(M2, (T) 1.5); }
+    T kurtosis()   { return n * M4 / (M2 * M2); }
+};
+
+// stats_unary_op is a functor that takes in a value x and
+// returns a variace_data whose mean value is initialized to x.
+template <typename T>
+struct summary_stats_unary_op
+{
+    __host__ __device__
+    summary_stats_data<T> operator()(const T& x) const
+    {
+         summary_stats_data<T> result;
+         result.n    = 1;
+         result.min  = x;
+         result.max  = x;
+         result.mean = x;
+         result.M2   = 0;
+         result.M3   = 0;
+         result.M4   = 0;
+
+         return result;
+    }
+};
+
+// summary_stats_binary_op is a functor that accepts two summary_stats_data 
+// structs and returns a new summary_stats_data which are an
+// approximation to the summary_stats for 
+// all values that have been agregated so far
+template <typename T>
+struct summary_stats_binary_op 
+    : public thrust::binary_function<const summary_stats_data<T>&, 
+                                     const summary_stats_data<T>&,
+                                           summary_stats_data<T> >
+{
+    __host__ __device__
+    summary_stats_data<T> operator()(const summary_stats_data<T>& x, const summary_stats_data <T>& y) const
+    {
+        summary_stats_data<T> result;
+        
+        // precompute some common subexpressions
+        T n  = x.n + y.n;
+        T n2 = n  * n;
+        T n3 = n2 * n;
+
+        T delta  = y.mean - x.mean;
+        T delta2 = delta  * delta;
+        T delta3 = delta2 * delta;
+        T delta4 = delta3 * delta;
+        
+        //Basic number of samples (n), min, and max
+        result.n   = n;
+        result.min = thrust::min(x.min, y.min);
+        result.max = thrust::max(x.max, y.max);
+
+        result.mean = x.mean + delta * y.n / n;
+
+        result.M2  = x.M2 + y.M2;
+        result.M2 += delta2 * x.n * y.n / n;
+
+        result.M3  = x.M3 + y.M3;
+        result.M3 += delta3 * x.n * y.n * (x.n - y.n) / n2; 
+        result.M3 += (T) 3.0 * delta * (x.n * y.M2 - y.n * x.M2) / n;
+    
+        result.M4  = x.M4 + y.M4;
+        result.M4 += delta4 * x.n * y.n * (x.n * x.n - x.n * y.n + y.n * y.n) / n3;
+        result.M4 += (T) 6.0 * delta2 * (x.n * x.n * y.M2 + y.n * y.n * x.M2) / n2;
+        result.M4 += (T) 4.0 * delta * (x.n * y.M3 - y.n * x.M3) / n;
+        
+        return result;
+    }
+};
+
+template <typename Iterator>
+void print_range(const std::string& name, Iterator first, Iterator last)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    std::cout << name << ": ";
+    thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));  
+    std::cout << "\n";
+}
+
+
+int main(void)
+{
+    typedef float T;
+
+    // initialize host array
+    T h_x[] = {4, 7, 13, 16};
+
+    // transfer to device
+    thrust::device_vector<T> d_x(h_x, h_x + sizeof(h_x) / sizeof(T));
+
+    // setup arguments
+    summary_stats_unary_op<T>  unary_op;
+    summary_stats_binary_op<T> binary_op;
+    summary_stats_data<T>      init;
+
+    init.initialize();
+
+    // compute summary statistics
+    summary_stats_data<T> result = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);
+
+    std::cout <<"******Summary Statistics Example*****"<<std::endl;
+    print_range("The data", d_x.begin(), d_x.end());
+
+    std::cout <<"Count              : "<< result.n << std::endl;
+    std::cout <<"Minimum            : "<< result.min <<std::endl;
+    std::cout <<"Maximum            : "<< result.max <<std::endl;
+    std::cout <<"Mean               : "<< result.mean << std::endl;
+    std::cout <<"Variance           : "<< result.variance() << std::endl;
+    std::cout <<"Standard Deviation : "<< std::sqrt(result.variance_n()) << std::endl;
+    std::cout <<"Skewness           : "<< result.skewness() << std::endl;
+    std::cout <<"Kurtosis           : "<< result.kurtosis() << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/summed_area_table.cu b/thrust/examples/summed_area_table.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d962df25bde7299f8bb27e1594bd75f553e5f42a
--- /dev/null
+++ b/thrust/examples/summed_area_table.cu
@@ -0,0 +1,119 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/scan.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <iostream>
+#include <iomanip>
+
+// This example computes a summed area table using segmented scan
+// http://en.wikipedia.org/wiki/Summed_area_table
+
+
+
+// convert a linear index to a linear index in the transpose 
+struct transpose_index : public thrust::unary_function<size_t,size_t>
+{
+  size_t m, n;
+
+  __host__ __device__
+  transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
+
+  __host__ __device__
+  size_t operator()(size_t linear_index)
+  {
+      size_t i = linear_index / n;
+      size_t j = linear_index % n;
+
+      return m * j + i;
+  }
+};
+
+// convert a linear index to a row index
+struct row_index : public thrust::unary_function<size_t,size_t>
+{
+  size_t n;
+  
+  __host__ __device__
+  row_index(size_t _n) : n(_n) {}
+
+  __host__ __device__
+  size_t operator()(size_t i)
+  {
+      return i / n;
+  }
+};
+
+// transpose an M-by-N array
+template <typename T>
+void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst)
+{
+  thrust::counting_iterator<size_t> indices(0);
+  
+  thrust::gather
+    (thrust::make_transform_iterator(indices, transpose_index(n, m)),
+     thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
+     src.begin(),
+     dst.begin());
+}
+
+
+// scan the rows of an M-by-N array
+template <typename T>
+void scan_horizontally(size_t n, thrust::device_vector<T>& d_data)
+{
+  thrust::counting_iterator<size_t> indices(0);
+
+  thrust::inclusive_scan_by_key
+    (thrust::make_transform_iterator(indices, row_index(n)),
+     thrust::make_transform_iterator(indices, row_index(n)) + d_data.size(),
+     d_data.begin(),
+     d_data.begin());
+}
+
+// print an M-by-N array
+template <typename T>
+void print(size_t m, size_t n, thrust::device_vector<T>& d_data)
+{
+  thrust::host_vector<T> h_data = d_data;
+
+  for(size_t i = 0; i < m; i++)
+  {
+    for(size_t j = 0; j < n; j++)
+      std::cout << std::setw(8) << h_data[i * n + j] << " ";
+    std::cout << "\n";
+  }
+}
+
+int main(void)
+{
+  size_t m = 3; // number of rows
+  size_t n = 4; // number of columns
+
+  // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
+  thrust::device_vector<int> data(m * n, 1);
+
+  std::cout << "[step 0] initial array" << std::endl;
+  print(m, n, data);
+
+  std::cout << "[step 1] scan horizontally" << std::endl;
+  scan_horizontally(n, data);
+  print(m, n, data);
+
+  std::cout << "[step 2] transpose array" << std::endl;
+  thrust::device_vector<int> temp(m * n);
+  transpose(m, n, data, temp);
+  print(n, m, temp);
+
+  std::cout << "[step 3] scan transpose horizontally" << std::endl;
+  scan_horizontally(m, temp);
+  print(n, m, temp);
+
+  std::cout << "[step 4] transpose the transpose" << std::endl;
+  transpose(n, m, temp, data);
+  print(m, n, data);
+
+  return 0;
+}
diff --git a/thrust/examples/tiled_range.cu b/thrust/examples/tiled_range.cu
new file mode 100644
index 0000000000000000000000000000000000000000..51cc27d5fd752d0ccbf196320899ae391bcb36d7
--- /dev/null
+++ b/thrust/examples/tiled_range.cu
@@ -0,0 +1,91 @@
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/functional.h>
+#include <thrust/fill.h>
+#include <thrust/device_vector.h>
+#include <thrust/copy.h>
+#include <iostream>
+
+// this example illustrates how to tile a range multiple times
+// examples:
+//   tiled_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3] 
+//   tiled_range([0, 1, 2, 3], 2) -> [0, 1, 2, 3, 0, 1, 2, 3] 
+//   tiled_range([0, 1, 2, 3], 3) -> [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3] 
+//   ...
+
+template <typename Iterator>
+class tiled_range
+{
+    public:
+
+    typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+    struct tile_functor : public thrust::unary_function<difference_type,difference_type>
+    {
+        difference_type tile_size;
+
+        tile_functor(difference_type tile_size)
+            : tile_size(tile_size) {}
+
+        __host__ __device__
+        difference_type operator()(const difference_type& i) const
+        { 
+            return i % tile_size;
+        }
+    };
+
+    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+    typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
+    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+    // type of the tiled_range iterator
+    typedef PermutationIterator iterator;
+
+    // construct repeated_range for the range [first,last)
+    tiled_range(Iterator first, Iterator last, difference_type tiles)
+        : first(first), last(last), tiles(tiles) {}
+   
+    iterator begin(void) const
+    {
+        return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(last - first)));
+    }
+
+    iterator end(void) const
+    {
+        return begin() + tiles * (last - first);
+    }
+    
+    protected:
+    Iterator first;
+    Iterator last;
+    difference_type tiles;
+};
+
+int main(void)
+{
+    thrust::device_vector<int> data(4);
+    data[0] = 10;
+    data[1] = 20;
+    data[2] = 30;
+    data[3] = 40;
+
+    // print the initial data
+    std::cout << "range        ";
+    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    typedef thrust::device_vector<int>::iterator Iterator;
+  
+    // create tiled_range with two tiles
+    tiled_range<Iterator> two(data.begin(), data.end(), 2);
+    std::cout << "two tiles:   ";
+    thrust::copy(two.begin(), two.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+    
+    // create tiled_range with three tiles
+    tiled_range<Iterator> three(data.begin(), data.end(), 3);
+    std::cout << "three tiles: ";
+    thrust::copy(three.begin(), three.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/transform_input_output_iterator.cu b/thrust/examples/transform_input_output_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..843de72b4bb46ce221b2411e44e12590433b0886
--- /dev/null
+++ b/thrust/examples/transform_input_output_iterator.cu
@@ -0,0 +1,110 @@
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+#include <thrust/sequence.h>
+#include <iostream>
+
+// Base 2 fixed point
+class ScaledInteger
+{
+  int value_;
+  int scale_;
+
+public:
+  __host__ __device__
+  ScaledInteger(int value, int scale): value_{value}, scale_{scale} {}
+
+  __host__ __device__
+  int value() const { return value_; }
+
+  __host__ __device__
+  ScaledInteger rescale(int scale) const
+  {
+    int shift = scale - scale_;
+    int result = shift < 0 ? value_ << (-shift) : value_ >> shift;
+    return ScaledInteger{result, scale};
+  }
+
+  __host__ __device__
+  friend ScaledInteger operator+(ScaledInteger a, ScaledInteger b)
+  {
+    // Rescale inputs to the lesser of the two scales
+    if (b.scale_ < a.scale_)
+      a = a.rescale(b.scale_);
+    else if (a.scale_ < b.scale_)
+      b = b.rescale(a.scale_);
+    return ScaledInteger{a.value_ + b.value_, a.scale_};
+  }
+};
+
+struct ValueToScaledInteger
+{
+  int scale;
+
+  __host__ __device__
+  ScaledInteger operator()(const int& value) const
+  {
+    return ScaledInteger{value, scale};
+  }
+};
+
+struct ScaledIntegerToValue
+{
+  int scale;
+
+  __host__ __device__
+  int operator()(const ScaledInteger& scaled) const
+  {
+    return scaled.rescale(scale).value();
+  }
+};
+
+int main(void)
+{
+  const size_t size = 4;
+  thrust::device_vector<int> A(size);
+  thrust::device_vector<int> B(size);
+  thrust::device_vector<int> C(size);
+
+  thrust::sequence(A.begin(), A.end(), 1);
+  thrust::sequence(B.begin(), B.end(), 5);
+
+  const int A_scale = 16; // Values in A are left shifted by 16
+  const int B_scale = 8;  // Values in B are left shifted by 8
+  const int C_scale = 4;  // Values in C are left shifted by 4
+
+  auto A_begin = thrust::make_transform_input_output_iterator(A.begin(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto A_end   = thrust::make_transform_input_output_iterator(A.end(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto B_begin = thrust::make_transform_input_output_iterator(B.begin(),
+                    ValueToScaledInteger{B_scale}, ScaledIntegerToValue{B_scale});
+  auto C_begin = thrust::make_transform_input_output_iterator(C.begin(),
+                    ValueToScaledInteger{C_scale}, ScaledIntegerToValue{C_scale});
+
+  // Sum A and B as ScaledIntegers, storing the scaled result in C
+  thrust::transform(A_begin, A_end, B_begin, C_begin, thrust::plus<ScaledInteger>{});
+
+  thrust::host_vector<int> A_h(A);
+  thrust::host_vector<int> B_h(B);
+  thrust::host_vector<int> C_h(C);
+
+  std::cout << std::hex;
+
+  std::cout << "Expected [ ";
+  for (size_t i = 0; i < size; i++) {
+    const int expected = ((A_h[i] << A_scale) + (B_h[i] << B_scale)) >> C_scale;
+    std::cout << expected <<  " ";
+  }
+  std::cout << "] \n";
+
+  std::cout << "Result   [ ";
+  for (size_t i = 0; i < size; i++) {
+    std::cout << C_h[i] <<  " ";
+  }
+  std::cout << "] \n";
+
+  return 0;
+}
+
diff --git a/thrust/examples/transform_iterator.cu b/thrust/examples/transform_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1da8f1e13a8fb7a0c54b13cbd620edc5fdd2294a
--- /dev/null
+++ b/thrust/examples/transform_iterator.cu
@@ -0,0 +1,132 @@
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <iostream>
+#include <iterator>
+#include <string>
+
+// this functor clamps a value to the range [lo, hi]
+template <typename T>
+struct clamp : public thrust::unary_function<T,T>
+{
+    T lo, hi;
+
+    __host__ __device__
+    clamp(T _lo, T _hi) : lo(_lo), hi(_hi) {}
+
+    __host__ __device__
+    T operator()(T x)
+    {
+        if (x < lo)
+            return lo;
+        else if (x < hi)
+            return x;
+        else
+            return hi;
+    }
+};
+
+template <typename T>
+struct simple_negate : public thrust::unary_function<T,T>
+{
+    __host__ __device__
+    T operator()(T x)
+    {
+        return -x;
+    }
+};
+
+template <typename Iterator>
+void print_range(const std::string& name, Iterator first, Iterator last)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    std::cout << name << ": ";
+    thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));  
+    std::cout << "\n";
+}
+
+
+int main(void)
+{
+    // clamp values to the range [1, 5]
+    int lo = 1;
+    int hi = 5;
+
+    // define some types
+    typedef thrust::device_vector<int> Vector;
+    typedef Vector::iterator           VectorIterator;
+
+    // initialize values
+    Vector values(8);
+
+    values[0] =  2;
+    values[1] =  5;
+    values[2] =  7;
+    values[3] =  1;
+    values[4] =  6;
+    values[5] =  0;
+    values[6] =  3;
+    values[7] =  8;
+    
+    print_range("values         ", values.begin(), values.end());
+
+    // define some more types
+    typedef thrust::transform_iterator<clamp<int>, VectorIterator> ClampedVectorIterator;
+
+    // create a transform_iterator that applies clamp() to the values array
+    ClampedVectorIterator cv_begin = thrust::make_transform_iterator(values.begin(), clamp<int>(lo, hi));
+    ClampedVectorIterator cv_end   = cv_begin + values.size();
+    
+    // now [clamped_begin, clamped_end) defines a sequence of clamped values
+    print_range("clamped values ", cv_begin, cv_end);
+
+
+
+    ////
+    // compute the sum of the clamped sequence with reduce()
+    std::cout << "sum of clamped values : " << thrust::reduce(cv_begin, cv_end) << "\n";
+
+
+
+    ////
+    // combine transform_iterator with other fancy iterators like counting_iterator
+    typedef thrust::counting_iterator<int>                           CountingIterator;
+    typedef thrust::transform_iterator<clamp<int>, CountingIterator> ClampedCountingIterator;
+
+    CountingIterator count_begin(0);
+    CountingIterator count_end(10);
+    
+    print_range("sequence         ", count_begin, count_end);
+
+    ClampedCountingIterator cs_begin = thrust::make_transform_iterator(count_begin, clamp<int>(lo, hi));
+    ClampedCountingIterator cs_end   = thrust::make_transform_iterator(count_end,   clamp<int>(lo, hi));
+
+    print_range("clamped sequence ", cs_begin, cs_end);
+
+
+
+    ////
+    // combine transform_iterator with another transform_iterator
+    typedef thrust::transform_iterator<thrust::negate<int>, ClampedCountingIterator> NegatedClampedCountingIterator;
+    
+    NegatedClampedCountingIterator ncs_begin = thrust::make_transform_iterator(cs_begin, thrust::negate<int>());
+    NegatedClampedCountingIterator ncs_end   = thrust::make_transform_iterator(cs_end,   thrust::negate<int>());
+
+    print_range("negated sequence ", ncs_begin, ncs_end);
+
+
+    ////
+    // when a functor does not define result_type, a third template argument must be provided
+    typedef thrust::transform_iterator<simple_negate<int>, VectorIterator, int> NegatedVectorIterator;
+
+    NegatedVectorIterator nv_begin(values.begin(), simple_negate<int>());
+    NegatedVectorIterator nv_end(values.end(), simple_negate<int>());
+    
+    print_range("negated values ", nv_begin, nv_end);
+
+    return 0;
+}
+
diff --git a/thrust/examples/transform_output_iterator.cu b/thrust/examples/transform_output_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1c5a05e068e02e7e7572a59087bae61acd5998da
--- /dev/null
+++ b/thrust/examples/transform_output_iterator.cu
@@ -0,0 +1,44 @@
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <iostream>
+
+struct Functor 
+{
+  template<class Tuple>
+  __host__ __device__
+  float operator()(const Tuple& tuple) const
+  {
+    const float x = thrust::get<0>(tuple);
+    const float y = thrust::get<1>(tuple);
+    return x*y*2.0f / 3.0f;
+  }
+};
+
+int main(void)
+{
+  float u[4] = { 4 , 3,  2,   1};
+  float v[4] = {-1,  1,  1,  -1};
+  int idx[3] = {3, 0, 1};
+  float w[3] = {0, 0, 0};
+
+  thrust::device_vector<float> U(u, u + 4);
+  thrust::device_vector<float> V(v, v + 4);
+  thrust::device_vector<int> IDX(idx, idx + 3);
+  thrust::device_vector<float> W(w, w + 3);
+
+  // gather multiple elements and apply a function before writing result in memory
+  thrust::gather(
+      IDX.begin(), IDX.end(),
+      thrust::make_zip_iterator(thrust::make_tuple(U.begin(), V.begin())),
+      thrust::make_transform_output_iterator(W.begin(), Functor()));
+
+  std::cout << "result= [ ";
+  for (size_t i = 0; i < 3; i++)
+    std::cout << W[i] <<  " ";
+  std::cout << "] \n";
+
+  return 0;
+}
+
diff --git a/thrust/examples/uninitialized_vector.cu b/thrust/examples/uninitialized_vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..90e8141fa47b457351c9ebd512d65e7ef0b5f357
--- /dev/null
+++ b/thrust/examples/uninitialized_vector.cu
@@ -0,0 +1,79 @@
+// Occasionally, it is advantageous to avoid initializing the individual
+// elements of a device_vector. For example, the default behavior of
+// zero-initializing numeric data may introduce undesirable overhead.
+// This example demonstrates how to avoid default construction of a
+// device_vector's data by using a custom allocator.
+
+#include <thrust/device_allocator.h>
+#include <thrust/device_vector.h>
+#include <thrust/logical.h>
+#include <thrust/functional.h>
+#include <cassert>
+
+// uninitialized_allocator is an allocator which
+// derives from device_allocator and which has a
+// no-op construct member function
+template<typename T>
+  struct uninitialized_allocator
+    : thrust::device_allocator<T>
+{
+  // the default generated constructors and destructors are implicitly
+  // marked __host__ __device__, but the current Thrust device_allocator
+  // can only be constructed and destroyed on the host; therefore, we
+  // define these as host only
+  __host__
+  uninitialized_allocator() {}
+  __host__
+  uninitialized_allocator(const uninitialized_allocator & other)
+    : thrust::device_allocator<T>(other) {}
+  __host__
+  ~uninitialized_allocator() {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  uninitialized_allocator & operator=(const uninitialized_allocator &) = default;
+#endif
+
+  // for correctness, you should also redefine rebind when you inherit
+  // from an allocator type; this way, if the allocator is rebound somewhere,
+  // it's going to be rebound to the correct type - and not to its base
+  // type for U
+  template<typename U>
+  struct rebind
+  {
+    typedef uninitialized_allocator<U> other;
+  };
+
+  // note that construct is annotated as
+  // a __host__ __device__ function
+  __host__ __device__
+  void construct(T *)
+  {
+    // no-op
+  }
+};
+
+// to make a device_vector which does not initialize its elements,
+// use uninitialized_allocator as the 2nd template parameter
+typedef thrust::device_vector<float, uninitialized_allocator<float> > uninitialized_vector;
+
+int main()
+{
+  uninitialized_vector vec(10);
+
+  // the initial value of vec's 10 elements is undefined
+
+  // resize without default value does not initialize elements
+  vec.resize(20);
+
+  // resize with default value does initialize elements
+  vec.resize(30, 13);
+
+  // the value of elements [0,20) is still undefined
+  // but the value of elements [20,30) is 13:
+
+  using namespace thrust::placeholders;
+  assert(thrust::all_of(vec.begin() + 20, vec.end(), _1 == 13));
+
+  return 0;
+}
+
diff --git a/thrust/examples/version.cu b/thrust/examples/version.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd0685b2d08a62a08bac9aa5bf36ef4ebaad8160
--- /dev/null
+++ b/thrust/examples/version.cu
@@ -0,0 +1,15 @@
+#include <thrust/version.h>
+#include <iostream>
+
+int main(void)
+{
+    int major = THRUST_MAJOR_VERSION;
+    int minor = THRUST_MINOR_VERSION;
+    int subminor = THRUST_SUBMINOR_VERSION;
+    int patch = THRUST_PATCH_NUMBER;
+
+    std::cout << "Thrust v" << major << "." << minor << "." << subminor << "-" << patch << std::endl;
+
+    return 0;
+}
+
diff --git a/thrust/examples/weld_vertices.cu b/thrust/examples/weld_vertices.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d133473f8e543c933a13689acd963fe8f81fcb25
--- /dev/null
+++ b/thrust/examples/weld_vertices.cu
@@ -0,0 +1,83 @@
+#include <thrust/device_vector.h>
+#include <thrust/remove.h>
+#include <thrust/unique.h>
+#include <thrust/binary_search.h>
+#include <thrust/sort.h>
+
+#include <iostream>
+
+/*
+ * This example "welds" triangle vertices together by taking as
+ * input "triangle soup" and eliminating redundant vertex positions
+ * and shared edges.  A connected mesh is the result.
+ * 
+ *
+ * Input: 9 vertices representing a mesh with 3 triangles
+ *  
+ *  Mesh              Vertices 
+ *    ------           (2)      (5)--(4)    (8)      
+ *    | \ 2| \          | \       \   |      | \
+ *    |  \ |  \   <->   |  \       \  |      |  \
+ *    | 0 \| 1 \        |   \       \ |      |   \
+ *    -----------      (0)--(1)      (3)    (6)--(7)
+ *
+ *   (vertex 1 equals vertex 3, vertex 2 equals vertex 5, ...)
+ *
+ * Output: mesh representation with 5 vertices and 9 indices
+ *
+ *  Vertices            Indices
+ *   (1)--(3)            [(0,2,1),
+ *    | \  | \            (2,3,1), 
+ *    |  \ |  \           (2,4,3)]
+ *    |   \|   \
+ *   (0)--(2)--(4)
+ */
+
+// define a 2d float vector
+typedef thrust::tuple<float,float> vec2;
+
+int main(void)
+{
+    // allocate memory for input mesh representation
+    thrust::device_vector<vec2> input(9);
+
+    input[0] = vec2(0,0);  // First Triangle
+    input[1] = vec2(1,0);
+    input[2] = vec2(0,1);
+    input[3] = vec2(1,0);  // Second Triangle
+    input[4] = vec2(1,1);
+    input[5] = vec2(0,1);
+    input[6] = vec2(1,0);  // Third Triangle
+    input[7] = vec2(2,0);
+    input[8] = vec2(1,1);
+
+    // allocate space for output mesh representation
+    thrust::device_vector<vec2>         vertices = input;
+    thrust::device_vector<unsigned int> indices(input.size());
+
+    // sort vertices to bring duplicates together
+    thrust::sort(vertices.begin(), vertices.end());
+
+    // find unique vertices and erase redundancies
+    vertices.erase(thrust::unique(vertices.begin(), vertices.end()), vertices.end());
+
+    // find index of each input vertex in the list of unique vertices
+    thrust::lower_bound(vertices.begin(), vertices.end(),
+                        input.begin(), input.end(),
+                        indices.begin());
+
+    // print output mesh representation
+    std::cout << "Output Representation" << std::endl;
+    for(size_t i = 0; i < vertices.size(); i++)
+    {
+        vec2 v = vertices[i];
+        std::cout << " vertices[" << i << "] = (" << thrust::get<0>(v) << "," << thrust::get<1>(v) << ")" << std::endl;
+    }
+    for(size_t i = 0; i < indices.size(); i++)
+    {
+        std::cout << " indices[" << i << "] = " << indices[i] << std::endl;
+    }
+
+    return 0;
+}
+
diff --git a/thrust/examples/word_count.cu b/thrust/examples/word_count.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4836c8600af9bad9c9ebd9ae3bde677031f78763
--- /dev/null
+++ b/thrust/examples/word_count.cu
@@ -0,0 +1,79 @@
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+
+#include <iostream>
+
+// This example computes the number of words in a text sample
+// with a single call to thrust::inner_product.  The algorithm
+// counts the number of characters which start a new word, i.e.
+// the number of characters where input[i] is an alphabetical
+// character and input[i-1] is not an alphabetical character.
+
+
+// determines whether the character is alphabetical
+__host__ __device__
+bool is_alpha(const char c)
+{
+    return (c >= 'A' && c <= 'z');
+}
+
+// determines whether the right character begins a new word
+struct is_word_start
+    : public thrust::binary_function<const char&, const char&, bool>
+{
+    __host__ __device__
+    bool operator()(const char& left, const char& right) const
+    {
+        return is_alpha(right) && !is_alpha(left);
+    }
+};
+
+
+int word_count(const thrust::device_vector<char>& input)
+{
+    // check for empty string
+    if (input.empty())
+        return 0;
+
+    // compute the number characters that start a new word
+    int wc = thrust::inner_product(input.begin(), input.end() - 1,  // sequence of left characters
+                                   input.begin() + 1,               // sequence of right characters
+                                   0,                               // initialize sum to 0
+                                   thrust::plus<int>(),             // sum values together
+                                   is_word_start());       // how to compare the left and right characters
+    
+    // if the first character is alphabetical, then it also begins a word
+    if (is_alpha(input.front()))
+        wc++;
+    
+    return wc;
+}
+
+
+int main(void)
+{
+    // Paragraph from 'The Raven' by Edgar Allan Poe
+    // http://en.wikipedia.org/wiki/The_Raven
+    const char raw_input[] = "  But the raven, sitting lonely on the placid bust, spoke only,\n"
+                             "  That one word, as if his soul in that one word he did outpour.\n"
+                             "  Nothing further then he uttered - not a feather then he fluttered -\n"
+                             "  Till I scarcely more than muttered `Other friends have flown before -\n"
+                             "  On the morrow he will leave me, as my hopes have flown before.'\n"
+                             "  Then the bird said, `Nevermore.'\n";
+
+    std::cout << "Text sample:" << std::endl;
+    std::cout << raw_input << std::endl;
+    
+    // transfer to device
+    thrust::device_vector<char> input(raw_input, raw_input + sizeof(raw_input));
+
+    // count words
+    int wc = word_count(input);
+    
+    std::cout << "Text sample contains " << wc << " words" << std::endl;
+        
+    return 0;
+}
+
diff --git a/thrust/generate_mk.py b/thrust/generate_mk.py
new file mode 100755
index 0000000000000000000000000000000000000000..84071338ccfdd99be55027c8046fe46c56e5a65b
--- /dev/null
+++ b/thrust/generate_mk.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# Generate set of projects mk files. 
+# Usage: python generate_mk.py PROJECTS_MK_DIR  THRUST_SOURCE_DIR
+#   The program scans through unit tests and examples in THRUST_SOURCE_DIR
+#   and generates project mk for each of the tests and examples in PROJECTS_MK_DIR
+#   A single example or unit test source file generates its own executable
+#   This program is called by a top level Makefile, but can also be used stand-alone for debugging
+#   This program also generates testing.mk, examples.mk and dependencies.mk
+from __future__ import print_function
+import sys
+import shutil as sh
+import os
+import glob
+import re
+
+test_template = """
+TEST_SRC   := %(TEST_SRC)s
+TEST_NAME  := %(TEST_NAME)s
+include $(ROOTDIR)/thrust/internal/build/generic_test.mk
+"""
+example_template = """
+EXAMPLE_SRC   := %(EXAMPLE_SRC)s
+EXAMPLE_NAME  := %(EXAMPLE_NAME)s
+include $(ROOTDIR)/thrust/internal/build/generic_example.mk
+"""
+
+def Glob(pattern, directory,exclude='\B'):
+    src = glob.glob(os.path.join(directory,pattern))
+    p = re.compile(exclude)
+    src = [s for s in src if not p.match(s)]
+    return src
+
+
+def generate_test_mk(mk_path, test_path, group, TEST_DIR):
+    print('Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"')
+    src_cu  = Glob("*.cu",  test_path, ".*testframework.cu$")
+    src_cxx = Glob("*.cpp", test_path)
+    src_cu.sort();
+    src_cxx.sort();
+    src_all = src_cu + src_cxx;
+    tests_all = []
+    dependencies_all = []
+    for s in src_all:
+        fn = os.path.splitext(os.path.basename(s));
+        t = "thrust."+group+"."+fn[0]
+        e = fn[1]
+        mkfile = test_template % {"TEST_SRC" : s,  "TEST_NAME" : t}
+        f = open(os.path.join(mk_path,t+".mk"), 'w')
+        f.write(mkfile)
+        f.close()
+        tests_all.append(os.path.join(mk_path,t))
+        dependencies_all.append(t+": testframework")
+    return [tests_all, dependencies_all]
+
+def generate_example_mk(mk_path, example_path, group, EXAMPLE_DIR):
+    print('Generating makefiles in "'+mk_path+'" for examples in "'+example_path+'"')
+    src_cu  = Glob("*.cu",  example_path)
+    src_cxx = Glob("*.cpp", example_path)
+    src_cu.sort();
+    src_cxx.sort();
+    src_all = src_cu + src_cxx;
+    examples_all = []
+    for s in src_all:
+        fn = os.path.splitext(os.path.basename(s));
+        t = "thrust."+group+"."+fn[0]
+        e = fn[1]
+        mkfile = example_template % {"EXAMPLE_SRC" : s, "EXAMPLE_NAME" : t}
+        f = open(os.path.join(mk_path,t+".mk"), 'w')
+        f.write(mkfile)
+        f.close()
+        examples_all.append(os.path.join(mk_path,t))
+    return examples_all
+
+
+## relpath : backported from os.relpath form python 2.6+
+def relpath(path, start):
+    """Return a relative version of a path"""
+
+    import posixpath
+    if not path:
+        raise ValueError("no path specified")
+    start_list = posixpath.abspath(start).split(posixpath.sep)
+    path_list = posixpath.abspath(path).split(posixpath.sep)
+    # Work out how much of the filepath is shared by start and path.
+    i = len(posixpath.commonprefix([start_list, path_list]))
+    rel_list = [posixpath.pardir] * (len(start_list)-i) + path_list[i:]
+    if not rel_list:
+        return posixpath.curdir
+    return posixpath.join(*rel_list)
+
+mk_path=sys.argv[1]
+REL_DIR="../../"
+if (len(sys.argv) > 2):
+    root_path=sys.argv[2];
+    mk_path = relpath(mk_path, root_path)
+    REL_DIR = relpath(root_path,mk_path)
+
+try:
+    sh.rmtree(mk_path)
+except:
+    pass
+os.makedirs(mk_path)
+
+tests_all, dependencies_all = generate_test_mk(mk_path, "testing/", "test", REL_DIR)
+tests_cu,  dependencies_cu  = generate_test_mk(mk_path, "testing/cuda/", "test.cuda", REL_DIR)
+tests_all.extend(tests_cu)
+dependencies_all.extend(dependencies_cu)
+
+testing_mk  = ""
+
+for t in tests_all:
+    testing_mk += "PROJECTS += "+t+"\n"
+testing_mk += "PROJECTS += internal/build/testframework\n"
+
+
+f = open(os.path.join(mk_path,"testing.mk"),'w')
+f.write(testing_mk)
+f.close()
+
+dependencies_mk = ""
+for d in dependencies_all:
+    dependencies_mk += d + "\n"
+
+f = open(os.path.join(mk_path,"dependencies.mk"),'w')
+f.write(dependencies_mk)
+f.close()
+
+
+examples_mk = ""
+examples_all  = generate_example_mk(mk_path, "examples/", "example", REL_DIR)
+examples_cuda = generate_example_mk(mk_path, "examples/cuda/", "example.cuda", REL_DIR)
+examples_all.extend(examples_cuda)
+for e in examples_all:
+    examples_mk += "PROJECTS += "+e+"\n"
+
+f = open(os.path.join(mk_path,"examples.mk"),'w')
+f.write(examples_mk)
+f.close()
+
+
+
+
+
+
+
+
diff --git a/thrust/internal/benchmark/README.txt b/thrust/internal/benchmark/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..73b0cc058e2460b4183274e5f48733b1f23e2493
--- /dev/null
+++ b/thrust/internal/benchmark/README.txt
@@ -0,0 +1,31 @@
+Directions for compiling and running the benchmark with Ubuntu Linux:
+
+Install Intel's Threading Building Blocks library (TBB):
+$ sudo apt-get install libtbb-dev
+
+Compile the benchmark:
+$ nvcc -O3 -arch=sm_20 bench.cu -ltbb -o bench
+
+Run the benchmark:
+$ ./bench
+
+Typical output (Tesla C2050):
+
+Benchmarking with input size 33554432
+Core Primitive Performance (elements per second)
+      Algorithm,          STL,          TBB,       Thrust
+         reduce,   3121746688,   3739585536,  26134038528
+      transform,   1869492736,   2347719424,  13804681216
+           scan,   1394143744,   1439394816,   5039195648
+           sort,     11070660,     34622352,    673543168
+Sorting Performance (keys per second)
+  Type,          STL,          TBB,       Thrust
+  char,     24050078,     62987040,   2798874368
+ short,     15644141,     41275164,   1428603008
+   int,     11062616,     33478628,    682295744
+  long,     11249874,     33972564,    219719184
+ float,      9850043,     29011806,    692407232
+double,      9700181,     27153626,    224345568
+
+The reported numbers are performance rates in "elements per second" (higher is better).
+
diff --git a/thrust/internal/benchmark/bench.cu b/thrust/internal/benchmark/bench.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e73a0d5bdf68d064fe74828e7a911fb0a6ddf5d6
--- /dev/null
+++ b/thrust/internal/benchmark/bench.cu
@@ -0,0 +1,1276 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/random.h>
+#include <thrust/shuffle.h>
+
+#include <random>
+#endif
+
+#include <algorithm>
+#include <numeric>
+
+#include <map>
+#include <string>
+#include <exception>
+
+#include <iostream>
+
+#include <cassert>
+#include <cstdlib>    // For `atoi`.
+#include <climits>    // For CHAR_BIT.
+#include <cmath>      // For `sqrt` and `abs`.
+
+#include <stdint.h>   // For `intN_t`.
+
+#include "random.h"
+#include "timer.h"
+
+#if defined(HAVE_TBB)
+  #include "tbb_algos.h"
+#endif
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+  #include <thrust/system_error.h>      // For `thrust::system_error`
+  #include <thrust/system/cuda/error.h> // For `thrust::cuda_category`
+#endif
+
+// We don't use THRUST_PP_STRINGIZE and THRUST_PP_CAT because they are new, and
+// we want this benchmark to be backwards-compatible to older versions of Thrust.
+#define PP_STRINGIZE_(expr) #expr
+#define PP_STRINGIZE(expr)  PP_STRINGIZE_(expr)
+
+#define PP_CAT(a, b) a ## b
+
+// We don't use THRUST_NOEXCEPT because it's new, and we want this benchmark to
+// be backwards-compatible to older versions of Thrust.
+#if THRUST_CPP_DIALECT >= 2011
+  #define NOEXCEPT noexcept
+#else
+  #define NOEXCEPT throw()
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct squared_difference
+{
+private:
+  T const average;
+
+public:
+  __host__ __device__
+  squared_difference(squared_difference const& rhs) : average(rhs.average) {}
+
+  __host__ __device__
+  squared_difference(T average_) : average(average_) {}
+
+  __host__ __device__
+  T operator()(T x) const
+  {
+    return (x - average) * (x - average);
+  }
+};
+
+template <typename T>
+struct value_and_count
+{
+  T           value;
+  uint64_t count;
+
+  __host__ __device__
+  value_and_count(value_and_count const& other)
+    : value(other.value), count(other.count) {}
+
+  __host__ __device__
+  value_and_count(T const& value_)
+    : value(value_), count(1) {}
+
+  __host__ __device__
+  value_and_count(T const& value_, uint64_t count_)
+    : value(value_), count(count_) {}
+
+  __host__ __device__
+  value_and_count& operator=(value_and_count const& other)
+  {
+    value = other.value;
+    count = other.count;
+    return *this;
+  }
+
+  __host__ __device__
+  value_and_count& operator=(T const& value_)
+  {
+    value = value_;
+    count = 1;
+    return *this;
+  }
+};
+
+template <typename T, typename ReduceOp>
+struct counting_op
+{
+private:
+  ReduceOp reduce;
+
+public:
+  __host__ __device__
+  counting_op() : reduce() {}
+
+  __host__ __device__
+  counting_op(counting_op const& other) : reduce(other.reduce) {}
+
+  __host__ __device__
+  counting_op(ReduceOp const& reduce_) : reduce(reduce_) {}
+
+  __host__ __device__
+  value_and_count<T> operator()(
+      value_and_count<T> const& x
+    , T const&                  y
+    ) const
+  {
+    return value_and_count<T>(reduce(x.value, y), x.count + 1);
+  }
+
+  __host__ __device__
+  value_and_count<T> operator()(
+      value_and_count<T> const& x
+    , value_and_count<T> const& y
+    ) const
+  {
+    return value_and_count<T>(reduce(x.value, y.value), x.count + y.count);
+  }
+};
+
+template <typename InputIt, typename T>
+T arithmetic_mean(InputIt first, InputIt last, T init)
+{
+  value_and_count<T> init_vc(init, 0);
+
+  counting_op<T, thrust::plus<T> > reduce_vc;
+
+  value_and_count<T> vc
+    = thrust::reduce(first, last, init_vc, reduce_vc);
+
+  return vc.value / vc.count;
+}
+
+template <typename InputIt>
+typename thrust::iterator_traits<InputIt>::value_type
+arithmetic_mean(InputIt first, InputIt last)
+{
+  typedef typename thrust::iterator_traits<InputIt>::value_type T;
+  return arithmetic_mean(first, last, T());
+}
+
+template <typename InputIt, typename T>
+T sample_standard_deviation(InputIt first, InputIt last, T average)
+{
+  value_and_count<T> init_vc(T(), 0);
+
+  counting_op<T, thrust::plus<T> > reduce_vc;
+
+  squared_difference<T> transform(average);
+
+  value_and_count<T> vc
+    = thrust::transform_reduce(first, last, transform, init_vc, reduce_vc);
+
+  return std::sqrt(vc.value / T(vc.count - 1));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Formulas for propagation of uncertainty from:
+//
+//   https://en.wikipedia.org/wiki/Propagation_of_uncertainty#Example_formulas
+//
+// Even though it's Wikipedia, I trust it as I helped write that table.
+//
+// XXX Replace with a proper reference.
+
+// Compute the propagated uncertainty from the multiplication of two uncertain
+// values, `A +/- A_unc` and `B +/- B_unc`. Given `f = AB` or `f = A/B`, where
+// `A != 0` and `B != 0`, the uncertainty in `f` is approximately:
+//
+//   f_unc = abs(f) * sqrt((A_unc / A) ^ 2 + (B_unc / B) ^ 2)
+//
+template <typename T>
+__host__ __device__
+T uncertainty_multiplicative(
+    T const& f
+  , T const& A, T const& A_unc
+  , T const& B, T const& B_unc
+    )
+{
+  return std::abs(f)
+       * std::sqrt((A_unc / A) * (A_unc / A) + (B_unc / B) * (B_unc / B));
+}
+
+// Compute the propagated uncertainty from addition of two uncertain values,
+// `A +/- A_unc` and `B +/- B_unc`. Given `f = cA + dB` (where `c` and `d` are
+// certain constants), the uncertainty in `f` is approximately:
+//
+//   f_unc = sqrt(c ^ 2 * A_unc ^ 2 + d ^ 2 * B_unc ^ 2)
+//
+template <typename T>
+__host__ __device__
+T uncertainty_additive(
+    T const& c, T const& A_unc
+  , T const& d, T const& B_unc
+    )
+{
+  return std::sqrt((c * c * A_unc * A_unc) + (d * d * B_unc * B_unc));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Return the significant digit of `x`. The result is the number of digits
+// after the decimal place to round to (negative numbers indicate rounding
+// before the decimal place)
+template <typename T>
+int find_significant_digit(T x)
+{
+  if (x == T(0)) return T(0);
+  return -int(std::floor(std::log10(std::abs(x))));
+}
+
+// Round `x` to `ndigits` after the decimal place (Python-style).
+template <typename T, typename N>
+T round_to_precision(T x, N ndigits)
+{
+  double m = (x < 0.0) ? -1.0 : 1.0;
+  double pwr = std::pow(T(10.0), ndigits);
+  return (std::floor(x * m * pwr + 0.5) / pwr) * m;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void print_experiment_header()
+{ // {{{
+  std::cout << "Thrust Version"
+    << ","  << "Algorithm"
+    << ","  << "Element Type"
+    << ","  << "Element Size"
+    << ","  << "Elements per Trial"
+    << ","  << "Total Input Size"
+    << ","  << "STL Trials"
+    << ","  << "STL Average Walltime"
+    << ","  << "STL Walltime Uncertainty"
+    << ","  << "STL Average Throughput"
+    << ","  << "STL Throughput Uncertainty"
+    << ","  << "Thrust Trials"
+    << ","  << "Thrust Average Walltime"
+    << ","  << "Thrust Walltime Uncertainty"
+    << ","  << "Thrust Average Throughput"
+    << ","  << "Thrust Throughput Uncertainty"
+    #if defined(HAVE_TBB)
+    << ","  << "TBB Trials"
+    << ","  << "TBB Average Walltime"
+    << ","  << "TBB Walltime Uncertainty"
+    << ","  << "TBB Average Throughput"
+    << ","  << "TBB Throughput Uncertainty"
+    #endif
+    << std::endl;
+
+  std::cout << ""                // Thrust Version.
+    << ","  << ""                // Algorithm.
+    << ","  << ""                // Element Type.
+    << ","  << "bits/element"    // Element Size.
+    << ","  << "elements"        // Elements per Trial.
+    << ","  << "MiBs"            // Total Input Size.
+    << ","  << "trials"          // STL Trials.
+    << ","  << "secs"            // STL Average Walltime.
+    << ","  << "secs"            // STL Walltime Uncertainty.
+    << ","  << "elements/sec"    // STL Average Throughput.
+    << ","  << "elements/sec"    // STL Throughput Uncertainty.
+    << ","  << "trials"          // Thrust Trials.
+    << ","  << "secs"            // Thrust Average Walltime.
+    << ","  << "secs"            // Thrust Walltime Uncertainty.
+    << ","  << "elements/sec"    // Thrust Average Throughput.
+    << ","  << "elements/sec"    // Thrust Throughput Uncertainty.
+    #if defined(HAVE_TBB)
+    << ","  << "trials"          // TBB Trials.
+    << ","  << "secs"            // TBB Average Walltime.
+    << ","  << "secs"            // TBB Walltime Uncertainty.
+    << ","  << "elements/sec"    // TBB Average Throughput.
+    << ","  << "elements/sec"    // TBB Throughput Uncertainty.
+    #endif
+    << std::endl;
+} // }}}
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct experiment_results
+{
+  double const average_time; // Arithmetic mean of trial times in seconds.
+  double const stdev_time;   // Sample standard deviation of trial times.
+
+  experiment_results(double average_time_, double stdev_time_)
+    : average_time(average_time_), stdev_time(stdev_time_) {}
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType // Has an embedded typedef `type,
+                                              // and a static method `name` that
+                                              // returns a char const*.
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+struct experiment_driver
+{
+  typedef typename ElementMetaType::type element_type;
+
+  static char const* const test_name;
+  static char const* const element_type_name; // Element type name as a string.
+
+  static uint64_t const elements;             // # of elements per trial.
+  static uint64_t const element_size;         // Size of each element in bits.
+  static double   const input_size;           // `elements` * `element_size` in MiB.
+  static uint64_t const baseline_trials;      // # of baseline trials per experiment.
+  static uint64_t const regular_trials;       // # of regular trials per experiment.
+
+  static void run_experiment()
+  { // {{{
+    experiment_results stl    = std_experiment();
+    experiment_results thrust = thrust_experiment();
+    #if defined(HAVE_TBB)
+    experiment_results tbb    = tbb_experiment();
+    #endif
+
+    double stl_average_walltime    = stl.average_time;
+    double thrust_average_walltime = thrust.average_time;
+    #if defined(HAVE_TBB)
+    double tbb_average_walltime    = tbb.average_time;
+    #endif
+
+    double stl_average_throughput    = elements / stl.average_time;
+    double thrust_average_throughput = elements / thrust.average_time;
+    #if defined(HAVE_TBB)
+    double tbb_average_throughput    = elements / tbb.average_time;
+    #endif
+
+    double stl_walltime_uncertainty    = stl.stdev_time;
+    double thrust_walltime_uncertainty = thrust.stdev_time;
+    #if defined(HAVE_TBB)
+    double tbb_walltime_uncertainty    = tbb.stdev_time;
+    #endif
+
+    double stl_throughput_uncertainty    = uncertainty_multiplicative(
+        stl_average_throughput
+      , double(elements), 0.0
+      , stl_average_walltime, stl_walltime_uncertainty
+    );
+    double thrust_throughput_uncertainty = uncertainty_multiplicative(
+        thrust_average_throughput
+      , double(elements), 0.0
+      , thrust_average_walltime, thrust_walltime_uncertainty
+    );
+
+    #if defined(HAVE_TBB)
+    double tbb_throughput_uncertainty    = uncertainty_multiplicative(
+        tbb_average_throughput
+      , double(elements), 0.0
+      , tbb_average_walltime, tbb_walltime_uncertainty
+    );
+    #endif
+
+    // Round the average walltime and walltime uncertainty to the
+    // significant figure of the walltime uncertainty.
+    int stl_walltime_precision = std::max(
+        find_significant_digit(stl.average_time)
+      , find_significant_digit(stl.stdev_time)
+    );
+    int thrust_walltime_precision = std::max(
+        find_significant_digit(thrust.average_time)
+      , find_significant_digit(thrust.stdev_time)
+    );
+    #if defined(HAVE_TBB)
+    int tbb_walltime_precision = std::max(
+        find_significant_digit(tbb.average_time)
+      , find_significant_digit(tbb.stdev_time)
+    );
+    #endif
+
+    stl_average_walltime = round_to_precision(
+        stl_average_walltime, stl_walltime_precision
+    );
+    thrust_average_walltime = round_to_precision(
+        thrust_average_walltime, thrust_walltime_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_average_walltime = round_to_precision(
+        tbb_average_walltime, tbb_walltime_precision
+    );
+    #endif
+
+    stl_walltime_uncertainty = round_to_precision(
+        stl_walltime_uncertainty, stl_walltime_precision
+    );
+    thrust_walltime_uncertainty = round_to_precision(
+        thrust_walltime_uncertainty, thrust_walltime_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_walltime_uncertainty = round_to_precision(
+        tbb_walltime_uncertainty, tbb_walltime_precision
+    );
+    #endif
+
+    // Round the average throughput and throughput uncertainty to the
+    // significant figure of the throughput uncertainty.
+    int stl_throughput_precision = std::max(
+        find_significant_digit(stl_average_throughput)
+      , find_significant_digit(stl_throughput_uncertainty)
+    );
+    int thrust_throughput_precision = std::max(
+        find_significant_digit(thrust_average_throughput)
+      , find_significant_digit(thrust_throughput_uncertainty)
+    );
+    #if defined(HAVE_TBB)
+    int tbb_throughput_precision = std::max(
+        find_significant_digit(tbb_average_throughput)
+      , find_significant_digit(tbb_throughput_uncertainty)
+    );
+    #endif
+
+    stl_average_throughput = round_to_precision(
+        stl_average_throughput, stl_throughput_precision
+    );
+    thrust_average_throughput = round_to_precision(
+        thrust_average_throughput, thrust_throughput_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_average_throughput = round_to_precision(
+        tbb_average_throughput, tbb_throughput_precision
+    );
+    #endif
+
+    stl_throughput_uncertainty = round_to_precision(
+        stl_throughput_uncertainty, stl_throughput_precision
+    );
+    thrust_throughput_uncertainty = round_to_precision(
+        thrust_throughput_uncertainty, thrust_throughput_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_throughput_uncertainty = round_to_precision(
+        tbb_throughput_uncertainty, tbb_throughput_precision
+    );
+    #endif
+
+    std::cout << THRUST_VERSION                // Thrust Version.
+      << ","  << test_name                     // Algorithm.
+      << ","  << element_type_name             // Element Type.
+      << ","  << element_size                  // Element Size.
+      << ","  << elements                      // Elements per Trial.
+      << ","  << input_size                    // Total Input Size.
+      << ","  << baseline_trials               // STL Trials.
+      << ","  << stl_average_walltime          // STL Average Walltime.
+      << ","  << stl_walltime_uncertainty      // STL Walltime Uncertainty.
+      << ","  << stl_average_throughput        // STL Average Throughput.
+      << ","  << stl_throughput_uncertainty    // STL Throughput Uncertainty.
+      << ","  << regular_trials                // Thrust Trials.
+      << ","  << thrust_average_walltime       // Thrust Average Walltime.
+      << ","  << thrust_walltime_uncertainty   // Thrust Walltime Uncertainty.
+      << ","  << thrust_average_throughput     // Thrust Average Throughput.
+      << ","  << thrust_throughput_uncertainty // Thrust Throughput Uncertainty.
+      #if defined(HAVE_TBB)
+      << ","  << regular_trials                // TBB Trials.
+      << ","  << tbb_average_walltime          // TBB Average Walltime.
+      << ","  << tbb_walltime_uncertainty      // TBB Walltime Uncertainty.
+      << ","  << tbb_average_throughput        // TBB Average Throughput.
+      << ","  << tbb_throughput_uncertainty    // TBB Throughput Uncertainty.
+      #endif
+      << std::endl;
+  } // }}}
+
+private:
+  static experiment_results std_experiment()
+  {
+    return experiment<typename Test<element_type>::std_trial>();
+  }
+
+  static experiment_results thrust_experiment()
+  {
+    return experiment<typename Test<element_type>::thrust_trial>();
+  }
+
+  #if defined(HAVE_TBB)
+  static experiment_results tbb_experiment()
+  {
+    return experiment<typename Test<element_type>::tbb_trial>();
+  }
+  #endif
+
+  template <typename Trial>
+  static experiment_results experiment()
+  { // {{{
+    Trial trial;
+
+    // Allocate storage and generate random input for the warmup trial.
+    trial.setup(elements);
+
+    // Warmup trial.
+    trial();
+
+    uint64_t const trials
+      = trial.is_baseline() ? baseline_trials : regular_trials;
+
+    std::vector<double> times;
+    times.reserve(trials);
+
+    for (uint64_t t = 0; t < trials; ++t)
+    {
+      // Generate random input for next trial.
+      trial.setup(elements);
+
+      steady_timer e;
+
+      // Benchmark.
+      e.start();
+      trial();
+      e.stop();
+
+      times.push_back(e.seconds_elapsed());
+    }
+
+    double average_time
+      = arithmetic_mean(times.begin(), times.end());
+
+    double stdev_time
+      = sample_standard_deviation(times.begin(), times.end(), average_time);
+
+    return experiment_results(average_time, stdev_time);
+  } // }}}
+};
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+char const* const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::test_name
+  = Test<typename ElementMetaType::type>::test_name();
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+char const* const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::element_type_name
+  = ElementMetaType::name();
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::element_size
+  = CHAR_BIT * sizeof(typename ElementMetaType::type);
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::elements
+  = Elements;
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+double const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::input_size
+  = double( Elements /* [elements] */
+          * sizeof(typename ElementMetaType::type) /* [bytes/element] */
+          )
+  / double(1024 * 1024 /* [bytes/MiB] */);
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::baseline_trials
+  = BaselineTrials;
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::regular_trials
+  = RegularTrials;
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Never create variables, pointers or references of any of the `*_trial_base`
+// classes. They are purely mixin base classes and do not have vtables and
+// virtual destructors. Using them for polymorphism instead of composition will
+// probably cause slicing.
+
+struct baseline_trial {};
+struct regular_trial {};
+
+template <typename TrialKind = regular_trial>
+struct trial_base;
+
+template <>
+struct trial_base<baseline_trial>
+{
+  static bool is_baseline() { return true; }
+};
+
+template <>
+struct trial_base<regular_trial>
+{
+  static bool is_baseline() { return false; }
+};
+
+template <typename Container, typename TrialKind = regular_trial>
+struct inplace_trial_base : trial_base<TrialKind>
+{
+  Container input;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+
+    randomize(input);
+  }
+};
+
+template <typename Container, typename TrialKind = regular_trial>
+struct copy_trial_base : trial_base<TrialKind>
+{
+  Container input;
+  Container output;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+    output.resize(elements);
+
+    randomize(input);
+  }
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename Container, typename TrialKind = regular_trial>
+struct shuffle_trial_base : trial_base<TrialKind>
+{
+  Container input;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+
+    randomize(input);
+  }
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct reduce_tester
+{
+  static char const* test_name() { return "reduce"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      if (std::accumulate(this->input.begin(), this->input.end(), T(0)) == 0)
+        // Prevent optimizer from removing body.
+        std::cout << "xyz";
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::reduce(this->input.begin(), this->input.end());
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_reduce(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct sort_tester
+{
+  static char const* test_name() { return "sort"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::sort(this->input.begin(), this->input.end());
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::sort(this->input.begin(), this->input.end());
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_sort(this->input);
+    }
+  }
+  #endif
+};
+
+
+template <typename T>
+struct transform_inplace_tester
+{
+  static char const* test_name() { return "transform_inplace"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::transform(
+          this->input.begin(), this->input.end(), this->input.begin()
+        , thrust::negate<T>()
+      );
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::transform(
+          this->input.begin(), this->input.end(), this->input.begin()
+        , thrust::negate<T>()
+      );
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_transform(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct inclusive_scan_inplace_tester
+{
+  static char const* test_name() { return "inclusive_scan_inplace"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::partial_sum(
+          this->input.begin(), this->input.end(), this->input.begin()
+      );
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::inclusive_scan(
+          this->input.begin(), this->input.end(), this->input.begin()
+      );
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_scan(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct copy_tester
+{
+  static char const* test_name() { return "copy"; }
+
+  struct std_trial : copy_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      std::copy(this->input.begin(), this->input.end(), this->output.begin());
+    }
+  };
+
+  struct thrust_trial : copy_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::copy(this->input.begin(), this->input.end(), this->input.begin());
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : copy_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_copy(this->input, this->output);
+    }
+  };
+  #endif
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename T>
+struct shuffle_tester
+{
+  static char const* test_name() { return "shuffle"; }
+
+  struct std_trial : shuffle_trial_base<std::vector<T>, baseline_trial>
+  {
+    std::default_random_engine g;
+    void operator()()
+    {
+      std::shuffle(this->input.begin(), this->input.end(), this->g);
+    }
+  };
+
+  struct thrust_trial : shuffle_trial_base<thrust::device_vector<T> >
+  {
+    thrust::default_random_engine g;
+    void operator()()
+    {
+      thrust::shuffle(this->input.begin(), this->input.end(), this->g);
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename ElementMetaType
+  , uint64_t Elements
+  , uint64_t BaselineTrials
+  , uint64_t RegularTrials
+>
+void run_core_primitives_experiments_for_type()
+{
+  experiment_driver<
+      reduce_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+    transform_inplace_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      inclusive_scan_inplace_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      sort_tester
+    , ElementMetaType
+//    , Elements / sizeof(typename ElementMetaType::type)
+    , (Elements >> 6) // Sorting is more sensitive to element count than
+                      // memory footprint.
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      copy_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+#if THRUST_CPP_DIALECT >= 2011
+  experiment_driver<
+      shuffle_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define DEFINE_ELEMENT_META_TYPE(T)                       \
+  struct PP_CAT(T, _meta)                                 \
+  {                                                       \
+    typedef T type;                                       \
+                                                          \
+    static char const* name() { return PP_STRINGIZE(T); } \
+  };                                                      \
+  /**/
+
+DEFINE_ELEMENT_META_TYPE(char);
+DEFINE_ELEMENT_META_TYPE(int);
+DEFINE_ELEMENT_META_TYPE(int8_t);
+DEFINE_ELEMENT_META_TYPE(int16_t);
+DEFINE_ELEMENT_META_TYPE(int32_t);
+DEFINE_ELEMENT_META_TYPE(int64_t);
+DEFINE_ELEMENT_META_TYPE(float);
+DEFINE_ELEMENT_META_TYPE(double);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    uint64_t Elements
+  , uint64_t BaselineTrials
+  , uint64_t RegularTrials
+>
+void run_core_primitives_experiments()
+{
+  run_core_primitives_experiments_for_type<
+    char_meta,    Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int_meta,     Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int8_t_meta,  Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int16_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int32_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int64_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    float_meta,   Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    double_meta,  Elements, BaselineTrials, RegularTrials
+  >();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// XXX Use `std::string_view` when possible.
+std::vector<std::string> split(std::string const& str, std::string const& delim)
+{
+  std::vector<std::string> tokens;
+  std::string::size_type prev = 0, pos = 0;
+  do
+  {
+    pos = str.find(delim, prev);
+    if (pos == std::string::npos) pos = str.length();
+    std::string token = str.substr(prev, pos - prev);
+    if (!token.empty()) tokens.push_back(token);
+    prev = pos + delim.length();
+  }
+  while (pos < str.length() && prev < str.length());
+  return tokens;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct command_line_option_error : std::exception
+{
+  virtual ~command_line_option_error() NOEXCEPT {}
+  virtual const char* what() const NOEXCEPT = 0;
+};
+
+struct only_one_option_allowed : command_line_option_error
+{
+  // Construct a new `only_one_option_allowed` exception. `key` is the
+  // option name and `[first, last)` is a sequence of
+  // `std::pair<std::string const, std::string>`s (the values).
+  template <typename InputIt>
+  only_one_option_allowed(std::string const& key, InputIt first, InputIt last)
+    : message()
+  {
+    message  = "Only one `--";
+    message += key;
+    message += "` option is allowed, but multiple were received: ";
+
+    for (; first != last; ++first)
+    {
+      message += "`";
+      message += (*first).second;
+      message += "` ";
+    }
+
+    // Remove the trailing space added by the last iteration of the above loop.
+    message.erase(message.size() - 1, 1);
+
+    message += ".";
+  }
+
+  virtual ~only_one_option_allowed() NOEXCEPT {}
+
+  virtual const char* what() const NOEXCEPT
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+struct required_option_missing : command_line_option_error
+{
+  // Construct a new `requirement_option_missing` exception. `key` is the
+  // option name.
+  required_option_missing(std::string const& key)
+    : message()
+  {
+    message  = "`--";
+    message += key;
+    message += "` option is required.";
+  }
+
+  virtual ~required_option_missing() NOEXCEPT {}
+
+  virtual const char* what() const NOEXCEPT
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+struct command_line_processor
+{
+  typedef std::vector<std::string> positional_options_type;
+
+  typedef std::multimap<std::string, std::string> keyword_options_type;
+
+  typedef std::pair<
+    keyword_options_type::const_iterator
+  , keyword_options_type::const_iterator
+  > keyword_option_values;
+
+  command_line_processor(int argc, char** argv)
+    : pos_args(), kw_args()
+  { // {{{
+    for (int i = 1; i < argc; ++i)
+    {
+      std::string arg(argv[i]);
+
+      // Look for --key or --key=value options.
+      if (arg.substr(0, 2) == "--")
+      {
+        std::string::size_type n = arg.find('=', 2);
+
+        keyword_options_type::value_type key_value;
+
+        if (n == std::string::npos) // --key
+          kw_args.insert(keyword_options_type::value_type(
+            arg.substr(2), ""
+          ));
+        else                        // --key=value
+          kw_args.insert(keyword_options_type::value_type(
+            arg.substr(2, n - 2), arg.substr(n + 1)
+          ));
+
+        kw_args.insert(key_value);
+      }
+      else // Assume it's positional.
+        pos_args.push_back(arg);
+    }
+  } // }}}
+
+  // Return the value for option `key`.
+  //
+  // Throws:
+  // * `only_one_option_allowed` if there is more than one value for `key`.
+  // * `required_option_missing` if there is no value for `key`.
+  std::string operator()(std::string const& key) const
+  {
+    keyword_option_values v = kw_args.equal_range(key);
+
+    keyword_options_type::difference_type d = std::distance(v.first, v.second);
+
+    if      (1 < d)  // Too many options.
+      throw only_one_option_allowed(key, v.first, v.second);
+    else if (0 == d) // No option.
+      throw required_option_missing(key);
+
+    return (*v.first).second;
+  }
+
+  // Return the value for option `key`, or `dflt` if `key` has no value.
+  //
+  // Throws: `only_one_option_allowed` if there is more than one value for `key`.
+  std::string operator()(std::string const& key, std::string const& dflt) const
+  {
+    keyword_option_values v = kw_args.equal_range(key);
+
+    keyword_options_type::difference_type d = std::distance(v.first, v.second);
+
+    if (1 < d)  // Too many options.
+      throw only_one_option_allowed(key, v.first, v.second);
+
+    if (0 == d) // No option.
+      return dflt;
+    else        // 1 option.
+      return (*v.first).second;
+  }
+
+  // Returns `true` if the option `key` was specified at least once.
+  bool has(std::string const& key) const
+  {
+    return kw_args.count(key) > 0;
+  }
+
+private:
+  positional_options_type pos_args;
+  keyword_options_type    kw_args;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv)
+{
+  command_line_processor clp(argc, argv);
+
+  #if defined(HAVE_TBB)
+  tbb::task_scheduler_init init;
+
+  test_tbb();
+  #endif
+
+  #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    // Set the CUDA device to use for the benchmark - `0` by default.
+
+    int device = std::atoi(clp("device", "0").c_str());
+    // `std::atoi` returns 0 if the conversion fails.
+
+    cudaSetDevice(device);
+  #endif
+
+  if (!clp.has("no-header"))
+    print_experiment_header();
+
+                                          /* Elements |       Trials       */
+                                          /*          | Baseline | Regular */
+//run_core_primitives_experiments< 1LLU << 21LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 22LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 23LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 24LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 25LLU      , 4        , 16      >();
+  run_core_primitives_experiments< 1LLU << 26LLU      , 4        , 16      >();
+  run_core_primitives_experiments< 1LLU << 27LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 28LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 29LLU      , 4        , 16      >();
+
+  return 0;
+}
+
+// TODO: Add different input sizes and half precision
diff --git a/thrust/internal/benchmark/bench.mk b/thrust/internal/benchmark/bench.mk
new file mode 100644
index 0000000000000000000000000000000000000000..2a5c002bcbc352f2be18fbe294d37ced15201b53
--- /dev/null
+++ b/thrust/internal/benchmark/bench.mk
@@ -0,0 +1,20 @@
+# XXX Use the common Thrust Makefiles instead of this.
+
+EXECUTABLE := bench
+BUILD_SRC  := $(ROOTDIR)/thrust/internal/benchmark/bench.cu
+
+ifeq ($(OS),Linux)
+  LIBRARIES += m
+endif
+
+# XXX Why is this needed?
+ifeq ($(OS),Linux)
+  ifeq ($(ABITYPE), androideabi)
+    override ALL_SASS_ARCHITECTURES := 32
+  endif
+endif
+
+ARCH_NEG_FILTER += 20 21
+
+include $(ROOTDIR)/thrust/internal/build/common_detect.mk
+include $(ROOTDIR)/thrust/internal/build/common_build.mk
diff --git a/thrust/internal/benchmark/combine_benchmark_results.py b/thrust/internal/benchmark/combine_benchmark_results.py
new file mode 100755
index 0000000000000000000000000000000000000000..f82b21f80a1eadbb16e0e8c27cbbc9d64d268fa7
--- /dev/null
+++ b/thrust/internal/benchmark/combine_benchmark_results.py
@@ -0,0 +1,817 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2012-7 Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+###############################################################################
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+# XXX Put code shared with `compare_benchmark_results.py` in a common place.
+
+# XXX Relative uncertainty.
+
+from sys import exit, stdout
+
+from os.path import splitext
+
+from itertools import imap # Lazy map.
+
+from math import sqrt, log10, floor
+
+from collections import deque
+
+from argparse import ArgumentParser as argument_parser
+
+from csv import DictReader as csv_dict_reader
+from csv import DictWriter as csv_dict_writer
+
+from re import compile as regex_compile
+
+###############################################################################
+
+def unpack_tuple(f):
+  """Return a unary function that calls `f` with its argument unpacked."""
+  return lambda args: f(*iter(args))
+
+def strip_dict(d):
+  """Strip leading and trailing whitespace from all keys and values in `d`."""
+  d.update({key: value.strip() for (key, value) in d.items()})
+
+def merge_dicts(d0, d1):
+  """Create a new `dict` that is the union of `dict`s `d0` and `d1`."""
+  d = d0.copy()
+  d.update(d1)
+  return d
+
+def strip_list(l):
+  """Strip leading and trailing whitespace from all values in `l`."""
+  for i, value in enumerate(l): l[i] = value.strip()
+
+###############################################################################
+
+def int_or_float(x):
+  """Convert `x` to either `int` or `float`, preferring `int`.
+
+  Raises:
+    ValueError : If `x` is not convertible to either `int` or `float`
+  """
+  try:
+    return int(x)
+  except ValueError:
+    return float(x)
+
+def try_int_or_float(x):
+  """Try to convert `x` to either `int` or `float`, preferring `int`. `x` is
+  returned unmodified if conversion fails.
+  """
+  try:
+    return int_or_float(x)
+  except ValueError:
+    return x
+
+###############################################################################
+
+def find_significant_digit(x):
+  """Return the significant digit of the number x. The result is the number of
+  digits after the decimal place to round to (negative numbers indicate rounding
+  before the decimal place)."""
+  if x == 0: return 0
+  return -int(floor(log10(abs(x))))
+
+def round_with_int_conversion(x, ndigits = None):
+  """Rounds `x` to `ndigits` after the the decimal place. If `ndigits` is less
+  than 1, convert the result to `int`. If `ndigits` is `None`, the significant
+  digit of `x` is used."""
+  if ndigits is None: ndigits = find_significant_digit(x)
+  x_rounded = round(x, ndigits)
+  return int(x_rounded) if ndigits < 1 else x_rounded
+
+###############################################################################
+
+class measured_variable(object):
+  """A meta-variable representing measured data. It is composed of three raw
+  variables plus units meta-data.
+
+  Attributes:
+    quantity (`str`) :
+      Name of the quantity variable of this object.
+    uncertainty (`str`) :
+      Name of the uncertainty variable of this object.
+    sample_size (`str`) :
+      Name of the sample size variable of this object.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+class measured_value(object):
+  """An object that represents a value determined by multiple measurements.
+
+  Attributes:
+    quantity (scalar) :
+      The quantity of the value, e.g. the arithmetic mean.
+    uncertainty (scalar) :
+      The measurement uncertainty, e.g. the sample standard deviation.
+    sample_size (`int`) :
+      The number of observations contributing to the value.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size = 1, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+###############################################################################
+
+def arithmetic_mean(X):
+  """Computes the arithmetic mean of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+
+  .. math::
+
+    u = \frac{\sum_{i = 0}^{n - 1} X_i}{n}
+  """
+  return sum(X) / len(X)
+
+def sample_variance(X, u = None):
+  """Computes the sample variance of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    v = \frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  return sum(imap(lambda X_i: (X_i - u) ** 2, X)) / (len(X) - 1)
+ 
+def sample_standard_deviation(X, u = None, v = None):
+  """Computes the sample standard deviation of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+    v (number)     : The sample variance of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  if v is None: v = sample_variance(X, u)
+  return sqrt(v)
+
+def combine_sample_size(As):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+
+  .. math::
+
+    n = \sum{i = 0}^{g - 1} n_i
+  """
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i), As))
+
+def combine_arithmetic_mean(As, n = None):
+  """Computes the combined arithmetic mean of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+
+  .. math::
+
+    u = \frac{\sum{i = 0}^{g - 1} n_i u_i}{n}
+  """
+  if n is None: n = combine_sample_size(As)
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i * u_i), As)) / n
+  
+def combine_sample_variance(As, n = None, u = None):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+
+  .. math::
+
+    v = \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  return sum(imap(unpack_tuple(
+    lambda u_i, s_i, n_i, t_i: n_i * (u_i - u) ** 2 + (s_i ** 2) * (n_i - 1)
+  ), As)) / (n - 1)
+
+def combine_sample_standard_deviation(As, n = None, u = None, v = None):
+  """Computes the combined sample standard deviation of a group of
+  `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+    v (number)                           : The combined sample variance of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  if v is None: v = combine_sample_variance(As, n, u)
+  return sqrt(v)
+
+###############################################################################
+
+def process_program_arguments():
+  ap = argument_parser(
+    description = (
+      "Aggregates the results of multiple runs of benchmark results stored in "
+      "CSV format."
+    )
+  )
+
+  ap.add_argument(
+    "-d", "--dependent-variable",
+    help = ("Treat the specified three variables as a dependent variable. The "
+            "1st variable is the measured quantity, the 2nd is the uncertainty "
+            "of the measurement and the 3rd is the sample size. The defaults "
+            "are the dependent variables of Thrust's benchmark suite. May be "
+            "specified multiple times."),
+    action = "append", type = str, dest = "dependent_variables",
+    metavar = "QUANTITY,UNCERTAINTY,SAMPLES"
+  )
+
+  ap.add_argument(
+    "-p", "--preserve-whitespace",
+    help = ("Don't trim leading and trailing whitespace from each CSV cell."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "-o", "--output-file",
+    help = ("The file that results are written to. If `-`, results are "
+            "written to stdout."),
+    action = "store", type = str, default = "-",
+    metavar = "OUTPUT"
+  )
+
+  ap.add_argument(
+    "input_files",
+    help = ("Input CSV files. The first two rows should be a header. The 1st "
+            "header row specifies the name of each variable, and the 2nd "
+            "header row specifies the units for that variable."),
+    type = str, nargs = "+",
+    metavar = "INPUTS"
+  )
+
+  return ap.parse_args()
+
+###############################################################################
+
+def filter_comments(f, s = "#"):
+  """Return an iterator to the file `f` which filters out all lines beginning
+  with `s`."""
+  return filter(lambda line: not line.startswith(s), f)
+
+###############################################################################
+
+class io_manager(object):
+  """Manages I/O operations and represents the input data as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`. It can be used with `with`.
+
+  Attributes:
+    preserve_whitespace (`bool`) :
+      If `False`, leading and trailing whitespace is stripped from each CSV cell.
+    writer (`csv_dict_writer`) :
+      CSV writer object that the output is written to.
+    output_file (`file` or `stdout`) :
+      The output `file` object.
+    readers (`list` of `csv_dict_reader`s) :
+      List of input files as CSV reader objects.
+    input_files (list of `file`s) :
+      List of input `file` objects.
+    variable_names (`list` of `str`s) :
+      Names of the variables, in order. 
+    variable_units (`list` of `str`s) :
+      Units of the variables, in order. 
+  """
+
+  def __init__(self, input_files, output_file, preserve_whitespace = True):
+    """Read input files and open the output file and construct a new `io_manager`
+    object.
+
+    If `preserve_whitespace` is `False`, leading and trailing whitespace is
+    stripped from each CSV cell.
+
+    Raises
+      AssertionError :
+        If `len(input_files) <= 0` or `type(preserve_whitespace) != bool`.
+    """
+    assert len(input_files) > 0, "No input files provided."
+
+    assert type(preserve_whitespace) == bool
+
+    self.preserve_whitespace = preserve_whitespace
+
+    self.readers = deque()
+
+    self.variable_names = None
+    self.variable_units = None
+
+    self.input_files = deque()
+
+    for input_file in input_files:
+      input_file_object = open(input_file)
+      reader = csv_dict_reader(filter_comments(input_file_object))
+
+      if not self.preserve_whitespace:
+        strip_list(reader.fieldnames)
+
+      if self.variable_names is None:
+        self.variable_names = reader.fieldnames
+      else:
+        # Make sure all inputs have the same schema.
+        assert self.variable_names == reader.fieldnames,                      \
+          "Input file (`" + input_file + "`) variable schema `"             + \
+          str(reader.fieldnames) + "` does not match the variable schema `" + \
+          str(self.variable_names) + "`."
+
+      # Consume the next row, which should be the second line of the header.
+      variable_units = reader.next()
+
+      if not self.preserve_whitespace:
+        strip_dict(variable_units)
+
+      if self.variable_units is None:
+        self.variable_units = variable_units
+      else:
+        # Make sure all inputs have the same units schema.
+        assert self.variable_units == variable_units,                         \
+          "Input file (`" + input_file + "`) units schema `"                + \
+          str(variable_units) + "` does not match the units schema `"       + \
+          str(self.variable_units) + "`."
+
+      self.readers.append(reader)
+      self.input_files.append(input_file_object)
+ 
+    if   output_file == "-": # Output to stdout.
+      self.output_file = stdout
+    else:                    # Output to user-specified file.
+      self.output_file = open(output_file, "w")
+
+    self.writer = csv_dict_writer(
+      self.output_file, fieldnames = self.variable_names
+    )
+
+  def __enter__(self):
+    """Called upon entering a `with` statement."""
+    return self
+
+  def __exit__(self, *args):
+    """Called upon exiting a `with` statement."""
+    if   self.output_file is stdout:
+      self.output_file = None
+    elif self.output_file is not None:
+      self.output_file.__exit__(*args)
+
+    for input_file in self.input_files:
+      input_file.__exit__(*args)
+
+  #############################################################################
+  # Input Stream.
+
+  def __iter__(self):
+    """Return an iterator to the input sequence.
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def next(self):
+    """Consume and return the next record (a `dict` representing a CSV row) in
+    the input.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration : If there is no more input.
+    """
+    if len(self.readers) == 0:
+      raise StopIteration()
+
+    try:
+      row = self.readers[0].next()
+      if not self.preserve_whitespace: strip_dict(row)
+      return row
+    except StopIteration:
+      # The current reader is empty, so pop it, pop it's input file, close the
+      # input file, and then call ourselves again. 
+      self.readers.popleft()
+      self.input_files.popleft().close()
+      return self.next()
+
+  #############################################################################
+  # Output.
+
+  def write_header(self):
+    """Write the header for the output CSV file."""
+    # Write the first line of the header.
+    self.writer.writeheader()
+
+    # Write the second line of the header.
+    self.writer.writerow(self.variable_units)
+
+  def write(self, d):
+    """Write a record (a `dict`) to the output CSV file."""
+    self.writer.writerow(d)
+
+###############################################################################
+
+class dependent_variable_parser(object):
+  """Parses a `--dependent-variable=AVG,STDEV,TRIALS` command line argument."""
+
+  #############################################################################
+  # Grammar
+
+  # Parse a variable_name.
+  variable_name_rule = r'[^,]+'
+
+  # Parse a variable classification.        
+  dependent_variable_rule = r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'
+
+  engine = regex_compile(dependent_variable_rule)
+
+  #############################################################################
+
+  def __call__(self, s):
+    """Parses the string `s` with the form "AVG,STDEV,TRIALS".
+
+    Returns:
+      A `measured_variable`. 
+
+    Raises:
+      AssertionError : If parsing fails.
+    """
+
+    match = self.engine.match(s)
+
+    assert match is not None,                                          \
+      "Dependent variable (-d) `" +s+ "` is invalid, the format is " + \
+      "`AVG,STDEV,TRIALS`."
+
+    return measured_variable(match.group(1), match.group(2), match.group(3))
+
+###############################################################################
+
+class record_aggregator(object):
+  """Consumes and combines records and represents the result as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`.
+
+  Attributes:
+    dependent_variables (`list` of `measured_variable`s) :
+      A list of dependent variables provided on the command line.
+    dataset (`dict`) :
+      A mapping of distinguishing (e.g. control + independent) values (`tuple`s
+      of variable-quantity pairs) to `list`s of dependent values (`dict`s from 
+      variables to lists of cells).
+    in_order_dataset_keys :
+      A list of unique dataset keys (e.g. distinguishing variables) in order of
+      appearance.
+  """
+
+  parse_dependent_variable = dependent_variable_parser()
+
+  def __init__(self, raw_dependent_variables):
+    """Parse dependent variables and construct a new `record_aggregator` object.
+
+    Raises:
+      AssertionError : If parsing of dependent variables fails.
+    """
+    self.dependent_variables = []
+
+    if raw_dependent_variables is not None:
+      for variable in raw_dependent_variables:
+        self.dependent_variables.append(self.parse_dependent_variable(variable))
+
+    self.dataset = {}
+
+    self.in_order_dataset_keys = deque()
+
+  #############################################################################
+  # Insertion.
+
+  def append(self, record):
+    """Add `record` to the dataset.
+
+    Raises:
+      ValueError : If any `str`-to-numeric conversions fail.
+    """
+    # The distinguishing variables are the control and independent variables.
+    # They form the key for each record in the dataset. Records with the same
+    # distinguishing variables are treated as observations of the same data
+    # point.
+    dependent_values = {}
+
+    # To allow the same sample size variable to be used for multiple dependent
+    # variables, we don't pop sample size variables until we're done processing
+    # all variables.
+    sample_size_variables = []
+
+    # Separate the dependent values from the distinguishing variables and
+    # perform `str`-to-numeric conversions.
+    for variable in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = variable.as_tuple()
+
+      dependent_values[quantity]    = [int_or_float(record.pop(quantity))]
+      dependent_values[uncertainty] = [int_or_float(record.pop(uncertainty))]
+      dependent_values[sample_size] = [int(record[sample_size])]
+
+      sample_size_variables.append(sample_size)
+
+    # Pop sample size variables.
+    for sample_size_variable in sample_size_variables:
+      # Allowed to fail, as we may have duplicates.
+      record.pop(sample_size_variable, None)
+
+    # `dict`s aren't hashable, so create a tuple of key-value pairs.
+    distinguishing_values = tuple(record.items())
+
+    if distinguishing_values in self.dataset:
+      # These distinguishing values already exist, so get the `dict` they're
+      # mapped to, look up each key in `dependent_values` in the `dict`, and
+      # add the corresponding quantity in `dependent_values` to the list in the
+      # the `dict`.
+      for variable, columns in dependent_values.iteritems():
+        self.dataset[distinguishing_values][variable] += columns
+    else:
+      # These distinguishing values aren't in the dataset, so add them and
+      # record them in `in_order_dataset_keys`.
+      self.dataset[distinguishing_values] = dependent_values
+      self.in_order_dataset_keys.append(distinguishing_values)
+
+  #############################################################################
+  # Postprocessing.
+
+  def combine_dependent_values(self, dependent_values):
+    """Takes a mapping of dependent variables to lists of cells and returns
+    a new mapping with the cells combined.
+
+    Raises:
+      AssertionError : If class invariants were violated.
+    """
+    combined_dependent_values = dependent_values.copy()
+
+    for variable in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = variable.as_tuple()
+
+      quantities    = dependent_values[quantity]
+      uncertainties = dependent_values[uncertainty]
+      sample_sizes  = dependent_values[sample_size]
+
+      if type(sample_size) is list:
+        # Sample size hasn't been combined yet.
+        assert len(quantities)    == len(uncertainties)                       \
+           and len(uncertainties) == len(sample_sizes),                       \
+          "Length of quantities list `(" + str(len(quantities)) + ")`, "    + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          "),` and length of sample sizes list `(" + str(len(sample_sizes)) + \
+          ")` are not the same."
+      else:
+        # Another dependent variable that uses our sample size has combined it
+        # already.
+        assert len(quantities) == len(uncertainties),                         \
+          "Length of quantities list `(" + str(len(quantities)) + ")` and " + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          ")` are not the same."
+
+      # Convert the three separate `list`s into one list of `measured_value`s.
+      measured_values = []
+
+      for i in range(len(quantities)):
+        mv = measured_value(
+          quantities[i], uncertainties[i], sample_sizes[i], units
+        )
+
+        measured_values.append(mv)
+
+      # Combine the `measured_value`s.
+      combined_sample_size = combine_sample_size(
+        measured_values
+      )
+
+      combined_arithmetic_mean = combine_arithmetic_mean(
+        measured_values, combined_sample_size
+      )
+
+      combined_sample_standard_deviation = combine_sample_standard_deviation(
+        measured_values, combined_sample_size, combined_arithmetic_mean
+      )
+
+      # Round the quantity and uncertainty to the significant digit of
+      # uncertainty and insert the combined values into the results.
+      sigdig = find_significant_digit(combined_sample_standard_deviation)
+
+#      combined_arithmetic_mean = round_with_int_conversion(
+#        combined_arithmetic_mean, sigdig
+#      )
+
+#      combined_sample_standard_deviation = round_with_int_conversion(
+#        combined_sample_standard_deviation, sigdig
+#      )
+
+      combined_dependent_values[quantity]    = combined_arithmetic_mean
+      combined_dependent_values[uncertainty] = combined_sample_standard_deviation
+      combined_dependent_values[sample_size] = combined_sample_size
+
+    return combined_dependent_values
+
+  ############################################################################# 
+  # Output Stream.
+
+  def __iter__(self):
+    """Return an iterator to the output sequence of separated distinguishing
+    variables and dependent variables (a tuple of two `dict`s).
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def records(self):
+    """Return an iterator to the output sequence of CSV rows (`dict`s of
+    variables to values).
+    """
+    return imap(unpack_tuple(lambda dist, dep: merge_dicts(dist, dep)), self)
+
+  def next(self):
+    """Produce the components of the next output record - a tuple of two
+    `dict`s. The first `dict` is a mapping of distinguishing variables to
+    distinguishing values, the second `dict` is a mapping of dependent
+    variables to combined dependent values. Combining the two dicts forms a
+    CSV row suitable for output.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration  : If there is no more output.
+      AssertionError : If class invariants were violated.
+    """
+    assert len(self.dataset.keys()) == len(self.in_order_dataset_keys),      \
+      "Number of dataset keys (`" + str(len(self.dataset.keys()))          + \
+      "`) is not equal to the number of keys in the ordering list (`"      + \
+      str(len(self.in_order_dataset_keys)) + "`)."
+
+    if len(self.in_order_dataset_keys) == 0:
+      raise StopIteration()
+
+    # Get the next set of distinguishing values and convert them to a `dict`.
+    raw_distinguishing_values = self.in_order_dataset_keys.popleft()
+    distinguishing_values     = dict(raw_distinguishing_values)
+
+    dependent_values = self.dataset.pop(raw_distinguishing_values)
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return (distinguishing_values, combined_dependent_values)
+
+###############################################################################
+
+args = process_program_arguments()
+
+if args.dependent_variables is None:
+  args.dependent_variables = [
+    "STL Average Walltime,STL Walltime Uncertainty,STL Trials",
+    "STL Average Throughput,STL Throughput Uncertainty,STL Trials",
+    "Thrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials",
+    "Thrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"
+  ]
+
+# Read input files and open the output file.
+with io_manager(args.input_files,
+                args.output_file,
+                args.preserve_whitespace) as iom:
+  # Parse dependent variable options.
+  ra = record_aggregator(args.dependent_variables)
+
+  # Add all input data to the `record_aggregator`.
+  for record in iom:
+    ra.append(record)
+
+  iom.write_header()
+
+  # Write combined results out.
+  for record in ra.records():
+    iom.write(record)
+
diff --git a/thrust/internal/benchmark/compare_benchmark_results.py b/thrust/internal/benchmark/compare_benchmark_results.py
new file mode 100755
index 0000000000000000000000000000000000000000..22e7be8cfc20e1de4cfa586258e433f2a93aeb27
--- /dev/null
+++ b/thrust/internal/benchmark/compare_benchmark_results.py
@@ -0,0 +1,1308 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2012-7 Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+###############################################################################
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+# XXX Put code shared with `combine_benchmark_results.py` in a common place.
+
+# XXX Relative uncertainty.
+
+# XXX Create uncertain value class which is quantity + uncertainty.
+
+from sys import exit, stdout
+
+from os.path import splitext
+
+from itertools import imap # Lazy map.
+
+from math import sqrt, log10, floor
+
+from collections import deque
+
+from argparse import ArgumentParser as argument_parser
+from argparse import Action as argument_action
+
+from csv import DictReader as csv_dict_reader
+from csv import DictWriter as csv_dict_writer
+
+from re import compile as regex_compile
+
+###############################################################################
+
+def unpack_tuple(f):
+  """Return a unary function that calls `f` with its argument unpacked."""
+  return lambda args: f(*iter(args))
+
+def strip_dict(d):
+  """Strip leading and trailing whitespace from all keys and values in `d`.
+
+  Returns:
+    The modified dict `d`.
+  """
+  d.update({key: value.strip() for (key, value) in d.items()})
+  return d
+
+def merge_dicts(d0, d1):
+  """Create a new `dict` that is the union of `dict`s `d0` and `d1`."""
+  d = d0.copy()
+  d.update(d1)
+  return d
+
+def change_key_in_dict(d, old_key, new_key):
+  """Change the key of the entry in `d` with key `old_key` to `new_key`. If
+  there is an existing entry 
+
+  Returns:
+    The modified dict `d`.
+
+  Raises:
+    KeyError : If `old_key` is not in `d`.
+  """
+  d[new_key] = d.pop(old_key)
+  return d
+
+def key_from_dict(d):
+  """Create a hashable key from a `dict` by converting the `dict` to a tuple."""
+  return tuple(sorted(d.items()))
+
+def strip_list(l):
+  """Strip leading and trailing whitespace from all values in `l`."""
+  for i, value in enumerate(l): l[i] = value.strip()
+  return l
+
+def remove_from_list(l, item):
+  """Remove the first occurence of `item` from list `l` and return a tuple of
+  the index that was removed and the element that was removed.
+
+  Raises:
+    ValueError : If `item` is not in `l`.
+  """
+  idx = l.index(item)
+  item = l.pop(idx)
+  return (idx, item)
+
+###############################################################################
+
+def int_or_float(x):
+  """Convert `x` to either `int` or `float`, preferring `int`.
+
+  Raises:
+    ValueError : If `x` is not convertible to either `int` or `float`
+  """
+  try:
+    return int(x)
+  except ValueError:
+    return float(x)
+
+def try_int_or_float(x):
+  """Try to convert `x` to either `int` or `float`, preferring `int`. `x` is
+  returned unmodified if conversion fails.
+  """
+  try:
+    return int_or_float(x)
+  except ValueError:
+    return x
+
+###############################################################################
+
+def ranges_overlap(x1, x2, y1, y2):
+  """Returns true if the ranges `[x1, x2]` and `[y1, y2]` overlap,
+  where `x1 <= x2` and `y1 <= y2`.
+
+  Raises:
+    AssertionError : If `x1 > x2` or `y1 > y2`.
+  """
+  assert x1 <= x2
+  assert y1 <= y2
+  return x1 <= y2 and y1 <= x2
+
+def ranges_overlap_uncertainty(x, x_unc, y, y_unc):
+  """Returns true if the ranges `[x - x_unc, x + x_unc]` and
+  `[y - y_unc, y + y_unc]` overlap, where `x_unc >= 0` and `y_unc >= 0`.
+
+  Raises:
+    AssertionError : If `x_unc < 0` or `y_unc < 0`.
+  """
+  assert x_unc >= 0
+  assert y_unc >= 0
+  return ranges_overlap(x - x_unc, x + x_unc, y - y_unc, y + y_unc)
+
+###############################################################################
+
+# Formulas for propagation of uncertainty from:
+#
+#   https://en.wikipedia.org/wiki/Propagation_of_uncertainty#Example_formulas
+#
+# Even though it's Wikipedia, I trust it as I helped write that table.
+#
+# XXX Replace with a proper reference.
+
+def uncertainty_multiplicative(f, A, A_abs_unc, B, B_abs_unc):
+  """Compute the propagated uncertainty from the multiplication of two
+  uncertain values, `A +/- A_abs_unc` and `B +/- B_abs_unc`. Given `f = AB` or
+  `f = A/B`, where `A != 0` and `B != 0`, the uncertainty in `f` is
+  approximately:
+
+  .. math::
+
+    \sigma_f = |f| \sqrt{\frac{\sigma_A}{A} ^ 2 + \frac{\sigma_B}{B} ^ 2}
+
+  Raises:
+    ZeroDivisionError : If `A == 0` or `B == 0`.
+  """
+  return abs(f) * sqrt((A_abs_unc / A) ** 2 + (B_abs_unc / B) ** 2);
+
+def uncertainty_additive(c, A_abs_unc, d, B_abs_unc):
+  """Compute the propagated uncertainty from addition of two uncertain values,
+  `A +/- A_abs_unc` and `B +/- B_abs_unc`. Given `f = cA + dB`, where `c` and
+  `d` are certain constants, the uncertainty in `f` is approximately:
+
+  .. math::
+
+    f_{\sigma} = \sqrt{c ^ 2 * A_{\sigma} ^ 2 + d ^ 2 * B_{\sigma} ^ 2}
+  """
+  return sqrt(((c ** 2) * (A_abs_unc ** 2)) + ((d ** 2) * (B_abs_unc ** 2)))
+
+###############################################################################
+
+# XXX Create change class.
+
+def absolute_change(old, new):
+  """Computes the absolute change from old to new:
+
+  .. math::
+
+    absolute_change = new - old
+  """
+  return new - old
+
+def absolute_change_uncertainty(old, old_unc, new, new_unc):
+  """Computes the uncertainty in the absolute change from old to new and returns
+  a tuple of the absolute change and the absolute change uncertainty.
+  """
+  absolute_change     = new - old
+  absolute_change_unc = uncertainty_additive(1.0, new_unc, -1.0, old_unc)
+
+  return (absolute_change, absolute_change_unc)
+
+def percent_change(old, new):
+  """Computes the percent change from old to new:
+
+  .. math::
+
+    percent_change = 100 \frac{new - old}{abs(old)}
+  """
+  return float(new - old) / abs(old)
+
+def percent_change_uncertainty(old, old_unc, new, new_unc):
+  """Computes the uncertainty in the percent change from old to new and returns
+  a tuple of the absolute change, the absolute change uncertainty, the percent
+  change and the percent change uncertainty.
+  """
+  # Let's break this down into a few sub-operations:
+  # 
+  #   absolute_change = new - old         <- Additive propagation.
+  #   relative_change = change / abs(old) <- Multiplicative propagation.
+  #   percent_change  = 100 * y           <- Multiplicative propagation.
+
+  if old == 0:
+    # We can't compute relative change because the old value is 0.
+    return (float("nan"), float("nan"), float("nan"), float("nan"))
+
+  (absolute_change, absolute_change_unc) = absolute_change_uncertainty(
+    old, old_unc, new, new_unc
+  )
+
+  if absolute_change == 0:
+    # We can't compute relative change uncertainty because the relative
+    # uncertainty of a value of 0 is undefined.
+    return (absolute_change, absolute_change_unc, float("nan"), float("nan"))
+
+  relative_change     = float(absolute_change) / abs(old)
+  relative_change_unc = uncertainty_multiplicative(
+    relative_change, absolute_change, absolute_change_unc, old, old_unc
+  )
+
+  percent_change = 100.0 * relative_change
+  percent_change_unc = uncertainty_multiplicative(
+    percent_change, 100.0, 0.0, relative_change, relative_change_unc
+  )
+
+  return (
+    absolute_change, absolute_change_unc, percent_change, percent_change_unc
+  )
+
+###############################################################################
+
+def find_significant_digit(x):
+  """Return the significant digit of the number x. The result is the number of
+  digits after the decimal place to round to (negative numbers indicate rounding
+  before the decimal place)."""
+  if x == 0: return 0
+  return -int(floor(log10(abs(x))))
+
+def round_with_int_conversion(x, ndigits = None):
+  """Rounds `x` to `ndigits` after the the decimal place. If `ndigits` is less
+  than 1, convert the result to `int`. If `ndigits` is `None`, the significant
+  digit of `x` is used."""
+  if ndigits is None: ndigits = find_significant_digit(x)
+  x_rounded = round(x, ndigits)
+  return int(x_rounded) if ndigits < 1 else x_rounded
+
+###############################################################################
+
+class measured_variable(object):
+  """A meta-variable representing measured data. It is composed of three raw
+  variables plus units meta-data.
+
+  Attributes:
+    quantity (`str`) :
+      Name of the quantity variable of this object.
+    uncertainty (`str`) :
+      Name of the uncertainty variable of this object.
+    sample_size (`str`) :
+      Name of the sample size variable of this object.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+class measured_value(object):
+  """An object that represents a value determined by multiple measurements.
+
+  Attributes:
+    quantity (scalar) :
+      The quantity of the value, e.g. the arithmetic mean.
+    uncertainty (scalar) :
+      The measurement uncertainty, e.g. the sample standard deviation.
+    sample_size (`int`) :
+      The number of observations contributing to the value.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size = 1, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+###############################################################################
+
+def arithmetic_mean(X):
+  """Computes the arithmetic mean of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+
+  .. math::
+
+    u = \frac{\sum_{i = 0}^{n - 1} X_i}{n}
+  """
+  return sum(X) / len(X)
+
+def sample_variance(X, u = None):
+  """Computes the sample variance of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    v = \frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  return sum(imap(lambda X_i: (X_i - u) ** 2, X)) / (len(X) - 1)
+ 
+def sample_standard_deviation(X, u = None, v = None):
+  """Computes the sample standard deviation of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+    v (number)     : The sample variance of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  if v is None: v = sample_variance(X, u)
+  return sqrt(v)
+
+def combine_sample_size(As):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+
+  .. math::
+
+    n = \sum{i = 0}^{g - 1} n_i
+  """
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i), As))
+
+def combine_arithmetic_mean(As, n = None):
+  """Computes the combined arithmetic mean of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+
+  .. math::
+
+    u = \frac{\sum{i = 0}^{g - 1} n_i u_i}{n}
+  """
+  if n is None: n = combine_sample_size(As)
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i * u_i), As)) / n
+  
+def combine_sample_variance(As, n = None, u = None):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+
+  .. math::
+
+    v = \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  return sum(imap(unpack_tuple(
+    lambda u_i, s_i, n_i, t_i: n_i * (u_i - u) ** 2 + (s_i ** 2) * (n_i - 1)
+  ), As)) / (n - 1)
+
+def combine_sample_standard_deviation(As, n = None, u = None, v = None):
+  """Computes the combined sample standard deviation of a group of
+  `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+    v &= \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+    s &= \sqrt{v}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+    v (number)                           : The combined sample variance of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  if v is None: v = combine_sample_variance(As, n, u)
+  return sqrt(v)
+
+###############################################################################
+
+def store_const_multiple(const, *destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `const`."""
+  class store_const_multiple_action(argument_action):
+    def __init__(self, *args, **kwargs):
+      super(store_const_multiple_action, self).__init__(
+        metavar = None, nargs = 0, const = const, *args, **kwargs
+      )
+
+    def __call__(self, parser, namespace, values, option_string = None):
+      for destination in destinations:
+        setattr(namespace, destination, const)
+
+  return store_const_multiple_action
+
+def store_true_multiple(*destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `True`."""
+  return store_const_multiple(True, *destinations)
+
+def store_false_multiple(*destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `False`."""
+  return store_const_multiple(False, *destinations)
+
+###############################################################################
+
+def process_program_arguments():
+  ap = argument_parser(
+    description = (
+      "Compares two sets of combined performance results and identifies "
+      "statistically significant changes."
+    )
+  )
+
+  ap.add_argument(
+    "baseline_input_file",
+    help = ("CSV file containing the baseline performance results. The first "
+            "two rows should be a header. The 1st header row specifies the "
+            "name of each variable, and the 2nd header row specifies the units "
+            "for that variable. The baseline results may be a superset of the "
+            "observed performance results, but the reverse is not true. The "
+            "baseline results must contain data for every datapoint in the "
+            "observed performance results."),            
+    type = str
+  )
+
+  ap.add_argument(
+    "observed_input_file",
+    help = ("CSV file containing the observed performance results. The first "
+            "two rows should be a header. The 1st header row specifies the name "
+            "of header row specifies the units for that variable."),
+    type = str
+  )
+
+  ap.add_argument(
+    "-o", "--output-file",
+    help = ("The file that results are written to. If `-`, results are "
+            "written to stdout."),
+    action = "store", type = str, default = "-",
+    metavar = "OUTPUT"
+  )
+
+  ap.add_argument(
+    "-c", "--control-variable",
+    help = ("Treat the specified variable as a control variable. This means "
+            "it will be filtered out when forming dataset keys. For example, "
+            "this could be used to ignore a timestamp variable that is "
+            "different in the baseline and observed results. May be specified "
+            "multiple times."),
+    action = "append", type = str, dest = "control_variables", default = [],
+    metavar = "QUANTITY"
+  )
+
+  ap.add_argument(
+    "-d", "--dependent-variable",
+    help = ("Treat the specified three variables as a dependent variable. The "
+            "1st variable is the measured quantity, the 2nd is the uncertainty "
+            "of the measurement and the 3rd is the sample size. The defaults "
+            "are the dependent variables of Thrust's benchmark suite. May be "
+            "specified multiple times."),
+    action = "append", type = str, dest = "dependent_variables", default = [],
+    metavar = "QUANTITY,UNCERTAINTY,SAMPLES"
+  )
+
+  ap.add_argument(
+    "-t", "--change-threshold",
+    help = ("Treat relative changes less than this amount (a percentage) as "
+            "statistically insignificant. The default is 5%%."),
+    action = "store", type = float, default = 5,
+    metavar = "PERCENTAGE"
+  )
+
+  ap.add_argument(
+    "-p", "--preserve-whitespace",
+    help = ("Don't trim leading and trailing whitespace from each CSV cell."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "--output-all-variables",
+    help = ("Don't omit original absolute values in output."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "--output-all-datapoints",
+    help = ("Don't omit datapoints that are statistically indistinguishable "
+            "in output."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "-a", "--output-all",
+    help = ("Equivalent to `--output-all-variables --output-all-datapoints`."),
+    action = store_true_multiple("output_all_variables", "output_all_datapoints")
+  )
+
+  return ap.parse_args()
+
+###############################################################################
+
+def filter_comments(f, s = "#"):
+  """Return an iterator to the file `f` which filters out all lines beginning
+  with `s`."""
+  return filter(lambda line: not line.startswith(s), f)
+
+###############################################################################
+
+class io_manager(object):
+  """Manages I/O operations and represents the input data as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`. It can be used with `with`.
+
+  Attributes:
+    preserve_whitespace (`bool`) :
+      If `False`, leading and trailing whitespace is stripped from each CSV cell.
+    writer (`csv_dict_writer`) :
+      CSV writer object that the output is written to.
+    output_file (`file` or `stdout`) :
+      The output `file` object.
+    baseline_reader (`csv_dict_reader`) :
+      CSV reader object for the baseline results.
+    observed_reader (`csv_dict_reader`) :
+      CSV reader object for the observed results.
+    baseline_input_file (`file`) :
+      `file` object for the baseline results.
+    observed_input_file (`file`) :
+      `file` object for the observed results..
+    variable_names (`list` of `str`s) :
+      Names of the variables, in order. 
+    variable_units (`list` of `str`s) :
+      Units of the variables, in order. 
+  """
+
+  def __init__(self,
+               baseline_input_file, observed_input_file,
+               output_file,
+               preserve_whitespace = False):
+    """Read input files and open the output file and construct a new `io_manager`
+    object.
+
+    If `preserve_whitespace` is `False`, leading and trailing whitespace is
+    stripped from each CSV cell.
+
+    Raises
+      AssertionError :
+        If `type(preserve_whitespace) != bool`.
+    """
+    assert type(preserve_whitespace) == bool
+
+    self.preserve_whitespace = preserve_whitespace
+
+    # Open baseline results.
+    self.baseline_input_file = open(baseline_input_file)
+    self.baseline_reader = csv_dict_reader(
+      filter_comments(self.baseline_input_file)
+    )
+
+    if not self.preserve_whitespace:
+      strip_list(self.baseline_reader.fieldnames)
+
+    self.variable_names = list(self.baseline_reader.fieldnames) # Copy.
+    self.variable_units = self.baseline_reader.next()
+
+    if not self.preserve_whitespace:
+      strip_dict(self.variable_units)
+
+    # Open observed results.
+    self.observed_input_file = open(observed_input_file)
+    self.observed_reader = csv_dict_reader(
+      filter_comments(self.observed_input_file)
+    )
+
+    if not self.preserve_whitespace:
+      strip_list(self.observed_reader.fieldnames)
+
+    # Make sure all inputs have the same variables schema.
+    assert self.variable_names == self.observed_reader.fieldnames,             \
+      "Observed results input file (`" + observed_input_file + "`) "         + \
+      "variable schema `" + str(self.observed_reader.fieldnames) + "` does " + \
+      "not match the baseline results input file (`" + baseline_input_file   + \
+      "`) variable schema `" + str(self.variable_names) + "`."
+
+    # Consume the next row, which should be the second line of the header.
+    observed_variable_units = self.observed_reader.next()
+
+    if not self.preserve_whitespace:
+      strip_dict(observed_variable_units)
+
+    # Make sure all inputs have the same units schema.
+    assert self.variable_units == observed_variable_units,                    \
+      "Observed results input file (`" + observed_input_file + "`) "        + \
+      "units schema `" + str(observed_variable_units) + "` does not "       + \
+      "match the baseline results input file (`" + baseline_input_file      + \
+      "`) units schema `" + str(self.variable_units) + "`."
+
+    if   output_file == "-": # Output to stdout.
+      self.output_file = stdout
+    else:                    # Output to user-specified file.
+      self.output_file = open(output_file, "w")
+
+    self.writer = csv_dict_writer(
+      self.output_file, fieldnames = self.variable_names
+    )
+
+  def __enter__(self):
+    """Called upon entering a `with` statement."""
+    return self
+
+  def __exit__(self, *args):
+    """Called upon exiting a `with` statement."""
+    if   self.output_file is stdout:
+      self.output_file = None
+    elif self.output_file is not None:
+      self.output_file.__exit__(*args)
+
+    self.baseline_input_file.__exit__(*args)
+    self.observed_input_file.__exit__(*args)
+
+  def append_variable(self, name, units):
+    """Add a new variable to the output schema."""
+    self.variable_names.append(name)
+    self.variable_units.update({name : units})
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+  def insert_variable(self, idx, name, units):
+    """Insert a new variable into the output schema at index `idx`."""
+    self.variable_names.insert(idx, name)
+    self.variable_units.update({name : units})
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+  def remove_variable(self, name):
+    """Remove variable from the output schema and return a tuple of the variable
+    index and the variable units.
+
+    Raises:
+      ValueError : If `name` is not in the output schema.
+    """
+    # Remove the variable and get its index, which we'll need to remove the
+    # corresponding units entry.
+    (idx, item) = remove_from_list(self.variable_names, name)
+
+    # Remove the units entry.
+    units = self.variable_units.pop(item)
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+    return (idx, units)
+
+  #############################################################################
+  # Input Stream.
+
+  def baseline(self):
+    """Return an iterator to the baseline results input sequence."""
+    return imap(lambda row: strip_dict(row), self.baseline_reader) 
+
+  def observed(self):
+    """Return an iterator to the observed results input sequence."""
+    return imap(lambda row: strip_dict(row), self.observed_reader) 
+
+  #############################################################################
+  # Output.
+
+  def write_header(self):
+    """Write the header for the output CSV file."""
+    # Write the first line of the header.
+    self.writer.writeheader()
+
+    # Write the second line of the header.
+    self.writer.writerow(self.variable_units)
+
+  def write(self, d):
+    """Write a record (a `dict`) to the output CSV file."""
+    self.writer.writerow(d)
+
+###############################################################################
+
+class dependent_variable_parser(object):
+  """Parses a `--dependent-variable=AVG,STDEV,TRIALS` command line argument."""
+
+  #############################################################################
+  # Grammar
+
+  # Parse a variable_name.
+  variable_name_rule = r'[^,]+'
+
+  # Parse a variable classification.        
+  dependent_variable_rule = r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'
+
+  engine = regex_compile(dependent_variable_rule)
+
+  #############################################################################
+
+  def __call__(self, s):
+    """Parses the string `s` with the form "AVG,STDEV,TRIALS".
+
+    Returns:
+      A `measured_variable`. 
+
+    Raises:
+      AssertionError : If parsing fails.
+    """
+
+    match = self.engine.match(s)
+
+    assert match is not None,                                          \
+      "Dependent variable (-d) `" +s+ "` is invalid, the format is " + \
+      "`AVG,STDEV,TRIALS`."
+
+    return measured_variable(match.group(1), match.group(2), match.group(3))
+
+###############################################################################
+
+class record_aggregator(object):
+  """Consumes and combines records and represents the result as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`.
+
+  Attributes:
+    dependent_variables (`list` of `measured_variable`s) :
+      A list of dependent variables provided on the command line.
+    control_variables (`list` of `str`s) :
+      A list of control variables provided on the command line.
+    dataset (`dict`) :
+      A mapping of distinguishing (e.g. control + independent) values (`tuple`s
+      of variable-quantity pairs) to `list`s of dependent values (`dict`s from 
+      variables to lists of cells).
+    in_order_dataset_keys :
+      A list of unique dataset keys (e.g. distinguishing variables) in order of
+      appearance.
+  """
+
+  def __init__(self, dependent_variables, control_variables):
+    """Construct a new `record_aggregator` object.
+
+    Raises:
+      AssertionError : If parsing of dependent variables fails.
+    """
+    self.dependent_variables = dependent_variables
+    self.control_variables = control_variables
+
+    self.dataset = {}
+
+    self.in_order_dataset_keys = deque()
+
+  #############################################################################
+  # Insertion.
+
+  def key_from_dict(self, d):
+    """Create a hashable key from a `dict` by filtering out control variables
+    and then converting the `dict` to a tuple.
+
+    Raises:
+      AssertionError : If any control variable was not found in `d`.
+    """
+    distinguishing_values = d.copy()
+
+    # Filter out control variables.
+    for var in self.control_variables:
+      distinguishing_values.pop(var, None)
+
+    return key_from_dict(distinguishing_values)
+
+  def append(self, record):
+    """Add `record` to the dataset.
+
+    Raises:
+      ValueError : If any `str`-to-numeric conversions fail.
+    """
+    # The distinguishing variables are the control and independent variables.
+    # They form the key for each record in the dataset. Records with the same
+    # distinguishing variables are treated as observations of the same
+    # datapoint.
+    dependent_values = {}
+
+    # To allow the same sample size variable to be used for multiple dependent
+    # variables, we don't pop sample size variables until we're done processing
+    # all variables.
+    sample_size_variables = []
+
+    # Separate the dependent values from the distinguishing variables and
+    # perform `str`-to-numeric conversions.
+    for var in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = var.as_tuple()
+
+      dependent_values[quantity]    = [int_or_float(record.pop(quantity))]
+      dependent_values[uncertainty] = [int_or_float(record.pop(uncertainty))]
+      dependent_values[sample_size] = [int(record[sample_size])]
+
+      sample_size_variables.append(sample_size)
+
+    # Pop sample size variables.
+    for var in sample_size_variables:
+      # Allowed to fail, as we may have duplicates.
+      record.pop(var, None)
+
+    distinguishing_values = self.key_from_dict(record)
+
+    if distinguishing_values in self.dataset:
+      # These distinguishing values already exist, so get the `dict` they're
+      # mapped to, look up each key in `dependent_values` in the `dict`, and
+      # add the corresponding quantity in `dependent_values` to the list in the
+      # the `dict`.
+      for var, columns in dependent_values.iteritems():
+        self.dataset[distinguishing_values][var] += columns
+    else:
+      # These distinguishing values aren't in the dataset, so add them and
+      # record them in `in_order_dataset_keys`.
+      self.dataset[distinguishing_values] = dependent_values
+      self.in_order_dataset_keys.append(distinguishing_values)
+
+  #############################################################################
+  # Postprocessing.
+
+  def combine_dependent_values(self, dependent_values):
+    """Takes a mapping of dependent variables to lists of cells and returns
+    a new mapping with the cells combined.
+
+    Raises:
+      AssertionError : If class invariants were violated.
+    """
+    combined_dependent_values = dependent_values.copy()
+
+    for var in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = var.as_tuple()
+
+      quantities    = dependent_values[quantity]
+      uncertainties = dependent_values[uncertainty]
+      sample_sizes  = dependent_values[sample_size]
+
+      if type(sample_size) is list:
+        # Sample size hasn't been combined yet.
+        assert len(quantities)    == len(uncertainties)                       \
+           and len(uncertainties) == len(sample_sizes),                       \
+          "Length of quantities list `(" + str(len(quantities)) + ")`, "    + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          "),` and length of sample sizes list `(" + str(len(sample_sizes)) + \
+          ")` are not the same."
+      else:
+        # Another dependent variable that uses our sample size has combined it
+        # already.
+        assert len(quantities) == len(uncertainties),                         \
+          "Length of quantities list `(" + str(len(quantities)) + ")` and " + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          ")` are not the same."
+
+      # Convert the three separate `list`s into one list of `measured_value`s.
+      measured_values = []
+
+      for i in range(len(quantities)):
+        mv = measured_value(
+          quantities[i], uncertainties[i], sample_sizes[i], units
+        )
+
+        measured_values.append(mv)
+
+      # Combine the `measured_value`s.
+      combined_sample_size = combine_sample_size(
+        measured_values
+      )
+
+      combined_arithmetic_mean = combine_arithmetic_mean(
+        measured_values, combined_sample_size
+      )
+
+      combined_sample_standard_deviation = combine_sample_standard_deviation(
+        measured_values, combined_sample_size, combined_arithmetic_mean
+      )
+
+      # Round the quantity and uncertainty to the significant digit of
+      # uncertainty and insert the combined values into the results.
+      sigdig = find_significant_digit(combined_sample_standard_deviation)
+
+#      combined_arithmetic_mean = round_with_int_conversion(
+#        combined_arithmetic_mean, sigdig
+#      )
+
+#      combined_sample_standard_deviation = round_with_int_conversion(
+#        combined_sample_standard_deviation, sigdig
+#      )
+
+      combined_dependent_values[quantity]    = combined_arithmetic_mean
+      combined_dependent_values[uncertainty] = combined_sample_standard_deviation
+      combined_dependent_values[sample_size] = combined_sample_size
+
+    return combined_dependent_values
+
+  ############################################################################# 
+  # Output Stream.
+
+  def __iter__(self):
+    """Return an iterator to the output sequence of separated distinguishing
+    variables and dependent variables (a tuple of two `dict`s).
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def records(self):
+    """Return an iterator to the output sequence of CSV rows (`dict`s of
+    variables to values).
+    """
+    return imap(unpack_tuple(lambda dist, dep: merge_dicts(dist, dep)), self)
+
+  def next(self):
+    """Produce the components of the next output record - a tuple of two
+    `dict`s. The first `dict` is a mapping of distinguishing variables to
+    distinguishing values, the second `dict` is a mapping of dependent
+    variables to combined dependent values. Combining the two dicts forms a
+    CSV row suitable for output.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration  : If there is no more output.
+      AssertionError : If class invariants were violated.
+    """
+    assert len(self.dataset.keys()) == len(self.in_order_dataset_keys),      \
+      "Number of dataset keys (`" + str(len(self.dataset.keys()))          + \
+      "`) is not equal to the number of keys in the ordering list (`"      + \
+      str(len(self.in_order_dataset_keys)) + "`)."
+
+    if len(self.in_order_dataset_keys) == 0:
+      raise StopIteration()
+
+    # Get the next set of distinguishing values and convert them to a `dict`.
+    raw_distinguishing_values = self.in_order_dataset_keys.popleft()
+    distinguishing_values     = dict(raw_distinguishing_values)
+
+    dependent_values = self.dataset.pop(raw_distinguishing_values)
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return (distinguishing_values, combined_dependent_values)
+
+  def __getitem__(self, distinguishing_values):
+    """Produce the dependent component, a `dict` mapping dependent variables to
+    combined dependent values, associated with `distinguishing_values`.
+
+    Args:
+      distinguishing_values (`dict`) :
+        A `dict` mapping distinguishing variables to distinguishing values.
+
+    Raises:
+      KeyError : If `distinguishing_values` is not in the dataset.
+    """
+    raw_distinguishing_values = self.key_from_dict(distinguishing_values)
+
+    dependent_values = self.dataset[raw_distinguishing_values]
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return combined_dependent_values
+
+###############################################################################
+
+args = process_program_arguments()
+
+if len(args.dependent_variables) == 0:
+  args.dependent_variables = [
+    "STL Average Walltime,STL Walltime Uncertainty,STL Trials",
+    "STL Average Throughput,STL Throughput Uncertainty,STL Trials",
+    "Thrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials",
+    "Thrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"
+  ]
+
+# Parse dependent variable options.
+dependent_variables = []
+
+parse_dependent_variable = dependent_variable_parser()
+
+#if args.dependent_variables is not None:
+for var in args.dependent_variables:
+  dependent_variables.append(parse_dependent_variable(var))
+
+# Read input files and open the output file.
+with io_manager(args.baseline_input_file, 
+                args.observed_input_file,
+                args.output_file,
+                args.preserve_whitespace) as iom:
+
+  # Create record aggregators.
+  baseline_ra = record_aggregator(dependent_variables, args.control_variables)
+  observed_ra = record_aggregator(dependent_variables, args.control_variables)
+
+  # Duplicate dependent variables: one for baseline results, one for observed
+  # results.
+  baseline_suffix = " - `{0}`".format(
+    args.baseline_input_file
+  )
+  observed_suffix = " - `{0}`".format(
+    args.observed_input_file
+  )
+
+  for var in dependent_variables:
+    # Remove the existing quantity variable:
+    #
+    #   [ ..., a, b, c, ... ]
+    #             ^- remove b at index i
+    #
+    (quantity_idx, quantity_units) = iom.remove_variable(var.quantity)
+
+    # If the `--output-all-variables` option was specified, add the new baseline
+    # and observed quantity variables. Note that we insert in the reverse of
+    # the order we desire (which is baseline then observed):
+    #
+    #   [ ..., a, b_1, c, ... ]
+    #              ^- insert b_1 at index i
+    #
+    #   [ ..., a, b_0, b_1, c, ... ]
+    #              ^- insert b_0 at index i
+    #
+    if args.output_all_variables:
+      iom.insert_variable(
+        quantity_idx, var.quantity + observed_suffix, quantity_units
+      )
+      iom.insert_variable(
+        quantity_idx, var.quantity + baseline_suffix, quantity_units
+      )
+
+    # Remove the existing uncertainty variable.
+    (uncertainty_idx, uncertainty_units) = iom.remove_variable(var.uncertainty)
+
+    # If the `--output-all-variables` option was specified, add the new baseline
+    # and observed uncertainty variables.
+    if args.output_all_variables:
+      iom.insert_variable(
+        uncertainty_idx, var.uncertainty + observed_suffix, uncertainty_units
+      )
+      iom.insert_variable(
+        uncertainty_idx, var.uncertainty + baseline_suffix, uncertainty_units
+      )
+
+    try:
+      # Remove the existing sample size variable.
+      (sample_size_idx, sample_size_units) = iom.remove_variable(var.sample_size)
+
+      # If the `--output-all-variables` option was specified, add the new
+      # baseline and observed sample size variables.
+      if args.output_all_variables:
+        iom.insert_variable(
+          sample_size_idx, var.sample_size + observed_suffix, sample_size_units
+        )
+        iom.insert_variable(
+          sample_size_idx, var.sample_size + baseline_suffix, sample_size_units
+        )
+    except ValueError:
+      # This is alright, because dependent variables may share the same sample
+      # size variable.
+      pass
+
+  for var in args.control_variables:
+    iom.remove_variable(var)
+
+  # Add change variables.
+  absolute_change_suffix = " - Change (`{0}` - `{1}`)".format(
+    args.observed_input_file, args.baseline_input_file
+  )
+
+  percent_change_suffix = " - % Change (`{0}` to `{1}`)".format(
+    args.observed_input_file, args.baseline_input_file
+  )
+
+  for var in dependent_variables:
+    iom.append_variable(var.quantity + absolute_change_suffix, var.units)
+    iom.append_variable(var.uncertainty + absolute_change_suffix, var.units)
+    iom.append_variable(var.quantity + percent_change_suffix, "")
+    iom.append_variable(var.uncertainty + percent_change_suffix, "")
+
+  # Add all baseline input data to the `record_aggregator`.
+  for record in iom.baseline():
+    baseline_ra.append(record)
+  
+  for record in iom.observed():
+    observed_ra.append(record)
+
+  iom.write_header()
+
+  # Compare and output results.
+  for distinguishing_values, observed_dependent_values in observed_ra:
+    try:
+      baseline_dependent_values = baseline_ra[distinguishing_values]
+    except KeyError: 
+      assert False,                                                           \
+        "Distinguishing value `"                                            + \
+        str(baseline_ra.key_from_dict(distinguishing_values))               + \
+        "` was not found in the baseline results."
+
+    statistically_significant_change = False
+
+    record = distinguishing_values.copy()
+
+    # Compute changes, add the values and changes to the record, and identify
+    # changes that are statistically significant.
+    for var in dependent_variables:
+      # Compute changes.
+      baseline_quantity    = baseline_dependent_values[var.quantity]
+      baseline_uncertainty = baseline_dependent_values[var.uncertainty]
+      baseline_sample_size = baseline_dependent_values[var.sample_size]
+
+      observed_quantity    = observed_dependent_values[var.quantity]
+      observed_uncertainty = observed_dependent_values[var.uncertainty]
+      observed_sample_size = observed_dependent_values[var.sample_size]
+
+      (abs_change, abs_change_unc, per_change, per_change_unc) = \
+        percent_change_uncertainty(
+          baseline_quantity, baseline_uncertainty,
+          observed_quantity, observed_uncertainty
+        )
+
+      # Round the change quantities and uncertainties to the significant digit
+      # of uncertainty.
+      try:
+        abs_change_sigdig = max(
+          find_significant_digit(abs_change),
+          find_significant_digit(abs_change_unc),
+        )
+
+#        abs_change     = round_with_int_conversion(
+#          abs_change,     abs_change_sigdig
+#        )
+#        abs_change_unc = round_with_int_conversion(
+#          abs_change_unc, abs_change_sigdig
+#        )
+      except:
+        # Any value errors should be due to NaNs returned by
+        # `percent_change_uncertainty` because quantities or change in
+        # quantities was 0. We can ignore these.
+        pass
+
+      try:
+        per_change_sigdig = max(
+          find_significant_digit(per_change),
+          find_significant_digit(per_change_unc)
+        )
+
+#        per_change     = round_with_int_conversion(
+#          per_change,     per_change_sigdig
+#        )
+#        per_change_unc = round_with_int_conversion(
+#          per_change_unc, per_change_sigdig
+#        )
+      except:
+        # Any value errors should be due to NaNs returned by
+        # `percent_change_uncertainty` because quantities or change in
+        # quantities was 0. We can ignore these.
+        pass
+
+      # Add the values (if the `--output-all-variables` option was specified)
+      # and the changes to the record. Note that the record's schema is
+      # different from the original schema. If multiple dependent variables
+      # share the same sample size variable, it's fine - they will overwrite
+      # each other, but with the same value.
+      if args.output_all_variables:
+        record[var.quantity + baseline_suffix]         = baseline_quantity
+        record[var.uncertainty + baseline_suffix]      = baseline_uncertainty
+        record[var.sample_size + baseline_suffix]      = baseline_sample_size
+        record[var.quantity + observed_suffix]         = observed_quantity
+        record[var.uncertainty + observed_suffix]      = observed_uncertainty
+        record[var.sample_size + observed_suffix]      = observed_sample_size
+
+      record[var.quantity + absolute_change_suffix]    = abs_change
+      record[var.uncertainty + absolute_change_suffix] = abs_change_unc
+      record[var.quantity + percent_change_suffix]     = per_change
+      record[var.uncertainty + percent_change_suffix]  = per_change_unc
+
+      # If the range of uncertainties overlap don't overlap and the percentage
+      # change is greater than the change threshold, then change is
+      # statistically significant.
+      overlap = ranges_overlap_uncertainty(
+          baseline_quantity, baseline_uncertainty,
+          observed_quantity, observed_uncertainty
+      )
+      if not overlap and per_change >= args.change_threshold:
+        statistically_significant_change = True
+
+    # Print the record if a statistically significant change was found or if the
+    # `--output-all-datapoints` option was specified.
+    if args.output_all_datapoints or statistically_significant_change:
+      iom.write(record)
+
diff --git a/thrust/internal/benchmark/random.h b/thrust/internal/benchmark/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..719588771307d5d0b80c61c3a2b8a614c61069c9
--- /dev/null
+++ b/thrust/internal/benchmark/random.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+struct hash32
+{
+  __host__ __device__
+  unsigned int operator()(unsigned int h) const
+  {
+    h = ~h + (h << 15);
+    h =  h ^ (h >> 12);
+    h =  h + (h <<  2);
+    h =  h ^ (h >>  4);
+    h =  h + (h <<  3) + (h << 11);
+    h =  h ^ (h >> 16);
+    return h;
+  }
+};
+
+struct hash64
+{
+  __host__ __device__
+  unsigned long long operator()(unsigned long long h) const
+  {
+    h = ~h + (h << 21);
+    h =  h ^ (h >> 24);
+    h = (h + (h <<  3)) + (h << 8);
+    h =  h ^ (h >> 14);
+    h = (h + (h <<  2)) + (h << 4);
+    h =  h ^ (h >> 28);
+    h =  h + (h << 31);
+    return h;
+  }
+};
+
+struct hashtofloat
+{
+  __host__ __device__
+  float operator()(unsigned int h) const
+  {
+    return static_cast<float>(hash32()(h)) / 4294967296.0f;
+  }
+};
+
+struct hashtodouble
+{
+  __host__ __device__
+  double operator()(unsigned long long h) const
+  {
+    return static_cast<double>(hash64()(h)) / 18446744073709551616.0;
+  }
+};
+
+
+
+template <typename Vector, typename T>
+void _randomize(Vector& v, T)
+{
+    thrust::transform(thrust::counting_iterator<unsigned int>(0), 
+                      thrust::counting_iterator<unsigned int>(0) + v.size(),
+                      v.begin(),
+                      hash32());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, long long)
+{
+    thrust::transform(thrust::counting_iterator<unsigned long long>(0), 
+                      thrust::counting_iterator<unsigned long long>(0) + v.size(),
+                      v.begin(),
+                      hash64());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, float)
+{
+    thrust::transform(thrust::counting_iterator<unsigned int>(0), 
+                      thrust::counting_iterator<unsigned int>(0) + v.size(),
+                      v.begin(),
+                      hashtofloat());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, double)
+{
+    thrust::transform(thrust::counting_iterator<unsigned long long>(0), 
+                      thrust::counting_iterator<unsigned long long>(0) + v.size(),
+                      v.begin(),
+                      hashtodouble());
+}
+
+// fill Vector with random values
+template <typename Vector>
+void randomize(Vector& v)
+{
+    _randomize(v, typename Vector::value_type());
+}
+
+
diff --git a/thrust/internal/benchmark/tbb_algos.h b/thrust/internal/benchmark/tbb_algos.h
new file mode 100644
index 0000000000000000000000000000000000000000..a50a1cd2f9dcc028464d76487a700457faca640e
--- /dev/null
+++ b/thrust/internal/benchmark/tbb_algos.h
@@ -0,0 +1,195 @@
+#pragma once
+
+#include <tbb/parallel_reduce.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_scan.h>
+#include <tbb/parallel_sort.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/tick_count.h>
+#include <tbb/tbb_thread.h>
+
+#include <cstdef> // For std::size_t.
+
+#include <cassert>
+
+template <typename T>
+struct NegateBody
+{ 
+  void operator()(T& x) const
+  {
+    x = -x;
+  }
+};
+
+template <typename Vector>
+struct ForBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v;
+
+public: 
+  ForBody(Vector& x) : v(x) {}    
+
+  void operator()(tbb::blocked_range<std::size_t> const& r) const
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      v[i] = -v[i];
+  }
+};
+
+template <typename Vector>
+struct ReduceBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v;
+
+public: 
+  T sum;  
+
+  ReduceBody(Vector& x) : v(x), sum(0) {}    
+
+  ReduceBody(ReduceBody& x, tbb::split) : v(x.v), sum(0) {}
+
+  void operator()(tbb::blocked_range<std::size_t> const& r)
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      sum += v[i];
+  }
+  
+  void join(ReduceBody const& x) { sum += x.sum; } 
+};
+
+template <typename Vector>
+struct ScanBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v; 
+
+public: 
+  T sum; 
+
+  ScanBody(Vector& x) : sum(0), v(x) {} 
+
+  ScanBody(ScanBody& x, tbb::split) : v(x.v), sum(0) {} 
+
+  template <typename Tag> 
+  void operator()(tbb::blocked_range<std::size_t> const& r, Tag)
+  {
+    T temp = sum; 
+    for (std::size_t i = r.begin(); i < r.end(); ++i)
+    { 
+      temp = temp + x[i]; 
+      if (Tag::is_final_scan()) 
+        x[i] = temp; 
+    }        
+    sum = temp; 
+  }
+
+  void assign(ScanBody const& x) { sum = x.sum; } 
+
+  T get_sum() const { return sum; } 
+
+  void reverse_join(ScanBody const& x) { sum = x.sum + sum;} 
+};
+
+template <typename Vector>
+struct CopyBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector &v;
+  Vector &u;
+
+public: 
+  CopyBody(Vector& x, Vector& y) : v(x), u(y) {}    
+
+  void operator()(tbb::blocked_range<size_t> const& r) const
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      v[i] = u[i];
+  }
+};
+
+template <typename Vector>
+typename Vector::value_type tbb_reduce(Vector& v)
+{
+  ReduceBody<Vector> body(v);
+  tbb::parallel_reduce(tbb::blocked_range<size_t>(0, v.size()), body);
+  return body.sum;
+}
+
+template <typename Vector>
+void tbb_sort(Vector& v)
+{
+  tbb::parallel_sort(v.begin(), v.end());
+}
+
+template <typename Vector>
+void tbb_transform(Vector& v)
+{
+  ForBody<Vector> body(v);
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+template <typename Vector>
+void tbb_scan(Vector& v)
+{
+  ScanBody<Vector> body(v);
+  tbb::parallel_scan(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+template <typename Vector>
+void tbb_copy(Vector& v, Vector& u)
+{
+  CopyBody<Vector> body(v, u);
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+void test_tbb()
+{
+  std::size_t elements = 1 << 20;
+
+  std::vector<int> A(elements);
+  std::vector<int> B(elements);
+  std::vector<int> C(elements);
+  std::vector<int> D(elements);
+
+  randomize(A);
+  randomize(B);
+  assert(std::accumulate(A.begin(), A.end(), 0) == tbb_reduce(A));
+  
+  randomize(A);
+  randomize(B);
+  std::transform(A.begin(), A.end(), A.begin(), thrust::negate<int>());
+  tbb_transform(B);
+  assert(A == B);
+ 
+  randomize(A);
+  randomize(B);
+  std::partial_sum(A.begin(), A.end(), A.begin());
+  tbb_scan(B);
+  assert(A == B);
+
+  randomize(A);
+  randomize(B);
+  std::sort(A.begin(), A.end());
+  tbb_sort(B);
+  assert(A == B);
+
+  randomize(A);
+  randomize(B);
+  randomize(C);
+  randomize(D);
+  std::copy(A.begin(), A.end(), C.begin());
+  tbb_copy(B, D);
+  assert(A == B);
+  assert(C == D);
+}
+
diff --git a/thrust/internal/benchmark/timer.h b/thrust/internal/benchmark/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..077ffa44ce61e637e9e9b898bfe28186f6d36252
--- /dev/null
+++ b/thrust/internal/benchmark/timer.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#include <cassert>
+
+#  define CUDA_SAFE_CALL_NO_SYNC( call) do {                                 \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } } while (0)
+
+#  define CUDA_SAFE_CALL( call) do {                                         \
+    CUDA_SAFE_CALL_NO_SYNC(call);                                            \
+    cudaError err = cudaDeviceSynchronize();                                 \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } } while (0)
+
+class cuda_timer
+{
+    cudaEvent_t start_;
+    cudaEvent_t stop_;
+
+ public:
+    cuda_timer()
+    {
+        CUDA_SAFE_CALL(cudaEventCreate(&start_));
+        CUDA_SAFE_CALL(cudaEventCreate(&stop_));
+    }
+
+    ~cuda_timer()
+    {
+        CUDA_SAFE_CALL(cudaEventDestroy(start_));
+        CUDA_SAFE_CALL(cudaEventDestroy(stop_));
+    }
+
+    void start()
+    {
+        CUDA_SAFE_CALL(cudaEventRecord(start_, 0));
+    }
+
+    void stop()
+    {
+        CUDA_SAFE_CALL(cudaEventRecord(stop_, 0));
+        CUDA_SAFE_CALL(cudaEventSynchronize(stop_));
+    }
+
+    double milliseconds_elapsed()
+    {
+        float elapsed_time;
+        CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsed_time, start_, stop_));
+        return elapsed_time;
+    }
+
+    double seconds_elapsed()
+    {
+        return milliseconds_elapsed() / 1000.0;
+    }
+};
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+#include <windows.h>
+
+class steady_timer
+{
+    LARGE_INTEGER frequency_; // Cached to avoid system calls.
+    LARGE_INTEGER start_;
+    LARGE_INTEGER stop_;
+
+ public:
+    steady_timer() : start_(), stop_(), frequency_()
+    {
+        BOOL const r = QueryPerformanceFrequency(&frequency_);
+        assert(0 != r);
+    }
+
+    void start()
+    {
+        BOOL const r = QueryPerformanceCounter(&start_);
+        assert(0 != r);
+    }
+
+    void stop()
+    {
+        BOOL const r = QueryPerformanceCounter(&stop_);
+        assert(0 != r);
+    }
+
+    double seconds_elapsed()
+    {
+        return double(stop_.QuadPart - start_.QuadPart)
+             / double(frequency_.QuadPart);
+    }
+};
+#else
+#include <time.h>
+
+class steady_timer
+{
+    timespec start_;
+    timespec stop_;
+
+ public:
+    steady_timer() : start_(), stop_() {}
+
+    void start()
+    {
+        int const r = clock_gettime(CLOCK_MONOTONIC, &start_);
+        assert(0 == r);
+    }
+
+    void stop()
+    {
+        int const r = clock_gettime(CLOCK_MONOTONIC, &stop_);
+        assert(0 == r);
+    }
+
+    double seconds_elapsed()
+    {
+        return double(stop_.tv_sec  - start_.tv_sec)
+             + double(stop_.tv_nsec - start_.tv_nsec) * 1.0e-9;
+    }
+};
+#endif
+
+
diff --git a/thrust/internal/racecheck.sh b/thrust/internal/racecheck.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0654ee98cb0bacbe6c951ed14b93b1215d958255
--- /dev/null
+++ b/thrust/internal/racecheck.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+MEMCHECK=/work/nightly/memcheck/bin/x86_64_Linux_release/cuda-memcheck 
+
+#########################
+
+files=`ls thrust.test.*`;
+files=`ls thrust.example.*`;
+
+#########################
+
+nfiles=0
+for fn in $files; do
+  nfiles=$((nfiles + 1))
+done
+j=1
+for fn in $files; do
+  echo " ----------------------------------------------------------------------"
+  echo "  *** MEMCHECK *** [$j/$nfiles] $fn"
+  echo " ----------------------------------------------------------------------"
+  $MEMCHECK --tool memcheck ./$fn --verbose
+  echo " ----------------------------------------------------------------------"
+  echo "  *** RACECHECK *** [$j/$nfiles] $fn"
+  echo " ----------------------------------------------------------------------"
+  $MEMCHECK --tool racecheck ./$fn --verbose --sizes=small
+  j=$((j+1))
+done;
diff --git a/thrust/internal/rename_cub_namespace.sh b/thrust/internal/rename_cub_namespace.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7a539e5d64c4a0053c1e0487ea2cd6bc366b8f60
--- /dev/null
+++ b/thrust/internal/rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to add a THRUST_
+# prefix to CUB's namespace macro.
+
+sed -i -e 's/CUB_NS_P/THRUST_CUB_NS_P/g' `find . -type f`
+
diff --git a/thrust/internal/reverse_rename_cub_namespace.sh b/thrust/internal/reverse_rename_cub_namespace.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bc4858449577af60ac3acbc2ea745f5444da96c5
--- /dev/null
+++ b/thrust/internal/reverse_rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to undo the
+# renaming of CUB's namespace macro.
+
+sed -i -e 's|THRUST_CUB_NS_P|CUB_NS_P|g' `find . -type f`
+
diff --git a/thrust/internal/scripts/eris_perf.py b/thrust/internal/scripts/eris_perf.py
new file mode 100755
index 0000000000000000000000000000000000000000..5804711019263fb31cdb7207fd13b3f03b26a758
--- /dev/null
+++ b/thrust/internal/scripts/eris_perf.py
@@ -0,0 +1,189 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+from sys import exit 
+
+from os.path import join, dirname, basename, realpath
+
+from csv import DictReader as csv_dict_reader
+
+from subprocess import Popen
+
+from argparse import ArgumentParser as argument_parser
+
+###############################################################################
+
+def printable_cmd(c):
+  """Converts a `list` of `str`s representing a shell command to a printable 
+  `str`."""
+  return " ".join(map(lambda e: '"' + str(e) + '"', c))
+
+###############################################################################
+
+def print_file(p):
+  """Open the path `p` and print its contents to `stdout`."""
+  print "********************************************************************************"
+  with open(p) as f:
+    for line in f:
+      print line,
+  print "********************************************************************************"
+
+###############################################################################
+
+ap = argument_parser(
+  description = (
+    "CUDA Eris driver script: runs a benchmark suite multiple times, combines "
+    "the results, and outputs them in the CUDA Eris performance result format."
+  )
+)
+
+ap.add_argument(
+  "-b", "--benchmark", 
+  help = ("The location of the benchmark suite executable to run."),
+  type = str,
+  default = join(dirname(realpath(__file__)), "bench"), 
+  metavar = "R"
+)
+
+ap.add_argument(
+  "-p", "--postprocess", 
+  help = ("The location of the postprocessing script to run to combine the "
+          "results."),
+  type = str,
+  default = join(dirname(realpath(__file__)), "combine_benchmark_results.py"),
+  metavar = "R"
+)
+
+ap.add_argument(
+  "-r", "--runs", 
+  help = ("Run the benchmark suite `R` times.a),"),
+  type = int, default = 5, 
+  metavar = "R"
+)
+
+args = ap.parse_args()
+
+if args.runs <= 0:
+  print "ERROR: `--runs` must be greater than `0`."
+  ap.print_help()
+  exit(1)
+
+BENCHMARK_EXE             = args.benchmark
+BENCHMARK_NAME            = basename(BENCHMARK_EXE)
+POSTPROCESS_EXE           = args.postprocess
+OUTPUT_FILE_NAME          = lambda i: BENCHMARK_NAME + "_" + str(i) + ".csv"
+COMBINED_OUTPUT_FILE_NAME = BENCHMARK_NAME + "_combined.csv"
+
+###############################################################################
+
+print '&&&& RUNNING {0}'.format(BENCHMARK_NAME)
+
+print '#### RUNS {0}'.format(args.runs)
+
+###############################################################################
+
+print '#### CMD {0}'.format(BENCHMARK_EXE)
+
+for i in xrange(args.runs):
+  with open(OUTPUT_FILE_NAME(i), "w") as output_file:
+    print '#### RUN {0} OUTPUT -> {1}'.format(i, OUTPUT_FILE_NAME(i))
+
+    p = None
+
+    try:
+      p = Popen(BENCHMARK_EXE, stdout = output_file, stderr = output_file)
+      p.communicate()
+    except OSError as ex:
+      print_file(OUTPUT_FILE_NAME(i))
+      print '#### ERROR Caught OSError `{0}`.'.format(ex)
+      print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+      exit(-1)
+
+  print_file(OUTPUT_FILE_NAME(i))
+
+  if p.returncode != 0:
+    print '#### ERROR Process exited with code {0}.'.format(p.returncode)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(p.returncode)
+
+###############################################################################
+
+post_cmd = [POSTPROCESS_EXE]
+
+# Add dependent variable options.
+post_cmd += ["-dSTL Average Walltime,STL Walltime Uncertainty,STL Trials"]
+post_cmd += ["-dSTL Average Throughput,STL Throughput Uncertainty,STL Trials"]
+post_cmd += ["-dThrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials"]
+post_cmd += ["-dThrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"]
+
+post_cmd += [OUTPUT_FILE_NAME(i) for i in range(args.runs)] 
+
+print '#### CMD {0}'.format(printable_cmd(post_cmd))
+
+with open(COMBINED_OUTPUT_FILE_NAME, "w") as output_file:
+  p = None
+
+  try:
+    p = Popen(post_cmd, stdout = output_file, stderr = output_file)
+    p.communicate()
+  except OSError as ex:
+    print_file(COMBINED_OUTPUT_FILE_NAME)
+    print '#### ERROR Caught OSError `{0}`.'.format(ex)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(-1)
+
+  print_file(COMBINED_OUTPUT_FILE_NAME)
+
+  if p.returncode != 0:
+    print '#### ERROR Process exited with code {0}.'.format(p.returncode)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(p.returncode)
+
+  with open(COMBINED_OUTPUT_FILE_NAME) as input_file:
+    reader = csv_dict_reader(input_file)
+
+    variable_units = reader.next() # Get units header row.
+
+    distinguishing_variables = reader.fieldnames
+
+    measured_variables = [
+      ("STL Average Throughput",    "+"),
+      ("Thrust Average Throughput", "+")
+    ]
+
+    for record in reader:
+      for variable, directionality in measured_variables:
+        # Don't monitor regressions for STL implementations, nvbug 28980890:
+        if "STL" in variable:
+          continue
+        print "&&&& PERF {0}_{1}_{2}bit_{3}mib_{4} {5} {6}{7}".format(
+          record["Algorithm"],
+          record["Element Type"],
+          record["Element Size"],
+          record["Total Input Size"],
+          variable.replace(" ", "_").lower(),
+          record[variable],
+          directionality,
+          variable_units[variable]
+        )
+
+###############################################################################
+                  
+print '&&&& PASSED {0}'.format(BENCHMARK_NAME)
+
diff --git a/thrust/internal/scripts/refresh_from_github2.sh b/thrust/internal/scripts/refresh_from_github2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fb4a2aff17c48003d3f2fd5bf818780500117e88
--- /dev/null
+++ b/thrust/internal/scripts/refresh_from_github2.sh
@@ -0,0 +1,96 @@
+branch="master"
+
+while getopts "hb:c:" opt; do
+    case $opt in
+        h)
+        echo "Usage: $0 [-h] [-b <github_branch_name>] -c <P4_changelist>"
+        exit 1
+        ;;
+
+        b)
+        branch=$OPTARG
+        ;;
+
+        c)
+        changelist=$OPTARG
+        ;;
+
+        /?)
+        echo "Invalid option: -$OPTARG" >&2;
+        exit 1
+        ;;
+
+        :)
+        echo "Option -$OPTARG requires an argument";
+        exit 1
+        ;;
+    esac
+done
+
+if [ "$changelist" == "" ]; then
+    echo "Missing required option -c to specify P4 changelist to put changed files into"
+    exit 1
+fi
+
+# Cause script to exit on any command that results in an error
+set -e
+
+echo "Downloading thrust code from the $branch branch into /tmp/thrust-${branch}"
+rm -rf /tmp/thrust-${branch}
+git clone -q git://github.com/thrust/thrust.git -b ${branch} /tmp/thrust-${branch}
+
+cd `dirname $0`/../..
+echo "Changed current directory to `pwd`"
+
+vulcan_files=`echo *.vlcc *.vlct` 
+logdir=`mktemp -d /tmp/tmp.XXXXXXXX`
+echo "Logging p4 command outputs to temporary directory $logdir"
+for i in *; do
+    if [[ "$i" != "internal" && "$i" != "Makefile" ]]; then
+        ii="$i";
+        if [ -d $i ]; then ii="$i/..."; fi
+        echo "Reverting, force syncing, and then removing $ii"
+        p4 revert $ii >> $logdir/$i.revert.log 2>&1
+        p4 sync -f $ii >> $logdir/$i.sync.log 2>&1
+        rm -rf $i
+    fi
+done
+
+echo "Copying downloaded thrust code to p4 client"
+cp -R /tmp/thrust-${branch}/* .
+find . -name ".gitignore" | xargs -n 1 rm
+
+echo "Checking if version has been bumped"
+new_version=`grep "#define THRUST_VERSION" thrust/version.h | sed -e "s/#define THRUST_VERSION //"`
+old_version=`p4 print thrust/version.h | grep "#define THRUST_VERSION" | sed -e "s/#define THRUST_VERSION //"`
+if [ "$new_version" != "$old_version" ]; then
+    p4 edit internal/test/version.gold
+    new_version_print="$(( $new_version / 100000 )).$(( ($new_version / 100) % 1000 )).$(( $new_version % 100 ))"
+    sed -e "s/v[0-9\.][0-9\.]*/v${new_version_print}/" internal/test/version.gold > internal/test/version.gold.tmp
+    mv internal/test/version.gold.tmp internal/test/version.gold
+    echo "Updated version.gold to version $new_version_print"
+else
+    echo "Version has not changed"
+fi
+
+echo "Reconciling changed code into changelist $changelist"
+p4 reconcile -c $changelist ... >> $logdir/reconcile.log 2>&1
+p4 revert -c $changelist Makefile $vulcan_files internal/... >> $logdir/internal_files_revert.log 2>&1
+
+echo "Looking for examples that were added"
+for e in `find examples -name "*.cu"`; do
+    if [ ! -e internal/build/`basename $e .cu`.mk ]; then
+	echo "ADDED: `basename $e .cu`";
+    fi
+done
+
+echo "Looking for examples that were deleted or moved"
+for e in `find internal/build -name "*.mk"`; do
+    ee=`basename $e .mk`
+    case "$ee" in
+	generic_example | unittester* | warningstester) continue;;
+    esac
+    if [  "`find examples -name $ee.cu`" == "" ]; then
+	echo "DELETED: $ee";
+    fi;
+done
diff --git a/thrust/internal/scripts/tounix b/thrust/internal/scripts/tounix
new file mode 100755
index 0000000000000000000000000000000000000000..c39a054a1d0ab5355620e7ab02d8f509563a5e25
--- /dev/null
+++ b/thrust/internal/scripts/tounix
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# converts all files in the current directory with extensions .h .inl or .cu to unix format
+
+#find . -type f \( -name "*.h" -o -name "*.inl" -o -name "*.cu" \) -a \( -not -wholename "*\.hg/*" \) -print
+find . -type f \( -name "*.h" -o -name "*.inl" -o -name "*.cu" \) -a \( -not -wholename "*\.hg/*" \) -exec fromdos -d {} \;
+
diff --git a/thrust/internal/scripts/wiki2tex.py b/thrust/internal/scripts/wiki2tex.py
new file mode 100644
index 0000000000000000000000000000000000000000..67f658b2d6fd9d08f7e6e0d4eb468b9905ab43d9
--- /dev/null
+++ b/thrust/internal/scripts/wiki2tex.py
@@ -0,0 +1,194 @@
+'''
+Convert Google Code .wiki files into .tex formatted files.
+
+Output is designed to be included within a larger TeX project, it is
+not standalone.
+
+'''
+
+import sys
+import re
+import codecs
+
+print(sys.argv)
+
+'''
+A "rule" is a begin tag, an end tag, and how to reformat the inner text
+(function)
+'''
+
+def encase(pre, post, strip=False):
+    """Return a function that prepends pre and postpends post"""
+    def f(txt):
+        if strip:
+            return pre + txt.strip() + post
+        else:
+            return pre + txt + post
+    return f
+
+def constant(text):
+    def f(txt):
+        return text
+    return f
+
+def encase_with_rules(pre, post, rules, strip=False):
+    def f(txt):
+        if strip:
+            return pre + apply_rules(txt, rules).strip() + post
+        else:
+            return pre + apply_rules(txt, rules) + post
+    return f
+
+def encase_escape_underscore(pre, post):
+    def f(txt):
+        txt = sub(r'_', r'\_', txt)
+        return pre + txt + post
+    return f
+
+def sub(pat, repl, txt):
+    """Substitute in repl for pat in txt, txt can be multiple lines"""
+    return re.compile(pat, re.MULTILINE).sub(repl, txt)
+
+def process_list(rules):
+    def f(txt):
+        txt = '  *' + txt # was removed to match begin tag of list
+        res = '\\begin{itemize}\n'
+        for ln in txt.split('\n'):
+            # Convert "  *" to "\item "
+            ln = sub(r'^  \*', r'\\item ', ln)
+            res += apply_rules(ln, rules) + '\n'
+        res += '\\end{itemize}\n'
+        return res
+    return f
+
+def process_link(rules):
+    def f(txt):
+        lst = txt.split(' ')
+        lnk = lst[0]
+        desc = apply_rules(' '.join(lst[1:]), rules)
+        if lnk[:7] == 'http://':
+            desc = apply_rules(' '.join(lst[1:]), rules)
+            return r'\href{' + lnk + r'}{' + desc + r'}'
+        if len(lst) > 1:
+            return r'\href{}{' + desc + r'}'
+        return r'\href{}{' + lnk + r'}'
+    return f
+
+# Some rules can be used inside some other rules (backticks in section names)
+
+link_rules = [
+    ['_', '', constant(r'\_')],
+]
+
+section_rules = [
+    ['`', '`', encase_escape_underscore(r'\texttt{', r'}')],
+]
+
+item_rules = [
+    ['`', '`', encase(r'\verb|', r'|')],
+    ['[', ']', process_link(link_rules)],
+]
+
+# Main rules for Latex formatting
+
+rules = [
+    ['{{{', '}}}', encase(r'\begin{lstlisting}[language=c++]', r'\end{lstlisting}')],
+    ['[', ']', process_link(link_rules)],
+    ['  *', '\n\n', process_list(item_rules)],
+    ['"', '"', encase("``", "''")],
+    ['`', '`', encase(r'\verb|', r'|')],
+    ['*', '*', encase(r'\emph{', r'}')],
+    ['_', '_', encase(r'\emph{', r'}')],
+    ['==', '==', encase_with_rules(r'\section{', r'}', section_rules, True)],
+    ['=', '=', encase_with_rules(r'\chapter{', r'}', section_rules, True)],
+    ['(e.g. f(x) -> y and f(x,y) -> ', 'z)', constant(r'(e.g. $f(x)\to y$ and $f(x,y)\to z$)')],
+]
+
+def match_rules(txt, rules):
+    """Find rule that first matches in txt"""
+    # Find first begin tag
+    first_begin_loc = 10e100
+    matching_rule = None
+    for rule in rules:
+        begin_tag, end_tag, func = rule
+        loc = txt.find(begin_tag)
+        if loc > -1 and loc < first_begin_loc:
+            first_begin_loc = loc
+            matching_rule = rule
+    return (matching_rule, first_begin_loc)
+
+def apply_rules(txt, rules):
+    """Apply set of rules to give txt, return transformed version of txt"""
+    matching_rule, first_begin_loc = match_rules(txt, rules)
+    if matching_rule is None:
+        return txt
+    begin_tag, end_tag, func = matching_rule
+    end_loc = txt.find(end_tag, first_begin_loc + 1)
+    if end_loc == -1:
+        sys.exit('Could not find end tag {0} after position {1}'.format(end_tag, first_begin_loc + 1))
+    inner_txt = txt[first_begin_loc + len(begin_tag) : end_loc]
+    # Copy characters up until begin tag
+    # Then have output of rule function on inner text
+    new_txt_start = txt[:first_begin_loc] + func(inner_txt)
+    # Follow with the remaining processed text
+    remaining_txt = txt[end_loc + len(end_tag):]
+    return new_txt_start + apply_rules(remaining_txt, rules)
+
+def split_sections(contents):
+    """Given one string of all file contents, return list of sections
+    
+    Return format is list of pairs, each pair has section title
+    and list of lines.  Result is ordered as the original input.
+
+    """
+    res = []
+    cur_section = ''
+    section = []
+    for ln in contents.split('\n'):
+        if len(ln) > 0 and ln[0] == '=':
+            # remove = formatting from line
+            section_title = sub(r'^\=+ (.*) \=+', r'\1', ln)
+            res.append((cur_section, section))
+            cur_section = section_title
+            section = [ln]
+        else:
+            section.append(ln)
+    res.append((cur_section, section))
+    return res
+
+def filter_sections(splitinput, removelst):
+    """Take split input and remove sections in removelst"""
+    res = []
+    for sectname, sectcontents in splitinput:
+        if sectname in removelst:
+            pass
+        else:
+            res.extend(sectcontents)
+    # convert to single string for output
+    return '\n'.join(res)
+
+
+def main():
+    infile = codecs.open(sys.argv[1], encoding='utf-8')
+    outfile = codecs.open(sys.argv[2], mode='w', encoding='utf-8')
+    
+    contents = infile.read()
+    
+    # Remove first three lines
+    contents = '\n'.join(contents.split('\n')[3:])
+    
+    # Split sections and filter out some of them
+    sections = split_sections(contents)
+    contents = filter_sections(sections, ['Introduction', 'Prerequisites', 'Simple Example'])
+    
+    # Convert to latex format
+    contents = apply_rules(contents, rules)
+    
+    infile.close()
+    outfile.write(contents)
+    outfile.close()
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/thrust/internal/test/dvstest.lst b/thrust/internal/test/dvstest.lst
new file mode 100755
index 0000000000000000000000000000000000000000..ffe580f088f3a3f9ad9285ce2fdfc6820500175b
--- /dev/null
+++ b/thrust/internal/test/dvstest.lst
@@ -0,0 +1,425 @@
+TestAdjacentDifference
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceDispatchExplicit
+TestAdjacentDifferenceDispatchImplicit
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAllOfDevice
+TestAllOfDispatchExplicit
+TestAllOfDispatchImplicit
+TestAllOfHost
+TestAnyOfDevice
+TestAnyOfDispatchExplicit
+TestAnyOfDispatchImplicit
+TestAnyOfHost
+TestComputeCapability
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDispatchExplicit
+TestCopyDispatchImplicit
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfDispatchExplicit
+TestCopyIfDispatchImplicit
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilDispatchExplicit
+TestCopyIfStencilDispatchImplicit
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountDispatchExplicit
+TestCountDispatchImplicit
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestFill
+TestFillDiscardIterator
+TestFillDispatchExplicit
+TestFillDispatchImplicit
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDiscardIterator
+TestFillNDispatchExplicit
+TestFillNDispatchImplicit
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestForEach
+TestForEachDispatchExplicit
+TestForEachDispatchImplicit
+TestForEachN
+TestForEachNDispatchExplicit
+TestForEachNDispatchImplicit
+TestForEachNSimpleAnySystem
+TestForEachNSimpleDevice
+TestForEachNSimpleHost
+TestForEachNWithLargeTypes
+TestForEachSimpleAnySystem
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherDispatchExplicit
+TestGatherDispatchImplicit
+TestGatherIf
+TestGatherIfDispatchExplicit
+TestGatherIfDispatchImplicit
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateDispatchExplicit
+TestGenerateDispatchImplicit
+TestGenerateNDispatchExplicit
+TestGenerateNDispatchImplicit
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestInnerProduct
+TestInnerProductDispatchExplicit
+TestInnerProductDispatchImplicit
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPlainOldData
+TestIsTrivialIterator
+TestMaxActiveBlocks
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementDispatchExplicit
+TestMaxElementDispatchImplicit
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMerge
+TestMergeDescending
+TestMergeDispatchExplicit
+TestMergeDispatchImplicit
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeToDiscardIterator
+TestMinElement
+TestMinElementDispatchExplicit
+TestMinElementDispatchImplicit
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementDispatchExplicit
+TestMinMaxElementDispatchImplicit
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestNoneOfDevice
+TestNoneOfDispatchExplicit
+TestNoneOfDispatchImplicit
+TestNoneOfHost
+TestPartition
+TestPartitionCopy
+TestPartitionCopyDispatchExplicit
+TestPartitionCopyDispatchImplicit
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyStencil
+TestPartitionCopyStencilDispatchExplicit
+TestPartitionCopyStencilDispatchImplicit
+TestPartitionCopyStencilSimpleDevice
+TestPartitionCopyStencilSimpleHost
+TestPartitionCopyStencilToDiscardIterator
+TestPartitionCopyToDiscardIterator
+TestPartitionDispatchExplicit
+TestPartitionDispatchImplicit
+TestPartitionPointDevice
+TestPartitionPointDispatchExplicit
+TestPartitionPointDispatchImplicit
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionStencil
+TestPartitionStencilDispatchExplicit
+TestPartitionStencilDispatchImplicit
+TestPartitionStencilSimpleDevice
+TestPartitionStencilSimpleHost
+TestPartitionStencilZipIteratorDevice
+TestPartitionStencilZipIteratorHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestReduce
+TestReduceByKey
+TestReduceByKeyDispatchExplicit
+TestReduceByKeyDispatchImplicit
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceCountingIterator
+TestReduceDispatchExplicit
+TestReduceDispatchImplicit
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyDispatchExplicit
+TestRemoveCopyDispatchImplicit
+TestRemoveCopyIf
+TestRemoveCopyIfDispatchExplicit
+TestRemoveCopyIfDispatchImplicit
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilDispatchExplicit
+TestRemoveCopyIfStencilDispatchImplicit
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveDispatchExplicit
+TestRemoveDispatchImplicit
+TestRemoveIf
+TestRemoveIfDispatchExplicit
+TestRemoveIfDispatchImplicit
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilDispatchExplicit
+TestRemoveIfStencilDispatchImplicit
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyDispatchExplicit
+TestReplaceCopyDispatchImplicit
+TestReplaceCopyIf
+TestReplaceCopyIfDispatchExplicit
+TestReplaceCopyIfDispatchImplicit
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilDispatchExplicit
+TestReplaceCopyIfStencilDispatchImplicit
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceDispatchExplicit
+TestReplaceDispatchImplicit
+TestReplaceIf
+TestReplaceIfDispatchExplicit
+TestReplaceIfDispatchImplicit
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilDispatchExplicit
+TestReplaceIfStencilDispatchImplicit
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopyDispatchExplicit
+TestReverseCopyDispatchImplicit
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseDispatchExplicit
+TestReverseDispatchImplicit
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestSetIntersection
+TestSetIntersectionDispatchExplicit
+TestSetIntersectionDispatchImplicit
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceDispatchExplicit
+TestSetSymmetricDifferenceDispatchImplicit
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionDispatchExplicit
+TestSetUnionDispatchImplicit
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopyDispatchExplicit
+TestStablePartitionCopyDispatchImplicit
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyStencil
+TestStablePartitionCopyStencilDispatchExplicit
+TestStablePartitionCopyStencilDispatchImplicit
+TestStablePartitionCopyStencilSimpleDevice
+TestStablePartitionCopyStencilSimpleHost
+TestStablePartitionCopyStencilToDiscardIterator
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionDispatchExplicit
+TestStablePartitionDispatchImplicit
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionStencil
+TestStablePartitionStencilDispatchExplicit
+TestStablePartitionStencilDispatchImplicit
+TestStablePartitionStencilSimpleDevice
+TestStablePartitionStencilSimpleHost
+TestStablePartitionStencilZipIteratorDevice
+TestStablePartitionStencilZipIteratorHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestTransformBinary
+TestTransformBinaryCountingIterator
+TestTransformBinaryDispatchExplicit
+TestTransformBinaryDispatchImplicit
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformExclusiveScanDispatchExplicit
+TestTransformExclusiveScanDispatchImplicit
+TestTransformIfBinary
+TestTransformIfBinaryDispatchExplicit
+TestTransformIfBinaryDispatchImplicit
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryDispatchExplicit
+TestTransformIfUnaryDispatchImplicit
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilDispatchExplicit
+TestTransformIfUnaryNoStencilDispatchImplicit
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformInclusiveScanDispatchExplicit
+TestTransformInclusiveScanDispatchImplicit
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIterator
+TestTransformUnaryDispatchExplicit
+TestTransformUnaryDispatchImplicit
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeyCopyDispatchExplicit
+TestUniqueByKeyCopyDispatchImplicit
+TestUniqueByKeyDispatchExplicit
+TestUniqueByKeyDispatchImplicit
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopyDispatchExplicit
+TestUniqueCopyDispatchImplicit
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueDispatchExplicit
+TestUniqueDispatchImplicit
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorBinarySearch
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchDispatchExplicit
+TestVectorBinarySearchDispatchImplicit
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorLowerBound
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundDispatchExplicit
+TestVectorLowerBoundDispatchImplicit
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorUpperBound
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundDispatchExplicit
+TestVectorUpperBoundDispatchImplicit
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
diff --git a/thrust/internal/test/thrust.example.arbitrary_transformation.filecheck b/thrust/internal/test/thrust.example.arbitrary_transformation.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..81b25ae230301c9fdd4dcd25125570de5489a5d7
--- /dev/null
+++ b/thrust/internal/test/thrust.example.arbitrary_transformation.filecheck
@@ -0,0 +1,5 @@
+     CHECK: 3 + 6 * 2 = 15
+CHECK-NEXT: 4 + 7 * 5 = 39
+CHECK-NEXT: 0 + 2 * 7 = 14
+CHECK-NEXT: 8 + 1 * 4 = 12
+CHECK-NEXT: 2 + 8 * 3 = 26
diff --git a/thrust/internal/test/thrust.example.basic_vector.filecheck b/thrust/internal/test/thrust.example.basic_vector.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..ab17b825162e0f05a1596a5253f80d5049ebb501
--- /dev/null
+++ b/thrust/internal/test/thrust.example.basic_vector.filecheck
@@ -0,0 +1,8 @@
+     CHECK: H has size 4
+CHECK-NEXT: H[0] = 14
+CHECK-NEXT: H[1] = 20
+CHECK-NEXT: H[2] = 38
+CHECK-NEXT: H[3] = 46
+CHECK-NEXT: H now has size 2
+CHECK-NEXT: D[0] = 99
+CHECK-NEXT: D[1] = 88
diff --git a/thrust/internal/test/thrust.example.bounding_box.filecheck b/thrust/internal/test/thrust.example.bounding_box.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..ddbe4a20127588cb790e35902a1e51bdde3100f4
--- /dev/null
+++ b/thrust/internal/test/thrust.example.bounding_box.filecheck
@@ -0,0 +1 @@
+     CHECK: bounding box (0.000022,0.037300) (0.967956,0.995085)
diff --git a/thrust/internal/test/thrust.example.bucket_sort2d.filecheck b/thrust/internal/test/thrust.example.bucket_sort2d.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..688e49cbaebf5a41b7b9b1f7be988f1eb40eec5d
--- /dev/null
+++ b/thrust/internal/test/thrust.example.bucket_sort2d.filecheck
@@ -0,0 +1,55 @@
+     CHECK: bucket (150, 50)'s list of points:
+CHECK-NEXT: (0.751041,0.505377)
+CHECK-NEXT: (0.750647,0.505272)
+CHECK-NEXT: (0.752243,0.509601)
+CHECK-NEXT: (0.750937,0.503519)
+CHECK-NEXT: (0.753879,0.506217)
+CHECK-NEXT: (0.754956,0.501953)
+CHECK-NEXT: (0.754439,0.502353)
+CHECK-NEXT: (0.754128,0.501410)
+CHECK-NEXT: (0.750917,0.502195)
+CHECK-NEXT: (0.754024,0.507150)
+CHECK-NEXT: (0.750565,0.502896)
+CHECK-NEXT: (0.753444,0.509374)
+CHECK-NEXT: (0.754874,0.506500)
+CHECK-NEXT: (0.754646,0.508721)
+CHECK-NEXT: (0.753527,0.504378)
+CHECK-NEXT: (0.754563,0.502366)
+CHECK-NEXT: (0.751227,0.502014)
+CHECK-NEXT: (0.753009,0.508329)
+CHECK-NEXT: (0.752284,0.500607)
+CHECK-NEXT: (0.753341,0.503853)
+CHECK-NEXT: (0.751787,0.501364)
+CHECK-NEXT: (0.750171,0.500588)
+CHECK-NEXT: (0.752243,0.501621)
+CHECK-NEXT: (0.752056,0.509570)
+CHECK-NEXT: (0.752263,0.507172)
+CHECK-NEXT: (0.754024,0.501935)
+CHECK-NEXT: (0.751538,0.500686)
+CHECK-NEXT: (0.754024,0.508004)
+CHECK-NEXT: (0.750358,0.506688)
+CHECK-NEXT: (0.751083,0.505733)
+CHECK-NEXT: (0.750150,0.505805)
+CHECK-NEXT: (0.750585,0.505232)
+CHECK-NEXT: (0.753838,0.508040)
+CHECK-NEXT: (0.750461,0.501308)
+CHECK-NEXT: (0.753527,0.501546)
+CHECK-NEXT: (0.751145,0.508224)
+CHECK-NEXT: (0.751953,0.506566)
+CHECK-NEXT: (0.750378,0.502955)
+CHECK-NEXT: (0.751704,0.507102)
+CHECK-NEXT: (0.754646,0.502674)
+CHECK-NEXT: (0.750772,0.501464)
+CHECK-NEXT: (0.752325,0.502761)
+CHECK-NEXT: (0.752408,0.502305)
+CHECK-NEXT: (0.751000,0.508639)
+CHECK-NEXT: (0.754252,0.506525)
+CHECK-NEXT: (0.753175,0.504877)
+CHECK-NEXT: (0.753071,0.502682)
+CHECK-NEXT: (0.750109,0.503627)
+CHECK-NEXT: (0.754936,0.506406)
+CHECK-NEXT: (0.754521,0.500953)
+CHECK-NEXT: (0.753941,0.509584)
+CHECK-NEXT: (0.754915,0.504699)
+CHECK-NEXT: (0.751476,0.509525)
+CHECK-NEXT: (0.752823,0.507129)
diff --git a/thrust/internal/test/thrust.example.constant_iterator.filecheck b/thrust/internal/test/thrust.example.constant_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..53733577b6f2f42e351e7098222d435b91c22167
--- /dev/null
+++ b/thrust/internal/test/thrust.example.constant_iterator.filecheck
@@ -0,0 +1,4 @@
+     CHECK: 13
+CHECK-NEXT: 17
+CHECK-NEXT: 12
+CHECK-NEXT: 15
diff --git a/thrust/internal/test/thrust.example.counting_iterator.filecheck b/thrust/internal/test/thrust.example.counting_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..b84601bbca991a8d4b176ac3dbad8de2c8ad8539
--- /dev/null
+++ b/thrust/internal/test/thrust.example.counting_iterator.filecheck
@@ -0,0 +1,5 @@
+     CHECK: found 4 nonzero values at indices:
+CHECK-NEXT: 1
+CHECK-NEXT: 2
+CHECK-NEXT: 5
+CHECK-NEXT: 7
diff --git a/thrust/internal/test/thrust.example.cuda.async_reduce.filecheck b/thrust/internal/test/thrust.example.cuda.async_reduce.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck b/thrust/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..a1af14e69b21d4fac0e4e1ab3f5501b05463454f
--- /dev/null
+++ b/thrust/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck
@@ -0,0 +1,16 @@
+     CHECK: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): allocating new block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::free_all()
diff --git a/thrust/internal/test/thrust.example.cuda.fallback_allocator.filecheck b/thrust/internal/test/thrust.example.cuda.fallback_allocator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..535fc87fa391f34797661c5a5cc1c6c2341e1431
--- /dev/null
+++ b/thrust/internal/test/thrust.example.cuda.fallback_allocator.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Testing fallback_allocator on device
+CHECK-SAME: with {{[0-9]+}} bytes of device memory
+     CHECK: attempting to sort {{[0-9]+}} values
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK: attempting to sort {{[0-9]+}} values
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK:   allocated {{[0-9]+}} bytes of pinned host memory (fallback successful)
diff --git a/thrust/internal/test/thrust.example.cuda.global_device_vector.filecheck b/thrust/internal/test/thrust.example.cuda.global_device_vector.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.cuda.range_view.filecheck b/thrust/internal/test/thrust.example.cuda.range_view.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..83e3127d76a40c7e77496ad43de308bf01b0ad20
--- /dev/null
+++ b/thrust/internal/test/thrust.example.cuda.range_view.filecheck
@@ -0,0 +1,4 @@
+     CHECK: z[0]= 7
+CHECK-NEXT: z[1]= 8
+CHECK-NEXT: z[2]= 9
+CHECK-NEXT: z[3]= 10
diff --git a/thrust/internal/test/thrust.example.cuda.unwrap_pointer.filecheck b/thrust/internal/test/thrust.example.cuda.unwrap_pointer.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.cuda.wrap_pointer.filecheck b/thrust/internal/test/thrust.example.cuda.wrap_pointer.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.device_ptr.filecheck b/thrust/internal/test/thrust.example.device_ptr.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..b02b515881f75150430116f973cc7c446f93ed37
--- /dev/null
+++ b/thrust/internal/test/thrust.example.device_ptr.filecheck
@@ -0,0 +1,2 @@
+     CHECK: device array contains 10 values
+CHECK-NEXT: sum of values is 45
diff --git a/thrust/internal/test/thrust.example.discrete_voronoi.filecheck b/thrust/internal/test/thrust.example.discrete_voronoi.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..3dbf65cf5367440bee75758954fb2b54cc6d52c1
--- /dev/null
+++ b/thrust/internal/test/thrust.example.discrete_voronoi.filecheck
@@ -0,0 +1,11 @@
+     CHECK: [Inititialize {{[0-9]+}}x{{[0-9]+}} Image]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [Copy to Device]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [JFA stepping]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT:   ( {{[0-9.]+}} MPixel/s ) 
+CHECK-NEXT: [Device to Host Copy]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [PGM Export]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
diff --git a/thrust/internal/test/thrust.example.dot_products_with_zip.filecheck b/thrust/internal/test/thrust.example.dot_products_with_zip.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..a8a1b3e3e44838ba03e75509184df04cc6496306
--- /dev/null
+++ b/thrust/internal/test/thrust.example.dot_products_with_zip.filecheck
@@ -0,0 +1,4 @@
+     CHECK: (0.000022,0.000022,0.000022) * (0.000022,0.000022,0.000022) = 0.000000
+CHECK-NEXT: (0.085032,0.085032,0.085032) * (0.085032,0.085032,0.085032) = 0.021692
+CHECK-NEXT: (0.601353,0.601353,0.601353) * (0.601353,0.601353,0.601353) = 1.084875
+CHECK-NEXT: (0.891611,0.891611,0.891611) * (0.891611,0.891611,0.891611) = 2.384912
diff --git a/thrust/internal/test/thrust.example.expand.filecheck b/thrust/internal/test/thrust.example.expand.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..a43241087772aaf9835fe720a9e59ddbfdf764a4
--- /dev/null
+++ b/thrust/internal/test/thrust.example.expand.filecheck
@@ -0,0 +1,4 @@
+     CHECK: Expanding values according to counts
+CHECK-NEXT:  counts 3 5 2 0 1 3 4 2 4 
+CHECK-NEXT:  values 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT:  output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 
diff --git a/thrust/internal/test/thrust.example.fill_copy_sequence.filecheck b/thrust/internal/test/thrust.example.fill_copy_sequence.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..78f3acda2565e990b584323684e369e608accd40
--- /dev/null
+++ b/thrust/internal/test/thrust.example.fill_copy_sequence.filecheck
@@ -0,0 +1,10 @@
+     CHECK: D[0] = 0
+CHECK-NEXT: D[1] = 1
+CHECK-NEXT: D[2] = 2
+CHECK-NEXT: D[3] = 3
+CHECK-NEXT: D[4] = 4
+CHECK-NEXT: D[5] = 9
+CHECK-NEXT: D[6] = 9
+CHECK-NEXT: D[7] = 1
+CHECK-NEXT: D[8] = 1
+CHECK-NEXT: D[9] = 1
diff --git a/thrust/internal/test/thrust.example.histogram.filecheck b/thrust/internal/test/thrust.example.histogram.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..bb5dbdba171c7637b1b749d1fed3acbb15de7b8e
--- /dev/null
+++ b/thrust/internal/test/thrust.example.histogram.filecheck
@@ -0,0 +1,10 @@
+     CHECK: Dense Histogram
+CHECK-NEXT:           initial data  3 4 3 5 8 5 6 6 4 4 5 3 2 5 6 3 1 3 2 3 6 5 3 3 3 2 4 2 3 3 2 5 5 5 8 2 5 6 6 3 
+CHECK-NEXT:            sorted data  1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 8 8 
+CHECK-NEXT:   cumulative histogram  0 1 7 19 23 32 38 38 40 
+CHECK-NEXT:              histogram  0 1 6 12 4 9 6 0 2 
+CHECK-NEXT: Sparse Histogram
+CHECK-NEXT:           initial data  3 4 3 5 8 5 6 6 4 4 5 3 2 5 6 3 1 3 2 3 6 5 3 3 3 2 4 2 3 3 2 5 5 5 8 2 5 6 6 3 
+CHECK-NEXT:            sorted data  1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 8 8 
+CHECK-NEXT:       histogram values  1 2 3 4 5 6 8 
+CHECK-NEXT:       histogram counts  1 6 12 4 9 6 2 
diff --git a/thrust/internal/test/thrust.example.lambda.filecheck b/thrust/internal/test/thrust.example.lambda.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..2937024bbeb89a7b360ab29505e80aff94fa9e22
--- /dev/null
+++ b/thrust/internal/test/thrust.example.lambda.filecheck
@@ -0,0 +1,10 @@
+     CHECK: SAXPY (functor method)
+CHECK-NEXT: 2 * 1 + 1 = 3
+CHECK-NEXT: 2 * 2 + 1 = 5
+CHECK-NEXT: 2 * 3 + 1 = 7
+CHECK-NEXT: 2 * 4 + 1 = 9
+CHECK-NEXT: SAXPY (placeholder method)
+CHECK-NEXT: 2 * 1 + 1 = 3
+CHECK-NEXT: 2 * 2 + 1 = 5
+CHECK-NEXT: 2 * 3 + 1 = 7
+CHECK-NEXT: 2 * 4 + 1 = 9
diff --git a/thrust/internal/test/thrust.example.lexicographical_sort.filecheck b/thrust/internal/test/thrust.example.lexicographical_sort.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..7d2dc490780b10f11f28105aaf02e77961890351
--- /dev/null
+++ b/thrust/internal/test/thrust.example.lexicographical_sort.filecheck
@@ -0,0 +1,42 @@
+     CHECK: Unsorted Keys
+CHECK-NEXT: (0,2,6)
+CHECK-NEXT: (0,4,4)
+CHECK-NEXT: (6,8,5)
+CHECK-NEXT: (8,6,8)
+CHECK-NEXT: (9,9,4)
+CHECK-NEXT: (1,9,7)
+CHECK-NEXT: (5,1,0)
+CHECK-NEXT: (3,8,1)
+CHECK-NEXT: (2,9,2)
+CHECK-NEXT: (7,2,7)
+CHECK-NEXT: (0,9,0)
+CHECK-NEXT: (5,4,1)
+CHECK-NEXT: (5,3,6)
+CHECK-NEXT: (8,5,5)
+CHECK-NEXT: (5,3,7)
+CHECK-NEXT: (5,7,3)
+CHECK-NEXT: (8,6,4)
+CHECK-NEXT: (9,5,4)
+CHECK-NEXT: (7,5,9)
+CHECK-NEXT: (9,0,9)
+CHECK-NEXT: Sorted Keys
+CHECK-NEXT: (0,2,6)
+CHECK-NEXT: (0,4,4)
+CHECK-NEXT: (0,9,0)
+CHECK-NEXT: (1,9,7)
+CHECK-NEXT: (2,9,2)
+CHECK-NEXT: (3,8,1)
+CHECK-NEXT: (5,1,0)
+CHECK-NEXT: (5,3,6)
+CHECK-NEXT: (5,3,7)
+CHECK-NEXT: (5,4,1)
+CHECK-NEXT: (5,7,3)
+CHECK-NEXT: (6,8,5)
+CHECK-NEXT: (7,2,7)
+CHECK-NEXT: (7,5,9)
+CHECK-NEXT: (8,5,5)
+CHECK-NEXT: (8,6,4)
+CHECK-NEXT: (8,6,8)
+CHECK-NEXT: (9,0,9)
+CHECK-NEXT: (9,5,4)
+CHECK-NEXT: (9,9,4)
diff --git a/thrust/internal/test/thrust.example.max_abs_diff.filecheck b/thrust/internal/test/thrust.example.max_abs_diff.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..a02df644fd34584469a8eb5bb932cb5e091b7d1f
--- /dev/null
+++ b/thrust/internal/test/thrust.example.max_abs_diff.filecheck
@@ -0,0 +1 @@
+     CHECK: maximum absolute difference: 4
diff --git a/thrust/internal/test/thrust.example.minimal_custom_backend.filecheck b/thrust/internal/test/thrust.example.minimal_custom_backend.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..76802325be8da8ca7d0569d920bc14b09d93b8b1
--- /dev/null
+++ b/thrust/internal/test/thrust.example.minimal_custom_backend.filecheck
@@ -0,0 +1 @@
+     CHECK: Hello, world from for_each(my_system)!
diff --git a/thrust/internal/test/thrust.example.minmax.filecheck b/thrust/internal/test/thrust.example.minmax.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..10e41724d32d686bdc03430511050230f21d4d16
--- /dev/null
+++ b/thrust/internal/test/thrust.example.minmax.filecheck
@@ -0,0 +1,3 @@
+     CHECK: [ 10 17 64 90 97 27 56 45 33 76 ]
+CHECK-NEXT: minimum = 10
+CHECK-NEXT: maximum = 97
diff --git a/thrust/internal/test/thrust.example.mode.filecheck b/thrust/internal/test/thrust.example.mode.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..c253cc48399454cbb455e149e150af38ddc0a959
--- /dev/null
+++ b/thrust/internal/test/thrust.example.mode.filecheck
@@ -0,0 +1,9 @@
+     CHECK: initial data
+CHECK-NEXT: 0 0 6 8 9 1 5 3 2 7 0 5 5 8 5 5 8 9 7 9 2 4 8 6 9 9 1 8 9 2 
+CHECK-NEXT: sorted data
+CHECK-NEXT: 0 0 0 1 1 2 2 2 3 4 5 5 5 5 5 6 6 7 7 8 8 8 8 8 9 9 9 9 9 9 
+CHECK-NEXT: values
+CHECK-NEXT: 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: counts
+CHECK-NEXT: 3 2 3 1 1 5 2 2 5 6 
+CHECK-NEXT: Modal value 9 occurs 6 times 
diff --git a/thrust/internal/test/thrust.example.monte_carlo.filecheck b/thrust/internal/test/thrust.example.monte_carlo.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..137aec2746948e37fdd0b91228162a787c421a42
--- /dev/null
+++ b/thrust/internal/test/thrust.example.monte_carlo.filecheck
@@ -0,0 +1 @@
+     CHECK: pi is approximately 3.14
diff --git a/thrust/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck b/thrust/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..8d6bd022bc63f03656f7ca32e2dbb0bf30385e15
--- /dev/null
+++ b/thrust/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck
@@ -0,0 +1 @@
+     CHECK: pi is around 3.1415
diff --git a/thrust/internal/test/thrust.example.mr_basic.filecheck b/thrust/internal/test/thrust.example.mr_basic.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.norm.filecheck b/thrust/internal/test/thrust.example.norm.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..8a8e4203e74642f6132dfe6d3b420da00cbbefce
--- /dev/null
+++ b/thrust/internal/test/thrust.example.norm.filecheck
@@ -0,0 +1 @@
+     CHECK: norm is 5.47723
diff --git a/thrust/internal/test/thrust.example.padded_grid_reduction.filecheck b/thrust/internal/test/thrust.example.padded_grid_reduction.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..ed77e84fd6bfc45802d74ad6a155a5172ab84ccb
--- /dev/null
+++ b/thrust/internal/test/thrust.example.padded_grid_reduction.filecheck
@@ -0,0 +1,13 @@
+     CHECK: padded grid
+CHECK-NEXT:  0.2775 0.7256 0.6979 0.9412 0.4131 0.7202 0.3765 0.4136 0.5766 0.6612 0.4672 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.0137 0.6256 0.1003 0.2374 0.0915 0.0455 0.3187 0.0839 0.8173 0.7281 0.5975 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.2990 0.2693 0.4408 0.1262 0.3812 0.8537 0.9962 0.7528 0.9272 0.7873 0.8984 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.3529 0.5803 0.8900 0.4505 0.0477 0.2683 0.8613 0.0877 0.2438 0.4363 0.6292 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.4561 0.7896 0.6662 0.4988 0.4404 0.6277 0.5752 0.6816 0.1240 0.5018 0.8027 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9527 0.5223 0.9500 0.2376 0.0110 0.7803 0.6221 0.2488 0.7006 0.6347 0.9137 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.0027 0.4972 0.7421 0.4674 0.8961 0.2355 0.9507 0.9211 0.1650 0.4517 0.7143 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.8649 0.2082 0.8464 0.2547 0.4789 0.9534 0.0403 0.6872 0.8964 0.3910 0.2292 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9017 0.1525 0.9041 0.1460 0.1646 0.3839 0.6994 0.0900 0.1671 0.2587 0.5893 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9075 0.2186 0.4626 0.8713 0.7073 0.1520 0.9495 0.4137 0.6746 0.7064 0.5609 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+     CHECK: minimum value: 0.0027
+CHECK-NEXT: maximum value: 0.9962
diff --git a/thrust/internal/test/thrust.example.permutation_iterator.filecheck b/thrust/internal/test/thrust.example.permutation_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..6507af04be147df0ea81f438c741a4fd50288bce
--- /dev/null
+++ b/thrust/internal/test/thrust.example.permutation_iterator.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 130
diff --git a/thrust/internal/test/thrust.example.raw_reference_cast.filecheck b/thrust/internal/test/thrust.example.raw_reference_cast.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..ed23222e9a19e461311696647e95f0a2676f0b35
--- /dev/null
+++ b/thrust/internal/test/thrust.example.raw_reference_cast.filecheck
@@ -0,0 +1,6 @@
+     CHECK: Before A->B Copy
+CHECK-NEXT: A: 0 1 2 3 4 
+CHECK-NEXT: B: 0 0 0 0 0 
+CHECK-NEXT: After A->B Copy
+CHECK-NEXT: A: 0 1 2 3 4 
+CHECK-NEXT: B: 0 1 2 3 4 
diff --git a/thrust/internal/test/thrust.example.remove_points2d.filecheck b/thrust/internal/test/thrust.example.remove_points2d.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..f69f1cd52145ee29552aad63334d4d46d3e8dac7
--- /dev/null
+++ b/thrust/internal/test/thrust.example.remove_points2d.filecheck
@@ -0,0 +1,36 @@
+     CHECK: Generated 20 points
+CHECK-NEXT: (0.000022,0.085032)
+CHECK-NEXT: (0.601353,0.891611)
+CHECK-NEXT: (0.967956,0.189690)
+CHECK-NEXT: (0.514976,0.398008)
+CHECK-NEXT: (0.262906,0.743512)
+CHECK-NEXT: (0.089548,0.560390)
+CHECK-NEXT: (0.582230,0.809567)
+CHECK-NEXT: (0.591919,0.511713)
+CHECK-NEXT: (0.876634,0.995085)
+CHECK-NEXT: (0.726212,0.966611)
+CHECK-NEXT: (0.297102,0.426051)
+CHECK-NEXT: (0.899498,0.652999)
+CHECK-NEXT: (0.901534,0.961533)
+CHECK-NEXT: (0.164713,0.857987)
+CHECK-NEXT: (0.906845,0.294026)
+CHECK-NEXT: (0.936244,0.414645)
+CHECK-NEXT: (0.308457,0.514893)
+CHECK-NEXT: (0.395430,0.789785)
+CHECK-NEXT: (0.689141,0.544273)
+CHECK-NEXT: (0.592407,0.093630)
+     CHECK: After stream compaction, 14 points remain
+CHECK-NEXT: (0.000022,0.085032)
+CHECK-NEXT: (0.967956,0.189690)
+CHECK-NEXT: (0.514976,0.398008)
+CHECK-NEXT: (0.262906,0.743512)
+CHECK-NEXT: (0.089548,0.560390)
+CHECK-NEXT: (0.582230,0.809567)
+CHECK-NEXT: (0.591919,0.511713)
+CHECK-NEXT: (0.297102,0.426051)
+CHECK-NEXT: (0.164713,0.857987)
+CHECK-NEXT: (0.906845,0.294026)
+CHECK-NEXT: (0.308457,0.514893)
+CHECK-NEXT: (0.395430,0.789785)
+CHECK-NEXT: (0.689141,0.544273)
+CHECK-NEXT: (0.592407,0.093630)
diff --git a/thrust/internal/test/thrust.example.repeated_range.filecheck b/thrust/internal/test/thrust.example.repeated_range.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e067aed99c6bb670e57b69d49c271526e7d0897f
--- /dev/null
+++ b/thrust/internal/test/thrust.example.repeated_range.filecheck
@@ -0,0 +1,3 @@
+     CHECK: range        10 20 30 40 
+CHECK-NEXT: repeated x2: 10 10 20 20 30 30 40 40 
+CHECK-NEXT: repeated x3: 10 10 10 20 20 20 30 30 30 40 40 40 
diff --git a/thrust/internal/test/thrust.example.run_length_decoding.filecheck b/thrust/internal/test/thrust.example.run_length_decoding.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..49faef7fc1bf7d35c0af626c9f0e6b217d7b12b4
--- /dev/null
+++ b/thrust/internal/test/thrust.example.run_length_decoding.filecheck
@@ -0,0 +1,4 @@
+     CHECK: run-length encoded input:
+CHECK-NEXT: (a,3)(b,5)(c,1)(d,2)(e,9)(f,2)
+     CHECK: decoded output:
+CHECK-NEXT: aaabbbbbcddeeeeeeeeeff
diff --git a/thrust/internal/test/thrust.example.run_length_encoding.filecheck b/thrust/internal/test/thrust.example.run_length_encoding.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..7d907ab79cdf415efb56698c71baa49f9f132da8
--- /dev/null
+++ b/thrust/internal/test/thrust.example.run_length_encoding.filecheck
@@ -0,0 +1,4 @@
+     CHECK: input data:
+CHECK-NEXT: aaabbbbbcddeeeeeeeeeff
+     CHECK: run-length encoded output:
+CHECK-NEXT: (a,3)(b,5)(c,1)(d,2)(e,9)(f,2)
diff --git a/thrust/internal/test/thrust.example.saxpy.filecheck b/thrust/internal/test/thrust.example.saxpy.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.scan_by_key.filecheck b/thrust/internal/test/thrust.example.scan_by_key.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..b183794b059167c80ca8a628d461b416677e9cbd
--- /dev/null
+++ b/thrust/internal/test/thrust.example.scan_by_key.filecheck
@@ -0,0 +1,16 @@
+     CHECK: Inclusive Segmented Scan w/ Key Sequence
+CHECK-NEXT:  keys          : 0 0 0 1 1 2 2 2 2 3 4 4 5 5 5 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 2 4 6 2 4 2 4 6 8 2 2 4 2 4 6 
+     CHECK: Inclusive Segmented Scan w/ Head Flag Sequence
+CHECK-NEXT:  head flags    : 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 2 4 6 2 4 2 4 6 8 2 2 4 2 4 6 
+     CHECK: Exclusive Segmented Scan w/ Key Sequence
+CHECK-NEXT:  keys          : 0 0 0 1 1 2 2 2 2 3 4 4 5 5 5 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 0 2 4 0 2 0 2 4 6 0 0 2 0 2 4 
+     CHECK: Exclusive Segmented Scan w/ Head Flag Sequence
+CHECK-NEXT:  head flags    : 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 0 2 4 0 2 0 2 4 6 0 0 2 0 2 4 
diff --git a/thrust/internal/test/thrust.example.scan_matrix_by_rows.filecheck b/thrust/internal/test/thrust.example.scan_matrix_by_rows.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.set_operations.filecheck b/thrust/internal/test/thrust.example.set_operations.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..6ccfe8bebda245db7727698820a66af839e91763
--- /dev/null
+++ b/thrust/internal/test/thrust.example.set_operations.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Set A [ 0 2 4 5 6 8 9 ]
+CHECK-NEXT: Set B [ 0 1 2 3 5 7 8 ]
+CHECK-NEXT: Merge(A,B) [ 0 0 1 2 2 3 4 5 5 6 7 8 8 9 ]
+CHECK-NEXT: Union(A,B) [ 0 1 2 3 4 5 6 7 8 9 ]
+CHECK-NEXT: Intersection(A,B) [ 0 2 5 8 ]
+CHECK-NEXT: Difference(A,B) [ 4 6 9 ]
+CHECK-NEXT: SymmetricDifference(A,B) [ 1 3 4 6 7 9 ]
+CHECK-NEXT: SetIntersectionSize(A,B) 4
diff --git a/thrust/internal/test/thrust.example.simple_moving_average.filecheck b/thrust/internal/test/thrust.example.simple_moving_average.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..4fadc201cb1c8e0a4850c2c437e03eb3415e7a51
--- /dev/null
+++ b/thrust/internal/test/thrust.example.simple_moving_average.filecheck
@@ -0,0 +1,29 @@
+     CHECK: data series: [ 0 0 6 9 10 2 5 4 2 8 0 6 6 8 6 5 9 10 7 10 3 4 9 7 9 10 1 9 9 3 ]
+CHECK-NEXT: simple moving averages (window = 4)
+CHECK-NEXT:   [ 0, 4) = 3.75
+CHECK-NEXT:   [ 1, 5) = 6.25
+CHECK-NEXT:   [ 2, 6) = 6.75
+CHECK-NEXT:   [ 3, 7) = 6.5
+CHECK-NEXT:   [ 4, 8) = 5.25
+CHECK-NEXT:   [ 5, 9) = 3.25
+CHECK-NEXT:   [ 6,10) = 4.75
+CHECK-NEXT:   [ 7,11) = 3.5
+CHECK-NEXT:   [ 8,12) = 4
+CHECK-NEXT:   [ 9,13) = 5
+CHECK-NEXT:   [10,14) = 5
+CHECK-NEXT:   [11,15) = 6.5
+CHECK-NEXT:   [12,16) = 6.25
+CHECK-NEXT:   [13,17) = 7
+CHECK-NEXT:   [14,18) = 7.5
+CHECK-NEXT:   [15,19) = 7.75
+CHECK-NEXT:   [16,20) = 9
+CHECK-NEXT:   [17,21) = 7.5
+CHECK-NEXT:   [18,22) = 6
+CHECK-NEXT:   [19,23) = 6.5
+CHECK-NEXT:   [20,24) = 5.75
+CHECK-NEXT:   [21,25) = 7.25
+CHECK-NEXT:   [22,26) = 8.75
+CHECK-NEXT:   [23,27) = 6.75
+CHECK-NEXT:   [24,28) = 7.25
+CHECK-NEXT:   [25,29) = 7.25
+CHECK-NEXT:   [26,30) = 5.5
diff --git a/thrust/internal/test/thrust.example.sort.filecheck b/thrust/internal/test/thrust.example.sort.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..b6450f88d042de5a6ecc165a329c00e7c610d091
--- /dev/null
+++ b/thrust/internal/test/thrust.example.sort.filecheck
@@ -0,0 +1,21 @@
+     CHECK: sorting integers
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  16 28 40 40 54 57 62 77 78 78 79 86 87 93 94 98
+     CHECK: sorting integers (descending)
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  98 94 93 87 86 79 78 78 77 62 57 54 40 40 28 16
+     CHECK: sorting integers (user-defined comparison)
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  16 28 40 40 54 62 78 78 86 94 98 57 77 79 87 93
+     CHECK: sorting floats
+CHECK-NEXT:  7.5 7.5 6.0 7.5 9.0 4.0 8.5 5.5 4.0 1.5 2.5 5.0 7.5 8.5 9.0 9.5
+CHECK-NEXT:  1.5 2.5 4.0 4.0 5.0 5.5 6.0 7.5 7.5 7.5 7.5 8.5 8.5 9.0 9.0 9.5
+     CHECK: sorting pairs
+CHECK-NEXT:  (7,7) (5,7) (9,3) (8,5) (3,0) (2,4) (7,8) (9,9) (7,1) (1,9) (0,5) (3,6) (8,0) (7,6) (4,2) (8,3)
+CHECK-NEXT:  (0,5) (1,9) (2,4) (3,0) (3,6) (4,2) (5,7) (7,1) (7,6) (7,7) (7,8) (8,0) (8,3) (8,5) (9,3) (9,9)
+     CHECK: key-value sorting
+CHECK-NEXT:  (79, 0) (78, 1) (62, 2) (78, 3) (94, 4) (40, 5) (86, 6) (57, 7) (40, 8) (16, 9) (28,10) (54,11) (77,12) (87,13) (93,14) (98,15)
+CHECK-NEXT:  (16, 9) (28,10) (40, 5) (40, 8) (54,11) (57, 7) (62, 2) (77,12) (78, 1) (78, 3) (79, 0) (86, 6) (87,13) (93,14) (94, 4) (98,15)
+     CHECK: key-value sorting (descending)
+CHECK-NEXT:  (79, 0) (78, 1) (62, 2) (78, 3) (94, 4) (40, 5) (86, 6) (57, 7) (40, 8) (16, 9) (28,10) (54,11) (77,12) (87,13) (93,14) (98,15)
+CHECK-NEXT:  (98,15) (94, 4) (93,14) (87,13) (86, 6) (79, 0) (78, 1) (78, 3) (77,12) (62, 2) (57, 7) (54,11) (40, 5) (40, 8) (28,10) (16, 9)
diff --git a/thrust/internal/test/thrust.example.sorting_aos_vs_soa.filecheck b/thrust/internal/test/thrust.example.sorting_aos_vs_soa.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..f29323710a9a1dcd7870a3297ade7958671208bc
--- /dev/null
+++ b/thrust/internal/test/thrust.example.sorting_aos_vs_soa.filecheck
@@ -0,0 +1,2 @@
+     CHECK: AoS sort took {{[0-9.]+}} milliseconds
+CHECK-NEXT: SoA sort took {{[0-9.]+}} milliseconds
diff --git a/thrust/internal/test/thrust.example.sparse_vector.filecheck b/thrust/internal/test/thrust.example.sparse_vector.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..560378d3c835b36359f2be27d05e91b61f0f9a88
--- /dev/null
+++ b/thrust/internal/test/thrust.example.sparse_vector.filecheck
@@ -0,0 +1,4 @@
+     CHECK: Computing C = A + B for sparse vectors A and B
+CHECK-NEXT: A (2,10) (3,60) (5,20) (8,40) 
+CHECK-NEXT: B (1,50) (2,30) (4,80) (5,30) (7,90) (8,10) 
+CHECK-NEXT: C (1,50) (2,40) (3,60) (4,80) (5,50) (7,90) (8,50) 
diff --git a/thrust/internal/test/thrust.example.stream_compaction.filecheck b/thrust/internal/test/thrust.example.stream_compaction.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..eb62ac24c07ffea669146d2b1b778c31587fde17
--- /dev/null
+++ b/thrust/internal/test/thrust.example.stream_compaction.filecheck
@@ -0,0 +1,4 @@
+     CHECK: values: 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: output: 1 3 5 7 9 
+CHECK-NEXT: small_output: 1 3 5 7 9 
+CHECK-NEXT: values: 0 2 4 6 8 
diff --git a/thrust/internal/test/thrust.example.strided_range.filecheck b/thrust/internal/test/thrust.example.strided_range.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..2067ffa17e7c0fe4bb9ef4b05c9700288d9c4dd7
--- /dev/null
+++ b/thrust/internal/test/thrust.example.strided_range.filecheck
@@ -0,0 +1,4 @@
+     CHECK: data: 10 20 30 40 50 60 70 80 
+CHECK-NEXT: sum of even indices: 160
+CHECK-NEXT: sum of odd indices:  200
+CHECK-NEXT: setting odd indices to zero: 10 0 30 0 50 0 70 0 
diff --git a/thrust/internal/test/thrust.example.sum.filecheck b/thrust/internal/test/thrust.example.sum.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..4c7771103335bad9f3a4fde138812388cde864c2
--- /dev/null
+++ b/thrust/internal/test/thrust.example.sum.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 509773
diff --git a/thrust/internal/test/thrust.example.sum_rows.filecheck b/thrust/internal/test/thrust.example.sum_rows.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..ae5f889d7993a2c82fcb02590df1cb6982b69f6f
--- /dev/null
+++ b/thrust/internal/test/thrust.example.sum_rows.filecheck
@@ -0,0 +1,5 @@
+     CHECK: [ 10 17 64 90 97 27 56 45 ] = 406
+CHECK-NEXT: [ 33 76 18 60 62 82 63 56 ] = 450
+CHECK-NEXT: [ 88 99 75 96 36 48 90 68 ] = 600
+CHECK-NEXT: [ 91 96 24 87 91 36 94 47 ] = 566
+CHECK-NEXT: [ 37 56 45 81 72 58 63 18 ] = 430
diff --git a/thrust/internal/test/thrust.example.summary_statistics.filecheck b/thrust/internal/test/thrust.example.summary_statistics.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..92c2470ea6f0beaf59103c519397f62c5601d5e7
--- /dev/null
+++ b/thrust/internal/test/thrust.example.summary_statistics.filecheck
@@ -0,0 +1,10 @@
+     CHECK: ******Summary Statistics Example*****
+CHECK-NEXT: The data: 4 7 13 16 
+CHECK-NEXT: Count              : 4
+CHECK-NEXT: Minimum            : 4
+CHECK-NEXT: Maximum            : 16
+CHECK-NEXT: Mean               : 10
+CHECK-NEXT: Variance           : 30
+CHECK-NEXT: Standard Deviation : 4.74342
+CHECK-NEXT: Skewness           : 0
+CHECK-NEXT: Kurtosis           : 1.36
diff --git a/thrust/internal/test/thrust.example.summed_area_table.filecheck b/thrust/internal/test/thrust.example.summed_area_table.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..98fabffca8b1245ad494d5f9724e05434136d644
--- /dev/null
+++ b/thrust/internal/test/thrust.example.summed_area_table.filecheck
@@ -0,0 +1,22 @@
+     CHECK: [step 0] initial array
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT: [step 1] scan horizontally
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT: [step 2] transpose array
+CHECK-NEXT:        1        1        1 
+CHECK-NEXT:        2        2        2 
+CHECK-NEXT:        3        3        3 
+CHECK-NEXT:        4        4        4 
+CHECK-NEXT: [step 3] scan transpose horizontally
+CHECK-NEXT:        1        2        3 
+CHECK-NEXT:        2        4        6 
+CHECK-NEXT:        3        6        9 
+CHECK-NEXT:        4        8       12 
+CHECK-NEXT: [step 4] transpose the transpose
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        2        4        6        8 
+CHECK-NEXT:        3        6        9       12 
diff --git a/thrust/internal/test/thrust.example.tiled_range.filecheck b/thrust/internal/test/thrust.example.tiled_range.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..2ac310b51e855c248975539fac49c6cb9b8816f5
--- /dev/null
+++ b/thrust/internal/test/thrust.example.tiled_range.filecheck
@@ -0,0 +1,3 @@
+     CHECK: range        10 20 30 40 
+CHECK-NEXT: two tiles:   10 20 30 40 10 20 30 40 
+CHECK-NEXT: three tiles: 10 20 30 40 10 20 30 40 10 20 30 40 
diff --git a/thrust/internal/test/thrust.example.transform_input_output_iterator.filecheck b/thrust/internal/test/thrust.example.transform_input_output_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..caeca2de5b5ad5f26365df84c06c137d3261991e
--- /dev/null
+++ b/thrust/internal/test/thrust.example.transform_input_output_iterator.filecheck
@@ -0,0 +1,2 @@
+     CHECK: Expected [ 1050 2060 3070 4080 ]
+CHECK-NEXT: Result   [ 1050 2060 3070 4080 ]
diff --git a/thrust/internal/test/thrust.example.transform_iterator.filecheck b/thrust/internal/test/thrust.example.transform_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..8d3a4f85244de3624c0cb6a20a73f0d260be4bc6
--- /dev/null
+++ b/thrust/internal/test/thrust.example.transform_iterator.filecheck
@@ -0,0 +1,7 @@
+     CHECK: values         : 2 5 7 1 6 0 3 8 
+CHECK-NEXT: clamped values : 2 5 5 1 5 1 3 5 
+CHECK-NEXT: sum of clamped values : 27
+CHECK-NEXT: sequence         : 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: clamped sequence : 1 1 2 3 4 5 5 5 5 5 
+CHECK-NEXT: negated sequence : -1 -1 -2 -3 -4 -5 -5 -5 -5 -5 
+CHECK-NEXT: negated values : -2 -5 -7 -1 -6 0 -3 -8 
diff --git a/thrust/internal/test/thrust.example.transform_output_iterator.filecheck b/thrust/internal/test/thrust.example.transform_output_iterator.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e1e4a92b52e8bacb61494349f3f73947ac19c3f3
--- /dev/null
+++ b/thrust/internal/test/thrust.example.transform_output_iterator.filecheck
@@ -0,0 +1 @@
+     CHECK: result= [ -0.666667 -2.66667 2 ] 
diff --git a/thrust/internal/test/thrust.example.uninitialized_vector.filecheck b/thrust/internal/test/thrust.example.uninitialized_vector.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/thrust/internal/test/thrust.example.version.filecheck b/thrust/internal/test/thrust.example.version.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..89b4d664a236bf9813bf190cf955d7e33c93902b
--- /dev/null
+++ b/thrust/internal/test/thrust.example.version.filecheck
@@ -0,0 +1 @@
+     CHECK: Thrust v{{[0-9]+[.][0-9]+[.][0-9]+-[0-9]+}}
diff --git a/thrust/internal/test/thrust.example.weld_vertices.filecheck b/thrust/internal/test/thrust.example.weld_vertices.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..a206e1f62a56c021ed991ca61b21ceb9ed807186
--- /dev/null
+++ b/thrust/internal/test/thrust.example.weld_vertices.filecheck
@@ -0,0 +1,15 @@
+     CHECK: Output Representation
+CHECK-NEXT:  vertices[0] = (0,0)
+CHECK-NEXT:  vertices[1] = (0,1)
+CHECK-NEXT:  vertices[2] = (1,0)
+CHECK-NEXT:  vertices[3] = (1,1)
+CHECK-NEXT:  vertices[4] = (2,0)
+CHECK-NEXT:  indices[0] = 0
+CHECK-NEXT:  indices[1] = 2
+CHECK-NEXT:  indices[2] = 1
+CHECK-NEXT:  indices[3] = 2
+CHECK-NEXT:  indices[4] = 3
+CHECK-NEXT:  indices[5] = 1
+CHECK-NEXT:  indices[6] = 2
+CHECK-NEXT:  indices[7] = 4
+CHECK-NEXT:  indices[8] = 3
diff --git a/thrust/internal/test/thrust.example.word_count.filecheck b/thrust/internal/test/thrust.example.word_count.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..e21beabd7fa22f30cc07542d7efee8b9b4c7c072
--- /dev/null
+++ b/thrust/internal/test/thrust.example.word_count.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Text sample:
+CHECK-NEXT:   But the raven, sitting lonely on the placid bust, spoke only,
+CHECK-NEXT:   That one word, as if his soul in that one word he did outpour.
+CHECK-NEXT:   Nothing further then he uttered - not a feather then he fluttered -
+CHECK-NEXT:   Till I scarcely more than muttered `Other friends have flown before -
+CHECK-NEXT:   On the morrow he will leave me, as my hopes have flown before.'
+CHECK-NEXT:   Then the bird said, `Nevermore.'
+     CHECK: Text sample contains 65 words
diff --git a/thrust/internal/test/thrust.sanity.filecheck b/thrust/internal/test/thrust.sanity.filecheck
new file mode 100644
index 0000000000000000000000000000000000000000..1770bc9f30c66202d90ef6afa2feae9ddade64fd
--- /dev/null
+++ b/thrust/internal/test/thrust.sanity.filecheck
@@ -0,0 +1 @@
+     CHECK: SANITY
diff --git a/thrust/internal/test/thrust_nightly.pl b/thrust/internal/test/thrust_nightly.pl
new file mode 100755
index 0000000000000000000000000000000000000000..61e03bda4b7ca6a34fbf63bfc4383d6dbfe60445
--- /dev/null
+++ b/thrust/internal/test/thrust_nightly.pl
@@ -0,0 +1,600 @@
+#! /usr/bin/perl
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+use strict;
+use warnings;
+
+print(`perl --version`);
+
+use Getopt::Long;
+use Cwd;
+use Cwd "abs_path";
+use Config; # For signal names and numbers.
+use IPC::Open2;
+use File::Temp;
+use POSIX "strftime";
+
+my $have_time_hi_res = 0;
+
+if (eval { require Time::HiRes })
+{
+  printf("#### CONFIG timestamp `gettimeofday`\n");
+
+  import Time::HiRes "gettimeofday";
+
+  $have_time_hi_res = 1;
+} else {
+  printf("#### CONFIG timestamp `time`\n");
+}
+
+sub timestamp()
+{
+  if ($have_time_hi_res) {
+    return gettimeofday();
+  } else {
+    return time();
+  }
+}
+
+my %CmdLineOption;
+my $arch                = "";
+my $abi                 = "";
+my $os                  = "";
+my $build               = "release";
+my $bin_path;
+my $filecheck_path;
+my $filecheck_data_path = "internal/test";
+my $timeout_min         = 15;
+
+# https://stackoverflow.com/questions/29862178/name-of-signal-number-2
+my @sig_names;
+@sig_names[ split ' ', $Config{sig_num} ] = split ' ', $Config{sig_name};
+my %sig_nums;
+@sig_nums{ split ' ', $Config{sig_name} } = split ' ', $Config{sig_num};
+
+if (`uname` =~ m/CYGWIN/) {
+  $os = "win32";
+} elsif ($^O eq "MSWin32") {
+  $os = "win32";
+} else {
+  $os = `uname`;
+  chomp($os);
+}
+
+if ($os eq "win32") {
+  $ENV{'PROCESSOR_ARCHITECTURE'} ||= "";
+  $ENV{'PROCESSOR_ARCHITEW6432'} ||= "";
+
+  if ((lc($ENV{PROCESSOR_ARCHITECTURE}) ne "x86") ||
+      (lc($ENV{PROCESSOR_ARCHITECTURE}) eq "amd64") ||
+      (lc($ENV{PROCESSOR_ARCHITEW6432}) eq "amd64")) {
+    $arch = "x86_64";
+  } else {
+    $arch = "i686";
+  }
+} else {
+  $arch = `uname -m`;
+  chomp($arch);
+}
+
+sub usage()
+{
+  printf("Usage: thrust_nightly.pl <options>\n");
+  printf("Options:\n");
+  printf("  -help                         : Print help message\n");
+  printf("  -forcearch <arch>             : i686|x86_64|ARMv7|aarch64 (default: $arch)\n");
+  printf("  -forceabi <abi>               : Specify abi to be used for arm (gnueabi|gnueabihf)\n");
+  printf("  -forceos <os>                 : win32|Linux|Darwin (default: $os)\n");
+  printf("  -build <release|debug>        : (default: debug)\n");
+  printf("  -bin-path <path>              : Specify location of test binaries\n");
+  printf("  -filecheck-path <path>        : Specify location of filecheck binary\n");
+  printf("  -filecheck-data-path <path>   : Specify location of filecheck data (default: $filecheck_data_path)\n");
+  printf("  -timeout-min <min>            : timeout in minutes for each individual test\n");
+}
+
+GetOptions(\%CmdLineOption,
+           'help' => sub { usage() and exit 0 },
+           "forcearch=s" => \$arch,
+           "forceabi=s" => \$abi,
+           "forceos=s" => \$os,
+           "build=s" => \$build,
+           "bin-path=s" => \$bin_path,
+           "filecheck-path=s" => \$filecheck_path,
+           "filecheck-data-path=s" => \$filecheck_data_path,
+           "timeout-min=i" => \$timeout_min,
+          );
+
+my $pwd = getcwd();
+my $bin_path_root = abs_path ("${pwd}/..");
+
+if ($arch eq "ARMv7") {
+      if ($abi eq "") {
+          $abi = "_gnueabi";  #Use default abi for arm if not specified
+      }
+      else {
+          $abi = "_${abi}";
+      }
+}
+else {
+    $abi = "";                #Ignore abi for architectures other than arm
+}
+
+my $uname = "";
+$uname = $arch;
+chomp($uname);
+
+if (not $bin_path) {
+    $bin_path = "${bin_path_root}/bin/${uname}_${os}${abi}_${build}";
+}
+
+if (not $filecheck_path) {
+    $filecheck_path = "${bin_path}/nvvm/tools";
+}
+
+sub process_return_code {
+    my ($name, $ret, $msg) = @_;
+
+    if ($ret != 0) {
+        my $signal  = $ret & 127;
+        my $app_exit = $ret >> 8;
+        my $dumped_core = $ret & 0x80;
+        if (($app_exit != 0) && ($app_exit != 0)) {
+            if ($msg ne "") {
+                printf("#### ERROR $name exited with return value $app_exit. $msg\n");
+            } else {
+                printf("#### ERROR $name exited with return value $app_exit.\n");
+            }
+        }
+        if ($signal != 0) {
+            if ($msg ne "") {
+                printf("#### ERROR $name received signal SIG$sig_names[$signal] ($signal). $msg\n");
+            } else {
+                printf("#### ERROR $name received signal SIG$sig_names[$signal] ($signal).\n");
+            }
+            if ($sig_nums{'INT'} eq $signal) {
+                die("Terminating testing due to SIGINT.");
+            }
+        }
+        if ($dumped_core != 0) {
+            if ($msg ne "") {
+                printf("#### ERROR $name generated a core dump. $msg\n");
+            } else {
+                printf("#### ERROR $name generated a core dump.\n");
+            }
+        }
+    }
+}
+
+my $have_filecheck = 1;
+
+sub filecheck_sanity {
+    my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.sanity.filecheck";
+
+    my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+    print $filecheck_stdin "SANITY";
+
+    my $filecheck_ret = 0;
+    if (close($filecheck_stdin) == 0)
+    {
+      $filecheck_ret = $?;
+    }
+
+    if ($filecheck_ret == 0) {
+      printf("#### SANE FileCheck\n");
+    } else {
+      # Use a temporary file to send the output to
+      # FileCheck so we can get the output this time,
+      # because Perl and bidirectional pipes suck.
+      my $tmp = File::Temp->new();
+      my $tmp_filename = $tmp->filename;
+      print $tmp "SANITY";
+
+      printf("********************************************************************************\n");
+      print `$filecheck_cmd -input-file $tmp_filename`;
+      printf("********************************************************************************\n");
+
+      process_return_code("FileCheck Sanity", $filecheck_ret, "");
+      printf("#### INSANE FileCheck\n");
+
+      $have_filecheck = 0;
+    }
+}
+
+# Wrapper for system that logs the commands so you can see what it did
+sub run_cmd {
+    my ($cmd) = @_;
+    my $ret = 0;
+    my @executable;
+    my @output;
+    my $syst_cmd;
+
+    my $start = timestamp();
+    eval {
+        local $SIG{ALRM} = sub { die("Command timed out (received SIGALRM).\n") };
+        alarm (60 * $timeout_min);
+        $syst_cmd = $cmd;
+
+        @executable = split(' ', $syst_cmd, 2);
+
+        open(my $child, "-|", "$syst_cmd") or die("Could not execute $syst_cmd.\n");
+
+        if ($child)
+        {
+          @output = <$child>;
+        }
+
+        if (close($child) == 0)
+        {
+          $ret = $?;
+        }
+ 
+        alarm 0;
+    };
+    my $elapsed = timestamp() - $start;
+
+    if ($@) {
+        printf("\n#### ERROR Command timeout reached, killing $executable[0].\n");
+        system("killall ".$executable[0]);
+        return ($sig_nums{'KILL'}, $elapsed, @output);
+    }
+
+    return ($ret, $elapsed, @output);
+}
+
+sub current_time
+{
+   return strftime("%x %X %Z", localtime());
+}
+
+my $failures = 0;
+my $known_failures = 0;
+my $errors = 0;
+my $passes = 0;
+
+sub run_examples {
+    # Get list of tests in binary folder.
+    my $dir = cwd();
+    chdir $bin_path;
+    my @examplelist;
+    if ($os eq "win32")
+    {
+        @examplelist = glob('thrust.example.*.exe');
+    } else {
+        @examplelist = glob('thrust.example.*');
+    }
+
+    chdir $dir;
+
+    my $test;
+    foreach $test (@examplelist)
+    {
+        my $test_exe = $test;
+
+        # Ignore FileCheck files. 
+        if ($test =~ /[.]filecheck$/)
+        {
+          next;
+        }
+
+        if ($os eq "win32")
+        {
+          $test =~ s/\.exe//g;
+        }
+
+        # Check the test actually exists.
+        if (!-e "${bin_path}/${test_exe}")
+        {
+          next;
+        }
+
+        my $cmd = "${bin_path}/${test_exe} --verbose 2>&1";
+
+        printf("&&&& RUNNING $test\n");
+        printf("#### CURRENT_TIME " . current_time() . "\n");
+
+        my ($ret, $elapsed, @output) = run_cmd($cmd);
+
+        printf("********************************************************************************\n");
+        print @output;
+        printf("********************************************************************************\n");
+
+        if ($ret != 0) {
+            process_return_code($test, $ret, "Example crash?");
+            printf("&&&& FAILED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            $errors = $errors + 1;
+        } else {
+            printf("&&&& PASSED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            $passes = $passes + 1;
+
+            if ($have_filecheck) {
+                # Check output with LLVM FileCheck.
+
+                printf("&&&& RUNNING FileCheck $test\n");
+
+                if (-f "${filecheck_data_path}/${test}.filecheck") {
+                    # If the filecheck file is empty, don't use filecheck, just
+                    # check if the output file is also empty.
+                    if (-z "${filecheck_data_path}/${test}.filecheck") {
+                        if (join("", @output) eq "") {
+                            printf("&&&& PASSED FileCheck $test\n");
+                            $passes = $passes + 1;
+                        } else {
+                            printf("#### ERROR Output received but not expected.\n");
+                            printf("&&&& FAILED FileCheck $test\n");
+                            $failures = $failures + 1;
+                        }
+                    } else {
+                        my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/$test.filecheck";
+
+                        my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+                        print $filecheck_stdin @output;
+
+                        my $filecheck_ret = 0;
+                        if (close($filecheck_stdin) == 0)
+                        {
+                          $filecheck_ret = $?;
+                        }
+
+                        if ($filecheck_ret == 0) {
+                          printf("&&&& PASSED FileCheck $test\n");
+                          $passes = $passes + 1;
+                        } else {
+                          # Use a temporary file to send the output to
+                          # FileCheck so we can get the output this time,
+                          # because Perl and bidirectional pipes suck.
+                          my $tmp = File::Temp->new();
+                          my $tmp_filename = $tmp->filename;
+                          print $tmp @output;
+
+                          printf("********************************************************************************\n");
+                          print `$filecheck_cmd -input-file $tmp_filename`;
+                          printf("********************************************************************************\n");
+
+                          process_return_code("FileCheck $test", $filecheck_ret, "");
+                          printf("&&&& FAILED FileCheck $test\n");
+                          $failures = $failures + 1;
+                        }
+                    }
+                } else {
+                    printf("#### ERROR $test has no FileCheck comparison.\n");
+                    printf("&&&& FAILED FileCheck $test\n");
+                    $errors = $errors + 1;
+                }
+            }
+        }
+        printf("\n");
+    }
+}
+
+sub run_unit_tests {
+    # Get list of tests in binary folder.
+    my $dir = cwd();
+    chdir $bin_path;
+    my @unittestlist;
+    if ($os eq "win32")
+    {
+        @unittestlist = glob('thrust.test.*.exe');
+    } else {
+        @unittestlist = glob('thrust.test.*');
+    }
+    chdir $dir;
+
+    my $test;
+    foreach $test (@unittestlist)
+    {
+        my $test_exe = $test;
+
+        # Ignore FileCheck files. 
+        if ($test =~ /[.]filecheck$/)
+        {
+          next;
+        }
+
+        if ($os eq "win32")
+        {
+          $test =~ s/\.exe//g;
+        }
+
+        # Check the test actually exists.
+        if (!-e "${bin_path}/${test_exe}")
+        {
+          next;
+        }
+
+        # Check the test actually exists
+        next unless (-e "${bin_path}/${test_exe}");
+
+        my $cmd = "${bin_path}/${test_exe} --verbose 2>&1";
+
+        printf("&&&& RUNNING $test\n");
+        printf("#### CURRENT_TIME " . current_time() . "\n");
+
+        my ($ret, $elapsed, @output) = run_cmd($cmd);
+
+        printf("********************************************************************************\n");
+        print @output;
+        printf("********************************************************************************\n");
+        my $fail = 0;
+        my $known_fail = 0;
+        my $error = 0;
+        my $pass = 0;
+        my $found_totals = 0;
+        foreach my $line (@output)
+        {
+            if (($fail, $known_fail, $error, $pass) = $line =~ /Totals: ([0-9]+) failures, ([0-9]+) known failures, ([0-9]+) errors, and ([0-9]+) passes[.]/igs) {
+              $found_totals = 1;
+              $failures = $failures + $fail;
+              $known_failures = $known_failures + $known_fail;
+              $errors = $errors + $error;
+              $passes = $passes + $pass;
+              last;
+            } else {
+              $fail = 0;
+              $known_fail = 0;
+              $error = 0;
+              $pass = 0;
+            }
+        }
+        if ($ret == 0) {
+            if ($found_totals == 0) {
+                $errors = $errors + 1;
+                printf("#### ERROR $test returned 0 and no summary line was found. Invalid test?\n");
+                printf("&&&& FAILED $test\n");
+                printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            }
+            else {
+                if ($fail != 0 or $error != 0) {
+                    $errors = $errors + 1;
+                    printf("#### ERROR $test returned 0 and had failures or errors. Test driver error?\n");
+                    printf("&&&& FAILED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+                } elsif ($known_fail == 0 and $pass == 0) {
+                    printf("#### DISABLED $test returned 0 and had no failures, known failures, errors or passes.\n");
+                    printf("&&&& PASSED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+                } else {
+                    printf("&&&& PASSED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+
+                    if ($have_filecheck) {
+                        # Check output with LLVM FileCheck if the test has a FileCheck input.
+
+                        if (-f "${filecheck_data_path}/${test}.filecheck") {
+                            printf("&&&& RUNNING FileCheck $test\n");
+
+                            # If the filecheck file is empty, don't use filecheck,
+                            # just check if the output file is also empty.
+                            if (! -z "${filecheck_data_path}/${test}.filecheck") {
+                                if (@output) {
+                                    printf("&&&& PASSED FileCheck $test\n");
+                                    $passes = $passes + 1;
+                                } else {
+                                    printf("#### Output received but not expected.\n");
+                                    printf("&&&& FAILED FileCheck $test\n");
+                                    $failures = $failures + 1;
+                                }
+                            } else {
+                                my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/$test.filecheck";
+
+                                my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+                                print $filecheck_stdin @output;
+
+                                my $filecheck_ret = 0;
+                                if (close($filecheck_stdin) == 0)
+                                {
+                                  $filecheck_ret = $?;
+                                }
+
+                                if ($filecheck_ret == 0) {
+                                  printf("&&&& PASSED FileCheck $test\n");
+                                  $passes = $passes + 1;
+                                } else {
+                                  # Use a temporary file to send the output to
+                                  # FileCheck so we can get the output this time,
+                                  # because Perl and bidirectional pipes suck.
+                                  my $tmp = File::Temp->new();
+                                  my $tmp_filename = $tmp->filename;
+                                  print $tmp @output;
+
+                                  printf("********************************************************************************\n");
+                                  print `$filecheck_cmd -input-file $tmp_filename`;
+                                  printf("********************************************************************************\n");
+
+                                  process_return_code("FileCheck $test", $filecheck_ret, "");
+                                  printf("&&&& FAILED FileCheck $test\n");
+                                  $failures = $failures + 1;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            $errors = $errors + 1;
+            process_return_code($test, $ret, "Test crash?");
+            printf("&&&& FAILED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+        }
+        printf("\n");
+    }
+}
+
+sub dvs_summary {
+    my $dvs_score = 0;
+    my $denominator = $failures + $known_failures + $errors + $passes;
+    if ($denominator == 0) {
+       $dvs_score = 0;
+    }
+    else {
+       $dvs_score = 100 * (($passes + $known_failures) / $denominator);
+    }
+
+    printf("\n");
+
+    printf("%*%*%*%* FA!LUR3S       $failures\n");
+    printf("%*%*%*%* KN0WN FA!LUR3S $known_failures\n");
+    printf("%*%*%*%* 3RR0RS         $errors\n");
+    printf("%*%*%*%* PASS3S         $passes\n");
+
+    printf("\n");
+
+    printf("CUDA DVS BASIC SANITY SCORE : %.1f\n", $dvs_score);
+
+    if ($failures + $errors > 0) {
+        exit(1);
+    }
+}
+
+###############################################################################
+
+printf("#### CONFIG arch `%s`\n", $arch);
+printf("#### CONFIG abi `%s`\n", $abi);
+printf("#### CONFIG os `%s`\n", $os);
+printf("#### CONFIG build `%s`\n", $build);
+printf("#### CONFIG bin_path `%s`\n", $bin_path);
+printf("#### CONFIG have_filecheck `$have_filecheck`\n");
+printf("#### CONFIG filecheck_path `%s`\n", $filecheck_path);
+printf("#### CONFIG filecheck_data_path `%s`\n", $filecheck_data_path);
+printf("#### CONFIG have_time_hi_res `$have_time_hi_res`\n");
+printf("#### CONFIG timeout_min `%s`\n", $timeout_min);
+printf("#### ENV PATH `%s`\n", defined $ENV{'PATH'} ? $ENV{'PATH'} : '');
+printf("#### ENV LD_LIBRARY_PATH `%s`\n", defined $ENV{'LD_LIBRARY_PATH'} ? $ENV{'LD_LIBRARY_PATH'} : '');
+
+printf("\n");
+
+filecheck_sanity();
+
+printf("\n");
+
+my $START_TIME = current_time();
+
+run_examples();
+run_unit_tests();
+
+my $STOP_TIME = current_time();
+
+printf("#### START_TIME $START_TIME\n");
+printf("#### STOP_TIME $STOP_TIME\n");
+
+dvs_summary();
+
diff --git a/thrust/internal/test/unittest.lst b/thrust/internal/test/unittest.lst
new file mode 100644
index 0000000000000000000000000000000000000000..8ea415184b58d30751e823e1b8322e588cea5978
--- /dev/null
+++ b/thrust/internal/test/unittest.lst
@@ -0,0 +1,1267 @@
+TestAdjacentDifference
+TestAdjacentDifferenceCudaStreams
+TestAdjacentDifferenceDeviceSeq
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceDispatchExplicit
+TestAdjacentDifferenceDispatchImplicit
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAdvanceDevice
+TestAdvanceHost
+TestAllOfCudaStreams
+TestAllOfDevice
+TestAllOfDeviceSeq
+TestAllOfDispatchExplicit
+TestAllOfDispatchImplicit
+TestAllOfHost
+TestAllocatorCustomCopyConstruct
+TestAllocatorCustomDefaultConstruct
+TestAllocatorCustomDestroy
+TestAllocatorMinimal
+TestAnyOfCudaStreams
+TestAnyOfDevice
+TestAnyOfDeviceSeq
+TestAnyOfDispatchExplicit
+TestAnyOfDispatchImplicit
+TestAnyOfHost
+TestAssertEqual
+TestAssertGEqual
+TestAssertLEqual
+TestBitAndFunctionalDevice
+TestBitAndFunctionalHost
+TestBitOrFunctionalDevice
+TestBitOrFunctionalHost
+TestBitXorFunctionalDevice
+TestBitXorFunctionalHost
+TestComplexArithmeticTransform
+TestComplexBasicArithmetic
+TestComplexBinaryArithmetic
+TestComplexConstructors
+TestComplexExponentialFunctions
+TestComplexExponentialTransform
+TestComplexGetters
+TestComplexMemberOperators
+TestComplexPlaneTransform
+TestComplexPowerFunctions
+TestComplexPowerTransform
+TestComplexStreamOperators
+TestComplexTrigonometricFunctions
+TestComplexTrigonometricTransform
+TestComplexUnaryArithmetic
+TestComputeCapability
+TestConstantIteratorComparison
+TestConstantIteratorConstructFromConvertibleSystem
+TestConstantIteratorCopyDevice
+TestConstantIteratorCopyHost
+TestConstantIteratorIncrement
+TestConstantIteratorReduce
+TestConstantIteratorTransformDevice
+TestConstantIteratorTransformHost
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDispatchExplicit
+TestCopyDispatchImplicit
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfDispatchExplicit
+TestCopyIfDispatchImplicit
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilDispatchExplicit
+TestCopyIfStencilDispatchImplicit
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyNConstantIteratorToZipIteratorDevice
+TestCopyNConstantIteratorToZipIteratorHost
+TestCopyNCountingIteratorDevice
+TestCopyNCountingIteratorHost
+TestCopyNDispatchExplicit
+TestCopyNDispatchImplicit
+TestCopyNFromConstIterator
+TestCopyNListToDevice
+TestCopyNListToHost
+TestCopyNMatchingTypesDevice
+TestCopyNMatchingTypesHost
+TestCopyNMixedTypesDevice
+TestCopyNMixedTypesHost
+TestCopyNToDiscardIterator
+TestCopyNVectorBool
+TestCopyNZipIteratorDevice
+TestCopyNZipIteratorHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountCudaStreams
+TestCountDeviceSeq
+TestCountDispatchExplicit
+TestCountDispatchImplicit
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfDeviceSeq
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestCountingIteratorComparison
+TestCountingIteratorCopyConstructor
+TestCountingIteratorDifference
+TestCountingIteratorDistance
+TestCountingIteratorFloatComparison
+TestCountingIteratorIncrement
+TestCountingIteratorLowerBound
+TestCountingIteratorUnsignedType
+TestCudaMallocResultAligned
+TestCudaReduceIntervals
+TestCudaReduceIntervalsSimple
+TestDeviceDeleteDestructorInvocation
+TestDeviceDereferenceCountingIterator
+TestDeviceDereferenceDevicePtr
+TestDeviceDereferenceDeviceVectorIterator
+TestDeviceDereferenceTransformIterator
+TestDeviceDereferenceTransformedCountingIterator
+TestDevicePointerManipulation
+TestDeviceReferenceAssignmentFromDeviceReference
+TestDeviceReferenceConstructorFromDevicePointer
+TestDeviceReferenceConstructorFromDeviceReference
+TestDeviceReferenceManipulation
+TestDiscardIteratorComparison
+TestDiscardIteratorIncrement
+TestDistanceDevice
+TestDistanceHost
+TestDividesFunctionalDevice
+TestDividesFunctionalHost
+TestEqual
+TestEqualCudaStreams
+TestEqualDeviceSeq
+TestEqualDispatchExplicit
+TestEqualDispatchImplicit
+TestEqualSimpleDevice
+TestEqualSimpleHost
+TestEqualToFunctionalDevice
+TestEqualToFunctionalHost
+TestExclusiveScan32
+TestExclusiveScanByKeyCudaStreams
+TestExclusiveScanByKeyDispatchExplicit
+TestExclusiveScanByKeyDispatchImplicit
+TestExclusiveScanByKeySimpleDevice
+TestExclusiveScanByKeySimpleHost
+TestExclusiveScanDispatchExplicit
+TestExclusiveScanDispatchImplicit
+TestFill
+TestFillCudaStreams
+TestFillDeviceSeq
+TestFillDiscardIterator
+TestFillDispatchExplicit
+TestFillDispatchImplicit
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDeviceSeq
+TestFillNDiscardIterator
+TestFillNDispatchExplicit
+TestFillNDispatchImplicit
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestFind
+TestFindCudaStreams
+TestFindDeviceSeq
+TestFindDispatchExplicit
+TestFindDispatchImplicit
+TestFindIf
+TestFindIfDeviceSeq
+TestFindIfDispatchExplicit
+TestFindIfDispatchImplicit
+TestFindIfNot
+TestFindIfNotDeviceSeq
+TestFindIfNotDispatchExplicit
+TestFindIfNotDispatchImplicit
+TestFindIfNotSimpleDevice
+TestFindIfNotSimpleHost
+TestFindIfSimpleDevice
+TestFindIfSimpleHost
+TestFindSimpleDevice
+TestFindSimpleHost
+TestForEach
+TestForEachCudaStreams
+TestForEachDeviceSeq
+TestForEachDispatchExplicit
+TestForEachDispatchImplicit
+TestForEachLargeRegisterFootprint
+TestForEachN
+TestForEachNDeviceSeq
+TestForEachNDispatchExplicit
+TestForEachNDispatchImplicit
+TestForEachNLargeRegisterFootprint
+TestForEachNSimpleAnySystem
+TestForEachNSimpleDevice
+TestForEachNSimpleHost
+TestForEachNWithLargeTypes
+TestForEachSimpleAnySystem
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestFreeDispatchExplicit
+TestFunctionalPlaceholdersBinaryEqualToDevice
+TestFunctionalPlaceholdersBinaryEqualToHost
+TestFunctionalPlaceholdersBinaryGreaterDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualHost
+TestFunctionalPlaceholdersBinaryGreaterHost
+TestFunctionalPlaceholdersBinaryLessDevice
+TestFunctionalPlaceholdersBinaryLessEqualDevice
+TestFunctionalPlaceholdersBinaryLessEqualHost
+TestFunctionalPlaceholdersBinaryLessHost
+TestFunctionalPlaceholdersBinaryNotEqualToDevice
+TestFunctionalPlaceholdersBinaryNotEqualToHost
+TestFunctionalPlaceholdersBitAnd<thrust::device_vector>
+TestFunctionalPlaceholdersBitAnd<thrust::host_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitNegateDevice
+TestFunctionalPlaceholdersBitNegateHost
+TestFunctionalPlaceholdersBitOr<thrust::device_vector>
+TestFunctionalPlaceholdersBitOr<thrust::host_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitXor<thrust::device_vector>
+TestFunctionalPlaceholdersBitXor<thrust::host_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::host_vector>
+TestFunctionalPlaceholdersDivides<thrust::device_vector>
+TestFunctionalPlaceholdersDivides<thrust::host_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersLogicalAndDevice
+TestFunctionalPlaceholdersLogicalAndHost
+TestFunctionalPlaceholdersLogicalNotDevice
+TestFunctionalPlaceholdersLogicalNotHost
+TestFunctionalPlaceholdersLogicalOrDevice
+TestFunctionalPlaceholdersLogicalOrHost
+TestFunctionalPlaceholdersMinus<thrust::device_vector>
+TestFunctionalPlaceholdersMinus<thrust::host_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersModulus<thrust::device_vector>
+TestFunctionalPlaceholdersModulus<thrust::host_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::device_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::host_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersNegateDevice
+TestFunctionalPlaceholdersNegateHost
+TestFunctionalPlaceholdersPlus<thrust::device_vector>
+TestFunctionalPlaceholdersPlus<thrust::host_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersPrefixDecrementDevice
+TestFunctionalPlaceholdersPrefixDecrementHost
+TestFunctionalPlaceholdersPrefixIncrementDevice
+TestFunctionalPlaceholdersPrefixIncrementHost
+TestFunctionalPlaceholdersSuffixDecrementDevice
+TestFunctionalPlaceholdersSuffixDecrementHost
+TestFunctionalPlaceholdersSuffixIncrementDevice
+TestFunctionalPlaceholdersSuffixIncrementHost
+TestFunctionalPlaceholdersTransformIterator<thrust::device_vector>
+TestFunctionalPlaceholdersTransformIterator<thrust::host_vector>
+TestFunctionalPlaceholdersUnaryPlusDevice
+TestFunctionalPlaceholdersUnaryPlusHost
+TestFunctionalPlaceholdersValue<thrust::device_vector>
+TestFunctionalPlaceholdersValue<thrust::host_vector>
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherCudaStreams
+TestGatherDeviceSeq
+TestGatherDispatchExplicit
+TestGatherDispatchImplicit
+TestGatherIf
+TestGatherIfCudaStreams
+TestGatherIfDeviceSeq
+TestGatherIfDispatchExplicit
+TestGatherIfDispatchImplicit
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateCudaStreams
+TestGenerateDeviceSeq
+TestGenerateDispatchExplicit
+TestGenerateDispatchImplicit
+TestGenerateNCudaStreams
+TestGenerateNDeviceSeq
+TestGenerateNDispatchExplicit
+TestGenerateNDispatchImplicit
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestGetTemporaryBuffer
+TestGetTemporaryBufferDeviceSeq
+TestGetTemporaryBufferDispatchExplicit
+TestGetTemporaryBufferDispatchImplicit
+TestGreaterEqualFunctionalDevice
+TestGreaterEqualFunctionalHost
+TestGreaterFunctionalDevice
+TestGreaterFunctionalHost
+TestIdentityFunctionalDevice
+TestIdentityFunctionalHost
+TestInclusiveScan32
+TestInclusiveScanByKeyCudaStreams
+TestInclusiveScanByKeyDispatchExplicit
+TestInclusiveScanByKeyDispatchImplicit
+TestInclusiveScanByKeySimpleDevice
+TestInclusiveScanByKeySimpleHost
+TestInclusiveScanByKeyTransformIteratorDevice
+TestInclusiveScanByKeyTransformIteratorHost
+TestInclusiveScanDispatchExplicit
+TestInclusiveScanDispatchImplicit
+TestInclusiveScanWithIndirectionDevice
+TestInclusiveScanWithIndirectionHost
+TestInnerProduct
+TestInnerProductCudaStreams
+TestInnerProductDeviceSeq
+TestInnerProductDispatchExplicit
+TestInnerProductDispatchImplicit
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPartitionedCudaStreams
+TestIsPartitionedDevice
+TestIsPartitionedDeviceSeq
+TestIsPartitionedDispatchExplicit
+TestIsPartitionedDispatchImplicit
+TestIsPartitionedHost
+TestIsPartitionedSimpleDevice
+TestIsPartitionedSimpleHost
+TestIsPlainOldData
+TestIsSortedCudaStreams
+TestIsSortedDevice
+TestIsSortedDeviceSeq
+TestIsSortedDispatchExplicit
+TestIsSortedDispatchImplicit
+TestIsSortedHost
+TestIsSortedRepeatedElementsDevice
+TestIsSortedRepeatedElementsHost
+TestIsSortedSimpleDevice
+TestIsSortedSimpleHost
+TestIsSortedUntilCudaStreams
+TestIsSortedUntilDevice
+TestIsSortedUntilDeviceSeq
+TestIsSortedUntilExplicit
+TestIsSortedUntilHost
+TestIsSortedUntilImplicit
+TestIsSortedUntilRepeatedElementsDevice
+TestIsSortedUntilRepeatedElementsHost
+TestIsSortedUntilSimpleDevice
+TestIsSortedUntilSimpleHost
+TestIsTrivialIterator
+TestLessEqualFunctionalDevice
+TestLessEqualFunctionalHost
+TestLessFunctionalDevice
+TestLessFunctionalHost
+TestLog2
+TestLogicalAndFunctionalDevice
+TestLogicalAndFunctionalHost
+TestLogicalNotFunctionalDevice
+TestLogicalNotFunctionalHost
+TestLogicalOrFunctionalDevice
+TestLogicalOrFunctionalHost
+TestMakeConstantIterator
+TestMakeDevicePointer
+TestMakeDiscardIterator
+TestMakePermutationIteratorDevice
+TestMakePermutationIteratorHost
+TestMakeTransformIteratorDevice
+TestMakeTransformIteratorHost
+TestMakeTuple
+TestMalloc
+TestMallocDeviceSeq
+TestMallocDispatchExplicit
+TestMax
+TestMaxActiveBlocks
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementCudaStreams
+TestMaxElementDeviceSeq
+TestMaxElementDispatchExplicit
+TestMaxElementDispatchImplicit
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMaximumFunctionalDevice
+TestMaximumFunctionalHost
+TestMerge
+TestMergeByKey
+TestMergeByKeyCudaStreams
+TestMergeByKeyDescending
+TestMergeByKeyDeviceSeq
+TestMergeByKeyDispatchExplicit
+TestMergeByKeyDispatchImplicit
+TestMergeByKeySimpleDevice
+TestMergeByKeySimpleHost
+TestMergeByKeyToDiscardIterator
+TestMergeCudaStreams
+TestMergeDescending
+TestMergeDeviceSeq
+TestMergeDispatchExplicit
+TestMergeDispatchImplicit
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeSortAscendingKeyValue
+TestMergeSortDescendingKey
+TestMergeSortDescendingKeyValue
+TestMergeSortKeySimple
+TestMergeSortKeyValue
+TestMergeSortKeyValueSimple
+TestMergeSortStableKeySimple
+TestMergeToDiscardIterator
+TestMin
+TestMinElement
+TestMinElementCudaStreams
+TestMinElementDeviceSeq
+TestMinElementDispatchExplicit
+TestMinElementDispatchImplicit
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementCudaStreams
+TestMinMaxElementDeviceSeq
+TestMinMaxElementDispatchExplicit
+TestMinMaxElementDispatchImplicit
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestMinimumFunctionalDevice
+TestMinimumFunctionalHost
+TestMinstdRand0Equal
+TestMinstdRand0Max
+TestMinstdRand0Min
+TestMinstdRand0SaveRestore
+TestMinstdRand0Unequal
+TestMinstdRand0Validation
+TestMinstdRandEqual
+TestMinstdRandMax
+TestMinstdRandMin
+TestMinstdRandSaveRestore
+TestMinstdRandUnequal
+TestMinstdRandValidation
+TestMinusFunctionalDevice
+TestMinusFunctionalHost
+TestMismatchCudaStreams
+TestMismatchDeviceSeq
+TestMismatchDispatchExplicit
+TestMismatchDispatchImplicit
+TestMismatchSimpleDevice
+TestMismatchSimpleHost
+TestModulusFunctionalDevice
+TestModulusFunctionalHost
+TestMultipliesFunctionalDevice
+TestMultipliesFunctionalHost
+TestNegateFunctionalDevice
+TestNegateFunctionalHost
+TestNoneOfCudaStreams
+TestNoneOfDevice
+TestNoneOfDeviceSeq
+TestNoneOfDispatchExplicit
+TestNoneOfDispatchImplicit
+TestNoneOfHost
+TestNormalDistributionMax
+TestNormalDistributionMin
+TestNormalDistributionSaveRestore
+TestNot1Device
+TestNot1Host
+TestNot2Device
+TestNot2Host
+TestNotEqualToFunctionalDevice
+TestNotEqualToFunctionalHost
+TestPairComparison
+TestPairGet
+TestPairManipulation
+TestPairReduce
+TestPairScan
+TestPairScanByKey
+TestPairStableSort
+TestPairStableSortByKey
+TestPairStableSortByKeyDeviceSeq
+TestPairStableSortDeviceSeq
+TestPairSwap
+TestPairTransform
+TestPairTupleElement
+TestPairTupleSize
+TestPartition
+TestPartitionCopy
+TestPartitionCopyDeviceSeq
+TestPartitionCopyDispatchExplicit
+TestPartitionCopyDispatchImplicit
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyStencil
+TestPartitionCopyStencilDispatchExplicit
+TestPartitionCopyStencilDispatchImplicit
+TestPartitionCopyStencilSimpleDevice
+TestPartitionCopyStencilSimpleHost
+TestPartitionCopyStencilToDiscardIterator
+TestPartitionCopyToDiscardIterator
+TestPartitionCudaStreams
+TestPartitionDeviceSeq
+TestPartitionDispatchExplicit
+TestPartitionDispatchImplicit
+TestPartitionPointCudaStreams
+TestPartitionPointDevice
+TestPartitionPointDeviceSeq
+TestPartitionPointDispatchExplicit
+TestPartitionPointDispatchImplicit
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionStencil
+TestPartitionStencilDeviceSeq
+TestPartitionStencilDispatchExplicit
+TestPartitionStencilDispatchImplicit
+TestPartitionStencilSimpleDevice
+TestPartitionStencilSimpleHost
+TestPartitionStencilZipIteratorDevice
+TestPartitionStencilZipIteratorHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestPermutationIteratorGatherDevice
+TestPermutationIteratorGatherHost
+TestPermutationIteratorHostDeviceGather
+TestPermutationIteratorHostDeviceScatter
+TestPermutationIteratorReduceDevice
+TestPermutationIteratorReduceHost
+TestPermutationIteratorScatterDevice
+TestPermutationIteratorScatterHost
+TestPermutationIteratorSimpleDevice
+TestPermutationIteratorSimpleHost
+TestPermutationIteratorWithCountingIteratorDevice
+TestPermutationIteratorWithCountingIteratorHost
+TestPinnedAllocatorSimple
+TestPlusFunctionalDevice
+TestPlusFunctionalHost
+TestProject1stFunctionalDevice
+TestProject1stFunctionalHost
+TestProject2ndFunctionalDevice
+TestProject2ndFunctionalHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortByKeyLongLongValues
+TestRadixSortByKeyShortValues
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestRanlux24BaseEqual
+TestRanlux24BaseMax
+TestRanlux24BaseMin
+TestRanlux24BaseSaveRestore
+TestRanlux24BaseUnequal
+TestRanlux24BaseValidation
+TestRanlux24Equal
+TestRanlux24Max
+TestRanlux24Min
+TestRanlux24SaveRestore
+TestRanlux24Unequal
+TestRanlux24Validation
+TestRanlux48BaseEqual
+TestRanlux48BaseMax
+TestRanlux48BaseMin
+TestRanlux48BaseSaveRestore
+TestRanlux48BaseUnequal
+TestRanlux48BaseValidation
+TestRanlux48Equal
+TestRanlux48Max
+TestRanlux48Min
+TestRanlux48SaveRestore
+TestRanlux48Unequal
+TestRanlux48Validation
+TestRawPointerCastDevice
+TestRawPointerCastHost
+TestReduce
+TestReduceByKey
+TestReduceByKeyCudaStreams
+TestReduceByKeyDeviceSeq
+TestReduceByKeyDispatchExplicit
+TestReduceByKeyDispatchImplicit
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceCountingIterator
+TestReduceCudaStreams
+TestReduceDeviceSeq
+TestReduceDispatchExplicit
+TestReduceDispatchImplicit
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithLargeTypes
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyCudaStreams
+TestRemoveCopyDeviceSeq
+TestRemoveCopyDispatchExplicit
+TestRemoveCopyDispatchImplicit
+TestRemoveCopyIf
+TestRemoveCopyIfCudaStreams
+TestRemoveCopyIfDeviceSeq
+TestRemoveCopyIfDispatchExplicit
+TestRemoveCopyIfDispatchImplicit
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilCudaStreams
+TestRemoveCopyIfStencilDeviceSeq
+TestRemoveCopyIfStencilDispatchExplicit
+TestRemoveCopyIfStencilDispatchImplicit
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveCudaStreams
+TestRemoveDeviceSeq
+TestRemoveDispatchExplicit
+TestRemoveDispatchImplicit
+TestRemoveIf
+TestRemoveIfCudaStreams
+TestRemoveIfDeviceSeq
+TestRemoveIfDispatchExplicit
+TestRemoveIfDispatchImplicit
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilCudaStreams
+TestRemoveIfStencilDeviceSeq
+TestRemoveIfStencilDispatchExplicit
+TestRemoveIfStencilDispatchImplicit
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyDeviceSeq
+TestReplaceCopyDispatchExplicit
+TestReplaceCopyDispatchImplicit
+TestReplaceCopyIf
+TestReplaceCopyIfDeviceSeq
+TestReplaceCopyIfDispatchExplicit
+TestReplaceCopyIfDispatchImplicit
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilDeviceSeq
+TestReplaceCopyIfStencilDispatchExplicit
+TestReplaceCopyIfStencilDispatchImplicit
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceCudaStreams
+TestReplaceDeviceSeq
+TestReplaceDispatchExplicit
+TestReplaceDispatchImplicit
+TestReplaceIf
+TestReplaceIfDeviceSeq
+TestReplaceIfDispatchExplicit
+TestReplaceIfDispatchImplicit
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilDeviceSeq
+TestReplaceIfStencilDispatchExplicit
+TestReplaceIfStencilDispatchImplicit
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopyDeviceSeq
+TestReverseCopyDispatchExplicit
+TestReverseCopyDispatchImplicit
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseCudaStreams
+TestReverseDeviceSeq
+TestReverseDispatchExplicit
+TestReverseDispatchImplicit
+TestReverseIteratorCopyConstructor
+TestReverseIteratorCopyDevice
+TestReverseIteratorCopyHost
+TestReverseIteratorExclusiveScan
+TestReverseIteratorExclusiveScanSimple
+TestReverseIteratorIncrement
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestScalarBinarySearchDescendingSimpleDevice
+TestScalarBinarySearchDescendingSimpleHost
+TestScalarBinarySearchDispatchExplicit
+TestScalarBinarySearchDispatchImplicit
+TestScalarBinarySearchSimpleDevice
+TestScalarBinarySearchSimpleHost
+TestScalarEqualRangeDescendingSimpleDevice
+TestScalarEqualRangeDescendingSimpleHost
+TestScalarEqualRangeDispatchExplicit
+TestScalarEqualRangeDispatchImplicit
+TestScalarEqualRangeSimpleDevice
+TestScalarEqualRangeSimpleHost
+TestScalarLowerBoundDescendingSimpleDevice
+TestScalarLowerBoundDescendingSimpleHost
+TestScalarLowerBoundDispatchExplicit
+TestScalarLowerBoundDispatchImplicit
+TestScalarLowerBoundSimpleDevice
+TestScalarLowerBoundSimpleHost
+TestScalarUpperBoundDescendingSimpleDevice
+TestScalarUpperBoundDescendingSimpleHost
+TestScalarUpperBoundDispatchExplicit
+TestScalarUpperBoundDispatchImplicit
+TestScalarUpperBoundSimpleDevice
+TestScalarUpperBoundSimpleHost
+TestScan
+TestScanByKeyDeviceSeq
+TestScanByKeyHeadFlagsDevice
+TestScanByKeyHeadFlagsHost
+TestScanByKeyLargeInput
+TestScanByKeyMixedTypes
+TestScanByKeyReusedKeysDevice
+TestScanByKeyReusedKeysHost
+TestScanByKeyWithLargeTypes
+TestScanCudaStreams
+TestScanDeviceDevice
+TestScanDeviceSeq
+TestScanMixedTypes
+TestScanMixedTypesDevice
+TestScanMixedTypesHost
+TestScanSimpleDevice
+TestScanSimpleHost
+TestScanToDiscardIterator
+TestScanWithLargeTypes
+TestScanWithOperator
+TestScanWithOperatorToDiscardIterator
+TestScatter
+TestScatterCountingIteratorDevice
+TestScatterCountingIteratorHost
+TestScatterCudaStreams
+TestScatterDeviceSeq
+TestScatterDispatchExplicit
+TestScatterDispatchImplicit
+TestScatterIf
+TestScatterIfCountingIteratorDevice
+TestScatterIfCountingIteratorHost
+TestScatterIfCudaStreams
+TestScatterIfDeviceSeq
+TestScatterIfDispatchExplicit
+TestScatterIfDispatchImplicit
+TestScatterIfSimpleDevice
+TestScatterIfSimpleHost
+TestScatterIfToDiscardIterator
+TestScatterSimpleDevice
+TestScatterSimpleHost
+TestScatterToDiscardIterator
+TestSelectSystemCudaToCpp
+TestSelectSystemDifferentTypes
+TestSelectSystemSameTypes
+TestSequence
+TestSequenceCudaStreams
+TestSequenceDeviceSeq
+TestSequenceDispatchExplicit
+TestSequenceDispatchImplicit
+TestSequenceSimpleDevice
+TestSequenceSimpleHost
+TestSequenceToDiscardIterator
+TestSetDifference
+TestSetDifferenceByKey
+TestSetDifferenceByKeyCudaStreams
+TestSetDifferenceByKeyDescending
+TestSetDifferenceByKeyDescendingSimpleDevice
+TestSetDifferenceByKeyDescendingSimpleHost
+TestSetDifferenceByKeyDeviceSeq
+TestSetDifferenceByKeyDispatchExplicit
+TestSetDifferenceByKeyDispatchImplicit
+TestSetDifferenceByKeyEquivalentRanges
+TestSetDifferenceByKeyMultiset
+TestSetDifferenceByKeySimpleDevice
+TestSetDifferenceByKeySimpleHost
+TestSetDifferenceCudaStreams
+TestSetDifferenceDescending
+TestSetDifferenceDescendingSimpleDevice
+TestSetDifferenceDescendingSimpleHost
+TestSetDifferenceDeviceSeq
+TestSetDifferenceDispatchExplicit
+TestSetDifferenceDispatchImplicit
+TestSetDifferenceEquivalentRanges
+TestSetDifferenceKeyValue
+TestSetDifferenceMultiset
+TestSetDifferenceSimpleDevice
+TestSetDifferenceSimpleHost
+TestSetIntersection
+TestSetIntersectionByKey
+TestSetIntersectionByKeyCudaStreams
+TestSetIntersectionByKeyDescending
+TestSetIntersectionByKeyDescendingSimpleDevice
+TestSetIntersectionByKeyDescendingSimpleHost
+TestSetIntersectionByKeyDeviceSeq
+TestSetIntersectionByKeyDispatchExplicit
+TestSetIntersectionByKeyDispatchImplicit
+TestSetIntersectionByKeyEquivalentRanges
+TestSetIntersectionByKeyMultiset
+TestSetIntersectionByKeySimpleDevice
+TestSetIntersectionByKeySimpleHost
+TestSetIntersectionCudaStreams
+TestSetIntersectionDescending
+TestSetIntersectionDescendingSimpleDevice
+TestSetIntersectionDescendingSimpleHost
+TestSetIntersectionDeviceSeq
+TestSetIntersectionDispatchExplicit
+TestSetIntersectionDispatchImplicit
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionKeyValue
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceByKey
+TestSetSymmetricDifferenceByKeyCudaStreams
+TestSetSymmetricDifferenceByKeyDescending
+TestSetSymmetricDifferenceByKeyDescendingSimpleDevice
+TestSetSymmetricDifferenceByKeyDescendingSimpleHost
+TestSetSymmetricDifferenceByKeyDeviceSeq
+TestSetSymmetricDifferenceByKeyDispatchExplicit
+TestSetSymmetricDifferenceByKeyDispatchImplicit
+TestSetSymmetricDifferenceByKeyEquivalentRanges
+TestSetSymmetricDifferenceByKeyMultiset
+TestSetSymmetricDifferenceByKeySimpleDevice
+TestSetSymmetricDifferenceByKeySimpleHost
+TestSetSymmetricDifferenceCudaStreams
+TestSetSymmetricDifferenceDescending
+TestSetSymmetricDifferenceDescendingSimpleDevice
+TestSetSymmetricDifferenceDescendingSimpleHost
+TestSetSymmetricDifferenceDeviceSeq
+TestSetSymmetricDifferenceDispatchExplicit
+TestSetSymmetricDifferenceDispatchImplicit
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionByKey
+TestSetUnionByKeyCudaStreams
+TestSetUnionByKeyDescending
+TestSetUnionByKeyDescendingSimpleDevice
+TestSetUnionByKeyDescendingSimpleHost
+TestSetUnionByKeyDeviceSeq
+TestSetUnionByKeyDispatchExplicit
+TestSetUnionByKeyDispatchImplicit
+TestSetUnionByKeyEquivalentRanges
+TestSetUnionByKeyMultiset
+TestSetUnionByKeySimpleDevice
+TestSetUnionByKeySimpleHost
+TestSetUnionCudaStreams
+TestSetUnionDescending
+TestSetUnionDescendingSimpleDevice
+TestSetUnionDescendingSimpleHost
+TestSetUnionDeviceSeq
+TestSetUnionDispatchExplicit
+TestSetUnionDispatchImplicit
+TestSetUnionKeyValue
+TestSetUnionKeyValueDescending
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestSortAscendingKey
+TestSortAscendingKeyValue
+TestSortBool
+TestSortBoolDescending
+TestSortByKeyBool
+TestSortByKeyBoolDescending
+TestSortByKeyCudaStreams
+TestSortByKeyDeviceSeq
+TestSortByKeyDispatchExplicit
+TestSortByKeyDispatchImplicit
+TestSortByKeyPermutationIteratorDevice
+TestSortByKeyPermutationIteratorHost
+TestSortByKeySimpleDevice
+TestSortByKeySimpleHost
+TestSortByKeyVariableBits
+TestSortCudaStreams
+TestSortDescendingKey
+TestSortDescendingKeyValue
+TestSortDeviceSeq
+TestSortDispatchExplicit
+TestSortDispatchImplicit
+TestSortPermutationIteratorDevice
+TestSortPermutationIteratorHost
+TestSortSimpleDevice
+TestSortSimpleHost
+TestSortVariableBits
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopyDeviceSeq
+TestStablePartitionCopyDispatchExplicit
+TestStablePartitionCopyDispatchImplicit
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyStencil
+TestStablePartitionCopyStencilDispatchExplicit
+TestStablePartitionCopyStencilDispatchImplicit
+TestStablePartitionCopyStencilSimpleDevice
+TestStablePartitionCopyStencilSimpleHost
+TestStablePartitionCopyStencilToDiscardIterator
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionDeviceSeq
+TestStablePartitionDispatchExplicit
+TestStablePartitionDispatchImplicit
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionStencil
+TestStablePartitionStencilDeviceSeq
+TestStablePartitionStencilDispatchExplicit
+TestStablePartitionStencilDispatchImplicit
+TestStablePartitionStencilSimpleDevice
+TestStablePartitionStencilSimpleHost
+TestStablePartitionStencilZipIteratorDevice
+TestStablePartitionStencilZipIteratorHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestStableSort
+TestStableSortByKey
+TestStableSortByKeyDispatchExplicit
+TestStableSortByKeyDispatchImplicit
+TestStableSortByKeyPermutationIteratorDevice
+TestStableSortByKeyPermutationIteratorHost
+TestStableSortByKeySemantics
+TestStableSortByKeySimpleDevice
+TestStableSortByKeySimpleHost
+TestStableSortByKeyWithLargeKeys
+TestStableSortByKeyWithLargeKeysAndValues
+TestStableSortByKeyWithLargeValues
+TestStableSortDispatchExplicit
+TestStableSortDispatchImplicit
+TestStableSortPermutationIteratorDevice
+TestStableSortPermutationIteratorHost
+TestStableSortSemantics
+TestStableSortSimpleDevice
+TestStableSortSimpleHost
+TestStableSortWithIndirectionDevice
+TestStableSortWithIndirectionHost
+TestStableSortWithLargeKeys
+TestStandardIntegerTypes
+TestSwapRanges
+TestSwapRangesCudaStreams
+TestSwapRangesDeviceSeq
+TestSwapRangesDispatchExplicit
+TestSwapRangesDispatchImplicit
+TestSwapRangesSimpleDevice
+TestSwapRangesSimpleHost
+TestSwapRangesUserSwap
+TestTabulate
+TestTabulateCudaStreams
+TestTabulateDeviceSeq
+TestTabulateDispatchExplicit
+TestTabulateDispatchImplicit
+TestTabulateSimpleDevice
+TestTabulateSimpleHost
+TestTabulateToDiscardIterator
+TestTaus88Equal
+TestTaus88Max
+TestTaus88Min
+TestTaus88SaveRestore
+TestTaus88Unequal
+TestTaus88Validation
+TestTransformBinary
+TestTransformBinaryCountingIterator
+TestTransformBinaryCudaStreams
+TestTransformBinaryDeviceSeq
+TestTransformBinaryDispatchExplicit
+TestTransformBinaryDispatchImplicit
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformExclusiveScanDispatchExplicit
+TestTransformExclusiveScanDispatchImplicit
+TestTransformIfBinary
+TestTransformIfBinaryDeviceSeq
+TestTransformIfBinaryDispatchExplicit
+TestTransformIfBinaryDispatchImplicit
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryDeviceSeq
+TestTransformIfUnaryDispatchExplicit
+TestTransformIfUnaryDispatchImplicit
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilDeviceSeq
+TestTransformIfUnaryNoStencilDispatchExplicit
+TestTransformIfUnaryNoStencilDispatchImplicit
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformInclusiveScanDispatchExplicit
+TestTransformInclusiveScanDispatchImplicit
+TestTransformIteratorDevice
+TestTransformIteratorHost
+TestTransformIteratorReduce
+TestTransformReduce
+TestTransformReduceCountingIteratorDevice
+TestTransformReduceCountingIteratorHost
+TestTransformReduceCudaStreams
+TestTransformReduceDeviceSeq
+TestTransformReduceDispatchExplicit
+TestTransformReduceDispatchImplicit
+TestTransformReduceFromConst
+TestTransformReduceSimpleDevice
+TestTransformReduceSimpleHost
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanCudaStreams
+TestTransformScanDeviceSeq
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIterator
+TestTransformUnaryCudaStreams
+TestTransformUnaryDeviceSeq
+TestTransformUnaryDispatchExplicit
+TestTransformUnaryDispatchImplicit
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestTrivialSequenceDevice
+TestTrivialSequenceHost
+TestTupleComparison
+TestTupleConstructor
+TestTupleGet
+TestTupleReduce
+TestTupleScan
+TestTupleStableSort
+TestTupleSwap
+TestTupleTie
+TestTupleTransform
+TestTypeName
+TestUniformDecomposition
+TestUniformIntDistributionMax
+TestUniformIntDistributionMin
+TestUniformIntDistributionSaveRestore
+TestUniformRealDistributionMax
+TestUniformRealDistributionMin
+TestUniformRealDistributionSaveRestore
+TestUninitializedCopyCudaStreams
+TestUninitializedCopyDeviceSeq
+TestUninitializedCopyDispatchExplicit
+TestUninitializedCopyDispatchImplicit
+TestUninitializedCopyNCudaStreams
+TestUninitializedCopyNDeviceSeq
+TestUninitializedCopyNDispatchExplicit
+TestUninitializedCopyNDispatchImplicit
+TestUninitializedCopyNNonPODDevice
+TestUninitializedCopyNNonPODHost
+TestUninitializedCopyNSimplePODDevice
+TestUninitializedCopyNSimplePODHost
+TestUninitializedCopyNonPODDevice
+TestUninitializedCopyNonPODHost
+TestUninitializedCopySimplePODDevice
+TestUninitializedCopySimplePODHost
+TestUninitializedFillCudaStreams
+TestUninitializedFillDeviceSeq
+TestUninitializedFillDispatchExplicit
+TestUninitializedFillDispatchImplicit
+TestUninitializedFillNCudaStreams
+TestUninitializedFillNDeviceSeq
+TestUninitializedFillNDispatchExplicit
+TestUninitializedFillNDispatchImplicit
+TestUninitializedFillNNonPOD
+TestUninitializedFillNPODDevice
+TestUninitializedFillNPODHost
+TestUninitializedFillNonPOD
+TestUninitializedFillPODDevice
+TestUninitializedFillPODHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeyCopyDispatchExplicit
+TestUniqueByKeyCopyDispatchImplicit
+TestUniqueByKeyCudaStreams
+TestUniqueByKeyDeviceSeq
+TestUniqueByKeyDispatchExplicit
+TestUniqueByKeyDispatchImplicit
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeyCudaStreams
+TestUniqueCopyByKeyDeviceSeq
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopyCudaStreams
+TestUniqueCopyDeviceSeq
+TestUniqueCopyDispatchExplicit
+TestUniqueCopyDispatchImplicit
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueCudaStreams
+TestUniqueDeviceSeq
+TestUniqueDispatchExplicit
+TestUniqueDispatchImplicit
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorAssignFromBiDirectionalIteratorDevice
+TestVectorAssignFromBiDirectionalIteratorHost
+TestVectorAssignFromDeviceVectorDevice
+TestVectorAssignFromDeviceVectorHost
+TestVectorAssignFromHostVectorDevice
+TestVectorAssignFromHostVectorHost
+TestVectorAssignFromSTLVectorDevice
+TestVectorAssignFromSTLVectorHost
+TestVectorBinarySearch
+TestVectorBinarySearchDescending
+TestVectorBinarySearchDescendingSimpleDevice
+TestVectorBinarySearchDescendingSimpleHost
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchDispatchExplicit
+TestVectorBinarySearchDispatchImplicit
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorBool
+TestVectorContainingLargeType
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorDataDevice
+TestVectorDataHost
+TestVectorElementAssignmentDevice
+TestVectorElementAssignmentHost
+TestVectorEquality
+TestVectorErasePositionDevice
+TestVectorErasePositionHost
+TestVectorEraseRangeDevice
+TestVectorEraseRangeHost
+TestVectorFillAssignDevice
+TestVectorFillAssignHost
+TestVectorFillInsert
+TestVectorFillInsertSimple<thrust::device_vector>
+TestVectorFillInsertSimple<thrust::host_vector>
+TestVectorFromBiDirectionalIteratorDevice
+TestVectorFromBiDirectionalIteratorHost
+TestVectorFromSTLVectorDevice
+TestVectorFromSTLVectorHost
+TestVectorFrontBackDevice
+TestVectorFrontBackHost
+TestVectorInequality
+TestVectorLowerBound
+TestVectorLowerBoundDescending
+TestVectorLowerBoundDescendingSimpleDevice
+TestVectorLowerBoundDescendingSimpleHost
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundDispatchExplicit
+TestVectorLowerBoundDispatchImplicit
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorManipulationDevice
+TestVectorManipulationHost
+TestVectorRangeInsert
+TestVectorRangeInsertSimple<thrust::device_vector>
+TestVectorRangeInsertSimple<thrust::host_vector>
+TestVectorReservingDevice
+TestVectorReservingHost
+TestVectorResizingDevice
+TestVectorResizingHost
+TestVectorReversedDevice
+TestVectorReversedHost
+TestVectorShrinkToFitDevice
+TestVectorShrinkToFitHost
+TestVectorSwapDevice
+TestVectorSwapHost
+TestVectorToAndFromDeviceVectorDevice
+TestVectorToAndFromDeviceVectorHost
+TestVectorToAndFromHostVectorDevice
+TestVectorToAndFromHostVectorHost
+TestVectorUpperBound
+TestVectorUpperBoundDescending
+TestVectorUpperBoundDescendingSimpleDevice
+TestVectorUpperBoundDescendingSimpleHost
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundDispatchExplicit
+TestVectorUpperBoundDispatchImplicit
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
+TestVectorWithInitialValueDevice
+TestVectorWithInitialValueHost
+TestVectorZeroSizeDevice
+TestVectorZeroSizeHost
+TestZipIteratorCopyAoSToSoA
+TestZipIteratorCopyDevice
+TestZipIteratorCopyHost
+TestZipIteratorCopySoAToAoS
+TestZipIteratorManipulation
+TestZipIteratorReduce
+TestZipIteratorReduceByKey
+TestZipIteratorReference
+TestZipIteratorScan
+TestZipIteratorStableSort
+TestZipIteratorStableSortByKey
+TestZipIteratorSystem
+TestZipIteratorTransform
+TestZipIteratorTraversal
+TestZippedDiscardIterator
diff --git a/thrust/internal/test/unittest_omp.lst b/thrust/internal/test/unittest_omp.lst
new file mode 100644
index 0000000000000000000000000000000000000000..f59230e895fe3b99e5fd9a24cd8bf0d0a164f7db
--- /dev/null
+++ b/thrust/internal/test/unittest_omp.lst
@@ -0,0 +1,808 @@
+TestAdjacentDifference
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAdvanceDevice
+TestAdvanceHost
+TestAllOfDevice
+TestAllOfHost
+TestAnyOfDevice
+TestAnyOfHost
+TestAssertEqual
+TestAssertGEqual
+TestAssertLEqual
+TestBitAndFunctionalDevice
+TestBitAndFunctionalHost
+TestBitOrFunctionalDevice
+TestBitOrFunctionalHost
+TestBitXorFunctionalDevice
+TestBitXorFunctionalHost
+TestComputeCapability
+TestConstantIteratorComparison
+TestConstantIteratorConstructFromConvertibleSpace
+TestConstantIteratorCopyDevice
+TestConstantIteratorCopyHost
+TestConstantIteratorIncrement
+TestConstantIteratorReduce
+TestConstantIteratorTransformDevice
+TestConstantIteratorTransformHost
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDeviceThrow
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyNConstantIteratorToZipIteratorDevice
+TestCopyNConstantIteratorToZipIteratorHost
+TestCopyNCountingIteratorDevice
+TestCopyNCountingIteratorHost
+TestCopyNFromConstIterator
+TestCopyNListToDevice
+TestCopyNListToHost
+TestCopyNMatchingTypesDevice
+TestCopyNMatchingTypesHost
+TestCopyNMixedTypesDevice
+TestCopyNMixedTypesHost
+TestCopyNToDiscardIterator
+TestCopyNVectorBool
+TestCopyNZipIteratorDevice
+TestCopyNZipIteratorHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestCountingIteratorComparison
+TestCountingIteratorCopyConstructor
+TestCountingIteratorDifference
+TestCountingIteratorDistance
+TestCountingIteratorIncrement
+TestCountingIteratorLowerBound
+TestCountingIteratorUnsignedType
+TestDeviceDeleteDestructorInvocation
+TestDeviceDereferenceCountingIterator
+TestDeviceDereferenceDevicePtr
+TestDeviceDereferenceDeviceVectorIterator
+TestDeviceDereferenceTransformIterator
+TestDeviceDereferenceTransformedCountingIterator
+TestDevicePointerManipulation
+TestDeviceReferenceAssignmentFromDeviceReference
+TestDeviceReferenceConstructorFromDevicePointer
+TestDeviceReferenceConstructorFromDeviceReference
+TestDeviceReferenceManipulation
+TestDiscardIteratorComparison
+TestDiscardIteratorIncrement
+TestDistanceDevice
+TestDistanceHost
+TestDividesFunctionalDevice
+TestDividesFunctionalHost
+TestEqual
+TestEqualSimpleDevice
+TestEqualSimpleHost
+TestEqualToFunctionalDevice
+TestEqualToFunctionalHost
+TestExclusiveScan32
+TestExclusiveScanByKeySimpleDevice
+TestExclusiveScanByKeySimpleHost
+TestExclusiveScanNullPtr
+TestFill
+TestFillDiscardIterator
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDiscardIterator
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestFind
+TestFindIf
+TestFindIfNot
+TestFindIfNotSimpleDevice
+TestFindIfNotSimpleHost
+TestFindIfSimpleDevice
+TestFindIfSimpleHost
+TestFindSimpleDevice
+TestFindSimpleHost
+TestForEach
+TestForEachLargeRegisterFootprint
+TestForEachSimpleAnySpace
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestFunctionalPlaceholdersBinaryEqualToDevice
+TestFunctionalPlaceholdersBinaryEqualToHost
+TestFunctionalPlaceholdersBinaryGreaterDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualHost
+TestFunctionalPlaceholdersBinaryGreaterHost
+TestFunctionalPlaceholdersBinaryLessDevice
+TestFunctionalPlaceholdersBinaryLessEqualDevice
+TestFunctionalPlaceholdersBinaryLessEqualHost
+TestFunctionalPlaceholdersBinaryLessHost
+TestFunctionalPlaceholdersBinaryNotEqualToDevice
+TestFunctionalPlaceholdersBinaryNotEqualToHost
+TestFunctionalPlaceholdersBitAnd<thrust::device_vector>
+TestFunctionalPlaceholdersBitAnd<thrust::host_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitNegateDevice
+TestFunctionalPlaceholdersBitNegateHost
+TestFunctionalPlaceholdersBitOr<thrust::device_vector>
+TestFunctionalPlaceholdersBitOr<thrust::host_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitXor<thrust::device_vector>
+TestFunctionalPlaceholdersBitXor<thrust::host_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::host_vector>
+TestFunctionalPlaceholdersDivides<thrust::device_vector>
+TestFunctionalPlaceholdersDivides<thrust::host_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersLogicalAndDevice
+TestFunctionalPlaceholdersLogicalAndHost
+TestFunctionalPlaceholdersLogicalNotDevice
+TestFunctionalPlaceholdersLogicalNotHost
+TestFunctionalPlaceholdersLogicalOrDevice
+TestFunctionalPlaceholdersLogicalOrHost
+TestFunctionalPlaceholdersMinus<thrust::device_vector>
+TestFunctionalPlaceholdersMinus<thrust::host_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersModulus<thrust::device_vector>
+TestFunctionalPlaceholdersModulus<thrust::host_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::device_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::host_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersNegateDevice
+TestFunctionalPlaceholdersNegateHost
+TestFunctionalPlaceholdersPlus<thrust::device_vector>
+TestFunctionalPlaceholdersPlus<thrust::host_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersPrefixDecrementDevice
+TestFunctionalPlaceholdersPrefixDecrementHost
+TestFunctionalPlaceholdersPrefixIncrementDevice
+TestFunctionalPlaceholdersPrefixIncrementHost
+TestFunctionalPlaceholdersSuffixDecrementDevice
+TestFunctionalPlaceholdersSuffixDecrementHost
+TestFunctionalPlaceholdersSuffixIncrementDevice
+TestFunctionalPlaceholdersSuffixIncrementHost
+TestFunctionalPlaceholdersTransformIterator<thrust::device_vector>
+TestFunctionalPlaceholdersTransformIterator<thrust::host_vector>
+TestFunctionalPlaceholdersUnaryPlusDevice
+TestFunctionalPlaceholdersUnaryPlusHost
+TestFunctionalPlaceholdersValue<thrust::device_vector>
+TestFunctionalPlaceholdersValue<thrust::host_vector>
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherIf
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestGreaterEqualFunctionalDevice
+TestGreaterEqualFunctionalHost
+TestGreaterFunctionalDevice
+TestGreaterFunctionalHost
+TestIdentityFunctionalDevice
+TestIdentityFunctionalHost
+TestInclusiveScan32
+TestInclusiveScanByKeySimpleDevice
+TestInclusiveScanByKeySimpleHost
+TestInclusiveScanByKeyTransformIteratorDevice
+TestInclusiveScanByKeyTransformIteratorHost
+TestInclusiveScanWithIndirectionDevice
+TestInclusiveScanWithIndirectionHost
+TestInnerProduct
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPartitionedDevice
+TestIsPartitionedHost
+TestIsPartitionedSimpleDevice
+TestIsPartitionedSimpleHost
+TestIsPlainOldData
+TestIsSortedDevice
+TestIsSortedHost
+TestIsSortedRepeatedElementsDevice
+TestIsSortedRepeatedElementsHost
+TestIsSortedSimpleDevice
+TestIsSortedSimpleHost
+TestIsSortedUntilDevice
+TestIsSortedUntilHost
+TestIsSortedUntilRepeatedElementsDevice
+TestIsSortedUntilRepeatedElementsHost
+TestIsSortedUntilSimpleDevice
+TestIsSortedUntilSimpleHost
+TestIsTrivialIterator
+TestLessEqualFunctionalDevice
+TestLessEqualFunctionalHost
+TestLessFunctionalDevice
+TestLessFunctionalHost
+TestLog2
+TestLogicalAndFunctionalDevice
+TestLogicalAndFunctionalHost
+TestLogicalNotFunctionalDevice
+TestLogicalNotFunctionalHost
+TestLogicalOrFunctionalDevice
+TestLogicalOrFunctionalHost
+TestMakeConstantIterator
+TestMakeDevicePointer
+TestMakeDiscardIterator
+TestMakePermutationIteratorDevice
+TestMakePermutationIteratorHost
+TestMakeTransformIteratorDevice
+TestMakeTransformIteratorHost
+TestMakeTuple
+TestMax
+TestMaxActiveBlocks
+TestMaxBlocksize
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMaximumFunctionalDevice
+TestMaximumFunctionalHost
+TestMerge
+TestMergeDescending
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeSortAscendingKey
+TestMergeSortAscendingKeyValue
+TestMergeSortDescendingKey
+TestMergeSortDescendingKeyValue
+TestMergeSortKeySimple
+TestMergeSortKeyValueSimple
+TestMergeSortStableKeySimple
+TestMergeToDiscardIterator
+TestMin
+TestMinElement
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestMinimumFunctionalDevice
+TestMinimumFunctionalHost
+TestMinstdRand0Equal
+TestMinstdRand0Max
+TestMinstdRand0Min
+TestMinstdRand0SaveRestore
+TestMinstdRand0Unequal
+TestMinstdRand0Validation
+TestMinstdRandEqual
+TestMinstdRandMax
+TestMinstdRandMin
+TestMinstdRandSaveRestore
+TestMinstdRandUnequal
+TestMinstdRandValidation
+TestMinusFunctionalDevice
+TestMinusFunctionalHost
+TestMismatchSimpleDevice
+TestMismatchSimpleHost
+TestModulusFunctionalDevice
+TestModulusFunctionalHost
+TestMultipliesFunctionalDevice
+TestMultipliesFunctionalHost
+TestNegateFunctionalDevice
+TestNegateFunctionalHost
+TestNoneOfDevice
+TestNoneOfHost
+TestNot1Device
+TestNot1Host
+TestNot2Device
+TestNot2Host
+TestNotEqualToFunctionalDevice
+TestNotEqualToFunctionalHost
+TestNullPtrDereferenceYieldsError
+TestPairComparison
+TestPairGet
+TestPairManipulation
+TestPairReduce
+TestPairScan
+TestPairScanByKey
+TestPairStableSort
+TestPairStableSortByKey
+TestPairTransform
+TestPairTupleElement
+TestPairTupleSize
+TestPartition
+TestPartitionCopy
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyToDiscardIterator
+TestPartitionPointDevice
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestPermutationIteratorGatherDevice
+TestPermutationIteratorGatherHost
+TestPermutationIteratorHostDeviceGather
+TestPermutationIteratorHostDeviceScatter
+TestPermutationIteratorReduceDevice
+TestPermutationIteratorReduceHost
+TestPermutationIteratorScatterDevice
+TestPermutationIteratorScatterHost
+TestPermutationIteratorSimpleDevice
+TestPermutationIteratorSimpleHost
+TestPermutationIteratorWithCountingIteratorDevice
+TestPermutationIteratorWithCountingIteratorHost
+TestPlusFunctionalDevice
+TestPlusFunctionalHost
+TestProject1stFunctionalDevice
+TestProject1stFunctionalHost
+TestProject2ndFunctionalDevice
+TestProject2ndFunctionalHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortByKeyLongLongValues
+TestRadixSortByKeyShortValues
+TestRadixSortByKeyUnaligned
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestRanlux24BaseEqual
+TestRanlux24BaseMax
+TestRanlux24BaseMin
+TestRanlux24BaseSaveRestore
+TestRanlux24BaseUnequal
+TestRanlux24BaseValidation
+TestRanlux24Equal
+TestRanlux24Max
+TestRanlux24Min
+TestRanlux24SaveRestore
+TestRanlux24Unequal
+TestRanlux24Validation
+TestRanlux48BaseEqual
+TestRanlux48BaseMax
+TestRanlux48BaseMin
+TestRanlux48BaseSaveRestore
+TestRanlux48BaseUnequal
+TestRanlux48BaseValidation
+TestRanlux48Equal
+TestRanlux48Max
+TestRanlux48Min
+TestRanlux48SaveRestore
+TestRanlux48Unequal
+TestRanlux48Validation
+TestRawPointerCastDevice
+TestRawPointerCastHost
+TestReduce
+TestReduceByKey
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceIntervals
+TestReduceIntervalsSimpleDevice
+TestReduceIntervalsSimpleHost
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceNullPtr
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithLargeTypes
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyIf
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveIf
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyIf
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceIf
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseIteratorCopyConstructor
+TestReverseIteratorCopyDevice
+TestReverseIteratorCopyHost
+TestReverseIteratorExclusiveScan
+TestReverseIteratorExclusiveScanSimple
+TestReverseIteratorIncrement
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestScalarBinarySearchDescendingSimpleDevice
+TestScalarBinarySearchDescendingSimpleHost
+TestScalarBinarySearchSimpleDevice
+TestScalarBinarySearchSimpleHost
+TestScalarEqualRangeDescendingSimpleDevice
+TestScalarEqualRangeDescendingSimpleHost
+TestScalarEqualRangeSimpleDevice
+TestScalarEqualRangeSimpleHost
+TestScalarLowerBoundDescendingSimpleDevice
+TestScalarLowerBoundDescendingSimpleHost
+TestScalarLowerBoundSimpleDevice
+TestScalarLowerBoundSimpleHost
+TestScalarUpperBoundDescendingSimpleDevice
+TestScalarUpperBoundDescendingSimpleHost
+TestScalarUpperBoundSimpleDevice
+TestScalarUpperBoundSimpleHost
+TestScan
+TestScanByKeyHeadFlagsDevice
+TestScanByKeyHeadFlagsHost
+TestScanByKeyLargeInput
+TestScanByKeyMixedTypes
+TestScanByKeyReusedKeysDevice
+TestScanByKeyReusedKeysHost
+TestScanByKeyWithLargeTypes
+TestScanMixedTypes
+TestScanMixedTypesDevice
+TestScanMixedTypesHost
+TestScanSimpleDevice
+TestScanSimpleHost
+TestScanToDiscardIterator
+TestScanWithLargeTypes
+TestScanWithOperator
+TestScanWithOperatorToDiscardIterator
+TestScatter
+TestScatterCountingIteratorDevice
+TestScatterCountingIteratorHost
+TestScatterIf
+TestScatterIfCountingIteratorDevice
+TestScatterIfCountingIteratorHost
+TestScatterIfSimpleDevice
+TestScatterIfSimpleHost
+TestScatterIfToDiscardIterator
+TestScatterSimpleDevice
+TestScatterSimpleHost
+TestScatterToDiscardIterator
+TestSelect
+TestSelectKeyValue
+TestSelectSemantics
+TestSequence
+TestSequenceSimpleDevice
+TestSequenceSimpleHost
+TestSequenceToDiscardIterator
+TestSetDifference
+TestSetDifferenceDescending
+TestSetDifferenceDescendingSimpleDevice
+TestSetDifferenceDescendingSimpleHost
+TestSetDifferenceEquivalentRanges
+TestSetDifferenceKeyValue
+TestSetDifferenceMultiset
+TestSetDifferenceSimpleDevice
+TestSetDifferenceSimpleHost
+TestSetIntersection
+TestSetIntersectionDescending
+TestSetIntersectionDescendingSimpleDevice
+TestSetIntersectionDescendingSimpleHost
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionKeyValue
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceDescending
+TestSetSymmetricDifferenceDescendingSimpleDevice
+TestSetSymmetricDifferenceDescendingSimpleHost
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionDescending
+TestSetUnionKeyValue
+TestSetUnionKeyValueDescending
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestSortAscendingKey
+TestSortAscendingKeyValue
+TestSortByKeySimpleDevice
+TestSortByKeySimpleHost
+TestSortByKeyVariableBits
+TestSortDescendingKey
+TestSortDescendingKeyValue
+TestSortNullPtr
+TestSortSimpleDevice
+TestSortSimpleHost
+TestSortVariableBits
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestStableSort
+TestStableSortByKey
+TestStableSortByKeySemantics
+TestStableSortByKeySimpleDevice
+TestStableSortByKeySimpleHost
+TestStableSortByKeyWithLargeKeys
+TestStableSortByKeyWithLargeKeysAndValues
+TestStableSortByKeyWithLargeValues
+TestStableSortSemantics
+TestStableSortSimpleDevice
+TestStableSortSimpleHost
+TestStableSortWithIndirectionDevice
+TestStableSortWithIndirectionHost
+TestStableSortWithLargeKeys
+TestStandardIntegerTypes
+TestSwapRanges
+TestSwapRangesSimpleDevice
+TestSwapRangesSimpleHost
+TestSwapRangesUserSwap
+TestTaus88Equal
+TestTaus88Max
+TestTaus88Min
+TestTaus88SaveRestore
+TestTaus88Unequal
+TestTaus88Validation
+TestTransformBinary
+TestTransformBinaryCountingIteratorDevice
+TestTransformBinaryCountingIteratorHost
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformIfBinary
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformIteratorDevice
+TestTransformIteratorHost
+TestTransformIteratorReduce
+TestTransformNullPtr
+TestTransformReduce
+TestTransformReduceCountingIteratorDevice
+TestTransformReduceCountingIteratorHost
+TestTransformReduceFromConst
+TestTransformReduceSimpleDevice
+TestTransformReduceSimpleHost
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIteratorDevice
+TestTransformUnaryCountingIteratorHost
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestTrivialSequenceDevice
+TestTrivialSequenceHost
+TestTupleComparison
+TestTupleConstructor
+TestTupleGet
+TestTupleReduce
+TestTupleScan
+TestTupleStableSort
+TestTupleTie
+TestTupleTransform
+TestTypeName
+TestUniformDecomposition
+TestUniformIntDistributionMax
+TestUniformIntDistributionMin
+TestUniformIntDistributionSaveRestore
+TestUniformRealDistributionMax
+TestUniformRealDistributionMin
+TestUniformRealDistributionSaveRestore
+TestUninitializedCopyNonPODDevice
+TestUninitializedCopyNonPODHost
+TestUninitializedCopySimplePODDevice
+TestUninitializedCopySimplePODHost
+TestUninitializedFillNNonPOD
+TestUninitializedFillNPODDevice
+TestUninitializedFillNPODHost
+TestUninitializedFillNonPOD
+TestUninitializedFillPODDevice
+TestUninitializedFillPODHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorAssignFromBiDirectionalIteratorDevice
+TestVectorAssignFromBiDirectionalIteratorHost
+TestVectorAssignFromDeviceVectorDevice
+TestVectorAssignFromDeviceVectorHost
+TestVectorAssignFromHostVectorDevice
+TestVectorAssignFromHostVectorHost
+TestVectorAssignFromSTLVectorDevice
+TestVectorAssignFromSTLVectorHost
+TestVectorBinarySearch
+TestVectorBinarySearchDescending
+TestVectorBinarySearchDescendingSimpleDevice
+TestVectorBinarySearchDescendingSimpleHost
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorBool
+TestVectorContainingLargeType
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorDataDevice
+TestVectorDataHost
+TestVectorElementAssignmentDevice
+TestVectorElementAssignmentHost
+TestVectorEquality
+TestVectorErasePositionDevice
+TestVectorErasePositionHost
+TestVectorEraseRangeDevice
+TestVectorEraseRangeHost
+TestVectorFillAssignDevice
+TestVectorFillAssignHost
+TestVectorFillInsert
+TestVectorFillInsertSimple<thrust::device_vector>
+TestVectorFillInsertSimple<thrust::host_vector>
+TestVectorFromBiDirectionalIteratorDevice
+TestVectorFromBiDirectionalIteratorHost
+TestVectorFromSTLVectorDevice
+TestVectorFromSTLVectorHost
+TestVectorFrontBackDevice
+TestVectorFrontBackHost
+TestVectorInequality
+TestVectorLowerBound
+TestVectorLowerBoundDescending
+TestVectorLowerBoundDescendingSimpleDevice
+TestVectorLowerBoundDescendingSimpleHost
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorManipulationDevice
+TestVectorManipulationHost
+TestVectorRangeInsert
+TestVectorRangeInsertSimple<thrust::device_vector>
+TestVectorRangeInsertSimple<thrust::host_vector>
+TestVectorReservingDevice
+TestVectorReservingHost
+TestVectorResizingDevice
+TestVectorResizingHost
+TestVectorReversedDevice
+TestVectorReversedHost
+TestVectorShrinkToFitDevice
+TestVectorShrinkToFitHost
+TestVectorSwapDevice
+TestVectorSwapHost
+TestVectorToAndFromDeviceVectorDevice
+TestVectorToAndFromDeviceVectorHost
+TestVectorToAndFromHostVectorDevice
+TestVectorToAndFromHostVectorHost
+TestVectorUpperBound
+TestVectorUpperBoundDescending
+TestVectorUpperBoundDescendingSimpleDevice
+TestVectorUpperBoundDescendingSimpleHost
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
+TestVectorWithInitialValueDevice
+TestVectorWithInitialValueHost
+TestVectorZeroSizeDevice
+TestVectorZeroSizeHost
+TestZipIteratorCopyAoSToSoA
+TestZipIteratorCopyDevice
+TestZipIteratorCopyHost
+TestZipIteratorCopySoAToAoS
+TestZipIteratorManipulation
+TestZipIteratorReduce
+TestZipIteratorReduceByKey
+TestZipIteratorReference
+TestZipIteratorScan
+TestZipIteratorSpace
+TestZipIteratorStableSort
+TestZipIteratorStableSortByKey
+TestZipIteratorTransform
+TestZipIteratorTraversal
+TestZippedDiscardIterator
diff --git a/thrust/internal/test/warningstester.cu b/thrust/internal/test/warningstester.cu
new file mode 100644
index 0000000000000000000000000000000000000000..77c2947ac6eb38583b1d949b48bc749a20e052a6
--- /dev/null
+++ b/thrust/internal/test/warningstester.cu
@@ -0,0 +1,8 @@
+//#include "cuda_runtime_api.h"
+#include "warningstester.h"
+
+int main()
+{
+  return 0;
+}
+
diff --git a/thrust/testing/CMakeLists.txt b/thrust/testing/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdfc04e97b6bd35d73b7b3eab58dd272170f930b
--- /dev/null
+++ b/thrust/testing/CMakeLists.txt
@@ -0,0 +1,157 @@
+# Create meta targets that build all tests for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.tests)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_TESTS_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+# Generate testing framework libraries:
+add_subdirectory(unittest)
+
+# List of tests that aren't implemented for all backends, but are implemented for CUDA.
+set(partially_implemented_CUDA
+  async_copy
+  async_for_each
+  async_reduce
+  async_reduce_into
+  async_sort
+  async_transform
+  event
+  future
+)
+
+# List of tests that aren't implemented for all backends, but are implemented for CPP.
+set(partially_implemented_CPP
+)
+
+# List of tests that aren't implemented for all backends, but are implemented for TBB.
+set(partially_implemented_TBB
+)
+
+# List of tests that aren't implemented for all backends, but are implemented for OMP.
+set(partially_implemented_OMP
+)
+
+# List of all partially implemented tests.
+set(partially_implemented
+  ${partially_implemented_CUDA}
+  ${partially_implemented_CPP}
+  ${partially_implemented_TBB}
+  ${partially_implemented_OMP}
+)
+list(REMOVE_DUPLICATES partially_implemented)
+
+## thrust_add_test
+#
+# Add a test executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the test
+#   target. Useful for post-processing target information per-backend.
+# test_name: The name of the test minus "<config_prefix>.test." For example,
+#   testing/vector.cu will be "vector", and testing/cuda/copy.cu will be
+#   "cuda.copy".
+# test_src: The source file that implements the test.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_test target_name_var test_name test_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_test_src "${test_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_test_src "${test_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(test_target ${config_prefix}.test.${test_name})
+  set(${target_name_var} ${test_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_framework_target ${config_prefix}.test.framework)
+  set(config_meta_target ${config_prefix}.tests)
+  set(test_meta_target thrust.all.test.${test_name})
+
+  add_executable(${test_target} "${real_test_src}")
+  target_link_libraries(${test_target} ${config_framework_target})
+  target_include_directories(${test_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${test_target} ${thrust_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${test_target})
+
+  # Meta target that builds tests with this name for all configurations:
+  if (NOT TARGET ${test_meta_target})
+    add_custom_target(${test_meta_target})
+  endif()
+  add_dependencies(${test_meta_target} ${test_target})
+
+  add_test(NAME ${test_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DTHRUST_BINARY=$<TARGET_FILE:${test_target}>"
+    "-DTHRUST_SOURCE=${Thrust_SOURCE_DIR}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunTest.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${test_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+
+  # Check for per-test script. Script will be included in the current scope
+  # to allow custom property modifications.
+  get_filename_component(test_cmake_script "${test_src}" NAME_WLE)
+  set(test_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${test_cmake_script}.cmake")
+  if (EXISTS "${test_cmake_script}")
+    include("${test_cmake_script}")
+  endif()
+endfunction()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# Add common tests to all configs:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    if ("${test_name}" IN_LIST partially_implemented)
+      # This test is partially implemented on _some_ backends...
+      if (NOT "${test_name}" IN_LIST partially_implemented_${config_device})
+        # ...but not on the current one.
+        continue()
+      endif()
+    endif()
+
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+
+    if (THRUST_ENABLE_TESTS_WITH_RDC AND ("CUDA" STREQUAL "${config_device}"))
+      thrust_enable_rdc_for_cuda_target(${test_target})
+    endif()
+  endforeach()
+endforeach()
+
+# Add specialized tests:
+add_subdirectory(cpp)
+add_subdirectory(cuda)
+add_subdirectory(omp)
+add_subdirectory(regression)
diff --git a/thrust/testing/adjacent_difference.cu b/thrust/testing/adjacent_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5f97ea350226ad7ecf03d5ab93a12e21fdddd2b9
--- /dev/null
+++ b/thrust/testing/adjacent_difference.cu
@@ -0,0 +1,163 @@
+#include <unittest/unittest.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+template <class Vector>
+void TestAdjacentDifferenceSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input(3);
+    Vector output(3);
+    input[0] = 1; input[1] = 4; input[2] = 6;
+
+    typename Vector::iterator result;
+
+    result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());
+
+    ASSERT_EQUAL(result - output.begin(), 3);
+    ASSERT_EQUAL(output[0], T(1));
+    ASSERT_EQUAL(output[1], T(3));
+    ASSERT_EQUAL(output[2], T(2));
+
+    result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());
+
+    ASSERT_EQUAL(result - output.begin(), 3);
+    ASSERT_EQUAL(output[0], T( 1));
+    ASSERT_EQUAL(output[1], T( 5));
+    ASSERT_EQUAL(output[2], T(10));
+
+    // test in-place operation, result and first are permitted to be the same
+    result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());
+
+    ASSERT_EQUAL(result - input.begin(), 3);
+    ASSERT_EQUAL(input[0], T(1));
+    ASSERT_EQUAL(input[1], T(3));
+    ASSERT_EQUAL(input[2], T(2));
+}
+DECLARE_VECTOR_UNITTEST(TestAdjacentDifferenceSimple);
+
+
+template <typename T>
+void TestAdjacentDifference(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    typename thrust::host_vector<T>::iterator   h_result;
+    typename thrust::device_vector<T>::iterator d_result;
+
+    h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
+    d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
+    ASSERT_EQUAL(h_output, d_output);
+
+    h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
+    d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
+
+    ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
+    ASSERT_EQUAL(h_output, d_output);
+
+    // in-place operation
+    h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
+    d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
+
+    ASSERT_EQUAL(std::size_t(h_result - h_input.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_input.begin()), n);
+    ASSERT_EQUAL(h_input, h_output); //computed previously
+    ASSERT_EQUAL(d_input, d_output); //computed previously
+}
+DECLARE_VARIABLE_UNITTEST(TestAdjacentDifference);
+
+template<typename T>
+void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    typename thrust::host_vector<T>::iterator   h_result;
+    typename thrust::device_vector<T>::iterator d_result;
+
+    h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
+    d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
+
+    // in-place operation with different iterator types
+    h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
+    d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
+
+    ASSERT_EQUAL(std::size_t(h_result - h_input.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_input.begin()), n);
+    ASSERT_EQUAL(h_output, h_input); // reference computed previously
+    ASSERT_EQUAL(d_output, d_input); // reference computed previously
+}
+DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes);
+
+template <typename T>
+void TestAdjacentDifferenceDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> h_result =
+      thrust::adjacent_difference(h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+    thrust::discard_iterator<> d_result =
+      thrust::adjacent_difference(d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDiscardIterator);
+
+template<typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(my_system &system, InputIterator, InputIterator, OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestAdjacentDifferenceDispatchExplicit()
+{
+    thrust::device_vector<int> d_input(1);
+
+    my_system sys(0);
+    thrust::adjacent_difference(sys,
+                                d_input.begin(),
+                                d_input.end(),
+                                d_input.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceDispatchExplicit);
+
+template<typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(my_tag, InputIterator, InputIterator, OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestAdjacentDifferenceDispatchImplicit()
+{
+    thrust::device_vector<int> d_input(1);
+
+    thrust::adjacent_difference(thrust::retag<my_tag>(d_input.begin()),
+                                thrust::retag<my_tag>(d_input.end()),
+                                thrust::retag<my_tag>(d_input.begin()));
+
+    ASSERT_EQUAL(13, d_input.front());
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);
diff --git a/thrust/testing/advance.cu b/thrust/testing/advance.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0860ef598b896d343513c88cd64de0666c2b67da
--- /dev/null
+++ b/thrust/testing/advance.cu
@@ -0,0 +1,93 @@
+#include <unittest/unittest.h>
+#include <thrust/advance.h>
+#include <thrust/sequence.h>
+
+// TODO expand this with other iterator types (forward, bidirectional, etc.)
+
+template <typename Vector>
+void TestAdvance()
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(10);
+    thrust::sequence(v.begin(), v.end());
+
+    Iterator i = v.begin();
+
+    thrust::advance(i, 1);
+
+    ASSERT_EQUAL(*i, T(1));
+    
+    thrust::advance(i, 8);
+
+    ASSERT_EQUAL(*i, T(9));
+    
+    thrust::advance(i, -4);
+
+    ASSERT_EQUAL(*i, T(5));
+}
+DECLARE_VECTOR_UNITTEST(TestAdvance);
+
+template <typename Vector>
+void TestNext()
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(10);
+    thrust::sequence(v.begin(), v.end());
+
+    Iterator const i0 = v.begin();
+
+    Iterator const i1 = thrust::next(i0);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    
+    Iterator const i2 = thrust::next(i1, 8);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    ASSERT_EQUAL(*i2, T(9));
+    
+    Iterator const i3 = thrust::next(i2, -4);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    ASSERT_EQUAL(*i2, T(9));
+    ASSERT_EQUAL(*i3, T(5));
+}
+DECLARE_VECTOR_UNITTEST(TestNext);
+
+template <typename Vector>
+void TestPrev()
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(10);
+    thrust::sequence(v.begin(), v.end());
+
+    Iterator const i0 = v.end();
+
+    Iterator const i1 = thrust::prev(i0);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    
+    Iterator const i2 = thrust::prev(i1, 8);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    ASSERT_EQUAL(*i2, T(1));
+    
+    Iterator const i3 = thrust::prev(i2, -4);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    ASSERT_EQUAL(*i2, T(1));
+    ASSERT_EQUAL(*i3, T(5));
+}
+DECLARE_VECTOR_UNITTEST(TestPrev);
+
diff --git a/thrust/testing/alignment.cu b/thrust/testing/alignment.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e55df2e961dd6e41bf6ad929b32bffec9e906a2a
--- /dev/null
+++ b/thrust/testing/alignment.cu
@@ -0,0 +1,360 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/alignment.h>
+
+struct alignof_mock_0
+{
+    char a;
+    char b;
+}; // size: 2 * sizeof(char), alignment: sizeof(char)
+
+struct alignof_mock_1
+{
+    int n;
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_2
+{
+    int n;
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_3
+{
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+    int n;
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_4
+{
+    char c0;
+    // sizeof(int) - sizeof(char) bytes of padding
+    int n;
+    char c1;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 3 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_5
+{
+    char c0;
+    char c1;
+    // sizeof(int) - 2 * sizeof(char) bytes of padding
+    int n;
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_6
+{
+    int n;
+    char c0;
+    char c1;
+    // sizeof(int) - 2 * sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+void test_alignof_mocks_sizes()
+{
+    ASSERT_EQUAL(sizeof(alignof_mock_0), 2 * sizeof(char));
+    ASSERT_EQUAL(sizeof(alignof_mock_1), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_2), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_3), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_4), 3 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_5), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_6), 2 * sizeof(int));
+}
+DECLARE_UNITTEST(test_alignof_mocks_sizes);
+
+void test_alignof()
+{
+    ASSERT_EQUAL(THRUST_ALIGNOF(bool)                  , sizeof(bool));
+    ASSERT_EQUAL(THRUST_ALIGNOF(signed char)           , sizeof(signed char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned char)         , sizeof(unsigned char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(char)                  , sizeof(char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(short int)             , sizeof(short int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned short int)    , sizeof(unsigned short int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(int)                   , sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned int)          , sizeof(unsigned int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long int)              , sizeof(long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned long int)     , sizeof(unsigned long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long long int)         , sizeof(long long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned long long int), sizeof(unsigned long long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(float)                 , sizeof(float));
+    ASSERT_EQUAL(THRUST_ALIGNOF(double)                , sizeof(double));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long double)           , sizeof(long double));
+
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_0), sizeof(char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_1), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_2), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_3), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_4), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_5), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_6), sizeof(int));
+}
+DECLARE_UNITTEST(test_alignof);
+
+void test_alignment_of()
+{
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<bool>::value
+      , sizeof(bool)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<signed char>::value
+      , sizeof(signed char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned char>::value
+      , sizeof(unsigned char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<char>::value
+      , sizeof(char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<short int>::value
+      , sizeof(short int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned short int>::value
+      , sizeof(unsigned short int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<int>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned int>::value
+      , sizeof(unsigned int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long int>::value
+      , sizeof(long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned long int>::value
+      , sizeof(unsigned long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long long int>::value
+      , sizeof(long long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned long long int>::value
+      , sizeof(unsigned long long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<float>::value
+      , sizeof(float)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<double>::value
+      , sizeof(double)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long double>::value
+      , sizeof(long double)
+    );
+
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_0>::value
+      , sizeof(char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_1>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_2>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_3>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_4>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_5>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_6>::value
+      , sizeof(int)
+    );
+}
+DECLARE_UNITTEST(test_alignment_of);
+
+template <std::size_t Align>
+void test_aligned_type_instantiation()
+{
+    typedef typename thrust::detail::aligned_type<Align>::type type;
+    ASSERT_GEQUAL(sizeof(type), 1lu);
+    ASSERT_EQUAL(THRUST_ALIGNOF(type), Align);
+    ASSERT_EQUAL(thrust::detail::alignment_of<type>::value, Align);
+}
+
+void test_aligned_type()
+{
+    test_aligned_type_instantiation<1>();
+    test_aligned_type_instantiation<2>();
+    test_aligned_type_instantiation<4>();
+    test_aligned_type_instantiation<8>();
+    test_aligned_type_instantiation<16>();
+    test_aligned_type_instantiation<32>();
+    test_aligned_type_instantiation<64>();
+    test_aligned_type_instantiation<128>();
+}
+DECLARE_UNITTEST(test_aligned_type);
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation(thrust::detail::true_type /* Align is valid */)
+{
+    typedef typename thrust::detail::aligned_storage<Len, Align>::type type;
+    ASSERT_GEQUAL(sizeof(type), Len);
+    ASSERT_EQUAL(THRUST_ALIGNOF(type), Align);
+    ASSERT_EQUAL(thrust::detail::alignment_of<type>::value, Align);
+}
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation(thrust::detail::false_type /* Align is invalid */)
+{
+  // no-op -- alignment is > max_align_t and MSVC complains loudly.
+}
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation()
+{
+  typedef thrust::detail::integral_constant<
+      bool, Align <= THRUST_ALIGNOF(thrust::detail::max_align_t)>
+      ValidAlign;
+  test_aligned_storage_instantiation<Len, Align>(ValidAlign());
+}
+
+template <std::size_t Len>
+void test_aligned_storage_size()
+{
+    test_aligned_storage_instantiation<Len, 1>();
+    test_aligned_storage_instantiation<Len, 2>();
+    test_aligned_storage_instantiation<Len, 4>();
+    test_aligned_storage_instantiation<Len, 8>();
+    test_aligned_storage_instantiation<Len, 16>();
+    test_aligned_storage_instantiation<Len, 32>();
+    test_aligned_storage_instantiation<Len, 64>();
+    test_aligned_storage_instantiation<Len, 128>();
+}
+
+void test_aligned_storage()
+{
+    test_aligned_storage_size<1>();
+    test_aligned_storage_size<2>();
+    test_aligned_storage_size<4>();
+    test_aligned_storage_size<8>();
+    test_aligned_storage_size<16>();
+    test_aligned_storage_size<32>();
+    test_aligned_storage_size<64>();
+    test_aligned_storage_size<128>();
+    test_aligned_storage_size<256>();
+    test_aligned_storage_size<512>();
+    test_aligned_storage_size<1024>();
+    test_aligned_storage_size<2048>();
+    test_aligned_storage_size<4096>();
+    test_aligned_storage_size<8192>();
+    test_aligned_storage_size<16384>();
+
+    test_aligned_storage_size<3>();
+    test_aligned_storage_size<5>();
+    test_aligned_storage_size<7>();
+
+    test_aligned_storage_size<17>();
+    test_aligned_storage_size<42>();
+
+    test_aligned_storage_size<10000>();
+}
+DECLARE_UNITTEST(test_aligned_storage);
+
+void test_max_align_t()
+{
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(bool)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(signed char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(short int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned short int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned long long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(float)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(double)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long double)
+    );
+}
+DECLARE_UNITTEST(test_max_align_t);
+
+void test_aligned_reinterpret_cast()
+{
+    thrust::detail::aligned_type<1>* a1 = 0;
+
+    thrust::detail::aligned_type<2>* a2 = 0;
+
+    // Cast to type with stricter (larger) alignment requirement.
+    a2 = thrust::detail::aligned_reinterpret_cast<
+        thrust::detail::aligned_type<2>*
+    >(a1);
+
+    // Cast to type with less strict (smaller) alignment requirement.
+    a1 = thrust::detail::aligned_reinterpret_cast<
+        thrust::detail::aligned_type<1>*
+    >(a2);
+}
+DECLARE_UNITTEST(test_aligned_reinterpret_cast);
+
diff --git a/thrust/testing/allocator.cu b/thrust/testing/allocator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a29408de95381ab3e02ef2e7e24db6d65b8af6b5
--- /dev/null
+++ b/thrust/testing/allocator.cu
@@ -0,0 +1,255 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/config.h>
+#include <thrust/device_malloc_allocator.h>
+#include <thrust/system/cpp/vector.h>
+#include <memory>
+
+template <typename T>
+struct my_allocator_with_custom_construct1
+  : thrust::device_malloc_allocator<T>
+{
+  __host__ __device__
+  my_allocator_with_custom_construct1()
+  {}
+
+  __host__ __device__
+  void construct(T *p)
+  {
+    *p = 13;
+  }
+};
+
+template <typename T>
+void TestAllocatorCustomDefaultConstruct(size_t n)
+{
+  thrust::device_vector<T> ref(n, 13);
+  thrust::device_vector<T, my_allocator_with_custom_construct1<T> > vec(n);
+
+  ASSERT_EQUAL_QUIET(ref, vec);
+}
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDefaultConstruct);
+
+template <typename T>
+struct my_allocator_with_custom_construct2
+  : thrust::device_malloc_allocator<T>
+{
+  __host__ __device__
+  my_allocator_with_custom_construct2()
+  {}
+
+  template <typename Arg>
+  __host__ __device__
+  void construct(T *p, const Arg &)
+  {
+    *p = 13;
+  }
+};
+
+template <typename T>
+void TestAllocatorCustomCopyConstruct(size_t n)
+{
+  thrust::device_vector<T> ref(n, 13);
+  thrust::device_vector<T> copy_from(n, 7);
+  thrust::device_vector<T, my_allocator_with_custom_construct2<T> >
+    vec(copy_from.begin(), copy_from.end());
+
+  ASSERT_EQUAL_QUIET(ref, vec);
+}
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct);
+
+template <typename T>
+struct my_allocator_with_custom_destroy
+{
+  typedef T         value_type;
+  typedef T &       reference;
+  typedef const T & const_reference;
+
+  static bool g_state;
+
+  __host__
+  my_allocator_with_custom_destroy(){}
+
+  __host__
+  my_allocator_with_custom_destroy(const my_allocator_with_custom_destroy &other)
+    : use_me_to_alloc(other.use_me_to_alloc)
+  {}
+
+  __host__
+  ~my_allocator_with_custom_destroy(){}
+
+  __host__ __device__
+  void destroy(T *)
+  {
+#if !__CUDA_ARCH__
+    g_state = true;
+#endif
+  }
+
+  value_type *allocate(std::ptrdiff_t n)
+  {
+    return use_me_to_alloc.allocate(n);
+  }
+
+  void deallocate(value_type *ptr, std::ptrdiff_t n)
+  {
+    use_me_to_alloc.deallocate(ptr,n);
+  }
+
+  bool operator==(const my_allocator_with_custom_destroy &) const
+  {
+    return true;
+  }
+
+  bool operator!=(const my_allocator_with_custom_destroy &other) const
+  {
+    return !(*this == other);
+  }
+
+  typedef thrust::detail::true_type is_always_equal;
+
+  // use composition rather than inheritance
+  // to avoid inheriting std::allocator's member
+  // function destroy
+  std::allocator<T> use_me_to_alloc;
+};
+
+template <typename T>
+bool my_allocator_with_custom_destroy<T>::g_state = false;
+
+template <typename T>
+void TestAllocatorCustomDestroy(size_t n)
+{
+  {
+    thrust::cpp::vector<T, my_allocator_with_custom_destroy<T> > vec(n);
+  } // destroy everything
+
+  if (0 < n)
+    ASSERT_EQUAL(true, my_allocator_with_custom_destroy<T>::g_state);
+}
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy);
+
+template <typename T>
+struct my_minimal_allocator
+{
+  typedef T         value_type;
+
+  // XXX ideally, we shouldn't require
+  //     these two typedefs
+  typedef T &       reference;
+  typedef const T & const_reference;
+
+  __host__
+  my_minimal_allocator(){}
+
+  __host__
+  my_minimal_allocator(const my_minimal_allocator &other)
+    : use_me_to_alloc(other.use_me_to_alloc)
+  {}
+
+  __host__
+  ~my_minimal_allocator(){}
+
+  value_type *allocate(std::ptrdiff_t n)
+  {
+    return use_me_to_alloc.allocate(n);
+  }
+
+  void deallocate(value_type *ptr, std::ptrdiff_t n)
+  {
+    use_me_to_alloc.deallocate(ptr,n);
+  }
+
+  std::allocator<T> use_me_to_alloc;
+};
+
+template <typename T>
+void TestAllocatorMinimal(size_t n)
+{
+  thrust::cpp::vector<int, my_minimal_allocator<int> > vec(n, 13);
+
+  // XXX copy to h_vec because ASSERT_EQUAL doesn't know about cpp::vector
+  thrust::host_vector<int> h_vec(vec.begin(), vec.end());
+  thrust::host_vector<int> ref(n, 13);
+
+  ASSERT_EQUAL(ref, h_vec);
+}
+DECLARE_VARIABLE_UNITTEST(TestAllocatorMinimal);
+
+void TestAllocatorTraitsRebind()
+{
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_traits<float>::other,
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<float>
+      >
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_traits<float>::other,
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<float>
+      >
+    >::value),
+    true
+  );
+}
+DECLARE_UNITTEST(TestAllocatorTraitsRebind);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestAllocatorTraitsRebindCpp11()
+{
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_alloc<float>,
+      thrust::device_malloc_allocator<float>
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_alloc<float>,
+      my_minimal_allocator<float>
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_traits<float>,
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<float>
+      >
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_traits<float>,
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<float>
+      >
+    >::value),
+    true
+  );
+}
+DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
+#endif // C++11
+
diff --git a/thrust/testing/allocator_aware_policies.cu b/thrust/testing/allocator_aware_policies.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aaf841c707a1f9b25be4d5e4a6a101a221b45a93
--- /dev/null
+++ b/thrust/testing/allocator_aware_policies.cu
@@ -0,0 +1,156 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/seq.h>
+#include <thrust/system/cpp/detail/par.h>
+#include <thrust/system/omp/detail/par.h>
+#include <thrust/system/tbb/detail/par.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <thrust/system/cuda/detail/par.h>
+#endif
+
+template<typename T>
+struct test_allocator_t
+{
+};
+
+test_allocator_t<int> test_allocator = test_allocator_t<int>();
+const test_allocator_t<int> const_test_allocator = test_allocator_t<int>();
+
+struct test_memory_resource_t THRUST_FINAL : thrust::mr::memory_resource<>
+{
+    void * do_allocate(std::size_t size, std::size_t) THRUST_OVERRIDE
+    {
+        return reinterpret_cast<void *>(size);
+    }
+
+    void do_deallocate(void * ptr, std::size_t size, std::size_t) THRUST_OVERRIDE
+    {
+        ASSERT_EQUAL(ptr, reinterpret_cast<void *>(size));
+    }
+} test_memory_resource;
+
+template<typename Policy, template <typename> class CRTPBase>
+struct policy_info
+{
+    typedef Policy policy;
+
+    template<template <typename, template <typename> class> class Template, typename Argument>
+    struct apply_base_second
+    {
+        typedef Template<Argument, CRTPBase> type;
+    };
+};
+
+template<typename PolicyInfo>
+struct TestAllocatorAttachment
+{
+    template<typename Expected, typename T>
+    static void assert_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator,
+                    Expected
+                >::type
+            >::value), true);
+    }
+
+    template<typename ExpectedResource, typename T>
+    static void assert_npa_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator,
+                    thrust::mr::allocator<
+                        thrust::detail::max_align_t,
+                        ExpectedResource
+                    >
+                >::type
+            >::value), true);
+    }
+
+    template<typename Policy>
+    void test_temporary_allocation_valid(Policy policy)
+    {
+        using thrust::detail::get_temporary_buffer;
+
+        return_temporary_buffer(
+            policy,
+            get_temporary_buffer<int>(
+                policy,
+                123
+            ).first,
+            123
+        );
+    }
+
+    void operator()()
+    {
+        typename PolicyInfo::policy policy;
+
+        // test correctness of attachment
+        assert_correct<test_allocator_t<int> >(policy(test_allocator_t<int>()));
+        assert_correct<test_allocator_t<int>&>(policy(test_allocator));
+        assert_correct<test_allocator_t<int> >(policy(const_test_allocator));
+
+        assert_npa_correct<test_memory_resource_t>(policy(&test_memory_resource));
+
+        // test whether the resulting policy is actually usable
+        // a real allocator is necessary here, unlike above
+        std::allocator<int> alloc;
+        const std::allocator<int> const_alloc;
+
+        test_temporary_allocation_valid(policy(std::allocator<int>()));
+        test_temporary_allocation_valid(policy(alloc));
+        test_temporary_allocation_valid(policy(const_alloc));
+        test_temporary_allocation_valid(policy(&test_memory_resource));
+
+        #if THRUST_CPP_DIALECT >= 2011
+        test_temporary_allocation_valid(policy(std::allocator<int>()).after(1));
+        test_temporary_allocation_valid(policy(alloc).after(1));
+        test_temporary_allocation_valid(policy(const_alloc).after(1));
+        #endif
+    }
+};
+
+typedef policy_info<
+    thrust::detail::seq_t,
+    thrust::system::detail::sequential::execution_policy
+> sequential_info;
+typedef policy_info<
+    thrust::system::cpp::detail::par_t,
+    thrust::system::cpp::detail::execution_policy
+> cpp_par_info;
+typedef policy_info<
+    thrust::system::omp::detail::par_t,
+    thrust::system::omp::detail::execution_policy
+> omp_par_info;
+typedef policy_info<
+    thrust::system::tbb::detail::par_t,
+    thrust::system::tbb::detail::execution_policy
+> tbb_par_info;
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+typedef policy_info<
+    thrust::system::cuda::detail::par_t,
+    thrust::cuda_cub::execute_on_stream_base
+> cuda_par_info;
+#endif
+
+SimpleUnitTest<
+    TestAllocatorAttachment,
+    unittest::type_list<
+        sequential_info,
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cuda_par_info,
+#endif
+        cpp_par_info,
+        omp_par_info,
+        tbb_par_info
+    >
+> TestAllocatorAttachmentInstance;
diff --git a/thrust/testing/async_copy.cu b/thrust/testing/async_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b92024cc62f0146c85845e39cd0329b9704828b5
--- /dev/null
+++ b/thrust/testing/async_copy.cu
@@ -0,0 +1,420 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/limits.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#define DEFINE_ASYNC_COPY_CALLABLE(name, ...)                                 \
+  struct THRUST_PP_CAT2(name, _fn)                                            \
+  {                                                                           \
+    template <typename ForwardIt, typename Sentinel, typename OutputIt>       \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    ) const                                                                   \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::copy(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(output)               \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy
+);
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host,   thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device, thrust::device
+);
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host_to_device,    thrust::host,   thrust::device
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device_to_host,    thrust::device, thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host_to_host,      thrust::host,   thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device_to_device,  thrust::device, thrust::device
+);
+
+#undef DEFINE_ASYNC_COPY_CALLABLE
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_host_to_device
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0(n);
+
+      auto f0 = AsyncCopyCallable{}(
+        h0.begin(), h0.end(), d0.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_host_to_device<invoke_async_copy_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_host_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_host_to_device<invoke_async_copy_host_to_device_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_host_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_device_to_host
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::host_vector<T>   h1(n);
+      thrust::device_vector<T> d0(n);
+
+      thrust::copy(h0.begin(), h0.end(), d0.begin());
+
+      ASSERT_EQUAL(h0, d0);
+
+      auto f0 = AsyncCopyCallable{}(
+        d0.begin(), d0.end(), h1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+      ASSERT_EQUAL(d0, h1);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_host<invoke_async_copy_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_device_to_host
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_host<invoke_async_copy_device_to_host_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_device_to_host_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_device_to_device
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0(n);
+      thrust::device_vector<T> d1(n);
+
+      thrust::copy(h0.begin(), h0.end(), d0.begin());
+
+      ASSERT_EQUAL(h0, d0);
+
+      auto f0 = AsyncCopyCallable{}(
+        d0.begin(), d0.end(), d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+      ASSERT_EQUAL(d0, d1);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_device_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_device_to_device_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Non ContiguousIterator input.
+template <typename AsyncCopyCallable>
+struct test_async_copy_counting_iterator_input_to_device_vector
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(
+        unittest::truncate_to_max_representable<T>(n)
+      );
+
+      thrust::device_vector<T> d0(n);
+      thrust::device_vector<T> d1(n);
+
+      thrust::copy(first, last, d0.begin());
+
+      auto f0 = AsyncCopyCallable{}(
+        first, last, d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(d0, d1);
+    }
+  };
+};
+// TODO: Re-add custom_numeric when it supports counting iterators.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_device_to_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device_policies
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_host_to_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_host_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Non ContiguousIterator input.
+template <typename AsyncCopyCallable>
+struct test_async_copy_counting_iterator_input_to_host_vector
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(
+        unittest::truncate_to_max_representable<T>(n)
+      );
+
+      thrust::host_vector<T> d0(n);
+      thrust::host_vector<T> d1(n);
+
+      thrust::copy(first, last, d0.begin());
+
+      auto f0 = AsyncCopyCallable{}(
+        first, last, d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(d0, d1);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_host_vector<
+    invoke_async_copy_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_host
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_host_vector<
+    invoke_async_copy_device_to_host_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_host_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_roundtrip
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(n);
+
+    auto e0 = thrust::async::copy(
+      thrust::host, thrust::device
+    , h0.begin(), h0.end(), d0.begin()
+    );
+
+    auto e1 = thrust::async::copy(
+      thrust::device.after(e0), thrust::host
+    , d0.begin(), d0.end(), h0.begin()
+    );
+
+    TEST_EVENT_WAIT(e1);
+
+    ASSERT_EQUAL(h0, d0);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_roundtrip
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_roundtrip
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h1(n);
+    thrust::device_vector<T> d0(n);
+    thrust::device_vector<T> d1(n);
+    thrust::device_vector<T> d2(n);
+
+    auto e0 = thrust::async::copy(
+      h0.begin(), h0.end(), d0.begin()
+    );
+
+    ASSERT_EQUAL(true, e0.valid_stream());
+
+    auto const e0_stream = e0.stream().native_handle();
+
+    auto e1 = thrust::async::copy(
+      thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(e1);
+
+    auto e2 = thrust::async::copy(
+      thrust::host, after_policy2
+    , h0.begin(), h0.end(), d2.begin()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::host, after_policy2
+      , h0.begin(), h0.end(), d2.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e2.stream().native_handle());
+
+    auto e3 = thrust::async::copy(
+      thrust::device.after(e2), thrust::host
+    , d1.begin(), d1.end(), h1.begin()
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e3.stream().native_handle());
+
+    TEST_EVENT_WAIT(e3);
+
+    ASSERT_EQUAL(h0, h1);
+    ASSERT_EQUAL(h0, d0);
+    ASSERT_EQUAL(h0, d1);
+    ASSERT_EQUAL(h0, d2);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_copy_after
+, BuiltinNumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: device_to_device NonContiguousIterator output (discard_iterator).
+
+// TODO: host_to_device non trivially relocatable.
+
+// TODO: device_to_host non trivially relocatable.
+
+// TODO: host_to_device NonContiguousIterator input (counting_iterator).
+
+// TODO: host_to_device NonContiguousIterator output (discard_iterator).
+
+// TODO: device_to_host NonContiguousIterator input (counting_iterator).
+
+// TODO: device_to_host NonContiguousIterator output (discard_iterator).
+
+// TODO: Mixed types, needs loosening of `is_trivially_relocatable_to` logic.
+
+// TODO: H->D copy, then dependent D->H copy (round trip).
+// Can't do this today because we can't do cross-system with explicit policies.
+
+#endif
+
diff --git a/thrust/testing/async_for_each.cu b/thrust/testing/async_for_each.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a09adf255c208b6b3577bef38d82c891fb064e82
--- /dev/null
+++ b/thrust/testing/async_for_each.cu
@@ -0,0 +1,99 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+
+#include <thrust/async/for_each.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#define DEFINE_ASYNC_FOR_EACH_CALLABLE(name, ...)                             \
+  struct THRUST_PP_CAT2(name, _fn)                                            \
+  {                                                                           \
+    template <typename ForwardIt, typename Sentinel, typename UnaryFunction>  \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, UnaryFunction&& f                   \
+    ) const                                                                   \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::for_each(                                              \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(f)                    \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_FOR_EACH_CALLABLE(
+  invoke_async_for_each
+);
+
+DEFINE_ASYNC_FOR_EACH_CALLABLE(
+  invoke_async_for_each_device, thrust::device
+);
+
+#undef DEFINE_ASYNC_FOR_EACH_CALLABLE
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct inplace_divide_by_2
+{
+  template <typename T>
+  __host__ __device__
+  void operator()(T& x) const
+  {
+    x /= 2;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncForEachCallable, typename UnaryFunction>
+struct test_async_for_each
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0_data(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0_data(h0_data);
+
+      thrust::for_each(h0_data.begin(), h0_data.end(), UnaryFunction{});
+
+      auto f0 = AsyncForEachCallable{}(
+        d0_data.begin(), d0_data.end(), UnaryFunction{}
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0_data, d0_data);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_for_each<
+      invoke_async_for_each_fn
+    , inplace_divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_for_each
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_for_each<
+      invoke_async_for_each_device_fn
+    , inplace_divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_for_each_policy
+);
+
+#endif
+
diff --git a/thrust/testing/async_reduce.cu b/thrust/testing/async_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5357c1af3f0edbcddba6802ffb2f2a2d80519156
--- /dev/null
+++ b/thrust/testing/async_reduce.cu
@@ -0,0 +1,1136 @@
+#define THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/reduce.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+template <typename T>
+struct custom_plus
+{
+  __host__ __device__
+  T operator()(T lhs, T rhs) const
+  {
+    return lhs + rhs;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(                                 \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::reduce(                                                \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_REDUCE_INVOKER(NAME, ...)                                \
+  DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(                                       \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_REDUCE_INVOKER(NAME, ...)                                 \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::reduce(                                                       \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init_custom_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init_custom_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceInvoker
+, template <typename> class SyncReduceInvoker
+>
+struct test_async_reduce
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      AsyncReduceInvoker<T> invoke_async;
+      SyncReduceInvoker<T>  invoke_sync;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end());
+      auto f0b = invoke_async(d0b.begin(), d0b.end());
+      auto f0c = invoke_async(d0c.begin(), d0c.end());
+      auto f0d = invoke_async(d0d.begin(), d0d.end());
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(h0.begin(), h0.end());
+
+      auto const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+      auto const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+      auto const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f0c);
+      auto const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f0d);
+
+      ASSERT_EQUAL(r0, r1a);
+      ASSERT_EQUAL(r0, r1b);
+      ASSERT_EQUAL(r0, r1c);
+      ASSERT_EQUAL(r0, r1d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init_custom_plus
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceInvoker
+, template <typename> class SyncReduceInvoker
+>
+struct test_async_reduce_counting_iterator
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()()
+    {
+      constexpr std::size_t n = 15 * sizeof(T);
+
+      ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(n);
+
+      AsyncReduceInvoker<T> invoke_async;
+      SyncReduceInvoker<T>  invoke_sync;
+
+      auto f0a = invoke_async(first, last);
+      auto f0b = invoke_async(first, last);
+      auto f0c = invoke_async(first, last);
+      auto f0d = invoke_async(first, last);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(first, last);
+
+      auto const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+      auto const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+      auto const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f0c);
+      auto const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f0d);
+
+      ASSERT_EQUAL(r0, r1a);
+      ASSERT_EQUAL(r0, r1b);
+      ASSERT_EQUAL(r0, r1c);
+      ASSERT_EQUAL(r0, r1d);
+    }
+  };
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init_custom_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init_custom_plus
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_using
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0a(h0);
+    thrust::device_vector<T> d0b(h0);
+
+    ASSERT_EQUAL(h0, d0a);
+    ASSERT_EQUAL(h0, d0b);
+
+    thrust::device_future<T> f0a;
+    thrust::device_future<T> f0b;
+
+    // When you import the customization points into the global namespace,
+    // they should be selected instead of the synchronous algorithms.
+    {
+      using namespace thrust::async;
+      f0a = reduce(d0a.begin(), d0a.end());
+    }
+    {
+      using thrust::async::reduce;
+      f0b = reduce(d0b.begin(), d0b.end());
+    }
+
+    // ADL should find the synchronous algorithms.
+    // This potentially runs concurrently with the copies.
+    T const r0 = reduce(h0.begin(), h0.end());
+
+    T const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+    T const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+
+    ASSERT_EQUAL(r0, r1a);
+    ASSERT_EQUAL(r0, r1b);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_using
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    auto f0 = thrust::async::reduce(
+      d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL(true, f0.valid_stream());
+ 
+    auto const f0_stream = f0.stream().native_handle();
+
+    auto f1 = thrust::async::reduce(
+      thrust::device.after(f0), d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device.after(f0), d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(f0_stream, f1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(f1);
+
+    auto f2 = thrust::async::reduce(
+      after_policy2, d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        after_policy2, d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(f0_stream, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_on_then_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    cudaStream_t stream;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)
+    );
+
+    auto f0 = thrust::async::reduce(
+      thrust::device.on(stream), d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f0.stream().native_handle());
+
+    auto f1 = thrust::async::reduce(
+      thrust::device.after(f0), d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device.after(f0), d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(f1);
+
+    auto f2 = thrust::async::reduce(
+      after_policy2, d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        after_policy2, d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamDestroy(stream)
+    );
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_on_then_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_allocator_on_then_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    cudaStream_t stream0;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream0, cudaStreamNonBlocking)
+    );
+
+    cudaStream_t stream1;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)
+    );
+
+    auto f0 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).on(stream0)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL_QUIET(stream0, f0.stream().native_handle());
+
+    auto f1 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).after(f0)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device(thrust::device_allocator<void>{}).after(f0)
+      , d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream0, f1.stream().native_handle());
+
+    auto f2 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).on(stream1).after(f1)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device(thrust::device_allocator<void>{}).on(stream1).after(f1)
+      , d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    KNOWN_FAILURE;
+    // FIXME: The below fails because you can't combine allocator attachment,
+    // `.on`, and `.after`.
+    ASSERT_EQUAL_QUIET(stream1, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+
+    thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream0));
+    thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream1));
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_allocator_on_then_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_caching
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    constexpr std::int64_t m = 32;
+
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    T const* f0_raw_data;
+
+    {
+      // Perform one reduction to ensure there's an entry in the caching
+      // allocator.
+      auto f0 = thrust::async::reduce(d0.begin(), d0.end());
+
+      TEST_EVENT_WAIT(f0);
+
+      f0_raw_data = f0.raw_data();
+    }
+
+    for (std::int64_t i = 0; i < m; ++i)
+    {
+      auto f1 = thrust::async::reduce(d0.begin(), d0.end());
+
+      ASSERT_EQUAL(true, f1.valid_stream());
+      ASSERT_EQUAL(true, f1.valid_content());
+
+      ASSERT_EQUAL_QUIET(f0_raw_data, f1.raw_data());
+
+      // This potentially runs concurrently with the copies.
+      T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+      T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f1);
+
+      ASSERT_EQUAL(r0, r1);
+    }
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_caching
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_then_reduce
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0a(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0b(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0c(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0d(unittest::random_integers<T>(n));
+
+    thrust::device_vector<T> d0a(n);
+    thrust::device_vector<T> d0b(n);
+    thrust::device_vector<T> d0c(n);
+    thrust::device_vector<T> d0d(n);
+
+    auto f0a = thrust::async::copy(h0a.begin(), h0a.end(), d0a.begin());
+    auto f0b = thrust::async::copy(h0b.begin(), h0b.end(), d0b.begin());
+    auto f0c = thrust::async::copy(h0c.begin(), h0c.end(), d0c.begin());
+    auto f0d = thrust::async::copy(h0d.begin(), h0d.end(), d0d.begin());
+
+    ASSERT_EQUAL(true, f0a.valid_stream());
+    ASSERT_EQUAL(true, f0b.valid_stream());
+    ASSERT_EQUAL(true, f0c.valid_stream());
+    ASSERT_EQUAL(true, f0d.valid_stream());
+
+    auto const f0a_stream = f0a.stream().native_handle();
+    auto const f0b_stream = f0b.stream().native_handle();
+    auto const f0c_stream = f0c.stream().native_handle();
+    auto const f0d_stream = f0d.stream().native_handle();
+
+    auto f1a = thrust::async::reduce(
+      thrust::device.after(f0a), d0a.begin(), d0a.end()
+    );
+    auto f1b = thrust::async::reduce(
+      thrust::device.after(f0b), d0b.begin(), d0b.end()
+    );
+    auto f1c = thrust::async::reduce(
+      thrust::device.after(f0c), d0c.begin(), d0c.end()
+    );
+    auto f1d = thrust::async::reduce(
+      thrust::device.after(f0d), d0d.begin(), d0d.end()
+    );
+
+    ASSERT_EQUAL(false, f0a.valid_stream());
+    ASSERT_EQUAL(false, f0b.valid_stream());
+    ASSERT_EQUAL(false, f0c.valid_stream());
+    ASSERT_EQUAL(false, f0d.valid_stream());
+
+    ASSERT_EQUAL(true, f1a.valid_stream());
+    ASSERT_EQUAL(true, f1a.valid_content());
+    ASSERT_EQUAL(true, f1b.valid_stream());
+    ASSERT_EQUAL(true, f1b.valid_content());
+    ASSERT_EQUAL(true, f1c.valid_stream());
+    ASSERT_EQUAL(true, f1c.valid_content());
+    ASSERT_EQUAL(true, f1d.valid_stream());
+    ASSERT_EQUAL(true, f1d.valid_content());
+
+    // Verify that streams were stolen.
+    ASSERT_EQUAL_QUIET(f0a_stream, f1a.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0b_stream, f1b.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0c_stream, f1c.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0d_stream, f1d.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0a.begin(), h0a.end());
+
+    T const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f1a);
+    T const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f1b);
+    T const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f1c);
+    T const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f1d);
+
+    ASSERT_EQUAL(r0, r1a);
+    ASSERT_EQUAL(r0, r1b);
+    ASSERT_EQUAL(r0, r1c);
+    ASSERT_EQUAL(r0, r1d);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_copy_then_reduce
+, BuiltinNumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: when_all from reductions.
+
+#endif
+
diff --git a/thrust/testing/async_reduce_into.cu b/thrust/testing/async_reduce_into.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4a2be99ec7bc754c74c0b762a2f0d36f1b9c045
--- /dev/null
+++ b/thrust/testing/async_reduce_into.cu
@@ -0,0 +1,625 @@
+#define THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/reduce.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_make_unique.h>
+
+template <typename T>
+struct custom_plus
+{
+  __host__ __device__
+  T operator()(T lhs, T rhs) const
+  {
+    return lhs + rhs;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(                            \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::reduce_into(                                           \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_REDUCE_INTO_INVOKER(NAME, ...)                           \
+  DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(                                  \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_REDUCE_INVOKER(NAME, ...)                                 \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::reduce(                                                       \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init_custom_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init_custom_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceIntoInvoker
+, template <typename> class SyncReduceIntoInvoker
+>
+struct test_async_reduce_into
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      auto s0a = thrust::device_make_unique<T>();
+      auto s0b = thrust::device_make_unique<T>();
+      auto s0c = thrust::device_make_unique<T>();
+      auto s0d = thrust::device_make_unique<T>();
+
+      auto const s0a_ptr = s0a.get();
+      auto const s0b_ptr = s0b.get();
+      auto const s0c_ptr = s0c.get();
+      auto const s0d_ptr = s0d.get();
+
+      AsyncReduceIntoInvoker<T> invoke_async;
+      SyncReduceIntoInvoker<T>  invoke_sync;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), s0a_ptr);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), s0b_ptr);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), s0c_ptr);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), s0d_ptr);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(h0.begin(), h0.end());
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(r0, *s0a_ptr);
+      ASSERT_EQUAL(r0, *s0b_ptr);
+      ASSERT_EQUAL(r0, *s0c_ptr);
+      ASSERT_EQUAL(r0, *s0d_ptr);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init_custom_plus
+);
+
+#endif
+
diff --git a/thrust/testing/async_sort.cu b/thrust/testing/async_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b39db3c3b46db5453717369f1bf0c84ca6733350
--- /dev/null
+++ b/thrust/testing/async_sort.cu
@@ -0,0 +1,334 @@
+#include <thrust/detail/config.h>
+
+// Disabled on MSVC && NVCC < 11.1 for GH issue #1098.
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && defined(__CUDACC__)
+#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1)
+#define THRUST_BUG_1098_ACTIVE
+#endif // NVCC version check
+#endif // MSVC + NVCC check
+
+#if THRUST_CPP_DIALECT >= 2014 && !defined(THRUST_BUG_1098_ACTIVE)
+
+#include <unittest/unittest.h>
+
+#include <thrust/async/sort.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+enum wait_policy
+{
+  wait_for_futures
+, do_not_wait_for_futures
+};
+
+template <typename T>
+struct custom_greater
+{
+  __host__ __device__
+  bool operator()(T rhs, T lhs) const
+  {
+    return lhs > rhs;
+  }
+};
+
+#define DEFINE_SORT_INVOKER(name, ...)                                        \
+  template <typename T>                                                       \
+  struct name                                                                 \
+  {                                                                           \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static void sync(                                                         \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    {                                                                         \
+      ::thrust::sort(                                                         \
+        THRUST_FWD(first), THRUST_FWD(last)                                   \
+      );                                                                      \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static auto async(                                                        \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::sort(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last)                                   \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_SORT_INVOKER(
+  sort_invoker
+);
+DEFINE_SORT_INVOKER(
+  sort_invoker_device, thrust::device
+);
+
+#define DEFINE_SORT_OP_INVOKER(name, op, ...)                                 \
+  template <typename T>                                                       \
+  struct name                                                                 \
+  {                                                                           \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static void sync(                                                         \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    {                                                                         \
+      ::thrust::sort(                                                         \
+        THRUST_FWD(first), THRUST_FWD(last), op<T>{}                          \
+      );                                                                      \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static auto async(                                                        \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::sort(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), op<T>{}                          \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_less,        thrust::less
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_less_device, thrust::less, thrust::device 
+);
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_greater,        thrust::greater
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_greater_device, thrust::greater, thrust::device 
+);
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_custom_greater,        custom_greater
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_custom_greater_device, custom_greater, thrust::device 
+);
+
+#undef DEFINE_SORT_INVOKER
+#undef DEFINE_SORT_OP_INVOKER
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <template <typename> class SortInvoker, wait_policy WaitPolicy>
+struct test_async_sort
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0_data(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0_data(h0_data);
+
+      ASSERT_EQUAL(h0_data, d0_data);
+
+      SortInvoker<T>::sync(
+        h0_data.begin(), h0_data.end()
+      );
+
+      auto f0 = SortInvoker<T>::async(
+        d0_data.begin(), d0_data.end()
+      );
+
+      if (wait_for_futures == WaitPolicy)
+      {
+        f0.wait();
+
+        ASSERT_EQUAL(h0_data, d0_data);
+      }
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_less
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_less_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_less
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_less_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_custom_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_custom_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_custom_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_custom_greater_no_wait
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: Async copy then sort.
+
+// TODO: Test future return type.
+
+#endif
+
diff --git a/thrust/testing/async_transform.cu b/thrust/testing/async_transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..efaa885f096b855057f1aec2f8a4344fada964bb
--- /dev/null
+++ b/thrust/testing/async_transform.cu
@@ -0,0 +1,533 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/transform.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+template <typename T>
+struct divide_by_2
+{
+  __host__ __device__
+  T operator()(T x) const
+  {
+    return x / 2;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(                        \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    , typename UnaryOperation                                                 \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    , UnaryOperation&& op                                                     \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::transform(                                             \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(NAME, ...)                       \
+  DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(                              \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_TRANSFORM_UNARY_INVOKER(NAME, ...)                        \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    , typename UnaryOperation                                                 \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    , UnaryOperation&& op                                                     \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::transform(                                                    \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::transform`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::transform`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+
+DEFINE_SYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      thrust::host_vector<T>   h1(n);
+
+      thrust::device_vector<T> d1a(n);
+      thrust::device_vector<T> d1b(n);
+      thrust::device_vector<T> d1c(n);
+      thrust::device_vector<T> d1d(n);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), d1a.begin(), op);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), d1b.begin(), op);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), d1c.begin(), op);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), d1d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(h0.begin(), h0.end(), h1.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      ASSERT_EQUAL(h1, d1a);
+      ASSERT_EQUAL(h1, d1b);
+      ASSERT_EQUAL(h1, d1c);
+      ASSERT_EQUAL(h1, d1d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_allocator
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_allocator_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_on_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_allocator_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_allocator_on_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary_inplace
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), d0a.begin(), op);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), d0b.begin(), op);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), d0c.begin(), op);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), d0d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(h0.begin(), h0.end(), h0.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_allocator
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_allocator_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_on_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_allocator_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_allocator_on_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary_counting_iterator
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()()
+    {
+      constexpr std::size_t n = 15 * sizeof(T);
+
+      ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(n);
+
+      thrust::host_vector<T>   h0(n);
+
+      thrust::device_vector<T> d0a(n);
+      thrust::device_vector<T> d0b(n);
+      thrust::device_vector<T> d0c(n);
+      thrust::device_vector<T> d0d(n);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      auto f0a = invoke_async(first, last, d0a.begin(), op);
+      auto f0b = invoke_async(first, last, d0b.begin(), op);
+      auto f0c = invoke_async(first, last, d0c.begin(), op);
+      auto f0d = invoke_async(first, last, d0d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(first, last, h0.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+    }
+  };
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_counting_iterator<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_transform_unary_counting_iterator_divide_by_2
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_counting_iterator<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_transform_unary_counting_iterator_policy_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class UnaryOperation
+>
+struct test_async_transform_using
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+
+      thrust::host_vector<T>   h1(n);
+
+      thrust::device_vector<T> d1a(n);
+      thrust::device_vector<T> d1b(n);
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+
+      thrust::device_event f0a;
+      thrust::device_event f0b;
+
+      // When you import the customization points into the global namespace,
+      // they should be selected instead of the synchronous algorithms.
+      {
+        using namespace thrust::async;
+        f0a = transform(d0a.begin(), d0a.end(), d1a.begin(), op);
+      }
+      {
+        using thrust::async::transform;
+        f0b = transform(d0b.begin(), d0b.end(), d1b.begin(), op);
+      }
+
+      // ADL should find the synchronous algorithms.
+      // This potentially runs concurrently with the copies.
+      transform(h0.begin(), h0.end(), h1.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+
+      ASSERT_EQUAL(h1, d1a);
+      ASSERT_EQUAL(h1, d1b);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND(test_async_transform_using<divide_by_2>::tester)
+, NumericTypes
+, test_async_transform_using_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif
+
diff --git a/thrust/testing/binary_search.cu b/thrust/testing/binary_search.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2aceb8645876223c018ff7590b8f31ae44571667
--- /dev/null
+++ b/thrust/testing/binary_search.cu
@@ -0,0 +1,347 @@
+#include <unittest/unittest.h>
+#include <thrust/binary_search.h>
+#include <thrust/iterator/retag.h>
+
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+//////////////////////
+// Scalar Functions //
+//////////////////////
+
+template <class Vector>
+void TestScalarLowerBoundSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 0) - vec.begin(), 0);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 1) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 2) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 3) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 4) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 5) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 6) - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 7) - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 8) - vec.begin(), 4);
+    ASSERT_EQUAL(thrust::lower_bound(vec.begin(), vec.end(), 9) - vec.begin(), 5);
+}
+DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundSimple);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+ForwardIterator lower_bound(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestScalarLowerBoundDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::lower_bound(sys,
+                        vec.begin(),
+                        vec.end(),
+                        0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScalarLowerBoundDispatchExplicit);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+ForwardIterator lower_bound(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    *first = 13;
+    return first;
+}
+
+
+void TestScalarLowerBoundDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::lower_bound(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScalarLowerBoundDispatchImplicit);
+
+
+template <class Vector>
+void TestScalarUpperBoundSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 0) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 1) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 2) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 3) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 4) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 5) - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 6) - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 7) - vec.begin(), 4);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 8) - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::upper_bound(vec.begin(), vec.end(), 9) - vec.begin(), 5);
+}
+DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundSimple);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+ForwardIterator upper_bound(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestScalarUpperBoundDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::upper_bound(sys,
+                        vec.begin(),
+                        vec.end(),
+                        0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScalarUpperBoundDispatchExplicit);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+ForwardIterator upper_bound(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    *first = 13;
+    return first;
+}
+
+void TestScalarUpperBoundDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::upper_bound(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScalarUpperBoundDispatchImplicit);
+
+
+template <class Vector>
+void TestScalarBinarySearchSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 0),  true);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 1), false);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 2),  true);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 3), false);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 4), false);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 5),  true);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 6), false);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 7),  true);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 8),  true);
+    ASSERT_EQUAL(thrust::binary_search(vec.begin(), vec.end(), 9), false);
+}
+DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchSimple);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+bool binary_search(my_system &system, ForwardIterator /*first*/, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    system.validate_dispatch();
+    return false;
+}
+
+void TestScalarBinarySearchDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::binary_search(sys,
+                          vec.begin(),
+                          vec.end(),
+                          0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScalarBinarySearchDispatchExplicit);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+bool binary_search(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    *first = 13;
+    return false;
+}
+
+void TestScalarBinarySearchDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::binary_search(thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.end()),
+                          0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScalarBinarySearchDispatchImplicit);
+
+
+template <class Vector>
+void TestScalarEqualRangeSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 0).first - vec.begin(), 0);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 1).first - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 2).first - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 3).first - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 4).first - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 5).first - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 6).first - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 7).first - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 8).first - vec.begin(), 4);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 9).first - vec.begin(), 5);
+    
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 0).second - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 1).second - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 2).second - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 3).second - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 4).second - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 5).second - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 6).second - vec.begin(), 3);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 7).second - vec.begin(), 4);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 8).second - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::equal_range(vec.begin(), vec.end(), 9).second - vec.begin(), 5);
+}
+DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeSimple);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(first,first);
+}
+
+void TestScalarEqualRangeDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::equal_range(sys,
+                        vec.begin(),
+                        vec.end(),
+                        0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScalarEqualRangeDispatchExplicit);
+
+
+template<typename ForwardIterator, typename LessThanComparable>
+thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
+{
+    *first = 13;
+    return thrust::make_pair(first,first);
+}
+
+void TestScalarEqualRangeDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::equal_range(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScalarEqualRangeDispatchImplicit);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+void TestBoundsWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+
+    distance_low_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    distance_high_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 17);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 17);
+}
+
+void TestBoundsWithBigIndexes()
+{
+    TestBoundsWithBigIndexesHelper(30);
+    TestBoundsWithBigIndexesHelper(31);
+    TestBoundsWithBigIndexesHelper(32);
+    TestBoundsWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestBoundsWithBigIndexes);
diff --git a/thrust/testing/binary_search_descending.cu b/thrust/testing/binary_search_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5228c456739c0d0dd92fa703134e84e5cb0c0002
--- /dev/null
+++ b/thrust/testing/binary_search_descending.cu
@@ -0,0 +1,129 @@
+#include <unittest/unittest.h>
+#include <thrust/binary_search.h>
+#include <thrust/functional.h>
+
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+//////////////////////
+// Scalar Functions //
+//////////////////////
+
+template <class Vector>
+void TestScalarLowerBoundDescendingSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 0, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 1, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 2, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 3, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 4, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 5, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 6, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), 7, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 8, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundDescendingSimple);
+
+
+template <class Vector>
+void TestScalarUpperBoundDescendingSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), 0, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 1, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 2, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 3, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 4, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 5, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 6, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 7, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), 8, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundDescendingSimple);
+
+
+template <class Vector>
+void TestScalarBinarySearchDescendingSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 0, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 1, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 2, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 3, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 4, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 5, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 6, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 7, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 8, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchDescendingSimple);
+
+
+template <class Vector>
+void TestScalarEqualRangeDescendingSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<T>()).first);
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<T>()).second);
+}
+DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeDescendingSimple);
+
diff --git a/thrust/testing/binary_search_vector.cu b/thrust/testing/binary_search_vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e8f8358ea4f2091b341f8c2b03a4d34b04ae36f
--- /dev/null
+++ b/thrust/testing/binary_search_vector.cu
@@ -0,0 +1,468 @@
+#include <unittest/unittest.h>
+#include <thrust/binary_search.h>
+
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+//////////////////////
+// Vector Functions //
+//////////////////////
+
+// convert xxx_vector<T1> to xxx_vector<T2> 
+template <class ExampleVector, typename NewType> 
+struct vector_like
+{
+    typedef typename ExampleVector::allocator_type alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
+    typedef thrust::detail::vector_base<NewType, new_alloc> type;
+};
+
+template <class Vector>
+void TestVectorLowerBoundSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    Vector input(10);
+    thrust::sequence(input.begin(), input.end());
+
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
+
+    // test with integral output type
+    IntVector integral_output(10);
+    thrust::lower_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin());
+    
+    typename IntVector::iterator output_end = thrust::lower_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin());
+
+    ASSERT_EQUAL((output_end - integral_output.begin()), 10);
+
+    ASSERT_EQUAL(integral_output[0], 0);
+    ASSERT_EQUAL(integral_output[1], 1);
+    ASSERT_EQUAL(integral_output[2], 1);
+    ASSERT_EQUAL(integral_output[3], 2);
+    ASSERT_EQUAL(integral_output[4], 2);
+    ASSERT_EQUAL(integral_output[5], 2);
+    ASSERT_EQUAL(integral_output[6], 3);
+    ASSERT_EQUAL(integral_output[7], 3);
+    ASSERT_EQUAL(integral_output[8], 4);
+    ASSERT_EQUAL(integral_output[9], 5);
+
+//    // test with interator output type
+//    typedef typename vector_like<Vector, typename Vector::iterator>::type IteratorVector;
+//    IteratorVector iterator_output(10);
+//    thrust::lower_bound(vec.begin(), vec.end(), input.begin(), input.end(), iterator_output.begin());
+//
+//    ASSERT_EQUAL(iterator_output[0] - vec.begin(), 0);
+//    ASSERT_EQUAL(iterator_output[1] - vec.begin(), 1);
+//    ASSERT_EQUAL(iterator_output[2] - vec.begin(), 1);
+//    ASSERT_EQUAL(iterator_output[3] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[4] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[5] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[6] - vec.begin(), 3);
+//    ASSERT_EQUAL(iterator_output[7] - vec.begin(), 3);
+//    ASSERT_EQUAL(iterator_output[8] - vec.begin(), 4);
+//    ASSERT_EQUAL(iterator_output[9] - vec.begin(), 5);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorLowerBoundSimple);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator lower_bound(my_system &system, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    system.validate_dispatch();
+    return output;
+}
+
+void TestVectorLowerBoundDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::lower_bound(sys,
+                        vec.begin(),
+                        vec.end(),
+                        vec.begin(),
+                        vec.end(),
+                        vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestVectorLowerBoundDispatchExplicit);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator lower_bound(my_tag, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    *output = 13;
+    return output;
+}
+
+void TestVectorLowerBoundDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::lower_bound(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestVectorLowerBoundDispatchImplicit);
+
+
+template <class Vector>
+void TestVectorUpperBoundSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    Vector input(10);
+    thrust::sequence(input.begin(), input.end());
+
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
+
+    // test with integral output type
+    IntVector integral_output(10);
+    typename IntVector::iterator output_end = thrust::upper_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin());
+
+    ASSERT_EQUAL((output_end - integral_output.begin()), 10);
+
+    ASSERT_EQUAL(integral_output[0], 1);
+    ASSERT_EQUAL(integral_output[1], 1);
+    ASSERT_EQUAL(integral_output[2], 2);
+    ASSERT_EQUAL(integral_output[3], 2);
+    ASSERT_EQUAL(integral_output[4], 2);
+    ASSERT_EQUAL(integral_output[5], 3);
+    ASSERT_EQUAL(integral_output[6], 3);
+    ASSERT_EQUAL(integral_output[7], 4);
+    ASSERT_EQUAL(integral_output[8], 5);
+    ASSERT_EQUAL(integral_output[9], 5);
+
+//    // test with interator output type
+//    typedef typename vector_like<Vector, typename Vector::iterator>::type IteratorVector;
+//    IteratorVector iterator_output(10);
+//    thrust::lower_bound(vec.begin(), vec.end(), input.begin(), input.end(), iterator_output.begin());
+//
+//    ASSERT_EQUAL(iterator_output[0] - vec.begin(), 1);
+//    ASSERT_EQUAL(iterator_output[1] - vec.begin(), 1);
+//    ASSERT_EQUAL(iterator_output[2] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[3] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[4] - vec.begin(), 2);
+//    ASSERT_EQUAL(iterator_output[5] - vec.begin(), 3);
+//    ASSERT_EQUAL(iterator_output[6] - vec.begin(), 3);
+//    ASSERT_EQUAL(iterator_output[7] - vec.begin(), 4);
+//    ASSERT_EQUAL(iterator_output[8] - vec.begin(), 5);
+//    ASSERT_EQUAL(iterator_output[9] - vec.begin(), 5);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorUpperBoundSimple);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator upper_bound(my_system &system, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    system.validate_dispatch();
+    return output;
+}
+
+void TestVectorUpperBoundDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::upper_bound(sys,
+                        vec.begin(),
+                        vec.end(),
+                        vec.begin(),
+                        vec.end(),
+                        vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestVectorUpperBoundDispatchExplicit);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator upper_bound(my_tag, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    *output = 13;
+    return output;
+}
+
+void TestVectorUpperBoundDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::upper_bound(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestVectorUpperBoundDispatchImplicit);
+
+
+template <class Vector>
+void TestVectorBinarySearchSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 0;
+    vec[1] = 2;
+    vec[2] = 5;
+    vec[3] = 7;
+    vec[4] = 8;
+
+    Vector input(10);
+    thrust::sequence(input.begin(), input.end());
+
+    typedef typename vector_like<Vector, bool>::type BoolVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector,  int_type>::type IntVector;
+
+    // test with boolean output type
+    BoolVector bool_output(10);
+    typename BoolVector::iterator bool_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), bool_output.begin());
+
+    ASSERT_EQUAL((bool_output_end - bool_output.begin()), 10);
+
+    ASSERT_EQUAL(bool_output[0],  true);
+    ASSERT_EQUAL(bool_output[1], false);
+    ASSERT_EQUAL(bool_output[2],  true);
+    ASSERT_EQUAL(bool_output[3], false);
+    ASSERT_EQUAL(bool_output[4], false);
+    ASSERT_EQUAL(bool_output[5],  true);
+    ASSERT_EQUAL(bool_output[6], false);
+    ASSERT_EQUAL(bool_output[7],  true);
+    ASSERT_EQUAL(bool_output[8],  true);
+    ASSERT_EQUAL(bool_output[9], false);
+    
+    // test with integral output type
+    IntVector integral_output(10, 2);
+    typename IntVector::iterator int_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin());
+
+    ASSERT_EQUAL((int_output_end - integral_output.begin()), 10);
+    
+    ASSERT_EQUAL(integral_output[0], 1);
+    ASSERT_EQUAL(integral_output[1], 0);
+    ASSERT_EQUAL(integral_output[2], 1);
+    ASSERT_EQUAL(integral_output[3], 0);
+    ASSERT_EQUAL(integral_output[4], 0);
+    ASSERT_EQUAL(integral_output[5], 1);
+    ASSERT_EQUAL(integral_output[6], 0);
+    ASSERT_EQUAL(integral_output[7], 1);
+    ASSERT_EQUAL(integral_output[8], 1);
+    ASSERT_EQUAL(integral_output[9], 0);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorBinarySearchSimple);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator binary_search(my_system &system, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    system.validate_dispatch();
+    return output;
+}
+
+void TestVectorBinarySearchDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::binary_search(sys,
+                          vec.begin(),
+                          vec.end(),
+                          vec.begin(),
+                          vec.end(),
+                          vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestVectorBinarySearchDispatchExplicit);
+
+
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator binary_search(my_tag, ForwardIterator,ForwardIterator,InputIterator,InputIterator,OutputIterator output)
+{
+    *output = 13;
+    return output;
+}
+
+void TestVectorBinarySearchDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::binary_search(thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.end()),
+                          thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.end()),
+                          thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestVectorBinarySearchDispatchImplicit);
+
+
+template <typename T>
+struct TestVectorLowerBound
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
+    thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorLowerBound, SignedIntegralTypes> TestVectorLowerBoundInstance;
+
+
+template <typename T>
+struct TestVectorUpperBound
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
+    thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorUpperBound, SignedIntegralTypes> TestVectorUpperBoundInstance;
+
+template <typename T>
+struct TestVectorBinarySearch
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
+    thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorBinarySearch, SignedIntegralTypes> TestVectorBinarySearchInstance;
+
+template <typename T>
+struct TestVectorLowerBoundDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    thrust::discard_iterator<> h_result =
+      thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+    thrust::discard_iterator<> d_result =
+      thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(2*n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestVectorLowerBoundDiscardIterator, SignedIntegralTypes> TestVectorLowerBoundDiscardIteratorInstance;
+
+
+template <typename T>
+struct TestVectorUpperBoundDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> h_result =
+      thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+    thrust::discard_iterator<> d_result =
+      thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(2*n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestVectorUpperBoundDiscardIterator, SignedIntegralTypes> TestVectorUpperBoundDiscardIteratorInstance;
+
+template <typename T>
+struct TestVectorBinarySearchDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> h_result =
+      thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+    thrust::discard_iterator<> d_result =
+      thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(2*n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestVectorBinarySearchDiscardIterator, SignedIntegralTypes> TestVectorBinarySearchDiscardIteratorInstance;
+
diff --git a/thrust/testing/binary_search_vector_descending.cu b/thrust/testing/binary_search_vector_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..edc70663a0602d810d5a2eb2f9d9acced6d27e64
--- /dev/null
+++ b/thrust/testing/binary_search_vector_descending.cu
@@ -0,0 +1,225 @@
+#include <unittest/unittest.h>
+#include <thrust/binary_search.h>
+#include <thrust/functional.h>
+
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+//////////////////////
+// Vector Functions //
+//////////////////////
+
+// convert xxx_vector<T1> to xxx_vector<T2> 
+template <class ExampleVector, typename NewType> 
+struct vector_like
+{
+    typedef typename ExampleVector::allocator_type alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
+    typedef thrust::detail::vector_base<NewType, new_alloc> type;
+};
+
+template <class Vector>
+void TestVectorLowerBoundDescendingSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    Vector input(10);
+    thrust::sequence(input.begin(), input.end());
+
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
+
+    // test with integral output type
+    IntVector integral_output(10);
+    typename IntVector::iterator output_end = thrust::lower_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<T>());
+
+    ASSERT_EQUAL_QUIET(integral_output.end(), output_end);
+
+    ASSERT_EQUAL(4, integral_output[0]);
+    ASSERT_EQUAL(4, integral_output[1]);
+    ASSERT_EQUAL(3, integral_output[2]);
+    ASSERT_EQUAL(3, integral_output[3]);
+    ASSERT_EQUAL(3, integral_output[4]);
+    ASSERT_EQUAL(2, integral_output[5]);
+    ASSERT_EQUAL(2, integral_output[6]);
+    ASSERT_EQUAL(1, integral_output[7]);
+    ASSERT_EQUAL(0, integral_output[8]);
+    ASSERT_EQUAL(0, integral_output[9]);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorLowerBoundDescendingSimple);
+
+
+template <class Vector>
+void TestVectorUpperBoundDescendingSimple(void)
+{
+    Vector vec(5);
+
+    vec[0] = 8;
+    vec[1] = 7;
+    vec[2] = 5;
+    vec[3] = 2;
+    vec[4] = 0;
+
+    Vector input(10);
+    thrust::sequence(input.begin(), input.end());
+
+    typedef typename Vector::difference_type int_type;
+    typedef typename Vector::value_type T;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
+
+    // test with integral output type
+    IntVector integral_output(10);
+    typename IntVector::iterator output_end = thrust::upper_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<T>());
+
+    ASSERT_EQUAL_QUIET(output_end, integral_output.end());
+
+    ASSERT_EQUAL(5, integral_output[0]);
+    ASSERT_EQUAL(4, integral_output[1]);
+    ASSERT_EQUAL(4, integral_output[2]);
+    ASSERT_EQUAL(3, integral_output[3]);
+    ASSERT_EQUAL(3, integral_output[4]);
+    ASSERT_EQUAL(3, integral_output[5]);
+    ASSERT_EQUAL(2, integral_output[6]);
+    ASSERT_EQUAL(2, integral_output[7]);
+    ASSERT_EQUAL(1, integral_output[8]);
+    ASSERT_EQUAL(0, integral_output[9]);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorUpperBoundDescendingSimple);
+
+
+template <class Vector>
+void TestVectorBinarySearchDescendingSimple(void)
+{
+  Vector vec(5);
+
+  vec[0] = 8;
+  vec[1] = 7;
+  vec[2] = 5;
+  vec[3] = 2;
+  vec[4] = 0;
+
+  Vector input(10);
+  thrust::sequence(input.begin(), input.end());
+
+  typedef typename vector_like<Vector, bool>::type BoolVector;
+  typedef typename Vector::difference_type int_type;
+  typedef typename Vector::value_type T;
+  typedef typename vector_like<Vector,  int_type>::type IntVector;
+
+  // test with boolean output type
+  BoolVector bool_output(10);
+  typename BoolVector::iterator bool_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), bool_output.begin(), thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(bool_output_end, bool_output.end());
+
+  ASSERT_EQUAL(true,  bool_output[0]);
+  ASSERT_EQUAL(false, bool_output[1]);
+  ASSERT_EQUAL(true,  bool_output[2]);
+  ASSERT_EQUAL(false, bool_output[3]);
+  ASSERT_EQUAL(false, bool_output[4]);
+  ASSERT_EQUAL(true,  bool_output[5]);
+  ASSERT_EQUAL(false, bool_output[6]);
+  ASSERT_EQUAL(true,  bool_output[7]);
+  ASSERT_EQUAL(true,  bool_output[8]);
+  ASSERT_EQUAL(false, bool_output[9]);
+  
+  // test with integral output type
+  IntVector integral_output(10, 2);
+  typename IntVector::iterator int_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(int_output_end, integral_output.end());
+  
+  ASSERT_EQUAL(1, integral_output[0]);
+  ASSERT_EQUAL(0, integral_output[1]);
+  ASSERT_EQUAL(1, integral_output[2]);
+  ASSERT_EQUAL(0, integral_output[3]);
+  ASSERT_EQUAL(0, integral_output[4]);
+  ASSERT_EQUAL(1, integral_output[5]);
+  ASSERT_EQUAL(0, integral_output[6]);
+  ASSERT_EQUAL(1, integral_output[7]);
+  ASSERT_EQUAL(1, integral_output[8]);
+  ASSERT_EQUAL(0, integral_output[9]);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorBinarySearchDescendingSimple);
+
+
+template <typename T>
+struct TestVectorLowerBoundDescending
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<T>());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorLowerBoundDescending, SignedIntegralTypes> TestVectorLowerBoundDescendingInstance;
+
+
+template <typename T>
+struct TestVectorUpperBoundDescending
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<T>());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorUpperBoundDescending, SignedIntegralTypes> TestVectorUpperBoundDescendingInstance;
+
+template <typename T>
+struct TestVectorBinarySearchDescending
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<T>());
+    thrust::device_vector<T> d_vec = h_vec;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
+
+    thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestVectorBinarySearchDescending, SignedIntegralTypes> TestVectorBinarySearchDescendingInstance;
+
diff --git a/thrust/testing/caching_allocator.cu b/thrust/testing/caching_allocator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f98ea336b64bc3354d9ccc4e40c46a25af71ead5
--- /dev/null
+++ b/thrust/testing/caching_allocator.cu
@@ -0,0 +1,23 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/caching_allocator.h>
+
+template<typename Allocator>
+void test_implementation(Allocator alloc)
+{
+    typedef typename thrust::detail::allocator_traits<Allocator> Traits;
+    typedef typename Allocator::pointer Ptr;
+
+    Ptr p = Traits::allocate(alloc, 123);
+    Traits::deallocate(alloc, p, 123);
+
+    Ptr p2 = Traits::allocate(alloc, 123);
+    ASSERT_EQUAL(p, p2);
+}
+
+void TestSingleDeviceTLSCachingAllocator()
+{
+    test_implementation(thrust::detail::single_device_tls_caching_allocator());
+};
+DECLARE_UNITTEST(TestSingleDeviceTLSCachingAllocator);
diff --git a/thrust/testing/complex.cu b/thrust/testing/complex.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cf980962aec88a738dee8f287e326f2ae0d82de5
--- /dev/null
+++ b/thrust/testing/complex.cu
@@ -0,0 +1,334 @@
+#include <unittest/unittest.h>
+
+#include <thrust/complex.h>
+#include <thrust/detail/config.h>
+
+#include <complex>
+#include <iostream>
+#include <sstream>
+
+/* 
+   The following tests do not check for the numerical accuracy of the operations.
+   That is tested in a separate program (complex_accuracy.cpp) which requires mpfr, 
+   and takes a lot of time to run.   
+ */
+
+template<typename T>
+struct TestComplexSizeAndAlignment
+{
+  void operator()()
+  {
+    THRUST_STATIC_ASSERT(
+      sizeof(thrust::complex<T>) == sizeof(T) * 2
+    );
+    THRUST_STATIC_ASSERT(
+      THRUST_ALIGNOF(thrust::complex<T>) == THRUST_ALIGNOF(T) * 2
+    );
+
+    THRUST_STATIC_ASSERT(
+      sizeof(thrust::complex<T const>) == sizeof(T) * 2
+    );
+    THRUST_STATIC_ASSERT(
+      THRUST_ALIGNOF(thrust::complex<T const>) == THRUST_ALIGNOF(T) * 2
+    );
+  }
+};
+SimpleUnitTest<TestComplexSizeAndAlignment, FloatingPointTypes> TestComplexSizeAndAlignmentInstance;
+
+template<typename T>
+struct TestComplexConstructors
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data = unittest::random_samples<T>(2);
+    
+    thrust::complex<T> a(data[0],data[1]);
+    thrust::complex<T> b(a);
+    a = thrust::complex<T>(data[0],data[1]);
+    ASSERT_ALMOST_EQUAL(a,b);
+    
+    a = thrust::complex<T>(data[0]);
+    ASSERT_EQUAL(data[0], a.real());
+    ASSERT_EQUAL(T(0), a.imag());
+    
+    a = thrust::complex<T>();
+    ASSERT_ALMOST_EQUAL(a,std::complex<T>(0));
+    
+    a = thrust::complex<T>(thrust::complex<float>(static_cast<float>(data[0]),static_cast<float>(data[1])));
+    ASSERT_ALMOST_EQUAL(a,b);
+    
+    a = thrust::complex<T>(thrust::complex<double>(static_cast<double>(data[0]),static_cast<double>(data[1])));
+    ASSERT_ALMOST_EQUAL(a,b);
+    
+    a = thrust::complex<T>(std::complex<float>(static_cast<float>(data[0]),static_cast<float>(data[1])));
+    ASSERT_ALMOST_EQUAL(a,b);
+    
+    a = thrust::complex<T>(std::complex<double>(static_cast<double>(data[0]),static_cast<double>(data[1])));
+    ASSERT_ALMOST_EQUAL(a,b);
+  }
+};
+SimpleUnitTest<TestComplexConstructors, FloatingPointTypes> TestComplexConstructorsInstance;
+
+
+template<typename T>
+struct TestComplexGetters
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data = unittest::random_samples<T>(2);
+
+    thrust::complex<T> z(data[0], data[1]);
+
+    ASSERT_EQUAL(data[0], z.real());
+    ASSERT_EQUAL(data[1], z.imag());
+
+    z.real(data[1]);
+    z.imag(data[0]);
+    ASSERT_EQUAL(data[1], z.real());
+    ASSERT_EQUAL(data[0], z.imag());
+
+    volatile thrust::complex<T> v(data[0], data[1]);
+
+    ASSERT_EQUAL(data[0], v.real());
+    ASSERT_EQUAL(data[1], v.imag());
+
+    v.real(data[1]);
+    v.imag(data[0]);
+    ASSERT_EQUAL(data[1], v.real());
+    ASSERT_EQUAL(data[0], v.imag());
+  }
+};
+SimpleUnitTest<TestComplexGetters, FloatingPointTypes> TestComplexGettersInstance;
+
+template<typename T>
+struct TestComplexMemberOperators
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+    thrust::host_vector<T> data_b = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    thrust::complex<T> b(data_b[0], data_b[1]);
+
+    std::complex<T> c(a);
+    std::complex<T> d(b);
+
+    a += b;
+    c += d;
+    ASSERT_ALMOST_EQUAL(a,c);
+
+    a -= b;
+    c -= d;
+    ASSERT_ALMOST_EQUAL(a,c);
+
+    a *= b;
+    c *= d;
+    ASSERT_ALMOST_EQUAL(a,c);
+
+    a /= b;
+    c /= d;
+    ASSERT_ALMOST_EQUAL(a,c);
+
+    // casting operator
+    c = (std::complex<T>)a;
+  }
+};
+SimpleUnitTest<TestComplexMemberOperators, FloatingPointTypes> TestComplexMemberOperatorsInstance;
+
+
+template<typename T>
+struct TestComplexBasicArithmetic
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data[0], data[1]);
+    std::complex<T> b(a);
+
+    // Test the basic arithmetic functions against std
+    
+    ASSERT_ALMOST_EQUAL(abs(a),abs(b));
+
+    ASSERT_ALMOST_EQUAL(arg(a),arg(b));
+
+    ASSERT_ALMOST_EQUAL(norm(a),norm(b));
+
+    ASSERT_EQUAL(conj(a),conj(b));
+
+    ASSERT_ALMOST_EQUAL(thrust::polar(data[0],data[1]),std::polar(data[0],data[1]));
+
+    // random_samples does not seem to produce infinities so proj(z) == z
+    ASSERT_EQUAL(proj(a),a);
+    
+  }
+};
+SimpleUnitTest<TestComplexBasicArithmetic, FloatingPointTypes> TestComplexBasicArithmeticInstance;
+
+
+template<typename T>
+struct TestComplexBinaryArithmetic
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+    thrust::host_vector<T> data_b = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    thrust::complex<T> b(data_b[0], data_b[1]);
+
+    ASSERT_ALMOST_EQUAL(a*b,std::complex<T>(a) * std::complex<T>(b));
+    ASSERT_ALMOST_EQUAL(a*data_b[0],std::complex<T>(a) * data_b[0]);
+    ASSERT_ALMOST_EQUAL(data_a[0]*b,data_b[0] * std::complex<T>(b));
+
+    ASSERT_ALMOST_EQUAL(a / b, std::complex<T>(a) / std::complex<T>(b));
+    ASSERT_ALMOST_EQUAL(a / data_b[0], std::complex<T>(a) / data_b[0]);
+    ASSERT_ALMOST_EQUAL(data_a[0] / b, data_b[0] / std::complex<T>(b));
+
+    ASSERT_EQUAL(a + b, std::complex<T>(a) + std::complex<T>(b));
+    ASSERT_EQUAL(a + data_b[0], std::complex<T>(a) + data_b[0]);
+    ASSERT_EQUAL(data_a[0] + b, data_b[0] + std::complex<T>(b));
+
+    ASSERT_EQUAL(a - b, std::complex<T>(a) - std::complex<T>(b));
+    ASSERT_EQUAL(a - data_b[0], std::complex<T>(a) - data_b[0]);
+    ASSERT_EQUAL(data_a[0] - b, data_b[0] - std::complex<T>(b));
+    
+  }
+};
+SimpleUnitTest<TestComplexBinaryArithmetic, FloatingPointTypes> TestComplexBinaryArithmeticInstance;
+
+template<typename T>
+struct TestComplexUnaryArithmetic
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+
+    ASSERT_EQUAL(+a,+std::complex<T>(a));
+    ASSERT_EQUAL(-a,-std::complex<T>(a));
+    
+  }
+};
+SimpleUnitTest<TestComplexUnaryArithmetic, FloatingPointTypes> TestComplexUnaryArithmeticInstance;
+
+
+template<typename T>
+struct TestComplexExponentialFunctions
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    std::complex<T> b(a);
+
+    ASSERT_ALMOST_EQUAL(exp(a),exp(b));
+    ASSERT_ALMOST_EQUAL(log(a),log(b));
+    ASSERT_ALMOST_EQUAL(log10(a),log10(b));
+    
+  }
+};
+SimpleUnitTest<TestComplexExponentialFunctions, FloatingPointTypes> TestComplexExponentialFunctionsInstance;
+
+
+template<typename T>
+struct TestComplexPowerFunctions
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+    thrust::host_vector<T> data_b = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    thrust::complex<T> b(data_b[0], data_b[1]);
+    std::complex<T> c(a);
+    std::complex<T> d(b);
+
+    ASSERT_ALMOST_EQUAL(pow(a,b),pow(c,d));
+    ASSERT_ALMOST_EQUAL(pow(a,b.real()),pow(c,d.real()));
+    ASSERT_ALMOST_EQUAL(pow(a.real(),b),pow(c.real(),d));
+
+    ASSERT_ALMOST_EQUAL(sqrt(a),sqrt(c));
+
+  }
+};
+SimpleUnitTest<TestComplexPowerFunctions, FloatingPointTypes> TestComplexPowerFunctionsInstance;
+
+template<typename T>
+struct TestComplexTrigonometricFunctions
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    std::complex<T> c(a);
+
+    ASSERT_ALMOST_EQUAL(cos(a),cos(c));
+    ASSERT_ALMOST_EQUAL(sin(a),sin(c));
+    ASSERT_ALMOST_EQUAL(tan(a),tan(c));
+
+    ASSERT_ALMOST_EQUAL(cosh(a),cosh(c));
+    ASSERT_ALMOST_EQUAL(sinh(a),sinh(c));
+    ASSERT_ALMOST_EQUAL(tanh(a),tanh(c));
+
+#if THRUST_CPP_DIALECT >= 2011
+
+    ASSERT_ALMOST_EQUAL(acos(a),acos(c));
+    ASSERT_ALMOST_EQUAL(asin(a),asin(c));
+    ASSERT_ALMOST_EQUAL(atan(a),atan(c));
+
+    ASSERT_ALMOST_EQUAL(acosh(a),acosh(c));
+    ASSERT_ALMOST_EQUAL(asinh(a),asinh(c));
+    ASSERT_ALMOST_EQUAL(atanh(a),atanh(c));
+
+#endif
+
+
+  }
+};
+SimpleUnitTest<TestComplexTrigonometricFunctions, FloatingPointTypes> TestComplexTrigonometricFunctionsInstance;
+
+template<typename T>
+struct TestComplexStreamOperators
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data_a = unittest::random_samples<T>(2);
+    thrust::complex<T> a(data_a[0], data_a[1]);
+    std::stringstream out;
+    out << a;
+    thrust::complex<T> b;
+    out >> b;
+    ASSERT_ALMOST_EQUAL(a,b);
+  }
+};
+SimpleUnitTest<TestComplexStreamOperators, FloatingPointTypes> TestComplexStreamOperatorsInstance;
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T>
+struct TestComplexStdComplexDeviceInterop
+{
+  void operator()()
+  {
+    thrust::host_vector<T> data = unittest::random_samples<T>(6);
+    std::vector<std::complex<T> > vec(10);
+    vec[0] = std::complex<T>(data[0], data[1]);
+    vec[1] = std::complex<T>(data[2], data[3]);
+    vec[2] = std::complex<T>(data[4], data[5]);
+
+    thrust::device_vector<thrust::complex<T> > device_vec = vec;
+    ASSERT_ALMOST_EQUAL(vec[0].real(), thrust::complex<T>(device_vec[0]).real());
+    ASSERT_ALMOST_EQUAL(vec[0].imag(), thrust::complex<T>(device_vec[0]).imag());
+    ASSERT_ALMOST_EQUAL(vec[1].real(), thrust::complex<T>(device_vec[1]).real());
+    ASSERT_ALMOST_EQUAL(vec[1].imag(), thrust::complex<T>(device_vec[1]).imag());
+    ASSERT_ALMOST_EQUAL(vec[2].real(), thrust::complex<T>(device_vec[2]).real());
+    ASSERT_ALMOST_EQUAL(vec[2].imag(), thrust::complex<T>(device_vec[2]).imag());
+  }
+};
+SimpleUnitTest<TestComplexStdComplexDeviceInterop, FloatingPointTypes> TestComplexStdComplexDeviceInteropInstance;
+#endif
+
diff --git a/thrust/testing/complex_transform.cu b/thrust/testing/complex_transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..439597a0db55d3a94dcd57aa533a4264f4da6c75
--- /dev/null
+++ b/thrust/testing/complex_transform.cu
@@ -0,0 +1,387 @@
+#include <unittest/unittest.h>
+#include <thrust/host_vector.h>
+#include <thrust/complex.h>
+#include <thrust/transform.h>
+#include <iostream>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+struct basic_arithmetic_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x,
+				const thrust::complex<T> &y)
+  {
+    // exercise unary and binary arithmetic operators
+    // Should return approximately 1
+    return (+x + +y) + (x * y) / (y * x) + (-y + -x);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct complex_plane_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    // Should return a proximately 1
+    return thrust::proj( (thrust::polar(abs(x),arg(x)) * conj(x))/norm(x));
+  } // end operator()()
+}; // end make_pair_functor
+
+struct pow_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x,
+				const thrust::complex<T> &y)
+  {
+    // exercise power functions
+    return pow(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct sqrt_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    // exercise power functions
+    return sqrt(x);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct log_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return log(x);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct exp_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return exp(x);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct log10_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return log10(x);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+struct cos_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return cos(x);
+  } 
+}; 
+
+struct sin_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return sin(x);
+  } 
+}; 
+
+struct tan_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return tan(x);
+  } 
+}; 
+
+
+
+struct cosh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return cosh(x);
+  } 
+}; 
+
+struct sinh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return sinh(x);
+  } 
+}; 
+
+struct tanh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return tanh(x);
+  } 
+}; 
+
+
+struct acos_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return acos(x);
+  } 
+}; 
+
+struct asin_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return asin(x);
+  } 
+}; 
+
+struct atan_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return atan(x);
+  } 
+}; 
+
+
+struct acosh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return acosh(x);
+  } 
+}; 
+
+struct asinh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return asinh(x);
+  } 
+}; 
+
+struct atanh_functor
+{
+  template<typename T>
+  __host__ __device__
+  thrust::complex<T> operator()(const thrust::complex<T> &x)
+  {
+    return atanh(x);
+  } 
+}; 
+
+
+template <typename T>
+thrust::host_vector<thrust::complex<T> > random_complex_samples(size_t n){
+  thrust::host_vector<T> real = unittest::random_samples<T>(2*n);
+  thrust::host_vector<thrust::complex<T> > h_p1(n);
+  for(size_t i = 0; i<n; i++){
+    h_p1[i].real(real[i]);
+    h_p1[i].imag(real[2*i]);
+  }
+  return h_p1;
+}
+
+template <typename T>
+struct TestComplexArithmeticTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::complex<T> type;
+    thrust::host_vector<type> h_p1 = random_complex_samples<T>(n);
+    thrust::host_vector<type> h_p2 = random_complex_samples<T>(n);
+    thrust::host_vector<type>   h_result(n);
+
+    thrust::device_vector<type> d_p1 = h_p1;
+    thrust::device_vector<type> d_p2 = h_p2;
+    thrust::device_vector<type> d_result(n);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), basic_arithmetic_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), basic_arithmetic_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestComplexArithmeticTransform, FloatingPointTypes> TestComplexArithmeticTransformInstance;
+
+template <typename T>
+struct TestComplexPlaneTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::complex<T> type;
+    thrust::host_vector<type> h_p1 = random_complex_samples<T>(n);
+    thrust::host_vector<type>   h_result(n);
+
+    thrust::device_vector<type> d_p1 = h_p1;
+    thrust::device_vector<type> d_result(n);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), complex_plane_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), complex_plane_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestComplexPlaneTransform, FloatingPointTypes> TestComplexPlaneTransformInstance;
+
+
+template <typename T>
+struct TestComplexPowerTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::complex<T> type;
+    thrust::host_vector<type> h_p1 = random_complex_samples<T>(n);
+    thrust::host_vector<type> h_p2 = random_complex_samples<T>(n);
+    thrust::host_vector<type>   h_result(n);
+
+    thrust::device_vector<type> d_p1 = h_p1;
+    thrust::device_vector<type> d_p2 = h_p2;
+    thrust::device_vector<type> d_result(n);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), pow_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), pow_functor());    
+    // pow can be very innacurate there's no point trying to check for equality
+    // Currently just checking for compilation
+    //    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sqrt_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sqrt_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestComplexPowerTransform, FloatingPointTypes> TestComplexPowerTransformInstance;
+
+template <typename T>
+struct TestComplexExponentialTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::complex<T> type;
+    thrust::host_vector<type> h_p1 = random_complex_samples<T>(n);
+    thrust::host_vector<type>   h_result(n);
+
+    thrust::device_vector<type> d_p1 = h_p1;
+    thrust::device_vector<type> d_result(n);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), exp_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), exp_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), log_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), log_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), log10_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), log10_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestComplexExponentialTransform, FloatingPointTypes> TestComplexExponentialTransformInstance;
+
+template <typename T>
+struct TestComplexTrigonometricTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::complex<T> type;
+    thrust::host_vector<type> h_p1 = random_complex_samples<T>(n);
+    thrust::host_vector<type>   h_result(n);
+
+    thrust::device_vector<type> d_p1 = h_p1;
+    thrust::device_vector<type> d_result(n);
+
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sin_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sin_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), cos_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), cos_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), tan_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), tan_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sinh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sinh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), cosh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), cosh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), tanh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), tanh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), asin_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), asin_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), acos_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), acos_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), atan_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), atan_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), asinh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), asinh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), acosh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), acosh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+    thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), atanh_functor());
+    thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), atanh_functor());    
+    ASSERT_ALMOST_EQUAL(h_result, d_result);
+
+  }
+};
+VariableUnitTest<TestComplexTrigonometricTransform, FloatingPointTypes> TestComplexTrigonometricTransformInstance;
+
diff --git a/thrust/testing/constant_iterator.cu b/thrust/testing/constant_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cbf771c9ab703882d4d287c7877ef0bbde465a43
--- /dev/null
+++ b/thrust/testing/constant_iterator.cu
@@ -0,0 +1,174 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/copy.h>
+#include <thrust/transform.h>
+#include <thrust/reduce.h>
+
+void TestConstantIteratorConstructFromConvertibleSystem(void)
+{
+  using namespace thrust;
+
+  constant_iterator<int> default_system(13);
+
+  constant_iterator<int, use_default, host_system_tag> host_system = default_system;
+  ASSERT_EQUAL(*default_system, *host_system);
+
+  constant_iterator<int, use_default, device_system_tag> device_system = default_system;
+  ASSERT_EQUAL(*default_system, *device_system);
+}
+DECLARE_UNITTEST(TestConstantIteratorConstructFromConvertibleSystem);
+
+void TestConstantIteratorIncrement(void)
+{
+    using namespace thrust;
+
+    constant_iterator<int> lhs(0,0);
+    constant_iterator<int> rhs(0,0);
+
+    ASSERT_EQUAL(0, lhs - rhs);
+
+    lhs++;
+
+    ASSERT_EQUAL(1, lhs - rhs);
+    
+    lhs++;
+    lhs++;
+    
+    ASSERT_EQUAL(3, lhs - rhs);
+
+    lhs += 5;
+    
+    ASSERT_EQUAL(8, lhs - rhs);
+
+    lhs -= 10;
+    
+    ASSERT_EQUAL(-2, lhs - rhs);
+}
+DECLARE_UNITTEST(TestConstantIteratorIncrement);
+
+void TestConstantIteratorIncrementBig(void)
+{
+    long long int n = 10000000000ULL;
+
+    thrust::constant_iterator<long long int> begin(1);
+    thrust::constant_iterator<long long int> end = begin + n;
+
+    ASSERT_EQUAL(thrust::distance(begin, end), n);
+}
+DECLARE_UNITTEST(TestConstantIteratorIncrementBig);
+
+void TestConstantIteratorComparison(void)
+{
+    using namespace thrust;
+
+    constant_iterator<int> iter1(0);
+    constant_iterator<int> iter2(0);
+
+    ASSERT_EQUAL(0, iter1 - iter2);
+    ASSERT_EQUAL(true, iter1 == iter2);
+
+    iter1++;
+    
+    ASSERT_EQUAL(1, iter1 - iter2);
+    ASSERT_EQUAL(false, iter1 == iter2);
+   
+    iter2++;
+
+    ASSERT_EQUAL(0, iter1 - iter2);
+    ASSERT_EQUAL(true, iter1 == iter2);
+  
+    iter1 += 100;
+    iter2 += 100;
+
+    ASSERT_EQUAL(0, iter1 - iter2);
+    ASSERT_EQUAL(true, iter1 == iter2);
+}
+DECLARE_UNITTEST(TestConstantIteratorComparison);
+
+
+void TestMakeConstantIterator(void)
+{
+    using namespace thrust;
+
+    // test one argument version
+    constant_iterator<int> iter0 = make_constant_iterator<int>(13);
+
+    ASSERT_EQUAL(13, *iter0);
+
+    // test two argument version
+    constant_iterator<int,thrust::detail::intmax_t> iter1 = make_constant_iterator<int,thrust::detail::intmax_t>(13, 7);
+
+    ASSERT_EQUAL(13, *iter1);
+    ASSERT_EQUAL(7, iter1 - iter0);
+}
+DECLARE_UNITTEST(TestMakeConstantIterator);
+
+
+template<typename Vector>
+void TestConstantIteratorCopy(void)
+{
+  using namespace thrust;
+
+  typedef constant_iterator<int> ConstIter;
+
+  Vector result(4);
+
+  ConstIter first = make_constant_iterator<int>(7);
+  ConstIter last  = first + result.size();
+  thrust::copy(first, last, result.begin());
+
+  ASSERT_EQUAL(7, result[0]);
+  ASSERT_EQUAL(7, result[1]);
+  ASSERT_EQUAL(7, result[2]);
+  ASSERT_EQUAL(7, result[3]);
+};
+DECLARE_VECTOR_UNITTEST(TestConstantIteratorCopy);
+
+
+template<typename Vector>
+void TestConstantIteratorTransform(void)
+{
+  using namespace thrust;
+
+  typedef typename Vector::value_type T;
+  typedef constant_iterator<T> ConstIter;
+
+  Vector result(4);
+
+  ConstIter first1 = make_constant_iterator<T>(7);
+  ConstIter last1  = first1 + result.size();
+  ConstIter first2 = make_constant_iterator<T>(3);
+
+  thrust::transform(first1, last1, result.begin(), thrust::negate<T>());
+
+  ASSERT_EQUAL(-7, result[0]);
+  ASSERT_EQUAL(-7, result[1]);
+  ASSERT_EQUAL(-7, result[2]);
+  ASSERT_EQUAL(-7, result[3]);
+  
+  thrust::transform(first1, last1, first2, result.begin(), thrust::plus<T>());
+
+  ASSERT_EQUAL(10, result[0]);
+  ASSERT_EQUAL(10, result[1]);
+  ASSERT_EQUAL(10, result[2]);
+  ASSERT_EQUAL(10, result[3]);
+};
+DECLARE_VECTOR_UNITTEST(TestConstantIteratorTransform);
+
+
+void TestConstantIteratorReduce(void)
+{
+  using namespace thrust;
+
+  typedef int T;
+  typedef constant_iterator<T> ConstIter;
+
+  ConstIter first = make_constant_iterator<T>(7);
+  ConstIter last  = first + 4;
+
+  T sum = thrust::reduce(first, last);
+
+  ASSERT_EQUAL(sum, 4 * 7);
+};
+DECLARE_UNITTEST(TestConstantIteratorReduce);
+
diff --git a/thrust/testing/copy.cu b/thrust/testing/copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..64165c8e7add058dacd465efa6d2396d1646cb3c
--- /dev/null
+++ b/thrust/testing/copy.cu
@@ -0,0 +1,787 @@
+#include <unittest/unittest.h>
+#include <thrust/copy.h>
+
+#include <array>
+#include <algorithm>
+#include <list>
+#include <iterator>
+#include <thrust/sequence.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+void TestCopyFromConstIterator(void)
+{
+    typedef int T;
+
+    std::vector<T> v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    std::vector<int>::const_iterator begin = v.begin();
+    std::vector<int>::const_iterator end = v.end();
+
+    // copy to host_vector
+    thrust::host_vector<T> h(5, (T) 10);
+    thrust::host_vector<T>::iterator h_result = thrust::copy(begin, end, h.begin());
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector
+    thrust::device_vector<T> d(5, (T) 10);
+    thrust::device_vector<T>::iterator d_result = thrust::copy(begin, end, d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_UNITTEST(TestCopyFromConstIterator);
+
+void TestCopyToDiscardIterator(void)
+{
+    typedef int T;
+
+    thrust::host_vector<T> h_input(5,1);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> reference(5);
+
+    // copy from host_vector
+    thrust::discard_iterator<> h_result =
+      thrust::copy(h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+
+    // copy from device_vector
+    thrust::discard_iterator<> d_result =
+      thrust::copy(d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_UNITTEST(TestCopyToDiscardIterator);
+
+void TestCopyToDiscardIteratorZipped(void)
+{
+    typedef int T;
+
+    thrust::host_vector<T> h_input(5,1);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>     h_output(5);
+    thrust::device_vector<T>   d_output(5);
+    thrust::discard_iterator<> reference(5);
+
+    typedef thrust::tuple<thrust::discard_iterator<>,thrust::host_vector<T>::iterator>   Tuple1;
+    typedef thrust::tuple<thrust::discard_iterator<>,thrust::device_vector<T>::iterator> Tuple2;
+
+    typedef thrust::zip_iterator<Tuple1> ZipIterator1;
+    typedef thrust::zip_iterator<Tuple2> ZipIterator2;
+
+    // copy from host_vector
+    ZipIterator1 h_result =
+      thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(h_input.begin(),                 h_input.begin())),
+                   thrust::make_zip_iterator(thrust::make_tuple(h_input.end(),                   h_input.end())),
+                   thrust::make_zip_iterator(thrust::make_tuple(thrust::make_discard_iterator(), h_output.begin())));
+
+    // copy from device_vector
+    ZipIterator2 d_result =
+      thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(d_input.begin(),                 d_input.begin())),
+                   thrust::make_zip_iterator(thrust::make_tuple(d_input.end(),                   d_input.end())),
+                   thrust::make_zip_iterator(thrust::make_tuple(thrust::make_discard_iterator(), d_output.begin())));
+
+    ASSERT_EQUAL(h_output, h_input);
+    ASSERT_EQUAL(d_output, d_input);
+    ASSERT_EQUAL_QUIET(reference, thrust::get<0>(h_result.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(reference, thrust::get<0>(d_result.get_iterator_tuple()));
+}
+DECLARE_UNITTEST(TestCopyToDiscardIteratorZipped);
+
+template <class Vector>
+void TestCopyMatchingTypes(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    // copy to host_vector
+    thrust::host_vector<T> h(5, (T) 10);
+    typename thrust::host_vector<T>::iterator h_result = thrust::copy(v.begin(), v.end(), h.begin());
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector
+    thrust::device_vector<T> d(5, (T) 10);
+    typename thrust::device_vector<T>::iterator d_result = thrust::copy(v.begin(), v.end(), d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_VECTOR_UNITTEST(TestCopyMatchingTypes);
+
+template <class Vector>
+void TestCopyMixedTypes(void)
+{
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    // copy to host_vector with different type
+    thrust::host_vector<float> h(5, (float) 10);
+    typename thrust::host_vector<float>::iterator h_result = thrust::copy(v.begin(), v.end(), h.begin());
+
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector with different type
+    thrust::device_vector<float> d(5, (float) 10);
+    typename thrust::device_vector<float>::iterator d_result = thrust::copy(v.begin(), v.end(), d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyMixedTypes);
+
+
+void TestCopyVectorBool(void)
+{
+    std::vector<bool> v(3);
+    v[0] = true; v[1] = false; v[2] = true;
+
+    thrust::host_vector<bool> h(3);
+    thrust::device_vector<bool> d(3);
+
+    thrust::copy(v.begin(), v.end(), h.begin());
+    thrust::copy(v.begin(), v.end(), d.begin());
+
+    ASSERT_EQUAL(h[0], true);
+    ASSERT_EQUAL(h[1], false);
+    ASSERT_EQUAL(h[2], true);
+
+    ASSERT_EQUAL(d[0], true);
+    ASSERT_EQUAL(d[1], false);
+    ASSERT_EQUAL(d[2], true);
+}
+DECLARE_UNITTEST(TestCopyVectorBool);
+
+
+template <class Vector>
+void TestCopyListTo(void)
+{
+    typedef typename Vector::value_type T;
+
+    // copy from list to Vector
+    std::list<T> l;
+    l.push_back(0);
+    l.push_back(1);
+    l.push_back(2);
+    l.push_back(3);
+    l.push_back(4);
+
+    Vector v(l.size());
+
+    typename Vector::iterator v_result = thrust::copy(l.begin(), l.end(), v.begin());
+
+    ASSERT_EQUAL(v[0], T(0));
+    ASSERT_EQUAL(v[1], T(1));
+    ASSERT_EQUAL(v[2], T(2));
+    ASSERT_EQUAL(v[3], T(3));
+    ASSERT_EQUAL(v[4], T(4));
+    ASSERT_EQUAL_QUIET(v_result, v.end());
+
+    l.clear();
+
+    thrust::copy(v.begin(), v.end(), std::back_insert_iterator< std::list<T> >(l));
+
+    ASSERT_EQUAL(l.size(), 5lu);
+
+    typename std::list<T>::const_iterator iter = l.begin();
+    ASSERT_EQUAL(*iter, T(0));  iter++;
+    ASSERT_EQUAL(*iter, T(1));  iter++;
+    ASSERT_EQUAL(*iter, T(2));  iter++;
+    ASSERT_EQUAL(*iter, T(3));  iter++;
+    ASSERT_EQUAL(*iter, T(4));  iter++;
+}
+DECLARE_VECTOR_UNITTEST(TestCopyListTo);
+
+
+template<typename T>
+struct is_even
+{
+    __host__ __device__
+    bool operator()(T x) { return (x & 1) == 0; }
+};
+
+template<typename T>
+struct is_true
+{
+    __host__ __device__
+    bool operator()(T x) { return x ? true : false; }
+};
+
+template<typename T>
+struct mod_3
+{
+    __host__ __device__
+    unsigned int operator()(T x) { return x % 3; }
+};
+
+
+template <class Vector>
+void TestCopyIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    Vector dest(4);
+
+    typename Vector::iterator dest_end = thrust::copy_if(v.begin(), v.end(), dest.begin(), is_true<T>());
+
+    ASSERT_EQUAL(1, dest[0]);
+    ASSERT_EQUAL(2, dest[1]);
+    ASSERT_EQUAL(3, dest[2]);
+    ASSERT_EQUAL(4, dest[3]);
+    ASSERT_EQUAL_QUIET(dest.end(), dest_end);
+}
+DECLARE_VECTOR_UNITTEST(TestCopyIfSimple);
+
+
+template <typename T>
+void TestCopyIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIf);
+
+
+template <typename T>
+void TestCopyIfIntegral(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
+    // test with Predicate that returns a bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_even<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+
+    // test with Predicate that returns a non-bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), mod_3<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfIntegral);
+
+
+template <typename T>
+void TestCopyIfSequence(const size_t n)
+{
+    thrust::host_vector<T>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
+    thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end());
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
+    // test with Predicate that returns a bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_even<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+
+    // test with Predicate that returns a non-bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), mod_3<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfSequence);
+
+
+template <class Vector>
+void TestCopyIfStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    Vector s(5);
+    s[0] = 1; s[1] = 1; s[2] = 0; s[3] = 1; s[4] = 0;
+
+    Vector dest(3);
+
+    typename Vector::iterator dest_end = thrust::copy_if(v.begin(), v.end(), s.begin(), dest.begin(), is_true<T>());
+
+    ASSERT_EQUAL(0, dest[0]);
+    ASSERT_EQUAL(1, dest[1]);
+    ASSERT_EQUAL(3, dest[2]);
+    ASSERT_EQUAL_QUIET(dest.end(), dest_end);
+}
+DECLARE_VECTOR_UNITTEST(TestCopyIfStencilSimple);
+
+
+template <typename T>
+void TestCopyIfStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
+    thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end());
+
+    thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_stencil = unittest::random_integers<T>(n);
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_even<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_even<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfStencil);
+
+namespace
+{
+
+struct object_with_non_trivial_ctor
+{
+  // This struct will only properly assign if its `magic` member is
+  // set to this certain number.
+  static constexpr int MAGIC = 923390;
+
+  int field;
+  int magic;
+
+  __host__ __device__ object_with_non_trivial_ctor()
+  {
+    magic = MAGIC;
+    field = 0;
+  }
+  __host__ __device__ object_with_non_trivial_ctor(int f)
+  {
+    magic = MAGIC;
+    field = f;
+  }
+
+  object_with_non_trivial_ctor(const object_with_non_trivial_ctor& x) = default;
+
+  // This non-trivial assignment requires that `this` points to initialized
+  // memory
+  __host__ __device__ object_with_non_trivial_ctor&
+  operator=(const object_with_non_trivial_ctor& x)
+  {
+    // To really copy over x's field value, require we have magic value set.
+    // If copy_if copies to uninitialized bits, the field will rarely be 923390.
+    if (magic == MAGIC)
+    {
+      field = x.field;
+    }
+    return *this;
+  }
+};
+
+struct always_true
+{
+  __host__ __device__
+  bool operator()(const object_with_non_trivial_ctor&)
+  {
+    return true;
+  };
+};
+
+} // end anon namespace
+
+void TestCopyIfNonTrivial()
+{
+  // Attempting to copy an object_with_non_trivial_ctor into uninitialized
+  // memory will fail:
+  {
+    static constexpr size_t BufferAlign = alignof(object_with_non_trivial_ctor);
+    static constexpr size_t BufferSize = sizeof(object_with_non_trivial_ctor);
+    alignas(BufferAlign) std::array<unsigned char, BufferSize> buffer;
+
+    // Fill buffer with 0s to prevent warnings about uninitialized reads while
+    // ensure that the 'magic number' mechanism works as intended:
+    std::fill(buffer.begin(), buffer.end(), 0);
+
+    object_with_non_trivial_ctor initialized;
+    object_with_non_trivial_ctor *uninitialized =
+      reinterpret_cast<object_with_non_trivial_ctor*>(buffer.data());
+
+    object_with_non_trivial_ctor source(42);
+    initialized = source;
+    *uninitialized = source;
+
+    ASSERT_EQUAL(42, initialized.field);
+    ASSERT_NOT_EQUAL(42, uninitialized->field);
+  }
+
+  // This test ensures that we use placement new instead of assigning
+  // to uninitialized memory. See Thrust Github issue #1153.
+  thrust::device_vector<object_with_non_trivial_ctor> a(10, object_with_non_trivial_ctor(99));
+  thrust::device_vector<object_with_non_trivial_ctor> b(10);
+
+  thrust::copy_if(a.begin(), a.end(), b.begin(), always_true());
+
+  for (int i = 0; i < 10; i++)
+  {
+    object_with_non_trivial_ctor ha(a[i]);
+    object_with_non_trivial_ctor hb(b[i]);
+    int ia = ha.field;
+    int ib = hb.field;
+
+    ASSERT_EQUAL(ia, ib);
+  }
+}
+DECLARE_UNITTEST(TestCopyIfNonTrivial);
+
+template <typename Vector>
+void TestCopyCountingIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::counting_iterator<T> iter(1);
+
+    Vector vec(4);
+
+    thrust::copy(iter, iter + 4, vec.begin());
+
+    ASSERT_EQUAL(vec[0], 1);
+    ASSERT_EQUAL(vec[1], 2);
+    ASSERT_EQUAL(vec[2], 3);
+    ASSERT_EQUAL(vec[3], 4);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyCountingIterator);
+
+template <typename Vector>
+void TestCopyZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3); v1[0] = 1; v1[1] = 2; v1[2] = 3;
+    Vector v2(3); v2[0] = 4; v2[1] = 5; v2[2] = 6;
+    Vector v3(3, T(0));
+    Vector v4(3, T(0));
+
+    thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())),
+                 thrust::make_zip_iterator(thrust::make_tuple(v1.end(),v2.end())),
+                 thrust::make_zip_iterator(thrust::make_tuple(v3.begin(),v4.begin())));
+
+    ASSERT_EQUAL(v1, v3);
+    ASSERT_EQUAL(v2, v4);
+};
+DECLARE_VECTOR_UNITTEST(TestCopyZipIterator);
+
+template <typename Vector>
+void TestCopyConstantIteratorToZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3,T(0));
+    Vector v2(3,T(0));
+
+    thrust::copy(thrust::make_constant_iterator(thrust::tuple<T,T>(4,7)),
+                 thrust::make_constant_iterator(thrust::tuple<T,T>(4,7)) + v1.size(),
+                 thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())));
+
+    ASSERT_EQUAL(v1[0], 4);
+    ASSERT_EQUAL(v1[1], 4);
+    ASSERT_EQUAL(v1[2], 4);
+    ASSERT_EQUAL(v2[0], 7);
+    ASSERT_EQUAL(v2[1], 7);
+    ASSERT_EQUAL(v2[2], 7);
+};
+DECLARE_VECTOR_UNITTEST(TestCopyConstantIteratorToZipIterator);
+
+template<typename InputIterator, typename OutputIterator>
+OutputIterator copy(my_system &system, InputIterator, InputIterator, OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::copy(sys,
+                 vec.begin(),
+                 vec.end(),
+                 vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestCopyDispatchExplicit);
+
+
+template<typename InputIterator, typename OutputIterator>
+OutputIterator copy(my_tag, InputIterator, InputIterator, OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::copy(thrust::retag<my_tag>(vec.begin()),
+                 thrust::retag<my_tag>(vec.end()),
+                 thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestCopyDispatchImplicit);
+
+
+template<typename InputIterator, typename OutputIterator, typename Predicate>
+OutputIterator copy_if(my_system &system, InputIterator, InputIterator, OutputIterator result, Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestCopyIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::copy_if(sys,
+                    vec.begin(),
+                    vec.end(),
+                    vec.begin(),
+                    0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestCopyIfDispatchExplicit);
+
+
+template<typename InputIterator, typename OutputIterator, typename Predicate>
+OutputIterator copy_if(my_tag, InputIterator, InputIterator, OutputIterator result, Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestCopyIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::copy_if(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.end()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestCopyIfDispatchImplicit);
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+OutputIterator copy_if(my_system &system, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestCopyIfStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::copy_if(sys,
+                    vec.begin(),
+                    vec.end(),
+                    vec.begin(),
+                    vec.begin(),
+                    0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestCopyIfStencilDispatchExplicit);
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+OutputIterator copy_if(my_tag, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestCopyIfStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::copy_if(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.end()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestCopyIfStencilDispatchImplicit);
+
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+=(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+namespace thrust
+{
+namespace detail
+{
+// We need this type to pass as a non-const ref for unary_transform_functor
+// to compile:
+template <>
+struct is_non_const_reference<only_set_when_expected_it> : thrust::true_type {};
+}
+
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+    typedef thrust::random_access_device_iterator_tag iterator_category;
+};
+}
+
+void TestCopyWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::copy(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestCopyWithBigIndexes()
+{
+    TestCopyWithBigIndexesHelper(30);
+    TestCopyWithBigIndexesHelper(31);
+    TestCopyWithBigIndexesHelper(32);
+    TestCopyWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCopyWithBigIndexes);
diff --git a/thrust/testing/copy_n.cu b/thrust/testing/copy_n.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2003b106993e0a6f5922b4eed34c72a8dc1773fb
--- /dev/null
+++ b/thrust/testing/copy_n.cu
@@ -0,0 +1,286 @@
+#include <unittest/unittest.h>
+#include <thrust/copy.h>
+
+#include <list>
+#include <iterator>
+#include <thrust/sequence.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+void TestCopyNFromConstIterator(void)
+{
+    typedef int T;
+
+    std::vector<T> v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    std::vector<int>::const_iterator begin = v.begin();
+
+    // copy to host_vector
+    thrust::host_vector<T> h(5, (T) 10);
+    thrust::host_vector<T>::iterator h_result = thrust::copy_n(begin, h.size(), h.begin());
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector
+    thrust::device_vector<T> d(5, (T) 10);
+    thrust::device_vector<T>::iterator d_result = thrust::copy_n(begin, d.size(), d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_UNITTEST(TestCopyNFromConstIterator);
+
+void TestCopyNToDiscardIterator(void)
+{
+    typedef int T;
+
+    thrust::host_vector<T> h_input(5, 1);
+    thrust::device_vector<T> d_input = h_input;
+
+    // copy from host_vector
+    thrust::discard_iterator<> h_result =
+      thrust::copy_n(h_input.begin(), h_input.size(), thrust::make_discard_iterator());
+
+    // copy from device_vector
+    thrust::discard_iterator<> d_result =
+      thrust::copy_n(d_input.begin(), d_input.size(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(5);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_UNITTEST(TestCopyNToDiscardIterator);
+
+template <class Vector>
+void TestCopyNMatchingTypes(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    // copy to host_vector
+    thrust::host_vector<T> h(5, (T) 10);
+    typename thrust::host_vector<T>::iterator h_result = thrust::copy_n(v.begin(), v.size(), h.begin());
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector
+    thrust::device_vector<T> d(5, (T) 10);
+    typename thrust::device_vector<T>::iterator d_result = thrust::copy_n(v.begin(), v.size(), d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_VECTOR_UNITTEST(TestCopyNMatchingTypes);
+
+template <class Vector>
+void TestCopyNMixedTypes(void)
+{
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    // copy to host_vector with different type
+    thrust::host_vector<float> h(5, (float) 10);
+    typename thrust::host_vector<float>::iterator h_result = thrust::copy_n(v.begin(), v.size(), h.begin());
+
+    ASSERT_EQUAL(h[0], 0);
+    ASSERT_EQUAL(h[1], 1);
+    ASSERT_EQUAL(h[2], 2);
+    ASSERT_EQUAL(h[3], 3);
+    ASSERT_EQUAL(h[4], 4);
+    ASSERT_EQUAL_QUIET(h_result, h.end());
+
+    // copy to device_vector with different type
+    thrust::device_vector<float> d(5, (float) 10);
+    typename thrust::device_vector<float>::iterator d_result = thrust::copy_n(v.begin(), v.size(), d.begin());
+    ASSERT_EQUAL(d[0], 0);
+    ASSERT_EQUAL(d[1], 1);
+    ASSERT_EQUAL(d[2], 2);
+    ASSERT_EQUAL(d[3], 3);
+    ASSERT_EQUAL(d[4], 4);
+    ASSERT_EQUAL_QUIET(d_result, d.end());
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyNMixedTypes);
+
+
+void TestCopyNVectorBool(void)
+{
+    std::vector<bool> v(3);
+    v[0] = true; v[1] = false; v[2] = true;
+
+    thrust::host_vector<bool> h(3);
+    thrust::device_vector<bool> d(3);
+    
+    thrust::copy_n(v.begin(), v.size(), h.begin());
+    thrust::copy_n(v.begin(), v.size(), d.begin());
+
+    ASSERT_EQUAL(h[0], true);
+    ASSERT_EQUAL(h[1], false);
+    ASSERT_EQUAL(h[2], true);
+
+    ASSERT_EQUAL(d[0], true);
+    ASSERT_EQUAL(d[1], false);
+    ASSERT_EQUAL(d[2], true);
+}
+DECLARE_UNITTEST(TestCopyNVectorBool);
+
+
+template <class Vector>
+void TestCopyNListTo(void)
+{
+    typedef typename Vector::value_type T;
+
+    // copy from list to Vector
+    std::list<T> l;
+    l.push_back(0);
+    l.push_back(1);
+    l.push_back(2);
+    l.push_back(3);
+    l.push_back(4);
+   
+    Vector v(l.size());
+
+    typename Vector::iterator v_result = thrust::copy_n(l.begin(), l.size(), v.begin());
+
+    ASSERT_EQUAL(v[0], T(0));
+    ASSERT_EQUAL(v[1], T(1));
+    ASSERT_EQUAL(v[2], T(2));
+    ASSERT_EQUAL(v[3], T(3));
+    ASSERT_EQUAL(v[4], T(4));
+    ASSERT_EQUAL_QUIET(v_result, v.end());
+
+    l.clear();
+
+    thrust::copy_n(v.begin(), v.size(), std::back_insert_iterator< std::list<T> >(l));
+
+    ASSERT_EQUAL(l.size(), 5lu);
+
+    typename std::list<T>::const_iterator iter = l.begin();
+    ASSERT_EQUAL(*iter, T(0));  iter++;
+    ASSERT_EQUAL(*iter, T(1));  iter++;
+    ASSERT_EQUAL(*iter, T(2));  iter++;
+    ASSERT_EQUAL(*iter, T(3));  iter++;
+    ASSERT_EQUAL(*iter, T(4));  iter++;
+}
+DECLARE_VECTOR_UNITTEST(TestCopyNListTo);
+
+
+template <typename Vector>
+void TestCopyNCountingIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::counting_iterator<T> iter(1);
+
+    Vector vec(4);
+
+    thrust::copy_n(iter, 4, vec.begin());
+
+    ASSERT_EQUAL(vec[0], T(1));
+    ASSERT_EQUAL(vec[1], T(2));
+    ASSERT_EQUAL(vec[2], T(3));
+    ASSERT_EQUAL(vec[3], T(4));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyNCountingIterator);
+
+template <typename Vector>
+void TestCopyNZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3); v1[0] = 1; v1[1] = 2; v1[2] = 3;
+    Vector v2(3); v2[0] = 4; v2[1] = 5; v2[2] = 6; 
+    Vector v3(3, T(0));
+    Vector v4(3, T(0));
+
+    thrust::copy_n(thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())),
+                   3,
+                   thrust::make_zip_iterator(thrust::make_tuple(v3.begin(),v4.begin())));
+
+    ASSERT_EQUAL(v1, v3);
+    ASSERT_EQUAL(v2, v4);
+};
+DECLARE_VECTOR_UNITTEST(TestCopyNZipIterator);
+
+template <typename Vector>
+void TestCopyNConstantIteratorToZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3, T(0));
+    Vector v2(3, T(0));
+
+    thrust::copy_n(thrust::make_constant_iterator(thrust::tuple<T,T>(4,7)),
+                   v1.size(),
+                   thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())));
+
+    ASSERT_EQUAL(v1[0], T(4));
+    ASSERT_EQUAL(v1[1], T(4));
+    ASSERT_EQUAL(v1[2], T(4));
+    ASSERT_EQUAL(v2[0], T(7));
+    ASSERT_EQUAL(v2[1], T(7));
+    ASSERT_EQUAL(v2[2], T(7));
+};
+DECLARE_VECTOR_UNITTEST(TestCopyNConstantIteratorToZipIterator);
+
+template<typename InputIterator, typename Size, typename OutputIterator>
+OutputIterator copy_n(my_system &system, InputIterator, Size, OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestCopyNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::copy_n(sys,
+                   vec.begin(),
+                   1,
+                   vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestCopyNDispatchExplicit);
+
+
+template<typename InputIterator, typename Size, typename OutputIterator>
+OutputIterator copy_n(my_tag, InputIterator, Size, OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestCopyNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::copy_n(thrust::retag<my_tag>(vec.begin()),
+                   1,
+                   thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestCopyNDispatchImplicit);
+
diff --git a/thrust/testing/count.cu b/thrust/testing/count.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a6021da79fb095a69f316355fb249ce5c358e351
--- /dev/null
+++ b/thrust/testing/count.cu
@@ -0,0 +1,137 @@
+#include <unittest/unittest.h>
+#include <thrust/count.h>
+#include <thrust/iterator/retag.h>
+
+template <class Vector>
+void TestCountSimple(void)
+{
+    Vector data(5);
+    data[0] = 1; data[1] = 1; data[2] = 0; data[3] = 0; data[4] = 1;
+
+    ASSERT_EQUAL(thrust::count(data.begin(), data.end(), 0), 2);
+    ASSERT_EQUAL(thrust::count(data.begin(), data.end(), 1), 3);
+    ASSERT_EQUAL(thrust::count(data.begin(), data.end(), 2), 0);
+}
+DECLARE_VECTOR_UNITTEST(TestCountSimple);
+
+template <typename T>
+void TestCount(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t cpu_result = thrust::count(h_data.begin(), h_data.end(), T(5));
+    size_t gpu_result = thrust::count(d_data.begin(), d_data.end(), T(5));
+
+    ASSERT_EQUAL(cpu_result, gpu_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestCount);
+
+
+
+
+template <typename T>
+struct greater_than_five
+{
+  __host__ __device__ bool operator()(const T &x) const {return x > 5;}
+};
+
+template <class Vector>
+void TestCountIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] = 1; data[1] = 6; data[2] = 1; data[3] = 9; data[4] = 2;
+
+    ASSERT_EQUAL(thrust::count_if(data.begin(), data.end(), greater_than_five<T>()), 2);
+}
+DECLARE_VECTOR_UNITTEST(TestCountIfSimple);
+
+
+template <typename T>
+void TestCountIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t cpu_result = thrust::count_if(h_data.begin(), h_data.end(), greater_than_five<T>());
+    size_t gpu_result = thrust::count_if(d_data.begin(), d_data.end(), greater_than_five<T>());
+
+    ASSERT_EQUAL(cpu_result, gpu_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestCountIf);
+
+
+template <typename Vector>
+void TestCountFromConstIteratorSimple(void)
+{
+    Vector data(5);
+    data[0] = 1; data[1] = 1; data[2] = 0; data[3] = 0; data[4] = 1;
+
+    ASSERT_EQUAL(thrust::count(data.cbegin(), data.cend(), 0), 2);
+    ASSERT_EQUAL(thrust::count(data.cbegin(), data.cend(), 1), 3);
+    ASSERT_EQUAL(thrust::count(data.cbegin(), data.cend(), 2), 0);
+}
+DECLARE_VECTOR_UNITTEST(TestCountFromConstIteratorSimple);
+
+
+template<typename InputIterator, typename EqualityComparable>
+int count(my_system &system, InputIterator, InputIterator, EqualityComparable x)
+{
+    system.validate_dispatch();
+    return x;
+}
+
+void TestCountDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::count(sys,
+                  vec.begin(),
+                  vec.end(),
+                  13);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestCountDispatchExplicit); 
+
+
+template<typename InputIterator, typename EqualityComparable>
+int count(my_tag, InputIterator /*first*/, InputIterator, EqualityComparable x)
+{
+    return x;
+}
+
+void TestCountDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    int result = thrust::count(thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.end()),
+                               13);
+
+    ASSERT_EQUAL(13, result);
+}
+DECLARE_UNITTEST(TestCountDispatchImplicit);
+
+void TestCountWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::count(thrust::device, begin, end, (1ll << magnitude) - 17);
+
+    ASSERT_EQUAL(result, 1);
+}
+
+void TestCountWithBigIndexes()
+{
+    TestCountWithBigIndexesHelper(30);
+    TestCountWithBigIndexesHelper(31);
+    TestCountWithBigIndexesHelper(32);
+    TestCountWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCountWithBigIndexes);
diff --git a/thrust/testing/counting_iterator.cu b/thrust/testing/counting_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eede510fc4abbf7097c3fb335b5683fc2d7eca18
--- /dev/null
+++ b/thrust/testing/counting_iterator.cu
@@ -0,0 +1,224 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/binary_search.h>
+#include <thrust/distance.h>
+#include <thrust/detail/cstdint.h>
+
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+void TestCountingIteratorCopyConstructor(void)
+{
+    thrust::counting_iterator<int> iter0(100);
+
+    thrust::counting_iterator<int> iter1(iter0);
+
+    ASSERT_EQUAL_QUIET(iter0, iter1);
+    ASSERT_EQUAL(*iter0, *iter1);
+
+    // construct from related space
+    thrust::counting_iterator<int, thrust::host_system_tag> h_iter = iter0;
+    ASSERT_EQUAL(*iter0, *h_iter);
+
+    thrust::counting_iterator<int, thrust::device_system_tag> d_iter = iter0;
+    ASSERT_EQUAL(*iter0, *d_iter);
+}
+DECLARE_UNITTEST(TestCountingIteratorCopyConstructor);
+
+
+void TestCountingIteratorIncrement(void)
+{
+    thrust::counting_iterator<int> iter(0);
+
+    ASSERT_EQUAL(*iter, 0);
+
+    iter++;
+
+    ASSERT_EQUAL(*iter, 1);
+    
+    iter++;
+    iter++;
+    
+    ASSERT_EQUAL(*iter, 3);
+
+    iter += 5;
+    
+    ASSERT_EQUAL(*iter, 8);
+
+    iter -= 10;
+    
+    ASSERT_EQUAL(*iter, -2);
+}
+DECLARE_UNITTEST(TestCountingIteratorIncrement);
+
+
+void TestCountingIteratorComparison(void)
+{
+    thrust::counting_iterator<int> iter1(0);
+    thrust::counting_iterator<int> iter2(0);
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+
+    iter1++;
+    
+    ASSERT_EQUAL(iter1 - iter2, 1);
+    ASSERT_EQUAL(iter1 == iter2, false);
+   
+    iter2++;
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+  
+    iter1 += 100;
+    iter2 += 100;
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+}
+DECLARE_UNITTEST(TestCountingIteratorComparison);
+
+
+void TestCountingIteratorFloatComparison(void)
+{
+    thrust::counting_iterator<float> iter1(0);
+    thrust::counting_iterator<float> iter2(0);
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+    ASSERT_EQUAL(iter1 <  iter2, false);
+    ASSERT_EQUAL(iter2 <  iter1, false);
+
+    iter1++;
+    
+    ASSERT_EQUAL(iter1 - iter2, 1);
+    ASSERT_EQUAL(iter1 == iter2, false);
+    ASSERT_EQUAL(iter2 < iter1, true); 
+    ASSERT_EQUAL(iter1 < iter2, false); 
+   
+    iter2++;
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+    ASSERT_EQUAL(iter1 < iter2, false);
+    ASSERT_EQUAL(iter2 < iter1, false);
+  
+    iter1 += 100;
+    iter2 += 100;
+
+    ASSERT_EQUAL(iter1 - iter2, 0);
+    ASSERT_EQUAL(iter1 == iter2, true);
+    ASSERT_EQUAL(iter1 < iter2, false);
+    ASSERT_EQUAL(iter2 < iter1, false);
+
+
+    thrust::counting_iterator<float> iter3(0);
+    thrust::counting_iterator<float> iter4(0.5);
+
+    ASSERT_EQUAL(iter3 - iter4, 0);
+    ASSERT_EQUAL(iter3 == iter4, true);
+    ASSERT_EQUAL(iter3 < iter4, false);
+    ASSERT_EQUAL(iter4 < iter3, false);
+
+    iter3++; // iter3 = 1.0, iter4 = 0.5
+    
+    ASSERT_EQUAL(iter3 - iter4, 0);
+    ASSERT_EQUAL(iter3 == iter4, true);
+    ASSERT_EQUAL(iter3 < iter4, false);
+    ASSERT_EQUAL(iter4 < iter3, false);
+   
+    iter4++; // iter3 = 1.0, iter4 = 1.5
+
+    ASSERT_EQUAL(iter3 - iter4, 0);
+    ASSERT_EQUAL(iter3 == iter4, true);
+    ASSERT_EQUAL(iter3 < iter4, false);
+    ASSERT_EQUAL(iter4 < iter3, false);
+
+    iter4++; // iter3 = 1.0, iter4 = 2.5
+
+    ASSERT_EQUAL(iter3 - iter4, -1);
+    ASSERT_EQUAL(iter4 - iter3,  1);
+    ASSERT_EQUAL(iter3 == iter4, false);
+    ASSERT_EQUAL(iter3 < iter4, true);
+    ASSERT_EQUAL(iter4 < iter3, false);
+}
+DECLARE_UNITTEST(TestCountingIteratorFloatComparison);
+
+
+void TestCountingIteratorDistance(void)
+{
+    thrust::counting_iterator<int> iter1(0);
+    thrust::counting_iterator<int> iter2(5);
+
+    ASSERT_EQUAL(thrust::distance(iter1, iter2), 5);
+
+    iter1++;
+    
+    ASSERT_EQUAL(thrust::distance(iter1, iter2), 4);
+   
+    iter2 += 100;
+
+    ASSERT_EQUAL(thrust::distance(iter1, iter2), 104);
+}
+DECLARE_UNITTEST(TestCountingIteratorDistance);
+
+
+void TestCountingIteratorUnsignedType(void)
+{
+    thrust::counting_iterator<unsigned int> iter0(0);
+    thrust::counting_iterator<unsigned int> iter1(5);
+
+    ASSERT_EQUAL(iter1 - iter0,  5);
+    ASSERT_EQUAL(iter0 - iter1, -5);
+    ASSERT_EQUAL(iter0 != iter1, true);
+    ASSERT_EQUAL(iter0 <  iter1, true);
+    ASSERT_EQUAL(iter1 <  iter0, false);
+}
+DECLARE_UNITTEST(TestCountingIteratorUnsignedType);
+
+
+void TestCountingIteratorLowerBound(void)
+{
+    size_t n = 10000;
+    const size_t M = 100;
+
+    thrust::host_vector<unsigned int> h_data = unittest::random_integers<unsigned int>(n);
+    for(unsigned int i = 0; i < n; ++i)
+      h_data[i] %= M;
+
+    thrust::sort(h_data.begin(), h_data.end());
+
+    thrust::device_vector<unsigned int> d_data = h_data;
+
+    thrust::counting_iterator<unsigned int> search_begin(0);
+    thrust::counting_iterator<unsigned int> search_end(M);
+
+
+    thrust::host_vector<unsigned int> h_result(M);
+    thrust::device_vector<unsigned int> d_result(M);
+
+
+    thrust::lower_bound(h_data.begin(), h_data.end(), search_begin, search_end, h_result.begin());
+
+    thrust::lower_bound(d_data.begin(), d_data.end(), search_begin, search_end, d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_UNITTEST(TestCountingIteratorLowerBound);
+
+void TestCountingIteratorDifference(void)
+{
+    typedef thrust::counting_iterator<thrust::detail::uint64_t> Iterator;
+    typedef thrust::iterator_difference<Iterator>::type Difference;
+
+    Difference diff = std::numeric_limits<thrust::detail::uint32_t>::max() + 1;
+
+    Iterator first(0);
+    Iterator last = first + diff;
+
+    ASSERT_EQUAL(diff, last - first);
+}
+DECLARE_UNITTEST(TestCountingIteratorDifference);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/cpp/CMakeLists.txt b/thrust/testing/cpp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..215b81ee43d96fa6878ed9923c78b398d0a91d9f
--- /dev/null
+++ b/thrust/testing/cpp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CPP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cpp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/thrust/testing/cpp/adjacent_difference.cu b/thrust/testing/cpp/adjacent_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..584899becbdd6c01fc613ccfb09d4679ec2025d7
--- /dev/null
+++ b/thrust/testing/cpp/adjacent_difference.cu
@@ -0,0 +1,54 @@
+#include <unittest/unittest.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+struct detect_wrong_difference
+{
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/thrust/testing/cstdint.cu b/thrust/testing/cstdint.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5284955fd79aae10bd722065df98703081df5e82
--- /dev/null
+++ b/thrust/testing/cstdint.cu
@@ -0,0 +1,43 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/cstdint.h>
+
+#include <limits>
+
+void TestStandardIntegerTypes(void)
+{
+  ASSERT_EQUAL(sizeof(thrust::detail::int8_t),   1lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int16_t),  2lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int32_t),  4lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int64_t),  8lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint8_t),  1lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint16_t), 2lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint32_t), 4lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint64_t), 8lu);
+
+  ASSERT_EQUAL(sizeof(thrust::detail::intptr_t),  sizeof(void *));
+  ASSERT_EQUAL(sizeof(thrust::detail::uintptr_t), sizeof(void *));
+
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int8_t >::is_integer,   true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int16_t>::is_integer,   true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int32_t>::is_integer,   true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int64_t>::is_integer,   true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint8_t >::is_integer,  true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint16_t>::is_integer,  true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint32_t>::is_integer,  true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint64_t>::is_integer,  true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int8_t >::is_signed,    true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int16_t>::is_signed,    true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int32_t>::is_signed,    true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::int64_t>::is_signed,    true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint8_t >::is_signed,   false);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint16_t>::is_signed,   false);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint32_t>::is_signed,   false);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uint64_t>::is_signed,   false);
+  
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::intptr_t>::is_integer,  true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uintptr_t>::is_integer, true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::intptr_t>::is_signed,   true);
+  ASSERT_EQUAL(std::numeric_limits<thrust::detail::uintptr_t>::is_signed,  false);
+}
+DECLARE_UNITTEST(TestStandardIntegerTypes);
+
diff --git a/thrust/testing/cuda/CMakeLists.txt b/thrust/testing/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6df1b19c042918732b29fa5d92522facb09a70bd
--- /dev/null
+++ b/thrust/testing/cuda/CMakeLists.txt
@@ -0,0 +1,28 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# These tests always build with RDC, so make sure that the sm_XX flags are
+# compatible. See note in ThrustCudaConfig.cmake.
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cuda.")
+
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+
+    # All in testing/cuda will test device-side launch (aka calling parallel
+    # algorithms from device code), which requires the CUDA device-side runtime,
+    # which requires RDC, so these always need to be built with RDC.
+    thrust_enable_rdc_for_cuda_target(${test_target})
+  endforeach()
+endforeach()
diff --git a/thrust/testing/cuda/adjacent_difference.cu b/thrust/testing/cuda/adjacent_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..96f3a523443e7c7ff4dc5d0993651c37fbef3d3b
--- /dev/null
+++ b/thrust/testing/cuda/adjacent_difference.cu
@@ -0,0 +1,146 @@
+#include <unittest/unittest.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__ void adjacent_difference_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::adjacent_difference(exec, first, last, result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename BinaryFunction>
+__global__ void adjacent_difference_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, BinaryFunction f)
+{
+  thrust::adjacent_difference(exec, first, last, result, f);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
+  thrust::device_vector<T> d_input = h_input;
+
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
+  adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_output, d_output);
+
+  thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
+  adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_output, d_output);
+
+  // in-place operation
+  thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
+  adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_input, h_output); //computed previously
+  ASSERT_EQUAL(d_input, d_output); //computed previously
+}
+
+
+template<typename T>
+void TestAdjacentDifferenceDeviceSeq(const size_t n)
+{
+  TestAdjacentDifferenceDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDeviceSeq);
+
+
+template<typename T>
+void TestAdjacentDifferenceDeviceDevice(const size_t n)
+{
+  TestAdjacentDifferenceDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDeviceDevice);
+
+
+void TestAdjacentDifferenceCudaStreams()
+{
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::device_vector<int> input(3);
+  thrust::device_vector<int> output(3);
+  input[0] = 1; input[1] = 4; input[2] = 6;
+
+  thrust::adjacent_difference(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin());
+
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 3);
+  ASSERT_EQUAL(output[2], 2);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceCudaStreams);
+
+struct detect_wrong_difference
+{
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/thrust/testing/cuda/adjacent_difference.mk b/thrust/testing/cuda/adjacent_difference.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/adjacent_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/binary_search.cu b/thrust/testing/cuda/binary_search.cu
new file mode 100644
index 0000000000000000000000000000000000000000..58a83f61cec9842161f34b836126867614a45cfe
--- /dev/null
+++ b/thrust/testing/cuda/binary_search.cu
@@ -0,0 +1,25 @@
+#include <unittest/unittest.h>
+
+#include <thrust/binary_search.h>
+#include <thrust/device_vector.h>
+#include <thrust/distance.h>
+#include <thrust/pair.h>
+#include <thrust/sequence.h>
+
+void TestEqualRangeOnStream()
+{ // Regression test for GH issue #921 (nvbug 2173437)
+  typedef typename thrust::device_vector<int> vector_t;
+  typedef typename vector_t::iterator iterator_t;
+  typedef thrust::pair<iterator_t, iterator_t> result_t;
+
+  vector_t input(10);
+  thrust::sequence(thrust::device, input.begin(), input.end(), 0);
+  cudaStream_t stream = 0;
+  result_t result = thrust::equal_range(thrust::cuda::par.on(stream),
+                                        input.begin(), input.end(),
+                                        5);
+
+  ASSERT_EQUAL(5, thrust::distance(input.begin(), result.first));
+  ASSERT_EQUAL(6, thrust::distance(input.begin(), result.second));
+}
+DECLARE_UNITTEST(TestEqualRangeOnStream);
diff --git a/thrust/testing/cuda/binary_search.mk b/thrust/testing/cuda/binary_search.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/binary_search.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/complex.cu b/thrust/testing/cuda/complex.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8034541ffb6f6dbb9eb09fa9c1574e36faa0b350
--- /dev/null
+++ b/thrust/testing/cuda/complex.cu
@@ -0,0 +1,53 @@
+#include <unittest/unittest.h>
+
+#include <thrust/complex.h>
+#include <thrust/detail/preprocessor.h>
+#include <thrust/detail/alignment.h>
+
+#include <cuda_fp16.h>
+
+template <typename T, typename VectorT>
+void TestComplexAlignment()
+{
+  THRUST_STATIC_ASSERT(
+    sizeof(thrust::complex<T>) == sizeof(VectorT)
+  );
+  THRUST_STATIC_ASSERT(
+    THRUST_ALIGNOF(thrust::complex<T>) == THRUST_ALIGNOF(VectorT)
+  );
+
+  THRUST_STATIC_ASSERT(
+    sizeof(thrust::complex<T const>) == sizeof(VectorT)
+  );
+  THRUST_STATIC_ASSERT(
+    THRUST_ALIGNOF(thrust::complex<T const>) == THRUST_ALIGNOF(VectorT)
+  );
+}
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<char, char2>)
+, TestComplexCharAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<short, short2>)
+, TestComplexShortAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<int, int2>)
+, TestComplexIntAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<long, long2>)
+, TestComplexLongAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<__half, __half2>)
+, TestComplexHalfAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<float, float2>)
+, TestComplexFloatAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<double, double2>)
+, TestComplexDoubleAlignment
+);
diff --git a/thrust/testing/cuda/complex.mk b/thrust/testing/cuda/complex.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/complex.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/copy.cu b/thrust/testing/cuda/copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1ad6e26262a8641899ce0bd7a92d3920d692cf96
--- /dev/null
+++ b/thrust/testing/cuda/copy.cu
@@ -0,0 +1,92 @@
+#include <unittest/unittest.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::copy(exec, first, last, result);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestCopyDevice(ExecutionPolicy exec, size_t n)
+{
+  thrust::host_vector<T>   h_src = unittest::random_integers<T>(n);
+  thrust::host_vector<T>   h_dst(n);
+
+  thrust::device_vector<T> d_src = h_src;
+  thrust::device_vector<T> d_dst(n);
+  
+  thrust::copy(h_src.begin(), h_src.end(), h_dst.begin());
+  copy_kernel<<<1,1>>>(exec, d_src.begin(), d_src.end(), d_dst.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_dst, d_dst);
+}
+
+
+template<typename T>
+void TestCopyDeviceSeq(size_t n)
+{
+  TestCopyDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCopyDeviceSeq);
+
+
+template<typename T>
+void TestCopyDeviceDevice(size_t n)
+{
+  TestCopyDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCopyDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Size, typename Iterator2>
+__global__
+void copy_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, Iterator2 result)
+{
+  thrust::copy_n(exec, first, n, result);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestCopyNDevice(ExecutionPolicy exec, size_t n)
+{
+  thrust::host_vector<T>   h_src = unittest::random_integers<T>(n);
+  thrust::host_vector<T>   h_dst(n);
+
+  thrust::device_vector<T> d_src = h_src;
+  thrust::device_vector<T> d_dst(n);
+  
+  thrust::copy_n(h_src.begin(), h_src.size(), h_dst.begin());
+  copy_n_kernel<<<1,1>>>(exec, d_src.begin(), d_src.size(), d_dst.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_dst, d_dst);
+}
+
+
+template<typename T>
+void TestCopyNDeviceSeq(size_t n)
+{
+  TestCopyNDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCopyNDeviceSeq);
+
+
+template<typename T>
+void TestCopyNDeviceDevice(size_t n)
+{
+  TestCopyNDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCopyNDeviceDevice);
+
diff --git a/thrust/testing/cuda/copy.mk b/thrust/testing/cuda/copy.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/copy_if.cu b/thrust/testing/cuda/copy_if.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dcec12fdeb4124e22e5b2579ac0a26676240706f
--- /dev/null
+++ b/thrust/testing/cuda/copy_if.cu
@@ -0,0 +1,250 @@
+#include <unittest/unittest.h>
+#include <thrust/copy.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) { return (static_cast<unsigned int>(x) & 1) == 0; }
+};
+
+
+template<typename T>
+struct mod_3
+{
+  __host__ __device__
+  unsigned int operator()(T x) { return static_cast<unsigned int>(x) % 3; }
+};
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
+__global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Predicate pred, Iterator3 result2)
+{
+  *result2 = thrust::copy_if(exec, first, last, result1, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestCopyIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  typename thrust::host_vector<int>::iterator   h_new_end;
+  typename thrust::device_vector<int>::iterator d_new_end;
+
+  thrust::device_vector<
+    typename thrust::device_vector<int>::iterator
+  > d_new_end_vec(1);
+  
+  // test with Predicate that returns a bool
+  {
+    thrust::host_vector<int>   h_result(n);
+    thrust::device_vector<int> d_result(n);
+    
+    h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<int>());
+
+    copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_even<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
+    d_new_end = d_new_end_vec[0];
+    
+    h_result.resize(h_new_end - h_result.begin());
+    d_result.resize(d_new_end - d_result.begin());
+    
+    ASSERT_EQUAL(h_result, d_result);
+  }
+  
+  // test with Predicate that returns a non-bool
+  {
+    thrust::host_vector<int>   h_result(n);
+    thrust::device_vector<int> d_result(n);
+    
+    h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<int>());
+
+    copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), mod_3<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
+    d_new_end = d_new_end_vec[0];
+    
+    h_result.resize(h_new_end - h_result.begin());
+    d_result.resize(d_new_end - d_result.begin());
+    
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+
+
+void TestCopyIfDeviceSeq()
+{
+  TestCopyIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestCopyIfDeviceSeq);
+
+
+void TestCopyIfDeviceDevice()
+{
+  TestCopyIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestCopyIfDeviceDevice);
+
+
+void TestCopyIfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
+                                         data.begin(), 
+                                         data.end(), 
+                                         result.begin(),
+                                         is_even<int>());
+
+  ASSERT_EQUAL(end - result.begin(), 2);
+
+  ASSERT_EQUAL(result[0], 2);
+  ASSERT_EQUAL(result[1], 2);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestCopyIfCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
+__global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result1, Predicate pred, Iterator4 result2)
+{
+  *result2 = thrust::copy_if(exec, first, last, stencil_first, result1, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestCopyIfStencilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
+  thrust::device_vector<int> d_data(n); thrust::sequence(d_data.begin(), d_data.end()); 
+  
+  thrust::host_vector<int>   h_stencil = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_stencil = unittest::random_integers<int>(n);
+  
+  thrust::host_vector<int>   h_result(n);
+  thrust::device_vector<int> d_result(n);
+  
+  typename thrust::host_vector<int>::iterator   h_new_end;
+  typename thrust::device_vector<int>::iterator d_new_end;
+
+  thrust::device_vector<
+    typename thrust::device_vector<int>::iterator
+  > d_new_end_vec(1);
+  
+  // test with Predicate that returns a bool
+  {
+    thrust::host_vector<int>   h_result(n);
+    thrust::device_vector<int> d_result(n);
+    
+    h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<int>());
+
+    copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_even<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
+    d_new_end = d_new_end_vec[0];
+    
+    h_result.resize(h_new_end - h_result.begin());
+    d_result.resize(d_new_end - d_result.begin());
+    
+    ASSERT_EQUAL(h_result, d_result);
+  }
+  
+  // test with Predicate that returns a non-bool
+  {
+    thrust::host_vector<int>   h_result(n);
+    thrust::device_vector<int> d_result(n);
+    
+    h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<int>());
+
+    copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), mod_3<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
+    d_new_end = d_new_end_vec[0];
+    
+    h_result.resize(h_new_end - h_result.begin());
+    d_result.resize(d_new_end - d_result.begin());
+    
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+
+
+void TestCopyIfStencilDeviceSeq()
+{
+  TestCopyIfStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestCopyIfStencilDeviceSeq);
+
+
+void TestCopyIfStencilDeviceDevice()
+{
+  TestCopyIfStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice);
+
+
+void TestCopyIfStencilCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector result(5);
+
+  Vector stencil(5);
+  stencil[0] = 0;
+  stencil[1] = 1;
+  stencil[2] = 0;
+  stencil[3] = 0;
+  stencil[4] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
+                                         data.begin(), 
+                                         data.end(),
+                                         stencil.begin(),
+                                         result.begin(),
+                                         thrust::identity<T>());
+
+  ASSERT_EQUAL(end - result.begin(), 2);
+
+  ASSERT_EQUAL(result[0], 2);
+  ASSERT_EQUAL(result[1], 2);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestCopyIfStencilCudaStreams);
+
diff --git a/thrust/testing/cuda/copy_if.mk b/thrust/testing/cuda/copy_if.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/copy_if.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/count.cu b/thrust/testing/cuda/count.cu
new file mode 100644
index 0000000000000000000000000000000000000000..32835f5c4bb116547880185207a408d2de1fba67
--- /dev/null
+++ b/thrust/testing/cuda/count.cu
@@ -0,0 +1,111 @@
+#include <unittest/unittest.h>
+#include <thrust/count.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
+__global__
+void count_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
+{
+  *result = thrust::count(exec, first, last, value);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestCountDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+
+  thrust::device_vector<size_t> d_result(1);
+  
+  size_t h_result = thrust::count(h_data.begin(), h_data.end(), T(5));
+
+  count_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), T(5), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_result, d_result[0]);
+}
+
+
+template<typename T>
+void TestCountDeviceSeq(const size_t n)
+{
+  TestCountDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCountDeviceSeq);
+
+
+template<typename T>
+void TestCountDeviceDevice(const size_t n)
+{
+  TestCountDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCountDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
+__global__
+void count_if_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::count_if(exec, first, last, pred);
+}
+
+
+template<typename T>
+struct greater_than_five
+{
+  __host__ __device__ bool operator()(const T &x) const {return x > 5;}
+};
+
+
+template<typename T, typename ExecutionPolicy>
+void TestCountIfDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+
+  thrust::device_vector<size_t> d_result(1);
+  
+  size_t h_result = thrust::count_if(h_data.begin(), h_data.end(), greater_than_five<T>());
+  count_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), greater_than_five<T>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_result, d_result[0]);
+}
+
+
+template<typename T>
+void TestCountIfDeviceSeq(const size_t n)
+{
+  TestCountIfDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCountIfDeviceSeq);
+
+
+template<typename T>
+void TestCountIfDeviceDevice(const size_t n)
+{
+  TestCountIfDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestCountIfDeviceDevice);
+
+
+void TestCountCudaStreams()
+{
+  thrust::device_vector<int> data(5);
+  data[0] = 1; data[1] = 1; data[2] = 0; data[3] = 0; data[4] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  ASSERT_EQUAL(thrust::count(thrust::cuda::par.on(s), data.begin(), data.end(), 0), 2);
+  ASSERT_EQUAL(thrust::count(thrust::cuda::par.on(s), data.begin(), data.end(), 1), 3);
+  ASSERT_EQUAL(thrust::count(thrust::cuda::par.on(s), data.begin(), data.end(), 2), 0);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestCountCudaStreams);
+
diff --git a/thrust/testing/cuda/count.mk b/thrust/testing/cuda/count.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/count.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/cudart.cu b/thrust/testing/cuda/cudart.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f834cf5d33f8f4fc9502774295717f9ea9b36129
--- /dev/null
+++ b/thrust/testing/cuda/cudart.cu
@@ -0,0 +1,15 @@
+#include <unittest/unittest.h>
+#include <cuda_runtime_api.h>
+#include <thrust/detail/util/align.h>
+
+template<typename T>
+void TestCudaMallocResultAligned(const std::size_t n)
+{
+  T *ptr = 0;
+  cudaMalloc(&ptr, n * sizeof(T));
+  cudaFree(ptr);
+
+  ASSERT_EQUAL(true, thrust::detail::util::is_aligned(ptr));
+}
+DECLARE_VARIABLE_UNITTEST(TestCudaMallocResultAligned);
+
diff --git a/thrust/testing/cuda/cudart.mk b/thrust/testing/cuda/cudart.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/cudart.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/equal.cu b/thrust/testing/cuda/equal.cu
new file mode 100644
index 0000000000000000000000000000000000000000..84eb7254d82bb14632799c5919869f64f49ba52c
--- /dev/null
+++ b/thrust/testing/cuda/equal.cu
@@ -0,0 +1,122 @@
+#include <unittest/unittest.h>
+#include <thrust/equal.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void equal_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
+{
+  *result = thrust::equal(exec, first1, last1, first2);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename BinaryPredicate, typename Iterator3>
+__global__
+void equal_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, BinaryPredicate pred, Iterator3 result)
+{
+  *result = thrust::equal(exec, first1, last1, first2, pred);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestEqualDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::device_vector<T> d_data1 = unittest::random_samples<T>(n);
+  thrust::device_vector<T> d_data2 = unittest::random_samples<T>(n);
+  thrust::device_vector<bool> d_result(1, false);
+  
+  //empty ranges
+  equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin(), d_data1.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_result[0], true);
+  
+  //symmetric cases
+  equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.end(), d_data1.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_result[0], true);
+  
+  if(n > 0)
+  {
+    d_data1[0] = 0; d_data2[0] = 1;
+    
+    //different vectors
+    equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.end(), d_data2.begin(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(d_result[0], false);
+    
+    //different predicates
+    equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::less<T>(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(d_result[0], true);
+
+    equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::greater<T>(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(d_result[0], false);
+  }
+}
+
+
+template<typename T>
+void TestEqualDeviceSeq(const size_t n)
+{
+  TestEqualDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestEqualDeviceSeq);
+
+
+template<typename T>
+void TestEqualDeviceDevice(const size_t n)
+{
+  TestEqualDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestEqualDeviceDevice);
+
+
+void TestEqualCudaStreams()
+{
+  thrust::device_vector<int> v1(5);
+  thrust::device_vector<int> v2(5);
+  v1[0] = 5; v1[1] = 2; v1[2] = 0; v1[3] = 0; v1[4] = 0;
+  v2[0] = 5; v2[1] = 2; v2[2] = 0; v2[3] = 6; v2[4] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.end(), v1.begin()), true);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin()), false);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v2.begin(), v2.end(), v2.begin()), true);
+  
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.begin() + 0, v1.begin()), true);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.begin() + 1, v1.begin()), true);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.begin() + 3, v2.begin()), true);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.begin() + 4, v2.begin()), false);
+  
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin(), thrust::less_equal<int>()), true);
+  ASSERT_EQUAL(thrust::equal(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin(), thrust::greater<int>()),    false);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestEqualCudaStreams);
+
diff --git a/thrust/testing/cuda/equal.mk b/thrust/testing/cuda/equal.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/equal.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/fill.cu b/thrust/testing/cuda/fill.cu
new file mode 100644
index 0000000000000000000000000000000000000000..17cf58c547fd8b8b21366667a10e4c9b3bf31a56
--- /dev/null
+++ b/thrust/testing/cuda/fill.cu
@@ -0,0 +1,220 @@
+#include <unittest/unittest.h>
+#include <thrust/fill.h>
+#include <thrust/execution_policy.h>
+#include <algorithm>
+
+template<typename ExecutionPolicy, typename Iterator, typename T>
+__global__
+void fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value)
+{
+  thrust::fill(exec, first, last, value);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestFillDevice(ExecutionPolicy exec, size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+  
+  thrust::fill(h_data.begin() + std::min((size_t)1, n), h_data.begin() + std::min((size_t)3, n), (T) 0);
+
+  fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)1, n), d_data.begin() + std::min((size_t)3, n), (T) 0);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  thrust::fill(h_data.begin() + std::min((size_t)117, n), h_data.begin() + std::min((size_t)367, n), (T) 1);
+
+  fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2);
+
+  fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3);
+
+  fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  thrust::fill(h_data.begin(), h_data.end(), (T) 4);
+
+  fill_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), (T) 4);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+}
+
+template<typename T>
+void TestFillDeviceSeq(size_t n)
+{
+  TestFillDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestFillDeviceSeq);
+
+template<typename T>
+void TestFillDeviceDevice(size_t n)
+{
+  TestFillDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestFillDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Size, typename T>
+__global__
+void fill_n_kernel(ExecutionPolicy exec, Iterator first, Size n, T value)
+{
+  thrust::fill_n(exec, first, n, value);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestFillNDevice(ExecutionPolicy exec, size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+  
+  size_t begin_offset = std::min<size_t>(1,n);
+
+  thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
+
+  fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
+  ASSERT_EQUAL(h_data, d_data);
+  
+  begin_offset = std::min<size_t>(117, n);
+
+  thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+
+  fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  begin_offset = std::min<size_t>(8, n);
+
+  thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+
+  fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  begin_offset = std::min<size_t>(3, n);
+
+  thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3);
+
+  fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+  
+  thrust::fill_n(h_data.begin(), h_data.size(), (T) 4);
+
+  fill_n_kernel<<<1,1>>>(exec, d_data.begin(), d_data.size(), (T) 4);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_data, d_data);
+}
+
+template<typename T>
+void TestFillNDeviceSeq(size_t n)
+{
+  TestFillNDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestFillNDeviceSeq);
+
+template<typename T>
+void TestFillNDeviceDevice(size_t n)
+{
+  TestFillNDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestFillNDeviceDevice);
+
+void TestFillCudaStreams()
+{
+  thrust::device_vector<int> v(5);
+  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::fill(thrust::cuda::par.on(s), v.begin() + 1, v.begin() + 4, 7);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 7);
+  ASSERT_EQUAL(v[2], 7);
+  ASSERT_EQUAL(v[3], 7);
+  ASSERT_EQUAL(v[4], 4);
+  
+  thrust::fill(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 3, 8);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], 8);
+  ASSERT_EQUAL(v[1], 8);
+  ASSERT_EQUAL(v[2], 8);
+  ASSERT_EQUAL(v[3], 7);
+  ASSERT_EQUAL(v[4], 4);
+  
+  thrust::fill(thrust::cuda::par.on(s), v.begin() + 2, v.end(), 9);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], 8);
+  ASSERT_EQUAL(v[1], 8);
+  ASSERT_EQUAL(v[2], 9);
+  ASSERT_EQUAL(v[3], 9);
+  ASSERT_EQUAL(v[4], 9);
+  
+  thrust::fill(thrust::cuda::par.on(s), v.begin(), v.end(), 1);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], 1);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 1);
+  ASSERT_EQUAL(v[3], 1);
+  ASSERT_EQUAL(v[4], 1);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestFillCudaStreams);
+
diff --git a/thrust/testing/cuda/fill.mk b/thrust/testing/cuda/fill.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/find.cu b/thrust/testing/cuda/find.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4fe6f4dca60d3fe5436542b86641503d15f0552e
--- /dev/null
+++ b/thrust/testing/cuda/find.cu
@@ -0,0 +1,246 @@
+#include <unittest/unittest.h>
+#include <thrust/find.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename T>
+struct equal_to_value_pred
+{
+    T value;
+
+    equal_to_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v == value; }
+};
+
+
+template<typename T>
+struct not_equal_to_value_pred
+{
+    T value;
+
+    not_equal_to_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v != value; }
+};
+
+
+template<typename T>
+struct less_than_value_pred
+{
+    T value;
+
+    less_than_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v < value; }
+};
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
+__global__ void find_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
+{
+  *result = thrust::find(exec, first, last, value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestFindDevice(ExecutionPolicy exec)
+{
+  size_t n = 100;
+
+  thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  typename thrust::host_vector<int>::iterator   h_iter;
+  
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+  thrust::device_vector<iter_type> d_result(1);
+  
+  h_iter = thrust::find(h_data.begin(), h_data.end(), int(0));
+
+  find_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), int(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  
+  for(size_t i = 1; i < n; i *= 2)
+  {
+    int sample = h_data[i];
+
+    h_iter = thrust::find(h_data.begin(), h_data.end(), sample);
+
+    find_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), sample, d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  }
+}
+
+
+void TestFindDeviceSeq()
+{
+  TestFindDevice(thrust::seq);
+};
+DECLARE_UNITTEST(TestFindDeviceSeq);
+
+
+void TestFindDeviceDevice()
+{
+  TestFindDevice(thrust::device);
+};
+DECLARE_UNITTEST(TestFindDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
+__global__ void find_if_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::find_if(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestFindIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 100;
+
+  thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  typename thrust::host_vector<int>::iterator   h_iter;
+  
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+  thrust::device_vector<iter_type> d_result(1);
+  
+  h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<int>(0));
+
+  find_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), equal_to_value_pred<int>(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  
+  for (size_t i = 1; i < n; i *= 2)
+  {
+    int sample = h_data[i];
+
+    h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<int>(sample));
+
+    find_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), equal_to_value_pred<int>(sample), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  }
+}
+
+
+void TestFindIfDeviceSeq()
+{
+  TestFindIfDevice(thrust::seq);
+};
+DECLARE_UNITTEST(TestFindIfDeviceSeq);
+
+
+void TestFindIfDeviceDevice()
+{
+  TestFindIfDevice(thrust::device);
+};
+DECLARE_UNITTEST(TestFindIfDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
+__global__ void find_if_not_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::find_if_not(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestFindIfNotDevice(ExecutionPolicy exec)
+{
+  size_t n = 100;
+  thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  typename thrust::host_vector<int>::iterator   h_iter;
+  
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+  thrust::device_vector<iter_type> d_result(1);
+  
+  h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<int>(0));
+
+  find_if_not_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), not_equal_to_value_pred<int>(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  
+  for(size_t i = 1; i < n; i *= 2)
+  {
+    int sample = h_data[i];
+
+    h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<int>(sample));
+
+    find_if_not_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), not_equal_to_value_pred<int>(sample), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
+    ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+  }
+}
+
+
+void TestFindIfNotDeviceSeq()
+{
+  TestFindIfNotDevice(thrust::seq);
+};
+DECLARE_UNITTEST(TestFindIfNotDeviceSeq);
+
+
+void TestFindIfNotDeviceDevice()
+{
+  TestFindIfNotDevice(thrust::device);
+};
+DECLARE_UNITTEST(TestFindIfNotDeviceDevice);
+
+
+void TestFindCudaStreams()
+{
+  thrust::device_vector<int> vec(5);
+  vec[0] = 1;
+  vec[1] = 2;
+  vec[2] = 3;
+  vec[3] = 3;
+  vec[4] = 5;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 0) - vec.begin(), 5);
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 1) - vec.begin(), 0);
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 2) - vec.begin(), 1);
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 3) - vec.begin(), 2);
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 4) - vec.begin(), 5);
+  ASSERT_EQUAL(thrust::find(thrust::cuda::par.on(s), vec.begin(), vec.end(), 5) - vec.begin(), 4);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestFindCudaStreams);
+
diff --git a/thrust/testing/cuda/find.mk b/thrust/testing/cuda/find.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/find.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/for_each.cu b/thrust/testing/cuda/for_each.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be6a7738c58d35ab5a9109991011cc7fac75fd1a
--- /dev/null
+++ b/thrust/testing/cuda/for_each.cu
@@ -0,0 +1,235 @@
+#include <unittest/unittest.h>
+#include <thrust/for_each.h>
+#include <thrust/execution_policy.h>
+#include <algorithm>
+
+static const size_t NUM_REGISTERS = 64;
+
+template <size_t N> __host__ __device__ void f   (int * x) { int temp = *x; f<N - 1>(x + 1); *x = temp;};
+template <>         __host__ __device__ void f<0>(int * /*x*/) { }
+template <size_t N>
+struct CopyFunctorWithManyRegisters
+{
+  __host__ __device__
+  void operator()(int * ptr)
+  {
+      f<N>(ptr);
+  }
+};
+
+
+void TestForEachLargeRegisterFootprint()
+{
+  int current_device = -1;
+  cudaGetDevice(&current_device);
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, current_device);
+
+  thrust::device_vector<int> data(NUM_REGISTERS, 12345);
+
+  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant
+  
+  thrust::for_each(input.begin(), input.end(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
+}
+DECLARE_UNITTEST(TestForEachLargeRegisterFootprint);
+
+
+void TestForEachNLargeRegisterFootprint()
+{
+  int current_device = -1;
+  cudaGetDevice(&current_device);
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, current_device);
+
+  thrust::device_vector<int> data(NUM_REGISTERS, 12345);
+
+  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant
+  
+  thrust::for_each_n(input.begin(), input.size(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
+}
+DECLARE_UNITTEST(TestForEachNLargeRegisterFootprint);
+
+
+template <typename T>
+struct mark_present_for_each
+{
+  T * ptr;
+  __host__ __device__ void
+  operator()(T x){ ptr[(int) x] = 1; }
+};
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function>
+__global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
+{
+  thrust::for_each(exec, first, last, f);
+}
+
+
+template<typename T>
+void TestForEachDeviceSeq(const size_t n)
+{
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_input[i] =  ((size_t) h_input[i]) % output_size;
+  
+  thrust::device_vector<T> d_input = h_input;
+  
+  thrust::host_vector<T>   h_output(output_size, (T) 0);
+  thrust::device_vector<T> d_output(output_size, (T) 0);
+  
+  mark_present_for_each<T> h_f;
+  mark_present_for_each<T> d_f;
+  h_f.ptr = &h_output[0];
+  d_f.ptr = (&d_output[0]).get();
+  
+  thrust::for_each(h_input.begin(), h_input.end(), h_f);
+  
+  for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+ 
+  ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq);
+
+
+template<typename T>
+void TestForEachDeviceDevice(const size_t n)
+{
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_input[i] = ((size_t) h_input[i]) % output_size;
+  
+  thrust::device_vector<T> d_input = h_input;
+  
+  thrust::host_vector<T>   h_output(output_size, (T) 0);
+  thrust::device_vector<T> d_output(output_size, (T) 0);
+  
+  mark_present_for_each<T> h_f;
+  mark_present_for_each<T> d_f;
+  h_f.ptr = &h_output[0];
+  d_f.ptr = (&d_output[0]).get();
+  
+  thrust::for_each(h_input.begin(), h_input.end(), h_f);
+  
+  for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f);
+  {
+    cudaError_t const err = cudaGetLastError();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
+  ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
+__global__
+void for_each_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
+{
+  thrust::for_each_n(exec, first, n, f);
+}
+
+
+template<typename T>
+void TestForEachNDeviceSeq(const size_t n)
+{
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_input[i] =  static_cast<T>(((size_t) h_input[i]) % output_size);
+  
+  thrust::device_vector<T> d_input = h_input;
+  
+  thrust::host_vector<T>   h_output(output_size, (T) 0);
+  thrust::device_vector<T> d_output(output_size, (T) 0);
+  
+  mark_present_for_each<T> h_f;
+  mark_present_for_each<T> d_f;
+  h_f.ptr = &h_output[0];
+  d_f.ptr = (&d_output[0]).get();
+  
+  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);
+  
+  for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceSeq);
+
+
+template<typename T>
+void TestForEachNDeviceDevice(const size_t n)
+{
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_input[i] =  static_cast<T>(((size_t) h_input[i]) % output_size);
+  
+  thrust::device_vector<T> d_input = h_input;
+  
+  thrust::host_vector<T>   h_output(output_size, (T) 0);
+  thrust::device_vector<T> d_output(output_size, (T) 0);
+  
+  mark_present_for_each<T> h_f;
+  mark_present_for_each<T> d_f;
+  h_f.ptr = &h_output[0];
+  d_f.ptr = (&d_output[0]).get();
+  
+  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);
+  
+  for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);
+
+
+void TestForEachCudaStreams()
+{
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::device_vector<int> input(5);
+  thrust::device_vector<int> output(7, 0);
+  
+  input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;
+  
+  mark_present_for_each<int> f;
+  f.ptr = thrust::raw_pointer_cast(output.data());
+  
+  thrust::for_each(thrust::cuda::par.on(s), input.begin(), input.end(), f);
+
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(output[0], 0);
+  ASSERT_EQUAL(output[1], 0);
+  ASSERT_EQUAL(output[2], 1);
+  ASSERT_EQUAL(output[3], 1);
+  ASSERT_EQUAL(output[4], 1);
+  ASSERT_EQUAL(output[5], 0);
+  ASSERT_EQUAL(output[6], 1);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestForEachCudaStreams);
+
diff --git a/thrust/testing/cuda/for_each.mk b/thrust/testing/cuda/for_each.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/for_each.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/gather.cu b/thrust/testing/cuda/gather.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a9a8c9333e1a3aeb2747941a738daa230394bb50
--- /dev/null
+++ b/thrust/testing/cuda/gather.cu
@@ -0,0 +1,188 @@
+#include <unittest/unittest.h>
+#include <thrust/gather.h>
+#include <thrust/execution_policy.h>
+#include <algorithm>
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void gather_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 elements_first, Iterator3 result)
+{
+  thrust::gather(exec, map_first, map_last, elements_first, result);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestGatherDevice(ExecutionPolicy exec, const size_t n)
+{
+  const size_t source_size = std::min((size_t) 10, 2 * n);
+  
+  // source vectors to gather from
+  thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+  thrust::device_vector<T> d_source = h_source;
+  
+  // gather indices
+  thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_map[i] =  h_map[i] % source_size;
+  
+  thrust::device_vector<unsigned int> d_map = h_map;
+  
+  // gather destination
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+  
+  thrust::gather(h_map.begin(), h_map.end(), h_source.begin(), h_output.begin());
+
+  gather_kernel<<<1,1>>>(exec, d_map.begin(), d_map.end(), d_source.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+
+template<typename T>
+void TestGatherDeviceSeq(const size_t n)
+{
+  TestGatherDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherDeviceSeq);
+
+template<typename T>
+void TestGatherDeviceDevice(const size_t n)
+{
+  TestGatherDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherDeviceDevice);
+
+
+void TestGatherCudaStreams()
+{
+  thrust::device_vector<int> map(5);  // gather indices
+  thrust::device_vector<int> src(8);  // source vector
+  thrust::device_vector<int> dst(5);  // destination vector
+  
+  map[0] = 6; map[1] = 2; map[2] = 1; map[3] = 7; map[4] = 2;
+  src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4; src[5] = 5; src[6] = 6; src[7] = 7;
+  dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::gather(thrust::cuda::par.on(s), map.begin(), map.end(), src.begin(), dst.begin());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(dst[0], 6);
+  ASSERT_EQUAL(dst[1], 2);
+  ASSERT_EQUAL(dst[2], 1);
+  ASSERT_EQUAL(dst[3], 7);
+  ASSERT_EQUAL(dst[4], 2);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestGatherCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate>
+__global__
+void gather_if_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 stencil_first, Iterator3 elements_first, Iterator4 result, Predicate pred)
+{
+  thrust::gather_if(exec, map_first, map_last, stencil_first, elements_first, result, pred);
+}
+
+
+template<typename T>
+struct is_even_gather_if
+{
+  __host__ __device__
+  bool operator()(const T i) const
+  { 
+    return (i % 2) == 0;
+  }
+};
+
+
+template<typename T, typename ExecutionPolicy>
+void TestGatherIfDevice(ExecutionPolicy exec, const size_t n)
+{
+  const size_t source_size = std::min((size_t) 10, 2 * n);
+  
+  // source vectors to gather from
+  thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+  thrust::device_vector<T> d_source = h_source;
+  
+  // gather indices
+  thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+  
+  for(size_t i = 0; i < n; i++)
+      h_map[i] = h_map[i] % source_size;
+  
+  thrust::device_vector<unsigned int> d_map = h_map;
+  
+  // gather stencil
+  thrust::host_vector<unsigned int> h_stencil = unittest::random_integers<unsigned int>(n);
+  
+  for(size_t i = 0; i < n; i++)
+    h_stencil[i] = h_stencil[i] % 2;
+  
+  thrust::device_vector<unsigned int> d_stencil = h_stencil;
+  
+  // gather destination
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+  
+  thrust::gather_if(h_map.begin(), h_map.end(), h_stencil.begin(), h_source.begin(), h_output.begin(), is_even_gather_if<unsigned int>());
+
+  gather_if_kernel<<<1,1>>>(exec, d_map.begin(), d_map.end(), d_stencil.begin(), d_source.begin(), d_output.begin(), is_even_gather_if<unsigned int>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+
+template<typename T>
+void TestGatherIfDeviceSeq(const size_t n)
+{
+  TestGatherIfDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherIfDeviceSeq);
+
+template<typename T>
+void TestGatherIfDeviceDevice(const size_t n)
+{
+  TestGatherIfDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherIfDeviceDevice);
+
+void TestGatherIfCudaStreams(void)
+{
+  thrust::device_vector<int> flg(5);  // predicate array
+  thrust::device_vector<int> map(5);  // gather indices
+  thrust::device_vector<int> src(8);  // source vector
+  thrust::device_vector<int> dst(5);  // destination vector
+  
+  flg[0] = 0; flg[1] = 1; flg[2] = 0; flg[3] = 1; flg[4] = 0;
+  map[0] = 6; map[1] = 2; map[2] = 1; map[3] = 7; map[4] = 2;
+  src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4; src[5] = 5; src[6] = 6; src[7] = 7;
+  dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::gather_if(thrust::cuda::par.on(s), map.begin(), map.end(), flg.begin(), src.begin(), dst.begin());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(dst[0], 0);
+  ASSERT_EQUAL(dst[1], 2);
+  ASSERT_EQUAL(dst[2], 0);
+  ASSERT_EQUAL(dst[3], 7);
+  ASSERT_EQUAL(dst[4], 0);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestGatherIfCudaStreams);
+
diff --git a/thrust/testing/cuda/gather.mk b/thrust/testing/cuda/gather.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/gather.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/generate.cu b/thrust/testing/cuda/generate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c495e55639cd213c71cc4a60a179d4ebbc83a3a2
--- /dev/null
+++ b/thrust/testing/cuda/generate.cu
@@ -0,0 +1,157 @@
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function>
+__global__
+void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
+{
+  thrust::generate(exec, first, last, f);
+}
+
+
+template<typename T>
+struct return_value
+{
+  T val;
+  
+  return_value(void){}
+  return_value(T v):val(v){}
+  
+  __host__ __device__
+  T operator()(void){ return val; }
+};
+
+
+template<typename T, typename ExecutionPolicy>
+void TestGenerateDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+  
+  T value = 13;
+  return_value<T> f(value);
+  
+  thrust::generate(h_result.begin(), h_result.end(), f);
+
+  generate_kernel<<<1,1>>>(exec, d_result.begin(), d_result.end(), f);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_result, d_result);
+}
+
+
+template<typename T>
+void TestGenerateDeviceSeq(const size_t n)
+{
+  TestGenerateDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateDeviceSeq);
+
+
+template<typename T>
+void TestGenerateDeviceDevice(const size_t n)
+{
+  TestGenerateDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateDeviceDevice);
+
+
+void TestGenerateCudaStreams()
+{
+  thrust::device_vector<int> result(5);
+  
+  int value = 13;
+  
+  return_value<int> f(value);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::generate(thrust::cuda::par.on(s), result.begin(), result.end(), f);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(result[0], value);
+  ASSERT_EQUAL(result[1], value);
+  ASSERT_EQUAL(result[2], value);
+  ASSERT_EQUAL(result[3], value);
+  ASSERT_EQUAL(result[4], value);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestGenerateCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
+__global__
+void generate_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
+{
+  thrust::generate_n(exec, first, n, f);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestGenerateNDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+  
+  T value = 13;
+  return_value<T> f(value);
+  
+  thrust::generate_n(h_result.begin(), h_result.size(), f);
+
+  generate_n_kernel<<<1,1>>>(exec, d_result.begin(), d_result.size(), f);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(h_result, d_result);
+}
+
+
+template<typename T>
+void TestGenerateNDeviceSeq(const size_t n)
+{
+  TestGenerateNDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateNDeviceSeq);
+
+
+template<typename T>
+void TestGenerateNDeviceDevice(const size_t n)
+{
+  TestGenerateNDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateNDeviceDevice);
+
+
+void TestGenerateNCudaStreams()
+{
+  thrust::device_vector<int> result(5);
+  
+  int value = 13;
+  
+  return_value<int> f(value);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::generate_n(thrust::cuda::par.on(s), result.begin(), result.size(), f);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(result[0], value);
+  ASSERT_EQUAL(result[1], value);
+  ASSERT_EQUAL(result[2], value);
+  ASSERT_EQUAL(result[3], value);
+  ASSERT_EQUAL(result[4], value);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestGenerateNCudaStreams);
+
diff --git a/thrust/testing/cuda/generate.mk b/thrust/testing/cuda/generate.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/generate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/inner_product.cu b/thrust/testing/cuda/inner_product.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3dbb1150c65bd1109b003e85167db605b7e40f43
--- /dev/null
+++ b/thrust/testing/cuda/inner_product.cu
@@ -0,0 +1,72 @@
+#include <unittest/unittest.h>
+#include <thrust/inner_product.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T, typename Iterator3>
+__global__
+void inner_product_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, T init, Iterator3 result)
+{
+  *result = thrust::inner_product(exec, first1, last1, first2, init);
+}
+
+
+template<typename ExecutionPolicy>
+void TestInnerProductDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  thrust::host_vector<int> h_v1 = unittest::random_integers<int>(n);
+  thrust::host_vector<int> h_v2 = unittest::random_integers<int>(n);
+  
+  thrust::device_vector<int> d_v1 = h_v1;
+  thrust::device_vector<int> d_v2 = h_v2;
+  
+  thrust::device_vector<int> result(1);
+  
+  int init = 13;
+  
+  int expected = thrust::inner_product(h_v1.begin(), h_v1.end(), h_v2.begin(), init);
+
+  inner_product_kernel<<<1,1>>>(exec, d_v1.begin(), d_v1.end(), d_v2.begin(), init, result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(expected, result[0]);
+}
+
+
+void TestInnerProductDeviceSeq()
+{
+  TestInnerProductDevice(thrust::seq);
+};
+DECLARE_UNITTEST(TestInnerProductDeviceSeq);
+
+
+void TestInnerProductDeviceDevice()
+{
+  TestInnerProductDevice(thrust::device);
+};
+DECLARE_UNITTEST(TestInnerProductDeviceDevice);
+
+
+void TestInnerProductCudaStreams()
+{
+  thrust::device_vector<int> v1(3);
+  thrust::device_vector<int> v2(3);
+  v1[0] =  1; v1[1] = -2; v1[2] =  3;
+  v2[0] = -4; v2[1] =  5; v2[2] =  6;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  int init = 3;
+  int result = thrust::inner_product(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin(), init);
+  ASSERT_EQUAL(result, 7);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestInnerProductCudaStreams);
+
diff --git a/thrust/testing/cuda/inner_product.mk b/thrust/testing/cuda/inner_product.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/inner_product.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/is_partitioned.cu b/thrust/testing/cuda/is_partitioned.cu
new file mode 100644
index 0000000000000000000000000000000000000000..70379793bfdcbeaa8eb1097ad4b919564736570b
--- /dev/null
+++ b/thrust/testing/cuda/is_partitioned.cu
@@ -0,0 +1,102 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
+__global__
+void is_partitioned_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::is_partitioned(exec, first, last, pred);
+}
+
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+
+template<typename ExecutionPolicy>
+void TestIsPartitionedDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  n = thrust::max<size_t>(n, 2);
+
+  thrust::device_vector<int> v = unittest::random_integers<int>(n);
+
+  thrust::device_vector<bool> result(1);
+
+  v[0] = 1;
+  v[1] = 0;
+
+  is_partitioned_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  thrust::partition(v.begin(), v.end(), is_even<int>());
+
+  is_partitioned_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+}
+
+
+void TestIsPartitionedDeviceSeq()
+{
+  TestIsPartitionedDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestIsPartitionedDeviceSeq);
+
+
+void TestIsPartitionedDeviceDevice()
+{
+  TestIsPartitionedDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestIsPartitionedDeviceDevice);
+
+
+void TestIsPartitionedCudaStreams()
+{
+  thrust::device_vector<int> v(4);
+  v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  // empty partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin(), v.begin(), thrust::identity<int>()));
+
+  // one element true partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin(), v.begin() + 1, thrust::identity<int>()));
+
+  // just true partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin(), v.begin() + 2, thrust::identity<int>()));
+
+  // both true & false partitions
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<int>()));
+
+  // one element false partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin() + 3, v.end(), thrust::identity<int>()));
+
+  v[0] = 1; v[1] = 0; v[2] = 1; v[3] = 1;
+
+  // not partitioned
+  ASSERT_EQUAL_QUIET(false, thrust::is_partitioned(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<int>()));
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestIsPartitionedCudaStreams);
+
diff --git a/thrust/testing/cuda/is_partitioned.mk b/thrust/testing/cuda/is_partitioned.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/is_partitioned.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/is_sorted.cu b/thrust/testing/cuda/is_sorted.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c6e11f6fce6412af03c7e96c84951339691a03ea
--- /dev/null
+++ b/thrust/testing/cuda/is_sorted.cu
@@ -0,0 +1,92 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
+__global__
+void is_sorted_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
+{
+  *result = thrust::is_sorted(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy>
+void TestIsSortedDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  thrust::device_vector<int> v = unittest::random_integers<int>(n);
+
+  thrust::device_vector<bool> result(1);
+
+  v[0] = 1;
+  v[1] = 0;
+
+  is_sorted_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  thrust::sort(v.begin(), v.end());
+
+  is_sorted_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+}
+
+void TestIsSortedDeviceSeq()
+{
+  TestIsSortedDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestIsSortedDeviceSeq);
+
+
+void TestIsSortedDeviceDevice()
+{
+  TestIsSortedDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestIsSortedDeviceDevice);
+
+
+void TestIsSortedCudaStreams()
+{
+  thrust::device_vector<int> v(4);
+  v[0] = 0; v[1] = 5; v[2] = 8; v[3] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 0), true);
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 1), true);
+  
+  // the following line crashes gcc 4.3
+#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 3)
+  // do nothing
+#else
+  // compile this line on other compilers
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 2), true);
+#endif // GCC
+
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 3), true);
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 4), false);
+  
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 3, thrust::less<int>()),    true);
+  
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 1, thrust::greater<int>()), true);
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.begin() + 4, thrust::greater<int>()), false);
+  
+  ASSERT_EQUAL(thrust::is_sorted(thrust::cuda::par.on(s), v.begin(), v.end()), false);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestIsSortedCudaStreams);
+
diff --git a/thrust/testing/cuda/is_sorted.mk b/thrust/testing/cuda/is_sorted.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/is_sorted.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/is_sorted_until.cu b/thrust/testing/cuda/is_sorted_until.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d84f09fcad524e96ad0bce3c2af590132a4df276
--- /dev/null
+++ b/thrust/testing/cuda/is_sorted_until.cu
@@ -0,0 +1,121 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void is_sorted_until_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  *result = thrust::is_sorted_until(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy>
+void TestIsSortedUntilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  thrust::device_vector<int> v = unittest::random_integers<int>(n);
+
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+
+  thrust::device_vector<iter_type> result(1);
+
+  v[0] = 1;
+  v[1] = 0;
+  
+  is_sorted_until_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL_QUIET(v.begin() + 1, (iter_type)result[0]);
+  
+  thrust::sort(v.begin(), v.end());
+  
+  is_sorted_until_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL_QUIET(v.end(), (iter_type)result[0]);
+}
+
+
+void TestIsSortedUntilDeviceSeq()
+{
+  TestIsSortedUntilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestIsSortedUntilDeviceSeq);
+
+
+void TestIsSortedUntilDeviceDevice()
+{
+  TestIsSortedUntilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestIsSortedUntilDeviceDevice);
+
+
+void TestIsSortedUntilCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  typedef Vector::value_type T;
+  typedef Vector::iterator Iterator;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector v(4);
+  v[0] = 0; v[1] = 5; v[2] = 8; v[3] = 0;
+
+  Iterator first = v.begin();
+
+  Iterator last  = v.begin() + 0;
+  Iterator ref = last;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last));
+
+  last = v.begin() + 1;
+  ref = last;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last));
+
+  last = v.begin() + 2;
+  ref = last;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last));
+
+  last = v.begin() + 3;
+  ref = v.begin() + 3;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last));
+
+  last = v.begin() + 4;
+  ref = v.begin() + 3;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last));
+
+  last = v.begin() + 3;
+  ref = v.begin() + 3;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last, thrust::less<T>()));
+
+  last = v.begin() + 4;
+  ref = v.begin() + 3;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last, thrust::less<T>()));
+
+  last = v.begin() + 1;
+  ref = v.begin() + 1;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last, thrust::greater<T>()));
+
+  last = v.begin() + 4;
+  ref = v.begin() + 1;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last, thrust::greater<T>()));
+
+  first = v.begin() + 2;
+  last = v.begin() + 4;
+  ref = v.begin() + 4;
+  ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(thrust::cuda::par.on(s), first, last, thrust::greater<T>()));
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestIsSortedUntilCudaStreams);
+
diff --git a/thrust/testing/cuda/is_sorted_until.mk b/thrust/testing/cuda/is_sorted_until.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/is_sorted_until.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/logical.cu b/thrust/testing/cuda/logical.cu
new file mode 100644
index 0000000000000000000000000000000000000000..61e7dc49a684a3136210e101a1407926603183c6
--- /dev/null
+++ b/thrust/testing/cuda/logical.cu
@@ -0,0 +1,328 @@
+#include <unittest/unittest.h>
+#include <thrust/logical.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
+__global__
+void all_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
+{
+  *result = thrust::all_of(exec, first, last, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestAllOfDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  thrust::device_vector<T> v(3, 1);
+  thrust::device_vector<bool> result(1);
+  
+  all_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+  
+  v[1] = 0;
+  
+  all_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+  
+  all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+
+  all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+
+  all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  all_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+}
+
+
+void TestAllOfDeviceSeq()
+{
+  TestAllOfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestAllOfDeviceSeq);
+
+
+void TestAllOfDeviceDevice()
+{
+  TestAllOfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestAllOfDeviceDevice);
+
+
+void TestAllOfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(3, 1);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), true);
+  
+  v[1] = 0;
+  
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), false);
+  
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 0, thrust::identity<T>()), true);
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 1, thrust::identity<T>()), true);
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 2, thrust::identity<T>()), false);
+  ASSERT_EQUAL(thrust::all_of(thrust::cuda::par.on(s), v.begin() + 1, v.begin() + 2, thrust::identity<T>()), false);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestAllOfCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
+__global__
+void any_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
+{
+  *result = thrust::any_of(exec, first, last, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestAnyOfDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  
+  thrust::device_vector<T> v(3, 1);
+  thrust::device_vector<bool> result(1);
+  
+  any_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+  
+  v[1] = 0;
+  
+  any_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+  
+  any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+
+  any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+
+  any_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+}
+
+
+void TestAnyOfDeviceSeq()
+{
+  TestAnyOfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestAnyOfDeviceSeq);
+
+
+void TestAnyOfDeviceDevice()
+{
+  TestAnyOfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestAnyOfDeviceDevice);
+
+
+void TestAnyOfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector v(3, 1);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), true);
+
+  v[1] = 0;
+  
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), true);
+
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 0, thrust::identity<T>()), false);
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 1, thrust::identity<T>()), true);
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 2, thrust::identity<T>()), true);
+  ASSERT_EQUAL(thrust::any_of(thrust::cuda::par.on(s), v.begin() + 1, v.begin() + 2, thrust::identity<T>()), false);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestAnyOfCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
+__global__
+void none_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
+{
+  *result = thrust::none_of(exec, first, last, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestNoneOfDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  
+  thrust::device_vector<T> v(3, 1);
+  thrust::device_vector<bool> result(1);
+  
+  none_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+  
+  v[1] = 0;
+  
+  none_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+  
+  none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+
+  none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(false, result[0]);
+
+  none_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(true, result[0]);
+}
+
+
+void TestNoneOfDeviceSeq()
+{
+  TestNoneOfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestNoneOfDeviceSeq);
+
+
+void TestNoneOfDeviceDevice()
+{
+  TestNoneOfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestNoneOfDeviceDevice);
+
+
+void TestNoneOfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector v(3, 1);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), false);
+
+  v[1] = 0;
+  
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>()), false);
+
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 0, thrust::identity<T>()), true);
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 1, thrust::identity<T>()), false);
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin() + 0, v.begin() + 2, thrust::identity<T>()), false);
+  ASSERT_EQUAL(thrust::none_of(thrust::cuda::par.on(s), v.begin() + 1, v.begin() + 2, thrust::identity<T>()), true);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestNoneOfCudaStreams);
+
diff --git a/thrust/testing/cuda/logical.mk b/thrust/testing/cuda/logical.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/logical.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/managed_memory_pointer.cu b/thrust/testing/cuda/managed_memory_pointer.cu
new file mode 100644
index 0000000000000000000000000000000000000000..46a2191fab59596a96d842a9960c596508dbb5f9
--- /dev/null
+++ b/thrust/testing/cuda/managed_memory_pointer.cu
@@ -0,0 +1,141 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#  include <unittest/unittest.h>
+
+#  include <thrust/allocate_unique.h>
+#  include <thrust/memory/detail/device_system_resource.h>
+#  include <thrust/mr/allocator.h>
+#  include <thrust/type_traits/is_contiguous_iterator.h>
+
+#  include <numeric>
+#  include <vector>
+
+namespace
+{
+
+template <typename T>
+using allocator =
+  thrust::mr::stateless_resource_allocator<T, thrust::universal_memory_resource>;
+
+// The managed_memory_pointer class should be identified as a
+// contiguous_iterator
+THRUST_STATIC_ASSERT(
+  thrust::is_contiguous_iterator<allocator<int>::pointer>::value);
+
+template <typename T>
+struct some_object {
+  some_object(T data)
+      : m_data(data)
+  {}
+
+  void setter(T data) { m_data = data; }
+  T getter() const { return m_data; }
+
+private:
+  T m_data;
+};
+
+} // namespace
+
+template <typename T>
+void TestAllocateUnique()
+{
+  // Simple test to ensure that pointers created with universal_memory_resource
+  // can be dereferenced and used with STL code. This is necessary as some
+  // STL implementations break when using fancy references that overload
+  // `operator&`, so universal_memory_resource uses a special pointer type that
+  // returns regular C++ references that can be safely used host-side.
+
+  // These operations fail to compile with fancy references:
+  auto pRaw = thrust::allocate_unique<T>(allocator<T>{}, 42);
+  auto pObj =
+    thrust::allocate_unique<some_object<T> >(allocator<some_object<T> >{}, 42);
+
+  static_assert(
+    std::is_same<decltype(pRaw.get()),
+                 thrust::system::cuda::detail::managed_memory_pointer<T> >::value,
+    "Unexpected pointer returned from unique_ptr::get.");
+  static_assert(
+    std::is_same<decltype(pObj.get()),
+                 thrust::system::cuda::detail::managed_memory_pointer<
+                   some_object<T> > >::value,
+    "Unexpected pointer returned from unique_ptr::get.");
+
+  ASSERT_EQUAL(*pRaw, T(42));
+  ASSERT_EQUAL(*pRaw.get(), T(42));
+  ASSERT_EQUAL(pObj->getter(), T(42));
+  ASSERT_EQUAL((*pObj).getter(), T(42));
+  ASSERT_EQUAL(pObj.get()->getter(), T(42));
+  ASSERT_EQUAL((*pObj.get()).getter(), T(42));
+}
+DECLARE_GENERIC_UNITTEST(TestAllocateUnique);
+
+template <typename T>
+void TestIterationRaw()
+{
+  auto array = thrust::allocate_unique_n<T>(allocator<T>{}, 6, 42);
+
+  static_assert(
+    std::is_same<decltype(array.get()),
+                 thrust::system::cuda::detail::managed_memory_pointer<T> >::value,
+    "Unexpected pointer returned from unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(*iter, T(42));
+    ASSERT_EQUAL(*iter.get(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestIterationRaw);
+
+template <typename T>
+void TestIterationObj()
+{
+  auto array =
+    thrust::allocate_unique_n<some_object<T> >(allocator<some_object<T> >{},
+                                               6,
+                                               42);
+
+  static_assert(
+    std::is_same<decltype(array.get()),
+                 thrust::system::cuda::detail::managed_memory_pointer<
+                   some_object<T> > >::value,
+    "Unexpected pointer returned from unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(iter->getter(), T(42));
+    ASSERT_EQUAL((*iter).getter(), T(42));
+    ASSERT_EQUAL(iter.get()->getter(), T(42));
+    ASSERT_EQUAL((*iter.get()).getter(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestIterationObj);
+
+template <typename T>
+void TestStdVector()
+{
+  // Verify that a std::vector using the universal allocator will work with
+  // STL algorithms.
+  std::vector<T, allocator<T> > v0;
+
+  static_assert(
+    std::is_same<typename std::decay<decltype(v0)>::type::pointer,
+                 thrust::system::cuda::detail::managed_memory_pointer<
+                   T > >::value,
+    "Unexpected pointer returned from unique_ptr::get.");
+
+  v0.resize(6);
+  std::iota(v0.begin(), v0.end(), 0);
+  ASSERT_EQUAL(v0[0], T(0));
+  ASSERT_EQUAL(v0[1], T(1));
+  ASSERT_EQUAL(v0[2], T(2));
+  ASSERT_EQUAL(v0[3], T(3));
+  ASSERT_EQUAL(v0[4], T(4));
+  ASSERT_EQUAL(v0[5], T(5));
+}
+DECLARE_GENERIC_UNITTEST(TestStdVector);
+
+#endif // C++11
diff --git a/thrust/testing/cuda/managed_memory_pointer.mk b/thrust/testing/cuda/managed_memory_pointer.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/managed_memory_pointer.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/max_element.cu b/thrust/testing/cuda/max_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a18d9656ac40ebc6cc12f9204c959a32400bf343
--- /dev/null
+++ b/thrust/testing/cuda/max_element.cu
@@ -0,0 +1,114 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
+__global__
+void max_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
+{
+  *result = thrust::max_element(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator, typename BinaryPredicate, typename Iterator2>
+__global__
+void max_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::max_element(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMaxElementDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int> h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+
+  thrust::device_vector<iter_type> d_result(1);
+  
+  typename thrust::host_vector<int>::iterator   h_max = thrust::max_element(h_data.begin(), h_data.end());
+
+  max_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_max - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+
+  
+  typename thrust::host_vector<int>::iterator   h_min = thrust::max_element(h_data.begin(), h_data.end(), thrust::greater<int>());
+
+  max_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_min - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+}
+
+
+void TestMaxElementDeviceSeq()
+{
+  TestMaxElementDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMaxElementDeviceSeq);
+
+
+void TestMaxElementDeviceDevice()
+{
+  TestMaxElementDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMaxElementDeviceDevice);
+
+
+void TestMaxElementCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5);
+  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1);
+  
+  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()), 1);
+  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMaxElementCudaStreams);
+
+void TestMaxElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::max_element(thrust::device, raw_ptr, raw_ptr+n) - raw_ptr, 1);
+  ASSERT_EQUAL( thrust::max_element(thrust::device, raw_ptr, raw_ptr+n, thrust::greater<T>()) - raw_ptr, 2);
+}
+DECLARE_UNITTEST(TestMaxElementDevicePointer);
diff --git a/thrust/testing/cuda/max_element.mk b/thrust/testing/cuda/max_element.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/max_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/memory.cu b/thrust/testing/cuda/memory.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d71dfa926544d7f301c74f0e969234a6dea85017
--- /dev/null
+++ b/thrust/testing/cuda/memory.cu
@@ -0,0 +1,127 @@
+#include <unittest/unittest.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/system/cpp/memory.h>
+#include <thrust/memory.h>
+#include <thrust/execution_policy.h>
+#include <thrust/logical.h>
+
+
+template<typename T1, typename T2>
+bool are_same_type(const T1 &, const T2 &)
+{
+  return false;
+}
+
+
+template<typename T>
+bool are_same_type(const T &, const T &)
+{
+  return true;
+}
+
+
+void TestSelectSystemCudaToCpp()
+{
+  using thrust::system::detail::generic::select_system;
+
+  thrust::cuda::tag cuda_tag;
+  thrust::cpp::tag cpp_tag;
+  thrust::cuda_cub::cross_system<thrust::cuda::tag,thrust::cpp::tag> cuda_to_cpp(cuda_tag, cpp_tag);
+
+  // select_system(cuda::tag, thrust::host_system_tag) should return cuda_to_cpp
+  bool is_cuda_to_cpp = are_same_type(cuda_to_cpp, select_system(cuda_tag, cpp_tag));
+  ASSERT_EQUAL(true, is_cuda_to_cpp);
+}
+DECLARE_UNITTEST(TestSelectSystemCudaToCpp);
+
+
+template<typename Iterator>
+__global__ void get_temporary_buffer_kernel(size_t n, Iterator result)
+{
+  *result = thrust::get_temporary_buffer<int>(thrust::seq, n);
+}
+
+
+template<typename Pointer>
+__global__ void return_temporary_buffer_kernel(Pointer ptr, std::ptrdiff_t n)
+{
+  thrust::return_temporary_buffer(thrust::seq, ptr, n);
+}
+
+
+void TestGetTemporaryBufferDeviceSeq()
+{
+  const std::ptrdiff_t n = 9001;
+
+  typedef thrust::pointer<int, thrust::detail::seq_t> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> ptr_and_sz_type;
+  thrust::device_vector<ptr_and_sz_type> d_result(1);
+  
+  get_temporary_buffer_kernel<<<1,1>>>(n, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ptr_and_sz_type ptr_and_sz = d_result[0];
+
+  if(ptr_and_sz.second > 0)
+  {
+    ASSERT_EQUAL(ptr_and_sz.second, n);
+
+    const int ref_val = 13;
+    thrust::device_vector<int> ref(n, ref_val);
+
+    thrust::fill_n(thrust::device, ptr_and_sz.first, n, ref_val);
+
+    ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
+
+    return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first, ptr_and_sz.second);
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+}
+DECLARE_UNITTEST(TestGetTemporaryBufferDeviceSeq);
+
+
+template<typename Iterator>
+__global__ void malloc_kernel(size_t n, Iterator result)
+{
+  *result = static_cast<int*>(thrust::malloc(thrust::seq, sizeof(int) * n).get());
+}
+
+
+template<typename Pointer>
+__global__ void free_kernel(Pointer ptr)
+{
+  thrust::free(thrust::seq, ptr);
+}
+
+
+void TestMallocDeviceSeq()
+{
+  const std::ptrdiff_t n = 9001;
+
+  typedef thrust::pointer<int, thrust::detail::seq_t> pointer;
+  thrust::device_vector<pointer> d_result(1);
+  
+  malloc_kernel<<<1,1>>>(n, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  pointer ptr = d_result[0];
+
+  if(ptr.get() != 0)
+  {
+    const int ref_val = 13;
+    thrust::device_vector<int> ref(n, ref_val);
+
+    thrust::fill_n(thrust::device, ptr, n, ref_val);
+
+    ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr, ptr + n, thrust::placeholders::_1 == ref_val));
+
+    free_kernel<<<1,1>>>(ptr);
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+}
+DECLARE_UNITTEST(TestMallocDeviceSeq);
+
diff --git a/thrust/testing/cuda/memory.mk b/thrust/testing/cuda/memory.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/memory.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/merge.cu b/thrust/testing/cuda/merge.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e13b9d3ab42f75c9a9a850975f236b4d5f0bfea
--- /dev/null
+++ b/thrust/testing/cuda/merge.cu
@@ -0,0 +1,120 @@
+#include <unittest/unittest.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/extrema.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
+__global__
+void merge_kernel(ExecutionPolicy exec,
+                  Iterator1 first1, Iterator1 last1,
+                  Iterator2 first2, Iterator2 last2,
+                  Iterator3 result1,
+                  Iterator4 result2)
+{
+  *result2 = thrust::merge(exec, first1, last1, first2, last2, result1);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMergeDevice(ExecutionPolicy exec)
+{
+  size_t n = 10000;
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<int> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<int> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<int> h_b(random.begin() + n, random.end());
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<int> d_a = h_a;
+  thrust::device_vector<int> d_b = h_b;
+
+  for(size_t i = 0; i < num_sizes; i++)
+  {
+    size_t size = sizes[i];
+    
+    thrust::host_vector<int>   h_result(n + size);
+    thrust::device_vector<int> d_result(n + size);
+
+    typename thrust::host_vector<int>::iterator   h_end;
+
+    typedef typename thrust::device_vector<int>::iterator iter_type;
+    thrust::device_vector<iter_type> d_end(1);
+    
+    h_end = thrust::merge(h_a.begin(), h_a.end(),
+                          h_b.begin(), h_b.begin() + size,
+                          h_result.begin());
+    h_result.resize(h_end - h_result.begin());
+
+    merge_kernel<<<1,1>>>(exec,
+                          d_a.begin(), d_a.end(),
+                          d_b.begin(), d_b.begin() + size,
+                          d_result.begin(),
+                          d_end.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
+    d_result.resize((iter_type)d_end[0] - d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+
+
+void TestMergeDeviceSeq()
+{
+  TestMergeDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMergeDeviceSeq);
+
+
+void TestMergeDeviceDevice()
+{
+  TestMergeDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMergeDeviceDevice);
+
+
+void TestMergeCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(7);
+  ref[0] = 0;
+  ref[1] = 0;
+  ref[2] = 2;
+  ref[3] = 3;
+  ref[4] = 3;
+  ref[5] = 4;
+  ref[6] = 4;
+
+  Vector result(7);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator end = thrust::merge(thrust::cuda::par.on(s),
+                               a.begin(), a.end(),
+                               b.begin(), b.end(),
+                               result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMergeCudaStreams);
+
diff --git a/thrust/testing/cuda/merge.mk b/thrust/testing/cuda/merge.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/merge.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/merge_by_key.cu b/thrust/testing/cuda/merge_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..84b80e0072d24c4dc0e6121283e63e530d78a7c7
--- /dev/null
+++ b/thrust/testing/cuda/merge_by_key.cu
@@ -0,0 +1,134 @@
+#include <unittest/unittest.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy,
+         typename Iterator1,
+         typename Iterator2,
+         typename Iterator3,
+         typename Iterator4,
+         typename Iterator5,
+         typename Iterator6,
+         typename Iterator7>
+__global__
+void merge_by_key_kernel(ExecutionPolicy exec,
+                         Iterator1 keys_first1, Iterator1 keys_last1,
+                         Iterator2 keys_first2, Iterator2 keys_last2,
+                         Iterator3 values_first1,
+                         Iterator4 values_first2,
+                         Iterator5 keys_result,
+                         Iterator6 values_result,
+                         Iterator7 result)
+{
+  *result = thrust::merge_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMergeByKeyDevice(ExecutionPolicy exec)
+{
+  thrust::device_vector<int> a_key(3), a_val(3), b_key(4), b_val(4);
+
+  a_key[0] = 0;  a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 13; a_val[1] = 7; a_val[2] = 42;
+
+  b_key[0] = 0 ; b_key[1] = 3;  b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 42; b_val[1] = 42; b_val[2] = 7; b_val[3] = 13;
+
+  thrust::device_vector<int> ref_key(7), ref_val(7);
+  ref_key[0] = 0; ref_val[0] = 13;
+  ref_key[1] = 0; ref_val[1] = 42;
+  ref_key[2] = 2; ref_val[2] = 7;
+  ref_key[3] = 3; ref_val[3] = 42;
+  ref_key[4] = 3; ref_val[4] = 7;
+  ref_key[5] = 4; ref_val[5] = 42;
+  ref_key[6] = 4; ref_val[6] = 13;
+
+  thrust::device_vector<int> result_key(7), result_val(7);
+
+  typedef typename thrust::device_vector<int>::iterator Iterator;
+
+  thrust::device_vector<thrust::pair<Iterator,Iterator> > result_ends(1);
+
+  merge_by_key_kernel<<<1,1>>>(exec,
+                               a_key.begin(), a_key.end(),
+                               b_key.begin(), b_key.end(),
+                               a_val.begin(), b_val.begin(),
+                               result_key.begin(),
+                               result_val.begin(),
+                               result_ends.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  thrust::pair<Iterator,Iterator> ends = result_ends[0];
+
+  ASSERT_EQUAL_QUIET(result_key.end(), ends.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), ends.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+
+
+void TestMergeByKeyDeviceSeq()
+{
+  TestMergeByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMergeByKeyDeviceSeq);
+
+
+void TestMergeByKeyDeviceDevice()
+{
+  TestMergeByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMergeByKeyDeviceDevice);
+
+
+void TestMergeByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a_key(3), a_val(3), b_key(4), b_val(4);
+
+  a_key[0] = 0;  a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 13; a_val[1] = 7; a_val[2] = 42;
+
+  b_key[0] = 0 ; b_key[1] = 3;  b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 42; b_val[1] = 42; b_val[2] = 7; b_val[3] = 13;
+
+  Vector ref_key(7), ref_val(7);
+  ref_key[0] = 0; ref_val[0] = 13;
+  ref_key[1] = 0; ref_val[1] = 42;
+  ref_key[2] = 2; ref_val[2] = 7;
+  ref_key[3] = 3; ref_val[3] = 42;
+  ref_key[4] = 3; ref_val[4] = 7;
+  ref_key[5] = 4; ref_val[5] = 42;
+  ref_key[6] = 4; ref_val[6] = 13;
+
+  Vector result_key(7), result_val(7);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::pair<Iterator,Iterator> ends =
+    thrust::merge_by_key(thrust::cuda::par.on(s),
+                         a_key.begin(), a_key.end(),
+                         b_key.begin(), b_key.end(),
+                         a_val.begin(), b_val.begin(),
+                         result_key.begin(),
+                         result_val.begin());
+
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result_key.end(), ends.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), ends.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMergeByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/merge_by_key.mk b/thrust/testing/cuda/merge_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/merge_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/merge_sort.cu b/thrust/testing/cuda/merge_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a4c2aa2e9e479b1c4d1bb613279df4680340372
--- /dev/null
+++ b/thrust/testing/cuda/merge_sort.cu
@@ -0,0 +1,256 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template <typename T>
+struct less_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
+};
+
+
+template <class Vector>
+void InitializeSimpleKeySortTest(Vector& unsorted_keys, Vector& sorted_keys)
+{
+    unsorted_keys.resize(7);
+    unsorted_keys[0] = 1; 
+    unsorted_keys[1] = 3; 
+    unsorted_keys[2] = 6;
+    unsorted_keys[3] = 5;
+    unsorted_keys[4] = 2;
+    unsorted_keys[5] = 0;
+    unsorted_keys[6] = 4;
+
+    sorted_keys.resize(7); 
+    sorted_keys[0] = 0; 
+    sorted_keys[1] = 1; 
+    sorted_keys[2] = 2;
+    sorted_keys[3] = 3;
+    sorted_keys[4] = 4;
+    sorted_keys[5] = 5;
+    sorted_keys[6] = 6;
+}
+
+
+template <class Vector>
+void InitializeSimpleKeyValueSortTest(Vector& unsorted_keys, Vector& unsorted_values,
+                                      Vector& sorted_keys,   Vector& sorted_values)
+{
+    unsorted_keys.resize(7);   
+    unsorted_values.resize(7);   
+    unsorted_keys[0] = 1;  unsorted_values[0] = 0;
+    unsorted_keys[1] = 3;  unsorted_values[1] = 1;
+    unsorted_keys[2] = 6;  unsorted_values[2] = 2;
+    unsorted_keys[3] = 5;  unsorted_values[3] = 3;
+    unsorted_keys[4] = 2;  unsorted_values[4] = 4;
+    unsorted_keys[5] = 0;  unsorted_values[5] = 5;
+    unsorted_keys[6] = 4;  unsorted_values[6] = 6;
+    
+    sorted_keys.resize(7);
+    sorted_values.resize(7);
+    sorted_keys[0] = 0;  sorted_values[1] = 0;  
+    sorted_keys[1] = 1;  sorted_values[3] = 1;  
+    sorted_keys[2] = 2;  sorted_values[6] = 2;
+    sorted_keys[3] = 3;  sorted_values[5] = 3;
+    sorted_keys[4] = 4;  sorted_values[2] = 4;
+    sorted_keys[5] = 5;  sorted_values[0] = 5;
+    sorted_keys[6] = 6;  sorted_values[4] = 6;
+}
+
+
+template <class Vector>
+void InitializeSimpleStableKeySortTest(Vector& unsorted_keys, Vector& sorted_keys)
+{
+    unsorted_keys.resize(9);   
+    unsorted_keys[0] = 25; 
+    unsorted_keys[1] = 14; 
+    unsorted_keys[2] = 35; 
+    unsorted_keys[3] = 16; 
+    unsorted_keys[4] = 26; 
+    unsorted_keys[5] = 34; 
+    unsorted_keys[6] = 36; 
+    unsorted_keys[7] = 24; 
+    unsorted_keys[8] = 15; 
+    
+    sorted_keys.resize(9);
+    sorted_keys[0] = 14; 
+    sorted_keys[1] = 16; 
+    sorted_keys[2] = 15; 
+    sorted_keys[3] = 25; 
+    sorted_keys[4] = 26; 
+    sorted_keys[5] = 24; 
+    sorted_keys[6] = 35; 
+    sorted_keys[7] = 34; 
+    sorted_keys[8] = 36; 
+}
+
+
+void TestMergeSortKeySimple(void)
+{
+#if 0
+    typedef thrust::device_vector<int> Vector;
+    typedef Vector::value_type T;
+
+    Vector unsorted_keys;
+    Vector   sorted_keys;
+
+    InitializeSimpleKeySortTest(unsorted_keys, sorted_keys);
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), thrust::less<T>());
+
+    ASSERT_EQUAL(unsorted_keys, sorted_keys);
+#else
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestMergeSortKeySimple);
+
+
+void TestMergeSortKeyValueSimple(void)
+{
+#if 0
+    typedef thrust::device_vector<int> Vector;
+    typedef Vector::value_type T;
+
+    Vector unsorted_keys, unsorted_values;
+    Vector   sorted_keys,   sorted_values;
+
+    InitializeSimpleKeyValueSortTest(unsorted_keys, unsorted_values, sorted_keys, sorted_values);
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin(), thrust::less<T>());
+
+    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+    ASSERT_EQUAL(unsorted_values, sorted_values);
+#else
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestMergeSortKeyValueSimple);
+
+
+void TestMergeSortStableKeySimple(void)
+{
+#if 0
+    typedef thrust::device_vector<int> Vector;
+    typedef Vector::value_type T;
+
+    Vector unsorted_keys;
+    Vector   sorted_keys;
+
+    InitializeSimpleStableKeySortTest(unsorted_keys, sorted_keys);
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), less_div_10<T>());
+
+    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+#else
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestMergeSortStableKeySimple);
+
+
+void TestMergeSortDescendingKey(void)
+{
+#if 0
+    const size_t n = 10027;
+
+    thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+    thrust::device_vector<int> d_data = h_data;
+
+    thrust::sort(h_data.begin(), h_data.end(), thrust::greater<int>());
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::greater<int>());
+
+    ASSERT_EQUAL(h_data, d_data);
+#else
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestMergeSortDescendingKey);
+
+
+template <typename T>
+void TestMergeSortAscendingKeyValue(const size_t n)
+{
+#if 0
+    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_keys = h_keys;
+    
+    thrust::host_vector<T>   h_values = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_values = h_values;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::less<T>());
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
+
+    ASSERT_EQUAL(h_keys,   d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+#else
+    (void)n;
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeSortAscendingKeyValue);
+
+
+void TestMergeSortDescendingKeyValue(void)
+{
+#if 0
+    const size_t n = 10027;
+
+    thrust::host_vector<int>   h_keys = unittest::random_integers<int>(n);
+    thrust::device_vector<int> d_keys = h_keys;
+    
+    thrust::host_vector<int>   h_values = unittest::random_integers<int>(n);
+    thrust::device_vector<int> d_values = h_values;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::greater<int>());
+
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::greater<int>());
+
+    ASSERT_EQUAL(h_keys,   d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+#else
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestMergeSortDescendingKeyValue);
+
+
+template<typename U>
+void TestMergeSortKeyValue(size_t n)
+{
+#if 0
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_data(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_data[i] = T(h_keys[i], h_values[i]);
+  }
+
+  thrust::device_vector<T> d_data = h_data;
+
+  thrust::stable_sort(h_data.begin(), h_data.end());
+  thrust::cuda_bulk::tag cuda_tag;
+  thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::less<T>());
+
+  ASSERT_EQUAL_QUIET(h_data, d_data);
+#else
+    (void) n;
+    KNOWN_FAILURE;
+#endif
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeSortKeyValue);
+
diff --git a/thrust/testing/cuda/merge_sort.mk b/thrust/testing/cuda/merge_sort.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/merge_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/min_element.cu b/thrust/testing/cuda/min_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..49d13c2a59b281ba502654a5ed8678a8eaeb9ae4
--- /dev/null
+++ b/thrust/testing/cuda/min_element.cu
@@ -0,0 +1,113 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
+__global__
+void min_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
+{
+  *result = thrust::min_element(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator, typename BinaryPredicate, typename Iterator2>
+__global__
+void min_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::min_element(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMinElementDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int> h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  typedef typename thrust::device_vector<int>::iterator iter_type;
+
+  thrust::device_vector<iter_type> d_result(1);
+  
+  typename thrust::host_vector<int>::iterator   h_min = thrust::min_element(h_data.begin(), h_data.end());
+
+  min_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_min - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+
+  typename thrust::host_vector<int>::iterator   h_max = thrust::min_element(h_data.begin(), h_data.end(), thrust::greater<int>());
+
+  min_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(h_max - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
+}
+
+
+void TestMinElementDeviceSeq()
+{
+  TestMinElementDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMinElementDeviceSeq);
+
+
+void TestMinElementDeviceDevice()
+{
+  TestMinElementDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMinElementDeviceDevice);
+
+
+void TestMinElementCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL( *thrust::min_element(thrust::cuda::par.on(s), data.begin(), data.end()), 1);
+  ASSERT_EQUAL( thrust::min_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 2);
+  
+  ASSERT_EQUAL( *thrust::min_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()), 5);
+  ASSERT_EQUAL( thrust::min_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 1);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMinElementCudaStreams);
+
+void TestMinElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::min_element(thrust::device, raw_ptr, raw_ptr+n) - raw_ptr, 2);
+  ASSERT_EQUAL( thrust::min_element(thrust::device, raw_ptr, raw_ptr+n, thrust::greater<T>()) - raw_ptr, 1);
+}
+DECLARE_UNITTEST(TestMinElementDevicePointer);
diff --git a/thrust/testing/cuda/min_element.mk b/thrust/testing/cuda/min_element.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/min_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/minmax_element.cu b/thrust/testing/cuda/minmax_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e3cae07a22043e6bcc105052d44b192bbe9f2af8
--- /dev/null
+++ b/thrust/testing/cuda/minmax_element.cu
@@ -0,0 +1,133 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void minmax_element_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  *result = thrust::minmax_element(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename BinaryPredicate>
+__global__
+void minmax_element_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::minmax_element(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMinMaxElementDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  typename thrust::host_vector<int>::iterator   h_min;
+  typename thrust::host_vector<int>::iterator   h_max;
+  typename thrust::device_vector<int>::iterator d_min;
+  typename thrust::device_vector<int>::iterator d_max;
+
+  typedef thrust::pair<
+    typename thrust::device_vector<int>::iterator,
+    typename thrust::device_vector<int>::iterator
+  > pair_type;
+
+  thrust::device_vector<pair_type> d_result(1);
+  
+  h_min = thrust::minmax_element(h_data.begin(), h_data.end()).first;
+  h_max = thrust::minmax_element(h_data.begin(), h_data.end()).second;
+
+  d_min = thrust::minmax_element(d_data.begin(), d_data.end()).first;
+  d_max = thrust::minmax_element(d_data.begin(), d_data.end()).second;
+
+  minmax_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  d_min = ((pair_type)d_result[0]).first;
+  d_max = ((pair_type)d_result[0]).second;
+  
+  ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+  ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+  
+  h_max = thrust::minmax_element(h_data.begin(), h_data.end(), thrust::greater<int>()).first;
+  h_min = thrust::minmax_element(h_data.begin(), h_data.end(), thrust::greater<int>()).second;
+
+  minmax_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  d_max = ((pair_type)d_result[0]).first;
+  d_min = ((pair_type)d_result[0]).second;
+  
+  ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+  ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+}
+
+
+void TestMinMaxElementDeviceSeq()
+{
+  TestMinMaxElementDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMinMaxElementDeviceSeq);
+
+
+void TestMinMaxElementDeviceDevice()
+{
+  TestMinMaxElementDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMinMaxElementDeviceDevice);
+
+
+void TestMinMaxElementCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL( *thrust::minmax_element(thrust::cuda::par.on(s), data.begin(), data.end()).first,  1);
+  ASSERT_EQUAL( *thrust::minmax_element(thrust::cuda::par.on(s), data.begin(), data.end()).second, 5);
+  ASSERT_EQUAL(  thrust::minmax_element(thrust::cuda::par.on(s), data.begin(), data.end()).first  - data.begin(), 2);
+  ASSERT_EQUAL(  thrust::minmax_element(thrust::cuda::par.on(s), data.begin(), data.end()).second - data.begin(), 1);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMinMaxElementCudaStreams);
+
+void TestMinMaxElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::minmax_element(thrust::device, raw_ptr, raw_ptr+n).first - raw_ptr,  2);
+  ASSERT_EQUAL( thrust::minmax_element(thrust::device, raw_ptr, raw_ptr+n).second - raw_ptr, 1);
+}
+DECLARE_UNITTEST(TestMinMaxElementDevicePointer);
+
diff --git a/thrust/testing/cuda/minmax_element.mk b/thrust/testing/cuda/minmax_element.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/minmax_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/mismatch.cu b/thrust/testing/cuda/mismatch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5b08f43072156f2bead1e14cc043154599e033b8
--- /dev/null
+++ b/thrust/testing/cuda/mismatch.cu
@@ -0,0 +1,106 @@
+#include <unittest/unittest.h>
+#include <thrust/mismatch.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__ void mismatch_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
+{
+  *result = thrust::mismatch(exec, first1, last1, first2);
+}
+
+
+template<typename ExecutionPolicy>
+void TestMismatchDevice(ExecutionPolicy exec)
+{
+  thrust::device_vector<int> a(4);
+  thrust::device_vector<int> b(4);
+  a[0] = 1; b[0] = 1;
+  a[1] = 2; b[1] = 2;
+  a[2] = 3; b[2] = 4;
+  a[3] = 4; b[3] = 3;
+
+  typedef thrust::pair<
+    typename thrust::device_vector<int>::iterator,
+    typename thrust::device_vector<int>::iterator
+  > pair_type;
+
+  thrust::device_vector<pair_type> d_result(1);
+  
+  mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(2, ((pair_type)d_result[0]).first  - a.begin());
+  ASSERT_EQUAL(2, ((pair_type)d_result[0]).second - b.begin());
+  
+  b[2] = 3;
+  
+  mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(3, ((pair_type)d_result[0]).first  - a.begin());
+  ASSERT_EQUAL(3, ((pair_type)d_result[0]).second - b.begin());
+  
+  b[3] = 4;
+  
+  mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(4, ((pair_type)d_result[0]).first  - a.begin());
+  ASSERT_EQUAL(4, ((pair_type)d_result[0]).second - b.begin());
+}
+
+
+void TestMismatchDeviceSeq()
+{
+  TestMismatchDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestMismatchDeviceSeq);
+
+
+void TestMismatchDeviceDevice()
+{
+  TestMismatchDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestMismatchDeviceDevice);
+
+
+void TestMismatchCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector a(4); Vector b(4);
+  a[0] = 1; b[0] = 1;
+  a[1] = 2; b[1] = 2;
+  a[2] = 3; b[2] = 4;
+  a[3] = 4; b[3] = 3;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).first  - a.begin(), 2);
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).second - b.begin(), 2);
+
+  b[2] = 3;
+  
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).first  - a.begin(), 3);
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).second - b.begin(), 3);
+  
+  b[3] = 4;
+  
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).first  - a.begin(), 4);
+  ASSERT_EQUAL(thrust::mismatch(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin()).second - b.begin(), 4);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestMismatchCudaStreams);
+
diff --git a/thrust/testing/cuda/mismatch.mk b/thrust/testing/cuda/mismatch.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/mismatch.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/pair_sort.cu b/thrust/testing/cuda/pair_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..87838e429c4e538e90c10cfc40a446ead6413850
--- /dev/null
+++ b/thrust/testing/cuda/pair_sort.cu
@@ -0,0 +1,74 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  thrust::stable_sort(exec, first, last);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+template<typename ExecutionPolicy>
+void TestPairStableSortDevice(ExecutionPolicy exec)
+{
+  size_t n = 10000;
+  typedef thrust::pair<int,int> P;
+
+  thrust::host_vector<int>   h_p1 = unittest::random_integers<int>(n);
+  thrust::host_vector<int>   h_p2 = unittest::random_integers<int>(n);
+  thrust::host_vector<P>   h_pairs(n);
+
+  // zip up pairs on the host
+  thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+  thrust::device_vector<P> d_pairs = h_pairs;
+
+  thrust::device_vector<bool> is_supported(1);
+
+  stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  if(is_supported[0])
+  {
+    // sort on the host
+    thrust::stable_sort(h_pairs.begin(), h_pairs.end());
+
+    ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
+  }
+};
+
+
+void TestPairStableSortDeviceSeq()
+{
+  TestPairStableSortDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPairStableSortDeviceSeq);
+
+
+void TestPairStableSortDeviceDevice()
+{
+  TestPairStableSortDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPairStableSortDeviceDevice);
+
diff --git a/thrust/testing/cuda/pair_sort.mk b/thrust/testing/cuda/pair_sort.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/pair_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/pair_sort_by_key.cu b/thrust/testing/cuda/pair_sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..19996e5a2e463d515833d35a6bda1b5653b08e57
--- /dev/null
+++ b/thrust/testing/cuda/pair_sort_by_key.cu
@@ -0,0 +1,84 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+template<typename ExecutionPolicy>
+void TestPairStableSortByKeyDevice(ExecutionPolicy exec)
+{
+  size_t n = 10000;
+  typedef thrust::pair<int,int> P;
+
+  // host arrays
+  thrust::host_vector<int>   h_p1 = unittest::random_integers<int>(n);
+  thrust::host_vector<int>   h_p2 = unittest::random_integers<int>(n);
+  thrust::host_vector<P>   h_pairs(n);
+
+  thrust::host_vector<int> h_values(n);
+  thrust::sequence(h_values.begin(), h_values.end());
+
+  // zip up pairs on the host
+  thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+  // device arrays
+  thrust::device_vector<P>   d_pairs = h_pairs;
+  thrust::device_vector<int> d_values = h_values;
+
+  thrust::device_vector<bool> is_supported(1);
+
+  // sort on the device
+  stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  if(is_supported[0])
+  {
+    // sort on the host
+    thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
+
+    ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
+    ASSERT_EQUAL(h_values, d_values);
+  }
+};
+
+
+void TestPairStableSortByKeyDeviceSeq()
+{
+  TestPairStableSortByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPairStableSortByKeyDeviceSeq);
+
+
+void TestPairStableSortByKeyDeviceDevice()
+{
+  TestPairStableSortByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPairStableSortByKeyDeviceDevice);
+
diff --git a/thrust/testing/cuda/pair_sort_by_key.mk b/thrust/testing/cuda/pair_sort_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/pair_sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/partition.cu b/thrust/testing/cuda/partition.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a70ac073246f2d79f725dba92cb952348d9b9037
--- /dev/null
+++ b/thrust/testing/cuda/partition.cu
@@ -0,0 +1,556 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/count.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
+__global__
+void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::partition(exec, first, last, pred);
+}
+
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+
+template<typename ExecutionPolicy>
+void TestPartitionDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef typename thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] = 1; 
+  data[1] = 2; 
+  data[2] = 1;
+  data[3] = 1; 
+  data[4] = 2; 
+
+  thrust::device_vector<iterator> result(1);
+  
+  partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> ref(5);
+  ref[0] = 2;
+  ref[1] = 2;
+  ref[2] = 1;
+  ref[3] = 1;
+  ref[4] = 1;
+  
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
+}
+
+
+void TestPartitionDeviceSeq()
+{
+  TestPartitionDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPartitionDeviceSeq);
+
+
+void TestPartitionDeviceDevice()
+{
+  TestPartitionDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPartitionDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
+__global__
+void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
+{
+  *result = thrust::partition(exec, first, last, stencil_first, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestPartitionStencilDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef typename thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 0;
+  data[3] = 0;
+  data[4] = 1;
+  
+  thrust::device_vector<T> stencil(5);
+  stencil[0] = 1; 
+  stencil[1] = 2; 
+  stencil[2] = 1;
+  stencil[3] = 1; 
+  stencil[4] = 2; 
+
+  thrust::device_vector<iterator> result(1);
+  
+  partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> ref(5);
+  ref[0] = 1;
+  ref[1] = 1;
+  ref[2] = 0;
+  ref[3] = 0;
+  ref[4] = 0;
+  
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
+}
+
+
+void TestPartitionStencilDeviceSeq()
+{
+  TestPartitionStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPartitionStencilDeviceSeq);
+
+
+void TestPartitionStencilDeviceDevice()
+{
+  TestPartitionStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPartitionStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
+__global__
+void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
+{
+  *result = thrust::partition_copy(exec, first, last, true_result, false_result, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestPartitionCopyDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  1; 
+  data[4] =  2; 
+  
+  thrust::device_vector<int> true_results(2);
+  thrust::device_vector<int> false_results(3);
+
+  typedef thrust::pair<iterator,iterator> pair_type;
+  thrust::device_vector<pair_type> iterators(1);
+  
+  partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> true_ref(2);
+  true_ref[0] =  2;
+  true_ref[1] =  2;
+  
+  thrust::device_vector<T> false_ref(3);
+  false_ref[0] =  1;
+  false_ref[1] =  1;
+  false_ref[2] =  1;
+
+  pair_type ends = iterators[0];
+  
+  ASSERT_EQUAL(2, ends.first - true_results.begin());
+  ASSERT_EQUAL(3, ends.second - false_results.begin());
+  ASSERT_EQUAL(true_ref, true_results);
+  ASSERT_EQUAL(false_ref, false_results);
+}
+
+
+void TestPartitionCopyDeviceSeq()
+{
+  TestPartitionCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPartitionCopyDeviceSeq);
+
+
+void TestPartitionCopyDeviceDevice()
+{
+  TestPartitionCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPartitionCopyDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
+__global__
+void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
+{
+  *result = thrust::partition_copy(exec, first, last, stencil_first, true_result, false_result, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestPartitionCopyStencilDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  
+  thrust::device_vector<int> data(5);
+  data[0] =  0; 
+  data[1] =  1; 
+  data[2] =  0;
+  data[3] =  0; 
+  data[4] =  1; 
+  
+  thrust::device_vector<int> stencil(5);
+  stencil[0] =  1; 
+  stencil[1] =  2; 
+  stencil[2] =  1;
+  stencil[3] =  1; 
+  stencil[4] =  2; 
+  
+  thrust::device_vector<int> true_results(2);
+  thrust::device_vector<int> false_results(3);
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  typedef thrust::pair<iterator,iterator> pair_type;
+  thrust::device_vector<pair_type> iterators(1);
+
+  partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  pair_type ends = iterators[0];
+  
+  thrust::device_vector<int> true_ref(2);
+  true_ref[0] =  1;
+  true_ref[1] =  1;
+  
+  thrust::device_vector<int> false_ref(3);
+  false_ref[0] =  0;
+  false_ref[1] =  0;
+  false_ref[2] =  0;
+  
+  ASSERT_EQUAL(2, ends.first - true_results.begin());
+  ASSERT_EQUAL(3, ends.second - false_results.begin());
+  ASSERT_EQUAL(true_ref, true_results);
+  ASSERT_EQUAL(false_ref, false_results);
+}
+
+
+void TestPartitionCopyStencilDeviceSeq()
+{
+  TestPartitionCopyStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDeviceSeq);
+
+
+void TestPartitionCopyStencilDeviceDevice()
+{
+  TestPartitionCopyStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2, typename Iterator3>
+__global__
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  *result = thrust::stable_partition(exec, first, last, pred);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+template<typename ExecutionPolicy>
+void TestStablePartitionDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef typename thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] = 1; 
+  data[1] = 2; 
+  data[2] = 1;
+  data[3] = 1; 
+  data[4] = 2; 
+
+  thrust::device_vector<iterator> result(1);
+  thrust::device_vector<bool> is_supported(1);
+  
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin(), is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  if(is_supported[0])
+  {
+    thrust::device_vector<T> ref(5);
+    ref[0] = 2;
+    ref[1] = 2;
+    ref[2] = 1;
+    ref[3] = 1;
+    ref[4] = 1;
+    
+    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+    ASSERT_EQUAL(ref, data);
+  }
+}
+
+
+void TestStablePartitionDeviceSeq()
+{
+  TestStablePartitionDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestStablePartitionDeviceSeq);
+
+
+void TestStablePartitionDeviceDevice()
+{
+  TestStablePartitionDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestStablePartitionDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3, typename Iterator4>
+__global__
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  *result = thrust::stable_partition(exec, first, last, stencil_first, pred);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+template<typename ExecutionPolicy>
+void TestStablePartitionStencilDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef typename thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 0;
+  data[3] = 0;
+  data[4] = 1;
+  
+  thrust::device_vector<T> stencil(5);
+  stencil[0] = 1; 
+  stencil[1] = 2; 
+  stencil[2] = 1;
+  stencil[3] = 1; 
+  stencil[4] = 2; 
+
+  thrust::device_vector<iterator> result(1);
+  thrust::device_vector<bool> is_supported(1);
+  
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin(), is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  if(is_supported[0])
+  {
+    thrust::device_vector<T> ref(5);
+    ref[0] = 1;
+    ref[1] = 1;
+    ref[2] = 0;
+    ref[3] = 0;
+    ref[4] = 0;
+    
+    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+    ASSERT_EQUAL(ref, data);
+  }
+}
+
+
+void TestStablePartitionStencilDeviceSeq()
+{
+  TestStablePartitionStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDeviceSeq);
+
+
+void TestStablePartitionStencilDeviceDevice()
+{
+  TestStablePartitionStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
+__global__
+void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
+{
+  *result = thrust::stable_partition_copy(exec, first, last, true_result, false_result, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestStablePartitionCopyDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  typedef thrust::device_vector<T>::iterator iterator;
+  
+  thrust::device_vector<T> data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  1; 
+  data[4] =  2; 
+  
+  thrust::device_vector<int> true_results(2);
+  thrust::device_vector<int> false_results(3);
+
+  typedef thrust::pair<iterator,iterator> pair_type;
+  thrust::device_vector<pair_type> iterators(1);
+  
+  stable_partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> true_ref(2);
+  true_ref[0] =  2;
+  true_ref[1] =  2;
+  
+  thrust::device_vector<T> false_ref(3);
+  false_ref[0] =  1;
+  false_ref[1] =  1;
+  false_ref[2] =  1;
+
+  pair_type ends = iterators[0];
+  
+  ASSERT_EQUAL(2, ends.first - true_results.begin());
+  ASSERT_EQUAL(3, ends.second - false_results.begin());
+  ASSERT_EQUAL(true_ref, true_results);
+  ASSERT_EQUAL(false_ref, false_results);
+}
+
+
+void TestStablePartitionCopyDeviceSeq()
+{
+  TestStablePartitionCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDeviceSeq);
+
+
+void TestStablePartitionCopyDeviceDevice()
+{
+  TestStablePartitionCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
+__global__
+void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
+{
+  *result = thrust::stable_partition_copy(exec, first, last, stencil_first, true_result, false_result, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestStablePartitionCopyStencilDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  
+  thrust::device_vector<int> data(5);
+  data[0] =  0; 
+  data[1] =  1; 
+  data[2] =  0;
+  data[3] =  0; 
+  data[4] =  1; 
+  
+  thrust::device_vector<int> stencil(5);
+  stencil[0] =  1; 
+  stencil[1] =  2; 
+  stencil[2] =  1;
+  stencil[3] =  1; 
+  stencil[4] =  2; 
+  
+  thrust::device_vector<int> true_results(2);
+  thrust::device_vector<int> false_results(3);
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  typedef thrust::pair<iterator,iterator> pair_type;
+  thrust::device_vector<pair_type> iterators(1);
+
+  stable_partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  pair_type ends = iterators[0];
+  
+  thrust::device_vector<int> true_ref(2);
+  true_ref[0] =  1;
+  true_ref[1] =  1;
+  
+  thrust::device_vector<int> false_ref(3);
+  false_ref[0] =  0;
+  false_ref[1] =  0;
+  false_ref[2] =  0;
+  
+  ASSERT_EQUAL(2, ends.first - true_results.begin());
+  ASSERT_EQUAL(3, ends.second - false_results.begin());
+  ASSERT_EQUAL(true_ref, true_results);
+  ASSERT_EQUAL(false_ref, false_results);
+}
+
+
+void TestStablePartitionCopyStencilDeviceSeq()
+{
+  TestStablePartitionCopyStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceSeq);
+
+
+void TestStablePartitionCopyStencilDeviceDevice()
+{
+  TestStablePartitionCopyStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice);
+
+
+void TestPartitionCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
+  
+  Vector data(5);
+  data[0] = 1; 
+  data[1] = 2; 
+  data[2] = 1;
+  data[3] = 1; 
+  data[4] = 2; 
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even<T>());
+  
+  Vector ref(5);
+  ref[0] = 2;
+  ref[1] = 2;
+  ref[2] = 1;
+  ref[3] = 1;
+  ref[4] = 1;
+  
+  ASSERT_EQUAL(iter - data.begin(), 2);
+  ASSERT_EQUAL(data, ref);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestPartitionCudaStreams);
+
diff --git a/thrust/testing/cuda/partition.mk b/thrust/testing/cuda/partition.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/partition.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/partition_point.cu b/thrust/testing/cuda/partition_point.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0b95fcb02b1e7361726ff10e745fd2c0a882e4e6
--- /dev/null
+++ b/thrust/testing/cuda/partition_point.cu
@@ -0,0 +1,81 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
+__global__
+void partition_point_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::partition_point(exec, first, last, pred);
+}
+
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+
+template<typename ExecutionPolicy>
+void TestPartitionPointDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::device_vector<int> v = unittest::random_integers<int>(n);
+  typedef typename thrust::device_vector<int>::iterator iterator;
+
+  iterator ref = thrust::stable_partition(v.begin(), v.end(), is_even<int>());
+
+  thrust::device_vector<iterator> result(1);
+  partition_point_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ASSERT_EQUAL(ref - v.begin(), (iterator)result[0] - v.begin());
+}
+
+
+void TestPartitionPointDeviceSeq()
+{
+  TestPartitionPointDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestPartitionPointDeviceSeq);
+
+
+void TestPartitionPointDeviceDevice()
+{
+  TestPartitionPointDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestPartitionPointDeviceDevice);
+
+
+void TestPartitionPointCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  typedef Vector::iterator Iterator;
+
+  Vector v(4);
+  v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
+
+  Iterator first = v.begin();
+
+  Iterator last = v.begin() + 4;
+  Iterator ref = first + 3;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  ASSERT_EQUAL_QUIET(ref, thrust::partition_point(thrust::cuda::par.on(s), first, last, thrust::identity<T>()));
+
+  last = v.begin() + 3;
+  ref = last;
+  ASSERT_EQUAL_QUIET(ref, thrust::partition_point(thrust::cuda::par.on(s), first, last, thrust::identity<T>()));
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestPartitionPointCudaStreams);
+
diff --git a/thrust/testing/cuda/partition_point.mk b/thrust/testing/cuda/partition_point.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/partition_point.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/pinned_allocator.cu b/thrust/testing/cuda/pinned_allocator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..23ccc7d407a6f35af7728c5edf55f54c425b94b8
--- /dev/null
+++ b/thrust/testing/cuda/pinned_allocator.cu
@@ -0,0 +1,19 @@
+#include <unittest/unittest.h>
+#include <thrust/system/cuda/experimental/pinned_allocator.h>
+#include <thrust/host_vector.h>
+#include <thrust/copy.h>
+
+template <typename T>
+void TestPinnedAllocatorSimple(const size_t n)
+{
+  typedef thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T> > Vector;
+
+  Vector h_input = unittest::random_integers<T>(n);
+  Vector h_output(n);
+
+  thrust::copy(h_input.begin(), h_input.end(), h_output.begin());
+
+  ASSERT_EQUAL(h_input, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestPinnedAllocatorSimple);
+
diff --git a/thrust/testing/cuda/pinned_allocator.mk b/thrust/testing/cuda/pinned_allocator.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/pinned_allocator.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/reduce.cu b/thrust/testing/cuda/reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9cefcc0edceba81d018578c05af3550d1052de4d
--- /dev/null
+++ b/thrust/testing/cuda/reduce.cu
@@ -0,0 +1,75 @@
+#include <unittest/unittest.h>
+#include <thrust/reduce.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
+__global__
+void reduce_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init, Iterator2 result)
+{
+  *result = thrust::reduce(exec, first, last, init);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestReduceDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+  
+  thrust::device_vector<T> d_result(1);
+  
+  T init = 13;
+  
+  T h_result = thrust::reduce(h_data.begin(), h_data.end(), init);
+  
+  reduce_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), init, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_result, d_result[0]);
+}
+
+
+template<typename T>
+struct TestReduceDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestReduceDevice<T>(thrust::seq, n);
+  }
+};
+VariableUnitTest<TestReduceDeviceSeq, IntegralTypes> TestReduceDeviceSeqInstance;
+
+
+template<typename T>
+struct TestReduceDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestReduceDevice<T>(thrust::device, n);
+  }
+};
+VariableUnitTest<TestReduceDeviceDevice, IntegralTypes> TestReduceDeviceDeviceInstance;
+
+
+void TestReduceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector v(3);
+  v[0] = 1; v[1] = -2; v[2] = 3;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  // no initializer
+  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2);
+
+  // with initializer
+  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestReduceCudaStreams);
+
diff --git a/thrust/testing/cuda/reduce.mk b/thrust/testing/cuda/reduce.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/reduce_by_key.cu b/thrust/testing/cuda/reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..993a39bd409f8da925f4907680e36d44588102b5
--- /dev/null
+++ b/thrust/testing/cuda/reduce_by_key.cu
@@ -0,0 +1,266 @@
+#include <unittest/unittest.h>
+#include <thrust/reduce.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
+__global__
+void reduce_by_key_kernel(ExecutionPolicy exec,
+                          Iterator1 keys_first, Iterator1 keys_last,
+                          Iterator2 values_first,
+                          Iterator3 keys_result,
+                          Iterator4 values_result,
+                          Iterator5 result)
+{
+  *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename BinaryPredicate, typename Iterator5>
+__global__
+void reduce_by_key_kernel(ExecutionPolicy exec,
+                          Iterator1 keys_first, Iterator1 keys_last,
+                          Iterator2 values_first,
+                          Iterator3 keys_result,
+                          Iterator4 values_result,
+                          BinaryPredicate pred,
+                          Iterator5 result)
+{
+  *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, pred);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename BinaryPredicate, typename BinaryFunction, typename Iterator5>
+__global__
+void reduce_by_key_kernel(ExecutionPolicy exec,
+                          Iterator1 keys_first, Iterator1 keys_last,
+                          Iterator2 values_first,
+                          Iterator3 keys_result,
+                          Iterator4 values_result,
+                          BinaryPredicate pred,
+                          BinaryFunction binary_op,
+                          Iterator5 result)
+{
+  *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, pred, binary_op);
+}
+
+
+template<typename T>
+struct is_equal_div_10_reduce
+{
+  __host__ __device__
+  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+template<typename Vector>
+void initialize_keys(Vector& keys)
+{
+  keys.resize(9);
+  keys[0] = 11;
+  keys[1] = 11;
+  keys[2] = 21;
+  keys[3] = 20;
+  keys[4] = 21;
+  keys[5] = 21;
+  keys[6] = 21;
+  keys[7] = 37;
+  keys[8] = 37;
+}
+
+
+template<typename Vector>
+void initialize_values(Vector& values)
+{
+  values.resize(9);
+  values[0] = 0; 
+  values[1] = 1;
+  values[2] = 2;
+  values[3] = 3;
+  values[4] = 4;
+  values[5] = 5;
+  values[6] = 6;
+  values[7] = 7;
+  values[8] = 8;
+}
+
+
+template<typename ExecutionPolicy>
+void TestReduceByKeyDevice(ExecutionPolicy exec)
+{
+  typedef int T;
+  
+  thrust::device_vector<T> keys;
+  thrust::device_vector<T> values;
+
+  typedef typename thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > iterator_pair;
+
+  thrust::device_vector<iterator_pair> new_last_vec(1);
+  iterator_pair new_last;
+  
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+  
+  thrust::device_vector<T> output_keys(keys.size());
+  thrust::device_vector<T> output_values(values.size());
+  
+  reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+  
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1],  2);
+  ASSERT_EQUAL(output_values[2],  3);
+  ASSERT_EQUAL(output_values[3], 15);
+  ASSERT_EQUAL(output_values[4], 15);
+  
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+  
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1], 20);
+  ASSERT_EQUAL(output_values[2], 15);
+  
+  // test BinaryFunction
+  initialize_keys(keys);  initialize_values(values);
+  
+  reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+  
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1],  2);
+  ASSERT_EQUAL(output_values[2],  3);
+  ASSERT_EQUAL(output_values[3], 15);
+  ASSERT_EQUAL(output_values[4], 15);
+}
+
+
+void TestReduceByKeyDeviceSeq()
+{
+  TestReduceByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReduceByKeyDeviceSeq);
+
+
+void TestReduceByKeyDeviceDevice()
+{
+  TestReduceByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReduceByKeyDeviceDevice);
+
+
+void TestReduceByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector keys;
+  Vector values;
+
+  thrust::pair<Vector::iterator, Vector::iterator> new_last;
+
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+
+  Vector output_keys(keys.size());
+  Vector output_values(values.size());
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1],  2);
+  ASSERT_EQUAL(output_values[2],  3);
+  ASSERT_EQUAL(output_values[3], 15);
+  ASSERT_EQUAL(output_values[4], 15);
+
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1], 20);
+  ASSERT_EQUAL(output_values[2], 15);
+
+  // test BinaryFunction
+  initialize_keys(keys);  initialize_values(values);
+
+  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0],  1);
+  ASSERT_EQUAL(output_values[1],  2);
+  ASSERT_EQUAL(output_values[2],  3);
+  ASSERT_EQUAL(output_values[3], 15);
+  ASSERT_EQUAL(output_values[4], 15);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestReduceByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/reduce_by_key.mk b/thrust/testing/cuda/reduce_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/reduce_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/remove.cu b/thrust/testing/cuda/remove.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3509cd31b9d1c85b3cfda08e22e3f28de480e2a8
--- /dev/null
+++ b/thrust/testing/cuda/remove.cu
@@ -0,0 +1,542 @@
+#include <unittest/unittest.h>
+#include <thrust/remove.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
+__global__
+void remove_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val, Iterator2 result)
+{
+  *result = thrust::remove(exec, first, last, val);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
+__global__
+void remove_if_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::remove_if(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
+__global__
+void remove_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
+{
+  *result = thrust::remove_if(exec, first, last, stencil_first, pred);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T, typename Iterator3>
+__global__
+void remove_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, T val, Iterator3 result2)
+{
+  *result2 = thrust::remove_copy(exec, first, last, result1, val);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
+__global__
+void remove_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, Predicate pred, Iterator3 result_end)
+{
+  *result_end = thrust::remove_copy_if(exec, first, last, result, pred);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
+__global__
+void remove_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result, Predicate pred, Iterator4 result_end)
+{
+  *result_end = thrust::remove_copy_if(exec, first, last, stencil_first, result, pred);
+}
+
+
+template<typename T>
+struct is_even
+  : thrust::unary_function<T,bool>
+{
+  __host__ __device__
+  bool operator()(T x) { return (static_cast<unsigned int>(x) & 1) == 0; }
+};
+
+
+template<typename T>
+struct is_true
+  : thrust::unary_function<T,bool>
+{
+  __host__ __device__
+  bool operator()(T x) { return x ? true : false; }
+};
+
+
+template<typename ExecutionPolicy>
+void TestRemoveDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_result(1);
+  
+  size_t h_size = thrust::remove(h_data.begin(), h_data.end(), 0) - h_data.begin();
+
+  remove_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), 0, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_result[0] - d_data.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_data.resize(h_size);
+  d_data.resize(d_size);
+  
+  ASSERT_EQUAL(h_data, d_data);
+}
+
+
+void TestRemoveDeviceSeq()
+{
+  TestRemoveDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveDeviceSeq);
+
+
+void TestRemoveDeviceDevice()
+{
+  TestRemoveDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveDeviceDevice);
+
+
+template<typename ExecutionPolicy>
+void TestRemoveIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_result(1);
+  
+  size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true<int>()) - h_data.begin();
+
+  remove_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), is_true<int>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_result[0] - d_data.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_data.resize(h_size);
+  d_data.resize(d_size);
+  
+  ASSERT_EQUAL(h_data, d_data);
+}
+
+
+void TestRemoveIfDeviceSeq()
+{
+  TestRemoveIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveIfDeviceSeq);
+
+
+void TestRemoveIfDeviceDevice()
+{
+  TestRemoveIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveIfDeviceDevice);
+
+
+template<typename ExecutionPolicy>
+void TestRemoveIfStencilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_result(1);
+  
+  thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
+  thrust::device_vector<bool> d_stencil = h_stencil;
+  
+  size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true<int>()) - h_data.begin();
+
+  remove_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), is_true<int>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_result[0] - d_data.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_data.resize(h_size);
+  d_data.resize(d_size);
+  
+  ASSERT_EQUAL(h_data, d_data);
+}
+
+
+void TestRemoveIfStencilDeviceSeq()
+{
+  TestRemoveIfStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveIfStencilDeviceSeq);
+
+
+void TestRemoveIfStencilDeviceDevice()
+{
+  TestRemoveIfStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveIfStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy>
+void TestRemoveCopyDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_result(n);
+  thrust::device_vector<int> d_result(n);
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_new_end(1);
+  
+  size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), 0) - h_result.begin();
+
+  remove_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), 0, d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_new_end[0] - d_result.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_result.resize(h_size);
+  d_result.resize(d_size);
+  
+  ASSERT_EQUAL(h_result, d_result);
+}
+
+
+void TestRemoveCopyDeviceSeq()
+{
+  TestRemoveCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveCopyDeviceSeq);
+
+
+void TestRemoveCopyDeviceDevice()
+{
+  TestRemoveCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveCopyDeviceDevice);
+
+
+template<typename ExecutionPolicy>
+void TestRemoveCopyIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_result(n);
+  thrust::device_vector<int> d_result(n);
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_new_end(1);
+  
+  size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<int>()) - h_result.begin();
+
+  remove_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_true<int>(), d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_new_end[0] - d_result.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_result.resize(h_size);
+  d_result.resize(d_size);
+  
+  ASSERT_EQUAL(h_result, d_result);
+}
+
+
+void TestRemoveCopyIfDeviceSeq()
+{
+  TestRemoveCopyIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfDeviceSeq);
+
+
+void TestRemoveCopyIfDeviceDevice()
+{
+  TestRemoveCopyIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfDeviceDevice);
+
+
+template<typename ExecutionPolicy>
+void TestRemoveCopyIfStencilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_result(n);
+  thrust::device_vector<int> d_result(n);
+
+  typedef typename thrust::device_vector<int>::iterator iterator;
+  thrust::device_vector<iterator> d_new_end(1);
+
+  thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
+  thrust::device_vector<bool> d_stencil = h_stencil;
+  
+  size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true<int>()) - h_result.begin();
+
+  remove_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true<int>(), d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  size_t d_size = (iterator)d_new_end[0] - d_result.begin();
+  
+  ASSERT_EQUAL(h_size, d_size);
+  
+  h_result.resize(h_size);
+  d_result.resize(d_size);
+  
+  ASSERT_EQUAL(h_result, d_result);
+}
+
+
+void TestRemoveCopyIfStencilDeviceSeq()
+{
+  TestRemoveCopyIfStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfStencilDeviceSeq);
+
+
+void TestRemoveCopyIfStencilDeviceDevice()
+{
+  TestRemoveCopyIfStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfStencilDeviceDevice);
+
+
+void TestRemoveCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove(thrust::cuda::par.on(s),
+                                        data.begin(), 
+                                        data.end(), 
+                                        (T) 2);
+
+  ASSERT_EQUAL(end - data.begin(), 3);
+
+  ASSERT_EQUAL(data[0], 1);
+  ASSERT_EQUAL(data[1], 1);
+  ASSERT_EQUAL(data[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveCudaStreams);
+
+
+void TestRemoveCopyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove_copy(thrust::cuda::par.on(s),
+                                             data.begin(), 
+                                             data.end(), 
+                                             result.begin(), 
+                                             (T) 2);
+
+  ASSERT_EQUAL(end - result.begin(), 3);
+
+  ASSERT_EQUAL(result[0], 1);
+  ASSERT_EQUAL(result[1], 1);
+  ASSERT_EQUAL(result[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveCopyCudaStreams);
+
+
+void TestRemoveIfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
+                                           data.begin(), 
+                                           data.end(), 
+                                           is_even<T>());
+
+  ASSERT_EQUAL(end - data.begin(), 3);
+
+  ASSERT_EQUAL(data[0], 1);
+  ASSERT_EQUAL(data[1], 1);
+  ASSERT_EQUAL(data[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveIfCudaStreams);
+
+
+void TestRemoveIfStencilCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector stencil(5);
+  stencil[0] = 0;
+  stencil[1] = 1;
+  stencil[2] = 0;
+  stencil[3] = 0;
+  stencil[4] = 1;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
+                                           data.begin(), 
+                                           data.end(),
+                                           stencil.begin(),
+                                           thrust::identity<T>());
+
+  ASSERT_EQUAL(end - data.begin(), 3);
+
+  ASSERT_EQUAL(data[0], 1);
+  ASSERT_EQUAL(data[1], 1);
+  ASSERT_EQUAL(data[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveIfStencilCudaStreams);
+
+
+void TestRemoveCopyIfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
+                                                data.begin(), 
+                                                data.end(), 
+                                                result.begin(), 
+                                                is_even<T>());
+
+  ASSERT_EQUAL(end - result.begin(), 3);
+
+  ASSERT_EQUAL(result[0], 1);
+  ASSERT_EQUAL(result[1], 1);
+  ASSERT_EQUAL(result[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfCudaStreams);
+
+
+void TestRemoveCopyIfStencilCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  Vector stencil(5);
+  stencil[0] = 0;
+  stencil[1] = 1;
+  stencil[2] = 0;
+  stencil[3] = 0;
+  stencil[4] = 1;
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
+                                                data.begin(), 
+                                                data.end(), 
+                                                stencil.begin(),
+                                                result.begin(), 
+                                                thrust::identity<T>());
+
+  ASSERT_EQUAL(end - result.begin(), 3);
+
+  ASSERT_EQUAL(result[0], 1);
+  ASSERT_EQUAL(result[1], 1);
+  ASSERT_EQUAL(result[2], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestRemoveCopyIfStencilCudaStreams);
+
diff --git a/thrust/testing/cuda/remove.mk b/thrust/testing/cuda/remove.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/remove.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/replace.cu b/thrust/testing/cuda/replace.cu
new file mode 100644
index 0000000000000000000000000000000000000000..24a03b2d523cf930ae7cf6e958f17acb5f3252f2
--- /dev/null
+++ b/thrust/testing/cuda/replace.cu
@@ -0,0 +1,295 @@
+#include <unittest/unittest.h>
+#include <thrust/replace.h>
+#include <thrust/execution_policy.h>
+
+
+template <typename T>
+struct less_than_five
+{
+  __host__ __device__ bool operator()(const T &val) const {return val < 5;}
+};
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T1, typename T2>
+__global__
+void replace_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T1 old_value, T2 new_value)
+{
+  thrust::replace(exec, first, last, old_value, new_value);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestReplaceDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+  
+  T old_value = 0;
+  T new_value = 1;
+  
+  thrust::replace(h_data.begin(), h_data.end(), old_value, new_value);
+
+  replace_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), old_value, new_value);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+
+
+template<typename T>
+void TestReplaceDeviceSeq(const size_t n)
+{
+  TestReplaceDevice<T>(thrust::seq, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceDeviceSeq);
+
+template<typename T>
+void TestReplaceDeviceDevice(const size_t n)
+{
+  TestReplaceDevice<T>(thrust::device, n);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T1, typename T2>
+__global__
+void replace_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, T1 old_value, T2 new_value)
+{
+  thrust::replace_copy(exec, first, last, result, old_value, new_value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReplaceCopyDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  int old_value = 0;
+  int new_value = 1;
+  
+  thrust::host_vector<int>   h_dest(n);
+  thrust::device_vector<int> d_dest(n);
+  
+  thrust::replace_copy(h_data.begin(), h_data.end(), h_dest.begin(), old_value, new_value);
+
+  replace_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_dest.begin(), old_value, new_value);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+  ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+
+void TestReplaceCopyDeviceSeq()
+{
+  TestReplaceCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReplaceCopyDeviceSeq);
+
+void TestReplaceCopyDeviceDevice()
+{
+  TestReplaceCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReplaceCopyDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename T>
+__global__
+void replace_if_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, T new_value)
+{
+  thrust::replace_if(exec, first, last, pred, new_value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReplaceIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::replace_if(h_data.begin(), h_data.end(), less_than_five<int>(), 0);
+
+  replace_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+
+void TestReplaceIfDeviceSeq()
+{
+  TestReplaceIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReplaceIfDeviceSeq);
+
+void TestReplaceIfDeviceDevice()
+{
+  TestReplaceIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReplaceIfDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename T>
+__global__
+void replace_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, T new_value)
+{
+  thrust::replace_if(exec, first, last, stencil_first, pred, new_value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReplaceIfStencilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_stencil = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_stencil = h_stencil;
+  
+  thrust::replace_if(h_data.begin(), h_data.end(), h_stencil.begin(), less_than_five<int>(), 0);
+
+  replace_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+
+void TestReplaceIfStencilDeviceSeq()
+{
+  TestReplaceIfStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReplaceIfStencilDeviceSeq);
+
+void TestReplaceIfStencilDeviceDevice()
+{
+  TestReplaceIfStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReplaceIfStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename T>
+__global__
+void replace_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, Predicate pred, T new_value)
+{
+  thrust::replace_copy_if(exec, first, last, result, pred, new_value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReplaceCopyIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_dest(n);
+  thrust::device_vector<int> d_dest(n);
+  
+  thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<int>(), 0);
+
+  replace_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+  ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+
+void TestReplaceCopyIfDeviceSeq()
+{
+  TestReplaceCopyIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReplaceCopyIfDeviceSeq);
+
+void TestReplaceCopyIfDeviceDevice()
+{
+  TestReplaceCopyIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReplaceCopyIfDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename T>
+__global__
+void replace_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result, Predicate pred, T new_value)
+{
+  thrust::replace_copy_if(exec, first, last, stencil_first, result, pred, new_value);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReplaceCopyIfStencilDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int>   h_data = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::host_vector<int>   h_stencil = unittest::random_samples<int>(n);
+  thrust::device_vector<int> d_stencil = h_stencil;
+  
+  thrust::host_vector<int>   h_dest(n);
+  thrust::device_vector<int> d_dest(n);
+  
+  thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<int>(), 0);
+
+  replace_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_ALMOST_EQUAL(h_data, d_data);
+  ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+
+
+void TestReplaceCopyIfStencilDeviceSeq()
+{
+  TestReplaceCopyIfStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReplaceCopyIfStencilDeviceSeq);
+
+
+void TestReplaceCopyIfStencilDeviceDevice()
+{
+  TestReplaceCopyIfStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReplaceCopyIfStencilDeviceDevice);
+
+
+void TestReplaceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(5);
+  data[0] =  1; 
+  data[1] =  2; 
+  data[2] =  1;
+  data[3] =  3; 
+  data[4] =  2; 
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::replace(thrust::cuda::par.on(s), data.begin(), data.end(), (T) 1, (T) 4);
+  thrust::replace(thrust::cuda::par.on(s), data.begin(), data.end(), (T) 2, (T) 5);
+
+  cudaStreamSynchronize(s);
+
+  Vector result(5);
+  result[0] =  4; 
+  result[1] =  5; 
+  result[2] =  4;
+  result[3] =  3; 
+  result[4] =  5; 
+
+  ASSERT_EQUAL(data, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestReplaceCudaStreams);
+
diff --git a/thrust/testing/cuda/replace.mk b/thrust/testing/cuda/replace.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/replace.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/reverse.cu b/thrust/testing/cuda/reverse.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4f6dfab08fe9333e8b82327d14d04c5cbbdf6e08
--- /dev/null
+++ b/thrust/testing/cuda/reverse.cu
@@ -0,0 +1,149 @@
+#include <unittest/unittest.h>
+#include <thrust/reverse.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator>
+__global__
+void reverse_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
+{
+  thrust::reverse(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReverseDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int> h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+  
+  thrust::reverse(h_data.begin(), h_data.end());
+
+  reverse_kernel<<<1,1>>>(exec, raw_pointer_cast(d_data.data()), raw_pointer_cast(d_data.data() + d_data.size()));
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_data, d_data);
+};
+
+
+void TestReverseDeviceSeq()
+{
+  TestReverseDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReverseDeviceSeq);
+
+
+void TestReverseDeviceDevice()
+{
+  TestReverseDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReverseDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void reverse_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::reverse_copy(exec, first, last, result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestReverseCopyDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  thrust::host_vector<int> h_data = unittest::random_integers<int>(n);
+  thrust::device_vector<int> d_data = h_data;
+
+  thrust::host_vector<int> h_result(n);
+  thrust::device_vector<int> d_result(n);
+
+  thrust::reverse_copy(h_data.begin(), h_data.end(), h_result.begin());
+
+  reverse_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ASSERT_EQUAL(h_result, d_result);
+};
+
+
+void TestReverseCopyDeviceSeq()
+{
+  TestReverseCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestReverseCopyDeviceSeq);
+
+
+void TestReverseCopyDeviceDevice()
+{
+  TestReverseCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestReverseCopyDeviceDevice);
+
+
+void TestReverseCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  Vector data(5);
+  data[0] = 1;
+  data[1] = 2;
+  data[2] = 3;
+  data[3] = 4;
+  data[4] = 5;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::reverse(thrust::cuda::par.on(s), data.begin(), data.end());
+
+  cudaStreamSynchronize(s);
+
+  Vector ref(5);
+  ref[0] = 5;
+  ref[1] = 4;
+  ref[2] = 3;
+  ref[3] = 2;
+  ref[4] = 1;
+
+  ASSERT_EQUAL(ref, data);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestReverseCudaStreams);
+
+
+void TestReverseCopyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  Vector data(5);
+  data[0] = 1;
+  data[1] = 2;
+  data[2] = 3;
+  data[3] = 4;
+  data[4] = 5;
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::reverse_copy(thrust::cuda::par.on(s), data.begin(), data.end(), result.begin());
+
+  cudaStreamSynchronize(s);
+
+  Vector ref(5);
+  ref[0] = 5;
+  ref[1] = 4;
+  ref[2] = 3;
+  ref[3] = 2;
+  ref[4] = 1;
+
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestReverseCopyCudaStreams);
+
diff --git a/thrust/testing/cuda/reverse.mk b/thrust/testing/cuda/reverse.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/reverse.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/scan.cu b/thrust/testing/cuda/scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e67470cabf4b2381d745312a28f362099266dbd6
--- /dev/null
+++ b/thrust/testing/cuda/scan.cu
@@ -0,0 +1,214 @@
+#include <cstdio>
+#include <unittest/unittest.h>
+#include <thrust/scan.h>
+#include <thrust/functional.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::inclusive_scan(exec, first, last, result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void exclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::exclusive_scan(exec, first, last, result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T>
+__global__
+void exclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, T init)
+{
+  thrust::exclusive_scan(exec, first, last, result, init);
+}
+
+
+template<typename T, typename ExecutionPolicy>
+void TestScanDevice(ExecutionPolicy exec, const size_t n)
+{
+  thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_input = h_input;
+  
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+  
+  thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+
+  inclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_output, h_output);
+  
+  thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+
+  exclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_output, h_output);
+  
+  thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), (T) 11);
+
+  exclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), (T) 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_output, h_output);
+  
+  // in-place scans
+  h_output = h_input;
+  d_output = d_input;
+
+  thrust::inclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+
+  inclusive_scan_kernel<<<1,1>>>(exec, d_output.begin(), d_output.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(d_output, h_output);
+  
+  h_output = h_input;
+  d_output = d_input;
+  
+  thrust::exclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+
+  exclusive_scan_kernel<<<1,1>>>(exec, d_output.begin(), d_output.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(d_output, h_output);
+}
+
+
+template<typename T>
+struct TestScanDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestScanDevice<T>(thrust::seq, n);
+  }
+};
+VariableUnitTest<TestScanDeviceSeq, IntegralTypes> TestScanDeviceSeqInstance;
+
+
+template<typename T>
+struct TestScanDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestScanDevice<T>(thrust::device, n);
+  }
+};
+VariableUnitTest<TestScanDeviceDevice, IntegralTypes> TestScanDeviceDeviceInstance;
+
+
+void TestScanCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector::iterator iter;
+
+  Vector input(5);
+  Vector result(5);
+  Vector output(5);
+
+  input[0] = 1; input[1] = 3; input[2] = -2; input[3] = 4; input[4] = -5;
+
+  Vector input_copy(input);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  // inclusive scan
+  iter = thrust::inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin());
+  cudaStreamSynchronize(s);
+
+  result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // exclusive scan
+  iter = thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), 0);
+  cudaStreamSynchronize(s);
+
+  result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // exclusive scan with init
+  iter = thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), 3);
+  cudaStreamSynchronize(s);
+
+  result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // inclusive scan with op
+  iter = thrust::inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+
+  // exclusive scan with init and op
+  iter = thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), 3, thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+
+  // inplace inclusive scan
+  input = input_copy;
+  iter = thrust::inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), input.begin());
+  cudaStreamSynchronize(s);
+
+  result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(input, result);
+
+  // inplace exclusive scan with init
+  input = input_copy;
+  iter = thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), input.begin(), 3);
+  cudaStreamSynchronize(s);
+
+  result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(input, result);
+
+  // inplace exclusive scan with implicit init=0
+  input = input_copy;
+  iter = thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), input.begin());
+  cudaStreamSynchronize(s);
+
+  result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(input, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestScanCudaStreams);
+
diff --git a/thrust/testing/cuda/scan.mk b/thrust/testing/cuda/scan.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/scan_by_key.cu b/thrust/testing/cuda/scan_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e65560edfc1f2f6999378f8dbdf100b20869401a
--- /dev/null
+++ b/thrust/testing/cuda/scan_by_key.cu
@@ -0,0 +1,249 @@
+#include <unittest/unittest.h>
+#include <thrust/scan.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void inclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
+{
+  thrust::inclusive_scan_by_key(exec, keys_first, keys_last, values_first, result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void exclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
+{
+  thrust::exclusive_scan_by_key(exec, keys_first, keys_last, values_first, result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename T>
+__global__
+void exclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result, T init)
+{
+  thrust::exclusive_scan_by_key(exec, keys_first, keys_last, values_first, result, init);
+}
+
+
+template<typename ExecutionPolicy>
+void TestScanByKeyDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+
+  thrust::host_vector<int> h_keys(n);
+  for(size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if(rand() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+  
+  thrust::host_vector<int>   h_vals = unittest::random_integers<int>(n);
+  for(size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = i % 10;
+  }
+  thrust::device_vector<int> d_vals = h_vals;
+  
+  thrust::host_vector<int>   h_output(n);
+  thrust::device_vector<int> d_output(n);
+  
+  thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+  inclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_output, h_output);
+  
+  thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+  exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_output, h_output);
+  
+  thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), 11);
+  exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_output, h_output);
+  
+  // in-place scans
+  h_output = h_vals;
+  d_output = d_vals;
+  thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
+  inclusive_scan_by_key_kernel<<<1,1>>>(exec,d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_output, h_output);
+  
+  h_output = h_vals;
+  d_output = d_vals;
+  thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), 11);
+  exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_output, h_output);
+}
+
+
+void TestScanByKeyDeviceSeq()
+{
+  TestScanByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestScanByKeyDeviceSeq);
+
+
+void TestScanByKeyDeviceDevice()
+{
+  TestScanByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestScanByKeyDeviceDevice);
+
+
+void TestInclusiveScanByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator iter = thrust::inclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0],  1);
+  ASSERT_EQUAL(output[1],  2);
+  ASSERT_EQUAL(output[2],  5);
+  ASSERT_EQUAL(output[3],  9);
+  ASSERT_EQUAL(output[4],  5);
+  ASSERT_EQUAL(output[5],  6);
+  ASSERT_EQUAL(output[6], 13);
+  
+  thrust::inclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>(), thrust::multiplies<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0],  1);
+  ASSERT_EQUAL(output[1],  2);
+  ASSERT_EQUAL(output[2],  6);
+  ASSERT_EQUAL(output[3], 24);
+  ASSERT_EQUAL(output[4],  5);
+  ASSERT_EQUAL(output[5],  6);
+  ASSERT_EQUAL(output[6], 42);
+  
+  thrust::inclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0],  1);
+  ASSERT_EQUAL(output[1],  2);
+  ASSERT_EQUAL(output[2],  5);
+  ASSERT_EQUAL(output[3],  9);
+  ASSERT_EQUAL(output[4],  5);
+  ASSERT_EQUAL(output[5],  6);
+  ASSERT_EQUAL(output[6], 13);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyCudaStreams);
+
+
+void TestExclusiveScanByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  Iterator iter = thrust::exclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0], 0);
+  ASSERT_EQUAL(output[1], 0);
+  ASSERT_EQUAL(output[2], 2);
+  ASSERT_EQUAL(output[3], 5);
+  ASSERT_EQUAL(output[4], 0);
+  ASSERT_EQUAL(output[5], 0);
+  ASSERT_EQUAL(output[6], 6);
+
+  thrust::exclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin(), T(10));
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+  
+  thrust::exclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>(), thrust::multiplies<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 20);
+  ASSERT_EQUAL(output[3], 60);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 60);
+  
+  thrust::exclusive_scan_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/scan_by_key.mk b/thrust/testing/cuda/scan_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/scan_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/scatter.cu b/thrust/testing/cuda/scatter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..52bd9755f58c811cd2e2ebf8c32e88662a3f38e2
--- /dev/null
+++ b/thrust/testing/cuda/scatter.cu
@@ -0,0 +1,182 @@
+#include <unittest/unittest.h>
+#include <thrust/scatter.h>
+#include <thrust/execution_policy.h>
+#include <algorithm>
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void scatter_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 map_first, Iterator3 result)
+{
+  thrust::scatter(exec, first, last, map_first, result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestScatterDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<int> h_input(n, 1);
+  thrust::device_vector<int> d_input(n, 1);
+  
+  thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+  
+  for(size_t i = 0; i < n; i++)
+  {
+    h_map[i] =  h_map[i] % output_size;
+  }
+  
+  thrust::device_vector<unsigned int> d_map = h_map;
+  
+  thrust::host_vector<int>   h_output(output_size, 0);
+  thrust::device_vector<int> d_output(output_size, 0);
+  
+  thrust::scatter(h_input.begin(), h_input.end(), h_map.begin(), h_output.begin());
+
+  scatter_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_map.begin(), d_output.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+
+void TestScatterDeviceSeq()
+{
+  TestScatterDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestScatterDeviceSeq);
+
+void TestScatterDeviceDevice()
+{
+  TestScatterDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestScatterDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Function>
+__global__
+void scatter_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 map_first, Iterator3 stencil_first, Iterator4 result, Function f)
+{
+  thrust::scatter_if(exec, first, last, map_first, stencil_first, result, f);
+}
+
+
+template<typename T>
+struct is_even_scatter_if
+{
+  __host__ __device__ bool operator()(const T i) const { return (i % 2) == 0; }
+};
+
+
+template<typename ExecutionPolicy>
+void TestScatterIfDevice(ExecutionPolicy exec)
+{
+  size_t n = 1000;
+  const size_t output_size = std::min((size_t) 10, 2 * n);
+  
+  thrust::host_vector<int> h_input(n, 1);
+  thrust::device_vector<int> d_input(n, 1);
+  
+  thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+  
+  for(size_t i = 0; i < n; i++)
+  {
+    h_map[i] =  h_map[i] % output_size;
+  }
+  
+  thrust::device_vector<unsigned int> d_map = h_map;
+  
+  thrust::host_vector<int>   h_output(output_size, 0);
+  thrust::device_vector<int> d_output(output_size, 0);
+  
+  thrust::scatter_if(h_input.begin(), h_input.end(), h_map.begin(), h_map.begin(), h_output.begin(), is_even_scatter_if<unsigned int>());
+
+  scatter_if_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_map.begin(), d_map.begin(), d_output.begin(), is_even_scatter_if<unsigned int>());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+
+
+void TestScatterIfDeviceSeq()
+{
+  TestScatterIfDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestScatterIfDeviceSeq);
+
+
+void TestScatterIfDeviceDevice()
+{
+  TestScatterIfDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestScatterIfDeviceDevice);
+
+
+void TestScatterCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector map(5);  // scatter indices
+  Vector src(5);  // source vector
+  Vector dst(8);  // destination vector
+
+  map[0] = 6; map[1] = 3; map[2] = 1; map[3] = 7; map[4] = 2;
+  src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4;
+  dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0; dst[5] = 0; dst[6] = 0; dst[7] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::scatter(thrust::cuda::par.on(s), src.begin(), src.end(), map.begin(), dst.begin());
+
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(dst[0], 0);
+  ASSERT_EQUAL(dst[1], 2);
+  ASSERT_EQUAL(dst[2], 4);
+  ASSERT_EQUAL(dst[3], 1);
+  ASSERT_EQUAL(dst[4], 0);
+  ASSERT_EQUAL(dst[5], 0);
+  ASSERT_EQUAL(dst[6], 0);
+  ASSERT_EQUAL(dst[7], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestScatterCudaStreams);
+
+
+void TestScatterIfCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector flg(5);  // predicate array
+  Vector map(5);  // scatter indices
+  Vector src(5);  // source vector
+  Vector dst(8);  // destination vector
+  
+  flg[0] = 0; flg[1] = 1; flg[2] = 0; flg[3] = 1; flg[4] = 0;
+  map[0] = 6; map[1] = 3; map[2] = 1; map[3] = 7; map[4] = 2;
+  src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4;
+  dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0; dst[5] = 0; dst[6] = 0; dst[7] = 0;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::scatter_if(thrust::cuda::par.on(s), src.begin(), src.end(), map.begin(), flg.begin(), dst.begin());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(dst[0], 0);
+  ASSERT_EQUAL(dst[1], 0);
+  ASSERT_EQUAL(dst[2], 0);
+  ASSERT_EQUAL(dst[3], 1);
+  ASSERT_EQUAL(dst[4], 0);
+  ASSERT_EQUAL(dst[5], 0);
+  ASSERT_EQUAL(dst[6], 0);
+  ASSERT_EQUAL(dst[7], 3);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestScatterIfCudaStreams);
+
diff --git a/thrust/testing/cuda/scatter.mk b/thrust/testing/cuda/scatter.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/scatter.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/sequence.cu b/thrust/testing/cuda/sequence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..acbe09848cf32b5c9df9562fec9eb945e0560d2c
--- /dev/null
+++ b/thrust/testing/cuda/sequence.cu
@@ -0,0 +1,123 @@
+#include <unittest/unittest.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator>
+__global__
+void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
+{
+  thrust::sequence(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T>
+__global__
+void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init)
+{
+  thrust::sequence(exec, first, last, init);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T>
+__global__
+void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init, T step)
+{
+  thrust::sequence(exec, first, last, init, step);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSequenceDevice(ExecutionPolicy exec)
+{
+  thrust::device_vector<int> v(5);
+  
+  sequence_kernel<<<1,1>>>(exec, v.begin(), v.end());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 2);
+  ASSERT_EQUAL(v[3], 3);
+  ASSERT_EQUAL(v[4], 4);
+  
+  sequence_kernel<<<1,1>>>(exec, v.begin(), v.end(), 10);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], 10);
+  ASSERT_EQUAL(v[1], 11);
+  ASSERT_EQUAL(v[2], 12);
+  ASSERT_EQUAL(v[3], 13);
+  ASSERT_EQUAL(v[4], 14);
+  
+  sequence_kernel<<<1,1>>>(exec, v.begin(), v.end(), 10, 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], 10);
+  ASSERT_EQUAL(v[1], 12);
+  ASSERT_EQUAL(v[2], 14);
+  ASSERT_EQUAL(v[3], 16);
+  ASSERT_EQUAL(v[4], 18);
+}
+
+void TestSequenceDeviceSeq()
+{
+  TestSequenceDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSequenceDeviceSeq);
+
+void TestSequenceDeviceDevice()
+{
+  TestSequenceDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSequenceDeviceDevice);
+
+void TestSequenceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector v(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::sequence(thrust::cuda::par.on(s), v.begin(), v.end());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 2);
+  ASSERT_EQUAL(v[3], 3);
+  ASSERT_EQUAL(v[4], 4);
+
+  thrust::sequence(thrust::cuda::par.on(s), v.begin(), v.end(), 10);
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0], 10);
+  ASSERT_EQUAL(v[1], 11);
+  ASSERT_EQUAL(v[2], 12);
+  ASSERT_EQUAL(v[3], 13);
+  ASSERT_EQUAL(v[4], 14);
+  
+  thrust::sequence(thrust::cuda::par.on(s), v.begin(), v.end(), 10, 2);
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0], 10);
+  ASSERT_EQUAL(v[1], 12);
+  ASSERT_EQUAL(v[2], 14);
+  ASSERT_EQUAL(v[3], 16);
+  ASSERT_EQUAL(v[4], 18);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSequenceCudaStreams);
+
diff --git a/thrust/testing/cuda/sequence.mk b/thrust/testing/cuda/sequence.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/sequence.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_difference.cu b/thrust/testing/cuda/set_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d87db42d96cac3ac91f7c6dfdee44d168596383a
--- /dev/null
+++ b/thrust/testing/cuda/set_difference.cu
@@ -0,0 +1,84 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
+__global__
+void set_difference_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator2 last2, Iterator3 result1, Iterator4 result2)
+{
+  *result2 = thrust::set_difference(exec, first1, last1, first2, last2, result1);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetDifferenceDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 5;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 6;
+
+  Vector ref(2);
+  ref[0] = 2; ref[1] = 5;
+
+  Vector result(2);
+
+  thrust::device_vector<Iterator> end_vec(1);
+
+  set_difference_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), b.end(), result.begin(), end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  Iterator end = end_vec.front();
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+
+
+void TestSetDifferenceDeviceSeq()
+{
+  TestSetDifferenceDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetDifferenceDeviceSeq);
+
+
+void TestSetDifferenceDeviceDevice()
+{
+  TestSetDifferenceDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetDifferenceDeviceDevice);
+
+
+void TestSetDifferenceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 5;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 6;
+
+  Vector ref(2);
+  ref[0] = 2; ref[1] = 5;
+
+  Vector result(2);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator end = thrust::set_difference(thrust::cuda::par.on(s), a.begin(), a.end(), b.begin(), b.end(), result.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetDifferenceCudaStreams);
+
diff --git a/thrust/testing/cuda/set_difference.mk b/thrust/testing/cuda/set_difference.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_difference_by_key.cu b/thrust/testing/cuda/set_difference_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..31d2860b09a1204790df1e70cbd730b5d0c7f88b
--- /dev/null
+++ b/thrust/testing/cuda/set_difference_by_key.cu
@@ -0,0 +1,128 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
+__global__
+void set_difference_by_key_kernel(ExecutionPolicy exec,
+                                  Iterator1 keys_first1, Iterator1 keys_last1,
+                                  Iterator2 keys_first2, Iterator2 keys_last2,
+                                  Iterator3 values_first1,
+                                  Iterator4 values_first2,
+                                  Iterator5 keys_result,
+                                  Iterator6 values_result,
+                                  Iterator7 result)
+{
+  *result = thrust::set_difference_by_key(exec,
+                                          keys_first1, keys_last1,
+                                          keys_first2, keys_last2,
+                                          values_first1,
+                                          values_first2,
+                                          keys_result,
+                                          values_result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetDifferenceByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 5;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 6;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 2; ref_key[1] = 5;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  typedef thrust::pair<Iterator,Iterator> iter_pair;
+
+  thrust::device_vector<iter_pair> end_vec(1);
+
+  set_difference_by_key_kernel<<<1,1>>>(exec,
+                                        a_key.begin(), a_key.end(),
+                                        b_key.begin(), b_key.end(),
+                                        a_val.begin(),
+                                        b_val.begin(),
+                                        result_key.begin(),
+                                        result_val.begin(),
+                                        end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter_pair end = end_vec.front();
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+
+
+void TestSetDifferenceByKeyDeviceSeq()
+{
+  TestSetDifferenceByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetDifferenceByKeyDeviceSeq);
+
+
+void TestSetDifferenceByKeyDeviceDevice()
+{
+  TestSetDifferenceByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetDifferenceByKeyDeviceDevice);
+
+
+void TestSetDifferenceByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 5;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 6;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 2; ref_key[1] = 5;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_difference_by_key(thrust::cuda::par.on(s),
+                                  a_key.begin(), a_key.end(),
+                                  b_key.begin(), b_key.end(),
+                                  a_val.begin(),
+                                  b_val.begin(),
+                                  result_key.begin(),
+                                  result_val.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetDifferenceByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/set_difference_by_key.mk b/thrust/testing/cuda/set_difference_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_intersection.cu b/thrust/testing/cuda/set_intersection.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a57bc1b2adf526ee2b87a650375344d42af472cf
--- /dev/null
+++ b/thrust/testing/cuda/set_intersection.cu
@@ -0,0 +1,92 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/discard_iterator.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
+__global__
+void set_intersection_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1,
+                             Iterator2 first2, Iterator2 last2,
+                             Iterator3 result1,
+                             Iterator4 result2)
+{
+  *result2 = thrust::set_intersection(exec, first1, last1, first2, last2, result1);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(2);
+  ref[0] = 0; ref[1] = 4;
+
+  Vector result(2);
+  thrust::device_vector<Iterator> end_vec(1);
+
+  set_intersection_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), b.end(), result.begin(), end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  Iterator end = end_vec.front();
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+
+
+void TestSetIntersectionDeviceSeq()
+{
+  TestSetIntersectionDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetIntersectionDeviceSeq);
+
+
+void TestSetIntersectionDeviceDevice()
+{
+  TestSetIntersectionDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetIntersectionDeviceDevice);
+
+
+void TestSetIntersectionCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(2);
+  ref[0] = 0; ref[1] = 4;
+
+  Vector result(2);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator end = thrust::set_intersection(thrust::cuda::par.on(s),
+                                          a.begin(), a.end(),
+                                          b.begin(), b.end(),
+                                          result.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetIntersectionCudaStreams);
+
diff --git a/thrust/testing/cuda/set_intersection.mk b/thrust/testing/cuda/set_intersection.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_intersection.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_intersection_by_key.cu b/thrust/testing/cuda/set_intersection_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a19f822216b7191535a079f8331ee97e5be0c73d
--- /dev/null
+++ b/thrust/testing/cuda/set_intersection_by_key.cu
@@ -0,0 +1,115 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6>
+__global__
+void set_intersection_by_key_kernel(ExecutionPolicy exec,
+                                    Iterator1 keys_first1, Iterator1 keys_last1,
+                                    Iterator2 keys_first2, Iterator2 keys_last2,
+                                    Iterator3 values_first1,
+                                    Iterator4 keys_result,
+                                    Iterator5 values_result,
+                                    Iterator6 result)
+{
+  *result = thrust::set_intersection_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 0; ref_key[1] = 4;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  typedef thrust::pair<Iterator,Iterator> iter_pair;
+  thrust::device_vector<iter_pair> end_vec(1);
+
+  set_intersection_by_key_kernel<<<1,1>>>(exec,
+                                          a_key.begin(), a_key.end(),
+                                          b_key.begin(), b_key.end(),
+                                          a_val.begin(),
+                                          result_key.begin(),
+                                          result_val.begin(),
+                                          end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  thrust::pair<Iterator,Iterator> end = end_vec.front();
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+
+
+void TestSetIntersectionByKeyDeviceSeq()
+{
+  TestSetIntersectionByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceSeq);
+
+
+void TestSetIntersectionByKeyDeviceDevice()
+{
+  TestSetIntersectionByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice);
+
+
+void TestSetIntersectionByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 0; ref_key[1] = 4;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_intersection_by_key(thrust::cuda::par.on(s),
+                                    a_key.begin(), a_key.end(),
+                                    b_key.begin(), b_key.end(),
+                                    a_val.begin(),
+                                    result_key.begin(),
+                                    result_val.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/set_intersection_by_key.mk b/thrust/testing/cuda/set_intersection_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_intersection_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_symmetric_difference.cu b/thrust/testing/cuda/set_symmetric_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..34969886e0d30ffa89f4ee0de4880188076aaf44
--- /dev/null
+++ b/thrust/testing/cuda/set_symmetric_difference.cu
@@ -0,0 +1,94 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
+__global__
+void set_symmetric_difference_kernel(ExecutionPolicy exec,
+                                     Iterator1 first1, Iterator1 last1,
+                                     Iterator2 first2, Iterator2 last2,
+                                     Iterator3 result1,
+                                     Iterator4 result2)
+{
+  *result2 = thrust::set_symmetric_difference(exec, first1, last1, first2, last2, result1);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetSymmetricDifferenceDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 6;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 7;
+
+  Vector ref(5);
+  ref[0] = 2; ref[1] = 3; ref[2] = 3; ref[3] = 6; ref[4] = 7;
+
+  Vector result(5);
+  thrust::device_vector<Iterator> end_vec(1);
+
+  set_symmetric_difference_kernel<<<1,1>>>(exec,
+                                           a.begin(), a.end(),
+                                           b.begin(), b.end(),
+                                           result.begin(),
+                                           end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  Iterator end = end_vec[0];
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+
+
+void TestSetSymmetricDifferenceDeviceSeq()
+{
+  TestSetSymmetricDifferenceDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceDeviceSeq);
+
+
+void TestSetSymmetricDifferenceDeviceDevice()
+{
+  TestSetSymmetricDifferenceDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceDeviceDevice);
+
+
+void TestSetSymmetricDifferenceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 6;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 7;
+
+  Vector ref(5);
+  ref[0] = 2; ref[1] = 3; ref[2] = 3; ref[3] = 6; ref[4] = 7;
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator end = thrust::set_symmetric_difference(thrust::cuda::par.on(s),
+                                                  a.begin(), a.end(),
+                                                  b.begin(), b.end(),
+                                                  result.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceCudaStreams);
+
diff --git a/thrust/testing/cuda/set_symmetric_difference.mk b/thrust/testing/cuda/set_symmetric_difference.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_symmetric_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_symmetric_difference_by_key.cu b/thrust/testing/cuda/set_symmetric_difference_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3a6c68ce9e72d54b8921d0e05ac69b5e7ac33ffb
--- /dev/null
+++ b/thrust/testing/cuda/set_symmetric_difference_by_key.cu
@@ -0,0 +1,120 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
+__global__
+void set_symmetric_difference_by_key_kernel(ExecutionPolicy exec,
+                                            Iterator1 keys_first1, Iterator1 keys_last1,
+                                            Iterator2 keys_first2, Iterator2 keys_last2,
+                                            Iterator3 values_first1,
+                                            Iterator4 values_first2,
+                                            Iterator5 keys_result,
+                                            Iterator6 values_result,
+                                            Iterator7 result)
+{
+  *result = thrust::set_symmetric_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetSymmetricDifferenceByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 6;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 7;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 2; ref_key[1] = 3; ref_key[2] = 3; ref_key[3] = 6; ref_key[4] = 7;
+  ref_val[0] = 0; ref_val[1] = 1; ref_val[2] = 1; ref_val[3] = 0; ref_val[4] = 1;
+
+  Vector result_key(5), result_val(5);
+
+  typedef thrust::pair<Iterator,Iterator> iter_pair;
+  thrust::device_vector<iter_pair> end_vec(1);
+
+  set_symmetric_difference_by_key_kernel<<<1,1>>>(exec,
+                                                  a_key.begin(), a_key.end(),
+                                                  b_key.begin(), b_key.end(),
+                                                  a_val.begin(),
+                                                  b_val.begin(),
+                                                  result_key.begin(),
+                                                  result_val.begin(),
+                                                  end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter_pair end = end_vec[0];
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+
+
+void TestSetSymmetricDifferenceByKeyDeviceSeq()
+{
+  TestSetSymmetricDifferenceByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDeviceSeq);
+
+
+void TestSetSymmetricDifferenceByKeyDeviceDevice()
+{
+  TestSetSymmetricDifferenceByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDeviceDevice);
+
+
+void TestSetSymmetricDifferenceByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 6;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 7;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 2; ref_key[1] = 3; ref_key[2] = 3; ref_key[3] = 6; ref_key[4] = 7;
+  ref_val[0] = 0; ref_val[1] = 1; ref_val[2] = 1; ref_val[3] = 0; ref_val[4] = 1;
+
+  Vector result_key(5), result_val(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_symmetric_difference_by_key(thrust::cuda::par.on(s),
+                                            a_key.begin(), a_key.end(),
+                                            b_key.begin(), b_key.end(),
+                                            a_val.begin(),
+                                            b_val.begin(),
+                                            result_key.begin(),
+                                            result_val.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/set_symmetric_difference_by_key.mk b/thrust/testing/cuda/set_symmetric_difference_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_symmetric_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_union.cu b/thrust/testing/cuda/set_union.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fb5b543e153bc5b3b255e35ad50b5b1adbe4d565
--- /dev/null
+++ b/thrust/testing/cuda/set_union.cu
@@ -0,0 +1,94 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
+__global__
+void set_union_kernel(ExecutionPolicy exec,
+                      Iterator1 first1, Iterator1 last1,
+                      Iterator2 first2, Iterator2 last2,
+                      Iterator3 result1,
+                      Iterator4 result2)
+{
+  *result2 = thrust::set_union(exec, first1, last1, first2, last2, result1);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetUnionDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(5);
+  ref[0] = 0; ref[1] = 2; ref[2] = 3; ref[3] = 3; ref[4] = 4;
+
+  Vector result(5);
+  thrust::device_vector<Iterator> end_vec(1);
+
+  set_union_kernel<<<1,1>>>(exec,
+                            a.begin(), a.end(),
+                            b.begin(), b.end(),
+                            result.begin(),
+                            end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  Iterator end = end_vec[0];
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+
+
+void TestSetUnionDeviceSeq()
+{
+  TestSetUnionDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetUnionDeviceSeq);
+
+
+void TestSetUnionDeviceDevice()
+{
+  TestSetUnionDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetUnionDeviceDevice);
+
+
+void TestSetUnionCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(5);
+  ref[0] = 0; ref[1] = 2; ref[2] = 3; ref[3] = 3; ref[4] = 4;
+
+  Vector result(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  Iterator end = thrust::set_union(thrust::cuda::par.on(s),
+                                   a.begin(), a.end(),
+                                   b.begin(), b.end(),
+                                   result.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetUnionCudaStreams);
+
diff --git a/thrust/testing/cuda/set_union.mk b/thrust/testing/cuda/set_union.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_union.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/set_union_by_key.cu b/thrust/testing/cuda/set_union_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1be3d9302397823da335b438009a8b0ad5159cf6
--- /dev/null
+++ b/thrust/testing/cuda/set_union_by_key.cu
@@ -0,0 +1,119 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
+__global__
+void set_union_by_key_kernel(ExecutionPolicy exec,
+                             Iterator1 keys_first1, Iterator1 keys_last1,
+                             Iterator2 keys_first2, Iterator2 keys_last2,
+                             Iterator3 values_first1,
+                             Iterator4 values_first2,
+                             Iterator5 keys_result,
+                             Iterator6 values_result,
+                             Iterator7 result)
+{
+  *result = thrust::set_union_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSetUnionByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3), b_val(4);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 0; ref_key[1] = 2; ref_key[2] = 3; ref_key[3] = 3; ref_key[4] = 4;
+  ref_val[0] = 0; ref_val[1] = 0; ref_val[2] = 1; ref_val[3] = 1; ref_val[4] = 0;
+
+  Vector result_key(5), result_val(5);
+
+  thrust::device_vector<thrust::pair<Iterator,Iterator> > end_vec(1);
+
+  set_union_by_key_kernel<<<1,1>>>(exec,
+                                   a_key.begin(), a_key.end(),
+                                   b_key.begin(), b_key.end(),
+                                   a_val.begin(),
+                                   b_val.begin(),
+                                   result_key.begin(),
+                                   result_val.begin(),
+                                   end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  thrust::pair<Iterator,Iterator> end = end_vec[0];
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+
+
+void TestSetUnionByKeyDeviceSeq()
+{
+  TestSetUnionByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSetUnionByKeyDeviceSeq);
+
+
+void TestSetUnionByKeyDeviceDevice()
+{
+  TestSetUnionByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSetUnionByKeyDeviceDevice);
+
+
+void TestSetUnionByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3), b_val(4);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 0; ref_key[1] = 2; ref_key[2] = 3; ref_key[3] = 3; ref_key[4] = 4;
+  ref_val[0] = 0; ref_val[1] = 0; ref_val[2] = 1; ref_val[3] = 1; ref_val[4] = 0;
+
+  Vector result_key(5), result_val(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_union_by_key(thrust::cuda::par.on(s),
+                             a_key.begin(), a_key.end(),
+                             b_key.begin(), b_key.end(),
+                             a_val.begin(),
+                             b_val.begin(),
+                             result_key.begin(),
+                             result_val.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSetUnionByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/set_union_by_key.mk b/thrust/testing/cuda/set_union_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/set_union_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/sort.cu b/thrust/testing/cuda/sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7f3d6413c7d2f915b710e7db9eeeb4ca7b371400
--- /dev/null
+++ b/thrust/testing/cuda/sort.cu
@@ -0,0 +1,170 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Compare, typename Iterator2>
+__global__
+void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  thrust::sort(exec, first, last, comp);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+template<typename T>
+struct my_less
+{
+  __host__ __device__
+  bool operator()(const T& lhs, const T& rhs) const
+  {
+    return lhs < rhs;
+  }
+};
+
+
+template<typename T, typename ExecutionPolicy, typename Compare>
+void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp)
+{
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+  
+  thrust::device_vector<bool> is_supported(1);
+
+  sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+
+  if(is_supported[0])
+  {
+    thrust::sort(h_data.begin(), h_data.end(), comp);
+    
+    ASSERT_EQUAL(h_data, d_data);
+  }
+};
+
+
+template<typename T>
+  struct TestComparisonSortDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestComparisonSortDevice<T>(thrust::seq, n, my_less<T>());
+  }
+};
+VariableUnitTest<
+  TestComparisonSortDeviceSeq,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestComparisonSortDeviceSeqInstance;
+
+
+template<typename T>
+  struct TestComparisonSortDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestComparisonSortDevice<T>(thrust::device, n, my_less<T>());
+  }
+};
+VariableUnitTest<
+  TestComparisonSortDeviceDevice,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestComparisonSortDeviceDeviceDeviceInstance;
+
+
+template<typename T, typename ExecutionPolicy>
+void TestSortDevice(ExecutionPolicy exec, const size_t n)
+{
+  TestComparisonSortDevice<T>(exec, n, thrust::less<T>());
+};
+
+
+template<typename T>
+  struct TestSortDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestSortDevice<T>(thrust::seq, n);
+  }
+};
+VariableUnitTest<
+  TestSortDeviceSeq,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestSortDeviceSeqInstance;
+
+
+template<typename T>
+  struct TestSortDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestSortDevice<T>(thrust::device, n);
+  }
+};
+VariableUnitTest<
+  TestSortDeviceDevice,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestSortDeviceDeviceInstance;
+
+
+void TestSortCudaStreams()
+{
+  thrust::device_vector<int> keys(10);
+
+  keys[0] = 9;
+  keys[1] = 3;
+  keys[2] = 2;
+  keys[3] = 0;
+  keys[4] = 4;
+  keys[5] = 7;
+  keys[6] = 8;
+  keys[7] = 1;
+  keys[8] = 5;
+  keys[9] = 6;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::sort(thrust::cuda::par.on(s), keys.begin(), keys.end());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
+                      
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSortCudaStreams);
+
+
+void TestComparisonSortCudaStreams()
+{
+  thrust::device_vector<int> keys(10);
+
+  keys[0] = 9;
+  keys[1] = 3;
+  keys[2] = 2;
+  keys[3] = 0;
+  keys[4] = 4;
+  keys[5] = 7;
+  keys[6] = 8;
+  keys[7] = 1;
+  keys[8] = 5;
+  keys[9] = 6;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::sort(thrust::cuda::par.on(s), keys.begin(), keys.end(), my_less<int>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less<int>()));
+                      
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestComparisonSortCudaStreams);
+
diff --git a/thrust/testing/cuda/sort.mk b/thrust/testing/cuda/sort.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/sort_by_key.cu b/thrust/testing/cuda/sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1e848879b11ae5a0e41ccd9620fa6d989e0c3d01
--- /dev/null
+++ b/thrust/testing/cuda/sort_by_key.cu
@@ -0,0 +1,176 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Compare, typename Iterator3>
+__global__
+void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp, Iterator3 is_supported)
+{
+#if (__CUDA_ARCH__ >= 200)
+  *is_supported = true;
+  thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
+#else
+  *is_supported = false;
+#endif
+}
+
+
+template<typename T>
+struct my_less
+{
+  __host__ __device__
+  bool operator()(const T& lhs, const T& rhs) const
+  {
+    return lhs < rhs;
+  }
+};
+
+
+template<typename T, typename ExecutionPolicy, typename Compare>
+void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare comp)
+{
+  thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_keys = h_keys;
+
+  thrust::host_vector<T>   h_values = h_keys;
+  thrust::device_vector<T> d_values = d_keys;
+  
+  thrust::device_vector<bool> is_supported(1);
+  sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp, is_supported.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  if(is_supported[0])
+  {
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp);
+    
+    ASSERT_EQUAL(h_keys, d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+  }
+};
+
+
+template<typename T>
+  struct TestComparisonSortByKeyDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestComparisonSortByKeyDevice<T>(thrust::seq, n, my_less<T>());
+  }
+};
+VariableUnitTest<
+  TestComparisonSortByKeyDeviceSeq,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestComparisonSortByKeyDeviceSeqInstance;
+
+
+template<typename T>
+  struct TestComparisonSortByKeyDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestComparisonSortByKeyDevice<T>(thrust::device, n, my_less<T>());
+  }
+};
+VariableUnitTest<
+  TestComparisonSortByKeyDeviceDevice,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestComparisonSortByKeyDeviceDeviceDeviceInstance;
+
+
+template<typename T, typename ExecutionPolicy>
+void TestSortByKeyDevice(ExecutionPolicy exec, const size_t n)
+{
+  TestComparisonSortByKeyDevice<T>(exec, n, thrust::less<T>());
+};
+
+
+template<typename T>
+  struct TestSortByKeyDeviceSeq
+{
+  void operator()(const size_t n)
+  {
+    TestSortByKeyDevice<T>(thrust::seq, n);
+  }
+};
+VariableUnitTest<
+  TestSortByKeyDeviceSeq,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestSortByKeyDeviceSeqInstance;
+
+
+template<typename T>
+  struct TestSortByKeyDeviceDevice
+{
+  void operator()(const size_t n)
+  {
+    TestSortByKeyDevice<T>(thrust::device, n);
+  }
+};
+VariableUnitTest<
+  TestSortByKeyDeviceDevice,
+  unittest::type_list<unittest::int8_t,unittest::int32_t>
+> TestSortByKeyDeviceDeviceInstance;
+
+
+void TestComparisonSortByKeyCudaStreams()
+{
+  thrust::device_vector<int> keys(10);
+  thrust::device_vector<int> vals(10);
+
+  keys[0] = 9; vals[0] = 9;
+  keys[1] = 3; vals[1] = 3;
+  keys[2] = 2; vals[2] = 2;
+  keys[3] = 0; vals[3] = 0;
+  keys[4] = 4; vals[4] = 4;
+  keys[5] = 7; vals[5] = 7;
+  keys[6] = 8; vals[6] = 8;
+  keys[7] = 1; vals[7] = 1;
+  keys[8] = 5; vals[8] = 5;
+  keys[9] = 6; vals[9] = 6;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::sort_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin(), my_less<int>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
+  ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
+                      
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestComparisonSortByKeyCudaStreams);
+
+
+void TestSortByKeyCudaStreams()
+{
+  thrust::device_vector<int> keys(10);
+  thrust::device_vector<int> vals(10);
+
+  keys[0] = 9; vals[0] = 9;
+  keys[1] = 3; vals[1] = 3;
+  keys[2] = 2; vals[2] = 2;
+  keys[3] = 0; vals[3] = 0;
+  keys[4] = 4; vals[4] = 4;
+  keys[5] = 7; vals[5] = 7;
+  keys[6] = 8; vals[6] = 8;
+  keys[7] = 1; vals[7] = 1;
+  keys[8] = 5; vals[8] = 5;
+  keys[9] = 6; vals[9] = 6;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::sort_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), vals.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
+  ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
+                      
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSortByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/sort_by_key.mk b/thrust/testing/cuda/sort_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/stream_legacy.cu b/thrust/testing/cuda/stream_legacy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..51c82a096d265a0b75dea23bd0a06c53a1c0daf0
--- /dev/null
+++ b/thrust/testing/cuda/stream_legacy.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamLegacy);
+}
+
+void TestLegacyDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestLegacyDefaultStream);
diff --git a/thrust/testing/cuda/stream_per_thread.cmake b/thrust/testing/cuda/stream_per_thread.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..265f4fdc30b5b89daa0886a6cc5ab6765d9a5202
--- /dev/null
+++ b/thrust/testing/cuda/stream_per_thread.cmake
@@ -0,0 +1,11 @@
+# This test should always use per-thread streams on NVCC.
+set_target_properties(${test_target} PROPERTIES
+  COMPILE_OPTIONS
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:--default-stream=per-thread>
+)
+
+# NVC++ does not have an equivalent option, and will always
+# use the global stream by default.
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Feta")
+  set_tests_properties(${test_target} PROPERTIES WILL_FAIL ON)
+endif()
diff --git a/thrust/testing/cuda/stream_per_thread.cu b/thrust/testing/cuda/stream_per_thread.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ef126e78a5917916a6919e6fd249301d2d79fdd9
--- /dev/null
+++ b/thrust/testing/cuda/stream_per_thread.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamPerThread);
+}
+
+void TestPerThreadDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestPerThreadDefaultStream);
diff --git a/thrust/testing/cuda/stream_per_thread.mk b/thrust/testing/cuda/stream_per_thread.mk
new file mode 100644
index 0000000000000000000000000000000000000000..da9adfe1b8ed4f27084c153bdfd0241c1791b0a2
--- /dev/null
+++ b/thrust/testing/cuda/stream_per_thread.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += --default-stream per-thread
diff --git a/thrust/testing/cuda/swap_ranges.cu b/thrust/testing/cuda/swap_ranges.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e2392bbe2bed9ee8c5e157c8be3f76bc63a42d41
--- /dev/null
+++ b/thrust/testing/cuda/swap_ranges.cu
@@ -0,0 +1,85 @@
+#include <unittest/unittest.h>
+#include <thrust/swap.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void swap_ranges_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2)
+{
+  thrust::swap_ranges(exec, first1, last1, first2);
+}
+
+
+template<typename ExecutionPolicy>
+void TestSwapRangesDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+
+  Vector v2(5);
+  v2[0] = 5; v2[1] = 6; v2[2] = 7; v2[3] = 8; v2[4] = 9;
+
+  swap_ranges_kernel<<<1,1>>>(exec, v1.begin(), v1.end(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ASSERT_EQUAL(v1[0], 5);
+  ASSERT_EQUAL(v1[1], 6);
+  ASSERT_EQUAL(v1[2], 7);
+  ASSERT_EQUAL(v1[3], 8);
+  ASSERT_EQUAL(v1[4], 9);
+  
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+}
+
+void TestSwapRangesDeviceSeq()
+{
+  TestSwapRangesDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestSwapRangesDeviceSeq);
+
+void TestSwapRangesDeviceDevice()
+{
+  TestSwapRangesDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestSwapRangesDeviceDevice);
+
+void TestSwapRangesCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+
+  Vector v2(5);
+  v2[0] = 5; v2[1] = 6; v2[2] = 7; v2[3] = 8; v2[4] = 9;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::swap_ranges(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v1[0], 5);
+  ASSERT_EQUAL(v1[1], 6);
+  ASSERT_EQUAL(v1[2], 7);
+  ASSERT_EQUAL(v1[3], 8);
+  ASSERT_EQUAL(v1[4], 9);
+  
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestSwapRangesCudaStreams);
+
diff --git a/thrust/testing/cuda/swap_ranges.mk b/thrust/testing/cuda/swap_ranges.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/swap_ranges.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/tabulate.cu b/thrust/testing/cuda/tabulate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..564d85e7ec1584e9d0adada619140bd96651d00f
--- /dev/null
+++ b/thrust/testing/cuda/tabulate.cu
@@ -0,0 +1,114 @@
+#include <unittest/unittest.h>
+#include <thrust/tabulate.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename Function>
+__global__
+void tabulate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
+{
+  thrust::tabulate(exec, first, last, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTabulateDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  using namespace thrust::placeholders;
+  typedef typename Vector::value_type T;
+  
+  Vector v(5);
+
+  tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 2);
+  ASSERT_EQUAL(v[3], 3);
+  ASSERT_EQUAL(v[4], 4);
+
+  tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), -_1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(v[0],  0);
+  ASSERT_EQUAL(v[1], -1);
+  ASSERT_EQUAL(v[2], -2);
+  ASSERT_EQUAL(v[3], -3);
+  ASSERT_EQUAL(v[4], -4);
+  
+  tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), _1 * _1 * _1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 8);
+  ASSERT_EQUAL(v[3], 27);
+  ASSERT_EQUAL(v[4], 64);
+}
+
+void TestTabulateDeviceSeq()
+{
+  TestTabulateDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTabulateDeviceSeq);
+
+void TestTabulateDeviceDevice()
+{
+  TestTabulateDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTabulateDeviceDevice);
+
+void TestTabulateCudaStreams()
+{
+  using namespace thrust::placeholders;
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::tabulate(thrust::cuda::par.on(s), v.begin(), v.end(), thrust::identity<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 2);
+  ASSERT_EQUAL(v[3], 3);
+  ASSERT_EQUAL(v[4], 4);
+
+  thrust::tabulate(thrust::cuda::par.on(s), v.begin(), v.end(), -_1);
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0],  0);
+  ASSERT_EQUAL(v[1], -1);
+  ASSERT_EQUAL(v[2], -2);
+  ASSERT_EQUAL(v[3], -3);
+  ASSERT_EQUAL(v[4], -4);
+  
+  thrust::tabulate(thrust::cuda::par.on(s), v.begin(), v.end(), _1 * _1 * _1);
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 8);
+  ASSERT_EQUAL(v[3], 27);
+  ASSERT_EQUAL(v[4], 64);
+
+  cudaStreamSynchronize(s);
+}
+DECLARE_UNITTEST(TestTabulateCudaStreams);
+
diff --git a/thrust/testing/cuda/tabulate.mk b/thrust/testing/cuda/tabulate.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/tabulate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fa0358e57597de1e888f5d65da8c54b6c6db3a03
--- /dev/null
+++ b/thrust/testing/cuda/transform.cu
@@ -0,0 +1,328 @@
+#include <unittest/unittest.h>
+#include <thrust/transform.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function, typename Iterator3>
+__global__
+void transform_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function f, Iterator3 result2)
+{
+  *result2 = thrust::transform(exec, first, last, result1, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformUnaryDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input(3);
+  Vector output(3);
+  Vector result(3);
+  input[0]  =  1; input[1]  = -2; input[2]  =  3;
+  result[0] = -1; result[1] =  2; result[2] = -3;
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  transform_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(output, result);
+}
+
+void TestTransformUnaryDeviceSeq()
+{
+  TestTransformUnaryDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformUnaryDeviceSeq);
+
+void TestTransformUnaryDeviceDevice()
+{
+  TestTransformUnaryDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformUnaryDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function, typename Predicate, typename Iterator3>
+__global__
+void transform_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function f, Predicate pred, Iterator3 result2)
+{
+  *result2 = thrust::transform_if(exec, first, last, result1, f, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformIfUnaryNoStencilDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input(3);
+  Vector output(3);
+  Vector result(3);
+  
+  input[0]   =  0; input[1]   = -2; input[2]   =  0;
+  output[0]  = -1; output[1]  = -2; output[2]  = -3; 
+  result[0]  = -1; result[1]  =  2; result[2]  = -3;
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  transform_if_kernel<<<1,1>>>(exec,
+                               input.begin(), input.end(),
+                               output.begin(),
+                               thrust::negate<T>(),
+                               thrust::identity<T>(),
+                               iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(output, result);
+}
+
+void TestTransformIfUnaryNoStencilDeviceSeq()
+{
+  TestTransformIfUnaryNoStencilDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformIfUnaryNoStencilDeviceSeq);
+
+void TestTransformIfUnaryNoStencilDeviceDevice()
+{
+  TestTransformIfUnaryNoStencilDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformIfUnaryNoStencilDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Function, typename Predicate, typename Iterator4>
+__global__
+void transform_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result1, Function f, Predicate pred, Iterator4 result2)
+{
+  *result2 = thrust::transform_if(exec, first, last, stencil_first, result1, f, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformIfUnaryDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input(3);
+  Vector stencil(3);
+  Vector output(3);
+  Vector result(3);
+  
+  input[0]   =  1; input[1]   = -2; input[2]   =  3;
+  output[0]  =  1; output[1]  =  2; output[2]  =  3; 
+  stencil[0] =  1; stencil[1] =  0; stencil[2] =  1;
+  result[0]  = -1; result[1]  =  2; result[2]  = -3;
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  transform_if_kernel<<<1,1>>>(exec,
+                               input.begin(), input.end(),
+                               stencil.begin(),
+                               output.begin(),
+                               thrust::negate<T>(),
+                               thrust::identity<T>(),
+                               iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(output, result);
+}
+
+void TestTransformIfUnaryDeviceSeq()
+{
+  TestTransformIfUnaryDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformIfUnaryDeviceSeq);
+
+void TestTransformIfUnaryDeviceDevice()
+{
+  TestTransformIfUnaryDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformIfUnaryDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Function, typename Iterator4>
+__global__
+void transform_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result1, Function f, Iterator4 result2)
+{
+  *result2 = thrust::transform(exec, first1, last1, first2, result1, f);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformBinaryDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input1(3);
+  Vector input2(3);
+  Vector output(3);
+  Vector result(3);
+  input1[0] =  1; input1[1] = -2; input1[2] =  3;
+  input2[0] = -4; input2[1] =  5; input2[2] =  6;
+  result[0] =  5; result[1] = -7; result[2] = -3;
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  transform_kernel<<<1,1>>>(exec, input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>(), iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
+  ASSERT_EQUAL(output, result);
+}
+
+void TestTransformBinaryDeviceSeq()
+{
+  TestTransformBinaryDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformBinaryDeviceSeq);
+
+void TestTransformBinaryDeviceDevice()
+{
+  TestTransformBinaryDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformBinaryDeviceDevice);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Function, typename Predicate, typename Iterator5>
+__global__
+void transform_if_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 stencil_first, Iterator4 result1, Function f, Predicate pred, Iterator5 result2)
+{
+  *result2 = thrust::transform_if(exec, first1, last1, first2, stencil_first, result1, f, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformIfBinaryDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input1(3);
+  Vector input2(3);
+  Vector stencil(3);
+  Vector output(3);
+  Vector result(3);
+  
+  input1[0]  =  1; input1[1]  = -2; input1[2]  =  3;
+  input2[0]  = -4; input2[1]  =  5; input2[2]  =  6;
+  stencil[0] =  0; stencil[1] =  1; stencil[2] =  0;
+  output[0]  =  1; output[1]  =  2; output[2]  =  3;
+  result[0]  =  5; result[1]  =  2; result[2]  = -3;
+  
+  thrust::identity<T> identity;
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  transform_if_kernel<<<1,1>>>(exec,
+                               input1.begin(), input1.end(),
+                               input2.begin(),
+                               stencil.begin(),
+                               output.begin(),
+                               thrust::minus<T>(),
+                               thrust::not1(identity),
+                               iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
+  ASSERT_EQUAL(output, result);
+}
+
+void TestTransformIfBinaryDeviceSeq()
+{
+  TestTransformIfBinaryDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformIfBinaryDeviceSeq);
+
+void TestTransformIfBinaryDeviceDevice()
+{
+  TestTransformIfBinaryDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformIfBinaryDeviceDevice);
+
+void TestTransformUnaryCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector::iterator iter;
+
+  Vector input(3);
+  Vector output(3);
+  Vector result(3);
+  input[0]  =  1; input[1]  = -2; input[2]  =  3;
+  result[0] = -1; result[1] =  2; result[2] = -3;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  iter = thrust::transform(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::negate<T>());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(output, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestTransformUnaryCudaStreams);
+
+
+void TestTransformBinaryCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector::iterator iter;
+
+  Vector input1(3);
+  Vector input2(3);
+  Vector output(3);
+  Vector result(3);
+  input1[0] =  1; input1[1] = -2; input1[2] =  3;
+  input2[0] = -4; input2[1] =  5; input2[2] =  6;
+  result[0] =  5; result[1] = -7; result[2] = -3;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  iter = thrust::transform(thrust::cuda::par.on(s), input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
+  ASSERT_EQUAL(output, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestTransformBinaryCudaStreams);
+
diff --git a/thrust/testing/cuda/transform.mk b/thrust/testing/cuda/transform.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/transform.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/transform_reduce.cu b/thrust/testing/cuda/transform_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dcc8f646b8ca0ef96fe5df0b42b2f291b00ded50
--- /dev/null
+++ b/thrust/testing/cuda/transform_reduce.cu
@@ -0,0 +1,70 @@
+#include <unittest/unittest.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Function1, typename T, typename Function2, typename Iterator2>
+__global__
+void transform_reduce_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Function1 f1, T init, Function2 f2, Iterator2 result)
+{
+  *result = thrust::transform_reduce(exec, first, last, f1, init, f2);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformReduceDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  Vector data(3);
+  data[0] = 1; data[1] = -2; data[2] = 3;
+  
+  T init = 10;
+
+  thrust::device_vector<T> result(1);
+
+  transform_reduce_kernel<<<1,1>>>(exec, data.begin(), data.end(), thrust::negate<T>(), init, thrust::plus<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(8, (T)result[0]);
+}
+
+
+void TestTransformReduceDeviceSeq()
+{
+  TestTransformReduceDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformReduceDeviceSeq);
+
+
+void TestTransformReduceDeviceDevice()
+{
+  TestTransformReduceDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformReduceDeviceDevice);
+
+
+void TestTransformReduceCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector data(3);
+  data[0] = 1; data[1] = -2; data[2] = 3;
+  
+  T init = 10;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  T result = thrust::transform_reduce(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::negate<T>(), init, thrust::plus<T>());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(8, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestTransformReduceCudaStreams);
+
diff --git a/thrust/testing/cuda/transform_reduce.mk b/thrust/testing/cuda/transform_reduce.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/transform_reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/transform_scan.cu b/thrust/testing/cuda/transform_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e629fcdffe70435a675fd9e2a4c00df9e662a738
--- /dev/null
+++ b/thrust/testing/cuda/transform_scan.cu
@@ -0,0 +1,186 @@
+#include <unittest/unittest.h>
+#include <thrust/transform_scan.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function1, typename Function2, typename Iterator3>
+__global__
+void transform_inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function1 f1, Function2 f2, Iterator3 result2)
+{
+  *result2 = thrust::transform_inclusive_scan(exec, first, last, result1, f1, f2);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function1, typename T, typename Function2, typename Iterator3>
+__global__
+void transform_exclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, Function1 f1, T init, Function2 f2, Iterator3 result2)
+{
+  *result2 = thrust::transform_exclusive_scan(exec, first, last, result, f1, init, f2);
+}
+
+
+template<typename ExecutionPolicy>
+void TestTransformScanDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef typename Vector::value_type T;
+  
+  typename Vector::iterator iter;
+  
+  Vector input(5);
+  Vector ref(5);
+  Vector output(5);
+  
+  input[0] = 1; input[1] = 3; input[2] = -2; input[3] = 4; input[4] = -5;
+  
+  Vector input_copy(input);
+
+  thrust::device_vector<typename Vector::iterator> iter_vec(1);
+  
+  // inclusive scan
+  transform_inclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  ref[0] = -1; ref[1] = -4; ref[2] = -2; ref[3] = -6; ref[4] = -1;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(ref, output);
+  
+  // exclusive scan with 0 init
+  transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), 0, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ref[0] = 0; ref[1] = -1; ref[2] = -4; ref[3] = -2; ref[4] = -6;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(ref, output);
+  
+  // exclusive scan with nonzero init
+  transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), 3, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  ref[0] = 3; ref[1] = 2; ref[2] = -1; ref[3] = 1; ref[4] = -3;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(ref, output);
+  
+  // inplace inclusive scan
+  input = input_copy;
+  transform_inclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), input.begin(), thrust::negate<T>(), thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  ref[0] = -1; ref[1] = -4; ref[2] = -2; ref[3] = -6; ref[4] = -1;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(ref, input);
+  
+  // inplace exclusive scan with init
+  input = input_copy;
+  transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), input.begin(), thrust::negate<T>(), 3, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  ref[0] = 3; ref[1] = 2; ref[2] = -1; ref[3] = 1; ref[4] = -3;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(ref, input);
+}
+
+
+void TestTransformScanDeviceSeq()
+{
+  TestTransformScanDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestTransformScanDeviceSeq);
+
+
+void TestTransformScanDeviceDevice()
+{
+  TestTransformScanDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestTransformScanDeviceDevice);
+
+
+void TestTransformScanCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector::iterator iter;
+
+  Vector input(5);
+  Vector result(5);
+  Vector output(5);
+
+  input[0] = 1; input[1] = 3; input[2] = -2; input[3] = 4; input[4] = -5;
+
+  Vector input_copy(input);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  // inclusive scan
+  iter = thrust::transform_inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::negate<T>(), thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // exclusive scan with 0 init
+  iter = thrust::transform_exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::negate<T>(), 0, thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = 0; result[1] = -1; result[2] = -4; result[3] = -2; result[4] = -6;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // exclusive scan with nonzero init
+  iter = thrust::transform_exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+  ASSERT_EQUAL(input,  input_copy);
+  ASSERT_EQUAL(output, result);
+  
+  // inplace inclusive scan
+  input = input_copy;
+  iter = thrust::transform_inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), input.begin(), thrust::negate<T>(), thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(input, result);
+
+  // inplace exclusive scan with init
+  input = input_copy;
+  iter = thrust::transform_exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), input.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
+  cudaStreamSynchronize(s);
+
+  result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+  ASSERT_EQUAL(input, result);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestTransformScanCudaStreams);
+
diff --git a/thrust/testing/cuda/transform_scan.mk b/thrust/testing/cuda/transform_scan.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/transform_scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/uninitialized_copy.cu b/thrust/testing/cuda/uninitialized_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..31feb07162032d239876f010446a3819ea53d61d
--- /dev/null
+++ b/thrust/testing/cuda/uninitialized_copy.cu
@@ -0,0 +1,146 @@
+#include <unittest/unittest.h>
+#include <thrust/uninitialized_copy.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void uninitialized_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  thrust::uninitialized_copy(exec, first, last, result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUninitializedCopyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+  
+  // copy to Vector
+  Vector v2(5);
+  uninitialized_copy_kernel<<<1,1>>>(exec, v1.begin(), v1.end(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+}
+
+
+void TestUninitializedCopyDeviceSeq()
+{
+  TestUninitializedCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUninitializedCopyDeviceSeq);
+
+
+void TestUninitializedCopyDeviceDevice()
+{
+  TestUninitializedCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUninitializedCopyDeviceDevice);
+
+
+void TestUninitializedCopyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+  
+  // copy to Vector
+  Vector v2(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::uninitialized_copy(thrust::cuda::par.on(s), v1.begin(), v1.end(), v2.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUninitializedCopyCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Size, typename Iterator2>
+__global__
+void uninitialized_copy_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, Iterator2 result)
+{
+  thrust::uninitialized_copy_n(exec, first, n, result);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUninitializedCopyNDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+  
+  // copy to Vector
+  Vector v2(5);
+  uninitialized_copy_n_kernel<<<1,1>>>(exec, v1.begin(), v1.size(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+}
+
+
+void TestUninitializedCopyNDeviceSeq()
+{
+  TestUninitializedCopyNDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUninitializedCopyNDeviceSeq);
+
+
+void TestUninitializedCopyNDeviceDevice()
+{
+  TestUninitializedCopyNDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUninitializedCopyNDeviceDevice);
+
+
+void TestUninitializedCopyNCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  
+  Vector v1(5);
+  v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+  
+  // copy to Vector
+  Vector v2(5);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  thrust::uninitialized_copy_n(thrust::cuda::par.on(s), v1.begin(), v1.size(), v2.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(v2[0], 0);
+  ASSERT_EQUAL(v2[1], 1);
+  ASSERT_EQUAL(v2[2], 2);
+  ASSERT_EQUAL(v2[3], 3);
+  ASSERT_EQUAL(v2[4], 4);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUninitializedCopyNCudaStreams);
+
diff --git a/thrust/testing/cuda/uninitialized_copy.mk b/thrust/testing/cuda/uninitialized_copy.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/uninitialized_copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/uninitialized_fill.cu b/thrust/testing/cuda/uninitialized_fill.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd74773471a446f72ee0f45c812b512d985cd2c7
--- /dev/null
+++ b/thrust/testing/cuda/uninitialized_fill.cu
@@ -0,0 +1,253 @@
+#include <unittest/unittest.h>
+#include <thrust/uninitialized_fill.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator, typename T>
+__global__
+void uninitialized_fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val)
+{
+  thrust::uninitialized_fill(exec, first, last, val);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUninitializedFillDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(5);
+  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+  
+  T exemplar(7);
+  
+  uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 4, exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], 4);
+  
+  exemplar = 8;
+  
+  uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 3, exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], 7);
+  ASSERT_EQUAL(v[4], 4);
+  
+  exemplar = 9;
+  
+  uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 2, v.end(), exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], 8);
+  ASSERT_EQUAL(v[1], 8);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], 9);
+  
+  exemplar = 1;
+  
+  uninitialized_fill_kernel<<<1,1>>>(exec, v.begin(), v.end(), exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], exemplar);
+}
+
+
+void TestUninitializedFillDeviceSeq()
+{
+  TestUninitializedFillDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUninitializedFillDeviceSeq);
+
+
+void TestUninitializedFillDeviceDevice()
+{
+  TestUninitializedFillDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUninitializedFillDeviceDevice);
+
+
+void TestUninitializedFillCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(5);
+  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+  
+  T exemplar(7);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::uninitialized_fill(thrust::cuda::par.on(s), v.begin(), v.end(), exemplar);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], exemplar);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUninitializedFillCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Size, typename T, typename Iterator2>
+__global__
+void uninitialized_fill_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, T val, Iterator2 result)
+{
+  *result = thrust::uninitialized_fill_n(exec, first, n, val);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUninitializedFillNDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(5);
+  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+  
+  T exemplar(7);
+
+  thrust::device_vector<Vector::iterator> iter_vec(1);
+  
+  uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 1, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  Vector::iterator iter = iter_vec[0];
+  
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], 4);
+  ASSERT_EQUAL_QUIET(v.begin() + 4, iter);
+  
+  exemplar = 8;
+  
+  uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 0, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], 7);
+  ASSERT_EQUAL(v[4], 4);
+  ASSERT_EQUAL_QUIET(v.begin() + 3, iter);
+  
+  exemplar = 9;
+  
+  uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 2, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(v[0], 8);
+  ASSERT_EQUAL(v[1], 8);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], 9);
+  ASSERT_EQUAL_QUIET(v.end(), iter);
+  
+  exemplar = 1;
+  
+  uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin(), v.size(), exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  iter = iter_vec[0];
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], exemplar);
+  ASSERT_EQUAL_QUIET(v.end(), iter);
+}
+
+
+void TestUninitializedFillNDeviceSeq()
+{
+  TestUninitializedFillNDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUninitializedFillNDeviceSeq);
+
+
+void TestUninitializedFillNDeviceDevice()
+{
+  TestUninitializedFillNDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUninitializedFillNDeviceDevice);
+
+
+void TestUninitializedFillNCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector v(5);
+  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+  
+  T exemplar(7);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  thrust::uninitialized_fill_n(thrust::cuda::par.on(s), v.begin(), v.size(), exemplar);
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(v[0], exemplar);
+  ASSERT_EQUAL(v[1], exemplar);
+  ASSERT_EQUAL(v[2], exemplar);
+  ASSERT_EQUAL(v[3], exemplar);
+  ASSERT_EQUAL(v[4], exemplar);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUninitializedFillNCudaStreams);
+
diff --git a/thrust/testing/cuda/uninitialized_fill.mk b/thrust/testing/cuda/uninitialized_fill.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/uninitialized_fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/unique.cu b/thrust/testing/cuda/unique.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c0dc7973dea50ebb049ce9147edc194b178ca00f
--- /dev/null
+++ b/thrust/testing/cuda/unique.cu
@@ -0,0 +1,278 @@
+#include <unittest/unittest.h>
+#include <thrust/unique.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  *result = thrust::unique(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename BinaryPredicate, typename Iterator2>
+__global__
+void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::unique(exec, first, last, pred);
+}
+
+
+template<typename T>
+struct is_equal_div_10_unique
+{
+  __host__ __device__
+  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+template<typename ExecutionPolicy>
+void TestUniqueDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+
+  thrust::device_vector<Vector::iterator> new_last_vec(1);
+  Vector::iterator new_last;
+  
+  unique_kernel<<<1,1>>>(exec, data.begin(), data.end(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last - data.begin(), 7);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 12);
+  ASSERT_EQUAL(data[2], 20);
+  ASSERT_EQUAL(data[3], 29);
+  ASSERT_EQUAL(data[4], 21);
+  ASSERT_EQUAL(data[5], 31);
+  ASSERT_EQUAL(data[6], 37);
+
+  unique_kernel<<<1,1>>>(exec, data.begin(), new_last, is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last - data.begin(), 3);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 20);
+  ASSERT_EQUAL(data[2], 31);
+}
+
+
+void TestUniqueDeviceSeq()
+{
+  TestUniqueDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueDeviceSeq);
+
+
+void TestUniqueDeviceDevice()
+{
+  TestUniqueDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueDeviceDevice);
+
+
+void TestUniqueCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+
+  thrust::device_vector<Vector::iterator> new_last_vec(1);
+  Vector::iterator new_last;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last - data.begin(), 7);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 12);
+  ASSERT_EQUAL(data[2], 20);
+  ASSERT_EQUAL(data[3], 29);
+  ASSERT_EQUAL(data[4], 21);
+  ASSERT_EQUAL(data[5], 31);
+  ASSERT_EQUAL(data[6], 37);
+
+  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last - data.begin(), 3);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 20);
+  ASSERT_EQUAL(data[2], 31);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUniqueCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void unique_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Iterator3 result2)
+{
+  *result2 = thrust::unique_copy(exec, first, last, result1);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename BinaryPredicate, typename Iterator3>
+__global__
+void unique_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, BinaryPredicate pred, Iterator3 result2)
+{
+  *result2 = thrust::unique_copy(exec, first, last, result1, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+  
+  Vector output(10, -1);
+
+  thrust::device_vector<Vector::iterator> new_last_vec(1);
+  Vector::iterator new_last;
+  
+  unique_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last - output.begin(), 7);
+  ASSERT_EQUAL(output[0], 11);
+  ASSERT_EQUAL(output[1], 12);
+  ASSERT_EQUAL(output[2], 20);
+  ASSERT_EQUAL(output[3], 29);
+  ASSERT_EQUAL(output[4], 21);
+  ASSERT_EQUAL(output[5], 31);
+  ASSERT_EQUAL(output[6], 37);
+
+  unique_copy_kernel<<<1,1>>>(exec, output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last - data.begin(), 3);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 20);
+  ASSERT_EQUAL(data[2], 31);
+}
+
+
+void TestUniqueCopyDeviceSeq()
+{
+  TestUniqueCopyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueCopyDeviceSeq);
+
+
+void TestUniqueCopyDeviceDevice()
+{
+  TestUniqueCopyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueCopyDeviceDevice);
+
+
+void TestUniqueCopyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+  
+  Vector output(10, -1);
+
+  thrust::device_vector<Vector::iterator> new_last_vec(1);
+  Vector::iterator new_last;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last - output.begin(), 7);
+  ASSERT_EQUAL(output[0], 11);
+  ASSERT_EQUAL(output[1], 12);
+  ASSERT_EQUAL(output[2], 20);
+  ASSERT_EQUAL(output[3], 29);
+  ASSERT_EQUAL(output[4], 21);
+  ASSERT_EQUAL(output[5], 31);
+  ASSERT_EQUAL(output[6], 37);
+
+  new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last - data.begin(), 3);
+  ASSERT_EQUAL(data[0], 11);
+  ASSERT_EQUAL(data[1], 20);
+  ASSERT_EQUAL(data[2], 31);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUniqueCopyCudaStreams);
+
diff --git a/thrust/testing/cuda/unique.mk b/thrust/testing/cuda/unique.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/unique.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/cuda/unique_by_key.cu b/thrust/testing/cuda/unique_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c58a64d51dfe05231aa64bbb3e813547ab6fef53
--- /dev/null
+++ b/thrust/testing/cuda/unique_by_key.cu
@@ -0,0 +1,341 @@
+#include <unittest/unittest.h>
+#include <thrust/unique.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+
+template<typename T>
+struct is_equal_div_10_unique
+{
+  __host__ __device__
+  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+template<typename Vector>
+void initialize_keys(Vector& keys)
+{
+  keys.resize(9);
+  keys[0] = 11;
+  keys[1] = 11;
+  keys[2] = 21;
+  keys[3] = 20;
+  keys[4] = 21;
+  keys[5] = 21;
+  keys[6] = 21;
+  keys[7] = 37;
+  keys[8] = 37;
+}
+
+
+template<typename Vector>
+void initialize_values(Vector& values)
+{
+  values.resize(9);
+  values[0] = 0; 
+  values[1] = 1;
+  values[2] = 2;
+  values[3] = 3;
+  values[4] = 4;
+  values[5] = 5;
+  values[6] = 6;
+  values[7] = 7;
+  values[8] = 8;
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+__global__
+void unique_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
+{
+  *result = thrust::unique_by_key(exec, keys_first, keys_last, values_first);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename BinaryPredicate, typename Iterator3>
+__global__
+void unique_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, BinaryPredicate pred, Iterator3 result)
+{
+  *result = thrust::unique_by_key(exec, keys_first, keys_last, values_first, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUniqueByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector keys;
+  Vector values;
+  
+  typedef thrust::pair<typename Vector::iterator, typename Vector::iterator> iter_pair;
+  thrust::device_vector<iter_pair> new_last_vec(1);
+  iter_pair new_last;
+  
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+  
+  unique_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+  
+  ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - values.begin(), 5);
+  ASSERT_EQUAL(keys[0], 11);
+  ASSERT_EQUAL(keys[1], 21);
+  ASSERT_EQUAL(keys[2], 20);
+  ASSERT_EQUAL(keys[3], 21);
+  ASSERT_EQUAL(keys[4], 37);
+  
+  ASSERT_EQUAL(values[0], 0);
+  ASSERT_EQUAL(values[1], 2);
+  ASSERT_EQUAL(values[2], 3);
+  ASSERT_EQUAL(values[3], 4);
+  ASSERT_EQUAL(values[4], 7);
+  
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  unique_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+  
+  ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - values.begin(), 3);
+  ASSERT_EQUAL(keys[0], 11);
+  ASSERT_EQUAL(keys[1], 21);
+  ASSERT_EQUAL(keys[2], 37);
+  
+  ASSERT_EQUAL(values[0], 0);
+  ASSERT_EQUAL(values[1], 2);
+  ASSERT_EQUAL(values[2], 7);
+}
+
+void TestUniqueByKeyDeviceSeq()
+{
+  TestUniqueByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueByKeyDeviceSeq);
+
+
+void TestUniqueByKeyDeviceDevice()
+{
+  TestUniqueByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice);
+
+
+void TestUniqueByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+  
+  Vector keys;
+  Vector values;
+  
+  typedef thrust::pair<Vector::iterator, Vector::iterator> iter_pair;
+  iter_pair new_last;
+  
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+  
+  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin());
+  cudaStreamSynchronize(s);
+  
+  ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - values.begin(), 5);
+  ASSERT_EQUAL(keys[0], 11);
+  ASSERT_EQUAL(keys[1], 21);
+  ASSERT_EQUAL(keys[2], 20);
+  ASSERT_EQUAL(keys[3], 21);
+  ASSERT_EQUAL(keys[4], 37);
+  
+  ASSERT_EQUAL(values[0], 0);
+  ASSERT_EQUAL(values[1], 2);
+  ASSERT_EQUAL(values[2], 3);
+  ASSERT_EQUAL(values[3], 4);
+  ASSERT_EQUAL(values[4], 7);
+  
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
+  
+  ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - values.begin(), 3);
+  ASSERT_EQUAL(keys[0], 11);
+  ASSERT_EQUAL(keys[1], 21);
+  ASSERT_EQUAL(keys[2], 37);
+  
+  ASSERT_EQUAL(values[0], 0);
+  ASSERT_EQUAL(values[1], 2);
+  ASSERT_EQUAL(values[2], 7);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUniqueByKeyCudaStreams);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
+__global__
+void unique_by_key_copy_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, Iterator5 result)
+{
+  *result = thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_result, values_result);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename BinaryPredicate, typename Iterator5>
+__global__
+void unique_by_key_copy_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, BinaryPredicate pred, Iterator5 result)
+{
+  *result = thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_result, values_result, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyByKeyDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector keys;
+  Vector values;
+
+  typedef thrust::pair<typename Vector::iterator, typename Vector::iterator> iter_pair;
+  thrust::device_vector<iter_pair> new_last_vec(1);
+  iter_pair new_last;
+
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+
+  Vector output_keys(keys.size());
+  Vector output_values(values.size());
+
+  unique_by_key_copy_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0], 0);
+  ASSERT_EQUAL(output_values[1], 2);
+  ASSERT_EQUAL(output_values[2], 3);
+  ASSERT_EQUAL(output_values[3], 4);
+  ASSERT_EQUAL(output_values[4], 7);
+
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  unique_by_key_copy_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  new_last = new_last_vec[0];
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 37);
+  
+  ASSERT_EQUAL(output_values[0], 0);
+  ASSERT_EQUAL(output_values[1], 2);
+  ASSERT_EQUAL(output_values[2], 7);
+}
+
+
+void TestUniqueCopyByKeyDeviceSeq()
+{
+  TestUniqueCopyByKeyDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceSeq);
+
+
+void TestUniqueCopyByKeyDeviceDevice()
+{
+  TestUniqueCopyByKeyDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice);
+
+
+void TestUniqueCopyByKeyCudaStreams()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector keys;
+  Vector values;
+
+  typedef thrust::pair<Vector::iterator, Vector::iterator> iter_pair;
+  iter_pair new_last;
+
+  // basic test
+  initialize_keys(keys);  initialize_values(values);
+
+  Vector output_keys(keys.size());
+  Vector output_values(values.size());
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 20);
+  ASSERT_EQUAL(output_keys[3], 21);
+  ASSERT_EQUAL(output_keys[4], 37);
+  
+  ASSERT_EQUAL(output_values[0], 0);
+  ASSERT_EQUAL(output_values[1], 2);
+  ASSERT_EQUAL(output_values[2], 3);
+  ASSERT_EQUAL(output_values[3], 4);
+  ASSERT_EQUAL(output_values[4], 7);
+
+  // test BinaryPredicate
+  initialize_keys(keys);  initialize_values(values);
+  
+  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+  ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+  ASSERT_EQUAL(output_keys[0], 11);
+  ASSERT_EQUAL(output_keys[1], 21);
+  ASSERT_EQUAL(output_keys[2], 37);
+  
+  ASSERT_EQUAL(output_values[0], 0);
+  ASSERT_EQUAL(output_values[1], 2);
+  ASSERT_EQUAL(output_values[2], 7);
+
+  cudaStreamDestroy(s);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams);
+
diff --git a/thrust/testing/cuda/unique_by_key.mk b/thrust/testing/cuda/unique_by_key.mk
new file mode 100644
index 0000000000000000000000000000000000000000..7d930481e787f6d4f1336828e66c71aae8705954
--- /dev/null
+++ b/thrust/testing/cuda/unique_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/thrust/testing/decompose.cu b/thrust/testing/decompose.cu
new file mode 100644
index 0000000000000000000000000000000000000000..41f40967af4859f8157c7814ac806ac2dc03ca60
--- /dev/null
+++ b/thrust/testing/decompose.cu
@@ -0,0 +1,84 @@
+#include <unittest/unittest.h>
+
+#include <thrust/system/detail/internal/decompose.h>
+
+using thrust::system::detail::internal::uniform_decomposition;
+
+void TestUniformDecomposition(void)
+{
+  {
+    uniform_decomposition<int> ud(10, 10, 1);
+   
+    // [0,10)
+    ASSERT_EQUAL(ud.size(), 1);
+    ASSERT_EQUAL(ud[0].begin(),   0);
+    ASSERT_EQUAL(ud[0].end(),    10);
+    ASSERT_EQUAL(ud[0].size(),   10);
+  }
+  
+  {
+    uniform_decomposition<int> ud(10, 20, 1);
+   
+    // [0,10)
+    ASSERT_EQUAL(ud.size(), 1);
+    ASSERT_EQUAL(ud[0].begin(),  0);
+    ASSERT_EQUAL(ud[0].end(),   10);
+    ASSERT_EQUAL(ud[0].size(),  10);
+  }
+
+  {
+    uniform_decomposition<int> ud(8, 5, 2);
+   
+    // [0,5)[5,8)
+    ASSERT_EQUAL(ud.size(), 2);
+    ASSERT_EQUAL(ud[0].begin(),  0);
+    ASSERT_EQUAL(ud[0].end(),    5);
+    ASSERT_EQUAL(ud[0].size(),   5);
+    ASSERT_EQUAL(ud[1].begin(),  5);
+    ASSERT_EQUAL(ud[1].end(),    8);
+    ASSERT_EQUAL(ud[1].size(),   3);
+  }
+  
+  {
+    uniform_decomposition<int> ud(8, 5, 3);
+   
+    // [0,5)[5,8)
+    ASSERT_EQUAL(ud.size(), 2);
+    ASSERT_EQUAL(ud[0].begin(),  0);
+    ASSERT_EQUAL(ud[0].end(),    5);
+    ASSERT_EQUAL(ud[0].size(),   5);
+    ASSERT_EQUAL(ud[1].begin(),  5);
+    ASSERT_EQUAL(ud[1].end(),    8);
+    ASSERT_EQUAL(ud[1].size(),   3);
+  }
+
+  {
+    uniform_decomposition<int> ud(10, 1, 2);
+   
+    // [0,5)[5,10)
+    ASSERT_EQUAL(ud.size(), 2);
+    ASSERT_EQUAL(ud[0].begin(),  0);
+    ASSERT_EQUAL(ud[0].end(),    5);
+    ASSERT_EQUAL(ud[0].size(),   5);
+    ASSERT_EQUAL(ud[1].begin(),  5);
+    ASSERT_EQUAL(ud[1].end(),   10);
+    ASSERT_EQUAL(ud[1].size(),   5);
+  }
+
+  {
+    // [0,4)[4,8)[8,10)
+    uniform_decomposition<int> ud(10, 2, 3);   
+
+    ASSERT_EQUAL(ud.size(), 3);
+    ASSERT_EQUAL(ud[0].begin(),  0);
+    ASSERT_EQUAL(ud[0].end(),    4);
+    ASSERT_EQUAL(ud[0].size(),   4);
+    ASSERT_EQUAL(ud[1].begin(),  4);
+    ASSERT_EQUAL(ud[1].end(),    8);
+    ASSERT_EQUAL(ud[1].size(),   4);
+    ASSERT_EQUAL(ud[2].begin(),  8);
+    ASSERT_EQUAL(ud[2].end(),   10);
+    ASSERT_EQUAL(ud[2].size(),   2);
+  }
+}
+DECLARE_UNITTEST(TestUniformDecomposition);
diff --git a/thrust/testing/dependencies_aware_policies.cu b/thrust/testing/dependencies_aware_policies.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5313392151c684786ad68eb46ebc069c9b023780
--- /dev/null
+++ b/thrust/testing/dependencies_aware_policies.cu
@@ -0,0 +1,189 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/seq.h>
+#include <thrust/system/cpp/detail/par.h>
+#include <thrust/system/omp/detail/par.h>
+#include <thrust/system/tbb/detail/par.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#  include <thrust/system/cuda/detail/par.h>
+#endif
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename T>
+struct test_allocator_t
+{
+};
+
+test_allocator_t<int> test_allocator = test_allocator_t<int>();
+
+template<int I>
+struct test_dependency_t
+{
+};
+
+template<int I>
+test_dependency_t<I> test_dependency()
+{
+    return {};
+}
+
+template<typename Policy, template<typename> class CRTPBase>
+struct policy_info
+{
+    using policy = Policy;
+
+    template<template<template<typename> class, typename...> class Template, typename ...Arguments>
+    using apply_base_first = Template<CRTPBase, Arguments...>;
+
+    template<template<typename, template<typename> class, typename...> class Template, typename First, typename ...Arguments>
+    using apply_base_second = Template<First, CRTPBase, Arguments...>;
+};
+
+template<typename PolicyInfo>
+struct TestDependencyAttachment
+{
+    template<typename ...Expected, typename T>
+    static void assert_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_first<
+                    thrust::detail::execute_with_dependencies,
+                    Expected...
+                >
+            >::value), true);
+    }
+
+    template<typename Allocator, typename ...Expected, typename T>
+    static void assert_correct_with_allocator(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator_and_dependencies,
+                    Allocator,
+                    Expected...
+                >
+            >::value), true);
+    }
+
+    void operator()()
+    {
+        typename PolicyInfo::policy policy;
+
+        assert_correct<
+            test_dependency_t<1>
+        >(policy
+            .after(
+                test_dependency<1>()
+            )
+        );
+
+        assert_correct<
+            test_dependency_t<1>,
+            test_dependency_t<2>
+        >(policy
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>()
+            )
+        );
+
+        assert_correct<
+            test_dependency_t<1>,
+            test_dependency_t<2>,
+            test_dependency_t<3>
+        >(policy
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>(),
+                test_dependency<3>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>,
+            test_dependency_t<2>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>,
+            test_dependency_t<2>,
+            test_dependency_t<3>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>(),
+                test_dependency<3>()
+            )
+        );
+    }
+};
+
+typedef policy_info<
+    thrust::detail::seq_t,
+    thrust::system::detail::sequential::execution_policy
+> sequential_info;
+typedef policy_info<
+    thrust::system::cpp::detail::par_t,
+    thrust::system::cpp::detail::execution_policy
+> cpp_par_info;
+typedef policy_info<
+    thrust::system::omp::detail::par_t,
+    thrust::system::omp::detail::execution_policy
+> omp_par_info;
+typedef policy_info<
+    thrust::system::tbb::detail::par_t,
+    thrust::system::tbb::detail::execution_policy
+> tbb_par_info;
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+typedef policy_info<
+    thrust::system::cuda::detail::par_t,
+    thrust::cuda_cub::execute_on_stream_base
+> cuda_par_info;
+#endif
+
+SimpleUnitTest<
+    TestDependencyAttachment,
+    unittest::type_list<
+        // TODO: uncomment when dependencies are generalized to all backends
+        // sequential_info,
+        // cpp_par_info,
+        // omp_par_info,
+        // tbb_par_info,
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cuda_par_info
+#endif
+    >
+> TestDependencyAttachmentInstance;
+
+#else // C++11
+
+void TestDummy()
+{
+}
+DECLARE_UNITTEST(TestDummy);
+
+#endif // C++11
diff --git a/thrust/testing/dereference.cu b/thrust/testing/dereference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ef5a991efaa850101cac0ab28db28c15f1f4f297
--- /dev/null
+++ b/thrust/testing/dereference.cu
@@ -0,0 +1,109 @@
+#include <unittest/unittest.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+
+template <typename Iterator1, typename Iterator2>
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+__global__
+#endif 
+void simple_copy_on_device(Iterator1 first1, Iterator1 last1, Iterator2 first2)
+{
+    while(first1 != last1)
+        *(first2++) = *(first1++);
+}
+
+template <typename Iterator1, typename Iterator2>
+void simple_copy(Iterator1 first1, Iterator1 last1, Iterator2 first2)
+{
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    simple_copy_on_device<<<1,1>>>(first1, last1, first2);
+#else
+    simple_copy_on_device(first1, last1, first2);
+#endif
+}
+
+
+void TestDeviceDereferenceDeviceVectorIterator(void)
+{
+    thrust::device_vector<int> input = unittest::random_integers<int>(100); 
+    thrust::device_vector<int> output(input.size(), 0);
+
+    simple_copy(input.begin(), input.end(), output.begin());
+
+    ASSERT_EQUAL(input, output);
+}
+DECLARE_UNITTEST(TestDeviceDereferenceDeviceVectorIterator);
+
+void TestDeviceDereferenceDevicePtr(void)
+{
+    thrust::device_vector<int> input = unittest::random_integers<int>(100); 
+    thrust::device_vector<int> output(input.size(), 0);
+
+    thrust::device_ptr<int> _first1 = &input[0];
+    thrust::device_ptr<int> _last1  = _first1 + input.size();
+    thrust::device_ptr<int> _first2 = &output[0];
+
+    simple_copy(_first1, _last1, _first2);
+
+    ASSERT_EQUAL(input, output);
+}
+DECLARE_UNITTEST(TestDeviceDereferenceDevicePtr);
+
+void TestDeviceDereferenceTransformIterator(void)
+{
+    thrust::device_vector<int> input = unittest::random_integers<int>(100); 
+    thrust::device_vector<int> output(input.size(), 0);
+
+    simple_copy(thrust::make_transform_iterator(input.begin(), thrust::identity<int>()),
+                thrust::make_transform_iterator(input.end (),  thrust::identity<int>()),
+                output.begin());
+
+    ASSERT_EQUAL(input, output);
+}
+DECLARE_UNITTEST(TestDeviceDereferenceTransformIterator);
+
+void TestDeviceDereferenceCountingIterator(void)
+{
+    thrust::counting_iterator<int> first(1);
+    thrust::counting_iterator<int> last(6);
+
+    thrust::device_vector<int> output(5);
+
+    simple_copy(first, last, output.begin());
+
+    ASSERT_EQUAL(output[0], 1);
+    ASSERT_EQUAL(output[1], 2);
+    ASSERT_EQUAL(output[2], 3);
+    ASSERT_EQUAL(output[3], 4);
+    ASSERT_EQUAL(output[4], 5);
+}
+DECLARE_UNITTEST(TestDeviceDereferenceCountingIterator);
+
+void TestDeviceDereferenceTransformedCountingIterator(void)
+{
+    thrust::counting_iterator<int> first(1);
+    thrust::counting_iterator<int> last(6);
+
+    thrust::device_vector<int> output(5);
+
+    simple_copy(thrust::make_transform_iterator(first, thrust::negate<int>()),
+                thrust::make_transform_iterator(last,  thrust::negate<int>()),
+                output.begin());
+
+    ASSERT_EQUAL(output[0], -1);
+    ASSERT_EQUAL(output[1], -2);
+    ASSERT_EQUAL(output[2], -3);
+    ASSERT_EQUAL(output[3], -4);
+    ASSERT_EQUAL(output[4], -5);
+}
+DECLARE_UNITTEST(TestDeviceDereferenceTransformedCountingIterator);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/device_delete.cu b/thrust/testing/device_delete.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6684cb2b53859ade38e263d831b74977bc307e92
--- /dev/null
+++ b/thrust/testing/device_delete.cu
@@ -0,0 +1,47 @@
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_new.h>
+#include <thrust/device_delete.h>
+
+struct Foo
+{
+  __host__ __device__
+  Foo(void)
+    :set_me_upon_destruction(0)
+  {}
+
+  __host__ __device__
+  ~Foo(void)
+  {
+#ifdef __CUDA_ARCH__
+    // __device__ overload
+    if(set_me_upon_destruction != 0)
+      *set_me_upon_destruction = true;
+#endif
+  }
+
+  bool *set_me_upon_destruction;
+};
+
+#if !defined(__QNX__)
+void TestDeviceDeleteDestructorInvocation(void)
+{
+  KNOWN_FAILURE;
+//
+//  thrust::device_vector<bool> destructor_flag(1, false);
+//
+//  thrust::device_ptr<Foo> foo_ptr  = thrust::device_new<Foo>();
+//
+//  Foo exemplar;
+//  exemplar.set_me_upon_destruction = thrust::raw_pointer_cast(&destructor_flag[0]);
+//  *foo_ptr = exemplar;
+//
+//  ASSERT_EQUAL(false, destructor_flag[0]);
+//
+//  thrust::device_delete(foo_ptr);
+//
+//  ASSERT_EQUAL(true, destructor_flag[0]);
+}
+DECLARE_UNITTEST(TestDeviceDeleteDestructorInvocation);
+#endif
diff --git a/thrust/testing/device_ptr.cu b/thrust/testing/device_ptr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c3e7c8bf859a61b5578e502bd81f437faf26388c
--- /dev/null
+++ b/thrust/testing/device_ptr.cu
@@ -0,0 +1,121 @@
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>
+
+void TestDevicePointerManipulation(void)
+{
+    thrust::device_vector<int> data(5);
+
+    thrust::device_ptr<int> begin(&data[0]);
+    thrust::device_ptr<int> end(&data[0] + 5);
+
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin++;
+    begin--;
+    
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin += 1;
+    begin -= 1;
+    
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin = begin + (int) 1;
+    begin = begin - (int) 1;
+
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin = begin + (unsigned int) 1;
+    begin = begin - (unsigned int) 1;
+    
+    ASSERT_EQUAL(end - begin, 5);
+    
+    begin = begin + (size_t) 1;
+    begin = begin - (size_t) 1;
+
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin = begin + (ptrdiff_t) 1;
+    begin = begin - (ptrdiff_t) 1;
+
+    ASSERT_EQUAL(end - begin, 5);
+
+    begin = begin + (thrust::device_ptr<int>::difference_type) 1;
+    begin = begin - (thrust::device_ptr<int>::difference_type) 1;
+
+    ASSERT_EQUAL(end - begin, 5);
+}
+DECLARE_UNITTEST(TestDevicePointerManipulation);
+
+
+void TestMakeDevicePointer(void)
+{
+    typedef int T;
+
+    T *raw_ptr = 0;
+
+    thrust::device_ptr<T> p0 = thrust::device_pointer_cast(raw_ptr);
+
+    ASSERT_EQUAL(thrust::raw_pointer_cast(p0), raw_ptr);
+
+    thrust::device_ptr<T> p1 = thrust::device_pointer_cast(p0);
+
+    ASSERT_EQUAL(p0, p1);
+}
+DECLARE_UNITTEST(TestMakeDevicePointer);
+
+
+template<typename Vector>
+void TestRawPointerCast(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(3);
+
+    T * first;
+    T * last;
+    
+    first = thrust::raw_pointer_cast(&vec[0]);
+    last  = thrust::raw_pointer_cast(&vec[3]);
+    ASSERT_EQUAL(last - first, 3);
+
+    first = thrust::raw_pointer_cast(&vec.front());
+    last  = thrust::raw_pointer_cast(&vec.back());
+    ASSERT_EQUAL(last - first, 2);
+
+    // Do we want these to work?
+    //first = thrust::raw_pointer_cast(vec.begin());
+    //last  = thrust::raw_pointer_cast(vec.end());
+    //ASSERT_EQUAL(last - first, 3);
+}
+DECLARE_VECTOR_UNITTEST(TestRawPointerCast);
+
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T>
+void TestDevicePointerNullptrCompatibility()
+{
+    thrust::device_ptr<T> p0(nullptr);
+
+    ASSERT_EQUAL_QUIET(nullptr, p0);
+    ASSERT_EQUAL_QUIET(p0, nullptr);
+
+    p0 = nullptr;
+
+    ASSERT_EQUAL_QUIET(nullptr, p0);
+    ASSERT_EQUAL_QUIET(p0, nullptr);
+}
+DECLARE_GENERIC_UNITTEST(TestDevicePointerNullptrCompatibility);
+
+template<typename T>
+void TestDevicePointerBoolConversion()
+{
+    thrust::device_ptr<T> p0(nullptr);
+    auto const b = bool(p0);
+
+    ASSERT_EQUAL_QUIET(false, b);
+}
+DECLARE_GENERIC_UNITTEST(TestDevicePointerBoolConversion);
+#endif
+
diff --git a/thrust/testing/device_reference.cu b/thrust/testing/device_reference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c30934d75ff7b251bc915c8563e2405e8dc75da2
--- /dev/null
+++ b/thrust/testing/device_reference.cu
@@ -0,0 +1,231 @@
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_reference.h>
+
+void TestDeviceReferenceConstructorFromDeviceReference(void)
+{
+    typedef int T;
+
+    thrust::device_vector<T> v(1,0);
+    thrust::device_reference<T> ref = v[0];
+
+    // ref equals the object at v[0]
+    ASSERT_EQUAL(v[0], ref);
+
+    // the address of ref equals the address of v[0]
+    ASSERT_EQUAL(&v[0], &ref);
+
+    // modifying v[0] modifies ref
+    v[0] = 13;
+    ASSERT_EQUAL(13, ref);
+    ASSERT_EQUAL(v[0], ref);
+
+    // modifying ref modifies v[0]
+    ref = 7;
+    ASSERT_EQUAL(7, v[0]);
+    ASSERT_EQUAL(v[0], ref);
+}
+DECLARE_UNITTEST(TestDeviceReferenceConstructorFromDeviceReference);
+
+void TestDeviceReferenceConstructorFromDevicePointer(void)
+{
+    typedef int T;
+
+    thrust::device_vector<T> v(1,0);
+    thrust::device_ptr<T> ptr = &v[0];
+    thrust::device_reference<T> ref(ptr);
+
+    // ref equals the object pointed to by ptr
+    ASSERT_EQUAL(*ptr, ref);
+
+    // the address of ref equals ptr
+    ASSERT_EQUAL(ptr, &ref);
+
+    // modifying *ptr modifies ref
+    *ptr = 13;
+    ASSERT_EQUAL(13, ref);
+    ASSERT_EQUAL(v[0], ref);
+
+    // modifying ref modifies *ptr
+    ref = 7;
+    ASSERT_EQUAL(7, *ptr);
+    ASSERT_EQUAL(v[0], ref);
+}
+DECLARE_UNITTEST(TestDeviceReferenceConstructorFromDevicePointer);
+
+void TestDeviceReferenceAssignmentFromDeviceReference(void)
+{
+    // test same types
+    typedef int T0;
+    thrust::device_vector<T0> v0(2,0);
+    thrust::device_reference<T0> ref0 = v0[0];
+    thrust::device_reference<T0> ref1 = v0[1];
+
+    ref0 = 13;
+
+    ref1 = ref0;
+
+    // ref1 equals 13
+    ASSERT_EQUAL(13, ref1);
+    ASSERT_EQUAL(ref0, ref1);
+
+    // test different types
+    typedef float T1;
+    thrust::device_vector<T1> v1(1,0.0f);
+    thrust::device_reference<T1> ref2 = v1[0];
+
+    ref2 = ref1;
+
+    // ref2 equals 13.0f
+    ASSERT_EQUAL(13.0f, ref2);
+    ASSERT_EQUAL(ref0, ref2);
+    ASSERT_EQUAL(ref1, ref2);
+}
+DECLARE_UNITTEST(TestDeviceReferenceAssignmentFromDeviceReference);
+
+void TestDeviceReferenceManipulation(void)
+{
+    typedef int T1;
+
+    thrust::device_vector<T1> v(1,0);
+    thrust::device_ptr<T1> ptr = &v[0];
+    thrust::device_reference<T1> ref(ptr);
+
+    // reset
+    ref = 0;
+
+    // test prefix increment
+    ++ref;
+    ASSERT_EQUAL(1, ref);
+    ASSERT_EQUAL(1, *ptr);
+    ASSERT_EQUAL(1, v[0]);
+
+    // reset
+    ref = 0;
+
+    // test postfix increment
+    T1 x1 = ref++;
+    ASSERT_EQUAL(0, x1);
+    ASSERT_EQUAL(1, ref);
+    ASSERT_EQUAL(1, *ptr);
+    ASSERT_EQUAL(1, v[0]);
+
+    // reset
+    ref = 0;
+
+    // test addition-assignment
+    ref += 5;
+    ASSERT_EQUAL(5, ref);
+    ASSERT_EQUAL(5, *ptr);
+    ASSERT_EQUAL(5, v[0]);
+
+    // reset
+    ref = 0;
+
+    // test prefix decrement
+    --ref;
+    ASSERT_EQUAL(-1, ref);
+    ASSERT_EQUAL(-1, *ptr);
+    ASSERT_EQUAL(-1, v[0]);
+
+    // reset
+    ref = 0;
+
+    // test subtraction-assignment
+    ref -= 5;
+    ASSERT_EQUAL(-5, ref);
+    ASSERT_EQUAL(-5, *ptr);
+    ASSERT_EQUAL(-5, v[0]);
+
+    // reset
+    ref = 1;
+
+    // test multiply-assignment
+    ref *= 5;
+    ASSERT_EQUAL(5, ref);
+    ASSERT_EQUAL(5, *ptr);
+    ASSERT_EQUAL(5, v[0]);
+
+    // reset
+    ref = 5;
+
+    // test divide-assignment
+    ref /= 5;
+    ASSERT_EQUAL(1, ref);
+    ASSERT_EQUAL(1, *ptr);
+    ASSERT_EQUAL(1, v[0]);
+
+    // reset
+    ref = 5;
+
+    // test modulus-assignment
+    ref %= 5;
+    ASSERT_EQUAL(0, ref);
+    ASSERT_EQUAL(0, *ptr);
+    ASSERT_EQUAL(0, v[0]);
+
+    // reset
+    ref = 1;
+
+    // test left shift-assignment
+    ref <<= 1;
+    ASSERT_EQUAL(2, ref);
+    ASSERT_EQUAL(2, *ptr);
+    ASSERT_EQUAL(2, v[0]);
+
+    // reset
+    ref = 2;
+
+    // test right shift-assignment
+    ref >>= 1;
+    ASSERT_EQUAL(1, ref);
+    ASSERT_EQUAL(1, *ptr);
+    ASSERT_EQUAL(1, v[0]);
+
+    // reset
+    ref = 0;
+
+    // test OR-assignment
+    ref |= 1;
+    ASSERT_EQUAL(1, ref);
+    ASSERT_EQUAL(1, *ptr);
+    ASSERT_EQUAL(1, v[0]);
+
+    // reset
+    ref = 1;
+
+    // test XOR-assignment
+    ref ^= 1;
+    ASSERT_EQUAL(0, ref);
+    ASSERT_EQUAL(0, *ptr);
+    ASSERT_EQUAL(0, v[0]);
+
+    // test equality of const references
+    thrust::device_reference<const T1> ref1 = v[0];
+    ASSERT_EQUAL(true, ref1 == ref);
+}
+DECLARE_UNITTEST(TestDeviceReferenceManipulation);
+
+void TestDeviceReferenceSwap(void)
+{
+  typedef int T;
+
+  thrust::device_vector<T> v(2);
+  thrust::device_reference<T> ref1 = v.front();
+  thrust::device_reference<T> ref2 = v.back();
+
+  ref1 = 7;
+  ref2 = 13;
+
+  // test thrust::swap()
+  thrust::swap(ref1, ref2);
+  ASSERT_EQUAL(13, ref1);
+  ASSERT_EQUAL(7, ref2);
+
+  // test .swap()
+  ref1.swap(ref2);
+  ASSERT_EQUAL(7, ref1);
+  ASSERT_EQUAL(13, ref2);
+}
+DECLARE_UNITTEST(TestDeviceReferenceSwap);
+
diff --git a/thrust/testing/discard_iterator.cu b/thrust/testing/discard_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f5933559d87877ae481371abad6cdd5970f2285c
--- /dev/null
+++ b/thrust/testing/discard_iterator.cu
@@ -0,0 +1,102 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/discard_iterator.h>
+
+void TestDiscardIteratorIncrement(void)
+{
+  thrust::discard_iterator<> lhs(0);
+  thrust::discard_iterator<> rhs(0);
+
+  ASSERT_EQUAL(0, lhs - rhs);
+
+  lhs++;
+
+  ASSERT_EQUAL(1, lhs - rhs);
+  
+  lhs++;
+  lhs++;
+  
+  ASSERT_EQUAL(3, lhs - rhs);
+
+  lhs += 5;
+  
+  ASSERT_EQUAL(8, lhs - rhs);
+
+  lhs -= 10;
+  
+  ASSERT_EQUAL(-2, lhs - rhs);
+}
+DECLARE_UNITTEST(TestDiscardIteratorIncrement);
+
+void TestDiscardIteratorComparison(void)
+{
+  thrust::discard_iterator<> iter1(0);
+  thrust::discard_iterator<> iter2(0);
+
+  ASSERT_EQUAL(0, iter1 - iter2);
+  ASSERT_EQUAL(true, iter1 == iter2);
+
+  iter1++;
+  
+  ASSERT_EQUAL(1, iter1 - iter2);
+  ASSERT_EQUAL(false, iter1 == iter2);
+  
+  iter2++;
+
+  ASSERT_EQUAL(0, iter1 - iter2);
+  ASSERT_EQUAL(true, iter1 == iter2);
+  
+  iter1 += 100;
+  iter2 += 100;
+
+  ASSERT_EQUAL(0, iter1 - iter2);
+  ASSERT_EQUAL(true, iter1 == iter2);
+}
+DECLARE_UNITTEST(TestDiscardIteratorComparison);
+
+void TestMakeDiscardIterator(void)
+{
+  thrust::discard_iterator<> iter0 = thrust::make_discard_iterator(13);
+
+  *iter0 = 7;
+
+  thrust::discard_iterator<> iter1 = thrust::make_discard_iterator(7);
+
+  *iter1 = 13;
+
+  ASSERT_EQUAL(6, iter0 - iter1);
+}
+DECLARE_UNITTEST(TestMakeDiscardIterator);
+
+void TestZippedDiscardIterator(void)
+{
+  using namespace thrust;
+
+  typedef tuple<discard_iterator<> >  IteratorTuple1;
+  typedef zip_iterator<IteratorTuple1> ZipIterator1;
+
+  IteratorTuple1 t = thrust::make_tuple(thrust::make_discard_iterator());
+
+  ZipIterator1 z_iter1_first = thrust::make_zip_iterator(t);
+  ZipIterator1 z_iter1_last  = z_iter1_first + 10;
+  for(; z_iter1_first != z_iter1_last; ++z_iter1_first)
+  {
+    ;
+  }
+  
+  ASSERT_EQUAL(10, thrust::get<0>(z_iter1_first.get_iterator_tuple()) - thrust::make_discard_iterator());
+
+  typedef tuple<int *, discard_iterator<> > IteratorTuple2;
+  typedef zip_iterator<IteratorTuple2>      ZipIterator2;
+
+  ZipIterator2 z_iter_first = thrust::make_zip_iterator(thrust::make_tuple((int*)0, thrust::make_discard_iterator()));
+  ZipIterator2 z_iter_last  = z_iter_first + 10;
+
+  for(; z_iter_first != z_iter_last; ++z_iter_first)
+  {
+    ;
+  }
+
+  ASSERT_EQUAL(10, thrust::get<1>(z_iter_first.get_iterator_tuple()) - thrust::make_discard_iterator());
+}
+DECLARE_UNITTEST(TestZippedDiscardIterator);
+
diff --git a/thrust/testing/distance.cu b/thrust/testing/distance.cu
new file mode 100644
index 0000000000000000000000000000000000000000..93e8abbf0342983860c22421b42c6e6c0d6a6116
--- /dev/null
+++ b/thrust/testing/distance.cu
@@ -0,0 +1,28 @@
+#include <unittest/unittest.h>
+#include <thrust/distance.h>
+
+// TODO expand this with other iterator types (forward, bidirectional, etc.)
+
+template <typename Vector>
+void TestDistance(void)
+{
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(100);
+
+    Iterator i = v.begin();
+
+    ASSERT_EQUAL(thrust::distance(i, v.end()), 100);
+
+    i++;
+
+    ASSERT_EQUAL(thrust::distance(i, v.end()), 99);
+
+    i += 49;
+
+    ASSERT_EQUAL(thrust::distance(i, v.end()), 50);
+    
+    ASSERT_EQUAL(thrust::distance(i, i), 0);
+}
+DECLARE_VECTOR_UNITTEST(TestDistance);
+
diff --git a/thrust/testing/equal.cu b/thrust/testing/equal.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ca9f7eb69e701a1c1f9acece1a6a12f3f584879f
--- /dev/null
+++ b/thrust/testing/equal.cu
@@ -0,0 +1,151 @@
+#include <unittest/unittest.h>
+#include <thrust/equal.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+template <class Vector>
+void TestEqualSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(5);
+    Vector v2(5);
+    v1[0] = 5; v1[1] = 2; v1[2] = 0; v1[3] = 0; v1[4] = 0;
+    v2[0] = 5; v2[1] = 2; v2[2] = 0; v2[3] = 6; v2[4] = 1;
+
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.end(), v1.begin()), true);
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.end(), v2.begin()), false);
+    ASSERT_EQUAL(thrust::equal(v2.begin(), v2.end(), v2.begin()), true);
+    
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.begin() + 0, v1.begin()), true);
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.begin() + 1, v1.begin()), true);
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.begin() + 3, v2.begin()), true);
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.begin() + 4, v2.begin()), false);
+    
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.end(), v2.begin(), thrust::less_equal<T>()), true);
+    ASSERT_EQUAL(thrust::equal(v1.begin(), v1.end(), v2.begin(), thrust::greater<T>()),    false);
+}
+DECLARE_VECTOR_UNITTEST(TestEqualSimple);
+
+template <typename T>
+void TestEqual(const size_t n)
+{
+    thrust::host_vector<T>   h_data1 = unittest::random_samples<T>(n);
+    thrust::host_vector<T>   h_data2 = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data1 = h_data1;
+    thrust::device_vector<T> d_data2 = h_data2;
+
+    //empty ranges
+    ASSERT_EQUAL(thrust::equal(h_data1.begin(), h_data1.begin(), h_data1.begin()), true);
+    ASSERT_EQUAL(thrust::equal(d_data1.begin(), d_data1.begin(), d_data1.begin()), true);
+    
+    //symmetric cases
+    ASSERT_EQUAL(thrust::equal(h_data1.begin(), h_data1.end(), h_data1.begin()), true);
+    ASSERT_EQUAL(thrust::equal(d_data1.begin(), d_data1.end(), d_data1.begin()), true);
+
+    if (n > 0)
+    {
+        h_data1[0] = 0; h_data2[0] = 1;
+        d_data1[0] = 0; d_data2[0] = 1;
+
+        //different vectors
+        ASSERT_EQUAL(thrust::equal(h_data1.begin(), h_data1.end(), h_data2.begin()), false);
+        ASSERT_EQUAL(thrust::equal(d_data1.begin(), d_data1.end(), d_data2.begin()), false);
+
+        //different predicates
+        ASSERT_EQUAL(thrust::equal(h_data1.begin(), h_data1.begin() + 1, h_data2.begin(), thrust::less<T>()), true);
+        ASSERT_EQUAL(thrust::equal(d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::less<T>()), true);
+        ASSERT_EQUAL(thrust::equal(h_data1.begin(), h_data1.begin() + 1, h_data2.begin(), thrust::greater<T>()), false);
+        ASSERT_EQUAL(thrust::equal(d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::greater<T>()), false);
+    }
+}
+DECLARE_VARIABLE_UNITTEST(TestEqual);
+
+template<typename InputIterator1, typename InputIterator2>
+bool equal(my_system &system, InputIterator1 /*first*/, InputIterator1, InputIterator2)
+{
+    system.validate_dispatch();
+    return false;
+}
+
+void TestEqualDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::equal(sys,
+                  vec.begin(),
+                  vec.end(),
+                  vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestEqualDispatchExplicit);
+
+
+template<typename InputIterator1, typename InputIterator2>
+bool equal(my_tag, InputIterator1 first, InputIterator1, InputIterator2)
+{
+    *first = 13;
+    return false;
+}
+
+void TestEqualDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::equal(thrust::retag<my_tag>(vec.begin()),
+                  thrust::retag<my_tag>(vec.end()),
+                  thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestEqualDispatchImplicit);
+
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    bool operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestEqualWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
+
+    ASSERT_EQUAL(thrust::equal(thrust::device, begin, end, begin, fn), true);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestEqualWithBigIndexes()
+{
+    TestEqualWithBigIndexesHelper(30);
+    TestEqualWithBigIndexesHelper(31);
+    TestEqualWithBigIndexesHelper(32);
+    TestEqualWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestEqualWithBigIndexes);
diff --git a/thrust/testing/event.cu b/thrust/testing/event.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5833d4145291300616c282a23cd1ca03b368ac18
--- /dev/null
+++ b/thrust/testing/event.cu
@@ -0,0 +1,180 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/event.h>
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_default_constructed()
+{
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::event<decltype(thrust::device)>
+    , thrust::unique_eager_event<decltype(thrust::device)>
+    >::value)
+  );
+
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::event<decltype(thrust::device)>
+    , thrust::device_event
+    >::value)
+  );
+
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::device_event
+    , thrust::device_unique_eager_event
+    >::value)
+  );
+
+  thrust::device_event e0;
+
+  ASSERT_EQUAL(false, e0.valid_stream());
+
+  ASSERT_THROWS_EQUAL(
+    e0.wait()
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_state)
+  );
+
+  ASSERT_THROWS_EQUAL(
+    e0.stream()
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_state)
+  );
+}
+DECLARE_UNITTEST(test_event_default_constructed);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_new_stream()
+{
+  auto e0 = thrust::device_event(thrust::new_stream);
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0.stream().native_handle());    
+
+  e0.wait();
+
+  ASSERT_EQUAL(true, e0.ready());
+}
+DECLARE_UNITTEST(test_event_new_stream);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_linear_chaining()
+{
+  constexpr std::int64_t n = 1024;
+
+  // Create a new stream.
+  auto e0 = thrust::when_all();
+
+  auto const e0_stream = e0.stream().native_handle();
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0_stream);
+
+  thrust::device_event e1;
+
+  for (std::int64_t i = 0; i < n; ++i)
+  {
+    ASSERT_EQUAL(true,  e0.valid_stream());
+
+    ASSERT_EQUAL(false, e1.valid_stream());
+    ASSERT_EQUAL(false, e1.ready());
+
+    ASSERT_EQUAL_QUIET(e0_stream, e0.stream().native_handle());
+
+    e1 = thrust::when_all(e0);
+
+    ASSERT_EQUAL(false, e0.valid_stream());
+    ASSERT_EQUAL(false, e0.ready());
+
+    ASSERT_EQUAL(true,  e1.valid_stream());
+
+    ASSERT_EQUAL(e0_stream, e1.stream().native_handle());
+
+    std::swap(e0, e1);
+  }
+}
+DECLARE_UNITTEST(test_event_linear_chaining);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_when_all()
+{
+  // Create events with new streams.
+  auto e0 = thrust::when_all();
+  auto e1 = thrust::when_all();
+  auto e2 = thrust::when_all();
+  auto e3 = thrust::when_all();
+  auto e4 = thrust::when_all();
+  auto e5 = thrust::when_all();
+  auto e6 = thrust::when_all();
+  auto e7 = thrust::when_all();
+
+  auto const e0_stream = e0.stream().native_handle();
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+  ASSERT_EQUAL(true, e1.valid_stream());
+  ASSERT_EQUAL(true, e2.valid_stream());
+  ASSERT_EQUAL(true, e3.valid_stream());
+  ASSERT_EQUAL(true, e4.valid_stream());
+  ASSERT_EQUAL(true, e5.valid_stream());
+  ASSERT_EQUAL(true, e6.valid_stream());
+  ASSERT_EQUAL(true, e7.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0_stream);
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e1.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e2.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e3.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e4.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e5.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e6.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e7.stream().native_handle());
+
+  auto e8 = thrust::when_all(e0, e1, e2, e3, e4, e5, e6, e7);
+
+  ASSERT_EQUAL(false, e0.valid_stream());
+  ASSERT_EQUAL(false, e1.valid_stream());
+  ASSERT_EQUAL(false, e2.valid_stream());
+  ASSERT_EQUAL(false, e3.valid_stream());
+  ASSERT_EQUAL(false, e4.valid_stream());
+  ASSERT_EQUAL(false, e5.valid_stream());
+  ASSERT_EQUAL(false, e6.valid_stream());
+  ASSERT_EQUAL(false, e7.valid_stream());
+
+  ASSERT_EQUAL(true, e8.valid_stream());
+
+  ASSERT_EQUAL(e0_stream, e8.stream().native_handle());
+
+  e8.wait();
+
+  ASSERT_EQUAL(false, e0.ready());
+  ASSERT_EQUAL(false, e1.ready());
+  ASSERT_EQUAL(false, e2.ready());
+  ASSERT_EQUAL(false, e3.ready());
+  ASSERT_EQUAL(false, e4.ready());
+  ASSERT_EQUAL(false, e5.ready());
+  ASSERT_EQUAL(false, e6.ready());
+  ASSERT_EQUAL(false, e7.ready());
+
+  ASSERT_EQUAL(true,  e8.ready());
+}
+DECLARE_UNITTEST(test_event_when_all);
+
+///////////////////////////////////////////////////////////////////////////////
+ 
+#endif
+
diff --git a/thrust/testing/fill.cu b/thrust/testing/fill.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7154b4118fe5869e0e7714d96ba96d43ca73085d
--- /dev/null
+++ b/thrust/testing/fill.cu
@@ -0,0 +1,465 @@
+#include <unittest/unittest.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <algorithm>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+template <class Vector>
+void TestFillSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    thrust::fill(v.begin() + 1, v.begin() + 4, (T) 7);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 7);
+    ASSERT_EQUAL(v[2], 7);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+
+    thrust::fill(v.begin() + 0, v.begin() + 3, (T) 8);
+
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], 8);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+
+    thrust::fill(v.begin() + 2, v.end(), (T) 9);
+
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], 9);
+    ASSERT_EQUAL(v[3], 9);
+    ASSERT_EQUAL(v[4], 9);
+
+    thrust::fill(v.begin(), v.end(), (T) 1);
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
+    ASSERT_EQUAL(v[4], 1);
+}
+DECLARE_VECTOR_UNITTEST(TestFillSimple);
+
+
+void TestFillDiscardIterator(void)
+{
+    // there's no result to check because fill returns void
+    thrust::fill(thrust::discard_iterator<thrust::host_system_tag>(),
+                 thrust::discard_iterator<thrust::host_system_tag>(10),
+                 13);
+
+    thrust::fill(thrust::discard_iterator<thrust::device_system_tag>(),
+                 thrust::discard_iterator<thrust::device_system_tag>(10),
+                 13);
+}
+DECLARE_UNITTEST(TestFillDiscardIterator);
+
+
+template <class Vector>
+void TestFillMixedTypes(void)
+{
+    Vector v(4);
+
+    thrust::fill(v.begin(), v.end(), bool(true));
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
+
+    thrust::fill(v.begin(), v.end(), char(20));
+
+    ASSERT_EQUAL(v[0], 20);
+    ASSERT_EQUAL(v[1], 20);
+    ASSERT_EQUAL(v[2], 20);
+    ASSERT_EQUAL(v[3], 20);
+}
+DECLARE_VECTOR_UNITTEST(TestFillMixedTypes);
+
+
+template <typename T>
+void TestFill(size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::fill(h_data.begin() + std::min((size_t)1, n), h_data.begin() + std::min((size_t)3, n), (T) 0);
+    thrust::fill(d_data.begin() + std::min((size_t)1, n), d_data.begin() + std::min((size_t)3, n), (T) 0);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::fill(h_data.begin() + std::min((size_t)117, n), h_data.begin() + std::min((size_t)367, n), (T) 1);
+    thrust::fill(d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2);
+    thrust::fill(d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3);
+    thrust::fill(d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::fill(h_data.begin(), h_data.end(), (T) 4);
+    thrust::fill(d_data.begin(), d_data.end(), (T) 4);
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestFill);
+
+template <class Vector>
+void TestFillNSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    typename Vector::iterator iter = thrust::fill_n(v.begin() + 1, 3, (T) 7);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 7);
+    ASSERT_EQUAL(v[2], 7);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL_QUIET(v.begin() + 4, iter);
+
+    iter = thrust::fill_n(v.begin() + 0, 3, (T) 8);
+
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], 8);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL_QUIET(v.begin() + 3, iter);
+
+    iter = thrust::fill_n(v.begin() + 2, 3, (T) 9);
+
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], 9);
+    ASSERT_EQUAL(v[3], 9);
+    ASSERT_EQUAL(v[4], 9);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+
+    iter = thrust::fill_n(v.begin(), v.size(), (T) 1);
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
+    ASSERT_EQUAL(v[4], 1);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+}
+DECLARE_VECTOR_UNITTEST(TestFillNSimple);
+
+
+void TestFillNDiscardIterator(void)
+{
+  thrust::discard_iterator<thrust::host_system_tag> h_result =
+    thrust::fill_n(thrust::discard_iterator<thrust::host_system_tag>(),
+                   10,
+                   13);
+
+  thrust::discard_iterator<thrust::device_system_tag> d_result =
+    thrust::fill_n(thrust::discard_iterator<thrust::device_system_tag>(),
+                   10,
+                   13);
+
+  thrust::discard_iterator<> reference(10);
+
+  ASSERT_EQUAL_QUIET(reference, h_result);
+  ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_UNITTEST(TestFillNDiscardIterator);
+
+
+template <class Vector>
+void TestFillNMixedTypes(void)
+{
+    Vector v(4);
+
+    typename Vector::iterator iter = thrust::fill_n(v.begin(), v.size(), bool(true));
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+
+    iter = thrust::fill_n(v.begin(), v.size(), char(20));
+
+    ASSERT_EQUAL(v[0], 20);
+    ASSERT_EQUAL(v[1], 20);
+    ASSERT_EQUAL(v[2], 20);
+    ASSERT_EQUAL(v[3], 20);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+}
+DECLARE_VECTOR_UNITTEST(TestFillNMixedTypes);
+
+
+template <typename T>
+void TestFillN(size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t begin_offset = std::min<size_t>(1,n);
+    thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
+    thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    begin_offset = std::min<size_t>(117, n);
+    thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+    thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    begin_offset = std::min<size_t>(8, n);
+    thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+    thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    begin_offset = std::min<size_t>(3, n);
+    thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3);
+    thrust::fill_n(d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3);
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::fill_n(h_data.begin(), h_data.size(), (T) 4);
+    thrust::fill_n(d_data.begin(), d_data.size(), (T) 4);
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestFillN);
+
+
+template <typename Vector>
+void TestFillZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3,T(0));
+    Vector v2(3,T(0));
+    Vector v3(3,T(0));
+
+    thrust::fill(thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin(),v3.begin())),
+                 thrust::make_zip_iterator(thrust::make_tuple(v1.end(),v2.end(),v3.end())),
+                 thrust::tuple<T,T,T>(4,7,13));
+
+    ASSERT_EQUAL(4,  v1[0]);
+    ASSERT_EQUAL(4,  v1[1]);
+    ASSERT_EQUAL(4,  v1[2]);
+    ASSERT_EQUAL(7,  v2[0]);
+    ASSERT_EQUAL(7,  v2[1]);
+    ASSERT_EQUAL(7,  v2[2]);
+    ASSERT_EQUAL(13, v3[0]);
+    ASSERT_EQUAL(13, v3[1]);
+    ASSERT_EQUAL(13, v3[2]);
+};
+DECLARE_VECTOR_UNITTEST(TestFillZipIterator);
+
+
+void TestFillTuple(void)
+{
+    typedef int T;
+    typedef thrust::tuple<T,T> Tuple;
+
+    thrust::host_vector<Tuple>   h(3, Tuple(0,0));
+    thrust::device_vector<Tuple> d(3, Tuple(0,0));
+
+    thrust::fill(h.begin(), h.end(), Tuple(4,7));
+    thrust::fill(d.begin(), d.end(), Tuple(4,7));
+
+    ASSERT_EQUAL_QUIET(h, d);
+};
+DECLARE_UNITTEST(TestFillTuple);
+
+
+struct TypeWithTrivialAssigment
+{
+  int x, y, z;
+};
+
+void TestFillWithTrivialAssignment(void)
+{
+    typedef TypeWithTrivialAssigment T;
+
+    thrust::host_vector<T>   h(1);
+    thrust::device_vector<T> d(1);
+
+    ASSERT_EQUAL(h[0].x, 0);
+    ASSERT_EQUAL(h[0].y, 0);
+    ASSERT_EQUAL(h[0].z, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).x, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).y, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).z, 0);
+
+    T val;
+    val.x = 10;
+    val.y = 20;
+    val.z = -1;
+
+    thrust::fill(h.begin(), h.end(), val);
+    thrust::fill(d.begin(), d.end(), val);
+
+    ASSERT_EQUAL(h[0].x, 10);
+    ASSERT_EQUAL(h[0].y, 20);
+    ASSERT_EQUAL(h[0].z, -1);
+    ASSERT_EQUAL(static_cast<T>(d[0]).x, 10);
+    ASSERT_EQUAL(static_cast<T>(d[0]).y, 20);
+    ASSERT_EQUAL(static_cast<T>(d[0]).z, -1);
+};
+DECLARE_UNITTEST(TestFillWithTrivialAssignment);
+
+
+struct TypeWithNonTrivialAssigment
+{
+  int x, y, z;
+
+  __host__ __device__
+  TypeWithNonTrivialAssigment() : x(0), y(0), z(0) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  TypeWithNonTrivialAssigment(const TypeWithNonTrivialAssigment &) = default;
+#endif
+
+  __host__ __device__
+  TypeWithNonTrivialAssigment& operator=(const TypeWithNonTrivialAssigment& t)
+  {
+    x = t.x;
+    y = t.y;
+    z = t.x + t.y;
+    return *this;
+  }
+
+  __host__ __device__
+  bool operator==(const TypeWithNonTrivialAssigment& t) const
+  {
+    return x == t.x && y == t.y && z == t.z;
+  }
+};
+
+void TestFillWithNonTrivialAssignment(void)
+{
+    typedef TypeWithNonTrivialAssigment T;
+
+    thrust::host_vector<T>   h(1);
+    thrust::device_vector<T> d(1);
+
+    ASSERT_EQUAL(h[0].x, 0);
+    ASSERT_EQUAL(h[0].y, 0);
+    ASSERT_EQUAL(h[0].z, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).x, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).y, 0);
+    ASSERT_EQUAL(static_cast<T>(d[0]).z, 0);
+
+    T val;
+    val.x = 10;
+    val.y = 20;
+    val.z = -1;
+
+    thrust::fill(h.begin(), h.end(), val);
+    thrust::fill(d.begin(), d.end(), val);
+
+    ASSERT_EQUAL(h[0].x, 10);
+    ASSERT_EQUAL(h[0].y, 20);
+    ASSERT_EQUAL(h[0].z, 30);
+    ASSERT_EQUAL(static_cast<T>(d[0]).x, 10);
+    ASSERT_EQUAL(static_cast<T>(d[0]).y, 20);
+    ASSERT_EQUAL(static_cast<T>(d[0]).z, 30);
+};
+DECLARE_UNITTEST(TestFillWithNonTrivialAssignment);
+
+
+template<typename ForwardIterator, typename T>
+void fill(my_system &system, ForwardIterator /*first*/, ForwardIterator, const T&)
+{
+    system.validate_dispatch();
+}
+
+void TestFillDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::fill(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFillDispatchExplicit);
+
+
+template<typename ForwardIterator, typename T>
+void fill(my_tag, ForwardIterator first, ForwardIterator, const T&)
+{
+    *first = 13;
+}
+
+void TestFillDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::fill(thrust::retag<my_tag>(vec.begin()),
+                 thrust::retag<my_tag>(vec.end()),
+                 0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestFillDispatchImplicit);
+
+
+template<typename OutputIterator, typename Size, typename T>
+OutputIterator fill_n(my_system &system, OutputIterator first, Size, const T&)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestFillNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::fill_n(sys, vec.begin(), vec.size(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFillNDispatchExplicit);
+
+
+template<typename OutputIterator, typename Size, typename T>
+OutputIterator fill_n(my_tag, OutputIterator first, Size, const T&)
+{
+    *first = 13;
+    return first;
+}
+
+void TestFillNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::fill_n(thrust::retag<my_tag>(vec.begin()),
+                   vec.size(),
+                   0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestFillNDispatchImplicit);
+
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/find.cu b/thrust/testing/find.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9252171ddaf6816c5729668eaac977f6497010c7
--- /dev/null
+++ b/thrust/testing/find.cu
@@ -0,0 +1,373 @@
+#include <unittest/unittest.h>
+#include <thrust/sequence.h>
+#include <thrust/find.h>
+#include <thrust/iterator/retag.h>
+
+
+template <typename T>
+struct equal_to_value_pred
+{
+    T value;
+
+    equal_to_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v == value; }
+};
+
+template <typename T>
+struct not_equal_to_value_pred
+{
+    T value;
+
+    not_equal_to_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v != value; }
+};
+
+template<typename T>
+struct less_than_value_pred
+{
+    T value;
+
+    less_than_value_pred(T value) : value(value) {}
+
+    __host__ __device__
+    bool operator()(T v) const { return v < value; }
+};
+
+template <class Vector>
+void TestFindSimple(void)
+{
+    Vector vec(5);
+    vec[0] = 1;
+    vec[1] = 2;
+    vec[2] = 3;
+    vec[3] = 3;
+    vec[4] = 5;
+
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 0) - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 1) - vec.begin(), 0);
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 2) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 3) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 4) - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::find(vec.begin(), vec.end(), 5) - vec.begin(), 4);
+}
+DECLARE_VECTOR_UNITTEST(TestFindSimple);
+
+template<typename InputIterator, typename T>
+InputIterator find(my_system &system, InputIterator first, InputIterator, const T&)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestFindDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::find(sys,
+                 vec.begin(),
+                 vec.end(),
+                 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFindDispatchExplicit);
+
+
+template<typename InputIterator, typename T>
+InputIterator find(my_tag, InputIterator first, InputIterator, const T&)
+{
+    *first = 13;
+    return first;
+}
+
+void TestFindDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::find(thrust::retag<my_tag>(vec.begin()),
+                 thrust::retag<my_tag>(vec.end()),
+                 0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestFindDispatchImplicit);
+
+
+template <class Vector>
+void TestFindIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+    vec[0] = 1;
+    vec[1] = 2;
+    vec[2] = 3;
+    vec[3] = 3;
+    vec[4] = 5;
+
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(0)) - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(1)) - vec.begin(), 0);
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(2)) - vec.begin(), 1);
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(3)) - vec.begin(), 2);
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(4)) - vec.begin(), 5);
+    ASSERT_EQUAL(thrust::find_if(vec.begin(), vec.end(), equal_to_value_pred<T>(5)) - vec.begin(), 4);
+}
+DECLARE_VECTOR_UNITTEST(TestFindIfSimple);
+
+template<typename InputIterator, typename Predicate>
+InputIterator find_if(my_system &system, InputIterator first, InputIterator, Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestFindIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::find_if(sys,
+                    vec.begin(),
+                    vec.end(),
+                    thrust::identity<int>());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFindIfDispatchExplicit);
+
+
+template<typename InputIterator, typename Predicate>
+InputIterator find_if(my_tag, InputIterator first, InputIterator, Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestFindIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::find_if(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.end()),
+                    thrust::identity<int>());
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestFindIfDispatchImplicit);
+
+
+template <class Vector>
+void TestFindIfNotSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector vec(5);
+    vec[0] = 0;
+    vec[1] = 1;
+    vec[2] = 2;
+    vec[3] = 3;
+    vec[4] = 4;
+
+    ASSERT_EQUAL(0, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(0)) - vec.begin());
+    ASSERT_EQUAL(1, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(1)) - vec.begin());
+    ASSERT_EQUAL(2, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(2)) - vec.begin());
+    ASSERT_EQUAL(3, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(3)) - vec.begin());
+    ASSERT_EQUAL(4, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(4)) - vec.begin());
+    ASSERT_EQUAL(5, thrust::find_if_not(vec.begin(), vec.end(), less_than_value_pred<T>(5)) - vec.begin());
+}
+DECLARE_VECTOR_UNITTEST(TestFindIfNotSimple);
+
+
+template<typename InputIterator, typename Predicate>
+InputIterator find_if_not(my_system &system, InputIterator first, InputIterator, Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestFindIfNotDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::find_if_not(sys,
+                        vec.begin(),
+                        vec.end(),
+                        thrust::identity<int>());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFindIfNotDispatchExplicit);
+
+
+template<typename InputIterator, typename Predicate>
+InputIterator find_if_not(my_tag, InputIterator first, InputIterator, Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestFindIfNotDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::find_if_not(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()),
+                        thrust::identity<int>());
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestFindIfNotDispatchImplicit);
+
+
+template <typename T>
+struct TestFind
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_iter;
+        typename thrust::device_vector<T>::iterator d_iter;
+
+        h_iter = thrust::find(h_data.begin(), h_data.end(), T(0));
+        d_iter = thrust::find(d_data.begin(), d_data.end(), T(0));
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+
+        for (size_t i = 1; i < n; i *= 2)
+        {
+            T sample = h_data[i];
+            h_iter = thrust::find(h_data.begin(), h_data.end(), sample);
+            d_iter = thrust::find(d_data.begin(), d_data.end(), sample);
+            ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+        }
+    }
+};
+VariableUnitTest<TestFind, SignedIntegralTypes> TestFindInstance;
+
+
+template <typename T>
+struct TestFindIf
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_iter;
+        typename thrust::device_vector<T>::iterator d_iter;
+
+        h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<T>(0));
+        d_iter = thrust::find_if(d_data.begin(), d_data.end(), equal_to_value_pred<T>(0));
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+
+        for (size_t i = 1; i < n; i *= 2)
+        {
+            T sample = h_data[i];
+            h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<T>(sample));
+            d_iter = thrust::find_if(d_data.begin(), d_data.end(), equal_to_value_pred<T>(sample));
+            ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+        }
+    }
+};
+VariableUnitTest<TestFindIf, SignedIntegralTypes> TestFindIfInstance;
+
+
+template <typename T>
+struct TestFindIfNot
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_iter;
+        typename thrust::device_vector<T>::iterator d_iter;
+
+        h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<T>(0));
+        d_iter = thrust::find_if_not(d_data.begin(), d_data.end(), not_equal_to_value_pred<T>(0));
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+
+        for (size_t i = 1; i < n; i *= 2)
+        {
+            T sample = h_data[i];
+            h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<T>(sample));
+            d_iter = thrust::find_if_not(d_data.begin(), d_data.end(), not_equal_to_value_pred<T>(sample));
+            ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+        }
+    }
+};
+VariableUnitTest<TestFindIfNot, SignedIntegralTypes> TestFindIfNotInstance;
+
+void TestFindWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+}
+
+void TestFindWithBigIndexes()
+{
+    TestFindWithBigIndexesHelper(30);
+    TestFindWithBigIndexesHelper(31);
+    TestFindWithBigIndexesHelper(32);
+    TestFindWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestFindWithBigIndexes);
+
+namespace
+{
+
+class Weird
+{
+  int value;
+
+public:
+  __host__ __device__ Weird(int val, int)
+      : value(val)
+  {}
+
+  friend __host__ __device__
+  bool operator==(int x, Weird y)
+  {
+    return x == y.value;
+  }
+};
+
+} // end anon namespace
+
+void TestFindAsymmetricEquality()
+{ // Regression test for thrust/thrust#1229
+  thrust::host_vector<int> v(1000);
+  thrust::sequence(v.begin(), v.end());
+  thrust::device_vector<int> dv(v);
+  auto result = thrust::find(dv.begin(), dv.end(), Weird(333, 0));
+  ASSERT_EQUAL(*result, 333);
+  ASSERT_EQUAL(result - dv.begin(), 333);
+}
+DECLARE_UNITTEST(TestFindAsymmetricEquality);
diff --git a/thrust/testing/for_each.cu b/thrust/testing/for_each.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8040e5f78553ee9280dc3f664dda23076a3b8f4a
--- /dev/null
+++ b/thrust/testing/for_each.cu
@@ -0,0 +1,399 @@
+#include <unittest/unittest.h>
+#include <thrust/for_each.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <algorithm>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+template <typename T>
+class mark_present_for_each
+{
+    public:
+        T * ptr;
+        __host__ __device__ void operator()(T x){ ptr[(int) x] = 1; }
+};
+
+template <class Vector>
+void TestForEachSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input(5);
+    Vector output(7, (T) 0);
+
+    input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;
+
+    mark_present_for_each<T> f;
+    f.ptr = thrust::raw_pointer_cast(output.data());
+
+    typename Vector::iterator result = thrust::for_each(input.begin(), input.end(), f);
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 0);
+    ASSERT_EQUAL(output[2], 1);
+    ASSERT_EQUAL(output[3], 1);
+    ASSERT_EQUAL(output[4], 1);
+    ASSERT_EQUAL(output[5], 0);
+    ASSERT_EQUAL(output[6], 1);
+    ASSERT_EQUAL_QUIET(result, input.end());
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestForEachSimple);
+
+
+template<typename InputIterator, typename Function>
+InputIterator for_each(my_system &system, InputIterator first, InputIterator, Function)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestForEachDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::for_each(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestForEachDispatchExplicit);
+
+
+template<typename InputIterator, typename Function>
+InputIterator for_each(my_tag, InputIterator first, InputIterator, Function)
+{
+    *first = 13;
+    return first;
+}
+
+void TestForEachDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::for_each(thrust::retag<my_tag>(vec.begin()),
+                     thrust::retag<my_tag>(vec.end()),
+                     0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestForEachDispatchImplicit);
+
+
+template <class Vector>
+void TestForEachNSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input(5);
+    Vector output(7, (T) 0);
+
+    input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;
+
+    mark_present_for_each<T> f;
+    f.ptr = thrust::raw_pointer_cast(output.data());
+
+    typename Vector::iterator result = thrust::for_each_n(input.begin(), input.size(), f);
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 0);
+    ASSERT_EQUAL(output[2], 1);
+    ASSERT_EQUAL(output[3], 1);
+    ASSERT_EQUAL(output[4], 1);
+    ASSERT_EQUAL(output[5], 0);
+    ASSERT_EQUAL(output[6], 1);
+    ASSERT_EQUAL_QUIET(result, input.end());
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestForEachNSimple);
+
+
+template<typename InputIterator, typename Size, typename Function>
+InputIterator for_each_n(my_system &system, InputIterator first, Size, Function)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestForEachNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::for_each_n(sys, vec.begin(), vec.size(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestForEachNDispatchExplicit);
+
+
+template<typename InputIterator, typename Size, typename Function>
+InputIterator for_each_n(my_tag, InputIterator first, Size, Function)
+{
+    *first = 13;
+    return first;
+}
+
+void TestForEachNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::for_each_n(thrust::retag<my_tag>(vec.begin()),
+                       vec.size(),
+                       0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestForEachNDispatchImplicit);
+
+
+void TestForEachSimpleAnySystem(void)
+{
+    thrust::device_vector<int> output(7, 0);
+
+    mark_present_for_each<int> f;
+    f.ptr = thrust::raw_pointer_cast(output.data());
+
+    thrust::counting_iterator<int> result = thrust::for_each(thrust::make_counting_iterator(0), thrust::make_counting_iterator(5), f);
+
+    ASSERT_EQUAL(output[0], 1);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 1);
+    ASSERT_EQUAL(output[3], 1);
+    ASSERT_EQUAL(output[4], 1);
+    ASSERT_EQUAL(output[5], 0);
+    ASSERT_EQUAL(output[6], 0);
+    ASSERT_EQUAL_QUIET(result, thrust::make_counting_iterator(5));
+}
+DECLARE_UNITTEST(TestForEachSimpleAnySystem);
+
+
+void TestForEachNSimpleAnySystem(void)
+{
+    thrust::device_vector<int> output(7, 0);
+
+    mark_present_for_each<int> f;
+    f.ptr = thrust::raw_pointer_cast(output.data());
+
+    thrust::counting_iterator<int> result = thrust::for_each_n(thrust::make_counting_iterator(0), 5, f);
+
+    ASSERT_EQUAL(output[0], 1);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 1);
+    ASSERT_EQUAL(output[3], 1);
+    ASSERT_EQUAL(output[4], 1);
+    ASSERT_EQUAL(output[5], 0);
+    ASSERT_EQUAL(output[6], 0);
+    ASSERT_EQUAL_QUIET(result, thrust::make_counting_iterator(5));
+}
+DECLARE_UNITTEST(TestForEachNSimpleAnySystem);
+
+
+template <typename T>
+void TestForEach(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_input[i] =  ((size_t) h_input[i]) % output_size;
+    
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(output_size, (T) 0);
+    thrust::device_vector<T> d_output(output_size, (T) 0);
+
+    mark_present_for_each<T> h_f;
+    mark_present_for_each<T> d_f;
+    h_f.ptr = &h_output[0];
+    d_f.ptr = (&d_output[0]).get();
+    
+    typename thrust::host_vector<T>::iterator h_result =
+      thrust::for_each(h_input.begin(), h_input.end(), h_f);
+
+    typename thrust::device_vector<T>::iterator d_result =
+      thrust::for_each(d_input.begin(), d_input.end(), d_f);
+
+    ASSERT_EQUAL(h_output, d_output);
+    ASSERT_EQUAL_QUIET(h_result, h_input.end());
+    ASSERT_EQUAL_QUIET(d_result, d_input.end());
+}
+DECLARE_VARIABLE_UNITTEST(TestForEach);
+
+
+template <typename T>
+void TestForEachN(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_input[i] =  ((size_t) h_input[i]) % output_size;
+    
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(output_size, (T) 0);
+    thrust::device_vector<T> d_output(output_size, (T) 0);
+
+    mark_present_for_each<T> h_f;
+    mark_present_for_each<T> d_f;
+    h_f.ptr = &h_output[0];
+    d_f.ptr = (&d_output[0]).get();
+    
+    typename thrust::host_vector<T>::iterator h_result =
+      thrust::for_each_n(h_input.begin(), h_input.size(), h_f);
+
+    typename thrust::device_vector<T>::iterator d_result =
+      thrust::for_each_n(d_input.begin(), d_input.size(), d_f);
+
+    ASSERT_EQUAL(h_output, d_output);
+    ASSERT_EQUAL_QUIET(h_result, h_input.end());
+    ASSERT_EQUAL_QUIET(d_result, d_input.end());
+}
+DECLARE_VARIABLE_UNITTEST(TestForEachN);
+
+
+template <typename T, unsigned int N>
+struct SetFixedVectorToConstant
+{
+    FixedVector<T,N> exemplar;
+
+    SetFixedVectorToConstant(T scalar) : exemplar(scalar) {} 
+
+    __host__ __device__
+    void operator()(FixedVector<T,N>& t)
+    {
+        t = exemplar;
+    }
+};
+
+
+template <typename T, unsigned int N>
+void _TestForEachWithLargeTypes(void)
+{
+    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_data(n);
+
+    for(size_t i = 0; i < h_data.size(); i++)
+        h_data[i] = FixedVector<T,N>(i);
+
+    thrust::device_vector< FixedVector<T,N> > d_data = h_data;
+   
+    SetFixedVectorToConstant<T,N> func(123);
+
+    thrust::for_each(h_data.begin(), h_data.end(), func);
+    thrust::for_each(d_data.begin(), d_data.end(), func);
+
+    ASSERT_EQUAL_QUIET(h_data, d_data);
+}
+
+
+void TestForEachWithLargeTypes(void)
+{
+    _TestForEachWithLargeTypes<int,    1>();
+    _TestForEachWithLargeTypes<int,    2>();
+    _TestForEachWithLargeTypes<int,    4>();
+    _TestForEachWithLargeTypes<int,    8>();
+    _TestForEachWithLargeTypes<int,   16>();
+
+    _TestForEachWithLargeTypes<int,   32>();  // fails on Linux 32 w/ gcc 4.1
+    _TestForEachWithLargeTypes<int,   64>();
+    _TestForEachWithLargeTypes<int,  128>();
+    _TestForEachWithLargeTypes<int,  256>();
+    _TestForEachWithLargeTypes<int,  512>();
+    
+    // XXX parallel_for doens't support large types 
+//    _TestForEachWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
+}
+DECLARE_UNITTEST(TestForEachWithLargeTypes);
+
+
+template <typename T, unsigned int N>
+void _TestForEachNWithLargeTypes(void)
+{
+    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_data(n);
+
+    for(size_t i = 0; i < h_data.size(); i++)
+        h_data[i] = FixedVector<T,N>(i);
+
+    thrust::device_vector< FixedVector<T,N> > d_data = h_data;
+   
+    SetFixedVectorToConstant<T,N> func(123);
+
+    thrust::for_each_n(h_data.begin(), h_data.size(), func);
+    thrust::for_each_n(d_data.begin(), d_data.size(), func);
+
+    ASSERT_EQUAL_QUIET(h_data, d_data);
+}
+
+
+void TestForEachNWithLargeTypes(void)
+{
+    _TestForEachNWithLargeTypes<int,    1>();
+    _TestForEachNWithLargeTypes<int,    2>();
+    _TestForEachNWithLargeTypes<int,    4>();
+    _TestForEachNWithLargeTypes<int,    8>();
+    _TestForEachNWithLargeTypes<int,   16>();
+
+    _TestForEachNWithLargeTypes<int,   32>();  // fails on Linux 32 w/ gcc 4.1
+    _TestForEachNWithLargeTypes<int,   64>();
+    _TestForEachNWithLargeTypes<int,  128>();
+    _TestForEachNWithLargeTypes<int,  256>();
+    _TestForEachNWithLargeTypes<int,  512>();
+
+    // XXX parallel_for doens't support large types 
+//    _TestForEachNWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
+}
+DECLARE_UNITTEST(TestForEachNWithLargeTypes);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+struct only_set_when_expected
+{
+    unsigned long long expected;
+    bool * flag;
+
+    __device__
+    void operator()(unsigned long long x)
+    {
+        if (x == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+void TestForEachWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<unsigned long long> begin(0);
+    thrust::counting_iterator<unsigned long long> end = begin + (1ull << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::for_each(thrust::device, begin, end, fn);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestForEachWithBigIndexes()
+{
+    TestForEachWithBigIndexesHelper(30);
+    TestForEachWithBigIndexesHelper(31);
+    TestForEachWithBigIndexesHelper(32);
+    TestForEachWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestForEachWithBigIndexes);
diff --git a/thrust/testing/functional.cu b/thrust/testing/functional.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3b758c9b3ad0c9a151bcb23ca8715869b9b25dae
--- /dev/null
+++ b/thrust/testing/functional.cu
@@ -0,0 +1,324 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <functional>
+#include <algorithm>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+    
+const size_t NUM_SAMPLES = 10000;
+
+template <class InputVector, class OutputVector, class Operator, class ReferenceOperator>
+void TestUnaryFunctional(void)
+{
+    typedef typename InputVector::value_type  InputType;
+    typedef typename OutputVector::value_type OutputType;
+    
+    thrust::host_vector<InputType>  std_input = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<OutputType> std_output(NUM_SAMPLES);
+
+    InputVector  input = std_input;
+    OutputVector output(NUM_SAMPLES);
+
+    thrust::transform(    input.begin(),     input.end(),     output.begin(),          Operator());
+    thrust::transform(std_input.begin(), std_input.end(), std_output.begin(), ReferenceOperator());
+
+    ASSERT_EQUAL(output, std_output);
+}
+
+template <class InputVector, class OutputVector, class Operator, class ReferenceOperator>
+void TestBinaryFunctional(void)
+{
+    typedef typename InputVector::value_type  InputType;
+    typedef typename OutputVector::value_type OutputType;
+    
+    thrust::host_vector<InputType>  std_input1 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<InputType>  std_input2 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<OutputType> std_output(NUM_SAMPLES);
+
+    // Replace zeros to avoid divide by zero exceptions
+    std::replace(std_input2.begin(), std_input2.end(), (InputType) 0, (InputType) 1);
+
+    InputVector input1 = std_input1; 
+    InputVector input2 = std_input2; 
+    OutputVector output(NUM_SAMPLES);
+
+    thrust::transform(    input1.begin(),     input1.end(),      input2.begin(),     output.begin(),          Operator());
+    thrust::transform(std_input1.begin(), std_input1.end(),  std_input2.begin(), std_output.begin(), ReferenceOperator());
+
+    // Note: FP division is not bit-equal, even when nvcc is invoked with --prec-div
+    ASSERT_ALMOST_EQUAL(output, std_output);
+}
+
+
+
+// XXX add bool to list
+// Instantiate a macro for all integer-like data types
+#define INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)   \
+Macro(vector_type, operator_name, unittest::int8_t  )                  \
+Macro(vector_type, operator_name, unittest::uint8_t )                  \
+Macro(vector_type, operator_name, unittest::int16_t )                  \
+Macro(vector_type, operator_name, unittest::uint16_t)                  \
+Macro(vector_type, operator_name, unittest::int32_t )                  \
+Macro(vector_type, operator_name, unittest::uint32_t)                  \
+Macro(vector_type, operator_name, unittest::int64_t )                  \
+Macro(vector_type, operator_name, unittest::uint64_t)
+
+// Instantiate a macro for all integer and floating point data types
+#define INSTANTIATE_ALL_TYPES(Macro, vector_type, operator_name)       \
+INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)           \
+Macro(vector_type, operator_name, float)
+
+
+// op(T) -> T
+#define INSTANTIATE_UNARY_ARITHMETIC_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestUnaryFunctional< thrust::vector_type<data_type>,                                   \
+                         thrust::vector_type<data_type>,                                   \
+                         thrust::operator_name<data_type>,                                 \
+                         std::operator_name<data_type> >();
+// XXX revert OutputVector<T> back to bool
+// op(T) -> bool
+#define INSTANTIATE_UNARY_LOGICAL_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestUnaryFunctional< thrust::vector_type<data_type>,                                \
+                         thrust::vector_type<data_type>,                                \
+                         thrust::operator_name<data_type>,                              \
+                         std::operator_name<data_type> >();
+// op(T,T) -> T
+#define INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestBinaryFunctional< thrust::vector_type<data_type>,                                   \
+                          thrust::vector_type<data_type>,                                   \
+                          thrust::operator_name<data_type>,                                 \
+                          std::operator_name<data_type> >();
+// XXX revert OutputVector<T> back to bool
+// op(T,T) -> bool
+#define INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestBinaryFunctional< thrust::vector_type<data_type>,                                \
+                          thrust::vector_type<data_type>,                                \
+                          thrust::operator_name<data_type>,                              \
+                          std::operator_name<data_type> >();
+
+
+
+
+// op(T) -> T
+#define DECLARE_UNARY_ARITHMETIC_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                          \
+void Test##OperatorName##FunctionalHost(void)                                                              \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_UNARY_ARITHMETIC_FUNCTIONAL_TEST, host_vector,   operator_name);    \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                      \
+void Test##OperatorName##FunctionalDevice(void)                                                            \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_UNARY_ARITHMETIC_FUNCTIONAL_TEST, device_vector, operator_name);    \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// op(T) -> bool
+#define DECLARE_UNARY_LOGICAL_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                             \
+void Test##OperatorName##FunctionalHost(void)                                                              \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_UNARY_LOGICAL_FUNCTIONAL_TEST, host_vector,   operator_name);       \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                      \
+void Test##OperatorName##FunctionalDevice(void)                                                            \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_UNARY_LOGICAL_FUNCTIONAL_TEST, device_vector, operator_name);       \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// op(T,T) -> T
+#define DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                         \
+void Test##OperatorName##FunctionalHost(void)                                                              \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                      \
+void Test##OperatorName##FunctionalDevice(void)                                                            \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// op(T,T) -> T (for integer T only)
+#define DECLARE_BINARY_INTEGER_ARITHMETIC_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                     \
+void Test##OperatorName##FunctionalHost(void)                                                                  \
+{                                                                                                              \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                              \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                          \
+void Test##OperatorName##FunctionalDevice(void)                                                                \
+{                                                                                                              \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                              \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// op(T,T) -> bool
+#define DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                         \
+void Test##OperatorName##FunctionalHost(void)                                                           \
+{                                                                                                       \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                   \
+void Test##OperatorName##FunctionalDevice(void)                                                         \
+{                                                                                                       \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+
+
+
+// Create the unit tests
+DECLARE_UNARY_ARITHMETIC_FUNCTIONAL_UNITTEST(negate, Negate);
+DECLARE_UNARY_LOGICAL_FUNCTIONAL_UNITTEST(logical_not, LogicalNot);
+
+// Ad-hoc testing for other functionals
+template <class Vector>
+void TestIdentityFunctional(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input(3);
+    input[0] = 0; input[1] = 1; input[2] = 2;
+
+    Vector output(3);
+
+    thrust::transform(input.begin(), input.end(), output.begin(), thrust::identity<T>());
+
+    ASSERT_EQUAL(input, output);
+}
+DECLARE_VECTOR_UNITTEST(TestIdentityFunctional);
+
+template <class Vector>
+void TestProject1stFunctional(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector lhs(3);
+    Vector rhs(3);
+    lhs[0] = 0;  rhs[0] = 3; 
+    lhs[1] = 1;  rhs[1] = 4;
+    lhs[2] = 2;  rhs[2] = 5;
+
+    Vector output(3);
+
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), output.begin(), thrust::project1st<T,T>());
+
+    ASSERT_EQUAL(output, lhs);
+}
+DECLARE_VECTOR_UNITTEST(TestProject1stFunctional);
+
+template <class Vector>
+void TestProject2ndFunctional(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector lhs(3);
+    Vector rhs(3);
+    lhs[0] = 0;  rhs[0] = 3; 
+    lhs[1] = 1;  rhs[1] = 4;
+    lhs[2] = 2;  rhs[2] = 5;
+
+    Vector output(3);
+
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), output.begin(), thrust::project2nd<T,T>());
+
+    ASSERT_EQUAL(output, rhs);
+}
+DECLARE_VECTOR_UNITTEST(TestProject2ndFunctional);
+
+template <class Vector>
+void TestMaximumFunctional(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input1(3);
+    Vector input2(3);
+    input1[0] = 8; input1[1] = 3; input1[2] = 7;
+    input2[0] = 5; input2[1] = 6; input2[2] = 9;
+
+    Vector output(3);
+
+    thrust::transform(input1.begin(), input1.end(), 
+                      input2.begin(), 
+                      output.begin(), 
+                      thrust::maximum<T>());
+
+    ASSERT_EQUAL(output[0], 8);
+    ASSERT_EQUAL(output[1], 6);
+    ASSERT_EQUAL(output[2], 9);
+}
+DECLARE_VECTOR_UNITTEST(TestMaximumFunctional);
+
+template <class Vector>
+void TestMinimumFunctional(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input1(3);
+    Vector input2(3);
+    input1[0] = 8; input1[1] = 3; input1[2] = 7;
+    input2[0] = 5; input2[1] = 6; input2[2] = 9;
+
+    Vector output(3);
+
+    thrust::transform(input1.begin(), input1.end(), 
+                      input2.begin(), 
+                      output.begin(), 
+                      thrust::minimum<T>());
+
+    ASSERT_EQUAL(output[0], 5);
+    ASSERT_EQUAL(output[1], 3);
+    ASSERT_EQUAL(output[2], 7);
+}
+DECLARE_VECTOR_UNITTEST(TestMinimumFunctional);
+
+template <class Vector>
+void TestNot1(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input(5);
+    input[0] = 1; input[1] = 0; input[2] = 1; input[3] = 1; input[4] = 0;
+
+    Vector output(5);
+
+    thrust::transform(input.begin(), input.end(), 
+                      output.begin(), 
+                      thrust::not1(thrust::identity<T>()));
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 0);
+    ASSERT_EQUAL(output[3], 0);
+    ASSERT_EQUAL(output[4], 1);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestNot1);
+
+template <class Vector>
+void TestNot2(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector input1(5);
+    Vector input2(5);
+    input1[0] = 1; input1[1] = 0; input1[2] = 1; input1[3] = 1; input1[4] = 0;
+    input2[0] = 1; input2[1] = 1; input2[2] = 0; input2[3] = 1; input2[4] = 1;
+
+    Vector output(5);
+
+    thrust::transform(input1.begin(), input1.end(), 
+                      input2.begin(),
+                      output.begin(), 
+                      thrust::not2(thrust::equal_to<T>()));
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 1);
+    ASSERT_EQUAL(output[3], 0);
+    ASSERT_EQUAL(output[4], 1);
+}
+DECLARE_VECTOR_UNITTEST(TestNot2);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/functional_arithmetic.cu b/thrust/testing/functional_arithmetic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..70e29fe9a8ade44477fef60174ba05d659316b0f
--- /dev/null
+++ b/thrust/testing/functional_arithmetic.cu
@@ -0,0 +1,92 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <functional>
+#include <algorithm>
+    
+const size_t NUM_SAMPLES = 10000;
+
+template <class InputVector, class OutputVector, class Operator, class ReferenceOperator>
+void TestBinaryFunctional(void)
+{
+    typedef typename InputVector::value_type  InputType;
+    typedef typename OutputVector::value_type OutputType;
+    
+    thrust::host_vector<InputType>  std_input1 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<InputType>  std_input2 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<OutputType> std_output(NUM_SAMPLES);
+
+    // Replace zeros to avoid divide by zero exceptions
+    std::replace(std_input2.begin(), std_input2.end(), (InputType) 0, (InputType) 1);
+
+    InputVector input1 = std_input1; 
+    InputVector input2 = std_input2; 
+    OutputVector output(NUM_SAMPLES);
+
+    thrust::transform(    input1.begin(),     input1.end(),      input2.begin(),     output.begin(),          Operator());
+    thrust::transform(std_input1.begin(), std_input1.end(),  std_input2.begin(), std_output.begin(), ReferenceOperator());
+
+    // Note: FP division is not bit-equal, even when nvcc is invoked with --prec-div
+    ASSERT_ALMOST_EQUAL(output, std_output);
+}
+
+
+// XXX add bool to list
+// Instantiate a macro for all integer-like data types
+#define INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)   \
+Macro(vector_type, operator_name, unittest::int8_t  )                  \
+Macro(vector_type, operator_name, unittest::uint8_t )                  \
+Macro(vector_type, operator_name, unittest::int16_t )                  \
+Macro(vector_type, operator_name, unittest::uint16_t)                  \
+Macro(vector_type, operator_name, unittest::int32_t )                  \
+Macro(vector_type, operator_name, unittest::uint32_t)                  \
+Macro(vector_type, operator_name, unittest::int64_t )                  \
+Macro(vector_type, operator_name, unittest::uint64_t)
+
+// Instantiate a macro for all integer and floating point data types
+#define INSTANTIATE_ALL_TYPES(Macro, vector_type, operator_name)       \
+INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)           \
+Macro(vector_type, operator_name, float)
+
+
+// op(T,T) -> T
+#define INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestBinaryFunctional< thrust::vector_type<data_type>,                                   \
+                          thrust::vector_type<data_type>,                                   \
+                          thrust::operator_name<data_type>,                                 \
+                          std::operator_name<data_type> >();
+// op(T,T) -> T
+#define DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                         \
+void Test##OperatorName##FunctionalHost(void)                                                              \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                      \
+void Test##OperatorName##FunctionalDevice(void)                                                            \
+{                                                                                                          \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                          \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// op(T,T) -> T (for integer T only)
+#define DECLARE_BINARY_INTEGER_ARITHMETIC_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                     \
+void Test##OperatorName##FunctionalHost(void)                                                                  \
+{                                                                                                              \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                              \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                          \
+void Test##OperatorName##FunctionalDevice(void)                                                                \
+{                                                                                                              \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_ARITHMETIC_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                              \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// Create the unit tests
+DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(plus,       Plus      );
+DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(minus,      Minus     );
+DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(multiplies, Multiplies);
+DECLARE_BINARY_ARITHMETIC_FUNCTIONAL_UNITTEST(divides,    Divides   );
+
+DECLARE_BINARY_INTEGER_ARITHMETIC_FUNCTIONAL_UNITTEST(modulus, Modulus);
+
diff --git a/thrust/testing/functional_bitwise.cu b/thrust/testing/functional_bitwise.cu
new file mode 100644
index 0000000000000000000000000000000000000000..04faa0240977ed7fdbf7f3e3929517e2b85d4dbd
--- /dev/null
+++ b/thrust/testing/functional_bitwise.cu
@@ -0,0 +1,111 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <functional>
+#include <algorithm>
+    
+const size_t NUM_SAMPLES = 10000;
+
+// STL doesn't necessarily have these available
+namespace ref
+{
+
+template<typename T>
+  struct bit_and
+{
+  T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs & rhs;
+  }
+};
+
+template<typename T>
+  struct bit_or
+{
+  T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs | rhs;
+  }
+};
+
+template<typename T>
+  struct bit_xor
+{
+  T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs ^ rhs;
+  }
+};
+
+}
+
+template <class InputVector, class OutputVector, class Operator, class ReferenceOperator>
+void TestBinaryFunctional(void)
+{
+    typedef typename InputVector::value_type  InputType;
+    typedef typename OutputVector::value_type OutputType;
+    
+    thrust::host_vector<InputType>  std_input1 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<InputType>  std_input2 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<OutputType> std_output(NUM_SAMPLES);
+
+    // Replace zeros to avoid divide by zero exceptions
+    std::replace(std_input2.begin(), std_input2.end(), (InputType) 0, (InputType) 1);
+
+    InputVector input1 = std_input1; 
+    InputVector input2 = std_input2; 
+    OutputVector output(NUM_SAMPLES);
+
+    thrust::transform(    input1.begin(),     input1.end(),      input2.begin(),     output.begin(),          Operator());
+    thrust::transform(std_input1.begin(), std_input1.end(),  std_input2.begin(), std_output.begin(), ReferenceOperator());
+
+    // Note: FP division is not bit-equal, even when nvcc is invoked with --prec-div
+    ASSERT_ALMOST_EQUAL(output, std_output);
+}
+
+
+
+// XXX add bool to list
+// Instantiate a macro for all integer-like data types
+#define INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)   \
+Macro(vector_type, operator_name, unittest::int8_t  )                  \
+Macro(vector_type, operator_name, unittest::uint8_t )                  \
+Macro(vector_type, operator_name, unittest::int16_t )                  \
+Macro(vector_type, operator_name, unittest::uint16_t)                  \
+Macro(vector_type, operator_name, unittest::int32_t )                  \
+Macro(vector_type, operator_name, unittest::uint32_t)                  \
+Macro(vector_type, operator_name, unittest::int64_t )                  \
+Macro(vector_type, operator_name, unittest::uint64_t)
+
+// Instantiate a macro for all integer and floating point data types
+#define INSTANTIATE_ALL_TYPES(Macro, vector_type, operator_name)       \
+INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)           \
+Macro(vector_type, operator_name, float)
+
+// bitwise_op(T,T) -> T
+#define INSTANTIATE_BINARY_BITWISE_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestBinaryFunctional< thrust::vector_type<data_type>,                                \
+                          thrust::vector_type<data_type>,                                \
+                          thrust::operator_name<data_type>,                              \
+                          ref::operator_name<data_type> >();
+
+// op(T,T) -> T (for bitwise op and integer T only)
+#define DECLARE_BINARY_BITWISE_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                         \
+void Test##OperatorName##FunctionalHost(void)                                                           \
+{                                                                                                       \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_BITWISE_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                   \
+void Test##OperatorName##FunctionalDevice(void)                                                         \
+{                                                                                                       \
+    INSTANTIATE_INTEGER_TYPES( INSTANTIATE_BINARY_BITWISE_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+
+// Create the unit tests
+DECLARE_BINARY_BITWISE_FUNCTIONAL_UNITTEST(bit_and,       BitAnd      );
+DECLARE_BINARY_BITWISE_FUNCTIONAL_UNITTEST(bit_or,        BitOr       );
+DECLARE_BINARY_BITWISE_FUNCTIONAL_UNITTEST(bit_xor,       BitXor      );
+
diff --git a/thrust/testing/functional_logical.cu b/thrust/testing/functional_logical.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a5d4286982aa3be73fbfc91a719ff003386eb6c3
--- /dev/null
+++ b/thrust/testing/functional_logical.cu
@@ -0,0 +1,81 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <functional>
+#include <algorithm>
+    
+const size_t NUM_SAMPLES = 10000;
+
+template <class InputVector, class OutputVector, class Operator, class ReferenceOperator>
+void TestBinaryFunctional(void)
+{
+    typedef typename InputVector::value_type  InputType;
+    typedef typename OutputVector::value_type OutputType;
+    
+    thrust::host_vector<InputType>  std_input1 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<InputType>  std_input2 = unittest::random_samples<InputType>(NUM_SAMPLES);
+    thrust::host_vector<OutputType> std_output(NUM_SAMPLES);
+
+    InputVector input1 = std_input1; 
+    InputVector input2 = std_input2; 
+    OutputVector output(NUM_SAMPLES);
+
+    thrust::transform(    input1.begin(),     input1.end(),      input2.begin(),     output.begin(),          Operator());
+    thrust::transform(std_input1.begin(), std_input1.end(),  std_input2.begin(), std_output.begin(), ReferenceOperator());
+
+    // Note: FP division is not bit-equal, even when nvcc is invoked with --prec-div
+    ASSERT_ALMOST_EQUAL(output, std_output);
+}
+
+
+
+// XXX add bool to list
+// Instantiate a macro for all integer-like data types
+#define INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)   \
+Macro(vector_type, operator_name, unittest::int8_t  )                  \
+Macro(vector_type, operator_name, unittest::uint8_t )                  \
+Macro(vector_type, operator_name, unittest::int16_t )                  \
+Macro(vector_type, operator_name, unittest::uint16_t)                  \
+Macro(vector_type, operator_name, unittest::int32_t )                  \
+Macro(vector_type, operator_name, unittest::uint32_t)                  \
+Macro(vector_type, operator_name, unittest::int64_t )                  \
+Macro(vector_type, operator_name, unittest::uint64_t)
+
+// Instantiate a macro for all integer and floating point data types
+#define INSTANTIATE_ALL_TYPES(Macro, vector_type, operator_name)       \
+INSTANTIATE_INTEGER_TYPES(Macro, vector_type, operator_name)           \
+Macro(vector_type, operator_name, float)
+
+
+// XXX revert OutputVector<T> back to bool
+// op(T,T) -> bool
+#define INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST(vector_type, operator_name, data_type) \
+    TestBinaryFunctional< thrust::vector_type<data_type>,                                \
+                          thrust::vector_type<data_type>,                                \
+                          thrust::operator_name<data_type>,                              \
+                          std::operator_name<data_type> >();
+
+// op(T,T) -> bool
+#define DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(operator_name, OperatorName)                         \
+void Test##OperatorName##FunctionalHost(void)                                                           \
+{                                                                                                       \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST, host_vector,   operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalHost);                                                   \
+void Test##OperatorName##FunctionalDevice(void)                                                         \
+{                                                                                                       \
+    INSTANTIATE_ALL_TYPES( INSTANTIATE_BINARY_LOGICAL_FUNCTIONAL_TEST, device_vector, operator_name);   \
+}                                                                                                       \
+DECLARE_UNITTEST(Test##OperatorName##FunctionalDevice);
+
+// Create the unit tests
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(equal_to,      EqualTo     );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(not_equal_to,  NotEqualTo  );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(greater,       Greater     );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(less,          Less        );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(greater_equal, GreaterEqual);
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(less_equal,    LessEqual   );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(logical_and,   LogicalAnd  );
+DECLARE_BINARY_LOGICAL_FUNCTIONAL_UNITTEST(logical_or,    LogicalOr   );
+
diff --git a/thrust/testing/functional_placeholders_arithmetic.cu b/thrust/testing/functional_placeholders_arithmetic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4376b46a943fb3d7b60cc900af122eefd863207c
--- /dev/null
+++ b/thrust/testing/functional_placeholders_arithmetic.cu
@@ -0,0 +1,75 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \
+template<typename Vector> \
+  struct TestFunctionalPlaceholders##name \
+{ \
+  void operator()(const size_t) \
+  { \
+    static const size_t num_samples = 10000; \
+    const size_t zero = 0; \
+    typedef typename Vector::value_type T; \
+    Vector lhs = unittest::random_samples<T>(num_samples); \
+    Vector rhs = unittest::random_samples<T>(num_samples); \
+    thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \
+\
+    Vector reference(lhs.size()); \
+    Vector result(lhs.size()); \
+    using namespace thrust::placeholders; \
+\
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 op _2); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+\
+    thrust::transform(lhs.begin(), lhs.end(), thrust::make_constant_iterator<T>(1), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), result.begin(), _1 op T(1)); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+\
+    thrust::transform(thrust::make_constant_iterator<T>(1,zero), thrust::make_constant_iterator<T>(1,num_samples), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(rhs.begin(), rhs.end(), result.begin(), T(1) op _1); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+  } \
+}; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Plus,       +, thrust::plus,       ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Minus,      -, thrust::minus,      ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Multiplies, *, thrust::multiplies, ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Divides,    /, thrust::divides,    ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Modulus,    %, thrust::modulus,    SmallIntegralTypes);
+
+#define UNARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
+template<typename Vector> \
+  void TestFunctionalPlaceholders##name(void) \
+{ \
+  static const size_t num_samples = 10000; \
+  typedef typename Vector::value_type T; \
+  Vector input = unittest::random_samples<T>(num_samples); \
+\
+  Vector reference(input.size()); \
+  thrust::transform(input.begin(), input.end(), reference.begin(), functor<T>()); \
+\
+  using namespace thrust::placeholders; \
+  Vector result(input.size()); \
+  thrust::transform(input.begin(), input.end(), result.begin(), reference_operator _1); \
+\
+  ASSERT_EQUAL(reference, result); \
+} \
+DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholders##name);
+
+template<typename T>
+  struct unary_plus_reference
+{
+  __host__ __device__ T operator()(const T &x) const
+  {
+    return +x;
+  }
+};
+
+UNARY_FUNCTIONAL_PLACEHOLDERS_TEST(UnaryPlus, +, unary_plus_reference);
+UNARY_FUNCTIONAL_PLACEHOLDERS_TEST(Negate,    -, thrust::negate);
+
diff --git a/thrust/testing/functional_placeholders_bitwise.cu b/thrust/testing/functional_placeholders_bitwise.cu
new file mode 100644
index 0000000000000000000000000000000000000000..10419535a971fa4db99998deb0d7f0a04662836d
--- /dev/null
+++ b/thrust/testing/functional_placeholders_bitwise.cu
@@ -0,0 +1,90 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <thrust/detail/allocator/allocator_traits.h>
+
+static const size_t num_samples = 10000;
+
+template<typename Vector, typename U> struct rebind_vector;
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
+{
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
+#define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \
+template<typename Vector> \
+  struct TestFunctionalPlaceholders##name \
+{ \
+  void operator()(const size_t) \
+  { \
+    static const size_t num_samples = 10000; \
+    const size_t zero = 0; \
+    typedef typename Vector::value_type T; \
+    Vector lhs = unittest::random_samples<T>(num_samples); \
+    Vector rhs = unittest::random_samples<T>(num_samples); \
+    thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \
+\
+    Vector reference(lhs.size()); \
+    Vector result(lhs.size()); \
+    using namespace thrust::placeholders; \
+\
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 op _2); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+\
+    thrust::transform(lhs.begin(), lhs.end(), thrust::make_constant_iterator<T>(1), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), result.begin(), _1 op T(1)); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+\
+    thrust::transform(thrust::make_constant_iterator<T>(1,zero), thrust::make_constant_iterator<T>(1,num_samples), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(rhs.begin(), rhs.end(), result.begin(), T(1) op _1); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+  } \
+}; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitAnd, &, thrust::bit_and, SmallIntegralTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitOr,  |, thrust::bit_or,  SmallIntegralTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitXor, ^, thrust::bit_xor, SmallIntegralTypes);
+
+template<typename T>
+  struct bit_negate_reference
+{
+  __host__ __device__ T operator()(const T &x) const
+  {
+    return ~x;
+  }
+};
+
+template<typename Vector>
+  void TestFunctionalPlaceholdersBitNegate(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename rebind_vector<Vector,bool>::type bool_vector;
+  Vector input = unittest::random_samples<T>(num_samples);
+
+  bool_vector reference(input.size());
+  thrust::transform(input.begin(), input.end(), reference.begin(), bit_negate_reference<T>());
+
+  using namespace thrust::placeholders;
+  bool_vector result(input.size());
+  thrust::transform(input.begin(), input.end(), result.begin(), ~_1);
+
+  ASSERT_EQUAL(reference, result);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersBitNegate);
+
diff --git a/thrust/testing/functional_placeholders_compound_assignment.cu b/thrust/testing/functional_placeholders_compound_assignment.cu
new file mode 100644
index 0000000000000000000000000000000000000000..512fa73fadf038134711133436ebd80bdc4ad43b
--- /dev/null
+++ b/thrust/testing/functional_placeholders_compound_assignment.cu
@@ -0,0 +1,193 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \
+template<typename Vector> \
+  struct TestFunctionalPlaceholders##name \
+{ \
+  void operator()(const size_t) \
+  { \
+    const size_t num_samples = 10000; \
+    typedef typename Vector::value_type T; \
+    Vector lhs = unittest::random_samples<T>(num_samples); \
+    Vector rhs = unittest::random_samples<T>(num_samples); \
+    thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \
+\
+    Vector lhs_reference = lhs; \
+    Vector reference(lhs.size()); \
+    Vector result(lhs_reference.size()); \
+    using namespace thrust::placeholders; \
+\
+    thrust::transform(lhs_reference.begin(), lhs_reference.end(), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 op _2); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+    ASSERT_ALMOST_EQUAL(lhs_reference, lhs); \
+\
+    thrust::transform(lhs_reference.begin(), lhs_reference.end(), thrust::make_constant_iterator<T>(1), reference.begin(), reference_functor<T>()); \
+    thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 op T(1)); \
+    ASSERT_ALMOST_EQUAL(reference, result); \
+    ASSERT_ALMOST_EQUAL(lhs_reference, lhs); \
+  } \
+}; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
+
+template<typename T>
+  struct plus_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs += rhs; }
+};
+
+template<typename T>
+  struct minus_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs -= rhs; }
+};
+
+template<typename T>
+  struct multiplies_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs *= rhs; }
+};
+
+template<typename T>
+  struct divides_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs /= rhs; }
+};
+
+template<typename T>
+  struct modulus_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs %= rhs; }
+};
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(PlusEqual,       +=, plus_equal_reference,       ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(MinusEqual,      -=, minus_equal_reference,      ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(MultipliesEqual, *=, multiplies_equal_reference, ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(DividesEqual,    /=, divides_equal_reference,    ThirtyTwoBitTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(ModulusEqual,    %=, modulus_equal_reference,    SmallIntegralTypes);
+
+template<typename T>
+  struct bit_and_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs &= rhs; }
+};
+
+template<typename T>
+  struct bit_or_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs |= rhs; }
+};
+
+template<typename T>
+  struct bit_xor_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs ^= rhs; }
+};
+
+template<typename T>
+  struct bit_lshift_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs <<= rhs; }
+};
+
+template<typename T>
+  struct bit_rshift_equal_reference
+{
+  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs >>= rhs; }
+};
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitAndEqual,    &=,  bit_and_equal_reference,    SmallIntegralTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitOrEqual,     |=,  bit_or_equal_reference,     SmallIntegralTypes);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitXorEqual,    ^=,  bit_xor_equal_reference,    SmallIntegralTypes);
+
+// XXX ptxas produces an error
+void TestFunctionalPlaceholdersBitLshiftEqualDevice(void)
+{
+  KNOWN_FAILURE;
+}
+// XXX KNOWN_FAILURE this until the above works
+void TestFunctionalPlaceholdersBitLshiftEqualHost(void)
+{
+  KNOWN_FAILURE;
+}
+//BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitLshiftEqual, <<=, bit_lshift_equal_reference, SmallIntegralTypes);
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitRshiftEqual, >>=, bit_rshift_equal_reference, SmallIntegralTypes);
+
+template<typename T>
+  struct prefix_increment_reference
+{
+  __host__ __device__ T& operator()(T &x) const { return ++x; }
+};
+
+template<typename T>
+  struct suffix_increment_reference
+{
+  __host__ __device__ T operator()(T &x) const { return x++; }
+};
+
+template<typename T>
+  struct prefix_decrement_reference
+{
+  __host__ __device__ T& operator()(T &x) const { return --x; }
+};
+
+template<typename T>
+  struct suffix_decrement_reference
+{
+  __host__ __device__ T operator()(T &x) const { return x--; }
+};
+
+#define PREFIX_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
+template<typename Vector> \
+  void TestFunctionalPlaceholdersPrefix##name(void) \
+{ \
+  const size_t num_samples = 10000; \
+  typedef typename Vector::value_type T; \
+  Vector input = unittest::random_samples<T>(num_samples); \
+\
+  Vector input_reference = input; \
+  Vector reference(input.size()); \
+  thrust::transform(input.begin(), input.end(), reference.begin(), functor<T>()); \
+\
+  using namespace thrust::placeholders; \
+  Vector result(input_reference.size()); \
+  thrust::transform(input_reference.begin(), input_reference.end(), result.begin(), reference_operator _1); \
+\
+  ASSERT_ALMOST_EQUAL(input_reference, input); \
+  ASSERT_ALMOST_EQUAL(reference, result); \
+} \
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersPrefix##name);
+
+PREFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Increment,  ++,  prefix_increment_reference);
+PREFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Decrement,  --,  prefix_decrement_reference);
+
+#define SUFFIX_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
+template<typename Vector> \
+  void TestFunctionalPlaceholdersSuffix##name(void) \
+{ \
+  const size_t num_samples = 10000; \
+  typedef typename Vector::value_type T; \
+  Vector input = unittest::random_samples<T>(num_samples); \
+\
+  Vector input_reference = input; \
+  Vector reference(input.size()); \
+  thrust::transform(input.begin(), input.end(), reference.begin(), functor<T>()); \
+\
+  using namespace thrust::placeholders; \
+  Vector result(input_reference.size()); \
+  thrust::transform(input_reference.begin(), input_reference.end(), result.begin(), _1 reference_operator); \
+\
+  ASSERT_ALMOST_EQUAL(input_reference, input); \
+  ASSERT_ALMOST_EQUAL(reference, result); \
+} \
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersSuffix##name);
+
+SUFFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Increment,  ++,  suffix_increment_reference);
+SUFFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Decrement,  --,  suffix_decrement_reference);
+
+
diff --git a/thrust/testing/functional_placeholders_logical.cu b/thrust/testing/functional_placeholders_logical.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b40084b5ec2867ecfcb945edaa61b25cf431587e
--- /dev/null
+++ b/thrust/testing/functional_placeholders_logical.cu
@@ -0,0 +1,72 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <thrust/detail/allocator/allocator_traits.h>
+
+static const size_t num_samples = 10000;
+
+template<typename Vector, typename U> struct rebind_vector;
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
+{
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
+#define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
+template<typename Vector> \
+  void TestFunctionalPlaceholders##name(void) \
+{ \
+  typedef typename Vector::value_type T; \
+  typedef typename rebind_vector<Vector,bool>::type bool_vector; \
+  Vector lhs = unittest::random_samples<T>(num_samples); \
+  Vector rhs = unittest::random_samples<T>(num_samples); \
+\
+  bool_vector reference(lhs.size()); \
+  thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), reference.begin(), functor<T>()); \
+\
+  using namespace thrust::placeholders; \
+  bool_vector result(lhs.size()); \
+  thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 reference_operator _2); \
+\
+  ASSERT_EQUAL(reference, result); \
+} \
+DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholders##name);
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(LogicalAnd, &&, thrust::logical_and);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(LogicalOr,  ||, thrust::logical_or);
+
+template<typename Vector>
+  void TestFunctionalPlaceholdersLogicalNot(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename rebind_vector<Vector,bool>::type bool_vector;
+  Vector input = unittest::random_samples<T>(num_samples);
+
+  if(input.size() > 0)
+  {
+    // produce at least one true in the output
+    input[0] = T(0);
+  } // end if
+
+  bool_vector reference(input.size());
+  thrust::transform(input.begin(), input.end(), reference.begin(), thrust::logical_not<T>());
+
+  using namespace thrust::placeholders;
+  bool_vector result(input.size());
+  thrust::transform(input.begin(), input.end(), result.begin(), !_1);
+
+  ASSERT_EQUAL(reference, result);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersLogicalNot);
+
diff --git a/thrust/testing/functional_placeholders_miscellaneous.cu b/thrust/testing/functional_placeholders_miscellaneous.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6774211b9c900c78dadf5f5f715d462ec04db56
--- /dev/null
+++ b/thrust/testing/functional_placeholders_miscellaneous.cu
@@ -0,0 +1,73 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+template<typename T>
+  struct saxpy_reference
+{
+  __host__ __device__ saxpy_reference(const T &aa)
+    : a(aa)
+  {}
+
+  __host__ __device__ T operator()(const T &x, const T &y) const
+  {
+    return a * x + y;
+  }
+
+  T a;
+};
+
+template<typename Vector>
+  struct TestFunctionalPlaceholdersValue
+{
+  void operator()(const size_t)
+  {
+    const size_t n = 10000;
+    typedef typename Vector::value_type T;
+
+    T a(13);
+
+    Vector x = unittest::random_integers<T>(n);
+    Vector y = unittest::random_integers<T>(n);
+    Vector result(n), reference(n);
+
+    thrust::transform(x.begin(), x.end(), y.begin(), reference.begin(), saxpy_reference<T>(a));
+
+    using namespace thrust::placeholders;
+    thrust::transform(x.begin(), x.end(), y.begin(), result.begin(), a * _1 + _2);
+
+    ASSERT_ALMOST_EQUAL(reference, result);
+  }
+};
+VectorUnitTest<TestFunctionalPlaceholdersValue, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholdersValueDevice;
+VectorUnitTest<TestFunctionalPlaceholdersValue, ThirtyTwoBitTypes, thrust::host_vector, std::allocator> TestFunctionalPlaceholdersValueHost;
+
+template<typename Vector>
+  struct TestFunctionalPlaceholdersTransformIterator
+{
+  void operator()(const size_t)
+  {
+    const size_t n = 10000;
+    typedef typename Vector::value_type T;
+
+    T a(13);
+
+    Vector x = unittest::random_integers<T>(n);
+    Vector y = unittest::random_integers<T>(n);
+    Vector result(n), reference(n);
+
+    thrust::transform(x.begin(), x.end(), y.begin(), reference.begin(), saxpy_reference<T>(a));
+
+    using namespace thrust::placeholders;
+    thrust::transform(thrust::make_transform_iterator(x.begin(), a * _1),
+                      thrust::make_transform_iterator(x.end(), a * _1),
+                      y.begin(),
+                      result.begin(),
+                      _1 + _2);
+
+    ASSERT_ALMOST_EQUAL(reference, result);
+  }
+};
+VectorUnitTest<TestFunctionalPlaceholdersTransformIterator, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholdersTransformIteratorInstanceDevice;
+VectorUnitTest<TestFunctionalPlaceholdersTransformIterator, ThirtyTwoBitTypes, thrust::host_vector, std::allocator> TestFunctionalPlaceholdersTransformIteratorInstanceHost;
+
diff --git a/thrust/testing/functional_placeholders_relational.cu b/thrust/testing/functional_placeholders_relational.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a610d3419f59c25a2e3be1c3fd896af171f874d6
--- /dev/null
+++ b/thrust/testing/functional_placeholders_relational.cu
@@ -0,0 +1,52 @@
+#include <unittest/unittest.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <thrust/detail/allocator/allocator_traits.h>
+
+static const size_t num_samples = 10000;
+
+template<typename Vector, typename U> struct rebind_vector;
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+    typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+    typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
+{
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
+#define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
+template<typename Vector> \
+  void TestFunctionalPlaceholdersBinary##name(void) \
+{ \
+  typedef typename Vector::value_type T; \
+  typedef typename rebind_vector<Vector,bool>::type bool_vector; \
+  Vector lhs = unittest::random_samples<T>(num_samples); \
+  Vector rhs = unittest::random_samples<T>(num_samples); \
+\
+  bool_vector reference(lhs.size()); \
+  thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), reference.begin(), functor<T>()); \
+\
+  using namespace thrust::placeholders; \
+  bool_vector result(lhs.size()); \
+  thrust::transform(lhs.begin(), lhs.end(), rhs.begin(), result.begin(), _1 reference_operator _2); \
+\
+  ASSERT_EQUAL(reference, result); \
+} \
+DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholdersBinary##name);
+
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(EqualTo,      ==, thrust::equal_to);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(NotEqualTo,   !=, thrust::not_equal_to);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Greater,       >, thrust::greater);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Less,          <, thrust::less);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(GreaterEqual, >=, thrust::greater_equal);
+BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(LessEqual,    <=, thrust::less_equal);
+
diff --git a/thrust/testing/future.cu b/thrust/testing/future.cu
new file mode 100644
index 0000000000000000000000000000000000000000..13755886078ee34898676b79f9f06ed3339e2be2
--- /dev/null
+++ b/thrust/testing/future.cu
@@ -0,0 +1,255 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/future.h>
+
+struct mock {};
+
+using future_value_types = unittest::type_list<
+  char
+, signed char
+, unsigned char
+, short
+, unsigned short
+, int
+, unsigned int
+, long
+, unsigned long
+, long long
+, unsigned long long
+, float
+, double
+, custom_numeric
+, float2
+, mock
+>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_default_constructed
+{
+  __host__
+  void operator()()
+  {
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::future<decltype(thrust::device), T>
+      , thrust::unique_eager_future<decltype(thrust::device), T>
+      >::value)
+    );
+
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::future<decltype(thrust::device), T>
+      , thrust::device_future<T>
+      >::value)
+    );
+
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::device_future<T>
+      , thrust::device_unique_eager_future<T>
+      >::value)
+    );
+
+    thrust::device_future<T> f0;
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_THROWS_EQUAL(
+      f0.wait()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      f0.stream()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      f0.get()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      THRUST_UNUSED_VAR(f0.extract())
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_default_constructed
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_new_stream
+{
+  __host__
+  void operator()()
+  {
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+
+    ASSERT_EQUAL(true,  f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0.stream().native_handle());    
+
+    TEST_EVENT_WAIT(f0);
+
+    ASSERT_EQUAL(true, f0.ready());
+
+    ASSERT_THROWS_EQUAL(
+      f0.get()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      THRUST_UNUSED_VAR(f0.extract())
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_new_stream
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_convert_to_event
+{
+  __host__
+  void operator()()
+  {
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+
+    auto const f0_stream = f0.stream().native_handle();
+
+    ASSERT_EQUAL(true,  f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0_stream);
+
+    auto f1 = thrust::device_event(std::move(f0));
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(true,  f1.valid_stream());
+
+    ASSERT_EQUAL(f0_stream, f1.stream().native_handle());
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_convert_to_event
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_when_all
+{
+  __host__
+  void operator()()
+  {
+    // Create futures with new streams.
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+    auto f1 = thrust::device_future<T>(thrust::new_stream);
+    auto f2 = thrust::device_future<T>(thrust::new_stream);
+    auto f3 = thrust::device_future<T>(thrust::new_stream);
+    auto f4 = thrust::device_future<T>(thrust::new_stream);
+    auto f5 = thrust::device_future<T>(thrust::new_stream);
+    auto f6 = thrust::device_future<T>(thrust::new_stream);
+    auto f7 = thrust::device_future<T>(thrust::new_stream);
+
+    auto const f0_stream = f0.stream().native_handle();
+
+    ASSERT_EQUAL(true, f0.valid_stream());
+    ASSERT_EQUAL(true, f1.valid_stream());
+    ASSERT_EQUAL(true, f2.valid_stream());
+    ASSERT_EQUAL(true, f3.valid_stream());
+    ASSERT_EQUAL(true, f4.valid_stream());
+    ASSERT_EQUAL(true, f5.valid_stream());
+    ASSERT_EQUAL(true, f6.valid_stream());
+    ASSERT_EQUAL(true, f7.valid_stream());
+
+    ASSERT_EQUAL(false, f0.valid_content());
+    ASSERT_EQUAL(false, f1.valid_content());
+    ASSERT_EQUAL(false, f2.valid_content());
+    ASSERT_EQUAL(false, f3.valid_content());
+    ASSERT_EQUAL(false, f4.valid_content());
+    ASSERT_EQUAL(false, f5.valid_content());
+    ASSERT_EQUAL(false, f6.valid_content());
+    ASSERT_EQUAL(false, f7.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0_stream);
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f1.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f2.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f3.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f4.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f5.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f6.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f7.stream().native_handle());
+
+    auto e0 = thrust::when_all(f0, f1, f2, f3, f4, f5, f6, f7);
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(false, f1.valid_stream());
+    ASSERT_EQUAL(false, f2.valid_stream());
+    ASSERT_EQUAL(false, f3.valid_stream());
+    ASSERT_EQUAL(false, f4.valid_stream());
+    ASSERT_EQUAL(false, f5.valid_stream());
+    ASSERT_EQUAL(false, f6.valid_stream());
+    ASSERT_EQUAL(false, f7.valid_stream());
+
+    ASSERT_EQUAL(false, f0.valid_content());
+    ASSERT_EQUAL(false, f1.valid_content());
+    ASSERT_EQUAL(false, f2.valid_content());
+    ASSERT_EQUAL(false, f3.valid_content());
+    ASSERT_EQUAL(false, f4.valid_content());
+    ASSERT_EQUAL(false, f5.valid_content());
+    ASSERT_EQUAL(false, f6.valid_content());
+    ASSERT_EQUAL(false, f7.valid_content());
+
+    ASSERT_EQUAL(true,  e0.valid_stream());
+
+    ASSERT_EQUAL(f0_stream, e0.stream().native_handle());
+
+    TEST_EVENT_WAIT(e0);
+
+    ASSERT_EQUAL(false, f0.ready());
+    ASSERT_EQUAL(false, f1.ready());
+    ASSERT_EQUAL(false, f2.ready());
+    ASSERT_EQUAL(false, f3.ready());
+    ASSERT_EQUAL(false, f4.ready());
+    ASSERT_EQUAL(false, f5.ready());
+    ASSERT_EQUAL(false, f6.ready());
+    ASSERT_EQUAL(false, f7.ready());
+
+    ASSERT_EQUAL(true,  e0.ready());
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_when_all
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif
+
diff --git a/thrust/testing/gather.cu b/thrust/testing/gather.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c164e44b2bb80ea02e0ff7744a31eec8ce87e893
--- /dev/null
+++ b/thrust/testing/gather.cu
@@ -0,0 +1,351 @@
+#include <unittest/unittest.h>
+#include <thrust/gather.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/sequence.h>
+#include <algorithm>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+
+template <class Vector>
+void TestGatherSimple(void)
+{
+    Vector map(5);  // gather indices
+    Vector src(8);  // source vector
+    Vector dst(5);  // destination vector
+
+    map[0] = 6; map[1] = 2; map[2] = 1; map[3] = 7; map[4] = 2;
+    src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4; src[5] = 5; src[6] = 6; src[7] = 7;
+    dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0;
+
+    thrust::gather(map.begin(), map.end(), src.begin(), dst.begin());
+
+    ASSERT_EQUAL(dst[0], 6);
+    ASSERT_EQUAL(dst[1], 2);
+    ASSERT_EQUAL(dst[2], 1);
+    ASSERT_EQUAL(dst[3], 7);
+    ASSERT_EQUAL(dst[4], 2);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherSimple);
+
+
+template<typename InputIterator, typename RandomAccessIterator, typename OutputIterator>
+OutputIterator gather(my_system &system, InputIterator, InputIterator, RandomAccessIterator, OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestGatherDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::gather(sys,
+                   vec.begin(),
+                   vec.end(),
+                   vec.begin(),
+                   vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestGatherDispatchExplicit);
+
+
+template<typename InputIterator, typename RandomAccessIterator, typename OutputIterator>
+OutputIterator gather(my_tag, InputIterator, InputIterator, RandomAccessIterator, OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestGatherDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::gather(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.end()),
+                   thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestGatherDispatchImplicit);
+
+
+template <typename T>
+void TestGather(const size_t n)
+{
+    const size_t source_size = std::min((size_t) 10, 2 * n);
+
+    // source vectors to gather from
+    thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+    thrust::device_vector<T> d_source = h_source;
+  
+    // gather indices
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % source_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    // gather destination
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::gather(h_map.begin(), h_map.end(), h_source.begin(), h_output.begin());
+    thrust::gather(d_map.begin(), d_map.end(), d_source.begin(), d_output.begin());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestGather);
+
+
+template <typename T>
+void TestGatherToDiscardIterator(const size_t n)
+{
+    const size_t source_size = std::min((size_t) 10, 2 * n);
+
+    // source vectors to gather from
+    thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+    thrust::device_vector<T> d_source = h_source;
+  
+    // gather indices
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % source_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    thrust::discard_iterator<> h_result = 
+      thrust::gather(h_map.begin(), h_map.end(), h_source.begin(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> d_result =
+      thrust::gather(d_map.begin(), d_map.end(), d_source.begin(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherToDiscardIterator);
+
+
+template <class Vector>
+void TestGatherIfSimple(void)
+{
+    Vector flg(5);  // predicate array
+    Vector map(5);  // gather indices
+    Vector src(8);  // source vector
+    Vector dst(5);  // destination vector
+
+    flg[0] = 0; flg[1] = 1; flg[2] = 0; flg[3] = 1; flg[4] = 0;
+    map[0] = 6; map[1] = 2; map[2] = 1; map[3] = 7; map[4] = 2;
+    src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4; src[5] = 5; src[6] = 6; src[7] = 7;
+    dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0;
+
+    thrust::gather_if(map.begin(), map.end(), flg.begin(), src.begin(), dst.begin());
+
+    ASSERT_EQUAL(dst[0], 0);
+    ASSERT_EQUAL(dst[1], 2);
+    ASSERT_EQUAL(dst[2], 0);
+    ASSERT_EQUAL(dst[3], 7);
+    ASSERT_EQUAL(dst[4], 0);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherIfSimple);
+
+template <typename T>
+struct is_even_gather_if
+{
+    __host__ __device__
+    bool operator()(const T i) const
+    { 
+        return (i % 2) == 0;
+    }
+};
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+OutputIterator gather_if(my_system &system,
+                         InputIterator1, //       map_first,
+                         InputIterator1, //       map_last,
+                         InputIterator2, //       stencil,
+                         RandomAccessIterator, // input_first,
+                         OutputIterator       result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestGatherIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::gather_if(sys,
+                      vec.begin(),
+                      vec.end(),
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestGatherIfDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+OutputIterator gather_if(my_tag,
+                         InputIterator1, //       map_first,
+                         InputIterator1, //       map_last,
+                         InputIterator2, //       stencil,
+                         RandomAccessIterator, // input_first,
+                         OutputIterator       result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestGatherIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::gather_if(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.end()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestGatherIfDispatchImplicit);
+
+
+template <typename T>
+void TestGatherIf(const size_t n)
+{
+    const size_t source_size = std::min((size_t) 10, 2 * n);
+
+    // source vectors to gather from
+    thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+    thrust::device_vector<T> d_source = h_source;
+  
+    // gather indices
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] = h_map[i] % source_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+    
+    // gather stencil
+    thrust::host_vector<unsigned int> h_stencil = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_stencil[i] = h_stencil[i] % 2;
+    
+    thrust::device_vector<unsigned int> d_stencil = h_stencil;
+
+    // gather destination
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::gather_if(h_map.begin(), h_map.end(), h_stencil.begin(), h_source.begin(), h_output.begin(), is_even_gather_if<unsigned int>());
+    thrust::gather_if(d_map.begin(), d_map.end(), d_stencil.begin(), d_source.begin(), d_output.begin(), is_even_gather_if<unsigned int>());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherIf);
+
+
+
+template <typename T>
+void TestGatherIfToDiscardIterator(const size_t n)
+{
+    const size_t source_size = std::min((size_t) 10, 2 * n);
+
+    // source vectors to gather from
+    thrust::host_vector<T>   h_source = unittest::random_samples<T>(source_size);
+    thrust::device_vector<T> d_source = h_source;
+  
+    // gather indices
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] = h_map[i] % source_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+    
+    // gather stencil
+    thrust::host_vector<unsigned int> h_stencil = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_stencil[i] = h_stencil[i] % 2;
+    
+    thrust::device_vector<unsigned int> d_stencil = h_stencil;
+
+    thrust::discard_iterator<> h_result =
+      thrust::gather_if(h_map.begin(), h_map.end(), h_stencil.begin(), h_source.begin(), thrust::make_discard_iterator(), is_even_gather_if<unsigned int>());
+
+    thrust::discard_iterator<> d_result =
+      thrust::gather_if(d_map.begin(), d_map.end(), d_stencil.begin(), d_source.begin(), thrust::make_discard_iterator(), is_even_gather_if<unsigned int>());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestGatherIfToDiscardIterator);
+
+
+template <typename Vector>
+void TestGatherCountingIterator(void)
+{
+    Vector source(10);
+    thrust::sequence(source.begin(), source.end(), 0);
+
+    Vector map(10);
+    thrust::sequence(map.begin(), map.end(), 0);
+
+    Vector output(10);
+
+    // source has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::gather(map.begin(),
+                   map.end(),
+                   thrust::make_counting_iterator(0),
+                   output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // map has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::gather(thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator((int)source.size()),
+                   source.begin(),
+                   output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // source and map have any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::gather(thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator((int)output.size()),
+                   thrust::make_counting_iterator(0),
+                   output.begin());
+
+    ASSERT_EQUAL(output, map);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherCountingIterator);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/generate.cu b/thrust/testing/generate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fefd7d8e697c767f919739df69f3f64dc1e7afa2
--- /dev/null
+++ b/thrust/testing/generate.cu
@@ -0,0 +1,227 @@
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+template<typename T>
+struct return_value
+{
+    T val;
+
+    return_value(void){}
+    return_value(T v):val(v){}
+
+    __host__ __device__
+    T operator()(void){ return val; }
+};
+
+template<class Vector>
+void TestGenerateSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector result(5);
+
+    T value = 13;
+
+    return_value<T> f(value);
+
+    thrust::generate(result.begin(), result.end(), f);
+
+    ASSERT_EQUAL(result[0], value);
+    ASSERT_EQUAL(result[1], value);
+    ASSERT_EQUAL(result[2], value);
+    ASSERT_EQUAL(result[3], value);
+    ASSERT_EQUAL(result[4], value);
+}
+DECLARE_VECTOR_UNITTEST(TestGenerateSimple);
+
+
+template<typename ForwardIterator, typename Generator>
+void generate(my_system &system, ForwardIterator /*first*/, ForwardIterator, Generator)
+{
+    system.validate_dispatch();
+}
+
+void TestGenerateDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::generate(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestGenerateDispatchExplicit);
+
+
+template<typename ForwardIterator, typename Generator>
+void generate(my_tag, ForwardIterator first, ForwardIterator, Generator)
+{
+    *first = 13;
+}
+
+void TestGenerateDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::generate(thrust::retag<my_tag>(vec.begin()),
+                     thrust::retag<my_tag>(vec.end()),
+                     0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestGenerateDispatchImplicit);
+
+template <typename T>
+void TestGenerate(const size_t n)
+{
+    thrust::host_vector<T> h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    T value = 13;
+    return_value<T> f(value);
+
+    thrust::generate(h_result.begin(), h_result.end(), f);
+    thrust::generate(d_result.begin(), d_result.end(), f);
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerate);
+
+template <typename T>
+void TestGenerateToDiscardIterator(const size_t)
+{
+    T value = 13;
+    return_value<T> f(value);
+
+    thrust::discard_iterator<thrust::host_system_tag> h_first;
+    thrust::generate(h_first, h_first + 10, f);
+
+    thrust::discard_iterator<thrust::device_system_tag> d_first;
+    thrust::generate(d_first, d_first + 10, f);
+
+    // there's nothing to actually check except that it compiles
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateToDiscardIterator);
+
+template<class Vector>
+void TestGenerateNSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector result(5);
+
+    T value = 13;
+
+    return_value<T> f(value);
+
+    thrust::generate_n(result.begin(), result.size(), f);
+
+    ASSERT_EQUAL(result[0], value);
+    ASSERT_EQUAL(result[1], value);
+    ASSERT_EQUAL(result[2], value);
+    ASSERT_EQUAL(result[3], value);
+    ASSERT_EQUAL(result[4], value);
+}
+DECLARE_VECTOR_UNITTEST(TestGenerateNSimple);
+
+
+template<typename ForwardIterator, typename Size, typename Generator>
+ForwardIterator generate_n(my_system &system, ForwardIterator first, Size, Generator)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestGenerateNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::generate_n(sys, vec.begin(), vec.size(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestGenerateNDispatchExplicit);
+
+
+template<typename ForwardIterator, typename Size, typename Generator>
+ForwardIterator generate_n(my_tag, ForwardIterator first, Size, Generator)
+{
+    *first = 13;
+    return first;
+}
+
+void TestGenerateNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::generate_n(thrust::retag<my_tag>(vec.begin()),
+                       vec.size(),
+                       0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestGenerateNDispatchImplicit);
+
+template <typename T>
+void TestGenerateNToDiscardIterator(const size_t n)
+{
+    T value = 13;
+    return_value<T> f(value);
+
+    thrust::discard_iterator<thrust::host_system_tag> h_result = 
+      thrust::generate_n(thrust::discard_iterator<thrust::host_system_tag>(), n, f);
+
+    thrust::discard_iterator<thrust::device_system_tag> d_result = 
+      thrust::generate_n(thrust::discard_iterator<thrust::device_system_tag>(), n, f);
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestGenerateNToDiscardIterator);
+
+
+template <typename Vector>
+void TestGenerateZipIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3,T(0));
+    Vector v2(3,T(0));
+
+    thrust::generate(thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(v1.end(),v2.end())),
+                     return_value< thrust::tuple<T,T> > (thrust::tuple<T,T>(4,7)));
+
+    ASSERT_EQUAL(v1[0], 4);
+    ASSERT_EQUAL(v1[1], 4);
+    ASSERT_EQUAL(v1[2], 4);
+    ASSERT_EQUAL(v2[0], 7);
+    ASSERT_EQUAL(v2[1], 7);
+    ASSERT_EQUAL(v2[2], 7);
+};
+DECLARE_VECTOR_UNITTEST(TestGenerateZipIterator);
+
+
+void TestGenerateTuple(void)
+{
+    typedef int T;
+    typedef thrust::tuple<T,T> Tuple;
+
+    thrust::host_vector<Tuple>   h(3, Tuple(0,0));
+    thrust::device_vector<Tuple> d(3, Tuple(0,0));
+
+    thrust::generate(h.begin(), h.end(), return_value<Tuple>(Tuple(4,7)));
+    thrust::generate(d.begin(), d.end(), return_value<Tuple>(Tuple(4,7)));
+
+    ASSERT_EQUAL_QUIET(h, d);
+};
+DECLARE_UNITTEST(TestGenerateTuple);
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/thrust/testing/generate_const_iterators.cu b/thrust/testing/generate_const_iterators.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd12bfb3b201df52d35d2fba206d5fdf00630599
--- /dev/null
+++ b/thrust/testing/generate_const_iterators.cu
@@ -0,0 +1,29 @@
+#include <unittest/runtime_static_assert.h>
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+
+struct generator
+{
+    __host__ __device__
+    int operator()() const
+    {
+        return 1;
+    }
+};
+
+void TestGenerateConstIteratorCompilationError()
+{
+    thrust::host_vector<int> test1(10);
+
+    ASSERT_STATIC_ASSERT(thrust::generate(test1.cbegin(), test1.cend(), generator()));
+    ASSERT_STATIC_ASSERT(thrust::generate_n(test1.cbegin(), 10, generator()));
+}
+DECLARE_UNITTEST(TestGenerateConstIteratorCompilationError);
+
+void TestFillConstIteratorCompilationError()
+{
+    thrust::host_vector<int> test1(10);
+    ASSERT_STATIC_ASSERT(thrust::fill(test1.cbegin(), test1.cend(), 1));
+}
+DECLARE_UNITTEST(TestFillConstIteratorCompilationError);
+
diff --git a/thrust/testing/inner_product.cu b/thrust/testing/inner_product.cu
new file mode 100644
index 0000000000000000000000000000000000000000..07cce1dc1723fc65946e102434a3716c86d8480a
--- /dev/null
+++ b/thrust/testing/inner_product.cu
@@ -0,0 +1,173 @@
+#include <unittest/unittest.h>
+#include <thrust/inner_product.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <thrust/device_vector.h>
+
+template <class Vector>
+void TestInnerProductSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3);
+    Vector v2(3);
+    v1[0] =  1; v1[1] = -2; v1[2] =  3;
+    v2[0] = -4; v2[1] =  5; v2[2] =  6;
+
+    T init = 3;
+    T result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), init);
+    ASSERT_EQUAL(result, 7);
+}
+DECLARE_VECTOR_UNITTEST(TestInnerProductSimple);
+
+
+template <typename InputIterator1, typename InputIterator2, typename OutputType>
+int inner_product(my_system &system, InputIterator1, InputIterator1, InputIterator2, OutputType)
+{
+    system.validate_dispatch();
+    return 13;
+}
+
+void TestInnerProductDispatchExplicit()
+{
+    thrust::device_vector<int> vec;
+
+    my_system sys(0);
+    thrust::inner_product(sys,
+                          vec.begin(),
+                          vec.end(),
+                          vec.begin(),
+                          0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestInnerProductDispatchExplicit);
+
+
+template <typename InputIterator1, typename InputIterator2, typename OutputType>
+int inner_product(my_tag, InputIterator1, InputIterator1, InputIterator2, OutputType)
+{
+    return 13;
+}
+
+void TestInnerProductDispatchImplicit()
+{
+    thrust::device_vector<int> vec;
+
+    int result = thrust::inner_product(thrust::retag<my_tag>(vec.begin()),
+                                       thrust::retag<my_tag>(vec.end()),
+                                       thrust::retag<my_tag>(vec.begin()),
+                                       0);
+
+    ASSERT_EQUAL(13, result);
+}
+DECLARE_UNITTEST(TestInnerProductDispatchImplicit);
+
+template <class Vector>
+void TestInnerProductWithOperator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v1(3);
+    Vector v2(3);
+    v1[0] =  1; v1[1] = -2; v1[2] =  3;
+    v2[0] = -1; v2[1] =  3; v2[2] =  6;
+
+    // compute (v1 - v2) and perform a multiplies reduction
+    T init = 3;
+    T result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), init, 
+                                      thrust::multiplies<T>(), thrust::minus<T>());
+    ASSERT_EQUAL(result, 90);
+}
+DECLARE_VECTOR_UNITTEST(TestInnerProductWithOperator);
+
+template <typename T>
+struct TestInnerProduct
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T> h_v1 = unittest::random_integers<T>(n);
+        thrust::host_vector<T> h_v2 = unittest::random_integers<T>(n);
+
+        thrust::device_vector<T> d_v1 = h_v1;
+        thrust::device_vector<T> d_v2 = h_v2;
+
+        T init = 13;
+
+        T expected = thrust::inner_product(h_v1.begin(), h_v1.end(), h_v2.begin(), init);
+        T result   = thrust::inner_product(d_v1.begin(), d_v1.end(), d_v2.begin(), init);
+
+        ASSERT_EQUAL(expected, result);
+    }
+};
+VariableUnitTest<TestInnerProduct, IntegralTypes> TestInnerProductInstance;
+
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    long long operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestInnerProductWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
+
+    ASSERT_EQUAL(thrust::inner_product(
+        thrust::device,
+        begin, end,
+        begin,
+        0ll,
+        thrust::plus<long long>(),
+        fn), (1ll << magnitude));
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInnerProductWithBigIndexes()
+{
+    TestInnerProductWithBigIndexesHelper(30);
+    TestInnerProductWithBigIndexesHelper(31);
+    TestInnerProductWithBigIndexesHelper(32);
+    TestInnerProductWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestInnerProductWithBigIndexes);
+
+void TestInnerProductPlaceholders()
+{ // Regression test for thrust/thrust#1178
+  using namespace thrust::placeholders;
+
+  thrust::device_vector<float> v1(100, 1.f);
+  thrust::device_vector<float> v2(100, 1.f);
+
+  auto result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0f,
+                                      thrust::plus<float>{},
+                                      _1 * _2 + 1.0f);
+
+  ASSERT_ALMOST_EQUAL(result, 200.f);
+}
+DECLARE_UNITTEST(TestInnerProductPlaceholders);
diff --git a/thrust/testing/is_contiguous_iterator.cu b/thrust/testing/is_contiguous_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..63a307b7bbab136fdc5a682d0536116b97a8a21e
--- /dev/null
+++ b/thrust/testing/is_contiguous_iterator.cu
@@ -0,0 +1,136 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/static_assert.h>
+#include <iterator>
+#include <vector>
+#if THRUST_CPP_DIALECT >= 2011
+  #include <array>
+  #include <unordered_map>
+  #include <unordered_set>
+#endif
+#include <string>
+#if THRUST_CPP_DIALECT >= 2017
+  #include <string_view>
+#endif
+#include <deque>
+#include <list>
+#include <map>
+#include <set>
+#include <thrust/device_ptr.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::string::iterator
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::wstring::iterator
+>::value));
+
+#if THRUST_CPP_DIALECT >= 2017
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::string_view::iterator
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::wstring_view::iterator
+>::value));
+#endif
+
+THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+  std::vector<bool>::iterator
+>::value));
+
+template <typename T>
+__host__
+void test_is_contiguous_iterator()
+{
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    T*
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    T const*
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    thrust::device_ptr<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename std::vector<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::vector<T>::reverse_iterator
+  >::value));
+
+  #if THRUST_CPP_DIALECT >= 2011
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename std::array<T, 1>::iterator
+  >::value));
+  #endif
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::list<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::deque<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::set<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::multiset<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::map<T, T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::multimap<T, T>::iterator
+  >::value));
+
+  #if THRUST_CPP_DIALECT >= 2011
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_set<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_multiset<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_map<T, T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_multimap<T, T>::iterator
+  >::value));
+  #endif
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    std::istream_iterator<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    std::ostream_iterator<T>
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_contiguous_iterator);
+
+template <typename Vector>
+__host__
+void test_is_contiguous_iterator_vectors()
+{
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename Vector::iterator
+  >::value));
+}
+DECLARE_VECTOR_UNITTEST(test_is_contiguous_iterator_vectors);
+
diff --git a/thrust/testing/is_operator_function_object.cu b/thrust/testing/is_operator_function_object.cu
new file mode 100644
index 0000000000000000000000000000000000000000..935ee1e55d4193ecb49eede8639460b40a5c7d83
--- /dev/null
+++ b/thrust/testing/is_operator_function_object.cu
@@ -0,0 +1,195 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/type_traits/is_operator_less_or_greater_function_object.h>
+#include <thrust/type_traits/is_operator_plus_function_object.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+  std::less<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+  std::greater<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+  std::less<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+  std::greater<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+  std::plus<>
+>::value));
+#endif
+
+template <typename T>
+__host__
+void test_is_operator_less_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_less_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_greater_function_object()
+{
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_greater_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_less_or_greater_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_less_or_greater_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_plus_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+    thrust::plus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::minus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+    std::plus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::minus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_plus_function_object);
+
diff --git a/thrust/testing/is_partitioned.cu b/thrust/testing/is_partitioned.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e503f32a3d8938a5f1e8ded1dcde2c71da11fcee
--- /dev/null
+++ b/thrust/testing/is_partitioned.cu
@@ -0,0 +1,101 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+template<typename Vector>
+void TestIsPartitionedSimple(void)
+{
+  typedef typename Vector::value_type T;
+
+  Vector v(4);
+  v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
+
+  // empty partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(v.begin(), v.begin(), thrust::identity<T>()));
+
+  // one element true partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(v.begin(), v.begin() + 1, thrust::identity<T>()));
+
+  // just true partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(v.begin(), v.begin() + 2, thrust::identity<T>()));
+
+  // both true & false partitions
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(v.begin(), v.end(), thrust::identity<T>()));
+
+  // one element false partition
+  ASSERT_EQUAL_QUIET(true, thrust::is_partitioned(v.begin() + 3, v.end(), thrust::identity<T>()));
+
+  v[0] = 1; v[1] = 0; v[2] = 1; v[3] = 1;
+
+  // not partitioned
+  ASSERT_EQUAL_QUIET(false, thrust::is_partitioned(v.begin(), v.end(), thrust::identity<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestIsPartitionedSimple);
+
+template <class Vector>
+void TestIsPartitioned(void)
+{
+  typedef typename Vector::value_type T;
+
+  const size_t n = (1 << 16) + 13;
+
+  Vector v = unittest::random_integers<T>(n);
+
+  v[0] = 1;
+  v[1] = 0;
+
+  ASSERT_EQUAL(false, thrust::is_partitioned(v.begin(), v.end(), is_even<T>()));
+
+  thrust::partition(v.begin(), v.end(), is_even<T>());
+
+  ASSERT_EQUAL(true, thrust::is_partitioned(v.begin(), v.end(), is_even<T>()));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsPartitioned);
+
+
+template<typename InputIterator, typename Predicate>
+bool is_partitioned(my_system &system, InputIterator /*first*/, InputIterator, Predicate)
+{
+  system.validate_dispatch();
+  return false;
+}
+
+void TestIsPartitionedDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::is_partitioned(sys, vec.begin(), vec.end(), 0);
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestIsPartitionedDispatchExplicit);
+
+
+template<typename InputIterator, typename Predicate>
+bool is_partitioned(my_tag, InputIterator first, InputIterator, Predicate)
+{
+  *first = 13;
+  return false;
+}
+
+void TestIsPartitionedDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::is_partitioned(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.end()),
+                         0);
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestIsPartitionedDispatchImplicit);
+
diff --git a/thrust/testing/is_sorted.cu b/thrust/testing/is_sorted.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9edb7ed22e44d311c29fb878864f3183968663e8
--- /dev/null
+++ b/thrust/testing/is_sorted.cu
@@ -0,0 +1,116 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+template <class Vector>
+void TestIsSortedSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(4);
+    v[0] = 0; v[1] = 5; v[2] = 8; v[3] = 0;
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 0), true);
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 1), true);
+
+    // the following line crashes gcc 4.3
+#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 3)
+    // do nothing
+#else
+    // compile this line on other compilers
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 2), true);
+#endif // GCC
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 3), true);
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 4), false);
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 3, thrust::less<T>()),    true);
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 1, thrust::greater<T>()), true);
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.begin() + 4, thrust::greater<T>()), false);
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.end()), false);
+}
+DECLARE_VECTOR_UNITTEST(TestIsSortedSimple);
+
+template <class Vector>
+void TestIsSortedRepeatedElements(void)
+{
+  Vector v(10);
+
+  v[0] = 0;
+  v[1] = 1;
+  v[2] = 1;
+  v[3] = 2;
+  v[4] = 3;
+  v[5] = 4;
+  v[6] = 5;
+  v[7] = 5;
+  v[8] = 5;
+  v[9] = 6;
+
+  ASSERT_EQUAL(true, thrust::is_sorted(v.begin(), v.end()));
+}
+DECLARE_VECTOR_UNITTEST(TestIsSortedRepeatedElements);
+
+
+template <class Vector>
+void TestIsSorted(void)
+{
+    typedef typename Vector::value_type T;
+
+    const size_t n = (1 << 16) + 13;
+
+    Vector v = unittest::random_integers<T>(n);
+
+    v[0] = 1;
+    v[1] = 0;
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.end()), false);
+
+    thrust::sort(v.begin(), v.end());
+
+    ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.end()), true);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsSorted);
+
+
+template<typename InputIterator>
+bool is_sorted(my_system &system, InputIterator /*first*/, InputIterator)
+{
+  system.validate_dispatch();
+  return false;
+}
+
+void TestIsSortedDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::is_sorted(sys,
+                    vec.begin(),
+                    vec.end());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestIsSortedDispatchExplicit);
+
+
+template<typename InputIterator>
+bool is_sorted(my_tag, InputIterator first, InputIterator)
+{
+  *first = 13;
+  return false;
+}
+
+void TestIsSortedDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::is_sorted(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.end()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestIsSortedDispatchImplicit);
+
diff --git a/thrust/testing/is_sorted_until.cu b/thrust/testing/is_sorted_until.cu
new file mode 100644
index 0000000000000000000000000000000000000000..128395581ef1198f0cc9df4e93f151cd759c8f9c
--- /dev/null
+++ b/thrust/testing/is_sorted_until.cu
@@ -0,0 +1,136 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+template<typename Vector>
+void TestIsSortedUntilSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(4);
+    v[0] = 0; v[1] = 5; v[2] = 8; v[3] = 0;
+
+    Iterator first = v.begin();
+
+    Iterator last  = v.begin() + 0;
+    Iterator ref = last;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last));
+
+    last = v.begin() + 1;
+    ref = last;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last));
+
+    last = v.begin() + 2;
+    ref = last;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last));
+
+    last = v.begin() + 3;
+    ref = v.begin() + 3;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last));
+
+    last = v.begin() + 4;
+    ref = v.begin() + 3;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last));
+
+    last = v.begin() + 3;
+    ref = v.begin() + 3;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last, thrust::less<T>()));
+
+    last = v.begin() + 4;
+    ref = v.begin() + 3;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last, thrust::less<T>()));
+
+    last = v.begin() + 1;
+    ref = v.begin() + 1;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last, thrust::greater<T>()));
+
+    last = v.begin() + 4;
+    ref = v.begin() + 1;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last, thrust::greater<T>()));
+
+    first = v.begin() + 2;
+    last = v.begin() + 4;
+    ref = v.begin() + 4;
+    ASSERT_EQUAL_QUIET(ref, thrust::is_sorted_until(first, last, thrust::greater<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestIsSortedUntilSimple);
+
+template<typename Vector>
+void TestIsSortedUntilRepeatedElements(void)
+{
+  Vector v(10);
+
+  v[0] = 0;
+  v[1] = 1;
+  v[2] = 1;
+  v[3] = 2;
+  v[4] = 3;
+  v[5] = 4;
+  v[6] = 5;
+  v[7] = 5;
+  v[8] = 5;
+  v[9] = 6;
+
+  ASSERT_EQUAL_QUIET(v.end(), thrust::is_sorted_until(v.begin(), v.end()));
+}
+DECLARE_VECTOR_UNITTEST(TestIsSortedUntilRepeatedElements);
+
+template <class Vector>
+void TestIsSortedUntil(void)
+{
+    typedef typename Vector::value_type T;
+
+    const size_t n = (1 << 16) + 13;
+
+    Vector v = unittest::random_integers<T>(n);
+
+    v[0] = 1;
+    v[1] = 0;
+
+    ASSERT_EQUAL_QUIET(v.begin() + 1, thrust::is_sorted_until(v.begin(), v.end()));
+
+    thrust::sort(v.begin(), v.end());
+
+    ASSERT_EQUAL_QUIET(v.end(), thrust::is_sorted_until(v.begin(), v.end()));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsSortedUntil);
+
+
+template<typename ForwardIterator>
+ForwardIterator is_sorted_until(my_system &system, ForwardIterator first, ForwardIterator)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestIsSortedUntilExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::is_sorted_until(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestIsSortedUntilExplicit);
+
+
+template<typename ForwardIterator>
+ForwardIterator is_sorted_until(my_tag, ForwardIterator first, ForwardIterator)
+{
+    *first = 13;
+    return first;
+}
+
+void TestIsSortedUntilImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::is_sorted_until(thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestIsSortedUntilImplicit);
+
diff --git a/thrust/testing/logical.cu b/thrust/testing/logical.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0a2b6edc984385046c503adc975cc8d8fd42938d
--- /dev/null
+++ b/thrust/testing/logical.cu
@@ -0,0 +1,185 @@
+#include <unittest/unittest.h>
+#include <thrust/logical.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+
+template <class Vector>
+void TestAllOf(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(3, 1);
+
+    ASSERT_EQUAL(thrust::all_of(v.begin(), v.end(), thrust::identity<T>()), true);
+
+    v[1] = 0;
+    
+    ASSERT_EQUAL(thrust::all_of(v.begin(), v.end(), thrust::identity<T>()), false);
+
+    ASSERT_EQUAL(thrust::all_of(v.begin() + 0, v.begin() + 0, thrust::identity<T>()), true);
+    ASSERT_EQUAL(thrust::all_of(v.begin() + 0, v.begin() + 1, thrust::identity<T>()), true);
+    ASSERT_EQUAL(thrust::all_of(v.begin() + 0, v.begin() + 2, thrust::identity<T>()), false);
+    ASSERT_EQUAL(thrust::all_of(v.begin() + 1, v.begin() + 2, thrust::identity<T>()), false);
+}
+DECLARE_VECTOR_UNITTEST(TestAllOf);
+
+
+template <class InputIterator, class Predicate>
+bool all_of(my_system &system, InputIterator, InputIterator, Predicate)
+{
+    system.validate_dispatch();
+    return false;
+}
+
+void TestAllOfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::all_of(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestAllOfDispatchExplicit);
+
+
+template <class InputIterator, class Predicate>
+bool all_of(my_tag, InputIterator first, InputIterator, Predicate)
+{
+    *first = 13;
+    return false;
+}
+
+void TestAllOfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::all_of(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.end()),
+                   0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestAllOfDispatchImplicit);
+
+
+template <class Vector>
+void TestAnyOf(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(3, 1);
+
+    ASSERT_EQUAL(thrust::any_of(v.begin(), v.end(), thrust::identity<T>()), true);
+
+    v[1] = 0;
+    
+    ASSERT_EQUAL(thrust::any_of(v.begin(), v.end(), thrust::identity<T>()), true);
+
+    ASSERT_EQUAL(thrust::any_of(v.begin() + 0, v.begin() + 0, thrust::identity<T>()), false);
+    ASSERT_EQUAL(thrust::any_of(v.begin() + 0, v.begin() + 1, thrust::identity<T>()), true);
+    ASSERT_EQUAL(thrust::any_of(v.begin() + 0, v.begin() + 2, thrust::identity<T>()), true);
+    ASSERT_EQUAL(thrust::any_of(v.begin() + 1, v.begin() + 2, thrust::identity<T>()), false);
+}
+DECLARE_VECTOR_UNITTEST(TestAnyOf);
+
+
+template <class InputIterator, class Predicate>
+bool any_of(my_system &system, InputIterator, InputIterator, Predicate)
+{
+    system.validate_dispatch();
+    return false;
+}
+
+void TestAnyOfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::any_of(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestAnyOfDispatchExplicit);
+
+
+template <class InputIterator, class Predicate>
+bool any_of(my_tag, InputIterator first, InputIterator, Predicate)
+{
+    *first = 13;
+    return false;
+}
+
+void TestAnyOfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::any_of(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.end()),
+                   0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestAnyOfDispatchImplicit);
+
+
+template <class Vector>
+void TestNoneOf(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(3, 1);
+
+    ASSERT_EQUAL(thrust::none_of(v.begin(), v.end(), thrust::identity<T>()), false);
+
+    v[1] = 0;
+    
+    ASSERT_EQUAL(thrust::none_of(v.begin(), v.end(), thrust::identity<T>()), false);
+
+    ASSERT_EQUAL(thrust::none_of(v.begin() + 0, v.begin() + 0, thrust::identity<T>()), true);
+    ASSERT_EQUAL(thrust::none_of(v.begin() + 0, v.begin() + 1, thrust::identity<T>()), false);
+    ASSERT_EQUAL(thrust::none_of(v.begin() + 0, v.begin() + 2, thrust::identity<T>()), false);
+    ASSERT_EQUAL(thrust::none_of(v.begin() + 1, v.begin() + 2, thrust::identity<T>()), true);
+}
+DECLARE_VECTOR_UNITTEST(TestNoneOf);
+
+
+template <class InputIterator, class Predicate>
+bool none_of(my_system &system, InputIterator, InputIterator, Predicate)
+{
+    system.validate_dispatch();
+    return false;
+}
+
+void TestNoneOfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::none_of(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestNoneOfDispatchExplicit);
+
+
+template <class InputIterator, class Predicate>
+bool none_of(my_tag, InputIterator first, InputIterator, Predicate)
+{
+    *first = 13;
+    return false;
+}
+
+void TestNoneOfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::none_of(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.end()),
+                    0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestNoneOfDispatchImplicit);
+
diff --git a/thrust/testing/max_element.cu b/thrust/testing/max_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4562392641b7715da5ceb0ca339755920021258c
--- /dev/null
+++ b/thrust/testing/max_element.cu
@@ -0,0 +1,124 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/functional.h>
+
+template <class Vector>
+void TestMaxElementSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::max_element(data.begin(), data.end()), 5);
+    ASSERT_EQUAL( thrust::max_element(data.begin(), data.end()) - data.begin(), 1);
+    
+    ASSERT_EQUAL( *thrust::max_element(data.begin(), data.end(), thrust::greater<T>()), 1);
+    ASSERT_EQUAL( thrust::max_element(data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
+}
+DECLARE_VECTOR_UNITTEST(TestMaxElementSimple);
+
+template <class Vector>
+void TestMaxElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::max_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())), -1);
+    ASSERT_EQUAL( *thrust::max_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>()),
+          thrust::greater<T>()), -5);
+    
+}
+DECLARE_VECTOR_UNITTEST(TestMaxElementWithTransform);
+
+template<typename T>
+void TestMaxElement(const size_t n)
+{
+    thrust::host_vector<T> h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_max = thrust::max_element(h_data.begin(), h_data.end());
+    typename thrust::device_vector<T>::iterator d_max = thrust::max_element(d_data.begin(), d_data.end());
+
+    ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+    
+    typename thrust::host_vector<T>::iterator   h_min = thrust::max_element(h_data.begin(), h_data.end(), thrust::greater<T>());
+    typename thrust::device_vector<T>::iterator d_min = thrust::max_element(d_data.begin(), d_data.end(), thrust::greater<T>());
+
+    ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+}
+DECLARE_VARIABLE_UNITTEST(TestMaxElement);
+
+
+template<typename ForwardIterator>
+ForwardIterator max_element(my_system &system, ForwardIterator first, ForwardIterator)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestMaxElementDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::max_element(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMaxElementDispatchExplicit);
+
+
+template<typename ForwardIterator>
+ForwardIterator max_element(my_tag, ForwardIterator first, ForwardIterator)
+{
+    *first = 13;
+    return first;
+}
+
+void TestMaxElementDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::max_element(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMaxElementDispatchImplicit);
+
+void TestMaxElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(*thrust::max_element(thrust::device, begin, end), (1ll << magnitude));
+}
+
+void TestMaxElementWithBigIndexes()
+{
+    TestMaxElementWithBigIndexesHelper(30);
+    TestMaxElementWithBigIndexesHelper(31);
+    TestMaxElementWithBigIndexesHelper(32);
+    TestMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMaxElementWithBigIndexes);
diff --git a/thrust/testing/memory.cu b/thrust/testing/memory.cu
new file mode 100644
index 0000000000000000000000000000000000000000..622b06a0af047772be39794487b15bf2b2ae88de
--- /dev/null
+++ b/thrust/testing/memory.cu
@@ -0,0 +1,348 @@
+#include <iostream>
+#include <unittest/unittest.h>
+#include <thrust/memory.h>
+#include <thrust/sort.h>
+#include <thrust/memory.h>
+#include <thrust/pair.h>
+#include <thrust/fill.h>
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+#include <thrust/reverse.h>
+
+// Define a new system class, as the my_system one is already used with a thrust::sort template definition
+// that calls back into sort.cu
+class my_memory_system : public thrust::device_execution_policy<my_memory_system>
+{
+  public:
+    my_memory_system(int)
+      : correctly_dispatched(false),
+        num_copies(0)
+    {}
+
+    my_memory_system(const my_memory_system &other)
+      : correctly_dispatched(false),
+        num_copies(other.num_copies + 1)
+    {}
+
+    void validate_dispatch()
+    {
+      correctly_dispatched = (num_copies == 0);
+    }
+
+    bool is_valid()
+    {
+      return correctly_dispatched;
+    }
+
+  private:
+    bool correctly_dispatched;
+
+    // count the number of copies so that we can validate
+    // that dispatch does not introduce any
+    unsigned int num_copies;
+
+
+    // disallow default construction
+    my_memory_system();
+};
+
+namespace my_old_namespace
+{
+
+struct my_old_temporary_allocation_system
+  : public thrust::device_execution_policy<my_old_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_old_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_old_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_old_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(4217));
+
+  return thrust::make_pair(result, 314);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_old_temporary_allocation_system, Pointer p)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(4217));
+}
+
+} // my_old_namespace
+
+namespace my_new_namespace
+{
+
+struct my_new_temporary_allocation_system
+  : public thrust::device_execution_policy<my_new_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_new_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_new_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_new_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(1742));
+
+  return thrust::make_pair(result, 413);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer p)
+{
+  // This should never be called (the three-argument with size overload below
+  // should be preferred) and shouldn't be ambiguous.
+  ASSERT_EQUAL(true, false);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer p, std::ptrdiff_t n)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(1742));
+  ASSERT_EQUAL(n, 413);
+}
+
+} // my_new_namespace
+
+template<typename T1, typename T2>
+bool are_same(const T1 &, const T2 &)
+{
+  return false;
+}
+
+
+template<typename T>
+bool are_same(const T &, const T &)
+{
+  return true;
+}
+
+
+void TestSelectSystemDifferentTypes()
+{
+  using thrust::system::detail::generic::select_system;
+
+  my_memory_system my_sys(0);
+  thrust::device_system_tag device_sys;
+
+  // select_system(my_system, device_system_tag) should return device_system_tag (the minimum tag)
+  bool is_device_system_tag = are_same(device_sys, select_system(my_sys, device_sys));
+  ASSERT_EQUAL(true, is_device_system_tag);
+
+  // select_system(device_system_tag, my_tag) should return device_system_tag (the minimum tag)
+  is_device_system_tag = are_same(device_sys, select_system(device_sys, my_sys));
+  ASSERT_EQUAL(true, is_device_system_tag);
+}
+DECLARE_UNITTEST(TestSelectSystemDifferentTypes);
+
+
+void TestSelectSystemSameTypes()
+{
+  using thrust::system::detail::generic::select_system;
+
+  my_memory_system my_sys(0);
+  thrust::device_system_tag device_sys;
+  thrust::host_system_tag host_sys;
+
+  // select_system(host_system_tag, host_system_tag) should return host_system_tag
+  bool is_host_system_tag = are_same(host_sys, select_system(host_sys, host_sys));
+  ASSERT_EQUAL(true, is_host_system_tag);
+
+  // select_system(device_system_tag, device_system_tag) should return device_system_tag
+  bool is_device_system_tag = are_same(device_sys, select_system(device_sys, device_sys));
+  ASSERT_EQUAL(true, is_device_system_tag);
+
+  // select_system(my_system, my_system) should return my_system
+  bool is_my_system = are_same(my_sys, select_system(my_sys, my_sys));
+  ASSERT_EQUAL(true, is_my_system);
+}
+DECLARE_UNITTEST(TestSelectSystemSameTypes);
+
+
+void TestGetTemporaryBuffer()
+{
+  const std::ptrdiff_t n = 9001;
+
+  thrust::device_system_tag dev_tag;
+  typedef thrust::pointer<int, thrust::device_system_tag> pointer;
+  thrust::pair<pointer, std::ptrdiff_t> ptr_and_sz = thrust::get_temporary_buffer<int>(dev_tag, n);
+
+  ASSERT_EQUAL(ptr_and_sz.second, n);
+
+  const int ref_val = 13;
+  thrust::device_vector<int> ref(n, ref_val);
+
+  thrust::fill_n(ptr_and_sz.first, n, ref_val);
+
+  ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
+
+  thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first, ptr_and_sz.second);
+}
+DECLARE_UNITTEST(TestGetTemporaryBuffer);
+
+
+void TestMalloc()
+{
+  const std::ptrdiff_t n = 9001;
+
+  thrust::device_system_tag dev_tag;
+  typedef thrust::pointer<int, thrust::device_system_tag> pointer;
+  pointer ptr = pointer(static_cast<int*>(thrust::malloc(dev_tag, sizeof(int) * n).get()));
+
+  const int ref_val = 13;
+  thrust::device_vector<int> ref(n, ref_val);
+
+  thrust::fill_n(ptr, n, ref_val);
+
+  ASSERT_EQUAL(true, thrust::all_of(ptr, ptr + n, thrust::placeholders::_1 == ref_val));
+
+  thrust::free(dev_tag, ptr);
+}
+DECLARE_UNITTEST(TestMalloc);
+
+
+thrust::pointer<void,my_memory_system>
+  malloc(my_memory_system &system, std::size_t)
+{
+  system.validate_dispatch();
+
+  return thrust::pointer<void,my_memory_system>();
+}
+
+
+void TestMallocDispatchExplicit()
+{
+  const size_t n = 0;
+
+  my_memory_system sys(0);
+  thrust::malloc(sys, n);
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMallocDispatchExplicit);
+
+
+template<typename Pointer>
+void free(my_memory_system &system, Pointer)
+{
+  system.validate_dispatch();
+}
+
+
+void TestFreeDispatchExplicit()
+{
+  thrust::pointer<my_memory_system,void> ptr;
+
+  my_memory_system sys(0);
+  thrust::free(sys, ptr);
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestFreeDispatchExplicit);
+
+
+template<typename T>
+  thrust::pair<thrust::pointer<T,my_memory_system>, std::ptrdiff_t>
+    get_temporary_buffer(my_memory_system &system, std::ptrdiff_t n)
+{
+  system.validate_dispatch();
+
+  thrust::device_system_tag device_sys;
+  thrust::pair<thrust::pointer<T, thrust::device_system_tag>, std::ptrdiff_t> result = thrust::get_temporary_buffer<T>(device_sys, n);
+  return thrust::make_pair(thrust::pointer<T,my_memory_system>(result.first.get()), result.second);
+}
+
+
+void TestGetTemporaryBufferDispatchExplicit()
+{
+  const std::ptrdiff_t n = 9001;
+
+  my_memory_system sys(0);
+  typedef thrust::pointer<int, thrust::device_system_tag> pointer;
+  thrust::pair<pointer, std::ptrdiff_t> ptr_and_sz = thrust::get_temporary_buffer<int>(sys, n);
+
+  ASSERT_EQUAL(ptr_and_sz.second, n);
+  ASSERT_EQUAL(true, sys.is_valid());
+
+  const int ref_val = 13;
+  thrust::device_vector<int> ref(n, ref_val);
+
+  thrust::fill_n(ptr_and_sz.first, n, ref_val);
+
+  ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
+
+  thrust::return_temporary_buffer(sys, ptr_and_sz.first, ptr_and_sz.second);
+}
+DECLARE_UNITTEST(TestGetTemporaryBufferDispatchExplicit);
+
+
+void TestGetTemporaryBufferDispatchImplicit()
+{
+  if(are_same(thrust::device_system_tag(), thrust::system::cpp::tag()))
+  {
+    // XXX cpp uses the internal scalar backend, which currently elides user tags
+    KNOWN_FAILURE;
+  }
+  else
+  {
+    thrust::device_vector<int> vec(9001);
+
+    thrust::sequence(vec.begin(), vec.end());
+    thrust::reverse(vec.begin(), vec.end());
+
+    // call something we know will invoke get_temporary_buffer
+    my_memory_system sys(0);
+    thrust::sort(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, thrust::is_sorted(vec.begin(), vec.end()));
+    ASSERT_EQUAL(true, sys.is_valid());
+  }
+}
+DECLARE_UNITTEST(TestGetTemporaryBufferDispatchImplicit);
+
+
+void TestTemporaryBufferOldCustomization()
+{
+  typedef my_old_namespace::my_old_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_old_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(4217));
+    ASSERT_EQUAL(ps.second, 314);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferOldCustomization);
+
+
+void TestTemporaryBufferNewCustomization()
+{
+  typedef my_new_namespace::my_new_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_new_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(1742));
+    ASSERT_EQUAL(ps.second, 413);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferNewCustomization);
diff --git a/thrust/testing/merge.cu b/thrust/testing/merge.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d225edb074db56c2b4c1d248480f5793c02f0419
--- /dev/null
+++ b/thrust/testing/merge.cu
@@ -0,0 +1,205 @@
+#include <unittest/unittest.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+template<typename Vector>
+void TestMergeSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(7);
+  ref[0] = 0;
+  ref[1] = 0;
+  ref[2] = 2;
+  ref[3] = 3;
+  ref[4] = 3;
+  ref[5] = 4;
+  ref[6] = 4;
+
+  Vector result(7);
+
+  Iterator end = thrust::merge(a.begin(), a.end(),
+                               b.begin(), b.end(),
+                               result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestMergeSimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator merge(my_system &system,
+                     InputIterator1,
+                     InputIterator1,
+                     InputIterator2,
+                     InputIterator2,
+                     OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestMergeDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::merge(sys,
+                vec.begin(),
+                vec.begin(),
+                vec.begin(),
+                vec.begin(),
+                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMergeDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator merge(my_tag,
+                     InputIterator1,
+                     InputIterator1,
+                     InputIterator2,
+                     InputIterator2,
+                     OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestMergeDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::merge(thrust::retag<my_tag>(vec.begin()),
+                thrust::retag<my_tag>(vec.begin()),
+                thrust::retag<my_tag>(vec.begin()),
+                thrust::retag<my_tag>(vec.begin()),
+                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMergeDispatchImplicit);
+
+
+template<typename T>
+  void TestMerge(size_t n)
+{
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<T> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<T> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<T> h_b(random.begin() + n, random.end());
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  for (size_t i = 0; i < num_sizes; i++)
+  {
+    size_t size = sizes[i];
+    
+    thrust::host_vector<T>   h_result(n + size);
+    thrust::device_vector<T> d_result(n + size);
+
+    typename thrust::host_vector<T>::iterator   h_end;
+    typename thrust::device_vector<T>::iterator d_end;
+    
+    h_end = thrust::merge(h_a.begin(), h_a.end(),
+                          h_b.begin(), h_b.begin() + size,
+                          h_result.begin());
+    h_result.resize(h_end - h_result.begin());
+
+    d_end = thrust::merge(d_a.begin(), d_a.end(),
+                          d_b.begin(), d_b.begin() + size,
+                          d_result.begin());
+    d_result.resize(d_end - d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestMerge);
+
+
+template<typename T>
+  void TestMergeToDiscardIterator(size_t n)
+{
+  thrust::host_vector<T> h_a = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b = unittest::random_integers<T>(n);
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::discard_iterator<> h_result = 
+    thrust::merge(h_a.begin(), h_a.end(),
+                  h_b.begin(), h_b.end(),
+                  thrust::make_discard_iterator());
+
+  thrust::discard_iterator<> d_result = 
+    thrust::merge(d_a.begin(), d_a.end(),
+                  d_b.begin(), d_b.end(),
+                  thrust::make_discard_iterator());
+
+  thrust::discard_iterator<> reference(2 * n);
+
+  ASSERT_EQUAL_QUIET(reference, h_result);
+  ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeToDiscardIterator);
+
+
+template<typename T>
+  void TestMergeDescending(size_t n)
+{
+  thrust::host_vector<T> h_a = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b = unittest::random_integers<T>(n);
+
+  thrust::stable_sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::stable_sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(d_a.size() + d_b.size());
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::merge(h_a.begin(), h_a.end(),
+                        h_b.begin(), h_b.end(),
+                        h_result.begin(),
+                        thrust::greater<T>());
+
+  d_end = thrust::merge(d_a.begin(), d_a.end(),
+                        d_b.begin(), d_b.end(),
+                        d_result.begin(),
+                        thrust::greater<T>());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeDescending);
+
diff --git a/thrust/testing/merge_by_key.cu b/thrust/testing/merge_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4f504add81cd73668e9c46fcc2a3af80e2432c7c
--- /dev/null
+++ b/thrust/testing/merge_by_key.cu
@@ -0,0 +1,319 @@
+#include <unittest/unittest.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+template<typename Vector>
+void TestMergeByKeySimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), a_val(3), b_key(4), b_val(4);
+
+  a_key[0] = 0;  a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 13; a_val[1] = 7; a_val[2] = 42;
+
+  b_key[0] = 0 ; b_key[1] = 3;  b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 42; b_val[1] = 42; b_val[2] = 7; b_val[3] = 13;
+
+  Vector ref_key(7), ref_val(7);
+  ref_key[0] = 0; ref_val[0] = 13;
+  ref_key[1] = 0; ref_val[1] = 42;
+  ref_key[2] = 2; ref_val[2] = 7;
+  ref_key[3] = 3; ref_val[3] = 42;
+  ref_key[4] = 3; ref_val[4] = 7;
+  ref_key[5] = 4; ref_val[5] = 42;
+  ref_key[6] = 4; ref_val[6] = 13;
+
+  Vector result_key(7), result_val(7);
+
+  thrust::pair<Iterator,Iterator> ends =
+    thrust::merge_by_key(a_key.begin(), a_key.end(),
+                         b_key.begin(), b_key.end(),
+                         a_val.begin(), b_val.begin(),
+                         result_key.begin(),
+                         result_val.begin());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), ends.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), ends.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestMergeByKeySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(my_system &system,
+                 InputIterator1,
+                 InputIterator1,
+                 InputIterator2,
+                 InputIterator2,
+                 InputIterator3,
+                 InputIterator4,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestMergeByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::merge_by_key(sys,
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMergeByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(my_tag,
+                 InputIterator1,
+                 InputIterator1,
+                 InputIterator2,
+                 InputIterator2,
+                 InputIterator3,
+                 InputIterator4,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result)
+{
+  *keys_result = 13;
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestMergeByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::merge_by_key(thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMergeByKeyDispatchImplicit);
+
+
+template<typename T>
+  void TestMergeByKey(size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+    thrust::host_vector<T> h_b_vals(random_vals.begin() + size_a, random_vals.end());
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+    thrust::device_vector<T> d_b_vals = h_b_vals;
+
+    thrust::host_vector<T> h_result_keys(n);
+    thrust::host_vector<T> h_result_vals(n);
+
+    thrust::device_vector<T> d_result_keys(n);
+    thrust::device_vector<T> d_result_vals(n);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::merge_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                 h_b_keys.begin(), h_b_keys.end(),
+                                 h_a_vals.begin(),
+                                 h_b_vals.begin(),
+                                 h_result_keys.begin(),
+                                 h_result_vals.begin());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::merge_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                 d_b_keys.begin(), d_b_keys.end(),
+                                 d_a_vals.begin(),
+                                 d_b_vals.begin(),
+                                 d_result_keys.begin(),
+                                 d_result_vals.begin());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeByKey);
+
+
+template<typename T>
+  void TestMergeByKeyToDiscardIterator(size_t n)
+{
+  thrust::host_vector<T> h_a_keys = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_keys = unittest::random_integers<T>(n);
+
+  thrust::host_vector<T> h_a_vals = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_vals = unittest::random_integers<T>(n);
+
+  thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+  thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+  thrust::device_vector<T> d_a_keys = h_a_keys;
+  thrust::device_vector<T> d_b_keys = h_b_keys;
+
+  thrust::device_vector<T> d_a_vals = h_a_vals;
+  thrust::device_vector<T> d_b_vals = h_b_vals;
+
+  typedef thrust::pair<
+    thrust::discard_iterator<>,
+    thrust::discard_iterator<>
+  > discard_pair;
+
+  discard_pair h_result = 
+    thrust::merge_by_key(h_a_keys.begin(), h_a_keys.end(),
+                         h_b_keys.begin(), h_b_keys.end(),
+                         h_a_vals.begin(),
+                         h_b_vals.begin(),
+                         thrust::make_discard_iterator(),
+                         thrust::make_discard_iterator());
+
+  discard_pair d_result = 
+    thrust::merge_by_key(d_a_keys.begin(), d_a_keys.end(),
+                         d_b_keys.begin(), d_b_keys.end(),
+                         d_a_vals.begin(),
+                         d_b_vals.begin(),
+                         thrust::make_discard_iterator(),
+                         thrust::make_discard_iterator());
+
+  thrust::discard_iterator<> reference(2 * n);
+
+  ASSERT_EQUAL_QUIET(reference, h_result.first);
+  ASSERT_EQUAL_QUIET(reference, h_result.second);
+  ASSERT_EQUAL_QUIET(reference, d_result.first);
+  ASSERT_EQUAL_QUIET(reference, d_result.second);
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeByKeyToDiscardIterator);
+
+
+template<typename T>
+  void TestMergeByKeyDescending(size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+    thrust::host_vector<T> h_b_vals(random_vals.begin() + size_a, random_vals.end());
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end(), thrust::greater<T>());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end(), thrust::greater<T>());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+    thrust::device_vector<T> d_b_vals = h_b_vals;
+
+    thrust::host_vector<T> h_result_keys(n);
+    thrust::host_vector<T> h_result_vals(n);
+
+    thrust::device_vector<T> d_result_keys(n);
+    thrust::device_vector<T> d_result_vals(n);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::merge_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                 h_b_keys.begin(), h_b_keys.end(),
+                                 h_a_vals.begin(),
+                                 h_b_vals.begin(),
+                                 h_result_keys.begin(),
+                                 h_result_vals.begin(),
+                                 thrust::greater<T>());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::merge_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                 d_b_keys.begin(), d_b_keys.end(),
+                                 d_a_vals.begin(),
+                                 d_b_vals.begin(),
+                                 d_result_keys.begin(),
+                                 d_result_vals.begin(),
+                                 thrust::greater<T>());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeByKeyDescending);
+
diff --git a/thrust/testing/merge_key_value.cu b/thrust/testing/merge_key_value.cu
new file mode 100644
index 0000000000000000000000000000000000000000..580f0f54a2c2d50237954a0300cfcb543a9ea70f
--- /dev/null
+++ b/thrust/testing/merge_key_value.cu
@@ -0,0 +1,93 @@
+#include <unittest/unittest.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+
+template<typename U>
+  void TestMergeKeyValue(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(d_a.size() + d_b.size());
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::merge(h_a.begin(), h_a.end(),
+                        h_b.begin(), h_b.end(),
+                        h_result.begin());
+
+  d_end = thrust::merge(d_a.begin(), d_a.end(),
+                        d_b.begin(), d_b.end(),
+                        d_result.begin());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeKeyValue);
+
+template<typename U>
+  void TestMergeKeyValueDescending(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::stable_sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(d_a.size() + d_b.size());
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::merge(h_a.begin(), h_a.end(),
+                        h_b.begin(), h_b.end(),
+                        h_result.begin(),
+                        thrust::greater<T>());
+
+  d_end = thrust::merge(d_a.begin(), d_a.end(),
+                        d_b.begin(), d_b.end(),
+                        d_result.begin(),
+                        thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestMergeKeyValueDescending);
+
+
diff --git a/thrust/testing/metaprogamming.cu b/thrust/testing/metaprogamming.cu
new file mode 100644
index 0000000000000000000000000000000000000000..32f0a2e2091182623fb52462aacd8a24e5c6d974
--- /dev/null
+++ b/thrust/testing/metaprogamming.cu
@@ -0,0 +1,28 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/mpl/math.h>
+
+void TestLog2(void)
+{
+    unsigned int result;
+    
+    result = thrust::detail::mpl::math::log2<  1>::value;   ASSERT_EQUAL(result, 0lu);
+    result = thrust::detail::mpl::math::log2<  2>::value;   ASSERT_EQUAL(result, 1lu);
+    result = thrust::detail::mpl::math::log2<  3>::value;   ASSERT_EQUAL(result, 1lu);
+    result = thrust::detail::mpl::math::log2<  4>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  5>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  6>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  7>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  8>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2<  9>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2< 15>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2< 16>::value;   ASSERT_EQUAL(result, 4lu);
+    result = thrust::detail::mpl::math::log2< 17>::value;   ASSERT_EQUAL(result, 4lu);
+    result = thrust::detail::mpl::math::log2<127>::value;   ASSERT_EQUAL(result, 6lu);
+    result = thrust::detail::mpl::math::log2<128>::value;   ASSERT_EQUAL(result, 7lu);
+    result = thrust::detail::mpl::math::log2<129>::value;   ASSERT_EQUAL(result, 7lu);
+    result = thrust::detail::mpl::math::log2<256>::value;   ASSERT_EQUAL(result, 8lu);
+    result = thrust::detail::mpl::math::log2<511>::value;   ASSERT_EQUAL(result, 8lu);
+    result = thrust::detail::mpl::math::log2<512>::value;   ASSERT_EQUAL(result, 9lu);
+}
+DECLARE_UNITTEST(TestLog2);
+
diff --git a/thrust/testing/min_and_max.cu b/thrust/testing/min_and_max.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f8d2717d440887b856dc3dc9b91743b5fb553aa4
--- /dev/null
+++ b/thrust/testing/min_and_max.cu
@@ -0,0 +1,69 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+
+template<typename T>
+struct TestMin
+{
+  void operator()(void)
+  {
+    // 2 < 3
+    T two(2), three(3);
+    ASSERT_EQUAL(two, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two,three));
+    ASSERT_EQUAL(two, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two,three,thrust::less<T>()));
+
+    ASSERT_EQUAL(two, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (three,two));
+    ASSERT_EQUAL(two, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (three,two,thrust::less<T>()));
+
+    ASSERT_EQUAL(three, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two,three,thrust::greater<T>()));
+    ASSERT_EQUAL(three, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (three,two,thrust::greater<T>()));
+
+    typedef key_value<T,T> KV;
+    KV two_and_two(two,two);
+    KV two_and_three(two,three);
+
+    // the first element breaks ties
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two));
+
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three, thrust::less<KV>()));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two, thrust::less<KV>()));
+
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three, thrust::greater<KV>()));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two, thrust::greater<KV>()));
+  }
+};
+SimpleUnitTest<TestMin, NumericTypes> TestMinInstance;
+
+template<typename T>
+struct TestMax
+{
+  void operator()(void)
+  {
+    // 2 < 3
+    T two(2), three(3);
+    ASSERT_EQUAL(three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two,three));
+    ASSERT_EQUAL(three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two,three,thrust::less<T>()));
+
+    ASSERT_EQUAL(three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (three,two));
+    ASSERT_EQUAL(three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (three,two,thrust::less<T>()));
+
+    ASSERT_EQUAL(two, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two,three,thrust::greater<T>()));
+    ASSERT_EQUAL(two, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (three,two,thrust::greater<T>()));
+
+    typedef key_value<T,T> KV;
+    KV two_and_two(two,two);
+    KV two_and_three(two,three);
+
+    // the first element breaks ties
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two));
+
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three, thrust::less<KV>()));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two, thrust::less<KV>()));
+
+    ASSERT_EQUAL_QUIET(two_and_two,   thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_two, two_and_three, thrust::greater<KV>()));
+    ASSERT_EQUAL_QUIET(two_and_three, thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (two_and_three, two_and_two, thrust::greater<KV>()));
+  }
+};
+SimpleUnitTest<TestMax, NumericTypes> TestMaxInstance;
+
diff --git a/thrust/testing/min_element.cu b/thrust/testing/min_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..81fedbdaba44f508ee15a9416549b4ddca5c61e0
--- /dev/null
+++ b/thrust/testing/min_element.cu
@@ -0,0 +1,124 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/retag.h>
+
+template <class Vector>
+void TestMinElementSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::min_element(data.begin(), data.end()), 1);
+    ASSERT_EQUAL( thrust::min_element(data.begin(), data.end()) - data.begin(), 2);
+    
+    ASSERT_EQUAL( *thrust::min_element(data.begin(), data.end(), thrust::greater<T>()), 5);
+    ASSERT_EQUAL( thrust::min_element(data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 1);
+}
+DECLARE_VECTOR_UNITTEST(TestMinElementSimple);
+
+template <class Vector>
+void TestMinElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::min_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())), -5);
+    ASSERT_EQUAL( *thrust::min_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>()),
+          thrust::greater<T>()), -1);
+    
+}
+DECLARE_VECTOR_UNITTEST(TestMinElementWithTransform);
+
+template<typename T>
+void TestMinElement(const size_t n)
+{
+    thrust::host_vector<T> h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_min = thrust::min_element(h_data.begin(), h_data.end());
+    typename thrust::device_vector<T>::iterator d_min = thrust::min_element(d_data.begin(), d_data.end());
+
+    ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+    
+    typename thrust::host_vector<T>::iterator   h_max = thrust::min_element(h_data.begin(), h_data.end(), thrust::greater<T>());
+    typename thrust::device_vector<T>::iterator d_max = thrust::min_element(d_data.begin(), d_data.end(), thrust::greater<T>());
+
+    ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+}
+DECLARE_VARIABLE_UNITTEST(TestMinElement);
+
+
+template<typename ForwardIterator>
+ForwardIterator min_element(my_system &system, ForwardIterator first, ForwardIterator)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestMinElementDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::min_element(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMinElementDispatchExplicit);
+
+
+template<typename ForwardIterator>
+ForwardIterator min_element(my_tag, ForwardIterator first, ForwardIterator)
+{
+    *first = 13;
+    return first;
+}
+
+void TestMinElementDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::min_element(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMinElementDispatchImplicit);
+
+void TestMinElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(
+        *thrust::min_element(thrust::device, begin, end, thrust::greater<long long>()),
+        (1ll << magnitude));
+}
+
+void TestMinElementWithBigIndexes()
+{
+    TestMinElementWithBigIndexesHelper(30);
+    TestMinElementWithBigIndexesHelper(31);
+    TestMinElementWithBigIndexesHelper(32);
+    TestMinElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinElementWithBigIndexes);
diff --git a/thrust/testing/minmax_element.cu b/thrust/testing/minmax_element.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a87f5bb439166ae3de7657bc009a8575bfb54a9
--- /dev/null
+++ b/thrust/testing/minmax_element.cu
@@ -0,0 +1,138 @@
+#include <unittest/unittest.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/retag.h>
+
+template <class Vector>
+void TestMinMaxElementSimple(void)
+{
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::minmax_element(data.begin(), data.end()).first,  1);
+    ASSERT_EQUAL( *thrust::minmax_element(data.begin(), data.end()).second, 5);
+    ASSERT_EQUAL(  thrust::minmax_element(data.begin(), data.end()).first  - data.begin(), 2);
+    ASSERT_EQUAL(  thrust::minmax_element(data.begin(), data.end()).second - data.begin(), 1);
+}
+DECLARE_VECTOR_UNITTEST(TestMinMaxElementSimple);
+  
+template <class Vector>
+void TestMinMaxElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::minmax_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())).first, -5);
+    ASSERT_EQUAL( *thrust::minmax_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())).second, -1);
+}
+DECLARE_VECTOR_UNITTEST(TestMinMaxElementWithTransform);
+
+
+template<typename T>
+void TestMinMaxElement(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_min;
+    typename thrust::host_vector<T>::iterator   h_max;
+    typename thrust::device_vector<T>::iterator d_min;
+    typename thrust::device_vector<T>::iterator d_max;
+
+    h_min = thrust::minmax_element(h_data.begin(), h_data.end()).first;
+    d_min = thrust::minmax_element(d_data.begin(), d_data.end()).first;
+    h_max = thrust::minmax_element(h_data.begin(), h_data.end()).second;
+    d_max = thrust::minmax_element(d_data.begin(), d_data.end()).second;
+
+    ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+    ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+    
+    h_max = thrust::minmax_element(h_data.begin(), h_data.end(), thrust::greater<T>()).first;
+    d_max = thrust::minmax_element(d_data.begin(), d_data.end(), thrust::greater<T>()).first;
+    h_min = thrust::minmax_element(h_data.begin(), h_data.end(), thrust::greater<T>()).second;
+    d_min = thrust::minmax_element(d_data.begin(), d_data.end(), thrust::greater<T>()).second;
+
+    ASSERT_EQUAL(h_min - h_data.begin(), d_min - d_data.begin());
+    ASSERT_EQUAL(h_max - h_data.begin(), d_max - d_data.begin());
+}
+DECLARE_VARIABLE_UNITTEST(TestMinMaxElement);
+
+
+template<typename ForwardIterator>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(my_system &system, ForwardIterator first, ForwardIterator)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(first,first);
+}
+
+void TestMinMaxElementDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::minmax_element(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMinMaxElementDispatchExplicit);
+
+
+template<typename ForwardIterator>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(my_tag, ForwardIterator first, ForwardIterator)
+{
+    *first = 13;
+    return thrust::make_pair(first,first);
+}
+
+void TestMinMaxElementDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::minmax_element(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit);
+
+void TestMinMaxElementWithBigIndexesHelper(int magnitude)
+{
+    typedef thrust::counting_iterator<long long> Iter;
+    Iter begin(1);
+    Iter end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::pair<Iter, Iter> result = thrust::minmax_element(
+        thrust::device, begin, end);
+    ASSERT_EQUAL(*result.first, 1);
+    ASSERT_EQUAL(*result.second, (1ll << magnitude));
+
+    result = thrust::minmax_element(thrust::device, begin, end,
+        thrust::greater<long long>());
+    ASSERT_EQUAL(*result.second, 1);
+    ASSERT_EQUAL(*result.first, (1ll << magnitude));
+}
+
+void TestMinMaxElementWithBigIndexes()
+{
+    TestMinMaxElementWithBigIndexesHelper(30);
+    TestMinMaxElementWithBigIndexesHelper(31);
+    TestMinMaxElementWithBigIndexesHelper(32);
+    TestMinMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes);
diff --git a/thrust/testing/mismatch.cu b/thrust/testing/mismatch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9c2ce351a00ac92fa7cce65f3eb72684ed6188c7
--- /dev/null
+++ b/thrust/testing/mismatch.cu
@@ -0,0 +1,76 @@
+#include <unittest/unittest.h>
+#include <thrust/mismatch.h>
+#include <thrust/iterator/retag.h>
+
+template <class Vector>
+void TestMismatchSimple(void)
+{
+    Vector a(4); Vector b(4);
+    a[0] = 1; b[0] = 1;
+    a[1] = 2; b[1] = 2;
+    a[2] = 3; b[2] = 4;
+    a[3] = 4; b[3] = 3;
+
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).first  - a.begin(), 2);
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).second - b.begin(), 2);
+
+    b[2] = 3;
+    
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).first  - a.begin(), 3);
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).second - b.begin(), 3);
+    
+    b[3] = 4;
+    
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).first  - a.begin(), 4);
+    ASSERT_EQUAL(thrust::mismatch(a.begin(), a.end(), b.begin()).second - b.begin(), 4);
+}
+DECLARE_VECTOR_UNITTEST(TestMismatchSimple);
+
+
+template <typename InputIterator1, typename InputIterator2>
+thrust::pair<InputIterator1, InputIterator2> mismatch(my_system &system,
+                                                      InputIterator1 first,
+                                                      InputIterator1,
+                                                      InputIterator2)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(first,first);
+}
+
+void TestMismatchDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::mismatch(sys,
+                     vec.begin(),
+                     vec.begin(),
+                     vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestMismatchDispatchExplicit);
+
+
+template <typename InputIterator1, typename InputIterator2>
+thrust::pair<InputIterator1, InputIterator2> mismatch(my_tag,
+                                                      InputIterator1 first,
+                                                      InputIterator1,
+                                                      InputIterator2)
+{
+    *first = 13;
+    return thrust::make_pair(first,first);
+}
+
+void TestMismatchDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::mismatch(thrust::retag<my_tag>(vec.begin()),
+                     thrust::retag<my_tag>(vec.begin()),
+                     thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestMismatchDispatchImplicit);
+
diff --git a/thrust/testing/mr_disjoint_pool.cu b/thrust/testing/mr_disjoint_pool.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8499c6c53825b9ebf70960022d58385ac234542d
--- /dev/null
+++ b/thrust/testing/mr_disjoint_pool.cu
@@ -0,0 +1,297 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/disjoint_pool.h>
+#include <thrust/mr/new.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/mr/disjoint_sync_pool.h>
+#endif
+
+struct alloc_id
+{
+    std::size_t id;
+    std::size_t size;
+    std::size_t alignment;
+    std::size_t offset;
+
+    __host__ __device__
+    bool operator==(const alloc_id & other) const
+    {
+        return id == other.id && size == other.size && alignment == other.alignment;
+    }
+
+    alloc_id operator+(std::size_t size) const
+    {
+        alloc_id ret;
+        ret.id = id;
+        ret.size = size;
+        ret.alignment = alignment;
+        ret.offset = size;
+        return ret;
+    }
+};
+
+namespace thrust { namespace detail {
+template<>
+struct pointer_traits<alloc_id>
+{
+    template<typename>
+    struct rebind
+    {
+        typedef alloc_id other;
+    };
+
+    // implemented for the purposes of alignment test in disjoint pool's do_deallocate
+    static void * get(const alloc_id & id)
+    {
+        return reinterpret_cast<void *>(id.alignment);
+    }
+};
+}}
+
+class dummy_resource THRUST_FINAL : public thrust::mr::memory_resource<alloc_id>
+{
+public:
+    dummy_resource() : id_to_allocate(0), id_to_deallocate(0)
+    {
+    }
+
+    ~dummy_resource()
+    {
+        ASSERT_EQUAL(id_to_allocate, 0u);
+        ASSERT_EQUAL(id_to_deallocate, 0u);
+    }
+
+    virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    {
+        ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
+
+        alloc_id ret;
+        ret.id = id_to_allocate;
+        ret.size = bytes;
+        ret.alignment = alignment;
+
+        id_to_allocate = 0;
+
+        return ret;
+    }
+
+    virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    {
+        ASSERT_EQUAL(p.size, bytes);
+        ASSERT_EQUAL(p.alignment, alignment);
+
+        if (id_to_deallocate != 0)
+        {
+            ASSERT_EQUAL(p.id, id_to_deallocate);
+            id_to_deallocate = 0;
+        }
+    }
+
+    std::size_t id_to_allocate;
+    std::size_t id_to_deallocate;
+};
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointPool()
+{
+    dummy_resource upstream;
+    thrust::mr::new_delete_resource bookkeeper;
+
+    typedef PoolTemplate<
+        dummy_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = false;
+
+    // avoid having the destructor run when an assertion failure is raised
+    // (the destructor will try to release, which in turn calls do_deallocate,
+    // which may fail with an assertion failure exception...)
+    Pool * pool = new Pool(&upstream, &bookkeeper, opts);
+
+    upstream.id_to_allocate = 1;
+
+    // first allocation
+    alloc_id a1 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    // due to chunking, the above allocation should be enough for the next one too
+    alloc_id a2 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a2.id, 1u);
+
+    // deallocating and allocating back should give the same resource back
+    pool->do_deallocate(a1, 12, THRUST_MR_DEFAULT_ALIGNMENT);
+    alloc_id a3 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, a3.id);
+    ASSERT_EQUAL(a1.size, a3.size);
+    ASSERT_EQUAL(a1.alignment, a3.alignment);
+    ASSERT_EQUAL(a1.offset, a3.offset);
+
+    // allocating over-aligned memory should give non-cached results
+    upstream.id_to_allocate = 2;
+    alloc_id a4 = pool->do_allocate(32, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(a4.id, 2u);
+    ASSERT_EQUAL(a4.size, 32u);
+    ASSERT_EQUAL(a4.alignment, (std::size_t)THRUST_MR_DEFAULT_ALIGNMENT * 2);
+
+    // and deallocating it should return it back to upstream
+    upstream.id_to_deallocate = 2;
+    pool->do_deallocate(a4, 32u, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // release actually returns properly sized memory to upstream
+    upstream.id_to_deallocate = 1;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and does the same for oversized/overaligned memory
+    upstream.id_to_allocate = 3;
+    alloc_id a5 = pool->do_allocate(1024, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    upstream.id_to_deallocate = 3;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and after that, the formerly cached memory isn't used anymore,
+    // so new memory from upstream is returned back
+    upstream.id_to_allocate = 4;
+    alloc_id a6 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    // destruction also returns memory
+    upstream.id_to_deallocate = 4;
+
+    // actually destroy the pool; reasons why RAII is not used outlined at the beginning
+    // of this function
+    delete pool;
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+}
+
+void TestDisjointUnsynchronizedPool()
+{
+    TestDisjointPool<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointUnsynchronizedPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestDisjointSynchronizedPool()
+{
+    TestDisjointPool<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointSynchronizedPool);
+#endif
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointPoolCachingOversized()
+{
+    dummy_resource upstream;
+    thrust::mr::new_delete_resource bookkeeper;
+
+    typedef PoolTemplate<
+        dummy_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = true;
+    opts.largest_block_size = 1024;
+
+    Pool pool(&upstream, &bookkeeper, opts);
+
+    upstream.id_to_allocate = 1;
+    alloc_id a1 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    upstream.id_to_allocate = 2;
+    alloc_id a2 = pool.do_allocate(64, 32);
+    ASSERT_EQUAL(a2.id, 2u);
+
+    pool.do_deallocate(a2, 64, 32);
+    pool.do_deallocate(a1, 2048, 32);
+
+    // make sure a good fit is used from the cache
+    alloc_id a3 = pool.do_allocate(32, 32);
+    ASSERT_EQUAL(a3.id, 2u);
+
+    alloc_id a4 = pool.do_allocate(1024, 32);
+    ASSERT_EQUAL(a4.id, 1u);
+
+    pool.do_deallocate(a4, 1024, 32);
+
+    // make sure that a new block is allocated when there's nothing cached with
+    // the required alignment
+    upstream.id_to_allocate = 3;
+    alloc_id a5 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    pool.release();
+
+    // make sure that release actually clears caches
+    upstream.id_to_allocate = 4;
+    alloc_id a6 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    upstream.id_to_allocate = 5;
+    alloc_id a7 = pool.do_allocate(2048, 1024);
+    ASSERT_EQUAL(a7.id, 5u);
+
+    pool.do_deallocate(a7, 2048, 1024);
+
+    // make sure that the 'ridiculousness' factor for size (options.cached_size_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 6;
+    alloc_id a8 = pool.do_allocate(24, 1024);
+    ASSERT_EQUAL(a8.id, 6u);
+
+    // make sure that the 'ridiculousness' factor for alignment (options.cached_alignment_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 7;
+    alloc_id a9 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a9.id, 7u);
+}
+
+void TestDisjointUnsynchronizedPoolCachingOversized()
+{
+    TestDisjointPoolCachingOversized<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointUnsynchronizedPoolCachingOversized);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestDisjointSynchronizedPoolCachingOversized()
+{
+    TestDisjointPoolCachingOversized<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointSynchronizedPoolCachingOversized);
+#endif
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointGlobalPool()
+{
+    typedef PoolTemplate<
+        thrust::mr::new_delete_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    ASSERT_EQUAL(thrust::mr::get_global_resource<Pool>() != NULL, true);
+}
+
+void TestUnsynchronizedDisjointGlobalPool()
+{
+    TestDisjointGlobalPool<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedDisjointGlobalPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedDisjointGlobalPool()
+{
+    TestDisjointGlobalPool<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedDisjointGlobalPool);
+#endif
+
diff --git a/thrust/testing/mr_new.cu b/thrust/testing/mr_new.cu
new file mode 100644
index 0000000000000000000000000000000000000000..df0f3fde54da040895fa879c1d6e8f71f37cbebf
--- /dev/null
+++ b/thrust/testing/mr_new.cu
@@ -0,0 +1,36 @@
+#include <unittest/unittest.h>
+#include <thrust/mr/new.h>
+#include <thrust/fill.h>
+
+template<typename MemoryResource>
+void TestAlignment(MemoryResource memres, std::size_t size, std::size_t alignment)
+{
+    void * ptr = memres.do_allocate(size, alignment);
+    ASSERT_EQUAL(reinterpret_cast<std::size_t>(ptr) % alignment, 0u);
+
+    char * char_ptr = reinterpret_cast<char *>(ptr);
+    thrust::fill(char_ptr, char_ptr + size, 0);
+
+    memres.do_deallocate(ptr, size, alignment);
+}
+
+static const std::size_t MinTestedSize = 32;
+static const std::size_t MaxTestedSize = 8 * 1024;
+static const std::size_t TestedSizeStep = 1;
+
+static const std::size_t MinTestedAlignment = 16;
+static const std::size_t MaxTestedAlignment = 4 * 1024;
+static const std::size_t TestedAlignmentShift = 1;
+
+void TestNewDeleteResourceAlignedAllocation()
+{
+    for (std::size_t size = MinTestedSize; size <= MaxTestedSize; size += TestedSizeStep)
+    {
+        for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+            alignment <<= TestedAlignmentShift)
+        {
+            TestAlignment(thrust::mr::new_delete_resource(), size, alignment);
+        }
+    }
+}
+DECLARE_UNITTEST(TestNewDeleteResourceAlignedAllocation);
diff --git a/thrust/testing/mr_pool.cu b/thrust/testing/mr_pool.cu
new file mode 100644
index 0000000000000000000000000000000000000000..75b18f038c565acd7c9f1f58353dfbcc9f759cb4
--- /dev/null
+++ b/thrust/testing/mr_pool.cu
@@ -0,0 +1,360 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/pool.h>
+#include <thrust/mr/new.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/mr/sync_pool.h>
+#endif
+
+template<typename T>
+struct reference
+{
+    typedef T & type;
+};
+
+template<>
+struct reference<void>
+{
+    typedef void type;
+};
+
+struct unit {};
+
+template<typename T>
+struct tracked_pointer : thrust::iterator_facade<
+                            tracked_pointer<T>,
+                            T,
+                            thrust::host_system_tag,
+                            thrust::random_access_traversal_tag,
+                            typename reference<T>::type,
+                            std::ptrdiff_t
+                         >
+{
+    typedef T * raw_pointer;
+
+    std::size_t id;
+    std::size_t size;
+    std::size_t alignment;
+    std::size_t offset;
+    void * ptr;
+
+    __host__ __device__
+    explicit tracked_pointer(T * ptr = NULL) : id(), size(), alignment(), offset(), ptr(ptr)
+    {
+    }
+
+    __host__ __device__
+    ~tracked_pointer()
+    {
+    }
+
+    template<typename U>
+    operator tracked_pointer<U>() const
+    {
+        tracked_pointer<U> ret;
+        ret.id = id;
+        ret.size = size;
+        ret.alignment = alignment;
+        ret.offset = offset;
+        ret.ptr = ptr;
+        return ret;
+    }
+
+    __host__ __device__
+    std::ptrdiff_t distance_to(const tracked_pointer & other) const
+    {
+        return static_cast<T *>(other.ptr) - static_cast<T *>(ptr);
+    }
+
+    __host__ __device__
+    T * get() const
+    {
+        return static_cast<T *>(ptr);
+    }
+
+    // globally qualified, because MSVC somehow prefers the name from the dependent base
+    // of this class over the `reference` template that's visible in the global namespace of this file...
+    __host__ __device__
+    typename ::reference<T>::type dereference() const
+    {
+        return *get();
+    }
+
+    __host__ __device__
+    void increment()
+    {
+        advance(1);
+    }
+
+    __host__ __device__
+    void decrement()
+    {
+        advance(-1);
+    }
+
+    __host__ __device__
+    void advance(std::ptrdiff_t diff)
+    {
+        ptr = get() + diff;
+        offset += diff * sizeof(T);
+    }
+
+    __host__ __device__
+    bool equal(const tracked_pointer & other) const
+    {
+        return id == other.id && size == other.size && alignment == other.alignment && offset == other.offset && ptr == other.ptr;
+    }
+};
+
+class tracked_resource THRUST_FINAL : public thrust::mr::memory_resource<tracked_pointer<void> >
+{
+public:
+    tracked_resource() : id_to_allocate(0), id_to_deallocate(0)
+    {
+    }
+
+    ~tracked_resource()
+    {
+        ASSERT_EQUAL(id_to_allocate, 0u);
+        ASSERT_EQUAL(id_to_deallocate, 0u);
+    }
+
+    virtual tracked_pointer<void> do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
+
+        void * raw = upstream.do_allocate(n, alignment);
+        tracked_pointer<void> ret(raw);
+        ret.id = id_to_allocate;
+        ret.size = n;
+        ret.alignment = alignment;
+
+        id_to_allocate = 0;
+
+        return ret;
+    }
+
+    virtual void do_deallocate(tracked_pointer<void> p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        ASSERT_EQUAL(p.size, n);
+        ASSERT_EQUAL(p.alignment, alignment);
+
+        if (id_to_deallocate != 0)
+        {
+            ASSERT_EQUAL(p.id, id_to_deallocate);
+            id_to_deallocate = 0;
+        }
+
+        upstream.do_deallocate(p.ptr, n, alignment);
+    }
+
+    std::size_t id_to_allocate;
+    std::size_t id_to_deallocate;
+
+private:
+    thrust::mr::new_delete_resource upstream;
+};
+
+template<template<typename> class PoolTemplate>
+void TestPool()
+{
+    tracked_resource upstream;
+
+    upstream.id_to_allocate = -1u;
+
+    typedef PoolTemplate<
+        tracked_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = false;
+
+    // avoid having the destructor run when an assertion failure is raised
+    // (the destructor will try to release, which in turn calls do_deallocate,
+    // which may fail with an assertion failure exception...)
+    Pool * pool = new Pool(&upstream, opts);
+
+    upstream.id_to_allocate = 1;
+
+    // first allocation
+    tracked_pointer<void> a1 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    // due to chunking, the above allocation should be enough for the next one too
+    tracked_pointer<void> a2 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a2.id, 1u);
+
+    // deallocating and allocating back should give the same resource back
+    pool->do_deallocate(a1, 12, THRUST_MR_DEFAULT_ALIGNMENT);
+    tracked_pointer<void> a3 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, a3.id);
+    ASSERT_EQUAL(a1.size, a3.size);
+    ASSERT_EQUAL(a1.alignment, a3.alignment);
+    ASSERT_EQUAL(a1.offset, a3.offset);
+
+    // allocating over-aligned memory should give non-cached results
+    // unlike with the disjoint version, nothing sensible can be said about the chunk size
+    upstream.id_to_allocate = 2;
+    tracked_pointer<void> a4 = pool->do_allocate(32, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(a4.id, 2u);
+    ASSERT_EQUAL(a4.alignment, (std::size_t)THRUST_MR_DEFAULT_ALIGNMENT * 2);
+
+    // and deallocating it should return it back to upstream
+    upstream.id_to_deallocate = 2;
+    pool->do_deallocate(a4, 32u, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // release actually returns properly sized memory to upstream
+    upstream.id_to_deallocate = 1;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and does the same for oversized/overaligned memory
+    upstream.id_to_allocate = 3;
+    tracked_pointer<void> a5 = pool->do_allocate(1024, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    upstream.id_to_deallocate = 3;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and after that, the formerly cached memory isn't used anymore,
+    // so new memory from upstream is returned back
+    upstream.id_to_allocate = 4;
+    tracked_pointer<void> a6 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    // destruction also returns memory
+    upstream.id_to_deallocate = 4;
+
+    // actually destroy the pool; reasons why RAII is not used outlined at the beginning
+    // of this function
+    delete pool;
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+}
+
+void TestUnsynchronizedPool()
+{
+    TestPool<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedPool()
+{
+    TestPool<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedPool);
+#endif
+
+template<template<typename> class PoolTemplate>
+void TestPoolCachingOversized()
+{
+    tracked_resource upstream;
+
+    upstream.id_to_allocate = -1u;
+
+    typedef PoolTemplate<
+        tracked_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = true;
+    opts.largest_block_size = 1024;
+
+    Pool pool(&upstream, opts);
+
+    upstream.id_to_allocate = 1;
+    tracked_pointer<void> a1 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    upstream.id_to_allocate = 2;
+    tracked_pointer<void> a2 = pool.do_allocate(64, 32);
+    ASSERT_EQUAL(a2.id, 2u);
+
+    pool.do_deallocate(a2, 64, 32);
+    pool.do_deallocate(a1, 2048, 32);
+
+    // make sure a good fit is used from the cache
+    tracked_pointer<void> a3 = pool.do_allocate(32, 32);
+    ASSERT_EQUAL(a3.id, 2u);
+
+    tracked_pointer<void> a4 = pool.do_allocate(1024, 32);
+    ASSERT_EQUAL(a4.id, 1u);
+
+    pool.do_deallocate(a4, 1024, 32);
+
+    // make sure that a new block is allocated when there's nothing cached with
+    // the required alignment
+    upstream.id_to_allocate = 3;
+    tracked_pointer<void> a5 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    pool.release();
+
+    // make sure that release actually clears caches
+    upstream.id_to_allocate = 4;
+    tracked_pointer<void> a6 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    upstream.id_to_allocate = 5;
+    tracked_pointer<void> a7 = pool.do_allocate(2048, 1024);
+    ASSERT_EQUAL(a7.id, 5u);
+
+    pool.do_deallocate(a7, 2048, 1024);
+
+    // make sure that the 'ridiculousness' factor for size (options.cached_size_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 6;
+    tracked_pointer<void> a8 = pool.do_allocate(24, 1024);
+    ASSERT_EQUAL(a8.id, 6u);
+
+    // make sure that the 'ridiculousness' factor for alignment (options.cached_alignment_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 7;
+    tracked_pointer<void> a9 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a9.id, 7u);
+}
+
+void TestUnsynchronizedPoolCachingOversized()
+{
+    TestPoolCachingOversized<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedPoolCachingOversized);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedPoolCachingOversized()
+{
+    TestPoolCachingOversized<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedPoolCachingOversized);
+#endif
+
+template<template<typename> class PoolTemplate>
+void TestGlobalPool()
+{
+    typedef PoolTemplate<
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    ASSERT_EQUAL(thrust::mr::get_global_resource<Pool>() != NULL, true);
+}
+
+void TestUnsynchronizedGlobalPool()
+{
+    TestGlobalPool<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedGlobalPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedGlobalPool()
+{
+    TestGlobalPool<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedGlobalPool);
+#endif
+
diff --git a/thrust/testing/mr_pool_options.cu b/thrust/testing/mr_pool_options.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b53e336df943783c540ec798bdbac08639fdb111
--- /dev/null
+++ b/thrust/testing/mr_pool_options.cu
@@ -0,0 +1,63 @@
+#include <unittest/unittest.h>
+#include <thrust/mr/pool_options.h>
+
+void TestPoolOptionsBasicValidity()
+{
+    thrust::mr::pool_options options = thrust::mr::pool_options();
+    ASSERT_EQUAL(options.validate(), false);
+
+    options.max_blocks_per_chunk = 1024;
+    options.max_bytes_per_chunk = 1024 * 1024;
+    options.smallest_block_size = 8;
+    options.largest_block_size = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the minimum number of blocks per chunk is bigger than the max
+    options.min_blocks_per_chunk = 1025;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_blocks_per_chunk = 128;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the minimum number of bytes per chunk is bigger than the max
+    options.min_bytes_per_chunk = 1025 * 1024;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_bytes_per_chunk = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // smallest block size is bigger than the largest block size
+    options.smallest_block_size = 2048;
+    ASSERT_EQUAL(options.validate(), false);
+    options.smallest_block_size = 8;
+    ASSERT_EQUAL(options.validate(), true);
+}
+DECLARE_UNITTEST(TestPoolOptionsBasicValidity);
+
+void TestPoolOptionsComplexValidity()
+{
+    thrust::mr::pool_options options = thrust::mr::pool_options();
+    ASSERT_EQUAL(options.validate(), false);
+
+    options.max_blocks_per_chunk = 1024;
+    options.max_bytes_per_chunk = 1024 * 1024;
+    options.smallest_block_size = 8;
+    options.largest_block_size = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    options.min_bytes_per_chunk = 2 * 1024;
+    options.max_bytes_per_chunk = 256 * 1024;
+
+    // the biggest allowed allocation (deduced from blocks in chunks)
+    // is smaller than the minimal allowed one (defined in bytes)
+    options.max_blocks_per_chunk = 1;
+    ASSERT_EQUAL(options.validate(), false);
+    options.max_blocks_per_chunk = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the smallest allowed allocation (deduced from blocks in chunks)
+    // is bigger than the maximum allowed one (defined in bytes)
+    options.min_blocks_per_chunk = 1024 * 1024;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_blocks_per_chunk = 128;
+    ASSERT_EQUAL(options.validate(), true);
+}
+DECLARE_UNITTEST(TestPoolOptionsComplexValidity);
diff --git a/thrust/testing/omp/CMakeLists.txt b/thrust/testing/omp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89ea9bb0c5bb30a40317f60f48f318cd3c026114
--- /dev/null
+++ b/thrust/testing/omp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "OMP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "omp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/thrust/testing/omp/nvcc_independence.cpp b/thrust/testing/omp/nvcc_independence.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c870928a8a36c94cf9a835dceed99fb401524fb9
--- /dev/null
+++ b/thrust/testing/omp/nvcc_independence.cpp
@@ -0,0 +1,75 @@
+#include <unittest/unittest.h>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/system_error.h>
+
+void TestNvccIndependenceTransform(void)
+{
+  typedef int T;
+  const int n = 10;
+
+  thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_input = h_input;
+
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  thrust::transform(h_input.begin(), h_input.end(), h_output.begin(), thrust::negate<T>());
+  thrust::transform(d_input.begin(), d_input.end(), d_output.begin(), thrust::negate<T>());
+  
+  ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_UNITTEST(TestNvccIndependenceTransform);
+
+void TestNvccIndependenceReduce(void)
+{
+  typedef int T;
+  const int n = 10;
+
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+
+  T init = 13;
+
+  T h_result = thrust::reduce(h_data.begin(), h_data.end(), init);
+  T d_result = thrust::reduce(d_data.begin(), d_data.end(), init);
+
+  ASSERT_ALMOST_EQUAL(h_result, d_result);
+}
+DECLARE_UNITTEST(TestNvccIndependenceReduce);
+
+void TestNvccIndependenceExclusiveScan(void)
+{
+  typedef int T;
+  const int n = 10;
+
+  thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_input = h_input;
+
+  thrust::host_vector<T>   h_output(n);
+  thrust::device_vector<T> d_output(n);
+  
+  thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+  thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_UNITTEST(TestNvccIndependenceExclusiveScan);
+
+void TestNvccIndependenceSort(void)
+{
+  typedef int T;
+  const int n = 10;
+
+  thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+  thrust::device_vector<T> d_data = h_data;
+
+  thrust::sort(h_data.begin(), h_data.end(), thrust::less<T>());
+  thrust::sort(d_data.begin(), d_data.end(), thrust::less<T>());
+
+  ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_UNITTEST(TestNvccIndependenceSort);
+
diff --git a/thrust/testing/omp/reduce_intervals.cu b/thrust/testing/omp/reduce_intervals.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e2ba4414069009c21c8986c0761c93f2820974d3
--- /dev/null
+++ b/thrust/testing/omp/reduce_intervals.cu
@@ -0,0 +1,105 @@
+#include <unittest/unittest.h>
+
+#include <thrust/functional.h>
+#include <thrust/system/detail/internal/decompose.h>
+#include <thrust/system/omp/detail/reduce_intervals.h>
+
+// CPP reference implementation 
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction,
+         typename Decomposition>
+void reduce_intervals(InputIterator input,
+                      OutputIterator output,
+                      BinaryFunction binary_op,
+                      Decomposition decomp)
+{
+  typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
+  typedef typename Decomposition::index_type index_type;
+
+  // wrap binary_op
+  thrust::detail::wrapped_function<
+    BinaryFunction,
+    OutputType
+  > wrapped_binary_op(binary_op);
+
+  for(index_type i = 0; i < decomp.size(); ++i, ++output)
+  {
+    InputIterator begin = input + decomp[i].begin();
+    InputIterator end   = input + decomp[i].end();
+
+    if (begin != end)
+    {
+      OutputType sum = *begin;
+
+      ++begin;
+
+      while (begin != end)
+      {
+        sum = wrapped_binary_op(sum, *begin);
+        ++begin;
+      }
+
+      *output = sum;
+    }
+  }
+}
+
+
+void TestOmpReduceIntervalsSimple(void)
+{
+  typedef int T;
+  typedef thrust::device_vector<T> Vector;
+
+  using thrust::system::omp::detail::reduce_intervals;
+  using thrust::system::detail::internal::uniform_decomposition;
+
+  Vector input(10, 1);
+
+  thrust::omp::tag omp_tag;
+    
+  {
+    uniform_decomposition<int> decomp(10, 10, 1);
+    Vector output(decomp.size());
+    reduce_intervals(omp_tag, input.begin(), output.begin(), thrust::plus<T>(), decomp);
+
+    ASSERT_EQUAL(output[0], 10);
+  }
+  
+  {
+    uniform_decomposition<int> decomp(10, 6, 2);
+    Vector output(decomp.size());
+    reduce_intervals(omp_tag, input.begin(), output.begin(), thrust::plus<T>(), decomp);
+
+    ASSERT_EQUAL(output[0], 6);
+    ASSERT_EQUAL(output[1], 4);
+  }
+}
+DECLARE_UNITTEST(TestOmpReduceIntervalsSimple);
+
+
+template<typename T>
+struct TestOmpReduceIntervals
+{
+  void operator()(const size_t n)
+  {
+    using thrust::system::omp::detail::reduce_intervals;
+    using thrust::system::detail::internal::uniform_decomposition;
+    
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    uniform_decomposition<size_t> decomp(n, 7, 100);
+
+    thrust::host_vector<T>   h_output(decomp.size());
+    thrust::device_vector<T> d_output(decomp.size());
+    
+    ::reduce_intervals(h_input.begin(), h_output.begin(), thrust::plus<T>(), decomp);
+    thrust::system::omp::tag omp_tag;
+    reduce_intervals(omp_tag, d_input.begin(), d_output.begin(), thrust::plus<T>(), decomp);
+
+    ASSERT_EQUAL(h_output, d_output);
+  }
+};
+VariableUnitTest<TestOmpReduceIntervals, IntegralTypes> TestOmpReduceIntervalsInstance;
+
diff --git a/thrust/testing/out_of_memory_recovery.cu b/thrust/testing/out_of_memory_recovery.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e4f0c327b2212a7a78c148266b600d56f8060e7
--- /dev/null
+++ b/thrust/testing/out_of_memory_recovery.cu
@@ -0,0 +1,33 @@
+// Regression test for NVBug 2720132.
+//
+// Summary of 2720132:
+//
+// 1. The large allocation fails due to running out of memory.
+// 2. A `thrust::system::system_error` exception is thrown.
+// 3. Local objects are destroyed as the stack is unwound, leading to the destruction of `x`.
+// 4. `x` runs a parallel algorithm in its destructor to call the destructors of all of its elements.
+// 5. Launching that parallel algorithm fails because of the prior CUDA out of memory error.
+// 6. A `thrust::system::system_error` exception is thrown.
+// 7. Because we've already got an active exception, `terminate` is called.
+
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/detail/cstdint.h>
+
+struct non_trivial
+{
+  __host__ __device__ non_trivial() {}
+  __host__ __device__ ~non_trivial() {}
+};
+
+void test_out_of_memory_recovery()
+{
+  try
+  {
+    thrust::device_vector<non_trivial> x(1);
+
+    thrust::device_vector<thrust::detail::uint32_t> y(0x00ffffffffffffff);
+  }
+  catch (...) { }
+}
+DECLARE_UNITTEST(test_out_of_memory_recovery);
diff --git a/thrust/testing/pair.cu b/thrust/testing/pair.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a213265f30d8fb0a8416297b50f7317f8751cf79
--- /dev/null
+++ b/thrust/testing/pair.cu
@@ -0,0 +1,271 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/tuple.h>
+#include <thrust/swap.h>
+
+template <typename T>
+struct TestPairManipulation
+{
+  void operator()(void)
+  {
+    typedef thrust::pair<T,T> P;
+
+    // test null constructor
+    P p1;
+    ASSERT_EQUAL(T(0), p1.first);
+    ASSERT_EQUAL(T(0), p1.second);
+
+    // test individual value manipulation
+    p1.first  = T(1);
+    p1.second = T(2);
+    ASSERT_EQUAL(T(1), p1.first);
+    ASSERT_EQUAL(T(2), p1.second);
+
+    // test copy constructor
+    P p2(p1);
+    ASSERT_EQUAL(p1.first,  p2.first);
+    ASSERT_EQUAL(p1.second, p2.second);
+
+    // test copy from std::pair constructor
+    std::pair<T,T> sp(p1.first, p1.second);
+    ASSERT_EQUAL(p1.first,  sp.first);
+    ASSERT_EQUAL(p1.second, sp.second);
+
+    // test initialization
+    P p3 = p2;
+    ASSERT_EQUAL(p2.first,  p3.first);
+    ASSERT_EQUAL(p2.second, p3.second);
+
+    // test initialization from std::pair
+    P p4 = sp;
+    ASSERT_EQUAL(sp.first,  p4.first);
+    ASSERT_EQUAL(sp.second, p4.second);
+
+    // test copy from pair
+    p4.first  = T(2);
+    p4.second = T(3);
+    
+    P p5;
+    p5 = p4;
+    ASSERT_EQUAL(p4.first,  p5.first);
+    ASSERT_EQUAL(p4.second, p5.second);
+
+    // test copy from std::pair
+    sp.first  = T(4);
+    sp.second = T(5);
+
+    P p6;
+    p6 = sp;
+    ASSERT_EQUAL(sp.first,  p6.first);
+    ASSERT_EQUAL(sp.second, p6.second);
+
+    // test initialization from make_pair
+    P p7 = thrust::make_pair(T(6),T(7));
+    ASSERT_EQUAL(T(6), p7.first);
+    ASSERT_EQUAL(T(7), p7.second);
+
+    // test copy from make_pair
+    p7 = thrust::make_pair(T(8),T(9));
+    ASSERT_EQUAL(T(8), p7.first);
+    ASSERT_EQUAL(T(9), p7.second);
+  }
+};
+SimpleUnitTest<TestPairManipulation, NumericTypes> TestPairManipulationInstance;
+
+
+template <typename T>
+struct TestPairComparison
+{
+  void operator()(void)
+  {
+    typedef thrust::pair<T,T> P;
+
+    P x, y;
+
+    // test operator ==
+    x.first = x.second = y.first = y.second = T(0);
+    ASSERT_EQUAL(true, x == y);
+    ASSERT_EQUAL(true, y == x);
+
+    x.first = y.first = y.second = T(0);
+    x.second = T(1);
+    ASSERT_EQUAL(false, x == y);
+    ASSERT_EQUAL(false, y == x);
+
+    // test operator<
+    x.first  = T(0); x.second = T(0);
+    y.first  = T(0); y.second = T(0);
+    ASSERT_EQUAL(false, x < y);
+    ASSERT_EQUAL(false, y < x);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(2); y.second = T(3);
+    ASSERT_EQUAL(true,  x < y);
+    ASSERT_EQUAL(false, y < x);
+
+    x.first  = T(0); x.second = T(0);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x < y);
+    ASSERT_EQUAL(false, y < x);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(0); y.second = T(2);
+    ASSERT_EQUAL(true,  x < y);
+    ASSERT_EQUAL(false, y < x);
+
+    // test operator!=
+    x.first = y.first = y.second = T(0);
+    x.second = T(1);
+    ASSERT_EQUAL(true, x != y);
+    ASSERT_EQUAL(true, y != x);
+
+    x.first = x.second = y.first = y.second = T(0);
+    ASSERT_EQUAL(false, x != y);
+    ASSERT_EQUAL(false, y != x);
+
+    // test operator>
+    x.first  = T(0); x.second = T(0);
+    y.first  = T(0); y.second = T(0);
+    ASSERT_EQUAL(false, x > y);
+    ASSERT_EQUAL(false, y > x);
+
+    x.first  = T(2); x.second = T(3);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x > y);
+    ASSERT_EQUAL(false, y > x);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(0); y.second = T(0);
+    ASSERT_EQUAL(true,  x > y);
+    ASSERT_EQUAL(false, y > x);
+
+    x.first  = T(0); x.second = T(2);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x > y);
+    ASSERT_EQUAL(false, y > x);
+
+
+    // test operator <=
+    x.first = x.second = y.first = y.second = T(0);
+    ASSERT_EQUAL(true, x <= y);
+    ASSERT_EQUAL(true, y <= x);
+
+    x.first = y.first = y.second = T(0);
+    x.second = T(1);
+    ASSERT_EQUAL(false, x <= y);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(2); y.second = T(3);
+    ASSERT_EQUAL(true,  x <= y);
+    ASSERT_EQUAL(false, y <= x);
+
+    x.first  = T(0); x.second = T(0);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x <= y);
+    ASSERT_EQUAL(false, y <= x);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(0); y.second = T(2);
+    ASSERT_EQUAL(true,  x <= y);
+    ASSERT_EQUAL(false, y <= x);
+
+
+    // test operator >=
+    x.first = x.second = y.first = y.second = T(0);
+    ASSERT_EQUAL(true, x >= y);
+    ASSERT_EQUAL(true, y >= x);
+
+    x.first = x.second = y.first = T(0);
+    y.second = T(1);
+    ASSERT_EQUAL(false, x >= y);
+
+    x.first  = T(2); x.second = T(3);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x >= y);
+    ASSERT_EQUAL(false, y >= x);
+
+    x.first  = T(0); x.second = T(1);
+    y.first  = T(0); y.second = T(0);
+    ASSERT_EQUAL(true,  x >= y);
+    ASSERT_EQUAL(false, y >= x);
+
+    x.first  = T(0); x.second = T(2);
+    y.first  = T(0); y.second = T(1);
+    ASSERT_EQUAL(true,  x >= y);
+    ASSERT_EQUAL(false, y >= x);
+  }
+};
+SimpleUnitTest<TestPairComparison, NumericTypes> TestPairComparisonInstance;
+
+
+template<typename T>
+struct TestPairGet
+{
+  void operator()(void)
+  {
+    thrust::host_vector<T> data = unittest::random_integers<T>(2);
+
+    thrust::pair<T,T> p(data[0], data[1]);
+
+    ASSERT_EQUAL(data[0], thrust::get<0>(p));
+    ASSERT_EQUAL(data[1], thrust::get<1>(p));
+  }
+};
+SimpleUnitTest<TestPairGet, BuiltinNumericTypes> TestPairGetInstance;
+
+
+void TestPairTupleSize(void)
+{
+  int result = thrust::tuple_size< thrust::pair<int,int> >::value;
+  ASSERT_EQUAL(2, result);
+};
+DECLARE_UNITTEST(TestPairTupleSize);
+
+
+void TestPairTupleElement(void)
+{
+  typedef thrust::tuple_element<0, thrust::pair<int, float> >::type type0;
+  typedef thrust::tuple_element<1, thrust::pair<int, float> >::type type1;
+
+  ASSERT_EQUAL_QUIET(typeid(int),   typeid(type0));
+  ASSERT_EQUAL_QUIET(typeid(float), typeid(type1));
+};
+DECLARE_UNITTEST(TestPairTupleElement);
+
+
+void TestPairSwap(void)
+{
+  int x = 7;
+  int y = 13;
+
+  int z = 42;
+  int w = 0;
+
+  thrust::pair<int,int> a(x,y);
+  thrust::pair<int,int> b(z,w);
+
+  thrust::swap(a,b);
+
+  ASSERT_EQUAL(z, a.first);
+  ASSERT_EQUAL(w, a.second);
+  ASSERT_EQUAL(x, b.first);
+  ASSERT_EQUAL(y, b.second);
+
+
+  typedef thrust::pair<user_swappable,user_swappable> swappable_pair;
+
+  thrust::host_vector<swappable_pair>   h_v1(1), h_v2(1);
+  thrust::device_vector<swappable_pair> d_v1(1), d_v2(1);
+
+  thrust::swap_ranges(h_v1.begin(), h_v1.end(), h_v2.begin());
+  thrust::swap_ranges(d_v1.begin(), d_v1.end(), d_v2.begin());
+
+  swappable_pair ref(user_swappable(true), user_swappable(true));
+
+  ASSERT_EQUAL_QUIET(ref, h_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, h_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, (swappable_pair)d_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, (swappable_pair)d_v1[0]);
+}
+DECLARE_UNITTEST(TestPairSwap);
+
diff --git a/thrust/testing/pair_reduce.cu b/thrust/testing/pair_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ebdab659798e3a6773d29be349d234bba5028f3c
--- /dev/null
+++ b/thrust/testing/pair_reduce.cu
@@ -0,0 +1,58 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/transform.h>
+#include <thrust/reduce.h>
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+struct add_pairs
+{
+  template <typename Pair1, typename Pair2>
+  __host__ __device__
+    Pair1 operator()(const Pair1 &x, const Pair2 &y)
+  {
+    return thrust::make_pair(x.first + y.first, x.second + y.second);
+  } // end operator()
+}; // end add_pairs
+
+
+template <typename T>
+  struct TestPairReduce
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_pairs(n);
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+    thrust::device_vector<T> d_p1 = h_p1;
+    thrust::device_vector<T> d_p2 = h_p2;
+    thrust::device_vector<P> d_pairs = h_pairs;
+
+    P init = thrust::make_pair(13,13);
+
+    // reduce on the host
+    P h_result = thrust::reduce(h_pairs.begin(), h_pairs.end(), init, add_pairs());
+
+    // reduce on the device
+    P d_result = thrust::reduce(d_pairs.begin(), d_pairs.end(), init, add_pairs());
+
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+  }
+}; // end TestPairReduce
+VariableUnitTest<TestPairReduce, SignedIntegralTypes> TestPairReduceInstance;
+
diff --git a/thrust/testing/pair_scan.cu b/thrust/testing/pair_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b1bfe064b9c9e6ef38ee56802d11b9fd4128227a
--- /dev/null
+++ b/thrust/testing/pair_scan.cu
@@ -0,0 +1,89 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/transform.h>
+#include <thrust/scan.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+struct add_pairs
+{
+  template <typename Pair1, typename Pair2>
+  __host__ __device__
+    Pair1 operator()(const Pair1 &x, const Pair2 &y)
+  {
+    return thrust::make_pair(x.first + y.first, x.second + y.second);
+  } // end operator()
+}; // end add_pairs
+
+
+template <typename T>
+  struct TestPairScan
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_pairs(n);
+    thrust::host_vector<P>   h_output(n);
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+    thrust::device_vector<T> d_p1 = h_p1;
+    thrust::device_vector<T> d_p2 = h_p2;
+    thrust::device_vector<P> d_pairs = h_pairs;
+    thrust::device_vector<P> d_output(n);
+
+    P init = thrust::make_pair(13,13);
+
+    // scan with plus
+    thrust::inclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), add_pairs());
+    thrust::inclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), add_pairs());
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+
+    // scan with maximum (thrust issue #69)
+    thrust::inclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), thrust::maximum<P>());
+    thrust::inclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), thrust::maximum<P>());
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+
+
+    // The tests below get miscompiled on Tesla hw for 8b types
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
+    {
+      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
+      {
+        KNOWN_FAILURE;
+      } // end if
+    } // end if
+#endif
+
+    // scan with plus
+    thrust::exclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), init, add_pairs());
+    thrust::exclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), init, add_pairs());
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+    
+    // scan with maximum (thrust issue #69)
+    thrust::exclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), init, thrust::maximum<P>());
+    thrust::exclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), init, thrust::maximum<P>());
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+  }
+};
+VariableUnitTest<TestPairScan, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestPairScanInstance;
+
diff --git a/thrust/testing/pair_scan_by_key.cu b/thrust/testing/pair_scan_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6e63bc80642ce6bfc5eb2e6a75f9528dc7c0cb7f
--- /dev/null
+++ b/thrust/testing/pair_scan_by_key.cu
@@ -0,0 +1,61 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/transform.h>
+#include <thrust/scan.h>
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+struct add_pairs
+{
+  template <typename Pair1, typename Pair2>
+  __host__ __device__
+    Pair1 operator()(const Pair1 &x, const Pair2 &y)
+  {
+    return thrust::make_pair(x.first + y.first, x.second + y.second);
+  } // end operator()
+}; // end add_pairs
+
+
+template <typename T>
+  struct TestPairScanByKey
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_pairs(n);
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+    thrust::device_vector<T> d_p1 = h_p1;
+    thrust::device_vector<T> d_p2 = h_p2;
+    thrust::device_vector<P> d_pairs = h_pairs;
+
+    thrust::host_vector<T>   h_keys = unittest::random_integers<bool>(n);
+    thrust::device_vector<T> d_keys = h_keys;
+
+    P init = thrust::make_pair(13,13);
+
+    // scan on the host
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_pairs.begin(), h_pairs.begin(), init, thrust::equal_to<T>(), add_pairs());
+
+    // scan on the device
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_pairs.begin(), d_pairs.begin(), init, thrust::equal_to<T>(), add_pairs());
+
+    ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
+  }
+};
+VariableUnitTest<TestPairScanByKey, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestPairScanByKeyInstance;
+
diff --git a/thrust/testing/pair_sort.cu b/thrust/testing/pair_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..db8a83739968732b3da6c6514b1d032085effdd8
--- /dev/null
+++ b/thrust/testing/pair_sort.cu
@@ -0,0 +1,50 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+
+template <typename T>
+  struct TestPairStableSortByKey
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    // host arrays
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_pairs(n);
+
+    thrust::host_vector<int> h_values(n);
+    thrust::sequence(h_values.begin(), h_values.end());
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+    // device arrays
+    thrust::device_vector<P>   d_pairs = h_pairs;
+    thrust::device_vector<int> d_values = h_values;
+
+    // sort on the host
+    thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
+
+    // sort on the device
+    thrust::stable_sort_by_key(d_pairs.begin(), d_pairs.end(), d_values.begin());
+
+    ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
+    ASSERT_EQUAL(h_values, d_values);
+  }
+};
+VariableUnitTest<TestPairStableSortByKey, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestPairStableSortByKeyInstance;
+
diff --git a/thrust/testing/pair_sort_by_key.cu b/thrust/testing/pair_sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..811368aed9b76c66d9d7dfe0211f3af17892183e
--- /dev/null
+++ b/thrust/testing/pair_sort_by_key.cu
@@ -0,0 +1,41 @@
+#include <unittest/unittest.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+template <typename T>
+  struct TestPairStableSort
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_pairs(n);
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_pairs.begin(), make_pair_functor());
+
+    thrust::device_vector<P> d_pairs = h_pairs;
+
+    // sort on the host
+    thrust::stable_sort(h_pairs.begin(), h_pairs.end());
+
+    // sort on the device
+    thrust::stable_sort(d_pairs.begin(), d_pairs.end());
+
+    ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
+  }
+};
+VariableUnitTest<TestPairStableSort, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestPairStableSortInstance;
+
diff --git a/thrust/testing/pair_transform.cu b/thrust/testing/pair_transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..612a77af040c1a58eaafdbfc4dbb66efa83e593b
--- /dev/null
+++ b/thrust/testing/pair_transform.cu
@@ -0,0 +1,61 @@
+#include <unittest/unittest.h>
+#include <thrust/host_vector.h>
+#include <thrust/pair.h>
+#include <thrust/tuple.h>
+#include <thrust/transform.h>
+
+struct make_pair_functor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+    thrust::pair<T1,T2> operator()(const T1 &x, const T2 &y)
+  {
+    return thrust::make_pair(x,y);
+  } // end operator()()
+}; // end make_pair_functor
+
+struct add_pairs
+{
+  template <typename Pair1, typename Pair2>
+  __host__ __device__
+    Pair1 operator()(const Pair1 &x, const Pair2 &y)
+  {
+    return thrust::make_pair(x.first + y.first, x.second + y.second);
+  } // end operator()
+}; // end add_pairs
+
+
+template <typename T>
+  struct TestPairTransform
+{
+  void operator()(const size_t n)
+  {
+    typedef thrust::pair<T,T> P;
+
+    thrust::host_vector<T>   h_p1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_p2 = unittest::random_integers<T>(n);
+    thrust::host_vector<P>   h_result(n);
+
+    thrust::device_vector<T> d_p1 = h_p1;
+    thrust::device_vector<T> d_p2 = h_p2;
+    thrust::device_vector<P> d_result(n);
+
+    // zip up pairs on the host
+    thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), make_pair_functor());
+
+    // zip up pairs on the device
+    thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), make_pair_functor());
+
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+    
+    // add pairs on the host
+    thrust::transform(h_result.begin(), h_result.end(), h_result.begin(), h_result.begin(), add_pairs());
+                                                                          
+    // add pairs on the device                                            
+    thrust::transform(d_result.begin(), d_result.end(), d_result.begin(), d_result.begin(), add_pairs());
+
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+  }
+}; // end TestPairZip
+VariableUnitTest<TestPairTransform, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestPairTransformInstance;
+
diff --git a/thrust/testing/partition.cu b/thrust/testing/partition.cu
new file mode 100644
index 0000000000000000000000000000000000000000..742560f59b1bce5a085f341f57a32b108961752c
--- /dev/null
+++ b/thrust/testing/partition.cu
@@ -0,0 +1,1623 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/count.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/sort.h>
+
+template<typename T>
+struct is_even
+{
+    __host__ __device__
+    bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+typedef unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> PartitionTypes;
+
+template<typename Vector>
+void TestPartitionSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector data(5);
+    data[0] = 1; 
+    data[1] = 2; 
+    data[2] = 1;
+    data[3] = 1; 
+    data[4] = 2; 
+
+    Iterator iter = thrust::partition(data.begin(), data.end(), is_even<T>());
+
+    Vector ref(5);
+    ref[0] = 2;
+    ref[1] = 2;
+    ref[2] = 1;
+    ref[3] = 1;
+    ref[4] = 1;
+
+    ASSERT_EQUAL(iter - data.begin(), 2);
+    ASSERT_EQUAL(data, ref);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionSimple);
+
+template<typename Vector>
+void TestPartitionStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector data(5);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 0;
+    data[3] = 0;
+    data[4] = 1;
+
+    Vector stencil(5);
+    stencil[0] = 1; 
+    stencil[1] = 2; 
+    stencil[2] = 1;
+    stencil[3] = 1; 
+    stencil[4] = 2; 
+
+    Iterator iter = thrust::partition(data.begin(), data.end(), stencil.begin(), is_even<T>());
+
+    Vector ref(5);
+    ref[0] = 1;
+    ref[1] = 1;
+    ref[2] = 0;
+    ref[3] = 0;
+    ref[4] = 0;
+
+    ASSERT_EQUAL(iter - data.begin(), 2);
+    ASSERT_EQUAL(data, ref);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionStencilSimple);
+
+
+template<typename Vector>
+void TestPartitionCopySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  1; 
+    data[4] =  2; 
+
+    Vector true_results(2);
+    Vector false_results(3);
+
+    thrust::pair<typename Vector::iterator, typename Vector::iterator> ends =
+      thrust::partition_copy(data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>());
+
+    Vector true_ref(2);
+    true_ref[0] =  2;
+    true_ref[1] =  2;
+
+    Vector false_ref(3);
+    false_ref[0] =  1;
+    false_ref[1] =  1;
+    false_ref[2] =  1;
+
+    ASSERT_EQUAL(2, ends.first - true_results.begin());
+    ASSERT_EQUAL(3, ends.second - false_results.begin());
+    ASSERT_EQUAL(true_ref, true_results);
+    ASSERT_EQUAL(false_ref, false_results);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionCopySimple);
+
+
+template<typename Vector>
+void TestPartitionCopyStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  0; 
+    data[1] =  1; 
+    data[2] =  0;
+    data[3] =  0; 
+    data[4] =  1; 
+
+    Vector stencil(5);
+    stencil[0] =  1; 
+    stencil[1] =  2; 
+    stencil[2] =  1;
+    stencil[3] =  1; 
+    stencil[4] =  2; 
+
+    Vector true_results(2);
+    Vector false_results(3);
+
+    thrust::pair<typename Vector::iterator, typename Vector::iterator> ends =
+      thrust::partition_copy(data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), is_even<T>());
+
+    Vector true_ref(2);
+    true_ref[0] =  1;
+    true_ref[1] =  1;
+
+    Vector false_ref(3);
+    false_ref[0] =  0;
+    false_ref[1] =  0;
+    false_ref[2] =  0;
+
+    ASSERT_EQUAL(2, ends.first - true_results.begin());
+    ASSERT_EQUAL(3, ends.second - false_results.begin());
+    ASSERT_EQUAL(true_ref, true_results);
+    ASSERT_EQUAL(false_ref, false_results);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionCopyStencilSimple);
+
+
+template<typename Vector>
+void TestStablePartitionSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  3; 
+    data[4] =  2; 
+
+    Iterator iter = thrust::stable_partition(data.begin(), data.end(), is_even<T>());
+
+    Vector ref(5);
+    ref[0] =  2;
+    ref[1] =  2;
+    ref[2] =  1;
+    ref[3] =  1;
+    ref[4] =  3;
+
+    ASSERT_EQUAL(iter - data.begin(), 2);
+    ASSERT_EQUAL(data, ref);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStablePartitionSimple);
+
+
+template<typename Vector>
+void TestStablePartitionStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  3; 
+    data[4] =  2; 
+
+    Vector stencil(5);
+    stencil[0] = 0;
+    stencil[1] = 1;
+    stencil[2] = 0;
+    stencil[3] = 0;
+    stencil[4] = 1;
+
+    Iterator iter = thrust::stable_partition(data.begin(), data.end(), stencil.begin(), thrust::identity<T>());
+
+    Vector ref(5);
+    ref[0] =  2;
+    ref[1] =  2;
+    ref[2] =  1;
+    ref[3] =  1;
+    ref[4] =  3;
+
+    ASSERT_EQUAL(iter - data.begin(), 2);
+    ASSERT_EQUAL(data, ref);
+}
+DECLARE_VECTOR_UNITTEST(TestStablePartitionStencilSimple);
+
+
+template<typename Vector>
+void TestStablePartitionCopySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  1; 
+    data[4] =  2; 
+
+    Vector true_results(2);
+    Vector false_results(3);
+
+    thrust::pair<typename Vector::iterator, typename Vector::iterator> ends =
+      thrust::stable_partition_copy(data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>());
+
+    Vector true_ref(2);
+    true_ref[0] =  2;
+    true_ref[1] =  2;
+
+    Vector false_ref(3);
+    false_ref[0] =  1;
+    false_ref[1] =  1;
+    false_ref[2] =  1;
+
+    ASSERT_EQUAL(2, ends.first - true_results.begin());
+    ASSERT_EQUAL(3, ends.second - false_results.begin());
+    ASSERT_EQUAL(true_ref, true_results);
+    ASSERT_EQUAL(false_ref, false_results);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStablePartitionCopySimple);
+
+
+template<typename Vector>
+void TestStablePartitionCopyStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  1; 
+    data[4] =  2; 
+
+    Vector stencil(5);
+    stencil[0] = false;
+    stencil[1] = true;
+    stencil[2] = false;
+    stencil[3] = false;
+    stencil[4] = true;
+
+    Vector true_results(2);
+    Vector false_results(3);
+
+    thrust::pair<typename Vector::iterator, typename Vector::iterator> ends =
+      thrust::stable_partition_copy(data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), thrust::identity<T>());
+
+    Vector true_ref(2);
+    true_ref[0] =  2;
+    true_ref[1] =  2;
+
+    Vector false_ref(3);
+    false_ref[0] =  1;
+    false_ref[1] =  1;
+    false_ref[2] =  1;
+
+    ASSERT_EQUAL(2, ends.first - true_results.begin());
+    ASSERT_EQUAL(3, ends.second - false_results.begin());
+    ASSERT_EQUAL(true_ref, true_results);
+    ASSERT_EQUAL(false_ref, false_results);
+}
+DECLARE_VECTOR_UNITTEST(TestStablePartitionCopyStencilSimple);
+
+
+template <typename T>
+struct TestPartition
+{
+    void operator()(const size_t n)
+    {
+        // setup ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_iter = thrust::partition(h_data.begin(), h_data.end(), is_even<T>());
+        typename thrust::device_vector<T>::iterator d_iter = thrust::partition(d_data.begin(), d_data.end(), is_even<T>());
+
+        thrust::sort(h_data.begin(), h_iter); thrust::sort(h_iter, h_data.end());
+        thrust::sort(d_data.begin(), d_iter); thrust::sort(d_iter, d_data.end());
+
+        ASSERT_EQUAL(h_data, d_data);
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+    }
+};
+VariableUnitTest<TestPartition, PartitionTypes> TestPartitionInstance;
+
+
+template <typename T>
+struct TestPartitionStencil
+{
+    void operator()(const size_t n)
+    {
+        // setup ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+
+        typename thrust::host_vector<T>::iterator   h_iter = thrust::partition(h_data.begin(), h_data.end(), h_stencil.begin(), is_even<T>());
+        typename thrust::device_vector<T>::iterator d_iter = thrust::partition(d_data.begin(), d_data.end(), d_stencil.begin(), is_even<T>());
+
+        thrust::sort(h_data.begin(), h_iter); thrust::sort(h_iter, h_data.end());
+        thrust::sort(d_data.begin(), d_iter); thrust::sort(d_iter, d_data.end());
+
+        ASSERT_EQUAL(h_data, d_data);
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+    }
+};
+VariableUnitTest<TestPartitionStencil, PartitionTypes> TestPartitionStencilInstance;
+
+
+template <typename T>
+struct TestPartitionCopy
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // setup output ranges
+        thrust::host_vector<T>   h_true_results (n_true,  0);
+        thrust::host_vector<T>   h_false_results(n_false, 0);
+        thrust::device_vector<T> d_true_results (n_true,  0);
+        thrust::device_vector<T> d_false_results(n_false, 0);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, typename thrust::host_vector<T>::iterator> h_ends
+            = thrust::partition_copy(h_data.begin(), h_data.end(), h_true_results.begin(), h_false_results.begin(), is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, typename thrust::device_vector<T>::iterator> d_ends
+            = thrust::partition_copy(d_data.begin(), d_data.end(), d_true_results.begin(), d_false_results.begin(), is_even<T>());
+
+        // check true output
+        ASSERT_EQUAL(h_ends.first - h_true_results.begin(), n_true);
+        ASSERT_EQUAL(d_ends.first - d_true_results.begin(), n_true);
+        thrust::sort(h_true_results.begin(), h_true_results.end());
+        thrust::sort(d_true_results.begin(), d_true_results.end());
+        ASSERT_EQUAL(h_true_results, d_true_results);
+
+        // check false output
+        ASSERT_EQUAL(h_ends.second - h_false_results.begin(), n_false);
+        ASSERT_EQUAL(d_ends.second - d_false_results.begin(), n_false);
+        thrust::sort(h_false_results.begin(), h_false_results.end());
+        thrust::sort(d_false_results.begin(), d_false_results.end());
+        ASSERT_EQUAL(h_false_results, d_false_results);
+    }
+};
+VariableUnitTest<TestPartitionCopy, PartitionTypes> TestPartitionCopyInstance;
+
+
+template <typename T>
+struct TestPartitionCopyStencil
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // setup output ranges
+        thrust::host_vector<T>   h_true_results (n_true,  0);
+        thrust::host_vector<T>   h_false_results(n_false, 0);
+        thrust::device_vector<T> d_true_results (n_true,  0);
+        thrust::device_vector<T> d_false_results(n_false, 0);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, typename thrust::host_vector<T>::iterator> h_ends
+            = thrust::partition_copy(h_data.begin(), h_data.end(), h_stencil.begin(), h_true_results.begin(), h_false_results.begin(), is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, typename thrust::device_vector<T>::iterator> d_ends
+            = thrust::partition_copy(d_data.begin(), d_data.end(), d_stencil.begin(), d_true_results.begin(), d_false_results.begin(), is_even<T>());
+
+        // check true output
+        ASSERT_EQUAL(h_ends.first - h_true_results.begin(), n_true);
+        ASSERT_EQUAL(d_ends.first - d_true_results.begin(), n_true);
+        thrust::sort(h_true_results.begin(), h_true_results.end());
+        thrust::sort(d_true_results.begin(), d_true_results.end());
+        ASSERT_EQUAL(h_true_results, d_true_results);
+
+        // check false output
+        ASSERT_EQUAL(h_ends.second - h_false_results.begin(), n_false);
+        ASSERT_EQUAL(d_ends.second - d_false_results.begin(), n_false);
+        thrust::sort(h_false_results.begin(), h_false_results.end());
+        thrust::sort(d_false_results.begin(), d_false_results.end());
+        ASSERT_EQUAL(h_false_results, d_false_results);
+    }
+};
+VariableUnitTest<TestPartitionCopyStencil, PartitionTypes> TestPartitionCopyStencilInstance;
+
+
+template <typename T>
+struct TestStablePartitionCopyStencil
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // setup output ranges
+        thrust::host_vector<T>   h_true_results (n_true,  0);
+        thrust::host_vector<T>   h_false_results(n_false, 0);
+        thrust::device_vector<T> d_true_results (n_true,  0);
+        thrust::device_vector<T> d_false_results(n_false, 0);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, typename thrust::host_vector<T>::iterator> h_ends
+            = thrust::stable_partition_copy(h_data.begin(), h_data.end(), h_stencil.begin(), h_true_results.begin(), h_false_results.begin(), is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, typename thrust::device_vector<T>::iterator> d_ends
+            = thrust::stable_partition_copy(d_data.begin(), d_data.end(), d_stencil.begin(), d_true_results.begin(), d_false_results.begin(), is_even<T>());
+
+        // check true output
+        ASSERT_EQUAL(h_ends.first - h_true_results.begin(), n_true);
+        ASSERT_EQUAL(d_ends.first - d_true_results.begin(), n_true);
+        thrust::sort(h_true_results.begin(), h_true_results.end());
+        thrust::sort(d_true_results.begin(), d_true_results.end());
+        ASSERT_EQUAL(h_true_results, d_true_results);
+
+        // check false output
+        ASSERT_EQUAL(h_ends.second - h_false_results.begin(), n_false);
+        ASSERT_EQUAL(d_ends.second - d_false_results.begin(), n_false);
+        thrust::sort(h_false_results.begin(), h_false_results.end());
+        thrust::sort(d_false_results.begin(), d_false_results.end());
+        ASSERT_EQUAL(h_false_results, d_false_results);
+    }
+};
+VariableUnitTest<TestStablePartitionCopyStencil, PartitionTypes> TestStablePartitionCopyStencilInstance;
+
+
+template <typename T>
+struct TestPartitionCopyToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // mask both ranges
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   thrust::make_discard_iterator(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > d_result1 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   thrust::make_discard_iterator(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > reference1 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              thrust::make_discard_iterator(n_false));
+
+        ASSERT_EQUAL_QUIET(reference1, h_result1);
+        ASSERT_EQUAL_QUIET(reference1, d_result1);
+
+
+        // mask the false range
+        thrust::host_vector<T> h_trues(n_true);
+        thrust::device_vector<T> d_trues(n_true);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_result2 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   h_trues.begin(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_result2 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   d_trues.begin(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_reference2 =
+            thrust::make_pair(h_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_reference2 =
+            thrust::make_pair(d_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+
+        ASSERT_EQUAL(h_trues, d_trues);
+        ASSERT_EQUAL_QUIET(h_reference2, h_result2);
+        ASSERT_EQUAL_QUIET(d_reference2, d_result2);
+
+
+
+        // mask the true range
+        thrust::host_vector<T> h_falses(n_false);
+        thrust::device_vector<T> d_falses(n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_result3 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   thrust::make_discard_iterator(),
+                                   h_falses.begin(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_result3 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   thrust::make_discard_iterator(),
+                                   d_falses.begin(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              h_falses.begin() + n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              d_falses.begin() + n_false);
+
+
+        ASSERT_EQUAL(h_falses, d_falses);
+        ASSERT_EQUAL_QUIET(h_reference3, h_result3);
+        ASSERT_EQUAL_QUIET(d_reference3, d_result3);
+    }
+};
+VariableUnitTest<TestPartitionCopyToDiscardIterator, PartitionTypes> TestPartitionCopyToDiscardIteratorInstance;
+
+
+template <typename T>
+struct TestPartitionCopyStencilToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // mask both ranges
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   h_stencil.begin(),
+                                   thrust::make_discard_iterator(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > d_result1 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   d_stencil.begin(),
+                                   thrust::make_discard_iterator(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > reference1 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              thrust::make_discard_iterator(n_false));
+
+        ASSERT_EQUAL_QUIET(reference1, h_result1);
+        ASSERT_EQUAL_QUIET(reference1, d_result1);
+
+
+        // mask the false range
+        thrust::host_vector<T> h_trues(n_true);
+        thrust::device_vector<T> d_trues(n_true);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_result2 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   h_stencil.begin(),
+                                   h_trues.begin(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_result2 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   d_stencil.begin(),
+                                   d_trues.begin(),
+                                   thrust::make_discard_iterator(),
+                                   is_even<T>());
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_reference2 =
+            thrust::make_pair(h_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_reference2 =
+            thrust::make_pair(d_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+
+        ASSERT_EQUAL(h_trues, d_trues);
+        ASSERT_EQUAL_QUIET(h_reference2, h_result2);
+        ASSERT_EQUAL_QUIET(d_reference2, d_result2);
+
+
+
+        // mask the true range
+        thrust::host_vector<T> h_falses(n_false);
+        thrust::device_vector<T> d_falses(n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_result3 =
+            thrust::partition_copy(h_data.begin(),
+                                   h_data.end(),
+                                   h_stencil.begin(),
+                                   thrust::make_discard_iterator(),
+                                   h_falses.begin(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_result3 =
+            thrust::partition_copy(d_data.begin(),
+                                   d_data.end(),
+                                   d_stencil.begin(),
+                                   thrust::make_discard_iterator(),
+                                   d_falses.begin(),
+                                   is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              h_falses.begin() + n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              d_falses.begin() + n_false);
+
+
+        ASSERT_EQUAL(h_falses, d_falses);
+        ASSERT_EQUAL_QUIET(h_reference3, h_result3);
+        ASSERT_EQUAL_QUIET(d_reference3, d_result3);
+    }
+};
+VariableUnitTest<TestPartitionCopyStencilToDiscardIterator, PartitionTypes> TestPartitionCopyStencilToDiscardIteratorInstance;
+
+
+template <typename T>
+struct TestStablePartition
+{
+    void operator()(const size_t n)
+    {
+        // setup ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_iter = thrust::stable_partition(h_data.begin(), h_data.end(), is_even<T>());
+        typename thrust::device_vector<T>::iterator d_iter = thrust::stable_partition(d_data.begin(), d_data.end(), is_even<T>());
+
+        ASSERT_EQUAL(h_data, d_data);
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+    }
+};
+VariableUnitTest<TestStablePartition, PartitionTypes> TestStablePartitionInstance;
+
+
+template <typename T>
+struct TestStablePartitionStencil
+{
+    void operator()(const size_t n)
+    {
+        // setup ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+
+        typename thrust::host_vector<T>::iterator   h_iter = thrust::stable_partition(h_data.begin(), h_data.end(), h_stencil.begin(), is_even<T>());
+        typename thrust::device_vector<T>::iterator d_iter = thrust::stable_partition(d_data.begin(), d_data.end(), d_stencil.begin(), is_even<T>());
+
+        ASSERT_EQUAL(h_data, d_data);
+        ASSERT_EQUAL(h_iter - h_data.begin(), d_iter - d_data.begin());
+    }
+};
+VariableUnitTest<TestStablePartitionStencil, PartitionTypes> TestStablePartitionStencilInstance;
+
+
+template <typename T>
+struct TestStablePartitionCopy
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // setup output ranges
+        thrust::host_vector<T>   h_true_results (n_true,  0);
+        thrust::host_vector<T>   h_false_results(n_false, 0);
+        thrust::device_vector<T> d_true_results (n_true,  0);
+        thrust::device_vector<T> d_false_results(n_false, 0);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, typename thrust::host_vector<T>::iterator> h_ends
+            = thrust::stable_partition_copy(h_data.begin(), h_data.end(), h_true_results.begin(), h_false_results.begin(), is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, typename thrust::device_vector<T>::iterator> d_ends
+            = thrust::stable_partition_copy(d_data.begin(), d_data.end(), d_true_results.begin(), d_false_results.begin(), is_even<T>());
+
+        // check true output
+        ASSERT_EQUAL(h_ends.first - h_true_results.begin(), n_true);
+        ASSERT_EQUAL(d_ends.first - d_true_results.begin(), n_true);
+        ASSERT_EQUAL(h_true_results, d_true_results);
+
+        // check false output
+        ASSERT_EQUAL(h_ends.second - h_false_results.begin(), n_false);
+        ASSERT_EQUAL(d_ends.second - d_false_results.begin(), n_false);
+        ASSERT_EQUAL(h_false_results, d_false_results);
+    }
+};
+VariableUnitTest<TestStablePartitionCopy, PartitionTypes> TestStablePartitionCopyInstance;
+
+
+template <typename T>
+struct TestStablePartitionCopyToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // mask both ranges
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          thrust::make_discard_iterator(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > d_result1 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          thrust::make_discard_iterator(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > reference1 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              thrust::make_discard_iterator(n_false));
+
+        ASSERT_EQUAL_QUIET(reference1, h_result1);
+        ASSERT_EQUAL_QUIET(reference1, d_result1);
+
+
+        // mask the false range
+        thrust::host_vector<T> h_trues(n_true);
+        thrust::device_vector<T> d_trues(n_true);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_result2 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          h_trues.begin(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_result2 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          d_trues.begin(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_reference2 =
+            thrust::make_pair(h_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_reference2 =
+            thrust::make_pair(d_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+
+        ASSERT_EQUAL(h_trues, d_trues);
+        ASSERT_EQUAL_QUIET(h_reference2, h_result2);
+        ASSERT_EQUAL_QUIET(d_reference2, d_result2);
+
+
+
+        // mask the true range
+        thrust::host_vector<T> h_falses(n_false);
+        thrust::device_vector<T> d_falses(n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_result3 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          thrust::make_discard_iterator(),
+                                          h_falses.begin(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_result3 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          thrust::make_discard_iterator(),
+                                          d_falses.begin(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              h_falses.begin() + n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              d_falses.begin() + n_false);
+
+
+        ASSERT_EQUAL(h_falses, d_falses);
+        ASSERT_EQUAL_QUIET(h_reference3, h_result3);
+        ASSERT_EQUAL_QUIET(d_reference3, d_result3);
+    }
+};
+VariableUnitTest<TestStablePartitionCopyToDiscardIterator, PartitionTypes> TestStablePartitionCopyToDiscardIteratorInstance;
+
+
+template <typename T>
+struct TestStablePartitionCopyStencilToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        // setup input ranges
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+        thrust::device_vector<T> d_stencil = h_stencil;
+        
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
+
+        // mask both ranges
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          h_stencil.begin(),
+                                          thrust::make_discard_iterator(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > d_result1 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          d_stencil.begin(),
+                                          thrust::make_discard_iterator(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > reference1 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              thrust::make_discard_iterator(n_false));
+
+        ASSERT_EQUAL_QUIET(reference1, h_result1);
+        ASSERT_EQUAL_QUIET(reference1, d_result1);
+
+
+        // mask the false range
+        thrust::host_vector<T> h_trues(n_true);
+        thrust::device_vector<T> d_trues(n_true);
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_result2 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          h_stencil.begin(),
+                                          h_trues.begin(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_result2 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          d_stencil.begin(),
+                                          d_trues.begin(),
+                                          thrust::make_discard_iterator(),
+                                          is_even<T>());
+
+        thrust::pair<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> > h_reference2 =
+            thrust::make_pair(h_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+        thrust::pair<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > d_reference2 =
+            thrust::make_pair(d_trues.begin() + n_true,
+                              thrust::make_discard_iterator(n_false));
+
+
+        ASSERT_EQUAL(h_trues, d_trues);
+        ASSERT_EQUAL_QUIET(h_reference2, h_result2);
+        ASSERT_EQUAL_QUIET(d_reference2, d_result2);
+
+
+
+        // mask the true range
+        thrust::host_vector<T> h_falses(n_false);
+        thrust::device_vector<T> d_falses(n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_result3 =
+            thrust::stable_partition_copy(h_data.begin(),
+                                          h_data.end(),
+                                          h_stencil.begin(),
+                                          thrust::make_discard_iterator(),
+                                          h_falses.begin(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_result3 =
+            thrust::stable_partition_copy(d_data.begin(),
+                                          d_data.end(),
+                                          d_stencil.begin(),
+                                          thrust::make_discard_iterator(),
+                                          d_falses.begin(),
+                                          is_even<T>());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<T>::iterator> h_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              h_falses.begin() + n_false);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<T>::iterator> d_reference3 =
+            thrust::make_pair(thrust::make_discard_iterator(n_true),
+                              d_falses.begin() + n_false);
+
+
+        ASSERT_EQUAL(h_falses, d_falses);
+        ASSERT_EQUAL_QUIET(h_reference3, h_result3);
+        ASSERT_EQUAL_QUIET(d_reference3, d_result3);
+    }
+};
+VariableUnitTest<TestStablePartitionCopyStencilToDiscardIterator, PartitionTypes> TestStablePartitionCopyStencilToDiscardIteratorInstance;
+
+
+struct is_ordered
+{
+    template <typename Tuple>
+    __host__ __device__
+    bool operator()(const Tuple& t) const
+    {
+        return thrust::get<0>(t) <= thrust::get<1>(t);
+    }
+};
+
+
+template<typename Vector>
+void TestPartitionZipIterator(void)
+{
+    Vector data1(5);
+    Vector data2(5);
+
+    data1[0] = 1;  data2[0] = 2; 
+    data1[1] = 2;  data2[1] = 1;
+    data1[2] = 1;  data2[2] = 2;
+    data1[3] = 1;  data2[3] = 2;
+    data1[4] = 2;  data2[4] = 1;
+
+    typedef typename Vector::iterator           Iterator;
+    typedef thrust::tuple<Iterator,Iterator>    IteratorTuple;
+    typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+    ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(data1.begin(), data2.begin()));
+    ZipIterator end   = thrust::make_zip_iterator(thrust::make_tuple(data1.end(),   data2.end()));
+
+    ZipIterator iter = thrust::partition(begin, end, is_ordered());
+
+    Vector ref1(5);
+    Vector ref2(5);
+
+    ref1[0] = 1; ref2[0] = 2;
+    ref1[1] = 1; ref2[1] = 2;
+    ref1[2] = 1; ref2[2] = 2;
+    ref1[3] = 2; ref2[3] = 1;
+    ref1[4] = 2; ref2[4] = 1;
+
+    ASSERT_EQUAL(iter - begin, 3);
+    ASSERT_EQUAL(data1, ref1);
+    ASSERT_EQUAL(data2, ref2);
+}
+DECLARE_VECTOR_UNITTEST(TestPartitionZipIterator);
+
+
+template<typename Vector>
+void TestPartitionStencilZipIterator(void)
+{
+    Vector data(5);
+    data[0] = 1;
+    data[1] = 0;
+    data[2] = 1;
+    data[3] = 1;
+    data[4] = 0;
+
+    Vector stencil1(5);
+    Vector stencil2(5);
+
+    stencil1[0] = 1;  stencil2[0] = 2; 
+    stencil1[1] = 2;  stencil2[1] = 1;
+    stencil1[2] = 1;  stencil2[2] = 2;
+    stencil1[3] = 1;  stencil2[3] = 2;
+    stencil1[4] = 2;  stencil2[4] = 1;
+
+    typedef typename Vector::iterator           Iterator;
+    typedef thrust::tuple<Iterator,Iterator>    IteratorTuple;
+    typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+    ZipIterator stencil_begin = thrust::make_zip_iterator(thrust::make_tuple(stencil1.begin(), stencil2.begin()));
+
+    Iterator iter = thrust::partition(data.begin(), data.end(), stencil_begin, is_ordered());
+
+    Vector ref(5);
+
+    ref[0] = 1;
+    ref[1] = 1;
+    ref[2] = 1;
+    ref[3] = 0;
+    ref[4] = 0;
+
+    ASSERT_EQUAL(iter - data.begin(), 3);
+    ASSERT_EQUAL(data, ref);
+}
+DECLARE_VECTOR_UNITTEST(TestPartitionStencilZipIterator);
+
+
+template<typename Vector>
+void TestStablePartitionZipIterator(void)
+{
+    Vector data1(5);
+    Vector data2(5);
+
+    data1[0] = 1;  data2[0] = 2; 
+    data1[1] = 2;  data2[1] = 0;
+    data1[2] = 1;  data2[2] = 3;
+    data1[3] = 1;  data2[3] = 2;
+    data1[4] = 2;  data2[4] = 1;
+
+    typedef typename Vector::iterator           Iterator;
+    typedef thrust::tuple<Iterator,Iterator>    IteratorTuple;
+    typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+    ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(data1.begin(), data2.begin()));
+    ZipIterator end   = thrust::make_zip_iterator(thrust::make_tuple(data1.end(),   data2.end()));
+
+    ZipIterator iter = thrust::stable_partition(begin, end, is_ordered());
+
+    Vector ref1(5);
+    Vector ref2(5);
+
+    ref1[0] = 1; ref2[0] = 2;
+    ref1[1] = 1; ref2[1] = 3;
+    ref1[2] = 1; ref2[2] = 2;
+    ref1[3] = 2; ref2[3] = 0;
+    ref1[4] = 2; ref2[4] = 1;
+
+    ASSERT_EQUAL(data1, ref1);
+    ASSERT_EQUAL(data2, ref2);
+    ASSERT_EQUAL(iter - begin, 3);
+}
+DECLARE_VECTOR_UNITTEST(TestStablePartitionZipIterator);
+
+
+template<typename Vector>
+void TestStablePartitionStencilZipIterator(void)
+{
+    Vector data(5);
+    data[0] = 1;
+    data[1] = 0;
+    data[2] = 1;
+    data[3] = 1;
+    data[4] = 0;
+
+    Vector stencil1(5);
+    Vector stencil2(5);
+
+    stencil1[0] = 1;  stencil2[0] = 2; 
+    stencil1[1] = 2;  stencil2[1] = 0;
+    stencil1[2] = 1;  stencil2[2] = 3;
+    stencil1[3] = 1;  stencil2[3] = 2;
+    stencil1[4] = 2;  stencil2[4] = 1;
+
+    typedef typename Vector::iterator           Iterator;
+    typedef thrust::tuple<Iterator,Iterator>    IteratorTuple;
+    typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+    ZipIterator stencil_begin = thrust::make_zip_iterator(thrust::make_tuple(stencil1.begin(), stencil2.begin()));
+
+    Iterator mid = thrust::stable_partition(data.begin(), data.end(), stencil_begin, is_ordered());
+
+    Vector ref(5);
+
+    ref[0] = 1;
+    ref[1] = 1;
+    ref[2] = 1;
+    ref[3] = 0;
+    ref[4] = 0;
+
+    ASSERT_EQUAL(ref, data);
+    ASSERT_EQUAL(mid - data.begin(), 3);
+}
+DECLARE_VECTOR_UNITTEST(TestStablePartitionStencilZipIterator);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator partition(my_system &system,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestPartitionDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::partition(sys,
+                      vec.begin(),
+                      vec.begin(),
+                      0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestPartitionDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator partition(my_system &system,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestPartitionStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::partition(sys,
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin(),
+                      0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestPartitionStencilDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator partition(my_tag,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestPartitionDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::partition(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestPartitionDispatchImplicit);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator partition(my_tag,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestPartitionStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::partition(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestPartitionStencilDispatchImplicit);
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(my_system &system,
+                   InputIterator,
+                   InputIterator,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestPartitionCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::partition_copy(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestPartitionCopyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(my_system &system,
+                   InputIterator1,
+                   InputIterator1,
+                   InputIterator2,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestPartitionCopyStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::partition_copy(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(my_tag,
+                   InputIterator first,
+                   InputIterator,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate)
+{
+  *first = 13;
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestPartitionCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::partition_copy(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestPartitionCopyDispatchImplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(my_tag,
+                   InputIterator1 first,
+                   InputIterator1,
+                   InputIterator2,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate)
+{
+  *first = 13;
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestPartitionCopyStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::partition_copy(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDispatchImplicit);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator stable_partition(my_system &system,
+                                 ForwardIterator first,
+                                 ForwardIterator,
+                                 Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestStablePartitionDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_partition(sys,
+                             vec.begin(),
+                             vec.begin(),
+                             0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStablePartitionDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator stable_partition(my_system &system,
+                                 ForwardIterator first,
+                                 ForwardIterator,
+                                 InputIterator,
+                                 Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestStablePartitionStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_partition(sys,
+                             vec.begin(),
+                             vec.begin(),
+                             vec.begin(),
+                             0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator stable_partition(my_tag,
+                                 ForwardIterator first,
+                                 ForwardIterator,
+                                 Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestStablePartitionDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_partition(thrust::retag<my_tag>(vec.begin()),
+                             thrust::retag<my_tag>(vec.begin()),
+                             0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStablePartitionDispatchImplicit);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator stable_partition(my_tag,
+                                 ForwardIterator first,
+                                 ForwardIterator,
+                                 InputIterator,
+                                 Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestStablePartitionStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_partition(thrust::retag<my_tag>(vec.begin()),
+                             thrust::retag<my_tag>(vec.begin()),
+                             thrust::retag<my_tag>(vec.begin()),
+                             0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDispatchImplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(my_system &system,
+                          InputIterator,
+                          InputIterator,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestStablePartitionCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_partition_copy(sys,
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(my_system &system,
+                          InputIterator1,
+                          InputIterator1,
+                          InputIterator2,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestStablePartitionCopyStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_partition_copy(sys,
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(my_tag,
+                          InputIterator first,
+                          InputIterator,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate)
+{
+  *first = 13;
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestStablePartitionCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_partition_copy(thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDispatchImplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(my_tag,
+                          InputIterator1 first,
+                          InputIterator1,
+                          InputIterator2,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate)
+{
+  *first = 13;
+  return thrust::make_pair(out_true,out_false);
+}
+
+void TestStablePartitionCopyStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_partition_copy(thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDispatchImplicit);
+
diff --git a/thrust/testing/partition_point.cu b/thrust/testing/partition_point.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd5a6a8c8cb79cd9d436eec632d0e1fff8a0be72
--- /dev/null
+++ b/thrust/testing/partition_point.cu
@@ -0,0 +1,133 @@
+#include <unittest/unittest.h>
+#include <thrust/partition.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+template<typename T>
+struct is_even
+{
+  __host__ __device__
+  bool operator()(T x) const { return ((int) x % 2) == 0; }
+};
+
+template<typename Vector>
+void TestPartitionPointSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector v(4);
+  v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
+
+  Iterator first = v.begin();
+
+  Iterator last = v.begin() + 4;
+  Iterator ref = first + 3;
+  ASSERT_EQUAL_QUIET(ref, thrust::partition_point(first, last, thrust::identity<T>()));
+
+  last = v.begin() + 3;
+  ref = last;
+  ASSERT_EQUAL_QUIET(ref, thrust::partition_point(first, last, thrust::identity<T>()));
+}
+DECLARE_VECTOR_UNITTEST(TestPartitionPointSimple);
+
+template <class Vector>
+void TestPartitionPoint(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  const size_t n = (1 << 16) + 13;
+
+  Vector v = unittest::random_integers<T>(n);
+
+  Iterator ref = thrust::stable_partition(v.begin(), v.end(), is_even<T>());
+
+  ASSERT_EQUAL(ref - v.begin(), thrust::partition_point(v.begin(), v.end(), is_even<T>()) - v.begin());
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionPoint);
+
+
+template<typename ForwardIterator, typename Predicate>
+ForwardIterator partition_point(my_system &system, 
+                                ForwardIterator first,
+                                ForwardIterator,
+                                Predicate)
+{
+  system.validate_dispatch();
+  return first;
+}
+
+void TestPartitionPointDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::partition_point(sys,
+                          vec.begin(),
+                          vec.begin(),
+                          0);
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestPartitionPointDispatchExplicit);
+
+
+template<typename ForwardIterator, typename Predicate>
+ForwardIterator partition_point(my_tag,
+                                ForwardIterator first,
+                                ForwardIterator,
+                                Predicate)
+{
+  *first = 13;
+  return first;
+}
+
+void TestPartitionPointDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::partition_point(thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()),
+                          0);
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestPartitionPointDispatchImplicit);
+
+struct test_less_than
+{
+    long long expected;
+
+    __device__
+    bool operator()(long long y)
+    {
+        return y < expected;
+    }
+};
+
+void TestPartitionPointWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    test_less_than fn = { (1ll << magnitude) - 17 };
+
+    ASSERT_EQUAL(thrust::distance(
+        begin,
+        thrust::partition_point(
+            thrust::device,
+            begin, end,
+            fn)),
+        (1ll << magnitude) - 17);
+}
+
+void TestPartitionPointWithBigIndexes()
+{
+    TestPartitionPointWithBigIndexesHelper(30);
+    TestPartitionPointWithBigIndexesHelper(31);
+    TestPartitionPointWithBigIndexesHelper(32);
+    TestPartitionPointWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestPartitionPointWithBigIndexes);
diff --git a/thrust/testing/permutation_iterator.cu b/thrust/testing/permutation_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..94f5857c43d458966abb9816712844594cb439ec
--- /dev/null
+++ b/thrust/testing/permutation_iterator.cu
@@ -0,0 +1,316 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <thrust/reduce.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/sequence.h>
+
+template <class Vector>
+void TestPermutationIteratorSimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector source(8);
+    Vector indices(4);
+    
+    // initialize input
+    thrust::sequence(source.begin(), source.end(), 1);
+
+    indices[0] = 3;
+    indices[1] = 0;
+    indices[2] = 5;
+    indices[3] = 7;
+   
+    thrust::permutation_iterator<Iterator, Iterator> begin(source.begin(), indices.begin());
+    thrust::permutation_iterator<Iterator, Iterator> end(source.begin(),   indices.end());
+
+    ASSERT_EQUAL(end - begin, 4);
+    ASSERT_EQUAL((begin + 4) == end, true);
+
+    ASSERT_EQUAL((T) *begin, 4);
+
+    begin++;
+    end--;
+
+    ASSERT_EQUAL((T) *begin, 1);
+    ASSERT_EQUAL((T) *end,   8);
+    ASSERT_EQUAL(end - begin, 2);
+
+    end--;
+
+    *begin = 10;
+    *end   = 20;
+
+    ASSERT_EQUAL(source[0], 10);
+    ASSERT_EQUAL(source[1],  2);
+    ASSERT_EQUAL(source[2],  3);
+    ASSERT_EQUAL(source[3],  4);
+    ASSERT_EQUAL(source[4],  5);
+    ASSERT_EQUAL(source[5], 20);
+    ASSERT_EQUAL(source[6],  7);
+    ASSERT_EQUAL(source[7],  8);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorSimple);
+
+template <class Vector>
+void TestPermutationIteratorGather(void)
+{
+    typedef typename Vector::iterator Iterator;
+
+    Vector source(8);
+    Vector indices(4);
+    Vector output(4, 10);
+    
+    // initialize input
+    thrust::sequence(source.begin(), source.end(), 1);
+
+    indices[0] = 3;
+    indices[1] = 0;
+    indices[2] = 5;
+    indices[3] = 7;
+   
+    thrust::permutation_iterator<Iterator, Iterator> p_source(source.begin(), indices.begin());
+
+    thrust::copy(p_source, p_source + 4, output.begin());
+
+    ASSERT_EQUAL(output[0], 4);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 6);
+    ASSERT_EQUAL(output[3], 8);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorGather);
+
+template <class Vector>
+void TestPermutationIteratorScatter(void)
+{
+    typedef typename Vector::iterator Iterator;
+
+    Vector source(4, 10);
+    Vector indices(4);
+    Vector output(8);
+    
+    // initialize output
+    thrust::sequence(output.begin(), output.end(), 1);
+
+    indices[0] = 3;
+    indices[1] = 0;
+    indices[2] = 5;
+    indices[3] = 7;
+   
+    // construct transform_iterator
+    thrust::permutation_iterator<Iterator, Iterator> p_output(output.begin(), indices.begin());
+
+    thrust::copy(source.begin(), source.end(), p_output);
+
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  3);
+    ASSERT_EQUAL(output[3], 10);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6],  7);
+    ASSERT_EQUAL(output[7], 10);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorScatter);
+
+template <class Vector>
+void TestMakePermutationIterator(void)
+{
+    Vector source(8);
+    Vector indices(4);
+    Vector output(4, 10);
+    
+    // initialize input
+    thrust::sequence(source.begin(), source.end(), 1);
+
+    indices[0] = 3;
+    indices[1] = 0;
+    indices[2] = 5;
+    indices[3] = 7;
+   
+    thrust::copy(thrust::make_permutation_iterator(source.begin(), indices.begin()),
+                 thrust::make_permutation_iterator(source.begin(), indices.begin()) + 4,
+                 output.begin());
+
+    ASSERT_EQUAL(output[0], 4);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 6);
+    ASSERT_EQUAL(output[3], 8);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestMakePermutationIterator);
+
+template <typename Vector>
+void TestPermutationIteratorReduce(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector source(8);
+    Vector indices(4);
+    Vector output(4, 10);
+    
+    // initialize input
+    thrust::sequence(source.begin(), source.end(), 1);
+
+    indices[0] = 3;
+    indices[1] = 0;
+    indices[2] = 5;
+    indices[3] = 7;
+   
+    // construct transform_iterator
+    thrust::permutation_iterator<Iterator, Iterator> iter(source.begin(), indices.begin());
+
+    T result1 = thrust::reduce(thrust::make_permutation_iterator(source.begin(), indices.begin()),
+                               thrust::make_permutation_iterator(source.begin(), indices.begin()) + 4);
+
+    ASSERT_EQUAL(result1, 19);
+    
+    T result2 = thrust::transform_reduce(thrust::make_permutation_iterator(source.begin(), indices.begin()),
+                                         thrust::make_permutation_iterator(source.begin(), indices.begin()) + 4,
+                                         thrust::negate<T>(),
+                                         T(0),
+                                         thrust::plus<T>());
+    ASSERT_EQUAL(result2, -19);
+};
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorReduce);
+
+void TestPermutationIteratorHostDeviceGather(void)
+{
+    typedef int T;
+    typedef thrust::host_vector<T> HostVector;
+    typedef thrust::host_vector<T> DeviceVector;
+    typedef HostVector::iterator   HostIterator;
+    typedef DeviceVector::iterator DeviceIterator;
+
+    HostVector h_source(8);
+    HostVector h_indices(4);
+    HostVector h_output(4, 10);
+    
+    DeviceVector d_source(8);
+    DeviceVector d_indices(4);
+    DeviceVector d_output(4, 10);
+
+    // initialize source
+    thrust::sequence(h_source.begin(), h_source.end(), 1);
+    thrust::sequence(d_source.begin(), d_source.end(), 1);
+
+    h_indices[0] = d_indices[0] = 3;
+    h_indices[1] = d_indices[1] = 0;
+    h_indices[2] = d_indices[2] = 5;
+    h_indices[3] = d_indices[3] = 7;
+   
+    thrust::permutation_iterator<HostIterator,   HostIterator>   p_h_source(h_source.begin(), h_indices.begin());
+    thrust::permutation_iterator<DeviceIterator, DeviceIterator> p_d_source(d_source.begin(), d_indices.begin());
+
+    // gather host->device
+    thrust::copy(p_h_source, p_h_source + 4, d_output.begin());
+
+    ASSERT_EQUAL(d_output[0], 4);
+    ASSERT_EQUAL(d_output[1], 1);
+    ASSERT_EQUAL(d_output[2], 6);
+    ASSERT_EQUAL(d_output[3], 8);
+    
+    // gather device->host
+    thrust::copy(p_d_source, p_d_source + 4, h_output.begin());
+
+    ASSERT_EQUAL(h_output[0], 4);
+    ASSERT_EQUAL(h_output[1], 1);
+    ASSERT_EQUAL(h_output[2], 6);
+    ASSERT_EQUAL(h_output[3], 8);
+}
+DECLARE_UNITTEST(TestPermutationIteratorHostDeviceGather);
+
+void TestPermutationIteratorHostDeviceScatter(void)
+{
+    typedef int T;
+    typedef thrust::host_vector<T> HostVector;
+    typedef thrust::host_vector<T> DeviceVector;
+    typedef HostVector::iterator   HostIterator;
+    typedef DeviceVector::iterator DeviceIterator;
+
+    HostVector h_source(4,10);
+    HostVector h_indices(4);
+    HostVector h_output(8);
+    
+    DeviceVector d_source(4,10);
+    DeviceVector d_indices(4);
+    DeviceVector d_output(8);
+
+    // initialize source
+    thrust::sequence(h_output.begin(), h_output.end(), 1);
+    thrust::sequence(d_output.begin(), d_output.end(), 1);
+
+    h_indices[0] = d_indices[0] = 3;
+    h_indices[1] = d_indices[1] = 0;
+    h_indices[2] = d_indices[2] = 5;
+    h_indices[3] = d_indices[3] = 7;
+   
+    thrust::permutation_iterator<HostIterator,   HostIterator>   p_h_output(h_output.begin(), h_indices.begin());
+    thrust::permutation_iterator<DeviceIterator, DeviceIterator> p_d_output(d_output.begin(), d_indices.begin());
+
+    // scatter host->device
+    thrust::copy(h_source.begin(), h_source.end(), p_d_output);
+
+    ASSERT_EQUAL(d_output[0], 10);
+    ASSERT_EQUAL(d_output[1],  2);
+    ASSERT_EQUAL(d_output[2],  3);
+    ASSERT_EQUAL(d_output[3], 10);
+    ASSERT_EQUAL(d_output[4],  5);
+    ASSERT_EQUAL(d_output[5], 10);
+    ASSERT_EQUAL(d_output[6],  7);
+    ASSERT_EQUAL(d_output[7], 10);
+    
+    // scatter device->host
+    thrust::copy(d_source.begin(), d_source.end(), p_h_output);
+
+    ASSERT_EQUAL(h_output[0], 10);
+    ASSERT_EQUAL(h_output[1],  2);
+    ASSERT_EQUAL(h_output[2],  3);
+    ASSERT_EQUAL(h_output[3], 10);
+    ASSERT_EQUAL(h_output[4],  5);
+    ASSERT_EQUAL(h_output[5], 10);
+    ASSERT_EQUAL(h_output[6],  7);
+    ASSERT_EQUAL(h_output[7], 10);
+}
+DECLARE_UNITTEST(TestPermutationIteratorHostDeviceScatter);
+
+template <typename Vector>
+void TestPermutationIteratorWithCountingIterator(void)
+{
+  typedef typename Vector::value_type T;
+  
+  typename thrust::counting_iterator<T> input(0), index(0);
+
+  // test copy()
+  {
+    Vector output(4,0);
+
+    thrust::copy(thrust::make_permutation_iterator(input, index),
+                 thrust::make_permutation_iterator(input, index + output.size()),
+                 output.begin());
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 2);
+    ASSERT_EQUAL(output[3], 3);
+  }
+
+  // test copy()
+  {
+    Vector output(4,0);
+
+    thrust::transform(thrust::make_permutation_iterator(input, index),
+                      thrust::make_permutation_iterator(input, index + 4),
+                      output.begin(),
+                      thrust::identity<T>());
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 1);
+    ASSERT_EQUAL(output[2], 2);
+    ASSERT_EQUAL(output[3], 3);
+  }
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorWithCountingIterator);
+
diff --git a/thrust/testing/preprocessor.cu b/thrust/testing/preprocessor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..643c9ad99c11bc3a40fa9ee28ac74c3ed51c5662
--- /dev/null
+++ b/thrust/testing/preprocessor.cu
@@ -0,0 +1,717 @@
+#include <unittest/unittest.h>
+#include <string>
+#include <thrust/detail/preprocessor.h>
+
+void test_pp_stringize()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(int))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello  world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE( hello  world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello  world ))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE( hello  world ))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello
+                                    world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE("hello world"))
+  , "\"hello world\""
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE('hello world'))
+  , "'hello world'"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE($%!&<->))
+  , "$%!&<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE($%!&""<->))
+  , "$%!&\"\"<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_STRINGIZE))
+  , "THRUST_PP_STRINGIZE"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_STRINGIZE(int)))
+  , "\"int\""
+  );
+}
+DECLARE_UNITTEST(test_pp_stringize);
+
+void test_pp_cat2()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(i, nt)))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello, world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello , world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2( hello, world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello,  world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello, world )))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello,
+                                                   world )))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello world, from thrust!)))
+  , "hello worldfrom thrust!"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(-, >)))
+  , "->"
+  );
+}
+DECLARE_UNITTEST(test_pp_cat2);
+
+#define THRUST_TEST_PP_EXPAND_TARGET() success
+
+#define THRUST_TEST_PP_EXPAND_ARGS() ()
+
+void test_pp_expand()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(int)))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello  world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND( hello  world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello  world )))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND( hello  world )))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello
+                                    world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND("hello world")))
+  , "\"hello world\""
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND('hello world')))
+  , "'hello world'"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND($%!&<->)))
+  , "$%!&<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND($%!&""<->)))
+  , "$%!&\"\"<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(THRUST_PP_EXPAND)))
+  , "THRUST_PP_EXPAND"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(THRUST_PP_EXPAND(int))))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(
+      THRUST_PP_CAT2(THRUST_TEST_, PP_EXPAND_TARGET)()
+    )))
+  , "success"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(
+      THRUST_TEST_PP_EXPAND_TARGET THRUST_TEST_PP_EXPAND_ARGS()
+    )))
+  , "success"
+  );
+}
+DECLARE_UNITTEST(test_pp_expand);
+
+#undef THRUST_TEST_PP_EXPAND_TARGET
+
+#undef THRUST_TEST_PP_EXPAND_ARGS
+
+void test_pp_arity()
+{
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY()
+  , 0
+  );
+
+  /* This bash script was used to generate these tests:
+
+    for arity in {0..62}
+    do
+      echo "  ASSERT_EQUAL("
+      echo "    THRUST_PP_ARITY("
+      echo "      `bash -c \"echo {0..${arity}} | tr ' ' ,\"`"
+      echo "    )"
+      echo "  , $((${arity} + 1))"
+      echo "  );"
+      echo
+    done
+  */
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0
+    )
+  , 1
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1
+    )
+  , 2
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2
+    )
+  , 3
+  );
+ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3
+    )
+  , 4
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4
+    )
+  , 5
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5
+    )
+  , 6
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6
+    )
+  , 7
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7
+    )
+  , 8
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8
+    )
+  , 9
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9
+    )
+  , 10
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10
+    )
+  , 11
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11
+    )
+  , 12
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12
+    )
+  , 13
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13
+    )
+  , 14
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+    )
+  , 15
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+    )
+  , 16
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
+    )
+  , 17
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
+    )
+  , 18
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
+    )
+  , 19
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+    )
+  , 20
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
+    )
+  , 21
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
+    )
+  , 22
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
+    )
+  , 23
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
+    )
+  , 24
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
+    )
+  , 25
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
+    )
+  , 26
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
+    )
+  , 27
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
+    )
+  , 28
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
+    )
+  , 29
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
+    )
+  , 30
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
+    )
+  , 31
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+    )
+  , 32
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
+    )
+  , 33
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
+    )
+  , 34
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
+    )
+  , 35
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35
+    )
+  , 36
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
+    )
+  , 37
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37
+    )
+  , 38
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
+    )
+  , 39
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+    )
+  , 40
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
+    )
+  , 41
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
+    )
+  , 42
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
+    )
+  , 43
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43
+    )
+  , 44
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
+    )
+  , 45
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
+    )
+  , 46
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46
+    )
+  , 47
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47
+    )
+  , 48
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
+    )
+  , 49
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
+    )
+  , 50
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
+    )
+  , 51
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51
+    )
+  , 52
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
+    )
+  , 53
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
+    )
+  , 54
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
+    )
+  , 55
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55
+    )
+  , 56
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
+    )
+  , 57
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57
+    )
+  , 58
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+    )
+  , 59
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+    )
+  , 60
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
+    )
+  , 61
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61
+    )
+  , 62
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
+    )
+  , 63
+  );
+}
+DECLARE_UNITTEST(test_pp_arity);
+
+#define THRUST_TEST_PP_DISPATCH_PLUS(...)                                     \
+  THRUST_PP_DISPATCH(THRUST_TEST_PP_DISPATCH_PLUS, __VA_ARGS__)               \
+  /**/
+#define THRUST_TEST_PP_DISPATCH_PLUS0()        0
+#define THRUST_TEST_PP_DISPATCH_PLUS1(x)       x
+#define THRUST_TEST_PP_DISPATCH_PLUS2(x, y)    x + y
+#define THRUST_TEST_PP_DISPATCH_PLUS3(x, y, z) x + y + z
+
+void test_pp_dispatch()
+{
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS()
+  , 0
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(0)
+  , 0
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(1, 2)
+  , 3
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(1, 2, 3)
+  , 6
+  );
+}
+DECLARE_UNITTEST(test_pp_dispatch);
+
+#undef THRUST_TEST_PP_DISPATCH_PLUS
+#undef THRUST_TEST_PP_DISPATCH_PLUS0
+#undef THRUST_TEST_PP_DISPATCH_PLUS1
+#undef THRUST_TEST_PP_DISPATCH_PLUS2
+#undef THRUST_TEST_PP_DISPATCH_PLUS3
+
diff --git a/thrust/testing/random.cu b/thrust/testing/random.cu
new file mode 100644
index 0000000000000000000000000000000000000000..53a165055bcf3226847905187cfa7c6f36156b47
--- /dev/null
+++ b/thrust/testing/random.cu
@@ -0,0 +1,932 @@
+#include <unittest/unittest.h>
+#include <thrust/random.h>
+#include <thrust/generate.h>
+#include <sstream>
+
+template<typename Engine>
+  struct ValidateEngine
+{
+  __host__ __device__
+  ValidateEngine(const typename Engine::result_type value_10000)
+    : m_value_10000(value_10000)
+  {}
+
+  __host__ __device__
+  bool operator()(void) const
+  {
+    Engine e;
+    e.discard(9999);
+
+    // get the 10Kth result
+    return e() == m_value_10000;
+  }
+
+  const typename Engine::result_type m_value_10000;
+}; // end ValidateEngine
+
+
+template<typename Engine,
+         bool trivial_min = (Engine::min == 0)>
+  struct ValidateEngineMin
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    Engine e;
+
+    bool result = true;
+
+    for(int i = 0; i < 10000; ++i)
+    {
+      result &= (e() >= Engine::min);
+    }
+
+    return result;
+  }
+}; // end ValidateEngineMin
+
+template<typename Engine>
+  struct ValidateEngineMin<Engine,true>
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    return true;
+  }
+};
+
+
+template<typename Engine>
+  struct ValidateEngineMax
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    Engine e;
+
+    bool result = true;
+
+    for(int i = 0; i < 10000; ++i)
+    {
+      result &= (e() <= Engine::max);
+    }
+
+    return result;
+  }
+}; // end ValidateEngineMax
+
+
+template<typename Engine>
+  struct ValidateEngineEqual
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    bool result = true;
+
+    // test from default constructor
+    Engine e0, e1;
+    result &= (e0 == e1);
+
+    // advance engines
+    e0.discard(10000);
+    e1.discard(10000);
+    result &= (e0 == e1);
+
+    // test from identical seeds
+    Engine e2(13), e3(13);
+    result &= (e2 == e3);
+
+    // test different seeds aren't equal
+    Engine e4(7), e5(13);
+    result &= !(e4 == e5);
+
+    // test reseeding engine to the same seed causes equality
+    e4.seed(13);
+    result &= (e4 == e5);
+
+    return result;
+  }
+};
+
+
+template<typename Engine>
+  struct ValidateEngineUnequal
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    bool result = true;
+
+    // test from default constructor
+    Engine e0, e1;
+    result &= !(e0 != e1);
+
+    // advance engines
+    e0.discard(1000);
+    e1.discard(1000);
+    result &= !(e0 != e1);
+
+    // test from identical seeds
+    Engine e2(13), e3(13);
+    result &= !(e2 != e3);
+
+    // test different seeds aren't equal
+    Engine e4(7), e5(13);
+    result &= (e4 != e5);
+
+    // test reseeding engine to the same seed causes equality
+    e4.seed(13);
+    result &= !(e4 != e5);
+
+    // test different discards causes inequality
+    Engine e6(13), e7(13);
+    e6.discard(500);
+    e7.discard(1000);
+    result &= (e6 != e7);
+
+    return result;
+  }
+};
+
+
+template<typename Distribution, typename Engine>
+  struct ValidateDistributionMin
+{
+  typedef Engine random_engine;
+
+  __host__ __device__
+  ValidateDistributionMin(const Distribution &dd)
+    : d(dd)
+  {}
+
+  __host__ __device__
+  bool operator()(void)
+  {
+    Engine e;
+
+    bool result = true;
+
+    for(int i = 0; i < 10000; ++i)
+    {
+      result &= (d(e) >= d.min());
+    }
+
+    return result;
+  }
+
+  Distribution d;
+};
+
+
+template<typename Distribution, typename Engine>
+  struct ValidateDistributionMax
+{
+  typedef Engine random_engine;
+
+  __host__ __device__
+  ValidateDistributionMax(const Distribution &dd)
+    : d(dd)
+  {}
+
+  __host__ __device__
+  bool operator()(void)
+  {
+    Engine e;
+
+    bool result = true;
+
+    for(int i = 0; i < 10000; ++i)
+    {
+      result &= (d(e) <= d.max());
+    }
+
+    return result;
+  }
+
+  Distribution d;
+};
+
+
+template<typename Distribution>
+  struct ValidateDistributionEqual
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    return d0 == d1;
+  }
+
+  Distribution d0, d1;
+};
+
+
+template<typename Distribution>
+  struct ValidateDistributionUnqual
+{
+  __host__ __device__
+  bool operator()(void) const
+  {
+    return d0 != d1;
+  }
+
+  Distribution d0, d1;
+};
+
+
+template<typename Engine, thrust::detail::uint64_t value_10000>
+void TestEngineValidation(void)
+{
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), ValidateEngine<Engine>(value_10000));
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), ValidateEngine<Engine>(value_10000));
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+
+template<typename Engine>
+void TestEngineMax(void)
+{
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), ValidateEngineMax<Engine>());
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), ValidateEngineMax<Engine>());
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+
+template<typename Engine>
+void TestEngineMin(void)
+{
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), ValidateEngineMin<Engine>());
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), ValidateEngineMin<Engine>());
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+
+template<typename Engine>
+void TestEngineSaveRestore(void)
+{
+  // create a default engine
+  Engine e0;
+
+  // run it for a while
+  e0.discard(10000);
+
+  // save it
+  std::stringstream ss;
+  ss << e0;
+
+  // run it a while longer
+  e0.discard(10000);
+
+  // restore old state
+  Engine e1;
+  ss >> e1;
+
+  // run e1 a while longer
+  e1.discard(10000);
+
+  // both should return the same result
+
+  ASSERT_EQUAL(e0(), e1());
+}
+
+
+template<typename Engine>
+void TestEngineEqual(void)
+{
+  ValidateEngineEqual<Engine> f;
+
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), f);
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), f);
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+
+template<typename Engine>
+void TestEngineUnequal(void)
+{
+  ValidateEngineUnequal<Engine> f;
+
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), f);
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), f);
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+void TestRanlux24BaseValidation(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineValidation<Engine,7937952u>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseValidation);
+
+
+void TestRanlux24BaseMin(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseMin);
+
+
+void TestRanlux24BaseMax(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseMax);
+
+
+void TestRanlux24BaseSaveRestore(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseSaveRestore);
+
+
+void TestRanlux24BaseEqual(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseEqual);
+
+
+void TestRanlux24BaseUnequal(void)
+{
+  typedef thrust::random::ranlux24_base Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24BaseUnequal);
+
+
+void TestRanlux48BaseValidation(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineValidation<Engine,192113843633948ull>();
+}
+DECLARE_UNITTEST(TestRanlux48BaseValidation);
+
+
+void TestRanlux48BaseMin(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48BaseMin);
+
+
+void TestRanlux48BaseMax(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48BaseMax);
+
+
+void TestRanlux48BaseSaveRestore(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48BaseSaveRestore);
+
+
+void TestRanlux48BaseEqual(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48BaseEqual);
+
+
+#if defined(__INTEL_COMPILER) && 1800 >= __INTEL_COMPILER
+void TestRanlux48BaseUnequal(void)
+{
+    // ICPC has a known failure with this test.
+    // See nvbug 200414000.
+    KNOWN_FAILURE;
+}
+#else
+void TestRanlux48BaseUnequal(void)
+{
+  typedef thrust::random::ranlux48_base Engine;
+
+  TestEngineUnequal<Engine>();
+}
+#endif
+DECLARE_UNITTEST(TestRanlux48BaseUnequal);
+
+
+void TestMinstdRandValidation(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineValidation<Engine,399268537u>();
+}
+DECLARE_UNITTEST(TestMinstdRandValidation);
+
+
+void TestMinstdRandMin(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRandMin);
+
+
+void TestMinstdRandMax(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRandMax);
+
+
+void TestMinstdRandSaveRestore(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRandSaveRestore);
+
+
+void TestMinstdRandEqual(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRandEqual);
+
+
+void TestMinstdRandUnequal(void)
+{
+  typedef thrust::random::minstd_rand Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRandUnequal);
+
+
+void TestMinstdRand0Validation(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineValidation<Engine,1043618065u>();
+}
+DECLARE_UNITTEST(TestMinstdRand0Validation);
+
+
+void TestMinstdRand0Min(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRand0Min);
+
+
+void TestMinstdRand0Max(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRand0Max);
+
+
+void TestMinstdRand0SaveRestore(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRand0SaveRestore);
+
+
+void TestMinstdRand0Equal(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRand0Equal);
+
+
+void TestMinstdRand0Unequal(void)
+{
+  typedef thrust::random::minstd_rand0 Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestMinstdRand0Unequal);
+
+
+void TestTaus88Validation(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineValidation<Engine,3535848941ull>();
+}
+DECLARE_UNITTEST(TestTaus88Validation);
+
+
+void TestTaus88Min(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestTaus88Min);
+
+
+void TestTaus88Max(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestTaus88Max);
+
+
+void TestTaus88SaveRestore(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestTaus88SaveRestore);
+
+
+void TestTaus88Equal(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestTaus88Equal);
+
+
+void TestTaus88Unequal(void)
+{
+  typedef thrust::random::taus88 Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestTaus88Unequal);
+
+
+void TestRanlux24Validation(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineValidation<Engine,9901578>();
+}
+DECLARE_UNITTEST(TestRanlux24Validation);
+
+
+void TestRanlux24Min(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24Min);
+
+
+void TestRanlux24Max(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24Max);
+
+
+void TestRanlux24SaveRestore(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24SaveRestore);
+
+
+void TestRanlux24Equal(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24Equal);
+
+
+void TestRanlux24Unequal(void)
+{
+  typedef thrust::random::ranlux24 Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux24Unequal);
+
+
+
+void TestRanlux48Validation(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineValidation<Engine,88229545517833ull>();
+}
+DECLARE_UNITTEST(TestRanlux48Validation);
+
+
+void TestRanlux48Min(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineMin<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48Min);
+
+
+void TestRanlux48Max(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineMax<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48Max);
+
+
+void TestRanlux48SaveRestore(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineSaveRestore<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48SaveRestore);
+
+
+void TestRanlux48Equal(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineEqual<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48Equal);
+
+
+void TestRanlux48Unequal(void)
+{
+  typedef thrust::random::ranlux48 Engine;
+
+  TestEngineUnequal<Engine>();
+}
+DECLARE_UNITTEST(TestRanlux48Unequal);
+
+
+template<typename Distribution, typename Validator>
+  void ValidateDistributionCharacteristic(void)
+{
+  typedef typename Validator::random_engine Engine;
+
+  // test default-constructed Distribution
+
+  // test host
+  thrust::host_vector<bool> h(1);
+  thrust::generate(h.begin(), h.end(), Validator(Distribution()));
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::device_vector<bool> d(1);
+  thrust::generate(d.begin(), d.end(), Validator(Distribution()));
+
+  ASSERT_EQUAL(true, d[0]);
+
+
+  // test distribution & engine with comparable ranges
+  // only do this if they have the same result_type
+  if(thrust::detail::is_same<typename Distribution::result_type, typename Engine::result_type>::value)
+  {
+    // test Distribution with same range as engine
+
+    // test host
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305)
+    thrust::generate(h.begin(), h.end(), Validator(
+        Distribution(Engine::min, Engine::max)
+    ));
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
+
+    ASSERT_EQUAL(true, h[0]);
+
+    // test device
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305)
+    thrust::generate(d.begin(), d.end(), Validator(
+        Distribution(Engine::min, Engine::max)
+    ));
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
+
+    ASSERT_EQUAL(true, d[0]);
+
+    // test Distribution with smaller range than engine
+
+    // test host
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305) // Truncation warning.
+    typename Distribution::result_type engine_range = Engine::max - Engine::min;
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
+    thrust::generate(h.begin(), h.end(), Validator(Distribution(engine_range/3, (2 * engine_range)/3)));
+
+    ASSERT_EQUAL(true, h[0]);
+
+    // test device
+    thrust::generate(d.begin(), d.end(), Validator(Distribution(engine_range/3, (2 * engine_range)/3)));
+
+    ASSERT_EQUAL(true, d[0]);
+  }
+
+
+  // test Distribution with a very small range
+
+  // test host
+  thrust::generate(h.begin(), h.end(), Validator(Distribution(1,6)));
+
+  ASSERT_EQUAL(true, h[0]);
+
+  // test device
+  thrust::generate(d.begin(), d.end(), Validator(Distribution(1,6)));
+
+  ASSERT_EQUAL(true, d[0]);
+}
+
+
+template<typename Distribution>
+  void TestDistributionSaveRestore(void)
+{
+  // create a default distribution
+  Distribution d0(7, 13);
+
+  // save it
+  std::stringstream ss;
+  ss << d0;
+
+  // restore old state
+  Distribution d1;
+  ss >> d1;
+
+  ASSERT_EQUAL(d0, d1);
+}
+
+
+void TestUniformIntDistributionMin(void)
+{
+  typedef thrust::random::uniform_int_distribution<int>          int_dist;
+  typedef thrust::random::uniform_int_distribution<unsigned int> uint_dist;
+  
+  ValidateDistributionCharacteristic<int_dist,  ValidateDistributionMin<int_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<uint_dist, ValidateDistributionMin<uint_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestUniformIntDistributionMin);
+
+
+void TestUniformIntDistributionMax(void)
+{
+  typedef thrust::random::uniform_int_distribution<int>          int_dist;
+  typedef thrust::random::uniform_int_distribution<unsigned int> uint_dist;
+  
+  ValidateDistributionCharacteristic<int_dist,  ValidateDistributionMax<int_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<uint_dist, ValidateDistributionMax<uint_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestUniformIntDistributionMax);
+
+
+void TestUniformIntDistributionSaveRestore(void)
+{
+  typedef thrust::random::uniform_int_distribution<int>          int_dist;
+  typedef thrust::random::uniform_int_distribution<unsigned int> uint_dist;
+
+  TestDistributionSaveRestore<int_dist>();
+  TestDistributionSaveRestore<uint_dist>();
+}
+DECLARE_UNITTEST(TestUniformIntDistributionSaveRestore);
+
+
+void TestUniformRealDistributionMin(void)
+{
+  typedef thrust::random::uniform_real_distribution<float>  float_dist;
+  typedef thrust::random::uniform_real_distribution<double> double_dist;
+  
+  ValidateDistributionCharacteristic<float_dist,  ValidateDistributionMin<float_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<double_dist, ValidateDistributionMin<double_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestUniformRealDistributionMin);
+
+
+void TestUniformRealDistributionMax(void)
+{
+  typedef thrust::random::uniform_real_distribution<float>  float_dist;
+  typedef thrust::random::uniform_real_distribution<double> double_dist;
+  
+  ValidateDistributionCharacteristic<float_dist,  ValidateDistributionMax<float_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<double_dist, ValidateDistributionMax<double_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestUniformRealDistributionMax);
+
+
+void TestUniformRealDistributionSaveRestore(void)
+{
+  typedef thrust::random::uniform_real_distribution<float>  float_dist;
+  typedef thrust::random::uniform_real_distribution<double> double_dist;
+
+  TestDistributionSaveRestore<float_dist>();
+  TestDistributionSaveRestore<double_dist>();
+}
+DECLARE_UNITTEST(TestUniformRealDistributionSaveRestore);
+
+
+void TestNormalDistributionMin(void)
+{
+  typedef thrust::random::normal_distribution<float>  float_dist;
+  typedef thrust::random::normal_distribution<double> double_dist;
+  
+  ValidateDistributionCharacteristic<float_dist,  ValidateDistributionMin<float_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<double_dist, ValidateDistributionMin<double_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestNormalDistributionMin);
+
+
+void TestNormalDistributionMax(void)
+{
+  typedef thrust::random::normal_distribution<float>  float_dist;
+  typedef thrust::random::normal_distribution<double> double_dist;
+  
+  ValidateDistributionCharacteristic<float_dist,  ValidateDistributionMax<float_dist,  thrust::minstd_rand> >();
+  ValidateDistributionCharacteristic<double_dist, ValidateDistributionMax<double_dist, thrust::minstd_rand> >();
+}
+DECLARE_UNITTEST(TestNormalDistributionMax);
+
+
+void TestNormalDistributionSaveRestore(void)
+{
+  typedef thrust::random::normal_distribution<float>  float_dist;
+  typedef thrust::random::normal_distribution<double> double_dist;
+
+  TestDistributionSaveRestore<float_dist>();
+  TestDistributionSaveRestore<double_dist>();
+}
+DECLARE_UNITTEST(TestNormalDistributionSaveRestore);
+
diff --git a/thrust/testing/reduce.cu b/thrust/testing/reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cb08bc8897be7595972e0f2751396d5e2476c2a7
--- /dev/null
+++ b/thrust/testing/reduce.cu
@@ -0,0 +1,232 @@
+#include <unittest/unittest.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <limits>
+
+template<typename T>
+  struct plus_mod_10
+{
+  __host__ __device__
+  T operator()(T lhs, T rhs) const
+  {
+    return ((lhs % 10) + (rhs % 10)) % 10;
+  }
+};
+
+template<typename T>
+struct is_equal_div_10_reduce
+{
+    __host__ __device__
+    bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+template <class Vector>
+void TestReduceSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(3);
+    v[0] = 1; v[1] = -2; v[2] = 3;
+
+    // no initializer
+    ASSERT_EQUAL(thrust::reduce(v.begin(), v.end()), 2);
+
+    // with initializer
+    ASSERT_EQUAL(thrust::reduce(v.begin(), v.end(), (T) 10), 12);
+}
+DECLARE_VECTOR_UNITTEST(TestReduceSimple);
+
+
+template<typename InputIterator>
+int reduce(my_system &system, InputIterator, InputIterator)
+{
+    system.validate_dispatch();
+    return 13;
+}
+
+void TestReduceDispatchExplicit()
+{
+    thrust::device_vector<int> vec;
+
+    my_system sys(0);
+    thrust::reduce(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReduceDispatchExplicit);
+
+
+template<typename InputIterator>
+int reduce(my_tag, InputIterator, InputIterator)
+{
+    return 13;
+}
+
+void TestReduceDispatchImplicit()
+{
+    thrust::device_vector<int> vec;
+
+    int result = thrust::reduce(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, result);
+}
+DECLARE_UNITTEST(TestReduceDispatchImplicit);
+
+
+template <typename T>
+struct TestReduce
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        T init = 13;
+
+        T h_result = thrust::reduce(h_data.begin(), h_data.end(), init);
+        T d_result = thrust::reduce(d_data.begin(), d_data.end(), init);
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestReduce, IntegralTypes> TestReduceInstance;
+
+
+template <class IntVector, class FloatVector>
+void TestReduceMixedTypes(void)
+{
+    // make sure we get types for default args and operators correct
+    IntVector int_input(4);
+    int_input[0] = 1;
+    int_input[1] = 2;
+    int_input[2] = 3;
+    int_input[3] = 4;
+
+    FloatVector float_input(4);
+    float_input[0] = 1.5;
+    float_input[1] = 2.5;
+    float_input[2] = 3.5;
+    float_input[3] = 4.5;
+
+    // float -> int should use using plus<int> operator by default
+    ASSERT_EQUAL(thrust::reduce(float_input.begin(), float_input.end(), (int) 0), 10);
+
+    // int -> float should use using plus<float> operator by default
+    ASSERT_EQUAL(thrust::reduce(int_input.begin(), int_input.end(), (float) 0.5), 10.5);
+}
+void TestReduceMixedTypesHost(void)
+{
+    TestReduceMixedTypes< thrust::host_vector<int>, thrust::host_vector<float> >();
+}
+DECLARE_UNITTEST(TestReduceMixedTypesHost);
+void TestReduceMixedTypesDevice(void)
+{
+    TestReduceMixedTypes< thrust::device_vector<int>, thrust::device_vector<float> >();
+}
+DECLARE_UNITTEST(TestReduceMixedTypesDevice);
+
+
+template <typename T>
+struct TestReduceWithOperator
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        T init = 3;
+
+        T cpu_result = thrust::reduce(h_data.begin(), h_data.end(), init, plus_mod_10<T>());
+        T gpu_result = thrust::reduce(d_data.begin(), d_data.end(), init, plus_mod_10<T>());
+
+        ASSERT_EQUAL(cpu_result, gpu_result);
+    }
+};
+VariableUnitTest<TestReduceWithOperator, UnsignedIntegralTypes> TestReduceWithOperatorInstance;
+
+
+template <typename T>
+struct plus_mod3
+{
+    T * table;
+
+    plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    T operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+template <typename Vector>
+void TestReduceWithIndirection(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    T result = thrust::reduce(data.begin(), data.end(), T(0), plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+
+    ASSERT_EQUAL(result, T(1));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestReduceWithIndirection);
+
+template<typename T>
+  void TestReduceCountingIterator()
+{
+  size_t const n = 15 * sizeof(T);
+
+  ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+  thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
+  thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
+
+  T init = unittest::random_integer<T>();
+
+  T h_result = thrust::reduce(h_first, h_first + n, init);
+  T d_result = thrust::reduce(d_first, d_first + n, init);
+
+  // we use ASSERT_ALMOST_EQUAL because we're testing floating point types
+  ASSERT_ALMOST_EQUAL(h_result, d_result);
+}
+DECLARE_GENERIC_UNITTEST(TestReduceCountingIterator);
+
+void TestReduceWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::reduce(thrust::device, begin, end);
+
+    ASSERT_EQUAL(result, 1ll << magnitude);
+}
+
+void TestReduceWithBigIndexes()
+{
+    TestReduceWithBigIndexesHelper(30);
+    TestReduceWithBigIndexesHelper(31);
+    TestReduceWithBigIndexesHelper(32);
+    TestReduceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestReduceWithBigIndexes);
diff --git a/thrust/testing/reduce_by_key.cu b/thrust/testing/reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f8539c0664194dc21678b8a3afccefdc8398fb0c
--- /dev/null
+++ b/thrust/testing/reduce_by_key.cu
@@ -0,0 +1,264 @@
+#include <unittest/unittest.h>
+#include <thrust/reduce.h>
+#include <thrust/unique.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+template<typename T>
+struct is_equal_div_10_reduce
+{
+    __host__ __device__
+    bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+template <typename Vector>
+void initialize_keys(Vector& keys)
+{
+    keys.resize(9);
+    keys[0] = 11;
+    keys[1] = 11;
+    keys[2] = 21;
+    keys[3] = 20;
+    keys[4] = 21;
+    keys[5] = 21;
+    keys[6] = 21;
+    keys[7] = 37;
+    keys[8] = 37;
+}
+
+template <typename Vector>
+void initialize_values(Vector& values)
+{
+    values.resize(9);
+    values[0] = 0; 
+    values[1] = 1;
+    values[2] = 2;
+    values[3] = 3;
+    values[4] = 4;
+    values[5] = 5;
+    values[6] = 6;
+    values[7] = 7;
+    values[8] = 8;
+}
+
+
+template<typename Vector>
+void TestReduceByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector keys;
+    Vector values;
+
+    typename thrust::pair<typename Vector::iterator, typename Vector::iterator> new_last;
+
+    // basic test
+    initialize_keys(keys);  initialize_values(values);
+
+    Vector output_keys(keys.size());
+    Vector output_values(values.size());
+
+    new_last = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+
+    ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+    ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+    ASSERT_EQUAL(output_keys[0], 11);
+    ASSERT_EQUAL(output_keys[1], 21);
+    ASSERT_EQUAL(output_keys[2], 20);
+    ASSERT_EQUAL(output_keys[3], 21);
+    ASSERT_EQUAL(output_keys[4], 37);
+    
+    ASSERT_EQUAL(output_values[0],  1);
+    ASSERT_EQUAL(output_values[1],  2);
+    ASSERT_EQUAL(output_values[2],  3);
+    ASSERT_EQUAL(output_values[3], 15);
+    ASSERT_EQUAL(output_values[4], 15);
+
+    // test BinaryPredicate
+    initialize_keys(keys);  initialize_values(values);
+    
+    new_last = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
+
+    ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+    ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+    ASSERT_EQUAL(output_keys[0], 11);
+    ASSERT_EQUAL(output_keys[1], 21);
+    ASSERT_EQUAL(output_keys[2], 37);
+    
+    ASSERT_EQUAL(output_values[0],  1);
+    ASSERT_EQUAL(output_values[1], 20);
+    ASSERT_EQUAL(output_values[2], 15);
+
+    // test BinaryFunction
+    initialize_keys(keys);  initialize_values(values);
+
+    new_last = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
+
+    ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+    ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+    ASSERT_EQUAL(output_keys[0], 11);
+    ASSERT_EQUAL(output_keys[1], 21);
+    ASSERT_EQUAL(output_keys[2], 20);
+    ASSERT_EQUAL(output_keys[3], 21);
+    ASSERT_EQUAL(output_keys[4], 37);
+    
+    ASSERT_EQUAL(output_values[0],  1);
+    ASSERT_EQUAL(output_values[1],  2);
+    ASSERT_EQUAL(output_values[2],  3);
+    ASSERT_EQUAL(output_values[3], 15);
+    ASSERT_EQUAL(output_values[4], 15);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestReduceByKeySimple);
+
+template<typename K>
+struct TestReduceByKey
+{
+    void operator()(const size_t n)
+    {
+        typedef unsigned int V; // ValueType
+
+        thrust::host_vector<K>   h_keys = unittest::random_integers<bool>(n);
+        thrust::host_vector<V>   h_vals = unittest::random_integers<V>(n);
+        thrust::device_vector<K> d_keys = h_keys;
+        thrust::device_vector<V> d_vals = h_vals;
+
+        thrust::host_vector<K>   h_keys_output(n);
+        thrust::host_vector<V>   h_vals_output(n);
+        thrust::device_vector<K> d_keys_output(n);
+        thrust::device_vector<V> d_vals_output(n);
+
+        typedef typename thrust::host_vector<K>::iterator   HostKeyIterator;
+        typedef typename thrust::host_vector<V>::iterator   HostValIterator;
+        typedef typename thrust::device_vector<K>::iterator DeviceKeyIterator;
+        typedef typename thrust::device_vector<V>::iterator DeviceValIterator;
+
+        typedef typename thrust::pair<HostKeyIterator,  HostValIterator>   HostIteratorPair;
+        typedef typename thrust::pair<DeviceKeyIterator,DeviceValIterator> DeviceIteratorPair;
+
+        HostIteratorPair   h_last = thrust::reduce_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_keys_output.begin(), h_vals_output.begin());
+        DeviceIteratorPair d_last = thrust::reduce_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys_output.begin(), d_vals_output.begin());
+
+        ASSERT_EQUAL(h_last.first  - h_keys_output.begin(), d_last.first  - d_keys_output.begin());
+        ASSERT_EQUAL(h_last.second - h_vals_output.begin(), d_last.second - d_vals_output.begin());
+       
+        size_t N = h_last.first - h_keys_output.begin();
+
+        h_keys_output.resize(N);
+        h_vals_output.resize(N);
+        d_keys_output.resize(N);
+        d_vals_output.resize(N);
+
+        ASSERT_EQUAL(h_keys_output, d_keys_output);
+        ASSERT_EQUAL(h_vals_output, d_vals_output);
+    }
+};
+VariableUnitTest<TestReduceByKey, IntegralTypes> TestReduceByKeyInstance;
+
+template<typename K>
+struct TestReduceByKeyToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        typedef unsigned int V; // ValueType
+
+        thrust::host_vector<K>   h_keys = unittest::random_integers<bool>(n);
+        thrust::host_vector<V>   h_vals = unittest::random_integers<V>(n);
+        thrust::device_vector<K> d_keys = h_keys;
+        thrust::device_vector<V> d_vals = h_vals;
+
+        thrust::host_vector<K>   h_keys_output(n);
+        thrust::host_vector<V>   h_vals_output(n);
+        thrust::device_vector<K> d_keys_output(n);
+        thrust::device_vector<V> d_vals_output(n);
+
+        thrust::host_vector<K> unique_keys = h_keys;
+        unique_keys.erase(thrust::unique(unique_keys.begin(), unique_keys.end()), unique_keys.end());
+
+        // discard key output
+        size_t h_size =
+          thrust::reduce_by_key(h_keys.begin(), h_keys.end(),
+                                h_vals.begin(),
+                                thrust::make_discard_iterator(),
+                                h_vals_output.begin()).second - h_vals_output.begin();
+
+        size_t d_size =
+          thrust::reduce_by_key(d_keys.begin(), d_keys.end(),
+                                d_vals.begin(),
+                                thrust::make_discard_iterator(),
+                                d_vals_output.begin()).second - d_vals_output.begin();
+
+        h_vals_output.resize(h_size);
+        d_vals_output.resize(d_size);
+
+        ASSERT_EQUAL(h_vals_output.size(), unique_keys.size());
+        ASSERT_EQUAL(d_vals_output.size(), unique_keys.size());
+        ASSERT_EQUAL(d_vals_output.size(), h_vals_output.size());
+    }
+};
+VariableUnitTest<TestReduceByKeyToDiscardIterator, IntegralTypes> TestReduceByKeyToDiscardIteratorInstance;
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+reduce_by_key(my_system &system,
+              InputIterator1, 
+              InputIterator1,
+              InputIterator2,
+              OutputIterator1 keys_output,
+              OutputIterator2 values_output)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(keys_output, values_output);
+}
+
+void TestReduceByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::reduce_by_key(sys,
+                          vec.begin(),
+                          vec.begin(),
+                          vec.begin(),
+                          vec.begin(),
+                          vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReduceByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+reduce_by_key(my_tag,
+              InputIterator1, 
+              InputIterator1,
+              InputIterator2,
+              OutputIterator1 keys_output,
+              OutputIterator2 values_output)
+{
+    *keys_output = 13;
+    return thrust::make_pair(keys_output, values_output);
+}
+
+void TestReduceByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::reduce_by_key(thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReduceByKeyDispatchImplicit);
+
diff --git a/thrust/testing/reduce_large.cu b/thrust/testing/reduce_large.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cfe2d0973da8111a5b0d643bd493f905455b34ad
--- /dev/null
+++ b/thrust/testing/reduce_large.cu
@@ -0,0 +1,37 @@
+#include <unittest/unittest.h>
+#include <thrust/reduce.h>
+
+
+template <typename T, unsigned int N>
+void _TestReduceWithLargeTypes(void)
+{
+    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_data(n);
+
+    for(size_t i = 0; i < h_data.size(); i++)
+        h_data[i] = FixedVector<T,N>(i);
+
+    thrust::device_vector< FixedVector<T,N> > d_data = h_data;
+    
+    FixedVector<T,N> h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector<T,N>(0));
+    FixedVector<T,N> d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector<T,N>(0));
+
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+
+void TestReduceWithLargeTypes(void)
+{
+  _TestReduceWithLargeTypes<int,    4>();
+  _TestReduceWithLargeTypes<int,    8>();
+  _TestReduceWithLargeTypes<int,   16>();
+
+  // XXX these take too long to compile
+  //  _TestReduceWithLargeTypes<int,   32>();
+  //  _TestReduceWithLargeTypes<int,   64>();
+  //  _TestReduceWithLargeTypes<int,  128>(); 
+  //  _TestReduceWithLargeTypes<int,  256>();
+  //  _TestReduceWithLargeTypes<int,  512>();
+}
+DECLARE_UNITTEST(TestReduceWithLargeTypes);
+
diff --git a/thrust/testing/regression/CMakeLists.txt b/thrust/testing/regression/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eea8b3a4571fac8b1a26ff7d855038e06597712f
--- /dev/null
+++ b/thrust/testing/regression/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Disabled as these test names are too long for CMAKE_OBJECT_PATH_MAX.
+# We should integrate these with the other unit tests.
+# See issue #1205.
+#
+return()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "regression.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/thrust/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu b/thrust/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..01308aa270cb8390618c141087aeecf6648492a1
--- /dev/null
+++ b/thrust/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu
@@ -0,0 +1,26 @@
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+int main() {
+  const int N = 100;
+  thrust::device_ptr<int> input_key_A1;
+  thrust::device_ptr<float> input_val_A1;
+  thrust::device_ptr<int> input_key_B1;
+  thrust::device_ptr<float> input_val_B1;
+  thrust::device_ptr<int> output_key;
+  thrust::device_ptr<float> output_val;
+
+  // use key tuples (with one element to keep it simple)
+  auto input_key_tuple_A = thrust::make_tuple(input_key_A1);
+  auto input_key_tuple_B = thrust::make_tuple(input_key_B1);
+  auto output_key_tuple = thrust::make_tuple(output_key);
+  // use zip iterator to zip together elements of a tuple (each is an iterator)
+  auto zip_it_A = thrust::make_zip_iterator(input_key_tuple_A);
+  auto zip_it_B = thrust::make_zip_iterator(input_key_tuple_B);
+  auto zip_it_out = thrust::make_zip_iterator(output_key_tuple);
+
+  // does NOT compile in CUDA 9.1 (compiles fine in CUDA 8)
+  thrust::merge_by_key(zip_it_A, zip_it_A + N, zip_it_B, zip_it_B + N, input_val_A1, input_val_B1, zip_it_out, output_val);
+
+  return 0;
+}
+
diff --git a/thrust/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu b/thrust/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3904933f3f91a1b6e61431b8b1ba5e7b66d05ab7
--- /dev/null
+++ b/thrust/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu
@@ -0,0 +1,40 @@
+#include <thrust/device_vector.h>
+#include <thrust/complex.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/copy.h>
+#include <thrust/gather.h>
+   
+struct greater_than_5 
+{
+  template <typename T>
+  __host__ __device__
+  bool operator()(T val)
+  {
+    return abs(val) > 5;
+  }
+};
+ 
+int main()
+{
+  typedef thrust::complex<float> T;
+
+  thrust::device_vector<T> d(10);
+  thrust::sequence(d.begin(), d.end());
+  thrust::device_vector<T> r(10);
+
+  thrust::counting_iterator<int> c_begin(0); 
+  thrust::counting_iterator<int> c_end(c_begin + 10); 
+
+  thrust::device_vector<int> idxs(10);
+
+  thrust::copy_if(
+    thrust::make_zip_iterator(thrust::make_tuple(c_begin, d.begin()))
+  , thrust::make_zip_iterator(thrust::make_tuple(c_end, d.end()))
+  , d.begin()
+  , thrust::make_zip_iterator(thrust::make_tuple(idxs.begin(), r.begin()))
+  , greater_than_5{}
+  );
+}
diff --git a/thrust/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu b/thrust/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba422be605252f0394736cafecf703eaca1da884
--- /dev/null
+++ b/thrust/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu
@@ -0,0 +1,10 @@
+#include <thrust/device_vector.h>
+#include <thrust/complex.h>
+#include <thrust/reduce.h>
+
+int main()
+{
+  thrust::device_vector<thrust::complex<double> > d(5);
+  thrust::reduce(d.begin(), d.end());
+}
+
diff --git a/thrust/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu b/thrust/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e59633bb166df3fbf64e08c75c9318ac60ffda4
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu
@@ -0,0 +1,20 @@
+#include <thrust/reduce.h> 
+#include <thrust/iterator/constant_iterator.h> 
+
+#include <assert.h>
+#include <iostream>
+ 
+int main()
+{ 
+  long long n = 10000000000; 
+
+  long long r = thrust::reduce(
+    thrust::constant_iterator<long long>(0)
+  , thrust::constant_iterator<long long>(n)
+  ); 
+
+  std::cout << r << std::endl;
+
+  assert(r == n);
+}
+ 
diff --git a/thrust/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu b/thrust/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..646fdc558d7dc18d20897ed5391b498cc732a425
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu
@@ -0,0 +1,35 @@
+#include <thrust/device_vector.h>
+#include <thrust/merge.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+struct comp
+{
+  template<typename Tuple1, typename Tuple2>
+  __host__ __device__
+  bool operator()(const Tuple1& t1, const Tuple2& t2) 
+  {
+    return thrust::get<0>(t1) == thrust::get<1>(t2);
+  }
+};
+
+int main()
+{
+    typedef thrust::device_vector<int> Vector;
+
+    Vector second(10), third(5), fourth(5), indices(15);
+
+    thrust::merge_by_key(thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<int>(12), second.begin())),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<int>(12), second.begin())) + 10, 
+                         thrust::make_zip_iterator(thrust::make_tuple(third.begin(), fourth.begin())),
+                         thrust::make_zip_iterator(thrust::make_tuple(third.begin(), fourth.begin())) + 5,
+                         thrust::counting_iterator<int>(0),
+                         thrust::counting_iterator<int>(10),
+                         thrust::make_discard_iterator(),
+                         indices.begin(),
+                         comp());
+
+    return 0;
+}
+ 
diff --git a/thrust/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu b/thrust/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c01c0ad4eda9084f4dcc516301d829a08d8ee7b6
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu
@@ -0,0 +1,5 @@
+// nvcc -Xcompiler -Wall -Xcompiler -Werror -ccbin=clang
+
+#include <thrust/system/cuda/detail/core/util.h>
+
+int main() {}
diff --git a/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f06945328c1324f6ee62d89a2f444ee9c07fee14
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu
@@ -0,0 +1,20 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{  
+  return make_uint2(a.x + b.x, a.y + b.y); 
+} 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero);  
+  
+  return 0;  
+}
+ 
diff --git a/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f987c2f3f27c2bda632dcb73d71ce272174764d3
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu
@@ -0,0 +1,22 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+struct uint2_adder 
+{ 
+  __host__ __device__ uint2 operator()(uint2 a, uint2 b) {  
+    return make_uint2(a.x + b.x, a.y + b.y); 
+  } 
+}; 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero, uint2_adder());  
+  
+  return 0;  
+}
+ 
diff --git a/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4ccf67d39c72275820c7f42e4b24cafc93876b74
--- /dev/null
+++ b/thrust/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu
@@ -0,0 +1,20 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{  
+  return make_uint2(a.x + b.x, a.y + b.y); 
+} 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero, operator+);  
+  
+  return 0;  
+}
+ 
diff --git a/thrust/testing/remove.cu b/thrust/testing/remove.cu
new file mode 100644
index 0000000000000000000000000000000000000000..95b679dc774b8af077d9db872e0eb563c86365a8
--- /dev/null
+++ b/thrust/testing/remove.cu
@@ -0,0 +1,761 @@
+#include <unittest/unittest.h>
+#include <thrust/remove.h>
+#include <thrust/count.h>
+#include <thrust/functional.h>
+#include <stdexcept>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename T>
+struct is_even
+  : thrust::unary_function<T,bool>
+{
+    __host__ __device__
+    bool operator()(T x) { return (static_cast<unsigned int>(x) & 1) == 0; }
+};
+
+template<typename T>
+struct is_true
+  : thrust::unary_function<T,bool>
+{
+    __host__ __device__
+    bool operator()(T x) { return x ? true : false; }
+};
+
+template<typename Vector>
+void TestRemoveSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    typename Vector::iterator end = thrust::remove(data.begin(),
+                                                    data.end(),
+                                                    (T) 2);
+
+    ASSERT_EQUAL(end - data.begin(), 3);
+
+    ASSERT_EQUAL(data[0], 1);
+    ASSERT_EQUAL(data[1], 1);
+    ASSERT_EQUAL(data[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestRemoveSimple);
+
+
+template<typename ForwardIterator,
+         typename T>
+ForwardIterator remove(my_system &system,
+                       ForwardIterator first,
+                       ForwardIterator,
+                       const T &)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestRemoveDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename T>
+ForwardIterator remove(my_tag,
+                       ForwardIterator first,
+                       ForwardIterator,
+                       const T &)
+{
+    *first = 13;
+    return first;
+}
+
+void TestRemoveDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.begin()),
+                   0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveDispatchImplicit);
+
+
+template<typename Vector>
+void TestRemoveCopySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    Vector result(5);
+
+    typename Vector::iterator end = thrust::remove_copy(data.begin(),
+                                                        data.end(),
+                                                        result.begin(),
+                                                        (T) 2);
+
+    ASSERT_EQUAL(end - result.begin(), 3);
+
+    ASSERT_EQUAL(result[0], 1);
+    ASSERT_EQUAL(result[1], 1);
+    ASSERT_EQUAL(result[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestRemoveCopySimple);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+OutputIterator remove_copy(my_system &system,
+                           InputIterator,
+                           InputIterator,
+                           OutputIterator result,
+                           const T &)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestRemoveCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove_copy(sys,
+                        vec.begin(),
+                        vec.begin(),
+                        vec.begin(),
+                        0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveCopyDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+OutputIterator remove_copy(my_tag,
+                           InputIterator,
+                           InputIterator,
+                           OutputIterator result,
+                           const T &)
+{
+    *result = 13;
+    return result;
+}
+
+void TestRemoveCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove_copy(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveCopyDispatchImplicit);
+
+
+template<typename Vector>
+void TestRemoveIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
+                                                      data.end(),
+                                                      is_even<T>());
+
+    ASSERT_EQUAL(end - data.begin(), 3);
+
+    ASSERT_EQUAL(data[0], 1);
+    ASSERT_EQUAL(data[1], 1);
+    ASSERT_EQUAL(data[2], 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestRemoveIfSimple);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator remove_if(my_system &system,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestRemoveIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove_if(sys, vec.begin(), vec.end(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveIfDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+ForwardIterator remove_if(my_tag,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestRemoveIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove_if(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveIfDispatchImplicit);
+
+
+template<typename Vector>
+void TestRemoveIfStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    Vector stencil(5);
+    stencil[0] = 0;
+    stencil[1] = 1;
+    stencil[2] = 0;
+    stencil[3] = 0;
+    stencil[4] = 1;
+
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
+                                                      data.end(),
+                                                      stencil.begin(),
+                                                      thrust::identity<T>());
+
+    ASSERT_EQUAL(end - data.begin(), 3);
+
+    ASSERT_EQUAL(data[0], 1);
+    ASSERT_EQUAL(data[1], 1);
+    ASSERT_EQUAL(data[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestRemoveIfStencilSimple);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator remove_if(my_system &system,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestRemoveIfStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove_if(sys,
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin(),
+                      0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveIfStencilDispatchExplicit);
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+ForwardIterator remove_if(my_tag,
+                          ForwardIterator first,
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestRemoveIfStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove_if(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveIfStencilDispatchImplicit);
+
+
+template<typename Vector>
+void TestRemoveCopyIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    Vector result(5);
+
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
+                                                           result.begin(),
+                                                           is_even<T>());
+
+    ASSERT_EQUAL(end - result.begin(), 3);
+
+    ASSERT_EQUAL(result[0], 1);
+    ASSERT_EQUAL(result[1], 1);
+    ASSERT_EQUAL(result[2], 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestRemoveCopyIfSimple);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+InputIterator remove_copy_if(my_system &system,
+                             InputIterator first,
+                             InputIterator,
+                             OutputIterator,
+                             Predicate)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestRemoveCopyIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove_copy_if(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveCopyIfDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+InputIterator remove_copy_if(my_tag,
+                             InputIterator first,
+                             InputIterator,
+                             OutputIterator,
+                             Predicate)
+{
+    *first = 13;
+    return first;
+}
+
+void TestRemoveCopyIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove_copy_if(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveCopyIfDispatchImplicit);
+
+
+template<typename Vector>
+void TestRemoveCopyIfStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1;
+    data[1] =  2;
+    data[2] =  1;
+    data[3] =  3;
+    data[4] =  2;
+
+    Vector stencil(5);
+    stencil[0] = 0;
+    stencil[1] = 1;
+    stencil[2] = 0;
+    stencil[3] = 0;
+    stencil[4] = 1;
+
+    Vector result(5);
+
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
+                                                           stencil.begin(),
+                                                           result.begin(),
+                                                           thrust::identity<T>());
+
+    ASSERT_EQUAL(end - result.begin(), 3);
+
+    ASSERT_EQUAL(result[0], 1);
+    ASSERT_EQUAL(result[1], 1);
+    ASSERT_EQUAL(result[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestRemoveCopyIfStencilSimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+OutputIterator remove_copy_if(my_system &system,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              OutputIterator result,
+                              Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestRemoveCopyIfStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::remove_copy_if(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestRemoveCopyIfStencilDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+OutputIterator remove_copy_if(my_tag,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              OutputIterator result,
+                              Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestRemoveCopyIfStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::remove_copy_if(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestRemoveCopyIfStencilDispatchImplicit);
+
+
+template<typename T>
+void TestRemove(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t h_size = thrust::remove(h_data.begin(), h_data.end(), T(0)) - h_data.begin();
+    size_t d_size = thrust::remove(d_data.begin(), d_data.end(), T(0)) - d_data.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_data.resize(h_size);
+    d_data.resize(d_size);
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemove);
+
+
+template<typename T>
+void TestRemoveIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true<T>()) - h_data.begin();
+    size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), is_true<T>()) - d_data.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_data.resize(h_size);
+    d_data.resize(d_size);
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveIf);
+
+
+template<typename T>
+void TestRemoveIfStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
+    thrust::device_vector<bool> d_stencil = h_stencil;
+
+    size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true<T>()) - h_data.begin();
+    size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), d_stencil.begin(), is_true<T>()) - d_data.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_data.resize(h_size);
+    d_data.resize(d_size);
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveIfStencil);
+
+
+template<typename T>
+void TestRemoveCopy(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), T(0)) - h_result.begin();
+    size_t d_size = thrust::remove_copy(d_data.begin(), d_data.end(), d_result.begin(), T(0)) - d_result.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_result.resize(h_size);
+    d_result.resize(d_size);
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopy);
+
+
+template<typename T>
+void TestRemoveCopyToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t num_zeros = thrust::count(h_data.begin(), h_data.end(), T(0));
+    size_t num_nonzeros = h_data.size() - num_zeros;
+
+    thrust::discard_iterator<> h_result =
+      thrust::remove_copy(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), T(0));
+
+    thrust::discard_iterator<> d_result =
+      thrust::remove_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), T(0));
+
+    thrust::discard_iterator<> reference(num_nonzeros);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyToDiscardIterator);
+
+
+template<typename T>
+void TestRemoveCopyToDiscardIteratorZipped(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T> h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    size_t num_zeros = thrust::count(h_data.begin(), h_data.end(), T(0));
+    size_t num_nonzeros = h_data.size() - num_zeros;
+
+    typedef thrust::tuple<typename thrust::host_vector<T>::iterator, thrust::discard_iterator<> >   Tuple1;
+    typedef thrust::tuple<typename thrust::device_vector<T>::iterator, thrust::discard_iterator<> > Tuple2;
+
+    typedef thrust::zip_iterator<Tuple1> ZipIterator1;
+    typedef thrust::zip_iterator<Tuple2> ZipIterator2;
+
+    ZipIterator1 h_result =
+      thrust::remove_copy(thrust::make_zip_iterator(thrust::make_tuple(h_data.begin(), h_data.begin())),
+                          thrust::make_zip_iterator(thrust::make_tuple(h_data.end(), h_data.end())),
+                          thrust::make_zip_iterator(thrust::make_tuple(h_output.begin(),thrust::make_discard_iterator())),
+                          thrust::make_tuple(T(0),T(0)));
+
+    ZipIterator2 d_result =
+      thrust::remove_copy(thrust::make_zip_iterator(thrust::make_tuple(d_data.begin(), d_data.begin())),
+                          thrust::make_zip_iterator(thrust::make_tuple(d_data.end(), d_data.end())),
+                          thrust::make_zip_iterator(thrust::make_tuple(d_output.begin(),thrust::make_discard_iterator())),
+                          thrust::make_tuple(T(0),T(0)));
+
+    thrust::discard_iterator<> reference(num_nonzeros);
+
+    ASSERT_EQUAL(h_output, d_output);
+    ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple()));
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyToDiscardIteratorZipped);
+
+
+template<typename T>
+void TestRemoveCopyIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<T>()) - h_result.begin();
+    size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true<T>()) - d_result.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_result.resize(h_size);
+    d_result.resize(d_size);
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIf);
+
+
+template<typename T>
+void TestRemoveCopyIfToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    size_t num_false = thrust::count_if(h_data.begin(), h_data.end(), thrust::not1(is_true<T>()));
+
+    thrust::discard_iterator<> h_result =
+      thrust::remove_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), is_true<T>());
+
+    thrust::discard_iterator<> d_result =
+      thrust::remove_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), is_true<T>());
+
+    thrust::discard_iterator<> reference(num_false);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfToDiscardIterator);
+
+
+template<typename T>
+void TestRemoveCopyIfStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
+    thrust::device_vector<bool> d_stencil = h_stencil;
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true<T>()) - h_result.begin();
+    size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true<T>()) - d_result.begin();
+
+    ASSERT_EQUAL(h_size, d_size);
+
+    h_result.resize(h_size);
+    d_result.resize(d_size);
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfStencil);
+
+
+template<typename T>
+void TestRemoveCopyIfStencilToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
+    thrust::device_vector<bool> d_stencil = h_stencil;
+
+    size_t num_false = thrust::count_if(h_stencil.begin(), h_stencil.end(), thrust::not1(is_true<T>()));
+
+    thrust::discard_iterator<> h_result =
+      thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), is_true<T>());
+
+    thrust::discard_iterator<> d_result =
+      thrust::remove_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), is_true<T>());
+
+    thrust::discard_iterator<> reference(num_false);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfStencilToDiscardIterator);
diff --git a/thrust/testing/replace.cu b/thrust/testing/replace.cu
new file mode 100644
index 0000000000000000000000000000000000000000..31e9890bb42226b3980a169ffe8996bc41b7e830
--- /dev/null
+++ b/thrust/testing/replace.cu
@@ -0,0 +1,675 @@
+#include <unittest/unittest.h>
+#include <thrust/replace.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template <class Vector>
+void TestReplaceSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  2; 
+    data[2] =  1;
+    data[3] =  3; 
+    data[4] =  2; 
+
+    thrust::replace(data.begin(), data.end(), (T) 1, (T) 4);
+    thrust::replace(data.begin(), data.end(), (T) 2, (T) 5);
+
+    Vector result(5);
+    result[0] =  4; 
+    result[1] =  5; 
+    result[2] =  4;
+    result[3] =  3; 
+    result[4] =  5; 
+
+    ASSERT_EQUAL(data, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceSimple);
+
+
+template<typename ForwardIterator, typename T>
+void replace(my_system &system,
+             ForwardIterator, ForwardIterator, const T &,
+             const T &)
+{
+    system.validate_dispatch();
+}
+
+void TestReplaceDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace(sys,
+                    vec.begin(),
+                    vec.begin(),
+                    0,
+                    0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceDispatchExplicit);
+
+
+template<typename ForwardIterator, typename T>
+void replace(my_tag,
+             ForwardIterator first, ForwardIterator, const T &,
+             const T &)
+{
+    *first = 13;
+}
+
+void TestReplaceDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    0,
+                    0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceDispatchImplicit);
+
+
+template <typename T>
+void TestReplace(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    T old_value = 0;
+    T new_value = 1;
+
+    thrust::replace(h_data.begin(), h_data.end(), old_value, new_value);
+    thrust::replace(d_data.begin(), d_data.end(), old_value, new_value);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplace);
+
+
+template <class Vector>
+void TestReplaceCopySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] = 1; 
+    data[1] = 2; 
+    data[2] = 1;
+    data[3] = 3; 
+    data[4] = 2; 
+
+    Vector dest(5);
+
+    thrust::replace_copy(data.begin(), data.end(), dest.begin(), (T) 1, (T) 4);
+    thrust::replace_copy(dest.begin(), dest.end(), dest.begin(), (T) 2, (T) 5);
+
+    Vector result(5);
+    result[0] = 4; 
+    result[1] = 5; 
+    result[2] = 4;
+    result[3] = 3; 
+    result[4] = 5; 
+
+    ASSERT_EQUAL(dest, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceCopySimple);
+
+
+template<typename InputIterator, typename OutputIterator, typename T>
+OutputIterator replace_copy(my_system &system,
+                            InputIterator, InputIterator,
+                            OutputIterator result,
+                            const T &,
+                            const T &)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestReplaceCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace_copy(sys,
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceCopyDispatchExplicit);
+
+
+template<typename InputIterator, typename OutputIterator, typename T>
+OutputIterator replace_copy(my_tag,
+                            InputIterator, InputIterator,
+                            OutputIterator result,
+                            const T &,
+                            const T &)
+{
+    *result = 13;
+    return result;
+}
+
+void TestReplaceCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace_copy(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceCopyDispatchImplicit);
+
+
+template <typename T>
+void TestReplaceCopy(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+    
+    T old_value = 0;
+    T new_value = 1;
+    
+    thrust::host_vector<T>   h_dest(n);
+    thrust::device_vector<T> d_dest(n);
+
+    thrust::replace_copy(h_data.begin(), h_data.end(), h_dest.begin(), old_value, new_value);
+    thrust::replace_copy(d_data.begin(), d_data.end(), d_dest.begin(), old_value, new_value);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+    ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopy);
+
+
+template <typename T>
+void TestReplaceCopyToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+    
+    T old_value = 0;
+    T new_value = 1;
+
+    thrust::discard_iterator<> h_result =
+      thrust::replace_copy(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), old_value, new_value);
+
+    thrust::discard_iterator<> d_result =
+      thrust::replace_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), old_value, new_value);
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopyToDiscardIterator);
+
+
+
+template <typename T>
+struct less_than_five
+{
+  __host__ __device__ bool operator()(const T &val) const {return val < 5;}
+};
+
+template <class Vector>
+void TestReplaceIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  3; 
+    data[2] =  4;
+    data[3] =  6; 
+    data[4] =  5; 
+
+    thrust::replace_if(data.begin(), data.end(), less_than_five<T>(), (T) 0);
+
+    Vector result(5);
+    result[0] =  0; 
+    result[1] =  0; 
+    result[2] =  0;
+    result[3] =  6; 
+    result[4] =  5; 
+
+    ASSERT_EQUAL(data, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceIfSimple);
+
+
+template<typename ForwardIterator, typename Predicate, typename T>
+void replace_if(my_system &system,
+                ForwardIterator, ForwardIterator,
+                Predicate,
+                const T &)
+{
+    system.validate_dispatch();
+}
+
+void TestReplaceIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace_if(sys,
+                       vec.begin(),
+                       vec.begin(),
+                       0,
+                       0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceIfDispatchExplicit);
+
+
+template<typename ForwardIterator, typename Predicate, typename T>
+void replace_if(my_tag,
+                ForwardIterator first, ForwardIterator,
+                Predicate,
+                const T &)
+{
+    *first = 13;
+}
+
+void TestReplaceIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace_if(thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       0,
+                       0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceIfDispatchImplicit);
+
+
+template <class Vector>
+void TestReplaceIfStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  3; 
+    data[2] =  4;
+    data[3] =  6; 
+    data[4] =  5; 
+
+    Vector stencil(5);
+    stencil[0] = 5;
+    stencil[1] = 4;
+    stencil[2] = 6;
+    stencil[3] = 3;
+    stencil[4] = 7;
+
+    thrust::replace_if(data.begin(), data.end(), stencil.begin(), less_than_five<T>(), (T) 0);
+
+    Vector result(5);
+    result[0] =  1; 
+    result[1] =  0; 
+    result[2] =  4;
+    result[3] =  0; 
+    result[4] =  5; 
+
+    ASSERT_EQUAL(data, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceIfStencilSimple);
+
+
+template<typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+void replace_if(my_system &system,
+                ForwardIterator, ForwardIterator,
+                InputIterator,
+                Predicate,
+                const T &)
+{
+    system.validate_dispatch();
+}
+
+void TestReplaceIfStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace_if(sys,
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       0,
+                       0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceIfStencilDispatchExplicit);
+
+
+template<typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+void replace_if(my_tag,
+                ForwardIterator first, ForwardIterator,
+                InputIterator,
+                Predicate,
+                const T &)
+{
+    *first = 13;
+}
+
+void TestReplaceIfStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace_if(thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       0,
+                       0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceIfStencilDispatchImplicit);
+
+
+template <typename T>
+void TestReplaceIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::replace_if(h_data.begin(), h_data.end(), less_than_five<T>(), (T) 0);
+    thrust::replace_if(d_data.begin(), d_data.end(), less_than_five<T>(), (T) 0);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceIf);
+
+
+template <typename T>
+void TestReplaceIfStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_stencil = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_stencil = h_stencil;
+
+    thrust::replace_if(h_data.begin(), h_data.end(), h_stencil.begin(), less_than_five<T>(), (T) 0);
+    thrust::replace_if(d_data.begin(), d_data.end(), d_stencil.begin(), less_than_five<T>(), (T) 0);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceIfStencil);
+
+
+template <class Vector>
+void TestReplaceCopyIfSimple(void)
+{
+    typedef typename Vector::value_type T;
+    
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  3; 
+    data[2] =  4;
+    data[3] =  6; 
+    data[4] =  5; 
+
+    Vector dest(5);
+
+    thrust::replace_copy_if(data.begin(), data.end(), dest.begin(), less_than_five<T>(), (T) 0);
+
+    Vector result(5);
+    result[0] =  0; 
+    result[1] =  0; 
+    result[2] =  0;
+    result[3] =  6; 
+    result[4] =  5; 
+
+    ASSERT_EQUAL(dest, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceCopyIfSimple);
+
+
+template<typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+OutputIterator replace_copy_if(my_system &system,
+                               InputIterator, InputIterator,
+                               OutputIterator result,
+                               Predicate,
+                               const T &)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestReplaceCopyIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace_copy_if(sys,
+                            vec.begin(),
+                            vec.begin(),
+                            vec.begin(),
+                            0,
+                            0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceCopyIfDispatchExplicit);
+
+
+template<typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+OutputIterator replace_copy_if(my_tag,
+                               InputIterator, InputIterator,
+                               OutputIterator result,
+                               Predicate,
+                               const T &)
+{
+    *result = 13;
+    return result;
+}
+
+void TestReplaceCopyIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace_copy_if(thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.begin()),
+                            0,
+                            0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceCopyIfDispatchImplicit);
+
+
+template <class Vector>
+void TestReplaceCopyIfStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+    
+    Vector data(5);
+    data[0] =  1; 
+    data[1] =  3; 
+    data[2] =  4;
+    data[3] =  6; 
+    data[4] =  5; 
+
+    Vector stencil(5);
+    stencil[0] = 1;
+    stencil[1] = 5;
+    stencil[2] = 4;
+    stencil[3] = 7;
+    stencil[4] = 8;
+
+    Vector dest(5);
+
+    thrust::replace_copy_if(data.begin(), data.end(), stencil.begin(), dest.begin(), less_than_five<T>(), (T) 0);
+
+    Vector result(5);
+    result[0] =  0; 
+    result[1] =  3; 
+    result[2] =  0;
+    result[3] =  6; 
+    result[4] =  5; 
+
+    ASSERT_EQUAL(dest, result);
+}
+DECLARE_VECTOR_UNITTEST(TestReplaceCopyIfStencilSimple);
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+OutputIterator replace_copy_if(my_system &system,
+                               InputIterator1, InputIterator1,
+                               InputIterator2,
+                               OutputIterator result,
+                               Predicate,
+                               const T &)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+
+void TestReplaceCopyIfStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::replace_copy_if(sys,
+                            vec.begin(),
+                            vec.begin(),
+                            vec.begin(),
+                            vec.begin(),
+                            0,
+                            0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReplaceCopyIfStencilDispatchExplicit);
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+OutputIterator replace_copy_if(my_tag,
+                               InputIterator1, InputIterator1,
+                               InputIterator2,
+                               OutputIterator result,
+                               Predicate,
+                               const T &)
+{
+    *result = 13;
+    return result;
+}
+
+void TestReplaceCopyIfStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::replace_copy_if(thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.begin()),
+                            thrust::retag<my_tag>(vec.begin()),
+                            0,
+                            0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReplaceCopyIfStencilDispatchImplicit);
+
+
+template <typename T>
+void TestReplaceCopyIf(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_dest(n);
+    thrust::device_vector<T> d_dest(n);
+
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<T>(), 0);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+    ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopyIf);
+
+
+template <typename T>
+void TestReplaceCopyIfToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::discard_iterator<> h_result =
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+
+    thrust::discard_iterator<> d_result =
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopyIfToDiscardIterator);
+
+template <typename T>
+void TestReplaceCopyIfStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_stencil = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_stencil = h_stencil;
+
+    thrust::host_vector<T>   h_dest(n);
+    thrust::device_vector<T> d_dest(n);
+
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<T>(), 0);
+
+    ASSERT_ALMOST_EQUAL(h_data, d_data);
+    ASSERT_ALMOST_EQUAL(h_dest, d_dest);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopyIfStencil);
+
+template <typename T>
+void TestReplaceCopyIfStencilToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_stencil = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_stencil = h_stencil;
+
+    thrust::discard_iterator<> h_result =
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+
+    thrust::discard_iterator<> d_result =
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestReplaceCopyIfStencilToDiscardIterator);
+
diff --git a/thrust/testing/reverse.cu b/thrust/testing/reverse.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b04e446dccbf7eb35a28c6bc6b846783d0fc16f1
--- /dev/null
+++ b/thrust/testing/reverse.cu
@@ -0,0 +1,203 @@
+#include <unittest/unittest.h>
+#include <thrust/reverse.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+typedef unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> ReverseTypes;
+
+template<typename Vector>
+void TestReverseSimple(void)
+{
+  Vector data(5);
+  data[0] = 1;
+  data[1] = 2;
+  data[2] = 3;
+  data[3] = 4;
+  data[4] = 5;
+
+  thrust::reverse(data.begin(), data.end());
+
+  Vector ref(5);
+  ref[0] = 5;
+  ref[1] = 4;
+  ref[2] = 3;
+  ref[3] = 2;
+  ref[4] = 1;
+
+  ASSERT_EQUAL(ref, data);
+}
+DECLARE_VECTOR_UNITTEST(TestReverseSimple);
+
+
+template<typename BidirectionalIterator>
+void reverse(my_system &system,
+             BidirectionalIterator,
+             BidirectionalIterator)
+{
+  system.validate_dispatch();
+}
+
+void TestReverseDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::reverse(sys, vec.begin(), vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReverseDispatchExplicit);
+
+
+template<typename BidirectionalIterator>
+void reverse(my_tag,
+             BidirectionalIterator first,
+             BidirectionalIterator)
+{
+  *first = 13;
+}
+
+void TestReverseDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::reverse(thrust::retag<my_tag>(vec.begin()),
+                  thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReverseDispatchImplicit);
+
+
+template<typename Vector>
+void TestReverseCopySimple(void)
+{
+  typedef typename Vector::iterator   Iterator;
+
+  Vector input(5);
+  input[0] = 1;
+  input[1] = 2;
+  input[2] = 3;
+  input[3] = 4;
+  input[4] = 5;
+
+  Vector output(5);
+
+  Iterator iter = thrust::reverse_copy(input.begin(), input.end(), output.begin());
+
+  Vector ref(5);
+  ref[0] = 5;
+  ref[1] = 4;
+  ref[2] = 3;
+  ref[3] = 2;
+  ref[4] = 1;
+
+  ASSERT_EQUAL(5, iter - output.begin());
+  ASSERT_EQUAL(ref, output);
+}
+DECLARE_VECTOR_UNITTEST(TestReverseCopySimple);
+
+
+template<typename BidirectionalIterator, typename OutputIterator>
+OutputIterator reverse_copy(my_system &system,
+                            BidirectionalIterator,
+                            BidirectionalIterator,
+                            OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestReverseCopyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::reverse_copy(sys, vec.begin(), vec.end(), vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestReverseCopyDispatchExplicit);
+
+
+template<typename BidirectionalIterator, typename OutputIterator>
+OutputIterator reverse_copy(my_tag,
+                            BidirectionalIterator,
+                            BidirectionalIterator,
+                            OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestReverseCopyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::reverse_copy(thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.end()),
+                       thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestReverseCopyDispatchImplicit);
+
+
+template<typename T>
+struct TestReverse
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::reverse(h_data.begin(), h_data.end());
+    thrust::reverse(d_data.begin(), d_data.end());
+
+    ASSERT_EQUAL(h_data, d_data);
+  }
+};
+VariableUnitTest<TestReverse, ReverseTypes> TestReverseInstance;
+
+template<typename T>
+struct TestReverseCopy
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T> h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    thrust::reverse_copy(h_data.begin(), h_data.end(), h_result.begin());
+    thrust::reverse_copy(d_data.begin(), d_data.end(), d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestReverseCopy, ReverseTypes> TestReverseCopyInstance;
+
+template<typename T>
+struct TestReverseCopyToDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::discard_iterator<> h_result =
+      thrust::reverse_copy(h_data.begin(), h_data.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> d_result =
+      thrust::reverse_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestReverseCopyToDiscardIterator, ReverseTypes> TestReverseCopyToDiscardIteratorInstance;
+
diff --git a/thrust/testing/reverse_iterator.cu b/thrust/testing/reverse_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1571456f1c0d9be4235437939963cdd59a327e1e
--- /dev/null
+++ b/thrust/testing/reverse_iterator.cu
@@ -0,0 +1,136 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/scan.h>
+
+void TestReverseIteratorCopyConstructor(void)
+{
+  thrust::host_vector<int> h_v(1,13);
+
+  thrust::reverse_iterator<thrust::host_vector<int>::iterator> h_iter0(h_v.end());
+  thrust::reverse_iterator<thrust::host_vector<int>::iterator> h_iter1(h_iter0);
+
+  ASSERT_EQUAL_QUIET(h_iter0, h_iter1);
+  ASSERT_EQUAL(*h_iter0, *h_iter1);
+
+
+  thrust::device_vector<int> d_v(1,13);
+
+  thrust::reverse_iterator<thrust::device_vector<int>::iterator> d_iter2(d_v.end());
+  thrust::reverse_iterator<thrust::device_vector<int>::iterator> d_iter3(d_iter2);
+
+  ASSERT_EQUAL_QUIET(d_iter2, d_iter3);
+  ASSERT_EQUAL(*d_iter2, *d_iter3);
+}
+DECLARE_UNITTEST(TestReverseIteratorCopyConstructor);
+
+void TestReverseIteratorIncrement(void)
+{
+  thrust::host_vector<int> h_v(4);
+  thrust::sequence(h_v.begin(), h_v.end());
+
+  thrust::reverse_iterator<thrust::host_vector<int>::iterator> h_iter(h_v.end());
+
+  ASSERT_EQUAL(*h_iter, 3);
+
+  h_iter++;
+  ASSERT_EQUAL(*h_iter, 2);
+
+  h_iter++;
+  ASSERT_EQUAL(*h_iter, 1);
+
+  h_iter++;
+  ASSERT_EQUAL(*h_iter, 0);
+
+
+  thrust::device_vector<int> d_v(4);
+  thrust::sequence(d_v.begin(), d_v.end());
+
+  thrust::reverse_iterator<thrust::device_vector<int>::iterator> d_iter(d_v.end());
+
+  ASSERT_EQUAL(*d_iter, 3);
+
+  d_iter++;
+  ASSERT_EQUAL(*d_iter, 2);
+
+  d_iter++;
+  ASSERT_EQUAL(*d_iter, 1);
+
+  d_iter++;
+  ASSERT_EQUAL(*d_iter, 0);
+}
+DECLARE_UNITTEST(TestReverseIteratorIncrement);
+
+template <typename Vector>
+void TestReverseIteratorCopy(void)
+{
+  Vector source(4);
+  source[0] = 10;
+  source[1] = 20;
+  source[2] = 30;
+  source[3] = 40;
+
+  Vector destination(4,0);
+  
+  thrust::copy(thrust::make_reverse_iterator(source.end()),
+               thrust::make_reverse_iterator(source.begin()),
+               destination.begin());
+
+  ASSERT_EQUAL(destination[0], 40);
+  ASSERT_EQUAL(destination[1], 30);
+  ASSERT_EQUAL(destination[2], 20);
+  ASSERT_EQUAL(destination[3], 10);
+}
+DECLARE_VECTOR_UNITTEST(TestReverseIteratorCopy);
+
+void TestReverseIteratorExclusiveScanSimple(void)
+{
+  typedef int T;
+  const size_t n = 10;
+
+  thrust::host_vector<T> h_data(n);
+  thrust::sequence(h_data.begin(), h_data.end());
+
+  thrust::device_vector<T> d_data = h_data;
+
+  thrust::host_vector<T>   h_result(h_data.size());
+  thrust::device_vector<T> d_result(d_data.size());
+
+  thrust::exclusive_scan(thrust::make_reverse_iterator(h_data.end()),
+                         thrust::make_reverse_iterator(h_data.begin()),
+                         h_result.begin());
+
+  thrust::exclusive_scan(thrust::make_reverse_iterator(d_data.end()),
+                         thrust::make_reverse_iterator(d_data.begin()),
+                         d_result.begin());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_UNITTEST(TestReverseIteratorExclusiveScanSimple);
+
+
+template <typename T>
+struct TestReverseIteratorExclusiveScan
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_data = unittest::random_samples<T>(n);
+
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    thrust::exclusive_scan(thrust::make_reverse_iterator(h_data.end()),
+                           thrust::make_reverse_iterator(h_data.begin()),
+                           h_result.begin());
+
+    thrust::exclusive_scan(thrust::make_reverse_iterator(d_data.end()),
+                           thrust::make_reverse_iterator(d_data.begin()),
+                           d_result.begin());
+
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+  }
+};
+VariableUnitTest<TestReverseIteratorExclusiveScan, IntegralTypes> TestReverseIteratorExclusiveScanInstance;
+
diff --git a/thrust/testing/scan.cu b/thrust/testing/scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..925c7bc8faa86820bdeee264276552892dd164e9
--- /dev/null
+++ b/thrust/testing/scan.cu
@@ -0,0 +1,675 @@
+#include <unittest/unittest.h>
+#include <thrust/scan.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+
+template<typename T>
+  struct max_functor
+{
+  __host__ __device__
+  T operator()(T rhs, T lhs) const
+  {
+    return thrust::max(rhs,lhs);
+  }
+};
+
+
+template <class Vector>
+void TestScanSimple(void)
+{
+    typedef typename Vector::value_type T;
+    
+    typename Vector::iterator iter;
+
+    Vector input(5);
+    Vector result(5);
+    Vector output(5);
+
+    input[0] = 1; input[1] = 3; input[2] = -2; input[3] = 4; input[4] = -5;
+
+    Vector input_copy(input);
+
+    // inclusive scan
+    iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin());
+    result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // exclusive scan
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(0));
+    result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // exclusive scan with init
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3));
+    result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // inclusive scan with op
+    iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), thrust::plus<T>());
+    result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+
+    // exclusive scan with init and op
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus<T>());
+    result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+
+    // inplace inclusive scan
+    input = input_copy;
+    iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin());
+    result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+    ASSERT_EQUAL(input, result);
+
+    // inplace exclusive scan with init
+    input = input_copy;
+    iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin(), T(3));
+    result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+    ASSERT_EQUAL(input, result);
+
+    // inplace exclusive scan with implicit init=0
+    input = input_copy;
+    iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin());
+    result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+    ASSERT_EQUAL(input, result);
+}
+DECLARE_VECTOR_UNITTEST(TestScanSimple);
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+OutputIterator inclusive_scan(my_system &system,
+                              InputIterator,
+                              InputIterator,
+                              OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestInclusiveScanDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::inclusive_scan(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestInclusiveScanDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+OutputIterator inclusive_scan(my_tag,
+                              InputIterator,
+                              InputIterator,
+                              OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestInclusiveScanDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::inclusive_scan(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestInclusiveScanDispatchImplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+OutputIterator exclusive_scan(my_system &system,
+                              InputIterator,
+                              InputIterator,
+                              OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestExclusiveScanDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::exclusive_scan(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestExclusiveScanDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+OutputIterator exclusive_scan(my_tag,
+                              InputIterator,
+                              InputIterator,
+                              OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestExclusiveScanDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::exclusive_scan(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestExclusiveScanDispatchImplicit);
+
+
+void TestInclusiveScan32(void)
+{
+    typedef int T;
+    size_t n = 32;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_UNITTEST(TestInclusiveScan32);
+
+
+void TestExclusiveScan32(void)
+{
+    typedef int T;
+    size_t n = 32;
+    T init = 13;
+
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), init);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), init);
+
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_UNITTEST(TestExclusiveScan32);
+
+
+template <class IntVector, class FloatVector>
+void TestScanMixedTypes(void)
+{
+    // make sure we get types for default args and operators correct
+    IntVector int_input(4);
+    int_input[0] = 1;
+    int_input[1] = 2;
+    int_input[2] = 3;
+    int_input[3] = 4;
+
+    FloatVector float_input(4);
+    float_input[0] = 1.5;
+    float_input[1] = 2.5;
+    float_input[2] = 3.5;
+    float_input[3] = 4.5;
+
+    IntVector   int_output(4);
+    FloatVector float_output(4);
+
+    // float -> int should use plus<void> operator and float accumulator by default
+    thrust::inclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
+    ASSERT_EQUAL(int_output[0],  1); // in: 1.5 accum: 1.5f out: 1
+    ASSERT_EQUAL(int_output[1],  4); // in: 2.5 accum: 4.0f out: 4
+    ASSERT_EQUAL(int_output[2],  7); // in: 3.5 accum: 7.5f out: 7
+    ASSERT_EQUAL(int_output[3], 12); // in: 4.5 accum: 12.f out: 12
+
+    // float -> float with plus<int> operator (float accumulator)
+    thrust::inclusive_scan(float_input.begin(), float_input.end(), float_output.begin(), thrust::plus<int>());
+    ASSERT_EQUAL(float_output[0],  1.5f); // in: 1.5 accum: 1.5f out: 1.5f
+    ASSERT_EQUAL(float_output[1],  3.0f); // in: 2.5 accum: 3.0f out: 3.0f
+    ASSERT_EQUAL(float_output[2],  6.0f); // in: 3.5 accum: 6.0f out: 6.0f
+    ASSERT_EQUAL(float_output[3], 10.0f); // in: 4.5 accum: 10.f out: 10.f
+
+    // float -> int should use plus<void> operator and float accumulator by default
+    thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
+    ASSERT_EQUAL(int_output[0], 0); // out: 0.0f  in: 1.5 accum: 1.5f
+    ASSERT_EQUAL(int_output[1], 1); // out: 1.5f  in: 2.5 accum: 4.0f
+    ASSERT_EQUAL(int_output[2], 4); // out: 4.0f  in: 3.5 accum: 7.5f
+    ASSERT_EQUAL(int_output[3], 7); // out: 7.5f  in: 4.5 accum: 12.f
+
+    // float -> int should use plus<> operator and float accumulator by default
+    thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin(), (float) 5.5);
+    ASSERT_EQUAL(int_output[0],  5); // out: 5.5f  in: 1.5 accum: 7.0f
+    ASSERT_EQUAL(int_output[1],  7); // out: 7.0f  in: 2.5 accum: 9.5f
+    ASSERT_EQUAL(int_output[2],  9); // out: 9.5f  in: 3.5 accum: 13.0f
+    ASSERT_EQUAL(int_output[3], 13); // out: 13.f  in: 4.5 accum: 17.4f
+
+    // int -> float should use using plus<> operator and int accumulator by default
+    thrust::inclusive_scan(int_input.begin(), int_input.end(), float_output.begin());
+    ASSERT_EQUAL(float_output[0],  1.f); // in: 1 accum: 1  out: 1
+    ASSERT_EQUAL(float_output[1],  3.f); // in: 2 accum: 3  out: 3
+    ASSERT_EQUAL(float_output[2],  6.f); // in: 3 accum: 6  out: 6
+    ASSERT_EQUAL(float_output[3], 10.f); // in: 4 accum: 10 out: 10
+
+    // int -> float + float init_value should use using plus<> operator and
+    // float accumulator by default
+    thrust::exclusive_scan(int_input.begin(), int_input.end(), float_output.begin(), (float) 5.5);
+    ASSERT_EQUAL(float_output[0],  5.5f); // out: 5.5f  in: 1 accum: 6.5f
+    ASSERT_EQUAL(float_output[1],  6.5f); // out: 6.0f  in: 2 accum: 8.5f
+    ASSERT_EQUAL(float_output[2],  8.5f); // out: 8.0f  in: 3 accum: 11.5f
+    ASSERT_EQUAL(float_output[3], 11.5f); // out: 11.f  in: 4 accum: 15.5f
+}
+void TestScanMixedTypesHost(void)
+{
+    TestScanMixedTypes< thrust::host_vector<int>, thrust::host_vector<float> >();
+}
+DECLARE_UNITTEST(TestScanMixedTypesHost);
+void TestScanMixedTypesDevice(void)
+{
+    TestScanMixedTypes< thrust::device_vector<int>, thrust::device_vector<float> >();
+}
+DECLARE_UNITTEST(TestScanMixedTypesDevice);
+
+
+template <typename T>
+struct TestScanWithOperator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+    
+    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), max_functor<T>());
+    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), max_functor<T>());
+    ASSERT_EQUAL(d_output, h_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), T(13), max_functor<T>());
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), T(13), max_functor<T>());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+};
+VariableUnitTest<TestScanWithOperator, SignedIntegralTypes> TestScanWithOperatorInstance;
+
+
+template <typename T>
+struct TestScanWithOperatorToDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> reference(n);
+    
+    thrust::discard_iterator<> h_result =
+      thrust::inclusive_scan(h_input.begin(), h_input.end(), thrust::make_discard_iterator(), max_functor<T>());
+
+    thrust::discard_iterator<> d_result =
+      thrust::inclusive_scan(d_input.begin(), d_input.end(), thrust::make_discard_iterator(), max_functor<T>());
+    
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+    
+    h_result =
+      thrust::exclusive_scan(h_input.begin(), h_input.end(), thrust::make_discard_iterator(), T(13), max_functor<T>());
+
+    d_result =
+      thrust::exclusive_scan(d_input.begin(), d_input.end(), thrust::make_discard_iterator(), T(13), max_functor<T>());
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestScanWithOperatorToDiscardIterator, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestScanWithOperatorToDiscardIteratorInstance;
+
+
+template <typename T>
+struct TestScan
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+    
+    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), (T) 11);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), (T) 11);
+    ASSERT_EQUAL(d_output, h_output);
+    
+    // in-place scans
+    h_output = h_input;
+    d_output = d_input;
+    thrust::inclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+    thrust::inclusive_scan(d_output.begin(), d_output.end(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+    
+    h_output = h_input;
+    d_output = d_input;
+    thrust::exclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+    thrust::exclusive_scan(d_output.begin(), d_output.end(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+};
+VariableUnitTest<TestScan, IntegralTypes> TestScanInstance;
+
+
+template <typename T>
+struct TestScanToDiscardIterator
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+    
+    thrust::discard_iterator<> h_result =
+      thrust::inclusive_scan(h_input.begin(), h_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> d_result =
+      thrust::inclusive_scan(d_input.begin(), d_input.end(), thrust::make_discard_iterator());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+    
+    h_result =
+      thrust::exclusive_scan(h_input.begin(), h_input.end(), thrust::make_discard_iterator(), (T) 11);
+
+    d_result =
+      thrust::exclusive_scan(d_input.begin(), d_input.end(), thrust::make_discard_iterator(), (T) 11);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+  }
+};
+VariableUnitTest<TestScanToDiscardIterator, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestScanToDiscardIteratorInstance;
+
+
+void TestScanMixedTypes(void)
+{
+    const unsigned int n = 113;
+
+    thrust::host_vector<unsigned int> h_input = unittest::random_integers<unsigned int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_input[i] %= 10;
+    thrust::device_vector<unsigned int> d_input = h_input;
+
+    thrust::host_vector<float>   h_float_output(n);
+    thrust::device_vector<float> d_float_output(n);
+    thrust::host_vector<int>   h_int_output(n);
+    thrust::device_vector<int> d_int_output(n);
+
+    //mixed input/output types
+    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_float_output.begin());
+    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_float_output.begin());
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_float_output.begin(), (float) 3.5);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_float_output.begin(), (float) 3.5);
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_float_output.begin(), (int) 3);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_float_output.begin(), (int) 3);
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_int_output.begin(), (int) 3);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_int_output.begin(), (int) 3);
+    ASSERT_EQUAL(d_int_output, h_int_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_int_output.begin(), (float) 3.5);
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_int_output.begin(), (float) 3.5);
+    ASSERT_EQUAL(d_int_output, h_int_output);
+}
+DECLARE_UNITTEST(TestScanMixedTypes);
+
+
+template <typename T, unsigned int N>
+void _TestScanWithLargeTypes(void)
+{
+    size_t n = (1024 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_input(n);
+    thrust::host_vector< FixedVector<T,N> > h_output(n);
+
+    for(size_t i = 0; i < h_input.size(); i++)
+        h_input[i] = FixedVector<T,N>(i);
+
+    thrust::device_vector< FixedVector<T,N> > d_input = h_input;
+    thrust::device_vector< FixedVector<T,N> > d_output(n);
+    
+    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+    
+    thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), FixedVector<T,N>(0));
+    thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), FixedVector<T,N>(0));
+    
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+void TestScanWithLargeTypes(void)
+{
+  _TestScanWithLargeTypes<int,  1>();
+
+#if !defined(__QNX__)
+  _TestScanWithLargeTypes<int,  8>();
+  _TestScanWithLargeTypes<int, 64>();
+#else
+  KNOWN_FAILURE;
+#endif
+}
+DECLARE_UNITTEST(TestScanWithLargeTypes);
+
+
+template <typename T>
+struct plus_mod3
+{
+    T * table;
+
+    plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    T operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+template <typename Vector>
+void TestInclusiveScanWithIndirection(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::inclusive_scan(data.begin(), data.end(), data.begin(), plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], T(0));
+    ASSERT_EQUAL(data[1], T(1));
+    ASSERT_EQUAL(data[2], T(0));
+    ASSERT_EQUAL(data[3], T(1));
+    ASSERT_EQUAL(data[4], T(0));
+    ASSERT_EQUAL(data[5], T(0));
+    ASSERT_EQUAL(data[6], T(1));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithIndirection);
+
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+namespace thrust
+{
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+};
+}
+
+void TestInclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude), thrust::raw_pointer_cast(has_executed) };
+
+    thrust::inclusive_scan(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInclusiveScanWithBigIndexes()
+{
+  TestInclusiveScanWithBigIndexesHelper(30);
+  TestInclusiveScanWithBigIndexesHelper(31);
+  TestInclusiveScanWithBigIndexesHelper(32);
+  TestInclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestInclusiveScanWithBigIndexes);
+
+void TestExclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::exclusive_scan(thrust::device, begin, end, out,0ll);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestExclusiveScanWithBigIndexes()
+{
+  TestExclusiveScanWithBigIndexesHelper(30);
+  TestExclusiveScanWithBigIndexesHelper(31);
+  TestExclusiveScanWithBigIndexesHelper(32);
+  TestExclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestExclusiveScanWithBigIndexes);
+
+#if THRUST_CPP_DIALECT >= 2011
+
+struct Int {
+    int i{};
+    __host__ __device__ explicit Int(int num) : i(num) {}
+    __host__ __device__ Int() : i{} {}
+    __host__ __device__ Int operator+(Int const& o) const { return Int{this->i + o.i}; }
+};
+
+void TestInclusiveScanWithUserDefinedType()
+{
+    thrust::device_vector<Int> vec(5, Int{1});
+
+    thrust::inclusive_scan(
+        thrust::device,
+        vec.cbegin(),
+        vec.cend(),
+        vec.begin());
+
+    ASSERT_EQUAL(static_cast<Int>(vec.back()).i, 5);
+}
+DECLARE_UNITTEST(TestInclusiveScanWithUserDefinedType);
+
+#endif // c++11
diff --git a/thrust/testing/scan_by_key.cu b/thrust/testing/scan_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..efc48bdb41fed0331a28e53a857b7a036d747c06
--- /dev/null
+++ b/thrust/testing/scan_by_key.cu
@@ -0,0 +1,629 @@
+#include <unittest/unittest.h>
+#include <thrust/scan.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/random.h>
+
+
+template <typename Vector>
+void TestInclusiveScanByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector keys(7);
+    Vector vals(7);
+
+    Vector output(7, 0);
+
+    keys[0] = 0; vals[0] = 1;
+    keys[1] = 1; vals[1] = 2;
+    keys[2] = 1; vals[2] = 3;
+    keys[3] = 1; vals[3] = 4;
+    keys[4] = 2; vals[4] = 5;
+    keys[5] = 3; vals[5] = 6;
+    keys[6] = 3; vals[6] = 7;
+
+    Iterator iter = thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
+
+    ASSERT_EQUAL_QUIET(iter, output.end());
+
+    ASSERT_EQUAL(output[0],  1);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  5);
+    ASSERT_EQUAL(output[3],  9);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5],  6);
+    ASSERT_EQUAL(output[6], 13);
+    
+    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>(), thrust::multiplies<T>());
+
+    ASSERT_EQUAL(output[0],  1);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  6);
+    ASSERT_EQUAL(output[3], 24);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5],  6);
+    ASSERT_EQUAL(output[6], 42);
+    
+    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>());
+
+    ASSERT_EQUAL(output[0],  1);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  5);
+    ASSERT_EQUAL(output[3],  9);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5],  6);
+    ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_system &system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestInclusiveScanByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::inclusive_scan_by_key(sys,
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestInclusiveScanByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::inclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit);
+
+
+template <typename Vector>
+void TestExclusiveScanByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator   Iterator;
+
+    Vector keys(7);
+    Vector vals(7);
+
+    Vector output(7, 0);
+
+    keys[0] = 0; vals[0] = 1;
+    keys[1] = 1; vals[1] = 2;
+    keys[2] = 1; vals[2] = 3;
+    keys[3] = 1; vals[3] = 4;
+    keys[4] = 2; vals[4] = 5;
+    keys[5] = 3; vals[5] = 6;
+    keys[6] = 3; vals[6] = 7;
+    
+    Iterator iter = thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
+
+    ASSERT_EQUAL_QUIET(iter, output.end());
+
+    ASSERT_EQUAL(output[0], 0);
+    ASSERT_EQUAL(output[1], 0);
+    ASSERT_EQUAL(output[2], 2);
+    ASSERT_EQUAL(output[3], 5);
+    ASSERT_EQUAL(output[4], 0);
+    ASSERT_EQUAL(output[5], 0);
+    ASSERT_EQUAL(output[6], 6);
+
+    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10));
+
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1], 10);
+    ASSERT_EQUAL(output[2], 12);
+    ASSERT_EQUAL(output[3], 15);
+    ASSERT_EQUAL(output[4], 10);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6], 16);
+    
+    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>(), thrust::multiplies<T>());
+
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1], 10);
+    ASSERT_EQUAL(output[2], 20);
+    ASSERT_EQUAL(output[3], 60);
+    ASSERT_EQUAL(output[4], 10);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6], 60);
+    
+    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>());
+
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1], 10);
+    ASSERT_EQUAL(output[2], 12);
+    ASSERT_EQUAL(output[3], 15);
+    ASSERT_EQUAL(output[4], 10);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_system &system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestExclusiveScanByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::exclusive_scan_by_key(sys,
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestExclusiveScanByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::exclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit);
+
+
+struct head_flag_predicate
+{
+    template <typename T>
+    __host__ __device__
+    bool operator()(const T&, const T& b)
+    {
+        return b ? false : true;
+    }
+};
+
+template <typename Vector>
+void TestScanByKeyHeadFlags(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector keys(7);
+    Vector vals(7);
+
+    Vector output(7, 0);
+
+    keys[0] = 0; vals[0] = 1;
+    keys[1] = 1; vals[1] = 2;
+    keys[2] = 0; vals[2] = 3;
+    keys[3] = 0; vals[3] = 4;
+    keys[4] = 1; vals[4] = 5;
+    keys[5] = 1; vals[5] = 6;
+    keys[6] = 0; vals[6] = 7;
+    
+    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), head_flag_predicate(), thrust::plus<T>());
+
+    ASSERT_EQUAL(output[0],  1);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  5);
+    ASSERT_EQUAL(output[3],  9);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5],  6);
+    ASSERT_EQUAL(output[6], 13);
+
+    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), head_flag_predicate(), thrust::plus<T>());
+    
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1], 10);
+    ASSERT_EQUAL(output[2], 12);
+    ASSERT_EQUAL(output[3], 15);
+    ASSERT_EQUAL(output[4], 10);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
+
+template <typename Vector>
+void TestInclusiveScanByKeyTransformIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector keys(7);
+    Vector vals(7);
+
+    Vector output(7, 0);
+
+    keys[0] = 0; vals[0] = 1;
+    keys[1] = 1; vals[1] = 2;
+    keys[2] = 1; vals[2] = 3;
+    keys[3] = 1; vals[3] = 4;
+    keys[4] = 2; vals[4] = 5;
+    keys[5] = 3; vals[5] = 6;
+    keys[6] = 3; vals[6] = 7;
+
+    thrust::inclusive_scan_by_key
+        (keys.begin(), keys.end(),
+         thrust::make_transform_iterator(vals.begin(), thrust::negate<T>()), 
+         output.begin());
+    
+    ASSERT_EQUAL(output[0],  -1);
+    ASSERT_EQUAL(output[1],  -2);
+    ASSERT_EQUAL(output[2],  -5);
+    ASSERT_EQUAL(output[3],  -9);
+    ASSERT_EQUAL(output[4],  -5);
+    ASSERT_EQUAL(output[5],  -6);
+    ASSERT_EQUAL(output[6], -13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator);
+
+
+template <typename Vector>
+void TestScanByKeyReusedKeys(void)
+{
+    Vector keys(7);
+    Vector vals(7);
+
+    Vector output(7, 0);
+
+    keys[0] = 0; vals[0] = 1;
+    keys[1] = 1; vals[1] = 2;
+    keys[2] = 1; vals[2] = 3;
+    keys[3] = 1; vals[3] = 4;
+    keys[4] = 0; vals[4] = 5;
+    keys[5] = 1; vals[5] = 6;
+    keys[6] = 1; vals[6] = 7;
+    
+    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
+
+    ASSERT_EQUAL(output[0],  1);
+    ASSERT_EQUAL(output[1],  2);
+    ASSERT_EQUAL(output[2],  5);
+    ASSERT_EQUAL(output[3],  9);
+    ASSERT_EQUAL(output[4],  5);
+    ASSERT_EQUAL(output[5],  6);
+    ASSERT_EQUAL(output[6], 13);
+
+    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), typename Vector::value_type(10));
+    
+    ASSERT_EQUAL(output[0], 10);
+    ASSERT_EQUAL(output[1], 10);
+    ASSERT_EQUAL(output[2], 12);
+    ASSERT_EQUAL(output[3], 15);
+    ASSERT_EQUAL(output[4], 10);
+    ASSERT_EQUAL(output[5], 10);
+    ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
+
+
+template <typename T>
+void TestInclusiveScanByKey(const size_t n)
+{
+    // XXX WAR nvbug 1541533
+#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+    if(typeid(T) == typeid(char) ||
+       typeid(T) == typeid(unsigned char))
+    {
+      KNOWN_FAILURE;
+    }
+#endif
+
+    thrust::host_vector<int> h_keys(n);
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < n; i++){
+        h_keys[i] = k;
+        if (rng() % 10 == 0)
+            k++;
+    }
+    thrust::device_vector<int> d_keys = h_keys;
+
+    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_vals[i] = i % 10;
+    thrust::device_vector<T> d_vals = h_vals;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+   
+    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey);
+
+
+template <typename T>
+void TestExclusiveScanByKey(const size_t n)
+{
+    thrust::host_vector<int> h_keys(n);
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < n; i++){
+        h_keys[i] = k;
+        if (rng() % 10 == 0)
+            k++;
+    }
+    thrust::device_vector<int> d_keys = h_keys;
+
+    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_vals[i] = i % 10;
+    thrust::device_vector<T> d_vals = h_vals;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+   
+    // without init
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+    
+    // with init
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), (T) 11);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), (T) 11);
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey);
+
+
+template <typename T>
+void TestInclusiveScanByKeyInPlace(const size_t n)
+{
+    // XXX WAR nvbug 1541533
+#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+    if(typeid(T) == typeid(char) ||
+       typeid(T) == typeid(unsigned char))
+    {
+      KNOWN_FAILURE;
+    }
+#endif
+
+    thrust::host_vector<int> h_keys(n);
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < n; i++){
+        h_keys[i] = k;
+        if (rng() % 10 == 0)
+            k++;
+    }
+    thrust::device_vector<int> d_keys = h_keys;
+
+    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_vals[i] = i % 10;
+    thrust::device_vector<T> d_vals = h_vals;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+   
+    // in-place scans
+    h_output = h_vals;
+    d_output = d_vals;
+    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace);
+
+
+template <typename T>
+void TestExclusiveScanByKeyInPlace(const size_t n)
+{
+    thrust::host_vector<int> h_keys(n);
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < n; i++){
+        h_keys[i] = k;
+        if (rng() % 10 == 0)
+            k++;
+    }
+    thrust::device_vector<int> d_keys = h_keys;
+
+    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_vals[i] = i % 10;
+    thrust::device_vector<T> d_vals = h_vals;
+
+    thrust::host_vector<T>   h_output = h_vals;
+    thrust::device_vector<T> d_output = d_vals;
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), (T) 11);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), (T) 11);
+    ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace);
+
+
+void TestScanByKeyMixedTypes(void)
+{
+    const unsigned int n = 113;
+    
+    thrust::host_vector<int> h_keys(n);
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < n; i++){
+        h_keys[i] = k;
+        if (rng() % 10 == 0)
+            k++;
+    }
+    thrust::device_vector<int> d_keys = h_keys;
+
+    thrust::host_vector<unsigned int> h_vals = unittest::random_integers<unsigned int>(n);
+    for(size_t i = 0; i < n; i++)
+        h_vals[i] %= 10;
+    thrust::device_vector<unsigned int> d_vals = h_vals;
+
+    thrust::host_vector<float>   h_float_output(n);
+    thrust::device_vector<float> d_float_output(n);
+    thrust::host_vector<int>   h_int_output(n);
+    thrust::device_vector<int> d_int_output(n);
+
+    //mixed vals/output types
+    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin());
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (float) 3.5);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (float) 3.5);
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (int) 3);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (int) 3);
+    ASSERT_EQUAL(d_float_output, h_float_output);
+    
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (int) 3);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (int) 3);
+    ASSERT_EQUAL(d_int_output, h_int_output);
+    
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (float) 3.5);
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (float) 3.5);
+    ASSERT_EQUAL(d_int_output, h_int_output);
+}
+DECLARE_UNITTEST(TestScanByKeyMixedTypes);
+
+
+void TestScanByKeyLargeInput()
+{
+    const unsigned int N = 1 << 20;
+
+    thrust::host_vector<unsigned int> vals_sizes = unittest::random_integers<unsigned int>(10);
+        
+    thrust::host_vector<unsigned int>   h_vals = unittest::random_integers<unsigned int>(N);
+    thrust::device_vector<unsigned int> d_vals = h_vals;
+
+    thrust::host_vector<unsigned int>   h_output(N, 0);
+    thrust::device_vector<unsigned int> d_output(N, 0);
+
+    for (unsigned int i = 0; i < vals_sizes.size(); i++)
+    {
+        const unsigned int n = vals_sizes[i] % N;
+
+        // define segments
+        thrust::host_vector<unsigned int> h_keys(n);
+        thrust::default_random_engine rng;
+        for(size_t i = 0, k = 0; i < n; i++){
+            h_keys[i] = k;
+            if (rng() % 100 == 0)
+                k++;
+        }
+        thrust::device_vector<unsigned int> d_keys = h_keys;
+    
+        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
+        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
+        ASSERT_EQUAL(d_output, h_output);
+
+        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
+        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
+        ASSERT_EQUAL(d_output, h_output);
+   }
+}
+DECLARE_UNITTEST(TestScanByKeyLargeInput);
+
+
+template <typename T, unsigned int N>
+void _TestScanByKeyWithLargeTypes(void)
+{
+    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector<   unsigned int   > h_keys(n);
+    thrust::host_vector< FixedVector<T,N> > h_vals(n);
+    thrust::host_vector< FixedVector<T,N> > h_output(n);
+
+    thrust::default_random_engine rng;
+    for(size_t i = 0, k = 0; i < h_vals.size(); i++)
+    {
+        h_vals[i] = FixedVector<T,N>(i);
+        h_keys[i]  = k;
+        if (rng() % 5 == 0)
+            k++;
+    }
+
+    thrust::device_vector<   unsigned int   > d_keys = h_keys;
+    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
+    thrust::device_vector< FixedVector<T,N> > d_output(n);
+    
+    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+    
+    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), FixedVector<T,N>(0));
+    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), FixedVector<T,N>(0));
+    
+    ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+void TestScanByKeyWithLargeTypes(void)
+{
+    _TestScanByKeyWithLargeTypes<int,    1>();
+    _TestScanByKeyWithLargeTypes<int,    2>();
+    _TestScanByKeyWithLargeTypes<int,    4>();
+    _TestScanByKeyWithLargeTypes<int,    8>();
+    //_TestScanByKeyWithLargeTypes<int,   16>();  // too many resources requested for launch
+    //_TestScanByKeyWithLargeTypes<int,   32>();  
+    //_TestScanByKeyWithLargeTypes<int,   64>();  // too large to pass as argument
+    //_TestScanByKeyWithLargeTypes<int,  128>();
+    //_TestScanByKeyWithLargeTypes<int,  256>();
+    //_TestScanByKeyWithLargeTypes<int,  512>();
+    //_TestScanByKeyWithLargeTypes<int, 1024>();
+}
+DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
+
diff --git a/thrust/testing/scatter.cu b/thrust/testing/scatter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ffd56f27c1b2236ac989d025c28f1180b05bf7f9
--- /dev/null
+++ b/thrust/testing/scatter.cu
@@ -0,0 +1,359 @@
+#include <unittest/unittest.h>
+#include <thrust/scatter.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/sequence.h>
+#include <thrust/fill.h>
+#include <algorithm>
+
+template <class Vector>
+void TestScatterSimple(void)
+{
+    Vector map(5);  // scatter indices
+    Vector src(5);  // source vector
+    Vector dst(8);  // destination vector
+
+    map[0] = 6; map[1] = 3; map[2] = 1; map[3] = 7; map[4] = 2;
+    src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4;
+    dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0; dst[5] = 0; dst[6] = 0; dst[7] = 0;
+
+    thrust::scatter(src.begin(), src.end(), map.begin(), dst.begin());
+
+    ASSERT_EQUAL(dst[0], 0);
+    ASSERT_EQUAL(dst[1], 2);
+    ASSERT_EQUAL(dst[2], 4);
+    ASSERT_EQUAL(dst[3], 1);
+    ASSERT_EQUAL(dst[4], 0);
+    ASSERT_EQUAL(dst[5], 0);
+    ASSERT_EQUAL(dst[6], 0);
+    ASSERT_EQUAL(dst[7], 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterSimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+void scatter(my_system &system,
+             InputIterator1,
+             InputIterator1,
+             InputIterator2,
+             RandomAccessIterator)
+{
+    system.validate_dispatch();
+}
+
+
+void TestScatterDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::scatter(sys,
+                    vec.begin(),
+                    vec.begin(),
+                    vec.begin(),
+                    vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScatterDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+void scatter(my_tag,
+             InputIterator1,
+             InputIterator1,
+             InputIterator2,
+             RandomAccessIterator output)
+{
+    *output = 13;
+}
+
+void TestScatterDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::scatter(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScatterDispatchImplicit);
+
+
+template <typename T>
+void TestScatter(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input(n, (T) 1);
+    thrust::device_vector<T> d_input(n, (T) 1);
+   
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % output_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    thrust::host_vector<T>   h_output(output_size, (T) 0);
+    thrust::device_vector<T> d_output(output_size, (T) 0);
+
+    thrust::scatter(h_input.begin(), h_input.end(), h_map.begin(), h_output.begin());
+    thrust::scatter(d_input.begin(), d_input.end(), d_map.begin(), d_output.begin());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestScatter);
+
+
+template <typename T>
+void TestScatterToDiscardIterator(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input(n, (T) 1);
+    thrust::device_vector<T> d_input(n, (T) 1);
+   
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % output_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    thrust::scatter(h_input.begin(), h_input.end(), h_map.begin(), thrust::make_discard_iterator());
+    thrust::scatter(d_input.begin(), d_input.end(), d_map.begin(), thrust::make_discard_iterator());
+
+    // there's nothing to check -- just make sure it compiles
+}
+DECLARE_VARIABLE_UNITTEST(TestScatterToDiscardIterator);
+
+
+template <class Vector>
+void TestScatterIfSimple(void)
+{
+    Vector flg(5);  // predicate array
+    Vector map(5);  // scatter indices
+    Vector src(5);  // source vector
+    Vector dst(8);  // destination vector
+
+    flg[0] = 0; flg[1] = 1; flg[2] = 0; flg[3] = 1; flg[4] = 0;
+    map[0] = 6; map[1] = 3; map[2] = 1; map[3] = 7; map[4] = 2;
+    src[0] = 0; src[1] = 1; src[2] = 2; src[3] = 3; src[4] = 4;
+    dst[0] = 0; dst[1] = 0; dst[2] = 0; dst[3] = 0; dst[4] = 0; dst[5] = 0; dst[6] = 0; dst[7] = 0;
+
+    thrust::scatter_if(src.begin(), src.end(), map.begin(), flg.begin(), dst.begin());
+
+    ASSERT_EQUAL(dst[0], 0);
+    ASSERT_EQUAL(dst[1], 0);
+    ASSERT_EQUAL(dst[2], 0);
+    ASSERT_EQUAL(dst[3], 1);
+    ASSERT_EQUAL(dst[4], 0);
+    ASSERT_EQUAL(dst[5], 0);
+    ASSERT_EQUAL(dst[6], 0);
+    ASSERT_EQUAL(dst[7], 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterIfSimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+void scatter_if(my_system &system,
+                InputIterator1,
+                InputIterator1,
+                InputIterator2,
+                InputIterator3,
+                RandomAccessIterator)
+{
+    system.validate_dispatch();
+}
+
+void TestScatterIfDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::scatter_if(sys,
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin(),
+                       vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestScatterIfDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+void scatter_if(my_tag,
+                InputIterator1,
+                InputIterator1,
+                InputIterator2,
+                InputIterator3,
+                RandomAccessIterator output)
+{
+    *output = 13;
+}
+
+void TestScatterIfDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::scatter_if(thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()),
+                       thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestScatterIfDispatchImplicit);
+
+
+template <typename T>
+class is_even_scatter_if
+{
+    public:
+    __host__ __device__ bool operator()(const T i) const { return (i % 2) == 0; }
+};
+
+template <typename T>
+void TestScatterIf(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input(n, (T) 1);
+    thrust::device_vector<T> d_input(n, (T) 1);
+   
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % output_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    thrust::host_vector<T>   h_output(output_size, (T) 0);
+    thrust::device_vector<T> d_output(output_size, (T) 0);
+
+    thrust::scatter_if(h_input.begin(), h_input.end(), h_map.begin(), h_map.begin(), h_output.begin(), is_even_scatter_if<unsigned int>());
+    thrust::scatter_if(d_input.begin(), d_input.end(), d_map.begin(), d_map.begin(), d_output.begin(), is_even_scatter_if<unsigned int>());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestScatterIf);
+
+
+template <typename T>
+void TestScatterIfToDiscardIterator(const size_t n)
+{
+    const size_t output_size = std::min((size_t) 10, 2 * n);
+    
+    thrust::host_vector<T> h_input(n, (T) 1);
+    thrust::device_vector<T> d_input(n, (T) 1);
+   
+    thrust::host_vector<unsigned int> h_map = unittest::random_integers<unsigned int>(n);
+
+    for(size_t i = 0; i < n; i++)
+        h_map[i] =  h_map[i] % output_size;
+    
+    thrust::device_vector<unsigned int> d_map = h_map;
+
+    thrust::scatter_if(h_input.begin(), h_input.end(), h_map.begin(), h_map.begin(), thrust::make_discard_iterator(), is_even_scatter_if<unsigned int>());
+    thrust::scatter_if(d_input.begin(), d_input.end(), d_map.begin(), d_map.begin(), thrust::make_discard_iterator(), is_even_scatter_if<unsigned int>());
+}
+DECLARE_VARIABLE_UNITTEST(TestScatterIfToDiscardIterator);
+
+
+template <typename Vector>
+void TestScatterCountingIterator(void)
+{
+    Vector source(10);
+    thrust::sequence(source.begin(), source.end(), 0);
+
+    Vector map(10);
+    thrust::sequence(map.begin(), map.end(), 0);
+
+    Vector output(10);
+
+    // source has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10),
+                    map.begin(),
+                    output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // map has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter(source.begin(), source.end(),
+                    thrust::make_counting_iterator(0),
+                    output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // source and map have any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10),
+                    thrust::make_counting_iterator(0),
+                    output.begin());
+
+    ASSERT_EQUAL(output, map);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterCountingIterator);
+
+
+template <typename Vector>
+void TestScatterIfCountingIterator(void)
+{
+    Vector source(10);
+    thrust::sequence(source.begin(), source.end(), 0);
+
+    Vector map(10);
+    thrust::sequence(map.begin(), map.end(), 0);
+    
+    Vector stencil(10, 1);
+
+    Vector output(10);
+
+    // source has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10),
+                       map.begin(),
+                       stencil.begin(),
+                       output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // map has any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter_if(source.begin(), source.end(),
+                       thrust::make_counting_iterator(0),
+                       stencil.begin(),
+                       output.begin());
+
+    ASSERT_EQUAL(output, map);
+    
+    // source and map have any_system_tag
+    thrust::fill(output.begin(), output.end(), 0);
+    thrust::scatter_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10),
+                       thrust::make_counting_iterator(0),
+                       stencil.begin(),
+                       output.begin());
+
+    ASSERT_EQUAL(output, map);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterIfCountingIterator);
+
diff --git a/thrust/testing/sequence.cu b/thrust/testing/sequence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cd3e17744fa87621b1b21be028977379c485ae32
--- /dev/null
+++ b/thrust/testing/sequence.cu
@@ -0,0 +1,125 @@
+#include <unittest/unittest.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename ForwardIterator>
+void sequence(my_system &system, ForwardIterator, ForwardIterator)
+{
+    system.validate_dispatch();
+}
+
+void TestSequenceDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::sequence(sys, vec.begin(), vec.end());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSequenceDispatchExplicit);
+
+
+template<typename ForwardIterator>
+void sequence(my_tag, ForwardIterator first, ForwardIterator)
+{
+    *first = 13;
+}
+
+void TestSequenceDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::sequence(thrust::retag<my_tag>(vec.begin()),
+                     thrust::retag<my_tag>(vec.end()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSequenceDispatchImplicit);
+
+
+template <class Vector>
+void TestSequenceSimple(void)
+{
+    Vector v(5);
+
+    thrust::sequence(v.begin(), v.end());
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+    ASSERT_EQUAL(v[3], 3);
+    ASSERT_EQUAL(v[4], 4);
+
+    thrust::sequence(v.begin(), v.end(), 10);
+
+    ASSERT_EQUAL(v[0], 10);
+    ASSERT_EQUAL(v[1], 11);
+    ASSERT_EQUAL(v[2], 12);
+    ASSERT_EQUAL(v[3], 13);
+    ASSERT_EQUAL(v[4], 14);
+    
+    thrust::sequence(v.begin(), v.end(), 10, 2);
+
+    ASSERT_EQUAL(v[0], 10);
+    ASSERT_EQUAL(v[1], 12);
+    ASSERT_EQUAL(v[2], 14);
+    ASSERT_EQUAL(v[3], 16);
+    ASSERT_EQUAL(v[4], 18);
+}
+DECLARE_VECTOR_UNITTEST(TestSequenceSimple);
+
+
+template <typename T>
+void TestSequence(size_t n)
+{
+    thrust::host_vector<T>   h_data(n);
+    thrust::device_vector<T> d_data(n);
+
+    thrust::sequence(h_data.begin(), h_data.end());
+    thrust::sequence(d_data.begin(), d_data.end());
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::sequence(h_data.begin(), h_data.end(), T(10));
+    thrust::sequence(d_data.begin(), d_data.end(), T(10));
+
+    ASSERT_EQUAL(h_data, d_data);
+
+    thrust::sequence(h_data.begin(), h_data.end(), T(10), T(2));
+    thrust::sequence(d_data.begin(), d_data.end(), T(10), T(2));
+
+    ASSERT_EQUAL(h_data, d_data);
+    
+    thrust::sequence(h_data.begin(), h_data.end(), size_t(10), size_t(2));
+    thrust::sequence(d_data.begin(), d_data.end(), size_t(10), size_t(2));
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestSequence);
+
+
+template <typename T>
+void TestSequenceToDiscardIterator(size_t n)
+{
+    thrust::host_vector<T>   h_data(n);
+    thrust::device_vector<T> d_data(n);
+
+    thrust::sequence(thrust::discard_iterator<thrust::device_system_tag>(),
+                     thrust::discard_iterator<thrust::device_system_tag>(13),
+                     T(10),
+                     T(2));
+
+    // nothing to check -- just make sure it compiles
+}
+DECLARE_VARIABLE_UNITTEST(TestSequenceToDiscardIterator);
+
+
+void TestSequenceComplex()
+{
+  thrust::device_vector<thrust::complex<double> > m(64);
+  thrust::sequence(m.begin(), m.end());
+}
+DECLARE_UNITTEST(TestSequenceComplex);
diff --git a/thrust/testing/set_difference.cu b/thrust/testing/set_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ae553fd8254d7e7ec10ca3aa98c4a3d0b86e70d
--- /dev/null
+++ b/thrust/testing/set_difference.cu
@@ -0,0 +1,242 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_difference(my_system &system,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              InputIterator2,
+                              OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestSetDifferenceDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_difference(sys,
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetDifferenceDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_difference(my_tag,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              InputIterator2,
+                              OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestSetDifferenceDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_difference(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetDifferenceDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetDifferenceSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 5;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 6;
+
+  Vector ref(2);
+  ref[0] = 2; ref[1] = 5;
+
+  Vector result(2);
+
+  Iterator end = thrust::set_difference(a.begin(), a.end(),
+                                        b.begin(), b.end(),
+                                        result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetDifferenceSimple);
+
+
+template<typename T>
+void TestSetDifference(const size_t n)
+{
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<T> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<T> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<T> h_b(random.begin() + n, random.end());
+  
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  for (size_t i = 0; i < num_sizes; i++)
+  {
+    size_t size = sizes[i];
+    
+    thrust::host_vector<T>   h_result(n + size);
+    thrust::device_vector<T> d_result(n + size);
+
+    typename thrust::host_vector<T>::iterator   h_end;
+    typename thrust::device_vector<T>::iterator d_end;
+    
+    h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                   h_b.begin(), h_b.begin() + size,
+                                   h_result.begin());
+    h_result.resize(h_end - h_result.begin());
+
+    d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                   d_b.begin(), d_b.begin() + size,
+                                   d_result.begin());
+    d_result.resize(d_end - d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifference);
+
+
+template<typename T>
+void TestSetDifferenceEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_a = temp; thrust::sort(h_a.begin(), h_a.end());
+  thrust::host_vector<T> h_b = h_a;
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                 h_b.begin(), h_b.end(),
+                                 h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                 d_b.begin(), d_b.end(),
+                                 d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceEquivalentRanges);
+
+
+template<typename T>
+void TestSetDifferenceMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end());
+  thrust::sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                 h_b.begin(), h_b.end(),
+                                 h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                 d_b.begin(), d_b.end(),
+                                 d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset);
+
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    thrust::counting_iterator<long long> end_longer = end + 1;
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_difference(thrust::device, begin, end_longer, begin, end, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*end);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/thrust/testing/set_difference_by_key.cu b/thrust/testing/set_difference_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be68685fc1049642aa4098e5442222187df36d2f
--- /dev/null
+++ b/thrust/testing/set_difference_by_key.cu
@@ -0,0 +1,315 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(my_system &system,
+                        InputIterator1,
+                        InputIterator1,
+                        InputIterator2,
+                        InputIterator2,
+                        InputIterator3,
+                        InputIterator4,
+                        OutputIterator1 keys_result,
+                        OutputIterator2 values_result)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestSetDifferenceByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_difference_by_key(sys,
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetDifferenceByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(my_tag,
+                        InputIterator1,
+                        InputIterator1,
+                        InputIterator2,
+                        InputIterator2,
+                        InputIterator3,
+                        InputIterator4,
+                        OutputIterator1 keys_result,
+                        OutputIterator2 values_result)
+{
+  *keys_result = 13;
+  return thrust::make_pair(keys_result,values_result);
+}
+
+void TestSetDifferenceByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_difference_by_key(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetDifferenceByKeyDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetDifferenceByKeySimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 5;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 6;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 2; ref_key[1] = 5;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_difference_by_key(a_key.begin(), a_key.end(),
+                                  b_key.begin(), b_key.end(),
+                                  a_val.begin(),
+                                  b_val.begin(),
+                                  result_key.begin(),
+                                  result_val.begin());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetDifferenceByKeySimple);
+
+
+template<typename T>
+void TestSetDifferenceByKey(const size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+    thrust::host_vector<T> h_b_vals(random_vals.begin() + size_a, random_vals.end());
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+    thrust::device_vector<T> d_b_vals = h_b_vals;
+
+    thrust::host_vector<T> h_result_keys(n);
+    thrust::host_vector<T> h_result_vals(n);
+
+    thrust::device_vector<T> d_result_keys(n);
+    thrust::device_vector<T> d_result_vals(n);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::set_difference_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                          h_b_keys.begin(), h_b_keys.end(),
+                                          h_a_vals.begin(),
+                                          h_b_vals.begin(),
+                                          h_result_keys.begin(),
+                                          h_result_vals.begin());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::set_difference_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                          d_b_keys.begin(), d_b_keys.end(),
+                                          d_a_vals.begin(),
+                                          d_b_vals.begin(),
+                                          d_result_keys.begin(),
+                                          d_result_vals.begin());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKey);
+
+
+template<typename T>
+void TestSetDifferenceByKeyEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+
+  thrust::host_vector<T> h_a_key = temp;
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::host_vector<T> h_b_key = h_a_key;
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+  
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                        h_b_key.begin(), h_b_key.end(),
+                                        h_a_val.begin(),
+                                        h_b_val.begin(),
+                                        h_result_key.begin(),
+                                        h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                        d_b_key.begin(), d_b_key.end(),
+                                        d_a_val.begin(),
+                                        d_b_val.begin(),
+                                        d_result_key.begin(),
+                                        d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyEquivalentRanges);
+
+
+template<typename T>
+void TestSetDifferenceByKeyMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::sort(h_b_key.begin(), h_b_key.end());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                        h_b_key.begin(), h_b_key.end(),
+                                        h_a_val.begin(),
+                                        h_b_val.begin(),
+                                        h_result_key.begin(),
+                                        h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                        d_b_key.begin(), d_b_key.end(),
+                                        d_a_val.begin(),
+                                        d_b_val.begin(),
+                                        d_result_key.begin(),
+                                        d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyMultiset);
+
diff --git a/thrust/testing/set_difference_by_key_descending.cu b/thrust/testing/set_difference_by_key_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b0b58fb67b82f4c3592db091a3d38d3201963c28
--- /dev/null
+++ b/thrust/testing/set_difference_by_key_descending.cu
@@ -0,0 +1,100 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetDifferenceByKeyDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), a_val(4);
+  Vector b_key(5), b_val(5);
+
+  a_key[0] = 5; a_key[1] = 4; a_key[2] = 2; a_key[3] = 0;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 6; b_key[1] = 4; b_key[2] = 3; b_key[3] = 3; b_key[4] = 0;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 5; ref_key[1] = 2;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_difference_by_key(a_key.begin(), a_key.end(),
+                                  b_key.begin(), b_key.end(),
+                                  a_val.begin(),
+                                  b_val.begin(),
+                                  result_key.begin(),
+                                  result_val.begin(),
+                                  thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetDifferenceByKeyDescendingSimple);
+
+
+template<typename T>
+void TestSetDifferenceByKeyDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end(), thrust::greater<T>());
+  thrust::sort(h_b_key.begin(), h_b_key.end(), thrust::greater<T>());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(h_a_key.size());
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(h_b_key.size());
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                        h_b_key.begin(), h_b_key.end(),
+                                        h_a_val.begin(),
+                                        h_b_val.begin(),
+                                        h_result_key.begin(),
+                                        h_result_val.begin(),
+                                        thrust::greater<T>());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                        d_b_key.begin(), d_b_key.end(),
+                                        d_a_val.begin(),
+                                        d_b_val.begin(),
+                                        d_result_key.begin(),
+                                        d_result_val.begin(),
+                                        thrust::greater<T>());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyDescending);
+
diff --git a/thrust/testing/set_difference_descending.cu b/thrust/testing/set_difference_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5dd71cfb263b45819b108524f587db08710bd2d3
--- /dev/null
+++ b/thrust/testing/set_difference_descending.cu
@@ -0,0 +1,68 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetDifferenceDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 5; a[1] = 4; a[2] = 2; a[3] = 0;
+  b[0] = 6; b[1] = 4; b[2] = 3; b[3] = 3; b[4] = 0;
+
+  Vector ref(2);
+  ref[0] = 5; ref[1] = 2;
+
+  Vector result(2);
+
+  Iterator end = thrust::set_difference(a.begin(), a.end(),
+                                        b.begin(), b.end(),
+                                        result.begin(),
+                                        thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetDifferenceDescendingSimple);
+
+
+template<typename T>
+void TestSetDifferenceDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                 h_b.begin(), h_b.end(),
+                                 h_result.begin(),
+                                 thrust::greater<T>());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                 d_b.begin(), d_b.end(),
+                                 d_result.begin(),
+                                 thrust::greater<T>());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceDescending);
+
diff --git a/thrust/testing/set_difference_key_value.cu b/thrust/testing/set_difference_key_value.cu
new file mode 100644
index 0000000000000000000000000000000000000000..381718a4478302ef208f10bb66979068aec03d3e
--- /dev/null
+++ b/thrust/testing/set_difference_key_value.cu
@@ -0,0 +1,51 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename U>
+  void TestSetDifferenceKeyValue(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                 h_b.begin(), h_b.end(),
+                                 h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                 d_b.begin(), d_b.end(),
+                                 d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetDifferenceKeyValue);
+
+
diff --git a/thrust/testing/set_intersection.cu b/thrust/testing/set_intersection.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a8fae65374cfb8bf26046a77af6dce4bc8f21dd5
--- /dev/null
+++ b/thrust/testing/set_intersection.cu
@@ -0,0 +1,283 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_intersection(my_system &system,
+                                InputIterator1,
+                                InputIterator1,
+                                InputIterator2,
+                                InputIterator2,
+                                OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestSetIntersectionDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_intersection(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetIntersectionDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_intersection(my_tag,
+                                InputIterator1,
+                                InputIterator1,
+                                InputIterator2,
+                                InputIterator2,
+                                OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestSetIntersectionDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_intersection(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetIntersectionDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetIntersectionSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(2);
+  ref[0] = 0; ref[1] = 4;
+
+  Vector result(2);
+
+  Iterator end = thrust::set_intersection(a.begin(), a.end(),
+                                          b.begin(), b.end(),
+                                          result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetIntersectionSimple);
+
+
+template<typename T>
+void TestSetIntersection(const size_t n)
+{
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<T> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<T> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<T> h_b(random.begin() + n, random.end());
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  for (size_t i = 0; i < num_sizes; i++)
+  {
+    size_t size = sizes[i];
+    
+    thrust::host_vector<T>   h_result(n + size);
+    thrust::device_vector<T> d_result(n + size);
+
+    typename thrust::host_vector<T>::iterator   h_end;
+    typename thrust::device_vector<T>::iterator d_end;
+    
+    h_end = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                     h_b.begin(), h_b.begin() + size,
+                                     h_result.begin());
+    h_result.resize(h_end - h_result.begin());
+
+    d_end = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                     d_b.begin(), d_b.begin() + size,
+                                     d_result.begin());
+    d_result.resize(d_end - d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersection);
+
+
+template<typename T>
+void TestSetIntersectionToDiscardIterator(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end());
+  thrust::sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::discard_iterator<> h_result;
+  thrust::discard_iterator<> d_result;
+
+  thrust::host_vector<T> h_reference(n);
+  typename thrust::host_vector<T>::iterator h_end = 
+    thrust::set_intersection(h_a.begin(), h_a.end(),
+                             h_b.begin(), h_b.end(),
+                             h_reference.begin());
+  h_reference.erase(h_end, h_reference.end());
+  
+  h_result = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                      h_b.begin(), h_b.end(),
+                                      thrust::make_discard_iterator());
+
+  d_result = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                      d_b.begin(), d_b.end(),
+                                      thrust::make_discard_iterator());
+
+  thrust::discard_iterator<> reference(h_reference.size());
+
+  ASSERT_EQUAL_QUIET(reference, h_result);
+  ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionToDiscardIterator);
+
+
+template<typename T>
+void TestSetIntersectionEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_a = temp; thrust::sort(h_a.begin(), h_a.end());
+  thrust::host_vector<T> h_b = h_a;
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                   h_b.begin(), h_b.end(),
+                                   h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                   d_b.begin(), d_b.end(),
+                                   d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionEquivalentRanges);
+
+
+template<typename T>
+void TestSetIntersectionMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end());
+  thrust::sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                   h_b.begin(), h_b.end(),
+                                   h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                   d_b.begin(), d_b.end(),
+                                   d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);
+
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin1(0);
+    thrust::counting_iterator<long long> begin2 = begin1 + (1ll << magnitude);
+    thrust::counting_iterator<long long> end1 = begin2 + 1;
+    thrust::counting_iterator<long long> end2 = begin2 + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin2, end1), 1);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_intersection(thrust::device, begin1, end1, begin2, end2, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*begin2);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/thrust/testing/set_intersection_by_key.cu b/thrust/testing/set_intersection_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6b7d51fc8cfdcf3882e0f6da736bbcde5bc44670
--- /dev/null
+++ b/thrust/testing/set_intersection_by_key.cu
@@ -0,0 +1,295 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(my_system &system,
+                          InputIterator1,
+                          InputIterator1,
+                          InputIterator2,
+                          InputIterator2,
+                          InputIterator3,
+                          OutputIterator1 keys_result,
+                          OutputIterator2 values_result)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestSetIntersectionByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_intersection_by_key(sys,
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin(),
+                                  vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(my_tag,
+                          InputIterator1,
+                          InputIterator1,
+                          InputIterator2,
+                          InputIterator2,
+                          InputIterator3,
+                          OutputIterator1 keys_result,
+                          OutputIterator2 values_result)
+{
+  *keys_result = 13;
+  return thrust::make_pair(keys_result,values_result);
+}
+
+void TestSetIntersectionByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_intersection_by_key(thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()),
+                                  thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetIntersectionByKeySimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 0; ref_key[1] = 4;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_intersection_by_key(a_key.begin(), a_key.end(),
+                                    b_key.begin(), b_key.end(),
+                                    a_val.begin(),
+                                    result_key.begin(),
+                                    result_val.begin());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetIntersectionByKeySimple);
+
+
+template<typename T>
+void TestSetIntersectionByKey(const size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+
+    thrust::host_vector<T> h_result_keys(n);
+    thrust::host_vector<T> h_result_vals(n);
+
+    thrust::device_vector<T> d_result_keys(n);
+    thrust::device_vector<T> d_result_vals(n);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::set_intersection_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                            h_b_keys.begin(), h_b_keys.end(),
+                                            h_a_vals.begin(),
+                                            h_result_keys.begin(),
+                                            h_result_vals.begin());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::set_intersection_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                            d_b_keys.begin(), d_b_keys.end(),
+                                            d_a_vals.begin(),
+                                            d_result_keys.begin(),
+                                            d_result_vals.begin());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKey);
+
+
+template<typename T>
+void TestSetIntersectionByKeyEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+
+  thrust::host_vector<T> h_a_key = temp;
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::host_vector<T> h_b_key = h_a_key;
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+  
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_intersection_by_key(h_a_key.begin(), h_a_key.end(),
+                                          h_b_key.begin(), h_b_key.end(),
+                                          h_a_val.begin(),
+                                          h_result_key.begin(),
+                                          h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_intersection_by_key(d_a_key.begin(), d_a_key.end(),
+                                          d_b_key.begin(), d_b_key.end(),
+                                          d_a_val.begin(),
+                                          d_result_key.begin(),
+                                          d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyEquivalentRanges);
+
+
+template<typename T>
+void TestSetIntersectionByKeyMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::sort(h_b_key.begin(), h_b_key.end());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_intersection_by_key(h_a_key.begin(), h_a_key.end(),
+                                          h_b_key.begin(), h_b_key.end(),
+                                          h_a_val.begin(),
+                                          h_result_key.begin(),
+                                          h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_intersection_by_key(d_a_key.begin(), d_a_key.end(),
+                                          d_b_key.begin(), d_b_key.end(),
+                                          d_a_val.begin(),
+                                          d_result_key.begin(),
+                                          d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyMultiset);
+
diff --git a/thrust/testing/set_intersection_by_key_descending.cu b/thrust/testing/set_intersection_by_key_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9931b7a199f53b245df0de7052b20f39abba52ee
--- /dev/null
+++ b/thrust/testing/set_intersection_by_key_descending.cu
@@ -0,0 +1,94 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetIntersectionByKeyDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3), b_val(4);
+
+  a_key[0] = 4; a_key[1] = 2; a_key[2] = 0;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 4; b_key[1] = 3; b_key[2] = 3; b_key[3] = 0;
+
+  Vector ref_key(2), ref_val(2);
+  ref_key[0] = 4; ref_key[1] = 0;
+  ref_val[0] = 0; ref_val[1] = 0;
+
+  Vector result_key(2), result_val(2);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_intersection_by_key(a_key.begin(), a_key.end(),
+                                    b_key.begin(), b_key.end(),
+                                    a_val.begin(),
+                                    result_key.begin(),
+                                    result_val.begin(),
+                                    thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetIntersectionByKeyDescendingSimple);
+
+
+template<typename T>
+void TestSetIntersectionByKeyDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end(), thrust::greater<T>());
+  thrust::sort(h_b_key.begin(), h_b_key.end(), thrust::greater<T>());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(h_a_key.size());
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+
+  thrust::host_vector<T>   h_result_key(n), h_result_val(n);
+  thrust::device_vector<T> d_result_key(n), d_result_val(n);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_intersection_by_key(h_a_key.begin(), h_a_key.end(),
+                                          h_b_key.begin(), h_b_key.end(),
+                                          h_a_val.begin(),
+                                          h_result_key.begin(),
+                                          h_result_val.begin(),
+                                          thrust::greater<T>());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_intersection_by_key(d_a_key.begin(), d_a_key.end(),
+                                          d_b_key.begin(), d_b_key.end(),
+                                          d_a_val.begin(),
+                                          d_result_key.begin(),
+                                          d_result_val.begin(),
+                                          thrust::greater<T>());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyDescending);
+
diff --git a/thrust/testing/set_intersection_descending.cu b/thrust/testing/set_intersection_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd4e7cb120a63f3c9c99018895edd51637dd072d
--- /dev/null
+++ b/thrust/testing/set_intersection_descending.cu
@@ -0,0 +1,68 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetIntersectionDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 4; a[1] = 2; a[2] = 0;
+  b[0] = 4; b[1] = 3; b[2] = 3; b[3] = 0;
+
+  Vector ref(2);
+  ref[0] = 4; ref[1] = 0;
+
+  Vector result(2);
+
+  Iterator end = thrust::set_intersection(a.begin(), a.end(),
+                                          b.begin(), b.end(),
+                                          result.begin(),
+                                          thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetIntersectionDescendingSimple);
+
+
+template<typename T>
+void TestSetIntersectionDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                   h_b.begin(), h_b.end(),
+                                   h_result.begin(),
+                                   thrust::greater<T>());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                   d_b.begin(), d_b.end(),
+                                   d_result.begin(),
+                                   thrust::greater<T>());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionDescending);
+
diff --git a/thrust/testing/set_intersection_key_value.cu b/thrust/testing/set_intersection_key_value.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bdca4816df995acecd5c8ba2a45f46941840db4e
--- /dev/null
+++ b/thrust/testing/set_intersection_key_value.cu
@@ -0,0 +1,50 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename U>
+  void TestSetIntersectionKeyValue(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(n);
+  thrust::device_vector<T> d_result(n);
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_intersection(h_a.begin(), h_a.end(),
+                                   h_b.begin(), h_b.end(),
+                                   h_result.begin());
+  h_result.resize(h_end - h_result.begin());
+
+  d_end = thrust::set_intersection(d_a.begin(), d_a.end(),
+                                   d_b.begin(), d_b.end(),
+                                   d_result.begin());
+
+  d_result.resize(d_end - d_result.begin());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionKeyValue);
+
diff --git a/thrust/testing/set_symmetric_difference.cu b/thrust/testing/set_symmetric_difference.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b3e3c14932c64c49224fb7212e2f74d5e3d34d5e
--- /dev/null
+++ b/thrust/testing/set_symmetric_difference.cu
@@ -0,0 +1,257 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_symmetric_difference(my_system &system,
+                                        InputIterator1,
+                                        InputIterator1,
+                                        InputIterator2,
+                                        InputIterator2,
+                                        OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestSetSymmetricDifferenceDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_symmetric_difference(sys,
+                                   vec.begin(),
+                                   vec.begin(),
+                                   vec.begin(),
+                                   vec.begin(),
+                                   vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_symmetric_difference(my_tag,
+                                        InputIterator1,
+                                        InputIterator1,
+                                        InputIterator2,
+                                        InputIterator2,
+                                        OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestSetSymmetricDifferenceDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_symmetric_difference(thrust::retag<my_tag>(vec.begin()),
+                                   thrust::retag<my_tag>(vec.begin()),
+                                   thrust::retag<my_tag>(vec.begin()),
+                                   thrust::retag<my_tag>(vec.begin()),
+                                   thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetSymmetricDifferenceSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 4; a[3] = 6;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4; b[4] = 7;
+
+  Vector ref(5);
+  ref[0] = 2; ref[1] = 3; ref[2] = 3; ref[3] = 6; ref[4] = 7;
+
+  Vector result(5);
+
+  Iterator end = thrust::set_symmetric_difference(a.begin(), a.end(),
+                                                  b.begin(), b.end(),
+                                                  result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetSymmetricDifferenceSimple);
+
+
+template<typename T>
+void TestSetSymmetricDifference(const size_t n)
+{
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<T> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<T> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<T> h_b(random.begin() + n, random.end());
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  for (size_t i = 0; i < num_sizes; i++)
+  {
+      size_t size = sizes[i];
+      
+      thrust::host_vector<T>   h_result(n + size);
+      thrust::device_vector<T> d_result(n + size);
+
+      typename thrust::host_vector<T>::iterator   h_end;
+      typename thrust::device_vector<T>::iterator d_end;
+      
+      h_end = thrust::set_symmetric_difference(h_a.begin(), h_a.end(),
+                                               h_b.begin(), h_b.begin() + size,
+                                               h_result.begin());
+      h_result.resize(h_end - h_result.begin());
+
+      d_end = thrust::set_symmetric_difference(d_a.begin(), d_a.end(),
+                                               d_b.begin(), d_b.begin() + size,
+                                               d_result.begin());
+      d_result.resize(d_end - d_result.begin());
+
+      ASSERT_EQUAL(h_result, d_result);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifference);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_a = temp; thrust::sort(h_a.begin(), h_a.end());
+  thrust::host_vector<T> h_b = h_a;
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(h_result.size());
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_symmetric_difference(h_a.begin(), h_a.end(),
+                                           h_b.begin(), h_b.end(),
+                                           h_result.begin());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_symmetric_difference(d_a.begin(), d_a.end(),
+                                           d_b.begin(), d_b.end(),
+                                           d_result.begin());
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceEquivalentRanges);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end());
+  thrust::sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(h_result.size());
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_difference(h_a.begin(), h_a.end(),
+                                 h_b.begin(), h_b.end(),
+                                 h_result.begin());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_difference(d_a.begin(), d_a.end(),
+                                 d_b.begin(), d_b.end(),
+                                 d_result.begin());
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceMultiset);
+
+
+template<typename U>
+  void TestSetSymmetricDifferenceKeyValue(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(h_result.size());
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_symmetric_difference(h_a.begin(), h_a.end(),
+                                           h_b.begin(), h_b.end(),
+                                           h_result.begin());
+  h_result.erase(h_end, h_result.begin());
+
+  d_end = thrust::set_symmetric_difference(d_a.begin(), d_a.end(),
+                                           d_b.begin(), d_b.end(),
+                                           d_result.begin());
+
+  d_result.erase(d_end, d_result.begin());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceKeyValue);
+
diff --git a/thrust/testing/set_symmetric_difference_by_key.cu b/thrust/testing/set_symmetric_difference_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c2688fdb88ae76fd185a3986d0a2dbbfe2fb2c83
--- /dev/null
+++ b/thrust/testing/set_symmetric_difference_by_key.cu
@@ -0,0 +1,320 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(my_system &system,
+                                  InputIterator1,
+                                  InputIterator1,
+                                  InputIterator2,
+                                  InputIterator2,
+                                  InputIterator3,
+                                  InputIterator4,
+                                  OutputIterator1 keys_result,
+                                  OutputIterator2 values_result)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestSetSymmetricDifferenceByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_symmetric_difference_by_key(sys,
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin(),
+                                          vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(my_tag,
+                                  InputIterator1,
+                                  InputIterator1,
+                                  InputIterator2,
+                                  InputIterator2,
+                                  InputIterator3,
+                                  InputIterator4,
+                                  OutputIterator1 keys_result,
+                                  OutputIterator2 values_result)
+{
+  *keys_result = 13;
+  return thrust::make_pair(keys_result,values_result);
+}
+
+void TestSetSymmetricDifferenceByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_symmetric_difference_by_key(thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()),
+                                          thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetSymmetricDifferenceByKeySimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4; a_key[3] = 6;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4; b_key[4] = 7;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 2; ref_key[1] = 3; ref_key[2] = 3; ref_key[3] = 6; ref_key[4] = 7;
+  ref_val[0] = 0; ref_val[1] = 1; ref_val[2] = 1; ref_val[3] = 0; ref_val[4] = 1;
+
+  Vector result_key(5), result_val(5);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_symmetric_difference_by_key(a_key.begin(), a_key.end(),
+                                            b_key.begin(), b_key.end(),
+                                            a_val.begin(),
+                                            b_val.begin(),
+                                            result_key.begin(),
+                                            result_val.begin());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetSymmetricDifferenceByKeySimple);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceByKey(const size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+    thrust::host_vector<T> h_b_vals(random_vals.begin() + size_a, random_vals.end());
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+    thrust::device_vector<T> d_b_vals = h_b_vals;
+
+    size_t max_size = h_a_keys.size() + h_b_keys.size();
+
+    thrust::host_vector<T> h_result_keys(max_size);
+    thrust::host_vector<T> h_result_vals(max_size);
+
+    thrust::device_vector<T> d_result_keys(max_size);
+    thrust::device_vector<T> d_result_vals(max_size);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::set_symmetric_difference_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                                    h_b_keys.begin(), h_b_keys.end(),
+                                                    h_a_vals.begin(),
+                                                    h_b_vals.begin(),
+                                                    h_result_keys.begin(),
+                                                    h_result_vals.begin());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::set_symmetric_difference_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                                    d_b_keys.begin(), d_b_keys.end(),
+                                                    d_a_vals.begin(),
+                                                    d_b_vals.begin(),
+                                                    d_result_keys.begin(),
+                                                    d_result_vals.begin());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKey);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceByKeyEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+
+  thrust::host_vector<T> h_a_key = temp;
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::host_vector<T> h_b_key = h_a_key;
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+  
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_symmetric_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                                  h_b_key.begin(), h_b_key.end(),
+                                                  h_a_val.begin(),
+                                                  h_b_val.begin(),
+                                                  h_result_key.begin(),
+                                                  h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_symmetric_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                                  d_b_key.begin(), d_b_key.end(),
+                                                  d_a_val.begin(),
+                                                  d_b_val.begin(),
+                                                  d_result_key.begin(),
+                                                  d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyEquivalentRanges);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceByKeyMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::sort(h_b_key.begin(), h_b_key.end());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_symmetric_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                                  h_b_key.begin(), h_b_key.end(),
+                                                  h_a_val.begin(),
+                                                  h_b_val.begin(),
+                                                  h_result_key.begin(),
+                                                  h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_symmetric_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                                  d_b_key.begin(), d_b_key.end(),
+                                                  d_a_val.begin(),
+                                                  d_b_val.begin(),
+                                                  d_result_key.begin(),
+                                                  d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyMultiset);
+
diff --git a/thrust/testing/set_symmetric_difference_by_key_descending.cu b/thrust/testing/set_symmetric_difference_by_key_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..59db39f5553e6247ee13ef389bd6621f1ff20456
--- /dev/null
+++ b/thrust/testing/set_symmetric_difference_by_key_descending.cu
@@ -0,0 +1,101 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetSymmetricDifferenceByKeyDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(4), b_key(5);
+  Vector a_val(4), b_val(5);
+
+  a_key[0] = 6; a_key[1] = 4; a_key[2] = 2; a_key[3] = 0;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0; a_val[3] = 0;
+
+  b_key[0] = 7; b_key[1] = 4; b_key[2] = 3; b_key[3] = 3; b_key[4] = 0;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1; b_val[4] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 7; ref_key[1] = 6; ref_key[2] = 3; ref_key[3] = 3; ref_key[4] = 2;
+  ref_val[0] = 1; ref_val[1] = 0; ref_val[2] = 1; ref_val[3] = 1; ref_val[4] = 0;
+
+  Vector result_key(5), result_val(5);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_symmetric_difference_by_key(a_key.begin(), a_key.end(),
+                                            b_key.begin(), b_key.end(),
+                                            a_val.begin(),
+                                            b_val.begin(),
+                                            result_key.begin(),
+                                            result_val.begin(),
+                                            thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetSymmetricDifferenceByKeyDescendingSimple);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceByKeyDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end(), thrust::greater<T>());
+  thrust::sort(h_b_key.begin(), h_b_key.end(), thrust::greater<T>());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(h_a_key.size());
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(h_b_key.size());
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_symmetric_difference_by_key(h_a_key.begin(), h_a_key.end(),
+                                                  h_b_key.begin(), h_b_key.end(),
+                                                  h_a_val.begin(),
+                                                  h_b_val.begin(),
+                                                  h_result_key.begin(),
+                                                  h_result_val.begin(),
+                                                  thrust::greater<T>());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_symmetric_difference_by_key(d_a_key.begin(), d_a_key.end(),
+                                                  d_b_key.begin(), d_b_key.end(),
+                                                  d_a_val.begin(),
+                                                  d_b_val.begin(),
+                                                  d_result_key.begin(),
+                                                  d_result_val.begin(),
+                                                  thrust::greater<T>());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyDescending);
+
diff --git a/thrust/testing/set_symmetric_difference_descending.cu b/thrust/testing/set_symmetric_difference_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ade5e089869c345f1fa84be3f5b0c98a5254f89
--- /dev/null
+++ b/thrust/testing/set_symmetric_difference_descending.cu
@@ -0,0 +1,68 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetSymmetricDifferenceDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(4), b(5);
+
+  a[0] = 6; a[1] = 4; a[2] = 2; a[3] = 0;
+  b[0] = 7; b[1] = 4; b[2] = 3; b[3] = 3; b[4] = 0;
+
+  Vector ref(5);
+  ref[0] = 7; ref[1] = 6; ref[2] = 3; ref[3] = 3; ref[4] = 2;
+
+  Vector result(5);
+
+  Iterator end = thrust::set_symmetric_difference(a.begin(), a.end(),
+                                                  b.begin(), b.end(),
+                                                  result.begin(),
+                                                  thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetSymmetricDifferenceDescendingSimple);
+
+
+template<typename T>
+void TestSetSymmetricDifferenceDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(h_result.size());
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_symmetric_difference(h_a.begin(), h_a.end(),
+                                           h_b.begin(), h_b.end(),
+                                           h_result.begin(),
+                                           thrust::greater<T>());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_symmetric_difference(d_a.begin(), d_a.end(),
+                                           d_b.begin(), d_b.end(),
+                                           d_result.begin(),
+                                           thrust::greater<T>());
+
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceDescending);
+
diff --git a/thrust/testing/set_union.cu b/thrust/testing/set_union.cu
new file mode 100644
index 0000000000000000000000000000000000000000..414bd3c81edbfbd61bd18600ea54961e0c2f3526
--- /dev/null
+++ b/thrust/testing/set_union.cu
@@ -0,0 +1,199 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/extrema.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_union(my_system &system,
+                         InputIterator1,
+                         InputIterator1,
+                         InputIterator2,
+                         InputIterator2,
+                         OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestSetUnionDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_union(sys,
+                    vec.begin(),
+                    vec.begin(),
+                    vec.begin(),
+                    vec.begin(),
+                    vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetUnionDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+OutputIterator set_union(my_tag,
+                         InputIterator1,
+                         InputIterator1,
+                         InputIterator2,
+                         InputIterator2,
+                         OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestSetUnionDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_union(thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()),
+                    thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetUnionDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetUnionSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 0; a[1] = 2; a[2] = 4;
+  b[0] = 0; b[1] = 3; b[2] = 3; b[3] = 4;
+
+  Vector ref(5);
+  ref[0] = 0; ref[1] = 2; ref[2] = 3; ref[3] = 3; ref[4] = 4;
+
+  Vector result(5);
+
+  Iterator end = thrust::set_union(a.begin(), a.end(),
+                                   b.begin(), b.end(),
+                                   result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetUnionSimple);
+
+
+template<typename Vector>
+void TestSetUnionWithEquivalentElementsSimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(5);
+
+  a[0] = 0; a[1] = 2; a[2] = 2;
+  b[0] = 0; b[1] = 2; b[2] = 2; b[3] = 2; b[4] = 3;
+
+  Vector ref(5);
+  ref[0] = 0; ref[1] = 2; ref[2] = 2; ref[3] = 2; ref[4] = 3;
+
+  Vector result(5);
+
+  Iterator end = thrust::set_union(a.begin(), a.end(),
+                                   b.begin(), b.end(),
+                                   result.begin());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetUnionWithEquivalentElementsSimple);
+
+
+template<typename T>
+void TestSetUnion(const size_t n)
+{
+  size_t sizes[]   = {0, 1, n / 2, n, n + 1, 2 * n};
+  size_t num_sizes = sizeof(sizes) / sizeof(size_t);
+
+  thrust::host_vector<T> random = unittest::random_integers<unittest::int8_t>(n + *thrust::max_element(sizes, sizes + num_sizes));
+
+  thrust::host_vector<T> h_a(random.begin(), random.begin() + n);
+  thrust::host_vector<T> h_b(random.begin() + n, random.end());
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+  
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  for (size_t i = 0; i < num_sizes; i++)
+  {
+    size_t size = sizes[i];
+    
+    thrust::host_vector<T>   h_result(n + size);
+    thrust::device_vector<T> d_result(n + size);
+
+    typename thrust::host_vector<T>::iterator   h_end;
+    typename thrust::device_vector<T>::iterator d_end;
+    
+    h_end = thrust::set_union(h_a.begin(), h_a.end(),
+                              h_b.begin(), h_b.begin() + size,
+                              h_result.begin());
+    h_result.resize(h_end - h_result.begin());
+
+    d_end = thrust::set_union(d_a.begin(), d_a.end(),
+                              d_b.begin(), d_b.begin() + size,
+                              d_result.begin());
+    d_result.resize(d_end - d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnion);
+
+template<typename T>
+void TestSetUnionToDiscardIterator(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end());
+  thrust::sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::discard_iterator<> h_result;
+  thrust::discard_iterator<> d_result;
+
+  thrust::host_vector<T> h_reference(2 * n);
+  typename thrust::host_vector<T>::iterator h_end = 
+    thrust::set_union(h_a.begin(), h_a.end(),
+                      h_b.begin(), h_b.end(),
+                      h_reference.begin());
+  h_reference.erase(h_end, h_reference.end());
+  
+  h_result = thrust::set_union(h_a.begin(), h_a.end(),
+                               h_b.begin(), h_b.end(),
+                               thrust::make_discard_iterator());
+
+  d_result = thrust::set_union(d_a.begin(), d_a.end(),
+                               d_b.begin(), d_b.end(),
+                               thrust::make_discard_iterator());
+
+  thrust::discard_iterator<> reference(h_reference.size());
+
+  ASSERT_EQUAL_QUIET(reference, h_result);
+  ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionToDiscardIterator);
+
diff --git a/thrust/testing/set_union_by_key.cu b/thrust/testing/set_union_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ec8864941192abd55e6628e3a5941b4f6326e8c6
--- /dev/null
+++ b/thrust/testing/set_union_by_key.cu
@@ -0,0 +1,320 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(my_system &system,
+                   InputIterator1,
+                   InputIterator1,
+                   InputIterator2,
+                   InputIterator2,
+                   InputIterator3,
+                   InputIterator4,
+                   OutputIterator1 keys_result,
+                   OutputIterator2 values_result)
+{
+  system.validate_dispatch();
+  return thrust::make_pair(keys_result, values_result);
+}
+
+void TestSetUnionByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::set_union_by_key(sys,
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin(),
+                           vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSetUnionByKeyDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(my_tag,
+                   InputIterator1,
+                   InputIterator1,
+                   InputIterator2,
+                   InputIterator2,
+                   InputIterator3,
+                   InputIterator4,
+                   OutputIterator1 keys_result,
+                   OutputIterator2 values_result)
+{
+  *keys_result = 13;
+  return thrust::make_pair(keys_result,values_result);
+}
+
+void TestSetUnionByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::set_union_by_key(thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()),
+                           thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSetUnionByKeyDispatchImplicit);
+
+
+template<typename Vector>
+void TestSetUnionByKeySimple(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3), b_val(4);
+
+  a_key[0] = 0; a_key[1] = 2; a_key[2] = 4;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 0; b_key[1] = 3; b_key[2] = 3; b_key[3] = 4;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 0; ref_key[1] = 2; ref_key[2] = 3; ref_key[3] = 3; ref_key[4] = 4;
+  ref_val[0] = 0; ref_val[1] = 0; ref_val[2] = 1; ref_val[3] = 1; ref_val[4] = 0;
+
+  Vector result_key(5), result_val(5);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_union_by_key(a_key.begin(), a_key.end(),
+                             b_key.begin(), b_key.end(),
+                             a_val.begin(),
+                             b_val.begin(),
+                             result_key.begin(),
+                             result_val.begin());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetUnionByKeySimple);
+
+
+template<typename T>
+void TestSetUnionByKey(const size_t n)
+{
+  thrust::host_vector<T> random_keys = unittest::random_integers<unittest::int8_t>(n);
+  thrust::host_vector<T> random_vals = unittest::random_integers<unittest::int8_t>(n);
+
+  size_t denominators[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  size_t num_denominators = sizeof(denominators) / sizeof(size_t);
+
+  for(size_t i = 0; i < num_denominators; ++i)
+  {
+    size_t size_a = n / denominators[i];
+
+    thrust::host_vector<T> h_a_keys(random_keys.begin(), random_keys.begin() + size_a);
+    thrust::host_vector<T> h_b_keys(random_keys.begin() + size_a, random_keys.end());
+
+    thrust::host_vector<T> h_a_vals(random_vals.begin(), random_vals.begin() + size_a);
+    thrust::host_vector<T> h_b_vals(random_vals.begin() + size_a, random_vals.end());
+
+    thrust::stable_sort(h_a_keys.begin(), h_a_keys.end());
+    thrust::stable_sort(h_b_keys.begin(), h_b_keys.end());
+
+    thrust::device_vector<T> d_a_keys = h_a_keys;
+    thrust::device_vector<T> d_b_keys = h_b_keys;
+
+    thrust::device_vector<T> d_a_vals = h_a_vals;
+    thrust::device_vector<T> d_b_vals = h_b_vals;
+
+    size_t max_size = h_a_keys.size() + h_b_keys.size();
+
+    thrust::host_vector<T> h_result_keys(max_size);
+    thrust::host_vector<T> h_result_vals(max_size);
+
+    thrust::device_vector<T> d_result_keys(max_size);
+    thrust::device_vector<T> d_result_vals(max_size);
+
+
+    thrust::pair<
+      typename thrust::host_vector<T>::iterator,
+      typename thrust::host_vector<T>::iterator
+    > h_end;
+
+    thrust::pair<
+      typename thrust::device_vector<T>::iterator,
+      typename thrust::device_vector<T>::iterator
+    > d_end;
+
+
+    h_end = thrust::set_union_by_key(h_a_keys.begin(), h_a_keys.end(),
+                                     h_b_keys.begin(), h_b_keys.end(),
+                                     h_a_vals.begin(),
+                                     h_b_vals.begin(),
+                                     h_result_keys.begin(),
+                                     h_result_vals.begin());
+    h_result_keys.erase(h_end.first, h_result_keys.end());
+    h_result_vals.erase(h_end.second, h_result_vals.end());
+
+    d_end = thrust::set_union_by_key(d_a_keys.begin(), d_a_keys.end(),
+                                     d_b_keys.begin(), d_b_keys.end(),
+                                     d_a_vals.begin(),
+                                     d_b_vals.begin(),
+                                     d_result_keys.begin(),
+                                     d_result_vals.begin());
+    d_result_keys.erase(d_end.first, d_result_keys.end());
+    d_result_vals.erase(d_end.second, d_result_vals.end());
+
+    ASSERT_EQUAL(h_result_keys, d_result_keys);
+    ASSERT_EQUAL(h_result_vals, d_result_vals);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionByKey);
+
+
+template<typename T>
+void TestSetUnionByKeyEquivalentRanges(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(n);
+
+  thrust::host_vector<T> h_a_key = temp;
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::host_vector<T> h_b_key = h_a_key;
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+  
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_union_by_key(h_a_key.begin(), h_a_key.end(),
+                                   h_b_key.begin(), h_b_key.end(),
+                                   h_a_val.begin(),
+                                   h_b_val.begin(),
+                                   h_result_key.begin(),
+                                   h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_union_by_key(d_a_key.begin(), d_a_key.end(),
+                                   d_b_key.begin(), d_b_key.end(),
+                                   d_a_val.begin(),
+                                   d_b_val.begin(),
+                                   d_result_key.begin(),
+                                   d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyEquivalentRanges);
+
+
+template<typename T>
+void TestSetUnionByKeyMultiset(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+
+  // restrict elements to [min,13)
+  for(typename thrust::host_vector<T>::iterator i = temp.begin();
+      i != temp.end();
+      ++i)
+  {
+    int temp = static_cast<int>(*i);
+    temp %= 13;
+    *i = temp;
+  }
+
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end());
+  thrust::sort(h_b_key.begin(), h_b_key.end());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(n);
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(n);
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_union_by_key(h_a_key.begin(), h_a_key.end(),
+                                   h_b_key.begin(), h_b_key.end(),
+                                   h_a_val.begin(),
+                                   h_b_val.begin(),
+                                   h_result_key.begin(),
+                                   h_result_val.begin());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_union_by_key(d_a_key.begin(), d_a_key.end(),
+                                   d_b_key.begin(), d_b_key.end(),
+                                   d_a_val.begin(),
+                                   d_b_val.begin(),
+                                   d_result_key.begin(),
+                                   d_result_val.begin());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyMultiset);
+
diff --git a/thrust/testing/set_union_by_key_descending.cu b/thrust/testing/set_union_by_key_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eff9659b41624455659cf285b6393ab0f51537b6
--- /dev/null
+++ b/thrust/testing/set_union_by_key_descending.cu
@@ -0,0 +1,101 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetUnionByKeyDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a_key(3), b_key(4);
+  Vector a_val(3), b_val(4);
+
+  a_key[0] = 4; a_key[1] = 2; a_key[2] = 0;
+  a_val[0] = 0; a_val[1] = 0; a_val[2] = 0;
+
+  b_key[0] = 4; b_key[1] = 3; b_key[2] = 3; b_key[3] = 0;
+  b_val[0] = 1; b_val[1] = 1; b_val[2] = 1; b_val[3] = 1;
+
+  Vector ref_key(5), ref_val(5);
+  ref_key[0] = 4; ref_key[1] = 3; ref_key[2] = 3; ref_key[3] = 2; ref_key[4] = 0;
+  ref_val[0] = 0; ref_val[1] = 1; ref_val[2] = 1; ref_val[3] = 0; ref_val[4] = 0;
+
+  Vector result_key(5), result_val(5);
+
+  thrust::pair<Iterator,Iterator> end =
+    thrust::set_union_by_key(a_key.begin(), a_key.end(),
+                             b_key.begin(), b_key.end(),
+                             a_val.begin(),
+                             b_val.begin(),
+                             result_key.begin(),
+                             result_val.begin(),
+                             thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result_key.end(), end.first);
+  ASSERT_EQUAL_QUIET(result_val.end(), end.second);
+  ASSERT_EQUAL(ref_key, result_key);
+  ASSERT_EQUAL(ref_val, result_val);
+}
+DECLARE_VECTOR_UNITTEST(TestSetUnionByKeyDescendingSimple);
+
+
+template<typename T>
+void TestSetUnionByKeyDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a_key.begin(), h_a_key.end(), thrust::greater<T>());
+  thrust::sort(h_b_key.begin(), h_b_key.end(), thrust::greater<T>());
+
+  thrust::host_vector<T> h_a_val = unittest::random_integers<T>(h_a_key.size());
+  thrust::host_vector<T> h_b_val = unittest::random_integers<T>(h_b_key.size());
+
+  thrust::device_vector<T> d_a_key = h_a_key;
+  thrust::device_vector<T> d_b_key = h_b_key;
+
+  thrust::device_vector<T> d_a_val = h_a_val;
+  thrust::device_vector<T> d_b_val = h_b_val;
+
+  size_t max_size = h_a_key.size() + h_b_key.size();
+  thrust::host_vector<T>   h_result_key(max_size), h_result_val(max_size);
+  thrust::device_vector<T> d_result_key(max_size), d_result_val(max_size);
+
+  thrust::pair<
+    typename thrust::host_vector<T>::iterator,
+    typename thrust::host_vector<T>::iterator
+  > h_end;
+
+  thrust::pair<
+    typename thrust::device_vector<T>::iterator,
+    typename thrust::device_vector<T>::iterator
+  > d_end;
+  
+  h_end = thrust::set_union_by_key(h_a_key.begin(), h_a_key.end(),
+                                   h_b_key.begin(), h_b_key.end(),
+                                   h_a_val.begin(),
+                                   h_b_val.begin(),
+                                   h_result_key.begin(),
+                                   h_result_val.begin(),
+                                   thrust::greater<T>());
+  h_result_key.erase(h_end.first,  h_result_key.end());
+  h_result_val.erase(h_end.second, h_result_val.end());
+
+  d_end = thrust::set_union_by_key(d_a_key.begin(), d_a_key.end(),
+                                   d_b_key.begin(), d_b_key.end(),
+                                   d_a_val.begin(),
+                                   d_b_val.begin(),
+                                   d_result_key.begin(),
+                                   d_result_val.begin(),
+                                   thrust::greater<T>());
+  d_result_key.erase(d_end.first,  d_result_key.end());
+  d_result_val.erase(d_end.second, d_result_val.end());
+
+  ASSERT_EQUAL(h_result_key, d_result_key);
+  ASSERT_EQUAL(h_result_val, d_result_val);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyDescending);
+
diff --git a/thrust/testing/set_union_descending.cu b/thrust/testing/set_union_descending.cu
new file mode 100644
index 0000000000000000000000000000000000000000..43dbc79acd06ef503ef710212d6889a4bb9437be
--- /dev/null
+++ b/thrust/testing/set_union_descending.cu
@@ -0,0 +1,68 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename Vector>
+void TestSetUnionDescendingSimple(void)
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector a(3), b(4);
+
+  a[0] = 4; a[1] = 2; a[2] = 0;
+  b[0] = 4; b[1] = 3; b[2] = 3; b[3] = 0;
+
+  Vector ref(5);
+  ref[0] = 4; ref[1] = 3; ref[2] = 3; ref[3] = 2; ref[4] = 0;
+
+  Vector result(5);
+
+  Iterator end = thrust::set_union(a.begin(), a.end(),
+                                   b.begin(), b.end(),
+                                   result.begin(),
+                                   thrust::greater<T>());
+
+  ASSERT_EQUAL_QUIET(result.end(), end);
+  ASSERT_EQUAL(ref, result);
+}
+DECLARE_VECTOR_UNITTEST(TestSetUnionDescendingSimple);
+
+
+template<typename T>
+void TestSetUnionDescending(const size_t n)
+{
+  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
+  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+
+  thrust::sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T> h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(h_result.size());
+
+  typename thrust::host_vector<T>::iterator h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_union(h_a.begin(), h_a.end(),
+                            h_b.begin(), h_b.end(),
+                            h_result.begin(),
+                            thrust::greater<T>());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_union(d_a.begin(), d_a.end(),
+                            d_b.begin(), d_b.end(),
+                            d_result.begin(),
+                            thrust::greater<T>());
+
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionDescending);
+
diff --git a/thrust/testing/set_union_key_value.cu b/thrust/testing/set_union_key_value.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29f3f83ece54ac2948a450995d0101d268a4f720
--- /dev/null
+++ b/thrust/testing/set_union_key_value.cu
@@ -0,0 +1,95 @@
+#include <unittest/unittest.h>
+#include <thrust/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+template<typename U>
+  void TestSetUnionKeyValue(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(d_a.size() + d_b.size());
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_union(h_a.begin(), h_a.end(),
+                            h_b.begin(), h_b.end(),
+                            h_result.begin());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_union(d_a.begin(), d_a.end(),
+                            d_b.begin(), d_b.end(),
+                            d_result.begin());
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionKeyValue);
+
+template<typename U>
+  void TestSetUnionKeyValueDescending(size_t n)
+{
+  typedef key_value<U,U> T;
+
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
+
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
+
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
+  {
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
+  }
+
+  thrust::stable_sort(h_a.begin(), h_a.end(), thrust::greater<T>());
+  thrust::stable_sort(h_b.begin(), h_b.end(), thrust::greater<T>());
+
+  thrust::device_vector<T> d_a = h_a;
+  thrust::device_vector<T> d_b = h_b;
+
+  thrust::host_vector<T>   h_result(h_a.size() + h_b.size());
+  thrust::device_vector<T> d_result(d_a.size() + d_b.size());
+
+  typename thrust::host_vector<T>::iterator   h_end;
+  typename thrust::device_vector<T>::iterator d_end;
+  
+  h_end = thrust::set_union(h_a.begin(), h_a.end(),
+                            h_b.begin(), h_b.end(),
+                            h_result.begin(),
+                            thrust::greater<T>());
+  h_result.erase(h_end, h_result.end());
+
+  d_end = thrust::set_union(d_a.begin(), d_a.end(),
+                            d_b.begin(), d_b.end(),
+                            d_result.begin(),
+                            thrust::greater<T>());
+  d_result.erase(d_end, d_result.end());
+
+  ASSERT_EQUAL_QUIET(h_result, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestSetUnionKeyValueDescending);
+
diff --git a/thrust/testing/shuffle.cu b/thrust/testing/shuffle.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2d9094b421c120c900e27f2289678bd83ca910da
--- /dev/null
+++ b/thrust/testing/shuffle.cu
@@ -0,0 +1,142 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+#include <unittest/unittest.h>
+#include <map>
+
+template <typename Vector>
+void TestShuffleSimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(data.begin(), data.end());
+  thrust::default_random_engine g(2);
+  thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+  thrust::sort(shuffled.begin(), shuffled.end());
+  // Check all of our data is present
+  // This only tests for strange conditions like duplicated elements
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleSimple);
+
+template <typename Vector>
+void TestShuffleCopySimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(5);
+  thrust::default_random_engine g(2);
+  thrust::shuffle_copy(data.begin(), data.end(), shuffled.begin(), g);
+  g.seed(2);
+  thrust::shuffle(data.begin(), data.end(), g);
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleCopySimple);
+
+template <typename T>
+void TestHostDeviceIdentical(size_t m) {
+  thrust::host_vector<T> host_result(m);
+  thrust::host_vector<T> device_result(m);
+  thrust::sequence(host_result.begin(), host_result.end(), 0llu);
+  thrust::sequence(device_result.begin(), device_result.end(), 0llu);
+
+  thrust::default_random_engine host_g(183);
+  thrust::default_random_engine device_g(183);
+
+  thrust::shuffle(host_result.begin(), host_result.end(), host_g);
+  thrust::shuffle(device_result.begin(), device_result.end(), device_g);
+
+  ASSERT_EQUAL(device_result, host_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestHostDeviceIdentical);
+
+// Individual input keys should be permuted to output locations with uniform
+// probability. Perform chi-squared test with confidence 99.9%.
+template <typename Vector>
+void TestShuffleKeyPosition() {
+  typedef typename Vector::value_type T;
+  size_t m = 20;
+  size_t num_samples = 100;
+  thrust::host_vector<size_t> index_sum(m, 0);
+  thrust::host_vector<T> sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+
+  for (size_t i = 0; i < num_samples; i++) {
+    Vector shuffled(sequence.begin(), sequence.end());
+    thrust::default_random_engine g(i);
+    thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+    thrust::host_vector<T> tmp(shuffled.begin(), shuffled.end());
+
+    for (auto j = 0ull; j < m; j++) {
+      index_sum[tmp[j]] += j;
+    }
+  }
+  double expected_average_position = static_cast<double>(m - 1) / 2;
+  double chi_squared = 0.0;
+  for (auto j = 0ull; j < m; j++) {
+    double average_position = static_cast<double>(index_sum[j]) / num_samples;
+    chi_squared += std::pow(expected_average_position - average_position, 2) /
+                   expected_average_position;
+  }
+  // Tabulated chi-squared critical value for m-1=19 degrees of freedom
+  // and 99.9% confidence
+  double confidence_threshold = 43.82;
+  ASSERT_LESS(chi_squared, confidence_threshold);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleKeyPosition);
+
+struct vector_compare {
+  template <typename VectorT>
+  bool operator()(const VectorT& a, const VectorT& b) const {
+    for (auto i = 0ull; i < a.size(); i++) {
+      if (a[i] < b[i]) return true;
+      if (a[i] > b[i]) return false;
+    }
+    return false;
+  }
+};
+
+// Brute force check permutations are uniformly distributed on small input
+// Uses a chi-squared test indicating 99% confidence the output is uniformly
+// random
+template <typename Vector>
+void TestShuffleUniformPermutation() {
+  typedef typename Vector::value_type T;
+
+  size_t m = 5;
+  size_t num_samples = 1000;
+  size_t total_permutations = 1 * 2 * 3 * 4 * 5;
+  std::map<thrust::host_vector<T>, size_t, vector_compare> permutation_counts;
+  Vector sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+  thrust::default_random_engine g(17);
+  for (auto i = 0ull; i < num_samples; i++) {
+    thrust::shuffle(sequence.begin(), sequence.end(), g);
+    thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+    permutation_counts[tmp]++;
+  }
+
+  ASSERT_EQUAL(permutation_counts.size(), total_permutations);
+
+  double chi_squared = 0.0;
+  double expected_count = static_cast<double>(num_samples) / total_permutations;
+  for (auto kv : permutation_counts) {
+    chi_squared += std::pow(expected_count - kv.second, 2) / expected_count;
+  }
+  // Tabulated chi-squared critical value for 119 degrees of freedom (5! - 1)
+  // and 99% confidence
+  double confidence_threshold = 157.8;
+  ASSERT_LESS(chi_squared, confidence_threshold);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleUniformPermutation);
+#endif
diff --git a/thrust/testing/sort.cu b/thrust/testing/sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e460655c42b0ec62f41d7045cdba595c3f48302d
--- /dev/null
+++ b/thrust/testing/sort.cu
@@ -0,0 +1,136 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename RandomAccessIterator>
+void sort(my_system &system, RandomAccessIterator, RandomAccessIterator)
+{
+  system.validate_dispatch();
+}
+
+void TestSortDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::sort(sys, vec.begin(), vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSortDispatchExplicit);
+
+
+template<typename RandomAccessIterator>
+void sort(my_tag, RandomAccessIterator first, RandomAccessIterator)
+{
+  *first = 13;
+}
+
+void TestSortDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::sort(thrust::retag<my_tag>(vec.begin()),
+               thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSortDispatchImplicit);
+
+template <class Vector>
+void InitializeSimpleKeySortTest(Vector& unsorted_keys, Vector& sorted_keys)
+{
+    unsorted_keys.resize(7);
+    unsorted_keys[0] = 1; 
+    unsorted_keys[1] = 3; 
+    unsorted_keys[2] = 6;
+    unsorted_keys[3] = 5;
+    unsorted_keys[4] = 2;
+    unsorted_keys[5] = 0;
+    unsorted_keys[6] = 4;
+
+    sorted_keys.resize(7); 
+    sorted_keys[0] = 0; 
+    sorted_keys[1] = 1; 
+    sorted_keys[2] = 2;
+    sorted_keys[3] = 3;
+    sorted_keys[4] = 4;
+    sorted_keys[5] = 5;
+    sorted_keys[6] = 6;
+}
+
+template <class Vector>
+void TestSortSimple(void)
+{
+    Vector unsorted_keys;
+    Vector   sorted_keys;
+
+    InitializeSimpleKeySortTest(unsorted_keys, sorted_keys);
+
+    thrust::sort(unsorted_keys.begin(), unsorted_keys.end());
+
+    ASSERT_EQUAL(unsorted_keys, sorted_keys);
+}
+DECLARE_VECTOR_UNITTEST(TestSortSimple);
+
+
+template <typename T>
+void TestSortAscendingKey(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    thrust::sort(h_data.begin(), h_data.end(), thrust::less<T>());
+    thrust::sort(d_data.begin(), d_data.end(), thrust::less<T>());
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestSortAscendingKey);
+
+void TestSortDescendingKey(void)
+{
+    const size_t n = 10027;
+
+    thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
+    thrust::device_vector<int> d_data = h_data;
+
+    thrust::sort(h_data.begin(), h_data.end(), thrust::greater<int>());
+    thrust::sort(d_data.begin(), d_data.end(), thrust::greater<int>());
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_UNITTEST(TestSortDescendingKey);
+
+
+void TestSortBool(void)
+{
+    const size_t n = 10027;
+
+    thrust::host_vector<bool>   h_data = unittest::random_integers<bool>(n);
+    thrust::device_vector<bool> d_data = h_data;
+
+    thrust::sort(h_data.begin(), h_data.end());
+    thrust::sort(d_data.begin(), d_data.end());
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_UNITTEST(TestSortBool);
+
+
+void TestSortBoolDescending(void)
+{
+    const size_t n = 10027;
+
+    thrust::host_vector<bool>   h_data = unittest::random_integers<bool>(n);
+    thrust::device_vector<bool> d_data = h_data;
+
+    thrust::sort(h_data.begin(), h_data.end(), thrust::greater<bool>());
+    thrust::sort(d_data.begin(), d_data.end(), thrust::greater<bool>());
+
+    ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_UNITTEST(TestSortBoolDescending);
+
+
diff --git a/thrust/testing/sort_by_key.cu b/thrust/testing/sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7e8c870ebe91bab9270690651bfc8f541990e692
--- /dev/null
+++ b/thrust/testing/sort_by_key.cu
@@ -0,0 +1,161 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+void sort_by_key(my_system &system, RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2)
+{
+    system.validate_dispatch();
+}
+
+void TestSortByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::sort_by_key(sys, vec.begin(), vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSortByKeyDispatchExplicit);
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+void sort_by_key(my_tag, RandomAccessIterator1 keys_first, RandomAccessIterator1, RandomAccessIterator2)
+{
+    *keys_first = 13;
+}
+
+void TestSortByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::sort_by_key(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSortByKeyDispatchImplicit);
+
+
+template <class Vector>
+void InitializeSimpleKeyValueSortTest(Vector& unsorted_keys, Vector& unsorted_values,
+                                      Vector& sorted_keys,   Vector& sorted_values)
+{
+    unsorted_keys.resize(7);   
+    unsorted_values.resize(7);   
+    unsorted_keys[0] = 1;  unsorted_values[0] = 0;
+    unsorted_keys[1] = 3;  unsorted_values[1] = 1;
+    unsorted_keys[2] = 6;  unsorted_values[2] = 2;
+    unsorted_keys[3] = 5;  unsorted_values[3] = 3;
+    unsorted_keys[4] = 2;  unsorted_values[4] = 4;
+    unsorted_keys[5] = 0;  unsorted_values[5] = 5;
+    unsorted_keys[6] = 4;  unsorted_values[6] = 6;
+    
+    sorted_keys.resize(7);
+    sorted_values.resize(7);
+    sorted_keys[0] = 0;  sorted_values[1] = 0;  
+    sorted_keys[1] = 1;  sorted_values[3] = 1;  
+    sorted_keys[2] = 2;  sorted_values[6] = 2;
+    sorted_keys[3] = 3;  sorted_values[5] = 3;
+    sorted_keys[4] = 4;  sorted_values[2] = 4;
+    sorted_keys[5] = 5;  sorted_values[0] = 5;
+    sorted_keys[6] = 6;  sorted_values[4] = 6;
+}
+
+
+template <class Vector>
+void TestSortByKeySimple(void)
+{
+    Vector unsorted_keys, unsorted_values;
+    Vector   sorted_keys,   sorted_values;
+
+    InitializeSimpleKeyValueSortTest(unsorted_keys, unsorted_values, sorted_keys, sorted_values);
+
+    thrust::sort_by_key(unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin());
+
+    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+    ASSERT_EQUAL(unsorted_values, sorted_values);
+}
+DECLARE_VECTOR_UNITTEST(TestSortByKeySimple);
+
+
+template <typename T>
+void TestSortAscendingKeyValue(const size_t n)
+{
+    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_keys = h_keys;
+    
+    thrust::host_vector<T>   h_values = h_keys;
+    thrust::device_vector<T> d_values = d_keys;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::less<T>());
+    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
+
+    ASSERT_EQUAL(h_keys,   d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+}
+DECLARE_VARIABLE_UNITTEST(TestSortAscendingKeyValue);
+
+
+template <typename T>
+void TestSortDescendingKeyValue(const size_t n)
+{
+    thrust::host_vector<int>   h_keys = unittest::random_integers<int>(n);
+    thrust::device_vector<int> d_keys = h_keys;
+    
+    thrust::host_vector<int>   h_values = h_keys;
+    thrust::device_vector<int> d_values = d_keys;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::greater<int>());
+    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), thrust::greater<int>());
+
+    ASSERT_EQUAL(h_keys,   d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+}
+DECLARE_VARIABLE_UNITTEST(TestSortDescendingKeyValue);
+
+
+void TestSortByKeyBool(void)
+{
+    const size_t n = 10027;
+
+    thrust::host_vector<bool>   h_keys = unittest::random_integers<bool>(n);
+    thrust::host_vector<int>    h_values = unittest::random_integers<int>(n);
+
+    thrust::device_vector<bool> d_keys = h_keys;
+    thrust::device_vector<int>  d_values = h_values;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
+    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
+
+    ASSERT_EQUAL(h_keys, d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+}
+DECLARE_UNITTEST(TestSortByKeyBool);
+
+
+void TestSortByKeyBoolDescending(void)
+{
+    const size_t n = 10027;
+
+    thrust::host_vector<bool>   h_keys = unittest::random_integers<bool>(n);
+    thrust::host_vector<int>    h_values = unittest::random_integers<int>(n);
+
+    thrust::device_vector<bool> d_keys = h_keys;
+    thrust::device_vector<int>  d_values = h_values;
+
+    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::greater<bool>());
+    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), thrust::greater<bool>());
+
+    ASSERT_EQUAL(h_keys, d_keys);
+    ASSERT_EQUAL(h_values, d_values);
+}
+DECLARE_UNITTEST(TestSortByKeyBoolDescending);
+
+
diff --git a/thrust/testing/sort_by_key_variable_bits.cu b/thrust/testing/sort_by_key_variable_bits.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3791608603f52b39704ab491fac15c7cc7277023
--- /dev/null
+++ b/thrust/testing/sort_by_key_variable_bits.cu
@@ -0,0 +1,51 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+
+#include <algorithm>
+
+using namespace unittest;
+
+typedef unittest::type_list<
+#if !(defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 1))
+// XXX GCC 4.1 miscompiles the char sorts with -O2 for some reason
+                            unittest::uint8_t,
+#endif
+                            unittest::uint16_t,
+                            unittest::uint32_t,
+                            unittest::uint64_t> UnsignedIntegerTypes;
+
+
+template <typename T>
+struct TestSortByKeyVariableBits
+{
+  void operator()(const size_t n)
+  {
+    for(size_t num_bits = 0; num_bits < 8 * sizeof(T); num_bits += 3)
+    {
+        thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+   
+        const T mask = (1 << num_bits) - 1;
+        for(size_t i = 0; i < n; i++)
+            h_keys[i] &= mask;
+
+        thrust::host_vector<T>   reference = h_keys;
+        thrust::device_vector<T> d_keys    = h_keys;
+    
+        thrust::host_vector<T>   h_values = h_keys;
+        thrust::device_vector<T> d_values = d_keys;
+
+        std::sort(reference.begin(), reference.end());
+
+        thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
+        thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
+
+        ASSERT_EQUAL(reference, h_keys);
+        ASSERT_EQUAL(reference, h_values);
+
+        ASSERT_EQUAL(h_keys,    d_keys);
+        ASSERT_EQUAL(h_values,  d_values);
+    }
+  }
+};
+VariableUnitTest<TestSortByKeyVariableBits, UnsignedIntegerTypes> TestSortByKeyVariableBitsInstance;
+
diff --git a/thrust/testing/sort_permutation_iterator.cu b/thrust/testing/sort_permutation_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..33d6ac6e143fbf0ae4e5183c1581a22f2f063c5e
--- /dev/null
+++ b/thrust/testing/sort_permutation_iterator.cu
@@ -0,0 +1,216 @@
+#include <unittest/unittest.h>
+
+#include <thrust/sort.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/functional.h>
+
+template <typename Iterator>
+class strided_range
+{
+    public:
+
+    typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
+    {
+        difference_type stride;
+
+        stride_functor(difference_type stride)
+            : stride(stride) {}
+
+        __host__ __device__
+        difference_type operator()(const difference_type& i) const
+        { 
+            return stride * i;
+        }
+    };
+
+    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
+    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+    // type of the strided_range iterator
+    typedef PermutationIterator iterator;
+
+    // construct strided_range for the range [first,last)
+    strided_range(Iterator first, Iterator last, difference_type stride)
+        : first(first), last(last), stride(stride) {}
+   
+    iterator begin(void) const
+    {
+        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
+    }
+
+    iterator end(void) const
+    {
+        return begin() + ((last - first) + (stride - 1)) / stride;
+    }
+    
+    protected:
+    Iterator first;
+    Iterator last;
+    difference_type stride;
+};
+
+template <class Vector>
+void TestSortPermutationIterator(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector A(10);
+  A[0] = 2;
+  A[1] = 9;
+  A[2] = 0;
+  A[3] = 1;
+  A[4] = 5;
+  A[5] = 3;
+  A[6] = 8;
+  A[7] = 6;
+  A[8] = 7;
+  A[9] = 4;
+
+  strided_range<Iterator> S(A.begin(), A.end(), 2);
+
+  thrust::sort(S.begin(), S.end());
+
+  ASSERT_EQUAL(A[0], 0);
+  ASSERT_EQUAL(A[1], 9);
+  ASSERT_EQUAL(A[2], 2);
+  ASSERT_EQUAL(A[3], 1);
+  ASSERT_EQUAL(A[4], 5);
+  ASSERT_EQUAL(A[5], 3);
+  ASSERT_EQUAL(A[6], 7);
+  ASSERT_EQUAL(A[7], 6);
+  ASSERT_EQUAL(A[8], 8);
+  ASSERT_EQUAL(A[9], 4);
+}
+DECLARE_VECTOR_UNITTEST(TestSortPermutationIterator);
+
+template <class Vector>
+void TestStableSortPermutationIterator(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector A(10);
+  A[0] = 2;
+  A[1] = 9;
+  A[2] = 0;
+  A[3] = 1;
+  A[4] = 5;
+  A[5] = 3;
+  A[6] = 8;
+  A[7] = 6;
+  A[8] = 7;
+  A[9] = 4;
+
+  strided_range<Iterator> S(A.begin(), A.end(), 2);
+
+  thrust::stable_sort(S.begin(), S.end());
+
+  ASSERT_EQUAL(A[0], 0);
+  ASSERT_EQUAL(A[1], 9);
+  ASSERT_EQUAL(A[2], 2);
+  ASSERT_EQUAL(A[3], 1);
+  ASSERT_EQUAL(A[4], 5);
+  ASSERT_EQUAL(A[5], 3);
+  ASSERT_EQUAL(A[6], 7);
+  ASSERT_EQUAL(A[7], 6);
+  ASSERT_EQUAL(A[8], 8);
+  ASSERT_EQUAL(A[9], 4);
+}
+DECLARE_VECTOR_UNITTEST(TestStableSortPermutationIterator);
+
+template <class Vector>
+void TestSortByKeyPermutationIterator(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector A(10), B(10);
+  A[0] = 2; B[0] = 0;
+  A[1] = 9; B[1] = 1;
+  A[2] = 0; B[2] = 2;
+  A[3] = 1; B[3] = 3;
+  A[4] = 5; B[4] = 4;
+  A[5] = 3; B[5] = 5;
+  A[6] = 8; B[6] = 6;
+  A[7] = 6; B[7] = 7;
+  A[8] = 7; B[8] = 8;
+  A[9] = 4; B[9] = 9;
+  
+  strided_range<Iterator> S(A.begin(), A.end(), 2);
+  strided_range<Iterator> T(B.begin(), B.end(), 2);
+
+  thrust::sort_by_key(S.begin(), S.end(), T.begin());
+
+  ASSERT_EQUAL(A[0], 0);
+  ASSERT_EQUAL(A[1], 9);
+  ASSERT_EQUAL(A[2], 2);
+  ASSERT_EQUAL(A[3], 1);
+  ASSERT_EQUAL(A[4], 5);
+  ASSERT_EQUAL(A[5], 3);
+  ASSERT_EQUAL(A[6], 7);
+  ASSERT_EQUAL(A[7], 6);
+  ASSERT_EQUAL(A[8], 8);
+  ASSERT_EQUAL(A[9], 4);
+  
+  ASSERT_EQUAL(B[0], 2);
+  ASSERT_EQUAL(B[1], 1);
+  ASSERT_EQUAL(B[2], 0);
+  ASSERT_EQUAL(B[3], 3);
+  ASSERT_EQUAL(B[4], 4);
+  ASSERT_EQUAL(B[5], 5);
+  ASSERT_EQUAL(B[6], 8);
+  ASSERT_EQUAL(B[7], 7);
+  ASSERT_EQUAL(B[8], 6);
+  ASSERT_EQUAL(B[9], 9);
+}
+DECLARE_VECTOR_UNITTEST(TestSortByKeyPermutationIterator);
+
+template <class Vector>
+void TestStableSortByKeyPermutationIterator(void)
+{
+  typedef typename Vector::iterator Iterator;
+
+  Vector A(10), B(10);
+  A[0] = 2; B[0] = 0;
+  A[1] = 9; B[1] = 1;
+  A[2] = 0; B[2] = 2;
+  A[3] = 1; B[3] = 3;
+  A[4] = 5; B[4] = 4;
+  A[5] = 3; B[5] = 5;
+  A[6] = 8; B[6] = 6;
+  A[7] = 6; B[7] = 7;
+  A[8] = 7; B[8] = 8;
+  A[9] = 4; B[9] = 9;
+  
+  strided_range<Iterator> S(A.begin(), A.end(), 2);
+  strided_range<Iterator> T(B.begin(), B.end(), 2);
+
+  thrust::stable_sort_by_key(S.begin(), S.end(), T.begin());
+
+  ASSERT_EQUAL(A[0], 0);
+  ASSERT_EQUAL(A[1], 9);
+  ASSERT_EQUAL(A[2], 2);
+  ASSERT_EQUAL(A[3], 1);
+  ASSERT_EQUAL(A[4], 5);
+  ASSERT_EQUAL(A[5], 3);
+  ASSERT_EQUAL(A[6], 7);
+  ASSERT_EQUAL(A[7], 6);
+  ASSERT_EQUAL(A[8], 8);
+  ASSERT_EQUAL(A[9], 4);
+  
+  ASSERT_EQUAL(B[0], 2);
+  ASSERT_EQUAL(B[1], 1);
+  ASSERT_EQUAL(B[2], 0);
+  ASSERT_EQUAL(B[3], 3);
+  ASSERT_EQUAL(B[4], 4);
+  ASSERT_EQUAL(B[5], 5);
+  ASSERT_EQUAL(B[6], 8);
+  ASSERT_EQUAL(B[7], 7);
+  ASSERT_EQUAL(B[8], 6);
+  ASSERT_EQUAL(B[9], 9);
+}
+DECLARE_VECTOR_UNITTEST(TestStableSortByKeyPermutationIterator);
+
diff --git a/thrust/testing/sort_variable_bits.cu b/thrust/testing/sort_variable_bits.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4192e3da6d3facb99042a5655f1d8b5ad9b8e238
--- /dev/null
+++ b/thrust/testing/sort_variable_bits.cu
@@ -0,0 +1,44 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+
+#include <algorithm>
+
+using namespace unittest;
+
+typedef unittest::type_list<
+#if !(defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 1))
+// XXX GCC 4.1 miscompiles the char sorts with -O2 for some reason
+                            unittest::uint8_t,
+#endif
+                            unittest::uint16_t,
+                            unittest::uint32_t,
+                            unittest::uint64_t> UnsignedIntegerTypes;
+
+template <typename T>
+struct TestSortVariableBits
+{
+  void operator()(const size_t n)
+  {
+    for(size_t num_bits = 0; num_bits < 8 * sizeof(T); num_bits += 3){
+
+        thrust::host_vector<T>  h_keys = unittest::random_integers<T>(n);
+   
+        size_t mask = (1 << num_bits) - 1;
+        for(size_t i = 0; i < n; i++)
+            h_keys[i] &= mask;
+
+        thrust::host_vector<T>   reference = h_keys;
+        thrust::device_vector<T> d_keys    = h_keys;
+    
+        std::sort(reference.begin(), reference.end());
+
+        thrust::sort(h_keys.begin(), h_keys.end());
+        thrust::sort(d_keys.begin(), d_keys.end());
+    
+        ASSERT_EQUAL(reference, h_keys);
+        ASSERT_EQUAL(h_keys, d_keys);
+    }
+  }
+};
+VariableUnitTest<TestSortVariableBits, UnsignedIntegerTypes> TestSortVariableBitsInstance;
+
diff --git a/thrust/testing/stable_sort.cu b/thrust/testing/stable_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c7cdb3e5217a22804e65cffa74bff1b3eaab3fdc
--- /dev/null
+++ b/thrust/testing/stable_sort.cu
@@ -0,0 +1,175 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename RandomAccessIterator>
+void stable_sort(my_system &system, RandomAccessIterator, RandomAccessIterator)
+{
+    system.validate_dispatch();
+}
+
+void TestStableSortDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_sort(sys, vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStableSortDispatchExplicit);
+
+
+template<typename RandomAccessIterator>
+void stable_sort(my_tag, RandomAccessIterator first, RandomAccessIterator)
+{
+    *first = 13;
+}
+
+void TestStableSortDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_sort(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStableSortDispatchImplicit);
+
+
+template <typename T>
+struct less_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
+};
+
+template <class Vector>
+void InitializeSimpleStableKeySortTest(Vector& unsorted_keys, Vector& sorted_keys)
+{
+    unsorted_keys.resize(9);   
+    unsorted_keys[0] = 25; 
+    unsorted_keys[1] = 14; 
+    unsorted_keys[2] = 35; 
+    unsorted_keys[3] = 16; 
+    unsorted_keys[4] = 26; 
+    unsorted_keys[5] = 34; 
+    unsorted_keys[6] = 36; 
+    unsorted_keys[7] = 24; 
+    unsorted_keys[8] = 15; 
+    
+    sorted_keys.resize(9);
+    sorted_keys[0] = 14; 
+    sorted_keys[1] = 16; 
+    sorted_keys[2] = 15; 
+    sorted_keys[3] = 25; 
+    sorted_keys[4] = 26; 
+    sorted_keys[5] = 24; 
+    sorted_keys[6] = 35; 
+    sorted_keys[7] = 34; 
+    sorted_keys[8] = 36; 
+}
+
+
+template <class Vector>
+void TestStableSortSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector unsorted_keys;
+    Vector   sorted_keys;
+
+    InitializeSimpleStableKeySortTest(unsorted_keys, sorted_keys);
+
+    thrust::stable_sort(unsorted_keys.begin(), unsorted_keys.end(), less_div_10<T>());
+
+    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortSimple);
+
+
+template <typename T>
+struct TestStableSort
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::stable_sort(h_data.begin(), h_data.end(), less_div_10<T>());
+        thrust::stable_sort(d_data.begin(), d_data.end(), less_div_10<T>());
+
+        ASSERT_EQUAL(h_data, d_data);
+    }
+};
+VariableUnitTest<TestStableSort, SignedIntegralTypes> TestStableSortInstance;
+
+
+template <typename T>
+struct TestStableSortSemantics
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::stable_sort(h_data.begin(), h_data.end(), less_div_10<T>());
+        thrust::stable_sort(d_data.begin(), d_data.end(), less_div_10<T>());
+
+        ASSERT_EQUAL(h_data, d_data);
+    }
+};
+VariableUnitTest<TestStableSortSemantics, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestStableSortSemanticsInstance;
+
+
+template <typename T>
+struct comp_mod3
+{
+    T * table;
+
+    comp_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    bool operator()(T a, T b)
+    {
+        return table[(int) a] < table[(int) b];
+    }
+};
+
+template <typename Vector>
+void TestStableSortWithIndirection(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector data(7);
+    data[0] = 1;
+    data[1] = 3;
+    data[2] = 5;
+    data[3] = 3;
+    data[4] = 0;
+    data[5] = 2;
+    data[6] = 1;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::stable_sort(data.begin(), data.end(), comp_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], T(3));
+    ASSERT_EQUAL(data[1], T(3));
+    ASSERT_EQUAL(data[2], T(0));
+    ASSERT_EQUAL(data[3], T(1));
+    ASSERT_EQUAL(data[4], T(1));
+    ASSERT_EQUAL(data[5], T(5));
+    ASSERT_EQUAL(data[6], T(2));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortWithIndirection);
+
diff --git a/thrust/testing/stable_sort_by_key.cu b/thrust/testing/stable_sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e3736542d38ec7500a6e671756b9bd71c2c7ada5
--- /dev/null
+++ b/thrust/testing/stable_sort_by_key.cu
@@ -0,0 +1,138 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2>
+void stable_sort_by_key(my_system &system, RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2)
+{
+    system.validate_dispatch();
+}
+
+void TestStableSortByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::stable_sort_by_key(sys, vec.begin(), vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestStableSortByKeyDispatchExplicit);
+
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2>
+void stable_sort_by_key(my_tag, RandomAccessIterator1 keys_first, RandomAccessIterator1, RandomAccessIterator2)
+{
+    *keys_first = 13;
+}
+
+void TestStableSortByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::stable_sort_by_key(thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestStableSortByKeyDispatchImplicit);
+
+template <typename T>
+struct less_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
+};
+
+
+template <class Vector>
+void InitializeSimpleStableKeyValueSortTest(Vector& unsorted_keys, Vector& unsorted_values,
+                                            Vector& sorted_keys,   Vector& sorted_values)
+{
+    unsorted_keys.resize(9);   
+    unsorted_values.resize(9);   
+    unsorted_keys[0] = 25;   unsorted_values[0] = 0;   
+    unsorted_keys[1] = 14;   unsorted_values[1] = 1; 
+    unsorted_keys[2] = 35;   unsorted_values[2] = 2; 
+    unsorted_keys[3] = 16;   unsorted_values[3] = 3; 
+    unsorted_keys[4] = 26;   unsorted_values[4] = 4; 
+    unsorted_keys[5] = 34;   unsorted_values[5] = 5; 
+    unsorted_keys[6] = 36;   unsorted_values[6] = 6; 
+    unsorted_keys[7] = 24;   unsorted_values[7] = 7; 
+    unsorted_keys[8] = 15;   unsorted_values[8] = 8; 
+    
+    sorted_keys.resize(9);
+    sorted_values.resize(9);
+    sorted_keys[0] = 14;   sorted_values[0] = 1;    
+    sorted_keys[1] = 16;   sorted_values[1] = 3; 
+    sorted_keys[2] = 15;   sorted_values[2] = 8; 
+    sorted_keys[3] = 25;   sorted_values[3] = 0; 
+    sorted_keys[4] = 26;   sorted_values[4] = 4; 
+    sorted_keys[5] = 24;   sorted_values[5] = 7; 
+    sorted_keys[6] = 35;   sorted_values[6] = 2; 
+    sorted_keys[7] = 34;   sorted_values[7] = 5; 
+    sorted_keys[8] = 36;   sorted_values[8] = 6; 
+}
+
+
+template <class Vector>
+void TestStableSortByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector unsorted_keys, unsorted_values;
+    Vector   sorted_keys,   sorted_values;
+
+    InitializeSimpleStableKeyValueSortTest(unsorted_keys, unsorted_values, sorted_keys, sorted_values);
+
+    thrust::stable_sort_by_key(unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin(), less_div_10<T>());
+
+    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+    ASSERT_EQUAL(unsorted_values, sorted_values);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortByKeySimple);
+
+
+template <typename T>
+struct TestStableSortByKey
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_keys = h_keys;
+
+        thrust::host_vector<T>   h_values = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_values = h_values;
+
+        thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
+        thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
+
+        ASSERT_EQUAL(h_keys,   d_keys);
+        ASSERT_EQUAL(h_values, d_values);
+    }
+};
+VariableUnitTest<TestStableSortByKey, SignedIntegralTypes> TestStableSortByKeyInstance;
+
+
+template <typename T>
+struct TestStableSortByKeySemantics
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_keys = h_keys;
+
+        thrust::host_vector<T>   h_values = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_values = h_values;
+
+        thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), less_div_10<T>());
+        thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), less_div_10<T>());
+
+        ASSERT_EQUAL(h_keys,   d_keys);
+        ASSERT_EQUAL(h_values, d_values);
+    }
+};
+VariableUnitTest<TestStableSortByKeySemantics, unittest::type_list<unittest::uint8_t,unittest::uint16_t,unittest::uint32_t> > TestStableSortByKeySemanticsInstance;
+
diff --git a/thrust/testing/stable_sort_by_key_large.cu b/thrust/testing/stable_sort_by_key_large.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fc69de64c2113cffd75c7fcb2050641ad8f911cb
--- /dev/null
+++ b/thrust/testing/stable_sort_by_key_large.cu
@@ -0,0 +1,155 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+
+template <typename T>
+struct less_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
+};
+
+template <typename T>
+struct greater_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 > ((int) rhs) / 10;}
+};
+
+
+template <typename T, unsigned int N>
+void _TestStableSortByKeyWithLargeKeys(void)
+{
+    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_keys(n);
+    thrust::host_vector<   unsigned int   > h_vals(n);
+
+    for(size_t i = 0; i < n; i++)
+    {
+        h_keys[i] = FixedVector<T,N>(rand());
+        h_vals[i] = i;
+    }
+
+    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
+    thrust::device_vector<   unsigned int   > d_vals = h_vals;
+    
+    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+    ASSERT_EQUAL_QUIET(h_keys, d_keys);
+    ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeys(void)
+{
+    _TestStableSortByKeyWithLargeKeys<int,    4>();
+    _TestStableSortByKeyWithLargeKeys<int,    8>();
+    _TestStableSortByKeyWithLargeKeys<int,   16>();
+
+// XXX these take too long to compile
+//    _TestStableSortByKeyWithLargeKeys<int,   32>();
+//    _TestStableSortByKeyWithLargeKeys<int,   64>();
+//    _TestStableSortByKeyWithLargeKeys<int,  128>();
+//    _TestStableSortByKeyWithLargeKeys<int,  256>();
+//    _TestStableSortByKeyWithLargeKeys<int,  512>();
+//    _TestStableSortByKeyWithLargeKeys<int, 1024>();
+//    _TestStableSortByKeyWithLargeKeys<int, 2048>();
+//    _TestStableSortByKeyWithLargeKeys<int, 4096>();
+//    _TestStableSortByKeyWithLargeKeys<int, 8192>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys);
+
+
+template <typename T, unsigned int N>
+void _TestStableSortByKeyWithLargeValues(void)
+{
+    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector<   unsigned int   > h_keys(n);
+    thrust::host_vector< FixedVector<T,N> > h_vals(n);
+
+    for(size_t i = 0; i < n; i++)
+    {
+        h_keys[i] = rand();
+        h_vals[i] = FixedVector<T,N>(i);
+    }
+
+    thrust::device_vector<   unsigned int   > d_keys = h_keys;
+    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
+    
+    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+    ASSERT_EQUAL_QUIET(h_keys, d_keys);
+    ASSERT_EQUAL_QUIET(h_vals, d_vals);
+
+    // so cuda::stable_merge_sort_by_key() is called
+    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), greater_div_10<unsigned int>());
+    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), greater_div_10<unsigned int>());
+
+    ASSERT_EQUAL_QUIET(h_keys, d_keys);
+    ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeValues(void)
+{
+    _TestStableSortByKeyWithLargeValues<int,    4>();
+    _TestStableSortByKeyWithLargeValues<int,    8>();
+    _TestStableSortByKeyWithLargeValues<int,   16>();
+    
+// XXX these take too long to compile
+//    _TestStableSortByKeyWithLargeValues<int,   32>();
+//    _TestStableSortByKeyWithLargeValues<int,   64>();
+//    _TestStableSortByKeyWithLargeValues<int,  128>();
+//    _TestStableSortByKeyWithLargeValues<int,  256>();
+//    _TestStableSortByKeyWithLargeValues<int,  512>();
+//    _TestStableSortByKeyWithLargeValues<int, 1024>();
+//    _TestStableSortByKeyWithLargeValues<int, 2048>();
+//    _TestStableSortByKeyWithLargeValues<int, 4096>();
+//    _TestStableSortByKeyWithLargeValues<int, 8192>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues);
+
+
+template <typename T, unsigned int N>
+void _TestStableSortByKeyWithLargeKeysAndValues(void)
+{
+    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_keys(n);
+    thrust::host_vector< FixedVector<T,N> > h_vals(n);
+
+    for(size_t i = 0; i < n; i++)
+    {
+        h_keys[i] = FixedVector<T,N>(rand());
+        h_vals[i] = FixedVector<T,N>(i);
+    }
+
+    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
+    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
+    
+    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+    ASSERT_EQUAL_QUIET(h_keys, d_keys);
+    ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeysAndValues(void)
+{
+    _TestStableSortByKeyWithLargeKeysAndValues<int,    4>();
+    _TestStableSortByKeyWithLargeKeysAndValues<int,    8>();
+    _TestStableSortByKeyWithLargeKeysAndValues<int,   16>();
+
+// XXX these take too long to compile
+//    _TestStableSortByKeyWithLargeKeysAndValues<int,   32>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int,   64>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int,  128>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int,  256>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int,  512>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int, 1024>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int, 2048>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int, 4096>();
+//    _TestStableSortByKeyWithLargeKeysAndValues<int, 8192>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues);
+
diff --git a/thrust/testing/stable_sort_large.cu b/thrust/testing/stable_sort_large.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6b6b78b88f3a37ab1a99ae8020ac0bb3e66e1605
--- /dev/null
+++ b/thrust/testing/stable_sort_large.cu
@@ -0,0 +1,45 @@
+#include <unittest/unittest.h>
+#include <thrust/sort.h>
+#include <thrust/functional.h>
+
+
+template <typename T, unsigned int N>
+void _TestStableSortWithLargeKeys(void)
+{
+    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
+
+    thrust::host_vector< FixedVector<T,N> > h_keys(n);
+
+    for(size_t i = 0; i < n; i++)
+        // XXX Use proper random number generation facility.
+        h_keys[i] = FixedVector<T,N>(rand());
+
+    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
+    
+    thrust::stable_sort(h_keys.begin(), h_keys.end());
+    thrust::stable_sort(d_keys.begin(), d_keys.end());
+
+    ASSERT_EQUAL_QUIET(h_keys, d_keys);
+}
+
+void TestStableSortWithLargeKeys(void)
+{
+    _TestStableSortWithLargeKeys<int,    1>();
+    _TestStableSortWithLargeKeys<int,    2>();
+    _TestStableSortWithLargeKeys<int,    4>();
+    _TestStableSortWithLargeKeys<int,    8>();
+    _TestStableSortWithLargeKeys<int,   16>();
+    _TestStableSortWithLargeKeys<int,   32>();
+    _TestStableSortWithLargeKeys<int,   64>();
+    _TestStableSortWithLargeKeys<int,  128>();
+    _TestStableSortWithLargeKeys<int,  256>();
+
+// XXX these take too long to compile
+//    _TestStableSortWithLargeKeys<int,  512>();
+//    _TestStableSortWithLargeKeys<int, 1024>();
+//    _TestStableSortWithLargeKeys<int, 2048>();
+//    _TestStableSortWithLargeKeys<int, 4096>();
+//    _TestStableSortWithLargeKeys<int, 8192>();
+}
+DECLARE_UNITTEST(TestStableSortWithLargeKeys);
+
diff --git a/thrust/testing/swap_ranges.cu b/thrust/testing/swap_ranges.cu
new file mode 100644
index 0000000000000000000000000000000000000000..843c6624076db25775e9525065ffc217e3fb0c26
--- /dev/null
+++ b/thrust/testing/swap_ranges.cu
@@ -0,0 +1,222 @@
+#include <unittest/unittest.h>
+#include <thrust/swap.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/system/cpp/memory.h>
+
+
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+ForwardIterator2 swap_ranges(my_system &system,
+                             ForwardIterator1,
+                             ForwardIterator1,
+                             ForwardIterator2 first2)
+{
+    system.validate_dispatch();
+    return first2;
+}
+
+void TestSwapRangesDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::swap_ranges(sys, vec.begin(), vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestSwapRangesDispatchExplicit);
+
+
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+ForwardIterator2 swap_ranges(my_tag,
+                             ForwardIterator1,
+                             ForwardIterator1,
+                             ForwardIterator2 first2)
+{
+    *first2 = 13;
+    return first2;
+}
+
+void TestSwapRangesDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::swap_ranges(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestSwapRangesDispatchImplicit);
+
+
+template <class Vector>
+void TestSwapRangesSimple(void)
+{
+    Vector v1(5);
+    v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+
+    Vector v2(5);
+    v2[0] = 5; v2[1] = 6; v2[2] = 7; v2[3] = 8; v2[4] = 9;
+
+    thrust::swap_ranges(v1.begin(), v1.end(), v2.begin());
+
+    ASSERT_EQUAL(v1[0], 5);
+    ASSERT_EQUAL(v1[1], 6);
+    ASSERT_EQUAL(v1[2], 7);
+    ASSERT_EQUAL(v1[3], 8);
+    ASSERT_EQUAL(v1[4], 9);
+
+    ASSERT_EQUAL(v2[0], 0);
+    ASSERT_EQUAL(v2[1], 1);
+    ASSERT_EQUAL(v2[2], 2);
+    ASSERT_EQUAL(v2[3], 3);
+    ASSERT_EQUAL(v2[4], 4);
+}
+DECLARE_VECTOR_UNITTEST(TestSwapRangesSimple);
+
+
+template <typename T>
+void TestSwapRanges(const size_t n)
+{
+    thrust::host_vector<T> a1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T> a2 = unittest::random_integers<T>(n);
+
+    thrust::host_vector<T>    h1 = a1;
+    thrust::host_vector<T>    h2 = a2;
+    thrust::device_vector<T>  d1 = a1;
+    thrust::device_vector<T>  d2 = a2;
+
+    thrust::swap_ranges(h1.begin(), h1.end(), h2.begin());
+    thrust::swap_ranges(d1.begin(), d1.end(), d2.begin());
+
+    ASSERT_EQUAL(h1, a2);
+    ASSERT_EQUAL(d1, a2);
+    ASSERT_EQUAL(h2, a1);
+    ASSERT_EQUAL(d2, a1);
+}
+DECLARE_VARIABLE_UNITTEST(TestSwapRanges);
+
+#if (THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP)
+void TestSwapRangesForcedIterator(void)
+{
+  thrust::device_vector<int> A(3, 0);
+  thrust::device_vector<int> B(3, 1);
+
+  thrust::swap_ranges(thrust::retag<thrust::cpp::tag>(A.begin()),
+                      thrust::retag<thrust::cpp::tag>(A.end()),
+                      thrust::retag<thrust::cpp::tag>(B.begin()));
+
+  ASSERT_EQUAL(A[0], 1);
+  ASSERT_EQUAL(A[1], 1);
+  ASSERT_EQUAL(A[2], 1);
+  ASSERT_EQUAL(B[0], 0);
+  ASSERT_EQUAL(B[1], 0);
+  ASSERT_EQUAL(B[2], 0);
+}
+DECLARE_UNITTEST(TestSwapRangesForcedIterator);
+#endif
+
+struct type_with_swap
+{
+  inline __host__ __device__
+  type_with_swap()
+    : m_x(), m_swapped(false)
+  {}
+
+  inline __host__ __device__
+  type_with_swap(int x)
+    : m_x(x), m_swapped(false)
+  {}
+
+  inline __host__ __device__
+  type_with_swap(int x, bool s)
+    : m_x(x), m_swapped(s)
+  {}
+
+  inline __host__ __device__
+  type_with_swap(const type_with_swap &other)
+    : m_x(other.m_x), m_swapped(other.m_swapped)
+  {}
+
+  inline __host__ __device__
+  bool operator==(const type_with_swap &other) const
+  {
+    return m_x == other.m_x && m_swapped == other.m_swapped;
+  }
+
+#if THRUST_CPP_DIALECT >= 2011
+  type_with_swap & operator=(const type_with_swap &) = default;
+#endif
+
+  int m_x;
+  bool m_swapped;
+};
+
+inline __host__ __device__
+void swap(type_with_swap &a, type_with_swap &b)
+{
+  thrust::swap(a.m_x, b.m_x);
+  a.m_swapped = true;
+  b.m_swapped = true;
+}
+
+void TestSwapRangesUserSwap(void)
+{
+  thrust::host_vector<type_with_swap> h_A(3, type_with_swap(0));
+  thrust::host_vector<type_with_swap> h_B(3, type_with_swap(1));
+
+  thrust::device_vector<type_with_swap> d_A = h_A;
+  thrust::device_vector<type_with_swap> d_B = h_B;
+
+  // check that nothing is yet swapped
+  type_with_swap ref = type_with_swap(0, false);
+
+  ASSERT_EQUAL_QUIET(ref, h_A[0]);
+  ASSERT_EQUAL_QUIET(ref, h_A[1]);
+  ASSERT_EQUAL_QUIET(ref, h_A[2]);
+
+  ASSERT_EQUAL_QUIET(ref, d_A[0]);
+  ASSERT_EQUAL_QUIET(ref, d_A[1]);
+  ASSERT_EQUAL_QUIET(ref, d_A[2]);
+
+  ref = type_with_swap(1, false);
+
+  ASSERT_EQUAL_QUIET(ref, h_B[0]);
+  ASSERT_EQUAL_QUIET(ref, h_B[1]);
+  ASSERT_EQUAL_QUIET(ref, h_B[2]);
+
+  ASSERT_EQUAL_QUIET(ref, d_B[0]);
+  ASSERT_EQUAL_QUIET(ref, d_B[1]);
+  ASSERT_EQUAL_QUIET(ref, d_B[2]);
+
+  // swap the ranges
+
+  thrust::swap_ranges(h_A.begin(), h_A.end(), h_B.begin());
+  thrust::swap_ranges(d_A.begin(), d_A.end(), d_B.begin());
+
+  // check that things were swapped
+  ref = type_with_swap(1, true);
+
+  ASSERT_EQUAL_QUIET(ref, h_A[0]);
+  ASSERT_EQUAL_QUIET(ref, h_A[1]);
+  ASSERT_EQUAL_QUIET(ref, h_A[2]);
+
+  ASSERT_EQUAL_QUIET(ref, d_A[0]);
+  ASSERT_EQUAL_QUIET(ref, d_A[1]);
+  ASSERT_EQUAL_QUIET(ref, d_A[2]);
+
+  ref = type_with_swap(0, true);
+
+  ASSERT_EQUAL_QUIET(ref, h_B[0]);
+  ASSERT_EQUAL_QUIET(ref, h_B[1]);
+  ASSERT_EQUAL_QUIET(ref, h_B[2]);
+
+  ASSERT_EQUAL_QUIET(ref, d_B[0]);
+  ASSERT_EQUAL_QUIET(ref, d_B[1]);
+  ASSERT_EQUAL_QUIET(ref, d_B[2]);
+}
+DECLARE_UNITTEST(TestSwapRangesUserSwap);
+
diff --git a/thrust/testing/tabulate.cu b/thrust/testing/tabulate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..25c6e40acb0c0d2c6c6cdb694399f7085b0f2444
--- /dev/null
+++ b/thrust/testing/tabulate.cu
@@ -0,0 +1,111 @@
+#include <unittest/unittest.h>
+#include <thrust/tabulate.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename ForwardIterator, typename UnaryOperation>
+void tabulate(my_system &system, ForwardIterator, ForwardIterator, UnaryOperation)
+{
+  system.validate_dispatch();
+}
+
+void TestTabulateDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::tabulate(sys, vec.begin(), vec.end(), thrust::identity<int>());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTabulateDispatchExplicit);
+
+
+template<typename ForwardIterator, typename UnaryOperation>
+void tabulate(my_tag, ForwardIterator first, ForwardIterator, UnaryOperation)
+{
+  *first = 13;
+}
+
+void TestTabulateDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::tabulate(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.end()),
+                   thrust::identity<int>());
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTabulateDispatchImplicit);
+
+
+template <class Vector>
+void TestTabulateSimple(void)
+{
+  using namespace thrust::placeholders;
+  typedef typename Vector::value_type T;
+  
+  Vector v(5);
+
+  thrust::tabulate(v.begin(), v.end(), thrust::identity<T>());
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 2);
+  ASSERT_EQUAL(v[3], 3);
+  ASSERT_EQUAL(v[4], 4);
+
+  thrust::tabulate(v.begin(), v.end(), -_1);
+
+  ASSERT_EQUAL(v[0],  0);
+  ASSERT_EQUAL(v[1], -1);
+  ASSERT_EQUAL(v[2], -2);
+  ASSERT_EQUAL(v[3], -3);
+  ASSERT_EQUAL(v[4], -4);
+  
+  thrust::tabulate(v.begin(), v.end(), _1 * _1 * _1);
+
+  ASSERT_EQUAL(v[0], 0);
+  ASSERT_EQUAL(v[1], 1);
+  ASSERT_EQUAL(v[2], 8);
+  ASSERT_EQUAL(v[3], 27);
+  ASSERT_EQUAL(v[4], 64);
+}
+DECLARE_VECTOR_UNITTEST(TestTabulateSimple);
+
+
+template <typename T>
+void TestTabulate(size_t n)
+{
+  using namespace thrust::placeholders;
+
+  thrust::host_vector<T>   h_data(n);
+  thrust::device_vector<T> d_data(n);
+
+  thrust::tabulate(h_data.begin(), h_data.end(), _1 * _1 + 13);
+  thrust::tabulate(d_data.begin(), d_data.end(), _1 * _1 + 13);
+
+  ASSERT_EQUAL(h_data, d_data);
+
+  thrust::tabulate(h_data.begin(), h_data.end(), (_1 - 7) * _1);
+  thrust::tabulate(d_data.begin(), d_data.end(), (_1 - 7) * _1);
+
+  ASSERT_EQUAL(h_data, d_data);
+}
+DECLARE_VARIABLE_UNITTEST(TestTabulate);
+
+
+template <typename T>
+void TestTabulateToDiscardIterator(size_t n)
+{
+  thrust::tabulate(thrust::discard_iterator<thrust::device_system_tag>(),
+                   thrust::discard_iterator<thrust::device_system_tag>(n),
+                   thrust::identity<int>());
+
+  // nothing to check -- just make sure it compiles
+}
+DECLARE_VARIABLE_UNITTEST(TestTabulateToDiscardIterator);
+
diff --git a/thrust/testing/transform.cu b/thrust/testing/transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7e3c3e60fdf330424f3c9909b60933cf8976b8a4
--- /dev/null
+++ b/thrust/testing/transform.cu
@@ -0,0 +1,863 @@
+#include <unittest/unittest.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/tuple.h>
+#include <thrust/pair.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template <class Vector>
+void TestTransformUnarySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input(3);
+    Vector output(3);
+    Vector result(3);
+    input[0]  =  1; input[1]  = -2; input[2]  =  3;
+    result[0] = -1; result[1] =  2; result[2] = -3;
+
+    iter = thrust::transform(input.begin(), input.end(), output.begin(), thrust::negate<T>());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(output, result);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformUnarySimple);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+OutputIterator transform(my_system &system, InputIterator, InputIterator, OutputIterator result, UnaryFunction)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformUnaryDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform(sys,
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin(),
+                      0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformUnaryDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+OutputIterator transform(my_tag, InputIterator, InputIterator, OutputIterator result, UnaryFunction)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformUnaryDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformUnaryDispatchImplicit);
+
+
+template <class Vector>
+void TestTransformIfUnaryNoStencilSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input(3);
+    Vector output(3);
+    Vector result(3);
+
+    input[0]   =  0; input[1]   = -2; input[2]   =  0;
+    output[0]  = -1; output[1]  = -2; output[2]  = -3;
+    result[0]  = -1; result[1]  =  2; result[2]  = -3;
+
+    iter = thrust::transform_if(input.begin(), input.end(),
+                                output.begin(),
+                                thrust::negate<T>(),
+                                thrust::identity<T>());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(output, result);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformIfUnaryNoStencilSimple);
+
+
+template<typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_system &system,
+                             InputIterator,
+                             InputIterator,
+                             ForwardIterator result,
+                             UnaryFunction,
+                             Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformIfUnaryNoStencilDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_if(sys,
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformIfUnaryNoStencilDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_tag,
+                             InputIterator,
+                             InputIterator,
+                             ForwardIterator result,
+                             UnaryFunction,
+                             Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformIfUnaryNoStencilDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_if(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformIfUnaryNoStencilDispatchImplicit);
+
+
+template <class Vector>
+void TestTransformIfUnarySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input(3);
+    Vector stencil(3);
+    Vector output(3);
+    Vector result(3);
+
+    input[0]   =  1; input[1]   = -2; input[2]   =  3;
+    output[0]  =  1; output[1]  =  2; output[2]  =  3;
+    stencil[0] =  1; stencil[1] =  0; stencil[2] =  1;
+    result[0]  = -1; result[1]  =  2; result[2]  = -3;
+
+    iter = thrust::transform_if(input.begin(), input.end(),
+                                stencil.begin(),
+                                output.begin(),
+                                thrust::negate<T>(),
+                                thrust::identity<T>());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(output, result);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformIfUnarySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_system &system,
+                             InputIterator1,
+                             InputIterator1,
+                             ForwardIterator result,
+                             UnaryFunction,
+                             Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformIfUnaryDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_if(sys,
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformIfUnaryDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_tag,
+                             InputIterator1,
+                             InputIterator1,
+                             ForwardIterator result,
+                             UnaryFunction,
+                             Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformIfUnaryDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_if(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformIfUnaryDispatchImplicit);
+
+
+template <class Vector>
+void TestTransformBinarySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input1(3);
+    Vector input2(3);
+    Vector output(3);
+    Vector result(3);
+    input1[0] =  1; input1[1] = -2; input1[2] =  3;
+    input2[0] = -4; input2[1] =  5; input2[2] =  6;
+    result[0] =  5; result[1] = -7; result[2] = -3;
+
+    iter = thrust::transform(input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
+    ASSERT_EQUAL(output, result);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformBinarySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename UnaryFunction>
+OutputIterator transform(my_system &system, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, UnaryFunction)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformBinaryDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform(sys,
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin(),
+                      vec.begin(),
+                      0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformBinaryDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename UnaryFunction>
+OutputIterator transform(my_tag, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, UnaryFunction)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformBinaryDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform(thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      thrust::retag<my_tag>(vec.begin()),
+                      0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformBinaryDispatchImplicit);
+
+
+
+
+template <class Vector>
+void TestTransformIfBinarySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input1(3);
+    Vector input2(3);
+    Vector stencil(3);
+    Vector output(3);
+    Vector result(3);
+
+    input1[0]  =  1; input1[1]  = -2; input1[2]  =  3;
+    input2[0]  = -4; input2[1]  =  5; input2[2]  =  6;
+    stencil[0] =  0; stencil[1] =  1; stencil[2] =  0;
+    output[0]  =  1; output[1]  =  2; output[2]  =  3;
+    result[0]  =  5; result[1]  =  2; result[2]  = -3;
+
+    thrust::identity<T> identity;
+
+    iter = thrust::transform_if(input1.begin(), input1.end(),
+                                input2.begin(),
+                                stencil.begin(),
+                                output.begin(),
+                                thrust::minus<T>(),
+                                thrust::not1(identity));
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
+    ASSERT_EQUAL(output, result);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformIfBinarySimple);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_system &system,
+                             InputIterator1,
+                             InputIterator1,
+                             InputIterator2,
+                             InputIterator3,
+                             ForwardIterator result,
+                             BinaryFunction,
+                             Predicate)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformIfBinaryDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_if(sys,
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         vec.begin(),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformIfBinaryDispatchExplicit);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+ForwardIterator transform_if(my_tag,
+                             InputIterator1,
+                             InputIterator1,
+                             InputIterator2,
+                             InputIterator3,
+                             ForwardIterator result,
+                             BinaryFunction,
+                             Predicate)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformIfBinaryDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_if(thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         thrust::retag<my_tag>(vec.begin()),
+                         0,
+                         0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformIfBinaryDispatchImplicit);
+
+
+template <typename T>
+void TestTransformUnary(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::transform(h_input.begin(), h_input.end(), h_output.begin(), thrust::negate<T>());
+    thrust::transform(d_input.begin(), d_input.end(), d_output.begin(), thrust::negate<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformUnary);
+
+
+template <typename T>
+void TestTransformUnaryToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::discard_iterator<> h_result =
+      thrust::transform(h_input.begin(), h_input.end(), thrust::make_discard_iterator(), thrust::negate<T>());
+
+    thrust::discard_iterator<> d_result =
+      thrust::transform(d_input.begin(), d_input.end(), thrust::make_discard_iterator(), thrust::negate<T>());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformUnaryToDiscardIterator);
+
+
+struct repeat2
+{
+  template<typename T>
+  __host__ __device__
+  thrust::pair<T,T> operator()(T x)
+  {
+    return thrust::make_pair(x,x);
+  }
+};
+
+
+template<typename T>
+void TestTransformUnaryToDiscardIteratorZipped(const size_t n)
+{
+    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input = h_input;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    typedef typename thrust::host_vector<T>::iterator Iterator1;
+    typedef typename thrust::device_vector<T>::iterator Iterator2;
+
+    typedef thrust::tuple<Iterator1,thrust::discard_iterator<> > Tuple1;
+    typedef thrust::tuple<Iterator2,thrust::discard_iterator<> > Tuple2;
+
+    typedef thrust::zip_iterator<Tuple1> ZipIterator1;
+    typedef thrust::zip_iterator<Tuple2> ZipIterator2;
+
+    ZipIterator1 z1(thrust::make_tuple(h_output.begin(), thrust::make_discard_iterator()));
+    ZipIterator2 z2(thrust::make_tuple(d_output.begin(), thrust::make_discard_iterator()));
+
+    ZipIterator1 h_result =
+      thrust::transform(h_input.begin(), h_input.end(), z1, repeat2());
+
+    ZipIterator2 d_result =
+      thrust::transform(d_input.begin(), d_input.end(), z2, repeat2());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL(h_output, d_output);
+
+    ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple()));
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformUnaryToDiscardIteratorZipped);
+
+struct is_positive
+{
+  template<typename T>
+  __host__ __device__
+  bool operator()(T &x)
+  {
+    return x > 0;
+  } // end operator()()
+}; // end is_positive
+
+
+template <typename T>
+void TestTransformIfUnaryNoStencil(const size_t n)
+{
+    thrust::host_vector<T>   h_input   = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_output  = unittest::random_integers<T>(n);
+
+    thrust::device_vector<T> d_input   = h_input;
+    thrust::device_vector<T> d_output  = h_output;
+
+    thrust::transform_if(h_input.begin(), h_input.end(),
+                         h_output.begin(),
+                         thrust::negate<T>(), is_positive());
+
+    thrust::transform_if(d_input.begin(), d_input.end(),
+                         d_output.begin(),
+                         thrust::negate<T>(), is_positive());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformIfUnaryNoStencil);
+
+
+template <typename T>
+void TestTransformIfUnary(const size_t n)
+{
+    thrust::host_vector<T>   h_input   = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_output  = unittest::random_integers<T>(n);
+
+    thrust::device_vector<T> d_input   = h_input;
+    thrust::device_vector<T> d_stencil = h_stencil;
+    thrust::device_vector<T> d_output  = h_output;
+
+    thrust::transform_if(h_input.begin(), h_input.end(),
+                          h_stencil.begin(),
+                          h_output.begin(),
+                          thrust::negate<T>(), is_positive());
+
+    thrust::transform_if(d_input.begin(), d_input.end(),
+                          d_stencil.begin(),
+                          d_output.begin(),
+                          thrust::negate<T>(), is_positive());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformIfUnary);
+
+
+template <typename T>
+void TestTransformIfUnaryToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_input   = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+
+    thrust::device_vector<T> d_input   = h_input;
+    thrust::device_vector<T> d_stencil = h_stencil;
+
+    thrust::discard_iterator<> h_result =
+      thrust::transform_if(h_input.begin(), h_input.end(),
+                           h_stencil.begin(),
+                           thrust::make_discard_iterator(),
+                           thrust::negate<T>(), is_positive());
+
+    thrust::discard_iterator<> d_result =
+      thrust::transform_if(d_input.begin(), d_input.end(),
+                           d_stencil.begin(),
+                           thrust::make_discard_iterator(),
+                           thrust::negate<T>(), is_positive());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformIfUnaryToDiscardIterator);
+
+
+template <typename T>
+void TestTransformBinary(const size_t n)
+{
+    thrust::host_vector<T>   h_input1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_input2 = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input1 = h_input1;
+    thrust::device_vector<T> d_input2 = h_input2;
+
+    thrust::host_vector<T>   h_output(n);
+    thrust::device_vector<T> d_output(n);
+
+    thrust::transform(h_input1.begin(), h_input1.end(), h_input2.begin(), h_output.begin(), thrust::minus<T>());
+    thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), d_output.begin(), thrust::minus<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+
+    thrust::transform(h_input1.begin(), h_input1.end(), h_input2.begin(), h_output.begin(), thrust::multiplies<T>());
+    thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), d_output.begin(), thrust::multiplies<T>());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformBinary);
+
+
+template <typename T>
+void TestTransformBinaryToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_input1 = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_input2 = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_input1 = h_input1;
+    thrust::device_vector<T> d_input2 = h_input2;
+
+    thrust::discard_iterator<> h_result =
+      thrust::transform(h_input1.begin(), h_input1.end(), h_input2.begin(), thrust::make_discard_iterator(), thrust::minus<T>());
+    thrust::discard_iterator<> d_result =
+      thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), thrust::make_discard_iterator(), thrust::minus<T>());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformBinaryToDiscardIterator);
+
+
+template <typename T>
+void TestTransformIfBinary(const size_t n)
+{
+    thrust::host_vector<T>   h_input1  = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_input2  = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_output  = unittest::random_integers<T>(n);
+
+    thrust::device_vector<T> d_input1  = h_input1;
+    thrust::device_vector<T> d_input2  = h_input2;
+    thrust::device_vector<T> d_stencil = h_stencil;
+    thrust::device_vector<T> d_output  = h_output;
+
+    thrust::transform_if(h_input1.begin(), h_input1.end(),
+                         h_input2.begin(),
+                         h_stencil.begin(),
+                         h_output.begin(),
+                         thrust::minus<T>(), is_positive());
+
+    thrust::transform_if(d_input1.begin(), d_input1.end(),
+                         d_input2.begin(),
+                         d_stencil.begin(),
+                         d_output.begin(),
+                         thrust::minus<T>(), is_positive());
+
+    ASSERT_EQUAL(h_output, d_output);
+
+    h_stencil = unittest::random_integers<T>(n);
+    d_stencil = h_stencil;
+
+    thrust::transform_if(h_input1.begin(), h_input1.end(),
+                         h_input2.begin(),
+                         h_stencil.begin(),
+                         h_output.begin(),
+                         thrust::multiplies<T>(), is_positive());
+
+    thrust::transform_if(d_input1.begin(), d_input1.end(),
+                         d_input2.begin(),
+                         d_stencil.begin(),
+                         d_output.begin(),
+                         thrust::multiplies<T>(), is_positive());
+
+    ASSERT_EQUAL(h_output, d_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformIfBinary);
+
+
+template <typename T>
+void TestTransformIfBinaryToDiscardIterator(const size_t n)
+{
+    thrust::host_vector<T>   h_input1  = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_input2  = unittest::random_integers<T>(n);
+    thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
+
+    thrust::device_vector<T> d_input1  = h_input1;
+    thrust::device_vector<T> d_input2  = h_input2;
+    thrust::device_vector<T> d_stencil = h_stencil;
+
+    thrust::discard_iterator<> h_result =
+      thrust::transform_if(h_input1.begin(), h_input1.end(),
+                           h_input2.begin(),
+                           h_stencil.begin(),
+                           thrust::make_discard_iterator(),
+                           thrust::minus<T>(), is_positive());
+
+    thrust::discard_iterator<> d_result =
+      thrust::transform_if(d_input1.begin(), d_input1.end(),
+                           d_input2.begin(),
+                           d_stencil.begin(),
+                           thrust::make_discard_iterator(),
+                           thrust::minus<T>(), is_positive());
+
+    thrust::discard_iterator<> reference(n);
+
+    ASSERT_EQUAL_QUIET(reference, h_result);
+    ASSERT_EQUAL_QUIET(reference, d_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformIfBinaryToDiscardIterator);
+
+
+#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400) || defined(__INTEL_COMPILER)
+template <typename T>
+void TestTransformUnaryCountingIterator()
+{
+    // G++ 4.4.x has a known failure with auto-vectorization (due to -O3 or
+    // -ftree-vectorize) of this test.
+    // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43251
+
+    // ICPC has a known failure with auto-vectorization (due to -O2 or
+    // higher) of this test.
+    // See nvbug 200326708.
+    KNOWN_FAILURE;
+}
+#else
+template <typename T>
+void TestTransformUnaryCountingIterator()
+{
+    size_t const n = 15 * sizeof(T);
+
+    ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+    thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
+    thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    thrust::transform(h_first, h_first + n, h_result.begin(), thrust::identity<T>());
+    thrust::transform(d_first, d_first + n, d_result.begin(), thrust::identity<T>());
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+#endif
+DECLARE_GENERIC_UNITTEST(TestTransformUnaryCountingIterator);
+
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400
+template <typename T>
+void TestTransformBinaryCountingIterator()
+{
+    // GCC 4.4.x has a known failure with auto-vectorization (due to -O3 or -ftree-vectorize) of this test
+    // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43251
+
+    KNOWN_FAILURE;
+}
+#else
+template <typename T>
+void TestTransformBinaryCountingIterator()
+{
+    size_t const n = 15 * sizeof(T);
+
+    ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+    thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
+    thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
+
+    thrust::host_vector<T>   h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    thrust::transform(h_first, h_first + n, h_first, h_result.begin(), thrust::plus<T>());
+    thrust::transform(d_first, d_first + n, d_first, d_result.begin(), thrust::plus<T>());
+
+    ASSERT_EQUAL(h_result, d_result);
+}
+#endif
+DECLARE_GENERIC_UNITTEST(TestTransformBinaryCountingIterator);
+
+
+template <typename T>
+struct plus_mod3
+{
+    T * table;
+
+    plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    T operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+template <typename Vector>
+void TestTransformWithIndirection(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector input1(7);
+    Vector input2(7);
+    Vector output(7, 0);
+    input1[0] = 0;  input2[0] = 2;
+    input1[1] = 1;  input2[1] = 2;
+    input1[2] = 2;  input2[2] = 2;
+    input1[3] = 1;  input2[3] = 0;
+    input1[4] = 2;  input2[4] = 2;
+    input1[5] = 0;  input2[5] = 1;
+    input1[6] = 1;  input2[6] = 0;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::transform(input1.begin(), input1.end(),
+                      input2.begin(),
+                      output.begin(),
+                      plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+
+    ASSERT_EQUAL(output[0], T(2));
+    ASSERT_EQUAL(output[1], T(0));
+    ASSERT_EQUAL(output[2], T(1));
+    ASSERT_EQUAL(output[3], T(1));
+    ASSERT_EQUAL(output[4], T(1));
+    ASSERT_EQUAL(output[5], T(1));
+    ASSERT_EQUAL(output[6], T(1));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformWithIndirection);
+
diff --git a/thrust/testing/transform_input_output_iterator.cu b/thrust/testing/transform_input_output_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7df1630777940f1da39b7bf5f8940cf0394aa803
--- /dev/null
+++ b/thrust/testing/transform_input_output_iterator.cu
@@ -0,0 +1,122 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/counting_iterator.h>
+
+template <class Vector>
+void TestTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector squared(4);
+    Vector negated(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+        transform_iter(squared.begin(), InputFunction(), OutputFunction());
+
+    // transform_iter writes squared value
+    thrust::copy(input.begin(), input.end(), transform_iter);
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+    // negated value read from transform_iter
+    thrust::copy_n(transform_iter, squared.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -4;
+    gold_negated[2] = -9;
+    gold_negated[3] = -16;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformInputOutputIterator);
+
+template <class Vector>
+void TestMakeTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+
+    Vector input(4);
+    Vector negated(4);
+    Vector squared(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+
+    // negated value read from transform iterator
+    thrust::copy_n(thrust::make_transform_input_output_iterator(input.begin(), InputFunction(), OutputFunction()),
+                   input.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -2;
+    gold_negated[2] = -3;
+    gold_negated[3] = -4;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+    // squared value writen by transform iterator
+    thrust::copy(negated.begin(), negated.end(),
+                 thrust::make_transform_input_output_iterator(squared.begin(), InputFunction(), OutputFunction()));
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformInputOutputIterator);
+
+template <typename T>
+struct TestTransformInputOutputIteratorScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        // run on host (uses forward iterator negate)
+        thrust::inclusive_scan(thrust::make_transform_input_output_iterator(h_data.begin(), thrust::negate<T>(), thrust::identity<T>()),
+                               thrust::make_transform_input_output_iterator(h_data.end(),   thrust::negate<T>(), thrust::identity<T>()),
+                               h_result.begin());
+        // run on device (uses reverse iterator negate)
+        thrust::inclusive_scan(d_data.begin(), d_data.end(),
+                               thrust::make_transform_input_output_iterator(
+                                   d_result.begin(), thrust::square<T>(), thrust::negate<T>()));
+
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformInputOutputIteratorScan, IntegralTypes> TestTransformInputOutputIteratorScanInstance;
+
diff --git a/thrust/testing/transform_iterator.cu b/thrust/testing/transform_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e28e333e13ad429b76ce79477268cfd7b25898ef
--- /dev/null
+++ b/thrust/testing/transform_iterator.cu
@@ -0,0 +1,86 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/counting_iterator.h>
+
+template <class Vector>
+void TestTransformIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> UnaryFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_iterator<UnaryFunction, Iterator> iter(input.begin(), UnaryFunction());
+
+    thrust::copy(iter, iter + 4, output.begin());
+
+    ASSERT_EQUAL(output[0], -1);
+    ASSERT_EQUAL(output[1], -2);
+    ASSERT_EQUAL(output[2], -3);
+    ASSERT_EQUAL(output[3], -4);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformIterator);
+
+template <class Vector>
+void TestMakeTransformIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> UnaryFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_iterator<UnaryFunction, Iterator> iter(input.begin(), UnaryFunction());
+
+    thrust::copy(thrust::make_transform_iterator(input.begin(), UnaryFunction()), 
+                 thrust::make_transform_iterator(input.end(), UnaryFunction()), 
+                 output.begin());
+
+    ASSERT_EQUAL(output[0], -1);
+    ASSERT_EQUAL(output[1], -2);
+    ASSERT_EQUAL(output[2], -3);
+    ASSERT_EQUAL(output[3], -4);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformIterator);
+
+template <typename T>
+struct TestTransformIteratorReduce
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        // run on host
+        T h_result = thrust::reduce( thrust::make_transform_iterator(h_data.begin(), thrust::negate<T>()),
+                                     thrust::make_transform_iterator(h_data.end(),   thrust::negate<T>()) );
+
+        // run on device
+        T d_result = thrust::reduce( thrust::make_transform_iterator(d_data.begin(), thrust::negate<T>()),
+                                     thrust::make_transform_iterator(d_data.end(),   thrust::negate<T>()) );
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformIteratorReduce, IntegralTypes> TestTransformIteratorReduceInstance;
+
diff --git a/thrust/testing/transform_output_iterator.cu b/thrust/testing/transform_output_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cdeb950f1dbdd8d071a6d1061543f70bad61b1e9
--- /dev/null
+++ b/thrust/testing/transform_output_iterator.cu
@@ -0,0 +1,92 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/transform_output_iterator.h>
+
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/counting_iterator.h>
+
+template <class Vector>
+void TestTransformOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> UnaryFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_output_iterator<UnaryFunction, Iterator> output_iter(output.begin(), UnaryFunction());
+
+    thrust::copy(input.begin(), input.end(), output_iter);
+
+    Vector gold_output(4);
+    gold_output[0] = -1;
+    gold_output[1] = -2;
+    gold_output[2] = -3;
+    gold_output[3] = -4;
+
+    ASSERT_EQUAL(output, gold_output);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformOutputIterator);
+
+template <class Vector>
+void TestMakeTransformOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> UnaryFunction;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    thrust::copy(input.begin(), input.end(),
+                 thrust::make_transform_output_iterator(output.begin(), UnaryFunction()));
+
+    Vector gold_output(4);
+    gold_output[0] = -1;
+    gold_output[1] = -2;
+    gold_output[2] = -3;
+    gold_output[3] = -4;
+
+    ASSERT_EQUAL(output, gold_output);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformOutputIterator);
+
+template <typename T>
+struct TestTransformOutputIteratorScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        // run on host
+        thrust::inclusive_scan(thrust::make_transform_iterator(h_data.begin(), thrust::negate<T>()),
+                               thrust::make_transform_iterator(h_data.end(),   thrust::negate<T>()),
+                               h_result.begin());
+        // run on device
+        thrust::inclusive_scan(d_data.begin(), d_data.end(),
+                               thrust::make_transform_output_iterator(
+                                   d_result.begin(), thrust::negate<T>()));
+
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformOutputIteratorScan, IntegralTypes> TestTransformOutputIteratorScanInstance;
+
diff --git a/thrust/testing/transform_reduce.cu b/thrust/testing/transform_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3ff3159d6cf2dea3773307d1782d991a0eb604bf
--- /dev/null
+++ b/thrust/testing/transform_reduce.cu
@@ -0,0 +1,128 @@
+#include <unittest/unittest.h>
+#include <thrust/transform_reduce.h>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+OutputType transform_reduce(my_system &system,
+                            InputIterator,
+                            InputIterator,
+                            UnaryFunction,
+                            OutputType init,
+                            BinaryFunction)
+{
+    system.validate_dispatch();
+    return init;
+}
+
+void TestTransformReduceDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_reduce(sys,
+                             vec.begin(),
+                             vec.begin(),
+                             0,
+                             0,
+                             0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformReduceDispatchExplicit);
+
+template<typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+OutputType transform_reduce(my_tag,
+                            InputIterator first,
+                            InputIterator,
+                            UnaryFunction,
+                            OutputType init,
+                            BinaryFunction)
+{
+    *first = 13;
+    return init;
+}
+
+void TestTransformReduceDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_reduce(thrust::retag<my_tag>(vec.begin()),
+                             thrust::retag<my_tag>(vec.begin()),
+                             0,
+                             0,
+                             0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformReduceDispatchImplicit);
+
+
+template <class Vector>
+void TestTransformReduceSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(3);
+    data[0] = 1; data[1] = -2; data[2] = 3;
+
+    T init = 10;
+    T result = thrust::transform_reduce(data.begin(), data.end(), thrust::negate<T>(), init, thrust::plus<T>());
+
+    ASSERT_EQUAL(result, 8);
+}
+DECLARE_VECTOR_UNITTEST(TestTransformReduceSimple);
+
+template <typename T>
+void TestTransformReduce(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    T init = 13;
+
+    T cpu_result = thrust::transform_reduce(h_data.begin(), h_data.end(), thrust::negate<T>(), init, thrust::plus<T>());
+    T gpu_result = thrust::transform_reduce(d_data.begin(), d_data.end(), thrust::negate<T>(), init, thrust::plus<T>());
+
+    ASSERT_ALMOST_EQUAL(cpu_result, gpu_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformReduce);
+
+template <typename T>
+void TestTransformReduceFromConst(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    T init = 13;
+
+    T cpu_result = thrust::transform_reduce(h_data.cbegin(), h_data.cend(), thrust::negate<T>(), init, thrust::plus<T>());
+    T gpu_result = thrust::transform_reduce(d_data.cbegin(), d_data.cend(), thrust::negate<T>(), init, thrust::plus<T>());
+
+    ASSERT_ALMOST_EQUAL(cpu_result, gpu_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestTransformReduceFromConst);
+
+template <class Vector>
+void TestTransformReduceCountingIterator(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename thrust::iterator_system<typename Vector::iterator>::type space;
+
+    thrust::counting_iterator<T, space> first(1);
+
+    T result = thrust::transform_reduce(first, first + 3, thrust::negate<short>(), 0, thrust::plus<short>());
+
+    ASSERT_EQUAL(result, -6);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformReduceCountingIterator);
+
diff --git a/thrust/testing/transform_scan.cu b/thrust/testing/transform_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e339f7e66dc054b49c514e3ddfdd92b9e9a7f276
--- /dev/null
+++ b/thrust/testing/transform_scan.cu
@@ -0,0 +1,349 @@
+#include <unittest/unittest.h>
+#include <thrust/transform_scan.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename AssociativeOperator>
+OutputIterator transform_inclusive_scan(my_system &system,
+                                        InputIterator,
+                                        InputIterator,
+                                        OutputIterator result,
+                                        UnaryFunction,
+                                        AssociativeOperator)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformInclusiveScanDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_inclusive_scan(sys,
+                                     vec.begin(),
+                                     vec.begin(),
+                                     vec.begin(),
+                                     0,
+                                     0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformInclusiveScanDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename AssociativeOperator>
+OutputIterator transform_inclusive_scan(my_tag,
+                                        InputIterator,
+                                        InputIterator,
+                                        OutputIterator result,
+                                        UnaryFunction,
+                                        AssociativeOperator)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformInclusiveScanDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_inclusive_scan(thrust::retag<my_tag>(vec.begin()),
+                                     thrust::retag<my_tag>(vec.begin()),
+                                     thrust::retag<my_tag>(vec.begin()),
+                                     0,
+                                     0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformInclusiveScanDispatchImplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+OutputIterator transform_exclusive_scan(my_system &system,
+                                        InputIterator,
+                                        InputIterator,
+                                        OutputIterator result,
+                                        UnaryFunction,
+                                        T,
+                                        AssociativeOperator)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestTransformExclusiveScanDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::transform_exclusive_scan(sys,
+                                     vec.begin(),
+                                     vec.begin(),
+                                     vec.begin(),
+                                     0,
+                                     0,
+                                     0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestTransformExclusiveScanDispatchExplicit);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+OutputIterator transform_exclusive_scan(my_tag,
+                                        InputIterator,
+                                        InputIterator,
+                                        OutputIterator result,
+                                        UnaryFunction,
+                                        T,
+                                        AssociativeOperator)
+{
+    *result = 13;
+    return result;
+}
+
+void TestTransformExclusiveScanDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::transform_exclusive_scan(thrust::retag<my_tag>(vec.begin()),
+                                     thrust::retag<my_tag>(vec.begin()),
+                                     thrust::retag<my_tag>(vec.begin()),
+                                     0,
+                                     0,
+                                     0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestTransformExclusiveScanDispatchImplicit);
+
+
+template <class Vector>
+void TestTransformScanSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    typename Vector::iterator iter;
+
+    Vector input(5);
+    Vector result(5);
+    Vector output(5);
+
+    input[0] = 1; input[1] = 3; input[2] = -2; input[3] = 4; input[4] = -5;
+
+    Vector input_copy(input);
+
+    // inclusive scan
+    iter = thrust::transform_inclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), thrust::plus<T>());
+    result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // exclusive scan with 0 init
+    iter = thrust::transform_exclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), 0, thrust::plus<T>());
+    result[0] = 0; result[1] = -1; result[2] = -4; result[3] = -2; result[4] = -6;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // exclusive scan with nonzero init
+    iter = thrust::transform_exclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
+    result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
+    ASSERT_EQUAL(input,  input_copy);
+    ASSERT_EQUAL(output, result);
+    
+    // inplace inclusive scan
+    input = input_copy;
+    iter = thrust::transform_inclusive_scan(input.begin(), input.end(), input.begin(), thrust::negate<T>(), thrust::plus<T>());
+    result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+    ASSERT_EQUAL(input, result);
+
+    // inplace exclusive scan with init
+    input = input_copy;
+    iter = thrust::transform_exclusive_scan(input.begin(), input.end(), input.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
+    result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
+    ASSERT_EQUAL(input, result);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanSimple);
+
+struct Record {
+    int number;
+
+    bool operator==(const Record& rhs) const {
+        return number == rhs.number;
+    }
+    bool operator!=(const Record& rhs) const {
+        return !(rhs == *this);
+    }
+    friend Record operator+(Record lhs, const Record& rhs) {
+        lhs.number += rhs.number;
+        return lhs;
+    }
+    friend std::ostream& operator<<(std::ostream& os, const Record& record) {
+        os << "number: " << record.number;
+        return os;
+    }
+};
+
+struct negate {
+    __host__ __device__ int operator()(Record const& record) const
+    {
+        return - record.number;
+    }
+};
+
+void TestTransformInclusiveScanDifferentTypes()
+{
+    typename thrust::host_vector<int>::iterator h_iter;
+
+    thrust::host_vector<Record> h_input(5);
+    thrust::host_vector<int> h_output(5);
+    thrust::host_vector<int> result(5);
+
+    h_input[0] = {1}; h_input[1] = {3}; h_input[2] = {-2}; h_input[3] = {4}; h_input[4] = {-5};
+
+    thrust::host_vector<Record> input_copy(h_input);
+
+    h_iter = thrust::transform_inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), negate{}, thrust::plus<int>{});
+    result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+    ASSERT_EQUAL(std::size_t(h_iter - h_output.begin()), h_input.size());
+    ASSERT_EQUAL(h_input, input_copy);
+    ASSERT_EQUAL(h_output, result);
+
+    typename thrust::device_vector<int>::iterator d_iter;
+
+    thrust::device_vector<Record> d_input = h_input;
+    thrust::device_vector<int> d_output(5);
+
+    d_iter = thrust::transform_inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), negate{}, thrust::plus<int>{});
+    ASSERT_EQUAL(std::size_t(d_iter - d_output.begin()), d_input.size());
+    ASSERT_EQUAL(d_input, input_copy);
+    ASSERT_EQUAL(d_output, result);
+}
+DECLARE_UNITTEST(TestTransformInclusiveScanDifferentTypes);
+
+template <typename T>
+struct TestTransformScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_input = h_input;
+
+        thrust::host_vector<T>   h_output(n);
+        thrust::device_vector<T> d_output(n);
+        
+        thrust::transform_inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), thrust::negate<T>(), thrust::plus<T>());
+        thrust::transform_inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), thrust::negate<T>(), thrust::plus<T>());
+        ASSERT_EQUAL(d_output, h_output);
+        
+        thrust::transform_exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), thrust::negate<T>(), (T) 11, thrust::plus<T>());
+        thrust::transform_exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), thrust::negate<T>(), (T) 11, thrust::plus<T>());
+        ASSERT_EQUAL(d_output, h_output);
+        
+        // in-place scans
+        h_output = h_input;
+        d_output = d_input;
+        thrust::transform_inclusive_scan(h_output.begin(), h_output.end(), h_output.begin(), thrust::negate<T>(), thrust::plus<T>());
+        thrust::transform_inclusive_scan(d_output.begin(), d_output.end(), d_output.begin(), thrust::negate<T>(), thrust::plus<T>());
+        ASSERT_EQUAL(d_output, h_output);
+        
+        h_output = h_input;
+        d_output = d_input;
+        thrust::transform_exclusive_scan(h_output.begin(), h_output.end(), h_output.begin(), thrust::negate<T>(), (T) 11, thrust::plus<T>());
+        thrust::transform_exclusive_scan(d_output.begin(), d_output.end(), d_output.begin(), thrust::negate<T>(), (T) 11, thrust::plus<T>());
+        ASSERT_EQUAL(d_output, h_output);
+    }
+};
+VariableUnitTest<TestTransformScan, IntegralTypes> TestTransformScanInstance;
+
+template <class Vector>
+void TestTransformScanCountingIterator(void)
+{
+    typedef typename Vector::value_type T;
+    typedef typename thrust::iterator_system<typename Vector::iterator>::type space;
+
+    thrust::counting_iterator<T, space> first(1);
+
+    Vector result(3);
+
+    thrust::transform_inclusive_scan(first, first + 3, result.begin(), thrust::negate<T>(), thrust::plus<T>());
+
+    ASSERT_EQUAL(result[0], -1);
+    ASSERT_EQUAL(result[1], -3);
+    ASSERT_EQUAL(result[2], -6);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanCountingIterator);
+
+template <typename T>
+struct TestTransformScanToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
+        thrust::device_vector<T> d_input = h_input;
+
+        thrust::discard_iterator<> reference(n);
+        
+        thrust::discard_iterator<> h_result =
+          thrust::transform_inclusive_scan(h_input.begin(),
+                                           h_input.end(),
+                                           thrust::make_discard_iterator(),
+                                           thrust::negate<T>(),
+                                           thrust::plus<T>());
+
+        thrust::discard_iterator<> d_result =
+          thrust::transform_inclusive_scan(d_input.begin(),
+                                           d_input.end(),
+                                           thrust::make_discard_iterator(),
+                                           thrust::negate<T>(),
+                                           thrust::plus<T>());
+        ASSERT_EQUAL_QUIET(reference, h_result);
+        ASSERT_EQUAL_QUIET(reference, d_result);
+        
+        h_result =
+          thrust::transform_exclusive_scan(h_input.begin(),
+                                           h_input.end(),
+                                           thrust::make_discard_iterator(),
+                                           thrust::negate<T>(),
+                                           (T) 11,
+                                           thrust::plus<T>());
+
+        d_result =
+          thrust::transform_exclusive_scan(d_input.begin(),
+                                           d_input.end(),
+                                           thrust::make_discard_iterator(),
+                                           thrust::negate<T>(),
+                                           (T) 11,
+                                           thrust::plus<T>());
+
+        ASSERT_EQUAL_QUIET(reference, h_result);
+        ASSERT_EQUAL_QUIET(reference, d_result);
+    }
+};
+VariableUnitTest<TestTransformScanToDiscardIterator, IntegralTypes> TestTransformScanToDiscardIteratorInstance;
+
diff --git a/thrust/testing/trivial_sequence.cu b/thrust/testing/trivial_sequence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6dee8e5efb581154ed953546e56e2232477741e1
--- /dev/null
+++ b/thrust/testing/trivial_sequence.cu
@@ -0,0 +1,50 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/trivial_sequence.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <thrust/iterator/zip_iterator.h> 
+
+template <typename Iterator>
+void test(Iterator first, Iterator last)
+{
+    typedef typename thrust::iterator_system<Iterator>::type System;
+    System system;
+    thrust::detail::trivial_sequence<Iterator,System> ts(system, first, last);
+    typedef typename thrust::iterator_traits<Iterator>::value_type ValueType;
+    
+    ASSERT_EQUAL_QUIET((ValueType) ts.begin()[0], ValueType(0, 11)); 
+    ASSERT_EQUAL_QUIET((ValueType) ts.begin()[1], ValueType(2, 11)); 
+    ASSERT_EQUAL_QUIET((ValueType) ts.begin()[2], ValueType(1, 13)); 
+    ASSERT_EQUAL_QUIET((ValueType) ts.begin()[3], ValueType(0, 10)); 
+    ASSERT_EQUAL_QUIET((ValueType) ts.begin()[4], ValueType(1, 12)); 
+
+    ts.begin()[0] = ValueType(0,0);
+    ts.begin()[1] = ValueType(0,0);
+    ts.begin()[2] = ValueType(0,0);
+    ts.begin()[3] = ValueType(0,0);
+    ts.begin()[4] = ValueType(0,0);
+
+    typedef typename thrust::detail::trivial_sequence<Iterator,System>::iterator_type TrivialIterator;
+
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<Iterator>::value,        false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<TrivialIterator>::value,  true);
+}
+
+template <class Vector>
+void TestTrivialSequence(void)
+{
+    Vector A(5);  A[0] =  0;  A[1] =  2;  A[2] =  1;  A[3] =  0;  A[4] =  1;  
+    Vector B(5);  B[0] = 11;  B[1] = 11;  B[2] = 13;  B[3] = 10;  B[4] = 12;
+
+    test(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())),
+         thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end())));
+
+    // ensure that values weren't modified
+    ASSERT_EQUAL(A[0], 0);  ASSERT_EQUAL(B[0], 11); 
+    ASSERT_EQUAL(A[1], 2);  ASSERT_EQUAL(B[1], 11); 
+    ASSERT_EQUAL(A[2], 1);  ASSERT_EQUAL(B[2], 13); 
+    ASSERT_EQUAL(A[3], 0);  ASSERT_EQUAL(B[3], 10); 
+    ASSERT_EQUAL(A[4], 1);  ASSERT_EQUAL(B[4], 12); 
+}
+DECLARE_VECTOR_UNITTEST(TestTrivialSequence);
+
diff --git a/thrust/testing/tuple.cu b/thrust/testing/tuple.cu
new file mode 100644
index 0000000000000000000000000000000000000000..40dccbd2231cd4c93ded01d5e9f8c55b3cd2fb9c
--- /dev/null
+++ b/thrust/testing/tuple.cu
@@ -0,0 +1,494 @@
+#include <unittest/unittest.h>
+#include <thrust/tuple.h>
+#include <thrust/generate.h>
+#include <thrust/swap.h>
+
+using namespace unittest;
+
+template <typename T>
+struct TestTupleConstructor
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+    host_vector<T> data = random_integers<T>(10);
+
+    tuple<T> t1(data[0]);
+    ASSERT_EQUAL(data[0], get<0>(t1));
+
+    tuple<T,T> t2(data[0], data[1]);
+    ASSERT_EQUAL(data[0], get<0>(t2));
+    ASSERT_EQUAL(data[1], get<1>(t2));
+
+    tuple<T,T,T> t3(data[0], data[1], data[2]);
+    ASSERT_EQUAL(data[0], get<0>(t3));
+    ASSERT_EQUAL(data[1], get<1>(t3));
+    ASSERT_EQUAL(data[2], get<2>(t3));
+
+    tuple<T,T,T,T> t4(data[0], data[1], data[2], data[3]);
+    ASSERT_EQUAL(data[0], get<0>(t4));
+    ASSERT_EQUAL(data[1], get<1>(t4));
+    ASSERT_EQUAL(data[2], get<2>(t4));
+    ASSERT_EQUAL(data[3], get<3>(t4));
+
+    tuple<T,T,T,T,T> t5(data[0], data[1], data[2], data[3], data[4]);
+    ASSERT_EQUAL(data[0], get<0>(t5));
+    ASSERT_EQUAL(data[1], get<1>(t5));
+    ASSERT_EQUAL(data[2], get<2>(t5));
+    ASSERT_EQUAL(data[3], get<3>(t5));
+    ASSERT_EQUAL(data[4], get<4>(t5));
+
+    tuple<T,T,T,T,T,T> t6(data[0], data[1], data[2], data[3], data[4], data[5]);
+    ASSERT_EQUAL(data[0], get<0>(t6));
+    ASSERT_EQUAL(data[1], get<1>(t6));
+    ASSERT_EQUAL(data[2], get<2>(t6));
+    ASSERT_EQUAL(data[3], get<3>(t6));
+    ASSERT_EQUAL(data[4], get<4>(t6));
+    ASSERT_EQUAL(data[5], get<5>(t6));
+
+    tuple<T,T,T,T,T,T,T> t7(data[0], data[1], data[2], data[3], data[4], data[5], data[6]);
+    ASSERT_EQUAL(data[0], get<0>(t7));
+    ASSERT_EQUAL(data[1], get<1>(t7));
+    ASSERT_EQUAL(data[2], get<2>(t7));
+    ASSERT_EQUAL(data[3], get<3>(t7));
+    ASSERT_EQUAL(data[4], get<4>(t7));
+    ASSERT_EQUAL(data[5], get<5>(t7));
+    ASSERT_EQUAL(data[6], get<6>(t7));
+
+    tuple<T,T,T,T,T,T,T,T> t8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
+    ASSERT_EQUAL(data[0], get<0>(t8));
+    ASSERT_EQUAL(data[1], get<1>(t8));
+    ASSERT_EQUAL(data[2], get<2>(t8));
+    ASSERT_EQUAL(data[3], get<3>(t8));
+    ASSERT_EQUAL(data[4], get<4>(t8));
+    ASSERT_EQUAL(data[5], get<5>(t8));
+    ASSERT_EQUAL(data[6], get<6>(t8));
+    ASSERT_EQUAL(data[7], get<7>(t8));
+
+    tuple<T,T,T,T,T,T,T,T,T> t9(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]);
+    ASSERT_EQUAL(data[0], get<0>(t9));
+    ASSERT_EQUAL(data[1], get<1>(t9));
+    ASSERT_EQUAL(data[2], get<2>(t9));
+    ASSERT_EQUAL(data[3], get<3>(t9));
+    ASSERT_EQUAL(data[4], get<4>(t9));
+    ASSERT_EQUAL(data[5], get<5>(t9));
+    ASSERT_EQUAL(data[6], get<6>(t9));
+    ASSERT_EQUAL(data[7], get<7>(t9));
+    ASSERT_EQUAL(data[8], get<8>(t9));
+
+    tuple<T,T,T,T,T,T,T,T,T,T> t10(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]);
+    ASSERT_EQUAL(data[0], get<0>(t10));
+    ASSERT_EQUAL(data[1], get<1>(t10));
+    ASSERT_EQUAL(data[2], get<2>(t10));
+    ASSERT_EQUAL(data[3], get<3>(t10));
+    ASSERT_EQUAL(data[4], get<4>(t10));
+    ASSERT_EQUAL(data[5], get<5>(t10));
+    ASSERT_EQUAL(data[6], get<6>(t10));
+    ASSERT_EQUAL(data[7], get<7>(t10));
+    ASSERT_EQUAL(data[8], get<8>(t10));
+    ASSERT_EQUAL(data[9], get<9>(t10));
+  }
+};
+SimpleUnitTest<TestTupleConstructor, BuiltinNumericTypes> TestTupleConstructorInstance;
+
+template <typename T>
+struct TestMakeTuple
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+    host_vector<T> data = random_integers<T>(10);
+
+    tuple<T> t1 = make_tuple(data[0]);
+    ASSERT_EQUAL(data[0], get<0>(t1));
+
+    tuple<T,T> t2 = make_tuple(data[0], data[1]);
+    ASSERT_EQUAL(data[0], get<0>(t2));
+    ASSERT_EQUAL(data[1], get<1>(t2));
+
+    tuple<T,T,T> t3 = make_tuple(data[0], data[1], data[2]);
+    ASSERT_EQUAL(data[0], get<0>(t3));
+    ASSERT_EQUAL(data[1], get<1>(t3));
+    ASSERT_EQUAL(data[2], get<2>(t3));
+
+    tuple<T,T,T,T> t4 = make_tuple(data[0], data[1], data[2], data[3]);
+    ASSERT_EQUAL(data[0], get<0>(t4));
+    ASSERT_EQUAL(data[1], get<1>(t4));
+    ASSERT_EQUAL(data[2], get<2>(t4));
+    ASSERT_EQUAL(data[3], get<3>(t4));
+
+    tuple<T,T,T,T,T> t5 = make_tuple(data[0], data[1], data[2], data[3], data[4]);
+    ASSERT_EQUAL(data[0], get<0>(t5));
+    ASSERT_EQUAL(data[1], get<1>(t5));
+    ASSERT_EQUAL(data[2], get<2>(t5));
+    ASSERT_EQUAL(data[3], get<3>(t5));
+    ASSERT_EQUAL(data[4], get<4>(t5));
+
+    tuple<T,T,T,T,T,T> t6 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5]);
+    ASSERT_EQUAL(data[0], get<0>(t6));
+    ASSERT_EQUAL(data[1], get<1>(t6));
+    ASSERT_EQUAL(data[2], get<2>(t6));
+    ASSERT_EQUAL(data[3], get<3>(t6));
+    ASSERT_EQUAL(data[4], get<4>(t6));
+    ASSERT_EQUAL(data[5], get<5>(t6));
+
+    tuple<T,T,T,T,T,T,T> t7 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6]);
+    ASSERT_EQUAL(data[0], get<0>(t7));
+    ASSERT_EQUAL(data[1], get<1>(t7));
+    ASSERT_EQUAL(data[2], get<2>(t7));
+    ASSERT_EQUAL(data[3], get<3>(t7));
+    ASSERT_EQUAL(data[4], get<4>(t7));
+    ASSERT_EQUAL(data[5], get<5>(t7));
+    ASSERT_EQUAL(data[6], get<6>(t7));
+
+    tuple<T,T,T,T,T,T,T,T> t8 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
+    ASSERT_EQUAL(data[0], get<0>(t8));
+    ASSERT_EQUAL(data[1], get<1>(t8));
+    ASSERT_EQUAL(data[2], get<2>(t8));
+    ASSERT_EQUAL(data[3], get<3>(t8));
+    ASSERT_EQUAL(data[4], get<4>(t8));
+    ASSERT_EQUAL(data[5], get<5>(t8));
+    ASSERT_EQUAL(data[6], get<6>(t8));
+    ASSERT_EQUAL(data[7], get<7>(t8));
+
+    tuple<T,T,T,T,T,T,T,T,T> t9 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]);
+    ASSERT_EQUAL(data[0], get<0>(t9));
+    ASSERT_EQUAL(data[1], get<1>(t9));
+    ASSERT_EQUAL(data[2], get<2>(t9));
+    ASSERT_EQUAL(data[3], get<3>(t9));
+    ASSERT_EQUAL(data[4], get<4>(t9));
+    ASSERT_EQUAL(data[5], get<5>(t9));
+    ASSERT_EQUAL(data[6], get<6>(t9));
+    ASSERT_EQUAL(data[7], get<7>(t9));
+    ASSERT_EQUAL(data[8], get<8>(t9));
+
+    tuple<T,T,T,T,T,T,T,T,T,T> t10 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]);
+    ASSERT_EQUAL(data[0], get<0>(t10));
+    ASSERT_EQUAL(data[1], get<1>(t10));
+    ASSERT_EQUAL(data[2], get<2>(t10));
+    ASSERT_EQUAL(data[3], get<3>(t10));
+    ASSERT_EQUAL(data[4], get<4>(t10));
+    ASSERT_EQUAL(data[5], get<5>(t10));
+    ASSERT_EQUAL(data[6], get<6>(t10));
+    ASSERT_EQUAL(data[7], get<7>(t10));
+    ASSERT_EQUAL(data[8], get<8>(t10));
+    ASSERT_EQUAL(data[9], get<9>(t10));
+  }
+};
+SimpleUnitTest<TestMakeTuple, BuiltinNumericTypes> TestMakeTupleInstance;
+
+template <typename T>
+struct TestTupleGet
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+    host_vector<T> data = random_integers<T>(10);
+
+    tuple<T> t1(data[0]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t1));
+
+    tuple<T,T> t2(data[0], data[1]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t2));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t2));
+
+    tuple<T,T,T> t3 = make_tuple(data[0], data[1], data[2]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t3));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t3));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t3));
+
+    tuple<T,T,T,T> t4 = make_tuple(data[0], data[1], data[2], data[3]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t4));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t4));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t4));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t4));
+
+    tuple<T,T,T,T,T> t5 = make_tuple(data[0], data[1], data[2], data[3], data[4]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t5));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t5));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t5));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t5));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t5));
+
+    tuple<T,T,T,T,T,T> t6 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t6));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t6));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t6));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t6));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t6));
+    ASSERT_EQUAL(data[5], thrust::get<5>(t6));
+
+    tuple<T,T,T,T,T,T,T> t7 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t7));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t7));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t7));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t7));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t7));
+    ASSERT_EQUAL(data[5], thrust::get<5>(t7));
+    ASSERT_EQUAL(data[6], thrust::get<6>(t7));
+
+    tuple<T,T,T,T,T,T,T,T> t8 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t8));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t8));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t8));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t8));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t8));
+    ASSERT_EQUAL(data[5], thrust::get<5>(t8));
+    ASSERT_EQUAL(data[6], thrust::get<6>(t8));
+    ASSERT_EQUAL(data[7], thrust::get<7>(t8));
+
+    tuple<T,T,T,T,T,T,T,T,T> t9 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t9));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t9));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t9));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t9));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t9));
+    ASSERT_EQUAL(data[5], thrust::get<5>(t9));
+    ASSERT_EQUAL(data[6], thrust::get<6>(t9));
+    ASSERT_EQUAL(data[7], thrust::get<7>(t9));
+    ASSERT_EQUAL(data[8], thrust::get<8>(t9));
+
+    tuple<T,T,T,T,T,T,T,T,T,T> t10 = make_tuple(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]);
+    ASSERT_EQUAL(data[0], thrust::get<0>(t10));
+    ASSERT_EQUAL(data[1], thrust::get<1>(t10));
+    ASSERT_EQUAL(data[2], thrust::get<2>(t10));
+    ASSERT_EQUAL(data[3], thrust::get<3>(t10));
+    ASSERT_EQUAL(data[4], thrust::get<4>(t10));
+    ASSERT_EQUAL(data[5], thrust::get<5>(t10));
+    ASSERT_EQUAL(data[6], thrust::get<6>(t10));
+    ASSERT_EQUAL(data[7], thrust::get<7>(t10));
+    ASSERT_EQUAL(data[8], thrust::get<8>(t10));
+    ASSERT_EQUAL(data[9], thrust::get<9>(t10));
+  }
+};
+SimpleUnitTest<TestTupleGet, BuiltinNumericTypes> TestTupleGetInstance;
+
+
+
+template <typename T>
+struct TestTupleComparison
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+    tuple<T,T,T,T,T> lhs(0, 0, 0, 0, 0), rhs(0, 0, 0, 0, 0);
+
+    // equality
+    ASSERT_EQUAL(true,  lhs == rhs);
+    get<0>(rhs) = 1;
+    ASSERT_EQUAL(false,  lhs == rhs);
+
+    // inequality
+    ASSERT_EQUAL(true,  lhs != rhs);
+    lhs = rhs;
+    ASSERT_EQUAL(false, lhs != rhs);
+
+    // less than
+    lhs = make_tuple(0,0,0,0,0);
+    rhs = make_tuple(0,0,1,0,0);
+    ASSERT_EQUAL(true,  lhs < rhs);
+    get<0>(lhs) = 2;
+    ASSERT_EQUAL(false, lhs < rhs);
+
+    // less than equal
+    lhs = make_tuple(0,0,0,0,0);
+    rhs = lhs;
+    ASSERT_EQUAL(true,  lhs <= rhs); // equal
+    get<2>(rhs) = 1;
+    ASSERT_EQUAL(true,  lhs <= rhs); // less than
+    get<2>(lhs) = 2;
+    ASSERT_EQUAL(false, lhs <= rhs);
+
+    // greater than
+    lhs = make_tuple(1,0,0,0,0);
+    rhs = make_tuple(0,1,1,1,1);
+    ASSERT_EQUAL(true,  lhs > rhs);
+    get<0>(rhs) = 2;
+    ASSERT_EQUAL(false, lhs > rhs);
+
+    // greater than equal
+    lhs = make_tuple(0,0,0,0,0);
+    rhs = lhs;
+    ASSERT_EQUAL(true,  lhs >= rhs); // equal
+    get<4>(lhs) = 1;
+    ASSERT_EQUAL(true,  lhs >= rhs); // greater than
+    get<3>(rhs) = 1;
+    ASSERT_EQUAL(false, lhs >= rhs);
+  }
+};
+SimpleUnitTest<TestTupleComparison, NumericTypes> TestTupleComparisonInstance;
+
+
+template <typename T>
+struct TestTupleTieFunctor
+{
+  __host__ __device__
+  void clear(T *data) const
+  {
+    for(int i = 0; i < 10; ++i)
+      data[i] = 13;
+  }
+
+  __host__ __device__
+  bool operator()() const
+  {
+    using namespace thrust;
+
+    bool result = true;
+
+    T data[10];
+    clear(data);
+
+    // 17 and not 0 to avoid triggering custom_numeric's `operator void *` and a comparison with a null pointer
+    // TODO: get this back from 17 to 0 once C++11 is on everywhere and that operator on custom_numeric is changed
+    // to an explicit operator bool
+    tie(data[0]) = make_tuple(17);
+    result &= data[0] == 17;
+    clear(data);
+
+    tie(data[0], data[1]) = make_tuple(17,1);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    clear(data);
+
+    tie(data[0], data[1], data[2]) = make_tuple(17,1,2);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3]) = make_tuple(17,1,2,3);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4]) = make_tuple(17,1,2,3,4);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4], data[5]) = make_tuple(17,1,2,3,4,5);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    result &= data[5] == 5;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6]) = make_tuple(17,1,2,3,4,5,6);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    result &= data[5] == 5;
+    result &= data[6] == 6;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]) = make_tuple(17,1,2,3,4,5,6,7);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    result &= data[5] == 5;
+    result &= data[6] == 6;
+    result &= data[7] == 7;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]) = make_tuple(17,1,2,3,4,5,6,7,8);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    result &= data[5] == 5;
+    result &= data[6] == 6;
+    result &= data[7] == 7;
+    result &= data[8] == 8;
+    clear(data);
+
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]) = make_tuple(17,1,2,3,4,5,6,7,8,9);
+    result &= data[0] == 17;
+    result &= data[1] == 1;
+    result &= data[2] == 2;
+    result &= data[3] == 3;
+    result &= data[4] == 4;
+    result &= data[5] == 5;
+    result &= data[6] == 6;
+    result &= data[7] == 7;
+    result &= data[8] == 8;
+    result &= data[9] == 9;
+    clear(data);
+
+    return result;
+  }
+};
+
+template <typename T>
+struct TestTupleTie
+{
+  void operator()(void)
+  {
+    thrust::host_vector<bool> h_result(1);
+    thrust::generate(h_result.begin(), h_result.end(), TestTupleTieFunctor<T>());
+
+    thrust::device_vector<bool> d_result(1);
+    thrust::generate(d_result.begin(), d_result.end(), TestTupleTieFunctor<T>());
+
+    ASSERT_EQUAL(true, h_result[0]);
+    ASSERT_EQUAL(true, d_result[0]);
+  }
+};
+SimpleUnitTest<TestTupleTie, NumericTypes> TestTupleTieInstance;
+
+void TestTupleSwap(void)
+{
+  int a = 7;
+  int b = 13;
+  int c = 42;
+
+  int x = 77;
+  int y = 1313;
+  int z = 4242;
+
+  thrust::tuple<int,int,int> t1(a,b,c);
+  thrust::tuple<int,int,int> t2(x,y,z);
+
+  thrust::swap(t1,t2);
+
+  ASSERT_EQUAL(x, thrust::get<0>(t1));
+  ASSERT_EQUAL(y, thrust::get<1>(t1));
+  ASSERT_EQUAL(z, thrust::get<2>(t1));
+  ASSERT_EQUAL(a, thrust::get<0>(t2));
+  ASSERT_EQUAL(b, thrust::get<1>(t2));
+  ASSERT_EQUAL(c, thrust::get<2>(t2));
+
+
+  typedef thrust::tuple<user_swappable,user_swappable,user_swappable,user_swappable> swappable_tuple;
+
+  thrust::host_vector<swappable_tuple>   h_v1(1), h_v2(1);
+  thrust::device_vector<swappable_tuple> d_v1(1), d_v2(1);
+
+  thrust::swap_ranges(h_v1.begin(), h_v1.end(), h_v2.begin());
+  thrust::swap_ranges(d_v1.begin(), d_v1.end(), d_v2.begin());
+
+  swappable_tuple ref(user_swappable(true),user_swappable(true),user_swappable(true),user_swappable(true));
+
+  ASSERT_EQUAL_QUIET(ref, h_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, h_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, (swappable_tuple)d_v1[0]);
+  ASSERT_EQUAL_QUIET(ref, (swappable_tuple)d_v1[0]);
+}
+DECLARE_UNITTEST(TestTupleSwap);
+
+
diff --git a/thrust/testing/tuple_algorithms.cu b/thrust/testing/tuple_algorithms.cu
new file mode 100644
index 0000000000000000000000000000000000000000..449fdc2f18f7c0141d5722169c59288b657e4397
--- /dev/null
+++ b/thrust/testing/tuple_algorithms.cu
@@ -0,0 +1,62 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <unittest/unittest.h>
+
+#include <thrust/detail/tuple_algorithms.h>
+#include <thrust/type_traits/integer_sequence.h>
+
+// FIXME: Replace with C++14 style `thrust::square<>` when we have it.
+struct custom_square
+{
+  template <typename T>
+  __host__ __device__
+  T operator()(T v) const
+  {
+    return v * v;
+  }
+};
+
+struct custom_square_inplace
+{
+  template <typename T>
+  __host__ __device__
+  void operator()(T& v) const
+  {
+    v *= v;
+  }
+};
+
+void test_tuple_subset()
+{
+  auto t0 = std::make_tuple(0, 2, 3.14);
+
+  auto t1 = thrust::tuple_subset(t0, thrust::index_sequence<2, 0>{});
+
+  ASSERT_EQUAL_QUIET(t1, std::make_tuple(3.14, 0));
+}
+DECLARE_UNITTEST(test_tuple_subset);
+
+void test_tuple_transform()
+{
+  auto t0 = std::make_tuple(0, 2, 3.14);
+
+  auto t1 = thrust::tuple_transform(t0, custom_square{});
+
+  ASSERT_EQUAL_QUIET(t1, std::make_tuple(0, 4, 9.8596));
+}
+DECLARE_UNITTEST(test_tuple_transform);
+
+void test_tuple_for_each()
+{
+  auto t = std::make_tuple(0, 2, 3.14);
+
+  thrust::tuple_for_each(t, custom_square_inplace{});
+
+  ASSERT_EQUAL_QUIET(t, std::make_tuple(0, 4, 9.8596));
+}
+DECLARE_UNITTEST(test_tuple_for_each);
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/testing/tuple_reduce.cu b/thrust/testing/tuple_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9edea4bda41c61f2b36c9851c57874b15f359ac7
--- /dev/null
+++ b/thrust/testing/tuple_reduce.cu
@@ -0,0 +1,60 @@
+#include <unittest/unittest.h>
+#include <thrust/tuple.h>
+#include <thrust/reduce.h>
+#include <thrust/transform.h>
+
+using namespace unittest;
+
+struct SumTupleFunctor
+{
+  template <typename Tuple>
+  __host__ __device__
+  Tuple operator()(const Tuple &lhs, const Tuple &rhs)
+  {
+    using thrust::get;
+  
+    return thrust::make_tuple(get<0>(lhs) + get<0>(rhs),
+                              get<1>(lhs) + get<1>(rhs));
+  }
+};
+
+struct MakeTupleFunctor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+  thrust::tuple<T1,T2> operator()(T1 &lhs, T2 &rhs)
+  {
+    return thrust::make_tuple(lhs, rhs);
+  }
+};
+
+template <typename T>
+struct TestTupleReduce
+{
+  void operator()(const size_t n)
+  {
+     using namespace thrust;
+
+     host_vector<T> h_t1 = random_integers<T>(n);
+     host_vector<T> h_t2 = random_integers<T>(n);
+
+     // zip up the data
+     host_vector< tuple<T,T> > h_tuples(n);
+     transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor());
+
+     // copy to device
+     device_vector< tuple<T,T> > d_tuples = h_tuples;
+     
+     tuple<T,T> zero(0,0);
+
+     // sum on host
+     tuple<T,T> h_result = reduce(h_tuples.begin(), h_tuples.end(), zero, SumTupleFunctor());
+
+     // sum on device
+     tuple<T,T> d_result = reduce(d_tuples.begin(), d_tuples.end(), zero, SumTupleFunctor());
+
+     ASSERT_EQUAL_QUIET(h_result, d_result);
+  }
+};
+VariableUnitTest<TestTupleReduce, IntegralTypes> TestTupleReduceInstance;
+
diff --git a/thrust/testing/tuple_scan.cu b/thrust/testing/tuple_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c15b81751192d5556f0b26e048ceca5923fa39b6
--- /dev/null
+++ b/thrust/testing/tuple_scan.cu
@@ -0,0 +1,82 @@
+#include <unittest/unittest.h>
+#include <thrust/tuple.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+using namespace unittest;
+
+struct SumTupleFunctor
+{
+  template <typename Tuple>
+  __host__ __device__
+  Tuple operator()(const Tuple &lhs, const Tuple &rhs)
+  {
+    using thrust::get;
+  
+    return thrust::make_tuple(get<0>(lhs) + get<0>(rhs),
+                              get<1>(lhs) + get<1>(rhs));
+  }
+};
+
+struct MakeTupleFunctor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+  thrust::tuple<T1,T2> operator()(T1 &lhs, T2 &rhs)
+  {
+    return thrust::make_tuple(lhs, rhs);
+  }
+};
+
+
+template <typename T>
+struct TestTupleScan
+{
+  void operator()(const size_t n)
+  {
+     using namespace thrust;
+
+     host_vector<T> h_t1 = unittest::random_integers<T>(n);
+     host_vector<T> h_t2 = unittest::random_integers<T>(n);
+
+     // initialize input
+     host_vector< tuple<T,T> > h_input(n);
+     transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor());
+     device_vector< tuple<T,T> > d_input = h_input;
+     
+     // allocate output
+     tuple<T,T> zero(0,0);
+     host_vector  < tuple<T,T> > h_output(n, zero);
+     device_vector< tuple<T,T> > d_output(n, zero);
+
+     // inclusive_scan
+     inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), SumTupleFunctor());
+     inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), SumTupleFunctor());
+     ASSERT_EQUAL_QUIET(h_output, d_output);
+
+    // The tests below get miscompiled on Tesla hw for 8b types
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
+    {
+      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
+      {
+        KNOWN_FAILURE;
+      } // end if
+    } // end if
+#endif
+
+     // exclusive_scan
+     tuple<T,T> init(13,17);
+     exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), init, SumTupleFunctor());
+     exclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), init, SumTupleFunctor());
+
+     ASSERT_EQUAL_QUIET(h_output, d_output);
+  }
+};
+VariableUnitTest<TestTupleScan, IntegralTypes> TestTupleScanInstance;
+
diff --git a/thrust/testing/tuple_sort.cu b/thrust/testing/tuple_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1684ba45e52d7d8648975c6742ec592fa3a3a59f
--- /dev/null
+++ b/thrust/testing/tuple_sort.cu
@@ -0,0 +1,76 @@
+#include <unittest/unittest.h>
+#include <thrust/tuple.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+using namespace unittest;
+
+struct MakeTupleFunctor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+  thrust::tuple<T1,T2> operator()(T1 &lhs, T2 &rhs)
+  {
+    return thrust::make_tuple(lhs, rhs);
+  }
+};
+
+template<int N>
+struct GetFunctor
+{
+  template<typename Tuple>
+  __host__ __device__
+  typename thrust::access_traits<
+    typename thrust::tuple_element<N, Tuple>::type
+  >::const_type
+  operator()(const Tuple &t)
+  {
+    return thrust::get<N>(t);
+  }
+};
+
+template <typename T>
+struct TestTupleStableSort
+{
+  void operator()(const size_t n)
+  {
+     using namespace thrust;
+
+     host_vector<T> h_keys   = random_integers<T>(n);
+     host_vector<T> h_values = random_integers<T>(n);
+
+     // zip up the data
+     host_vector< tuple<T,T> > h_tuples(n);
+     transform(h_keys.begin(),   h_keys.end(),
+               h_values.begin(), h_tuples.begin(),
+               MakeTupleFunctor());
+
+     // copy to device
+     device_vector< tuple<T,T> > d_tuples = h_tuples;
+
+     // sort on host
+     stable_sort(h_tuples.begin(), h_tuples.end());
+
+     // sort on device
+     stable_sort(d_tuples.begin(), d_tuples.end());
+
+     ASSERT_EQUAL(true, is_sorted(d_tuples.begin(), d_tuples.end()));
+
+     // select keys
+     transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>());
+
+     device_vector<T> d_keys(h_keys.size());
+     transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>());
+
+     // select values
+     transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>());
+     
+     device_vector<T> d_values(h_values.size());
+     transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>());
+
+     ASSERT_ALMOST_EQUAL(h_keys, d_keys);
+     ASSERT_ALMOST_EQUAL(h_values, d_values);
+  }
+};
+VariableUnitTest<TestTupleStableSort, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestTupleStableSortInstance;
+
diff --git a/thrust/testing/tuple_transform.cu b/thrust/testing/tuple_transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e77bd489be8219366b157b22ff4eaa62b0af6f77
--- /dev/null
+++ b/thrust/testing/tuple_transform.cu
@@ -0,0 +1,66 @@
+#include <unittest/unittest.h>
+#include <thrust/tuple.h>
+#include <thrust/transform.h>
+
+using namespace unittest;
+
+struct MakeTupleFunctor
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+  thrust::tuple<T1,T2> operator()(T1 &lhs, T2 &rhs)
+  {
+    return thrust::make_tuple(lhs, rhs);
+  }
+};
+
+template<int N>
+struct GetFunctor
+{
+  template<typename Tuple>
+  __host__ __device__
+  typename thrust::access_traits<
+    typename thrust::tuple_element<N, Tuple>::type
+  >::const_type
+  operator()(const Tuple &t)
+  {
+    return thrust::get<N>(t);
+  }
+};
+
+
+template <typename T>
+struct TestTupleTransform
+{
+  void operator()(const size_t n)
+  {
+     using namespace thrust;
+
+     host_vector<T> h_t1 = random_integers<T>(n);
+     host_vector<T> h_t2 = random_integers<T>(n);
+
+     // zip up the data
+     host_vector< tuple<T,T> > h_tuples(n);
+     transform(h_t1.begin(), h_t1.end(),
+               h_t2.begin(), h_tuples.begin(),
+               MakeTupleFunctor());
+
+     // copy to device
+     device_vector< tuple<T,T> > d_tuples = h_tuples;
+
+     device_vector<T> d_t1(n), d_t2(n);
+
+     // select 0th
+     transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>());
+
+     // select 1st
+     transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>());
+
+     ASSERT_ALMOST_EQUAL(h_t1, d_t1);
+     ASSERT_ALMOST_EQUAL(h_t2, d_t2);
+
+     ASSERT_EQUAL_QUIET(h_tuples, d_tuples);
+  }
+};
+VariableUnitTest<TestTupleTransform, SignedIntegralTypes> TestTupleTransformInstance;
+
diff --git a/thrust/testing/type_traits.cu b/thrust/testing/type_traits.cu
new file mode 100644
index 0000000000000000000000000000000000000000..339e11b9088a184490a20e89339242c924e5f913
--- /dev/null
+++ b/thrust/testing/type_traits.cu
@@ -0,0 +1,127 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/device_ptr.h>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+struct non_pod
+{
+  // non-pods can have constructors
+  non_pod(void)
+  {}
+
+  int x; int y;
+};
+
+void TestIsPlainOldData(void)
+{
+    // primitive types
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<bool>::value, true);
+
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<char>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<signed char>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<unsigned char>::value, true);
+    
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<short>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<signed short>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<unsigned short>::value, true);
+
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<int>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<signed int>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<unsigned int>::value, true);
+    
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<long>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<signed long>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<unsigned long>::value, true);
+    
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<long long>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<signed long long>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<unsigned long long>::value, true);
+    
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<float>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<double>::value, true);
+    
+    // void
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<void>::value, true);
+
+    // structs
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<non_pod>::value, false);
+
+    // pointers
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<char *>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<int *>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<int **>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<non_pod *>::value, true);
+
+    // const types
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<const int>::value, true);
+    ASSERT_EQUAL((bool)thrust::detail::is_pod<const int *>::value, true);
+}
+DECLARE_UNITTEST(TestIsPlainOldData);
+
+void TestIsContiguousIterator(void)
+{
+    typedef thrust::host_vector<int>   HostVector;
+    typedef thrust::device_vector<int> DeviceVector;
+    
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< int * >::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< thrust::device_ptr<int> >::value, true);
+
+
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<HostVector::iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<HostVector::const_iterator>::value, true);
+
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<DeviceVector::iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<DeviceVector::const_iterator>::value, true);
+
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< thrust::device_ptr<int> >::value, true);
+
+    typedef thrust::tuple< HostVector::iterator,   HostVector::iterator   > HostIteratorTuple;
+
+    typedef thrust::constant_iterator<int> ConstantIterator;
+    typedef thrust::counting_iterator<int> CountingIterator;
+    typedef thrust::transform_iterator<thrust::identity<int>, HostVector::iterator > TransformIterator;
+    typedef thrust::zip_iterator< HostIteratorTuple >  ZipIterator;
+
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<ConstantIterator>::value,  false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<CountingIterator>::value,  false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<TransformIterator>::value, false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<ZipIterator>::value,       false);
+
+}
+DECLARE_UNITTEST(TestIsContiguousIterator);
+
+void TestIsCommutative(void)
+{
+  { typedef int T; typedef thrust::plus<T>        Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::multiplies<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::minimum<T>     Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::maximum<T>     Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::logical_or<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::logical_and<T> Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::bit_or<T>      Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::bit_and<T>     Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef int T; typedef thrust::bit_xor<T>     Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  
+  { typedef char      T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef short     T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef long      T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef long long T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef float     T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  { typedef double    T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, true); }
+  
+  { typedef int   T; typedef thrust::minus<T>   Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, false); }
+  { typedef int   T; typedef thrust::divides<T> Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, false); }
+  { typedef float T; typedef thrust::divides<T> Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, false); }
+  { typedef float T; typedef thrust::minus<T>   Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, false); }
+  
+  { typedef thrust::tuple<int,int> T; typedef thrust::plus<T>  Op; ASSERT_EQUAL((bool) thrust::detail::is_commutative<Op>::value, false); }
+}
+DECLARE_UNITTEST(TestIsCommutative);
+
diff --git a/thrust/testing/uninitialized_copy.cu b/thrust/testing/uninitialized_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7455d8c81131fe013a23bc1ac86ae7803c8956dd
--- /dev/null
+++ b/thrust/testing/uninitialized_copy.cu
@@ -0,0 +1,274 @@
+#include <unittest/unittest.h>
+#include <thrust/uninitialized_copy.h>
+#include <thrust/device_malloc_allocator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename InputIterator, typename ForwardIterator>
+ForwardIterator uninitialized_copy(my_system &system,
+                                   InputIterator,
+                                   InputIterator,
+                                   ForwardIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestUninitializedCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::uninitialized_copy(sys,
+                               vec.begin(),
+                               vec.begin(),
+                               vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUninitializedCopyDispatchExplicit);
+
+
+template<typename InputIterator, typename ForwardIterator>
+ForwardIterator uninitialized_copy(my_tag,
+                                   InputIterator,
+                                   InputIterator,
+                                   ForwardIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestUninitializedCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::uninitialized_copy(thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUninitializedCopyDispatchImplicit);
+
+
+template<typename InputIterator, typename Size, typename ForwardIterator>
+ForwardIterator uninitialized_copy_n(my_system &system,
+                                     InputIterator,
+                                     Size,
+                                     ForwardIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestUninitializedCopyNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::uninitialized_copy_n(sys,
+                                 vec.begin(),
+                                 vec.size(),
+                                 vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUninitializedCopyNDispatchExplicit);
+
+
+template<typename InputIterator, typename Size, typename ForwardIterator>
+ForwardIterator uninitialized_copy_n(my_tag,
+                                     InputIterator,
+                                     Size,
+                                     ForwardIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestUninitializedCopyNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::uninitialized_copy_n(thrust::retag<my_tag>(vec.begin()),
+                                 vec.size(),
+                                 thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUninitializedCopyNDispatchImplicit);
+
+
+template <class Vector>
+void TestUninitializedCopySimplePOD(void)
+{
+    Vector v1(5);
+    v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+
+    // copy to Vector
+    Vector v2(5);
+    thrust::uninitialized_copy(v1.begin(), v1.end(), v2.begin());
+    ASSERT_EQUAL(v2[0], 0);
+    ASSERT_EQUAL(v2[1], 1);
+    ASSERT_EQUAL(v2[2], 2);
+    ASSERT_EQUAL(v2[3], 3);
+    ASSERT_EQUAL(v2[4], 4);
+}
+DECLARE_VECTOR_UNITTEST(TestUninitializedCopySimplePOD);
+
+
+template<typename Vector>
+void TestUninitializedCopyNSimplePOD(void)
+{
+    Vector v1(5);
+    v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
+
+    // copy to Vector
+    Vector v2(5);
+    thrust::uninitialized_copy_n(v1.begin(), v1.size(), v2.begin());
+    ASSERT_EQUAL(v2[0], 0);
+    ASSERT_EQUAL(v2[1], 1);
+    ASSERT_EQUAL(v2[2], 2);
+    ASSERT_EQUAL(v2[3], 3);
+    ASSERT_EQUAL(v2[4], 4);
+}
+DECLARE_VECTOR_UNITTEST(TestUninitializedCopyNSimplePOD);
+
+
+struct CopyConstructTest
+{
+  __host__ __device__
+  CopyConstructTest(void)
+    :copy_constructed_on_host(false),
+     copy_constructed_on_device(false)
+  {}
+
+  __host__ __device__
+  CopyConstructTest(const CopyConstructTest &)
+  {
+#if __CUDA_ARCH__
+    copy_constructed_on_device = true;
+    copy_constructed_on_host   = false;
+#else
+    copy_constructed_on_device = false;
+    copy_constructed_on_device = true;
+#endif
+  }
+
+  __host__ __device__
+  CopyConstructTest &operator=(const CopyConstructTest &x)
+  {
+    copy_constructed_on_host   = x.copy_constructed_on_host;
+    copy_constructed_on_device = x.copy_constructed_on_device;
+    return *this;
+  }
+
+  bool copy_constructed_on_host;
+  bool copy_constructed_on_device;
+};
+
+
+struct TestUninitializedCopyNonPODDevice
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+
+    thrust::device_vector<T> v1(5), v2(5);
+
+    T x;
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    x = v1[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    thrust::uninitialized_copy(v1.begin(), v1.end(), v2.begin());
+
+    x = v2[0];
+    ASSERT_EQUAL(true,  x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedCopyNonPODDevice);
+
+
+struct TestUninitializedCopyNNonPODDevice
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+
+    thrust::device_vector<T> v1(5), v2(5);
+
+    T x;
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    x = v1[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    thrust::uninitialized_copy_n(v1.begin(), v1.size(), v2.begin());
+
+    x = v2[0];
+    ASSERT_EQUAL(true,  x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedCopyNNonPODDevice);
+
+
+struct TestUninitializedCopyNonPODHost
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+
+    thrust::host_vector<T> v1(5), v2(5);
+
+    T x;
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    x = v1[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    thrust::uninitialized_copy(v1.begin(), v1.end(), v2.begin());
+
+    x = v2[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(true,  x.copy_constructed_on_host);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedCopyNonPODHost);
+
+
+struct TestUninitializedCopyNNonPODHost
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+
+    thrust::host_vector<T> v1(5), v2(5);
+
+    T x;
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    x = v1[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(false, x.copy_constructed_on_host);
+
+    thrust::uninitialized_copy_n(v1.begin(), v1.size(), v2.begin());
+
+    x = v2[0];
+    ASSERT_EQUAL(false, x.copy_constructed_on_device);
+    ASSERT_EQUAL(true,  x.copy_constructed_on_host);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedCopyNNonPODHost);
+
diff --git a/thrust/testing/uninitialized_fill.cu b/thrust/testing/uninitialized_fill.cu
new file mode 100644
index 0000000000000000000000000000000000000000..facd6fe6f09ead4606e7b75209607fd27f82e09d
--- /dev/null
+++ b/thrust/testing/uninitialized_fill.cu
@@ -0,0 +1,297 @@
+#include <unittest/unittest.h>
+#include <thrust/uninitialized_fill.h>
+#include <thrust/device_malloc_allocator.h>
+#include <thrust/iterator/retag.h>
+
+
+template<typename ForwardIterator, typename T>
+void uninitialized_fill(my_system &system,
+                        ForwardIterator,
+                        ForwardIterator,
+                        const T &)
+{
+    system.validate_dispatch();
+}
+
+void TestUninitializedFillDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::uninitialized_fill(sys, vec.begin(), vec.begin(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUninitializedFillDispatchExplicit);
+
+
+template<typename ForwardIterator, typename T>
+void uninitialized_fill(my_tag,
+                        ForwardIterator first,
+                        ForwardIterator,
+                        const T &)
+{
+    *first = 13;
+}
+
+void TestUninitializedFillDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::uninitialized_fill(thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               0);
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUninitializedFillDispatchImplicit);
+
+
+template<typename ForwardIterator, typename Size, typename T>
+ForwardIterator uninitialized_fill_n(my_system &system,
+                                     ForwardIterator first,
+                                     Size,
+                                     const T &)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestUninitializedFillNDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::uninitialized_fill_n(sys, vec.begin(), vec.size(), 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUninitializedFillNDispatchExplicit);
+
+
+template<typename ForwardIterator, typename Size, typename T>
+ForwardIterator uninitialized_fill_n(my_tag,
+                                     ForwardIterator first,
+                                     Size,
+                                     const T &)
+{
+    *first = 13;
+    return first;
+}
+
+void TestUninitializedFillNDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::uninitialized_fill_n(sys,
+                                 vec.begin(),
+                                 vec.size(),
+                                 0);
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUninitializedFillNDispatchImplicit);
+
+
+template <class Vector>
+void TestUninitializedFillPOD(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    T exemplar(7);
+
+    thrust::uninitialized_fill(v.begin() + 1, v.begin() + 4, exemplar);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], 4);
+
+    exemplar = 8;
+    
+    thrust::uninitialized_fill(v.begin() + 0, v.begin() + 3, exemplar);
+    
+    ASSERT_EQUAL(v[0], exemplar);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+
+    exemplar = 9;
+    
+    thrust::uninitialized_fill(v.begin() + 2, v.end(), exemplar);
+    
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], 9);
+
+    exemplar = 1;
+
+    thrust::uninitialized_fill(v.begin(), v.end(), exemplar);
+    
+    ASSERT_EQUAL(v[0], exemplar);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], exemplar);
+}
+DECLARE_VECTOR_UNITTEST(TestUninitializedFillPOD);
+
+
+struct CopyConstructTest
+{
+  __host__ __device__
+  CopyConstructTest(void)
+    :copy_constructed_on_host(false),
+     copy_constructed_on_device(false)
+  {}
+
+  __host__ __device__
+  CopyConstructTest(const CopyConstructTest &)
+  {
+#if __CUDA_ARCH__
+    copy_constructed_on_device = true;
+    copy_constructed_on_host   = false;
+#else
+    copy_constructed_on_device = false;
+    copy_constructed_on_host   = true;
+#endif
+  }
+
+  __host__ __device__
+  CopyConstructTest &operator=(const CopyConstructTest &x)
+  {
+    copy_constructed_on_host   = x.copy_constructed_on_host;
+    copy_constructed_on_device = x.copy_constructed_on_device;
+    return *this;
+  }
+
+  bool copy_constructed_on_host;
+  bool copy_constructed_on_device;
+};
+
+
+struct TestUninitializedFillNonPOD
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+    thrust::device_ptr<T> v = thrust::device_malloc<T>(5);
+
+    T exemplar;
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_device);
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_host);
+
+    T host_copy_of_exemplar(exemplar);
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_device);
+    ASSERT_EQUAL(true,  exemplar.copy_constructed_on_host);
+
+    // copy construct v from the exemplar
+    thrust::uninitialized_fill(v, v + 1, exemplar);
+
+    T x;
+    ASSERT_EQUAL(false,  x.copy_constructed_on_device);
+    ASSERT_EQUAL(false,  x.copy_constructed_on_host);
+
+    x = v[0];
+    ASSERT_EQUAL(true,   x.copy_constructed_on_device);
+    ASSERT_EQUAL(false,  x.copy_constructed_on_host);
+
+    thrust::device_free(v);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedFillNonPOD);
+
+template <class Vector>
+void TestUninitializedFillNPOD(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    T exemplar(7);
+
+    typename Vector::iterator iter = thrust::uninitialized_fill_n(v.begin() + 1, 3, exemplar);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL_QUIET(v.begin() + 4, iter);
+
+    exemplar = 8;
+    
+    iter = thrust::uninitialized_fill_n(v.begin() + 0, 3, exemplar);
+    
+    ASSERT_EQUAL(v[0], exemplar);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], 7);
+    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL_QUIET(v.begin() + 3, iter);
+
+    exemplar = 9;
+    
+    iter = thrust::uninitialized_fill_n(v.begin() + 2, 3, exemplar);
+    
+    ASSERT_EQUAL(v[0], 8);
+    ASSERT_EQUAL(v[1], 8);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], 9);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+
+    exemplar = 1;
+
+    iter = thrust::uninitialized_fill_n(v.begin(), v.size(), exemplar);
+    
+    ASSERT_EQUAL(v[0], exemplar);
+    ASSERT_EQUAL(v[1], exemplar);
+    ASSERT_EQUAL(v[2], exemplar);
+    ASSERT_EQUAL(v[3], exemplar);
+    ASSERT_EQUAL(v[4], exemplar);
+    ASSERT_EQUAL_QUIET(v.end(), iter);
+}
+DECLARE_VECTOR_UNITTEST(TestUninitializedFillNPOD);
+
+
+struct TestUninitializedFillNNonPOD
+{
+  void operator()(const size_t)
+  {
+    typedef CopyConstructTest T;
+    thrust::device_ptr<T> v = thrust::device_malloc<T>(5);
+
+    T exemplar;
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_device);
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_host);
+
+    T host_copy_of_exemplar(exemplar);
+    ASSERT_EQUAL(false, exemplar.copy_constructed_on_device);
+    ASSERT_EQUAL(true,  exemplar.copy_constructed_on_host);
+
+    // copy construct v from the exemplar
+    thrust::uninitialized_fill_n(v, 1, exemplar);
+
+    T x;
+    ASSERT_EQUAL(false,  x.copy_constructed_on_device);
+    ASSERT_EQUAL(false,  x.copy_constructed_on_host);
+
+    x = v[0];
+    ASSERT_EQUAL(true,   x.copy_constructed_on_device);
+    ASSERT_EQUAL(false,  x.copy_constructed_on_host);
+
+    thrust::device_free(v);
+  }
+};
+DECLARE_UNITTEST(TestUninitializedFillNNonPOD);
+
diff --git a/thrust/testing/unique.cu b/thrust/testing/unique.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8073832dfa6b6e552b8025c1ded70e22e93a9b00
--- /dev/null
+++ b/thrust/testing/unique.cu
@@ -0,0 +1,268 @@
+#include <unittest/unittest.h>
+#include <thrust/unique.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template <typename ForwardIterator>
+ForwardIterator unique(my_system &system,
+                       ForwardIterator first,
+                       ForwardIterator)
+{
+    system.validate_dispatch();
+    return first;
+}
+
+void TestUniqueDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique(sys, vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueDispatchExplicit);
+
+
+template <typename ForwardIterator>
+ForwardIterator unique(my_tag,
+                       ForwardIterator first,
+                       ForwardIterator)
+{
+    *first = 13;
+    return first;
+}
+
+
+void TestUniqueDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::unique(thrust::retag<my_tag>(vec.begin()),
+                   thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUniqueDispatchImplicit);
+
+
+template <typename InputIterator,
+          typename OutputIterator>
+OutputIterator unique_copy(my_system &system,
+                           InputIterator,
+                           InputIterator,
+                           OutputIterator result)
+{
+    system.validate_dispatch();
+    return result;
+}
+
+void TestUniqueCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique_copy(sys, vec.begin(), vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueCopyDispatchExplicit);
+
+
+template <typename InputIterator,
+          typename OutputIterator>
+OutputIterator unique_copy(my_tag,
+                           InputIterator,
+                           InputIterator,
+                           OutputIterator result)
+{
+    *result = 13;
+    return result;
+}
+
+void TestUniqueCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::unique_copy(thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()),
+                        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUniqueCopyDispatchImplicit);
+
+
+template<typename T>
+struct is_equal_div_10_unique
+{
+    __host__ __device__
+    bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+template<typename Vector>
+void TestUniqueSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(10);
+    data[0] = 11; 
+    data[1] = 11; 
+    data[2] = 12;
+    data[3] = 20; 
+    data[4] = 29; 
+    data[5] = 21; 
+    data[6] = 21; 
+    data[7] = 31; 
+    data[8] = 31; 
+    data[9] = 37; 
+
+    typename Vector::iterator new_last;
+    
+    new_last = thrust::unique(data.begin(), data.end());
+
+    ASSERT_EQUAL(new_last - data.begin(), 7);
+    ASSERT_EQUAL(data[0], 11);
+    ASSERT_EQUAL(data[1], 12);
+    ASSERT_EQUAL(data[2], 20);
+    ASSERT_EQUAL(data[3], 29);
+    ASSERT_EQUAL(data[4], 21);
+    ASSERT_EQUAL(data[5], 31);
+    ASSERT_EQUAL(data[6], 37);
+
+    new_last = thrust::unique(data.begin(), new_last, is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(new_last - data.begin(), 3);
+    ASSERT_EQUAL(data[0], 11);
+    ASSERT_EQUAL(data[1], 20);
+    ASSERT_EQUAL(data[2], 31);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueSimple);
+
+
+template<typename T>
+struct TestUnique
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<bool>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        typename thrust::host_vector<T>::iterator   h_new_last;
+        typename thrust::device_vector<T>::iterator d_new_last;
+
+        h_new_last = thrust::unique(h_data.begin(), h_data.end());
+        d_new_last = thrust::unique(d_data.begin(), d_data.end());
+
+        ASSERT_EQUAL(h_new_last - h_data.begin(), d_new_last - d_data.begin());
+        
+        h_data.resize(h_new_last - h_data.begin());
+        d_data.resize(d_new_last - d_data.begin());
+
+        ASSERT_EQUAL(h_data, d_data);
+    }
+};
+VariableUnitTest<TestUnique, IntegralTypes> TestUniqueInstance;
+
+
+template<typename Vector>
+void TestUniqueCopySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(10);
+    data[0] = 11; 
+    data[1] = 11; 
+    data[2] = 12;
+    data[3] = 20; 
+    data[4] = 29; 
+    data[5] = 21; 
+    data[6] = 21; 
+    data[7] = 31; 
+    data[8] = 31; 
+    data[9] = 37; 
+    
+    Vector output(10, -1);
+
+    typename Vector::iterator new_last;
+    
+    new_last = thrust::unique_copy(data.begin(), data.end(), output.begin());
+
+    ASSERT_EQUAL(new_last - output.begin(), 7);
+    ASSERT_EQUAL(output[0], 11);
+    ASSERT_EQUAL(output[1], 12);
+    ASSERT_EQUAL(output[2], 20);
+    ASSERT_EQUAL(output[3], 29);
+    ASSERT_EQUAL(output[4], 21);
+    ASSERT_EQUAL(output[5], 31);
+    ASSERT_EQUAL(output[6], 37);
+
+    new_last = thrust::unique_copy(output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(new_last - data.begin(), 3);
+    ASSERT_EQUAL(data[0], 11);
+    ASSERT_EQUAL(data[1], 20);
+    ASSERT_EQUAL(data[2], 31);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCopySimple);
+
+
+template<typename T>
+struct TestUniqueCopy
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<bool>(n);
+        thrust::device_vector<T> d_data = h_data;
+        
+        thrust::host_vector<T>   h_output(n);
+        thrust::device_vector<T> d_output(n);
+
+        typename thrust::host_vector<T>::iterator   h_new_last;
+        typename thrust::device_vector<T>::iterator d_new_last;
+
+        h_new_last = thrust::unique_copy(h_data.begin(), h_data.end(), h_output.begin());
+        d_new_last = thrust::unique_copy(d_data.begin(), d_data.end(), d_output.begin());
+
+        ASSERT_EQUAL(h_new_last - h_output.begin(), d_new_last - d_output.begin());
+        
+        h_data.resize(h_new_last - h_output.begin());
+        d_data.resize(d_new_last - d_output.begin());
+
+        ASSERT_EQUAL(h_output, d_output);
+    }
+};
+VariableUnitTest<TestUniqueCopy, IntegralTypes> TestUniqueCopyInstance;
+
+
+template<typename T>
+struct TestUniqueCopyToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_integers<bool>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T> h_unique = h_data;
+        h_unique.erase(thrust::unique(h_unique.begin(), h_unique.end()), h_unique.end());
+
+        thrust::discard_iterator<> reference(h_unique.size());
+
+        typename thrust::host_vector<T>::iterator   h_new_last;
+        typename thrust::device_vector<T>::iterator d_new_last;
+
+        thrust::discard_iterator<> h_result =
+          thrust::unique_copy(h_data.begin(), h_data.end(), thrust::make_discard_iterator());
+
+        thrust::discard_iterator<> d_result =
+          thrust::unique_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator());
+
+        ASSERT_EQUAL_QUIET(reference, h_result);
+        ASSERT_EQUAL_QUIET(reference, d_result);
+    }
+};
+VariableUnitTest<TestUniqueCopyToDiscardIterator, IntegralTypes> TestUniqueCopyToDiscardIteratorInstance;
+
+
diff --git a/thrust/testing/unique_by_key.cu b/thrust/testing/unique_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..76073e0ca33818d5610edd8adc5a1b406a4f6c97
--- /dev/null
+++ b/thrust/testing/unique_by_key.cu
@@ -0,0 +1,438 @@
+#include <unittest/unittest.h>
+#include <thrust/unique.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+
+
+template <typename ForwardIterator1,
+          typename ForwardIterator2>
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(my_system &system,
+              ForwardIterator1 keys_first, 
+              ForwardIterator1,
+              ForwardIterator2 values_first)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(keys_first,values_first);
+}
+
+void TestUniqueByKeyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique_by_key(sys, vec.begin(), vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueByKeyDispatchExplicit);
+
+
+template <typename ForwardIterator1,
+          typename ForwardIterator2>
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(my_tag,
+              ForwardIterator1 keys_first, 
+              ForwardIterator1,
+              ForwardIterator2 values_first)
+{
+    *keys_first = 13;
+    return thrust::make_pair(keys_first,values_first);
+}
+
+void TestUniqueByKeyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::unique_by_key(thrust::retag<my_tag>(vec.begin()), 
+                          thrust::retag<my_tag>(vec.begin()),
+                          thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUniqueByKeyDispatchImplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(my_system &system,
+                   InputIterator1, 
+                   InputIterator1,
+                   InputIterator2,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output)
+{
+    system.validate_dispatch();
+    return thrust::make_pair(keys_output, values_output);
+}
+
+void TestUniqueByKeyCopyDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique_by_key_copy(sys,
+                               vec.begin(),
+                               vec.begin(),
+                               vec.begin(),
+                               vec.begin(),
+                               vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueByKeyCopyDispatchExplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2>
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(my_tag,
+                   InputIterator1, 
+                   InputIterator1,
+                   InputIterator2,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output)
+{
+    *keys_output = 13;
+    return thrust::make_pair(keys_output, values_output);
+}
+
+void TestUniqueByKeyCopyDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    thrust::unique_by_key_copy(thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()),
+                               thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestUniqueByKeyCopyDispatchImplicit);
+
+
+template<typename T>
+struct is_equal_div_10_unique
+{
+    __host__ __device__
+    bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+template <typename Vector>
+void initialize_keys(Vector& keys)
+{
+    keys.resize(9);
+    keys[0] = 11;
+    keys[1] = 11;
+    keys[2] = 21;
+    keys[3] = 20;
+    keys[4] = 21;
+    keys[5] = 21;
+    keys[6] = 21;
+    keys[7] = 37;
+    keys[8] = 37;
+}
+
+template <typename Vector>
+void initialize_values(Vector& values)
+{
+    values.resize(9);
+    values[0] = 0; 
+    values[1] = 1;
+    values[2] = 2;
+    values[3] = 3;
+    values[4] = 4;
+    values[5] = 5;
+    values[6] = 6;
+    values[7] = 7;
+    values[8] = 8;
+}
+
+
+template<typename Vector>
+void TestUniqueByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector keys;
+    Vector values;
+
+    typename thrust::pair<typename Vector::iterator, typename Vector::iterator> new_last;
+
+    // basic test
+    initialize_keys(keys);  initialize_values(values);
+
+    new_last = thrust::unique_by_key(keys.begin(), keys.end(), values.begin());
+
+    ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
+    ASSERT_EQUAL(new_last.second - values.begin(), 5);
+    ASSERT_EQUAL(keys[0], 11);
+    ASSERT_EQUAL(keys[1], 21);
+    ASSERT_EQUAL(keys[2], 20);
+    ASSERT_EQUAL(keys[3], 21);
+    ASSERT_EQUAL(keys[4], 37);
+    
+    ASSERT_EQUAL(values[0], 0);
+    ASSERT_EQUAL(values[1], 2);
+    ASSERT_EQUAL(values[2], 3);
+    ASSERT_EQUAL(values[3], 4);
+    ASSERT_EQUAL(values[4], 7);
+
+    // test BinaryPredicate
+    initialize_keys(keys);  initialize_values(values);
+    
+    new_last = thrust::unique_by_key(keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
+    ASSERT_EQUAL(new_last.second - values.begin(), 3);
+    ASSERT_EQUAL(keys[0], 11);
+    ASSERT_EQUAL(keys[1], 21);
+    ASSERT_EQUAL(keys[2], 37);
+    
+    ASSERT_EQUAL(values[0], 0);
+    ASSERT_EQUAL(values[1], 2);
+    ASSERT_EQUAL(values[2], 7);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueByKeySimple);
+
+
+template<typename Vector>
+void TestUniqueCopyByKeySimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector keys;
+    Vector values;
+
+    typename thrust::pair<typename Vector::iterator, typename Vector::iterator> new_last;
+
+    // basic test
+    initialize_keys(keys);  initialize_values(values);
+
+    Vector output_keys(keys.size());
+    Vector output_values(values.size());
+
+    new_last = thrust::unique_by_key_copy(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+
+    ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
+    ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
+    ASSERT_EQUAL(output_keys[0], 11);
+    ASSERT_EQUAL(output_keys[1], 21);
+    ASSERT_EQUAL(output_keys[2], 20);
+    ASSERT_EQUAL(output_keys[3], 21);
+    ASSERT_EQUAL(output_keys[4], 37);
+    
+    ASSERT_EQUAL(output_values[0], 0);
+    ASSERT_EQUAL(output_values[1], 2);
+    ASSERT_EQUAL(output_values[2], 3);
+    ASSERT_EQUAL(output_values[3], 4);
+    ASSERT_EQUAL(output_values[4], 7);
+
+    // test BinaryPredicate
+    initialize_keys(keys);  initialize_values(values);
+    
+    new_last = thrust::unique_by_key_copy(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
+    ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
+    ASSERT_EQUAL(output_keys[0], 11);
+    ASSERT_EQUAL(output_keys[1], 21);
+    ASSERT_EQUAL(output_keys[2], 37);
+    
+    ASSERT_EQUAL(output_values[0], 0);
+    ASSERT_EQUAL(output_values[1], 2);
+    ASSERT_EQUAL(output_values[2], 7);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCopyByKeySimple);
+
+
+template<typename K>
+struct TestUniqueByKey
+{
+    void operator()(const size_t n)
+    {
+        typedef unsigned int V; // ValueType
+
+        thrust::host_vector<K>   h_keys = unittest::random_integers<bool>(n);
+        thrust::host_vector<V>   h_vals = unittest::random_integers<V>(n);
+        thrust::device_vector<K> d_keys = h_keys;
+        thrust::device_vector<V> d_vals = h_vals;
+
+        typedef typename thrust::host_vector<K>::iterator   HostKeyIterator;
+        typedef typename thrust::host_vector<V>::iterator   HostValIterator;
+        typedef typename thrust::device_vector<K>::iterator DeviceKeyIterator;
+        typedef typename thrust::device_vector<V>::iterator DeviceValIterator;
+
+        typedef typename thrust::pair<HostKeyIterator,  HostValIterator>   HostIteratorPair;
+        typedef typename thrust::pair<DeviceKeyIterator,DeviceValIterator> DeviceIteratorPair;
+
+        HostIteratorPair   h_last = thrust::unique_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+        DeviceIteratorPair d_last = thrust::unique_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+        ASSERT_EQUAL(h_last.first  - h_keys.begin(), d_last.first  - d_keys.begin());
+        ASSERT_EQUAL(h_last.second - h_vals.begin(), d_last.second - d_vals.begin());
+       
+        size_t N = h_last.first - h_keys.begin();
+
+        h_keys.resize(N);
+        h_vals.resize(N);
+        d_keys.resize(N);
+        d_vals.resize(N);
+
+        ASSERT_EQUAL(h_keys, d_keys);
+        ASSERT_EQUAL(h_vals, d_vals);
+    }
+};
+VariableUnitTest<TestUniqueByKey, IntegralTypes> TestUniqueByKeyInstance;
+
+
+template<typename K>
+struct TestUniqueCopyByKey
+{
+    void operator()(const size_t n)
+    {
+        typedef unsigned int V; // ValueType
+
+        thrust::host_vector<K>   h_keys = unittest::random_integers<bool>(n);
+        thrust::host_vector<V>   h_vals = unittest::random_integers<V>(n);
+        thrust::device_vector<K> d_keys = h_keys;
+        thrust::device_vector<V> d_vals = h_vals;
+
+        thrust::host_vector<K>   h_keys_output(n);
+        thrust::host_vector<V>   h_vals_output(n);
+        thrust::device_vector<K> d_keys_output(n);
+        thrust::device_vector<V> d_vals_output(n);
+
+        typedef typename thrust::host_vector<K>::iterator   HostKeyIterator;
+        typedef typename thrust::host_vector<V>::iterator   HostValIterator;
+        typedef typename thrust::device_vector<K>::iterator DeviceKeyIterator;
+        typedef typename thrust::device_vector<V>::iterator DeviceValIterator;
+
+        typedef typename thrust::pair<HostKeyIterator,  HostValIterator>   HostIteratorPair;
+        typedef typename thrust::pair<DeviceKeyIterator,DeviceValIterator> DeviceIteratorPair;
+
+        HostIteratorPair   h_last = thrust::unique_by_key_copy(h_keys.begin(), h_keys.end(), h_vals.begin(), h_keys_output.begin(), h_vals_output.begin());
+        DeviceIteratorPair d_last = thrust::unique_by_key_copy(d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys_output.begin(), d_vals_output.begin());
+
+        ASSERT_EQUAL(h_last.first  - h_keys_output.begin(), d_last.first  - d_keys_output.begin());
+        ASSERT_EQUAL(h_last.second - h_vals_output.begin(), d_last.second - d_vals_output.begin());
+       
+        size_t N = h_last.first - h_keys_output.begin();
+
+        h_keys_output.resize(N);
+        h_vals_output.resize(N);
+        d_keys_output.resize(N);
+        d_vals_output.resize(N);
+
+        ASSERT_EQUAL(h_keys_output, d_keys_output);
+        ASSERT_EQUAL(h_vals_output, d_vals_output);
+    }
+};
+VariableUnitTest<TestUniqueCopyByKey, IntegralTypes> TestUniqueCopyByKeyInstance;
+
+template<typename K>
+struct TestUniqueCopyByKeyToDiscardIterator
+{
+    void operator()(const size_t n)
+    {
+        typedef unsigned int V; // ValueType
+
+        thrust::host_vector<K>   h_keys = unittest::random_integers<bool>(n);
+        thrust::host_vector<V>   h_vals = unittest::random_integers<V>(n);
+        thrust::device_vector<K> d_keys = h_keys;
+        thrust::device_vector<V> d_vals = h_vals;
+
+        thrust::host_vector<V>   h_vals_output(n);
+        thrust::device_vector<V> d_vals_output(n);
+
+        thrust::host_vector<K>   h_keys_output(n);
+        thrust::device_vector<K> d_keys_output(n);
+
+        thrust::host_vector<K> h_unique_keys = h_keys;
+        h_unique_keys.erase(thrust::unique(h_unique_keys.begin(), h_unique_keys.end()), h_unique_keys.end());
+
+        size_t num_unique_keys = h_unique_keys.size();
+
+
+        // mask both outputs
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
+          thrust::unique_by_key_copy(h_keys.begin(), h_keys.end(),
+                                     h_vals.begin(),
+                                     thrust::make_discard_iterator(),
+                                     thrust::make_discard_iterator());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > d_result1 =
+          thrust::unique_by_key_copy(d_keys.begin(), d_keys.end(),
+                                     d_vals.begin(),
+                                     thrust::make_discard_iterator(),
+                                     thrust::make_discard_iterator());
+
+        thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > reference1 =
+          thrust::make_pair(thrust::make_discard_iterator(num_unique_keys),
+                            thrust::make_discard_iterator(num_unique_keys));
+
+        ASSERT_EQUAL_QUIET(reference1, h_result1);
+        ASSERT_EQUAL_QUIET(reference1, d_result1);
+
+
+        // mask values output
+        thrust::pair<typename thrust::host_vector<K>::iterator, thrust::discard_iterator<> > h_result2 =
+          thrust::unique_by_key_copy(h_keys.begin(), h_keys.end(),
+                                     h_vals.begin(),
+                                     h_keys_output.begin(),
+                                     thrust::make_discard_iterator());
+
+        thrust::pair<typename thrust::device_vector<K>::iterator, thrust::discard_iterator<> > d_result2 =
+          thrust::unique_by_key_copy(d_keys.begin(), d_keys.end(),
+                                     d_vals.begin(),
+                                     d_keys_output.begin(),
+                                     thrust::make_discard_iterator());
+
+        thrust::pair<typename thrust::host_vector<K>::iterator, thrust::discard_iterator<> > h_reference2 =
+          thrust::make_pair(h_keys_output.begin() + num_unique_keys,
+                            thrust::make_discard_iterator(num_unique_keys));
+
+        thrust::pair<typename thrust::device_vector<K>::iterator, thrust::discard_iterator<> > d_reference2 =
+          thrust::make_pair(d_keys_output.begin() + num_unique_keys,
+                            thrust::make_discard_iterator(num_unique_keys));
+
+        ASSERT_EQUAL(h_keys_output, d_keys_output);
+        ASSERT_EQUAL_QUIET(h_reference2, h_result2);
+        ASSERT_EQUAL_QUIET(d_reference2, d_result2);
+
+
+        // mask keys output
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<V>::iterator> h_result3 =
+          thrust::unique_by_key_copy(h_keys.begin(), h_keys.end(),
+                                     h_vals.begin(),
+                                     thrust::make_discard_iterator(),
+                                     h_vals_output.begin());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<V>::iterator> d_result3 =
+          thrust::unique_by_key_copy(d_keys.begin(), d_keys.end(),
+                                     d_vals.begin(),
+                                     thrust::make_discard_iterator(),
+                                     d_vals_output.begin());
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::host_vector<V>::iterator> h_reference3 =
+          thrust::make_pair(thrust::make_discard_iterator(num_unique_keys),
+                            h_vals_output.begin() + num_unique_keys);
+
+        thrust::pair<thrust::discard_iterator<>, typename thrust::device_vector<V>::iterator> d_reference3 =
+          thrust::make_pair(thrust::make_discard_iterator(num_unique_keys),
+                            d_vals_output.begin() + num_unique_keys);
+
+        ASSERT_EQUAL(h_vals_output, d_vals_output);
+        ASSERT_EQUAL_QUIET(h_reference3, h_result3);
+        ASSERT_EQUAL_QUIET(d_reference3, d_result3);
+    }
+};
+VariableUnitTest<TestUniqueCopyByKeyToDiscardIterator, IntegralTypes> TestUniqueCopyByKeyToDiscardIteratorInstance;
+
diff --git a/thrust/testing/unittest/CMakeLists.txt b/thrust/testing/unittest/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9a652577be3696ffdc0ca8c6974b52cac73ac8f8
--- /dev/null
+++ b/thrust/testing/unittest/CMakeLists.txt
@@ -0,0 +1,21 @@
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  set(framework_target ${config_prefix}.test.framework)
+
+  if ("CUDA" STREQUAL "${config_device}")
+    set(framework_srcs
+      testframework.cu
+      cuda/testframework.cu
+    )
+  else()
+    # Wrap the cu file inside a .cpp file for non-CUDA builds
+    thrust_wrap_cu_in_cpp(framework_srcs testframework.cu ${thrust_target})
+  endif()
+
+  add_library(${framework_target} STATIC ${framework_srcs})
+  target_link_libraries(${framework_target} PUBLIC ${thrust_target})
+  target_include_directories(${framework_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${framework_target} ${thrust_target})
+endforeach()
diff --git a/thrust/testing/unittest/assertions.h b/thrust/testing/unittest/assertions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6803e8168d85d0493c3fa1371aedb224443e399d
--- /dev/null
+++ b/thrust/testing/unittest/assertions.h
@@ -0,0 +1,593 @@
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+
+#include <unittest/exceptions.h>
+#include <unittest/util.h>
+
+#define ASSERT_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)           unittest::assert_equal((X),(Y), FILE_,  LINE_)
+#define ASSERT_EQUAL_QUIET_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)     unittest::assert_equal_quiet((X),(Y), FILE_, LINE_)
+#define ASSERT_NOT_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)       unittest::assert_not_equal((X),(Y), FILE_,  LINE_)
+#define ASSERT_NOT_EQUAL_QUIET_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_) unittest::assert_not_equal_quiet((X),(Y), FILE_, LINE_)
+#define ASSERT_LEQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)          unittest::assert_lequal((X),(Y), FILE_,  LINE_)
+#define ASSERT_GEQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)          unittest::assert_gequal((X),(Y), FILE_,  LINE_)
+#define ASSERT_LESS_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)            unittest::assert_less((X),(Y), FILE_,  LINE_)
+#define ASSERT_GREATER_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)         unittest::assert_greater((X),(Y), FILE_,  LINE_)
+#define ASSERT_ALMOST_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)    unittest::assert_almost_equal((X),(Y), FILE_, LINE_)
+#define ASSERT_EQUAL_RANGES_WITH_FILE_AND_LINE(X,Y,Z,FILE_,LINE_)  unittest::assert_equal((X),(Y),(Z), FILE_,  LINE_)
+
+#define ASSERT_THROWS_WITH_FILE_AND_LINE(                                     \
+  EXPR, EXCEPTION_TYPE, FILE_, LINE_                                          \
+)                                                                             \
+  {                                                                           \
+    unittest::threw_status THRUST_PP_CAT2(__s, LINE_)                         \
+      = unittest::did_not_throw;                                              \
+    try { EXPR; }                                                             \
+    catch (EXCEPTION_TYPE const&)                                             \
+    { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_right_type; }              \
+    catch (...)                                                               \
+    { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_wrong_type; }              \
+    unittest::check_assert_throws(                                            \
+      THRUST_PP_CAT2(__s, LINE_), THRUST_PP_STRINGIZE(EXCEPTION_TYPE)         \
+    , FILE_, LINE_                                                            \
+    );                                                                        \
+  }                                                                           \
+  /**/
+
+#define ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(                               \
+  EXPR, EXCEPTION_TYPE, VALUE, FILE_, LINE_                                   \
+)                                                                             \
+  {                                                                           \
+    unittest::threw_status THRUST_PP_CAT2(__s, LINE_)                         \
+      = unittest::did_not_throw;                                              \
+    try { EXPR; }                                                             \
+    catch (EXCEPTION_TYPE const& THRUST_PP_CAT2(__e, LINE_))                  \
+    {                                                                         \
+      if (VALUE == THRUST_PP_CAT2(__e, LINE_))                                \
+        THRUST_PP_CAT2(__s, LINE_)                                            \
+          = unittest::threw_right_type;                                       \
+      else                                                                    \
+        THRUST_PP_CAT2(__s, LINE_)                                            \
+          = unittest::threw_right_type_but_wrong_value;                       \
+    }                                                                         \
+    catch (...) { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_wrong_type; }  \
+    unittest::check_assert_throws(                                            \
+      THRUST_PP_CAT2(__s, LINE_), THRUST_PP_STRINGIZE(EXCEPTION_TYPE)         \
+    , FILE_, LINE_                                                            \
+    );                                                                        \
+  }                                                                           \
+  /**/
+
+#define KNOWN_FAILURE_WITH_FILE_AND_LINE(FILE_, LINE_)                                  \
+  { unittest::UnitTestKnownFailure f; f << "[" << FILE_ ":" << LINE_ << "]"; throw f; } \
+  /**/
+
+#define ASSERT_EQUAL(X,Y)           ASSERT_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_EQUAL_QUIET(X,Y)     ASSERT_EQUAL_QUIET_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_NOT_EQUAL(X,Y)       ASSERT_NOT_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_NOT_EQUAL_QUIET(X,Y) ASSERT_NOT_EQUAL_QUIET_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_LEQUAL(X,Y)          ASSERT_LEQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_GEQUAL(X,Y)          ASSERT_GEQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_LESS(X,Y)            ASSERT_LESS_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_GREATER(X,Y)         ASSERT_GREATER_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_ALMOST_EQUAL(X,Y)    ASSERT_ALMOST_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_EQUAL_RANGES(X,Y,Z)  ASSERT_EQUAL_WITH_FILE_AND_LINE((X),(Y),(Z), __FILE__,  __LINE__)
+
+#define ASSERT_THROWS(EXPR, EXCEPTION_TYPE)                                   \
+  ASSERT_THROWS_WITH_FILE_AND_LINE(EXPR, EXCEPTION_TYPE, __FILE__, __LINE__)  \
+  /**/
+
+#define ASSERT_THROWS_EQUAL(EXPR, EXCEPTION_TYPE, VALUE)                                  \
+  ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(EXPR, EXCEPTION_TYPE, VALUE, __FILE__, __LINE__) \
+  /**/
+
+#define KNOWN_FAILURE KNOWN_FAILURE_WITH_FILE_AND_LINE(__FILE__, __LINE__)
+
+namespace unittest
+{
+
+size_t const MAX_OUTPUT_LINES = 10;
+
+double const DEFAULT_RELATIVE_TOL = 1e-4;
+double const DEFAULT_ABSOLUTE_TOL = 1e-4;
+
+template<typename T>
+  struct value_type
+{
+  typedef typename thrust::detail::remove_const<
+    typename thrust::detail::remove_reference<
+      T
+    >::type
+  >::type type;
+};
+
+template<typename T>
+  struct value_type< thrust::device_reference<T> >
+{
+  typedef typename value_type<T>::type type;
+};
+
+////
+// check scalar values
+template <typename T1, typename T2>
+void assert_equal(T1 a, T2 b,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a == b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not equal: " << a << " " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_equal(char a, char b,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a == b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not equal: " << int(a) << " " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+// sometimes its not possible to << a type
+template <typename T1, typename T2>
+void assert_equal_quiet(const T1& a, const T2& b,
+                        const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a == b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not equal";
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+////
+// check scalar values
+template <typename T1, typename T2>
+void assert_not_equal(T1 a, T2 b,
+                      const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal: " << a << " " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_not_equal(char a, char b,
+                      const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal: " << int(a) << " " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+// sometimes its not possible to << a type
+template <typename T1, typename T2>
+void assert_not_equal_quiet(const T1& a, const T2& b,
+                            const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal";
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_less(T1 a, T2 b,
+                 const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a < b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is greater or equal to " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_less(char a, char b,
+                 const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a < b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is greater than or equal to " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_greater(T1 a, T2 b,
+                    const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a > b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is less than or equal to " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_greater(char a, char b,
+                    const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a > b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is less than or equal to " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_lequal(T1 a, T2 b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a <= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is greater than " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_lequal(char a, char b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a <= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is greater than " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_gequal(T1 a, T2 b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a >= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is less than " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_gequal(char a, char b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a >= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is less than " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+// define our own abs() because std::abs() isn't portable for all types for some reason
+template<typename T>
+  T abs(const T &x)
+{
+  return x > 0 ? x : -x;
+}
+
+
+inline
+bool almost_equal(const double& a, const double& b, const double& a_tol, const double& r_tol)
+{
+    if(abs(a - b) > r_tol * (abs(a) + abs(b)) + a_tol)
+        return false;
+    else
+        return true;
+}
+
+template <typename T1, typename T2>
+void assert_almost_equal(T1 a, T2 b,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
+
+{
+    if(!almost_equal(a, b, a_tol, r_tol)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not approximately equal: " << (double) a << " " << (double) b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+
+template <typename T1, typename T2>
+void assert_almost_equal(thrust::complex<T1> a, thrust::complex<T2> b,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
+
+{
+  if(!almost_equal(a.real(), b.real(), a_tol, r_tol)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not approximately equal: " <<  a << " " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+
+template <typename T1, typename T2>
+  void assert_almost_equal(const thrust::complex<T1>& a, const std::complex<T2>& b,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
+
+{
+  if(!almost_equal(a.real(), b.real(), a_tol, r_tol)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not approximately equal: " <<  a << " " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+template <typename T>
+class almost_equal_to
+{
+    public:
+        double a_tol, r_tol;
+        almost_equal_to(double _a_tol = DEFAULT_ABSOLUTE_TOL, double _r_tol = DEFAULT_RELATIVE_TOL) : a_tol(_a_tol), r_tol(_r_tol) {}
+        bool operator()(const T& a, const T& b) const {
+            return almost_equal((double) a, (double) b, a_tol, r_tol);
+        }
+};
+
+
+template <typename T>
+class almost_equal_to<thrust::complex<T> >
+{
+    public:
+        double a_tol, r_tol;
+        almost_equal_to(double _a_tol = DEFAULT_ABSOLUTE_TOL, double _r_tol = DEFAULT_RELATIVE_TOL) : a_tol(_a_tol), r_tol(_r_tol) {}
+        bool operator()(const thrust::complex<T>& a, const thrust::complex<T>& b) const {
+            return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol) 
+                && almost_equal((double) a.imag(), (double) b.imag(), a_tol, r_tol);
+        }
+};
+
+////
+// check sequences
+
+template <typename ForwardIterator1, typename ForwardIterator2, typename BinaryPredicate>
+void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate op,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    typedef typename thrust::iterator_difference<ForwardIterator1>::type difference_type;
+    typedef typename thrust::iterator_value<ForwardIterator1>::type InputType;
+    
+    bool failure = false;
+
+    difference_type length1 = thrust::distance(first1, last1);
+    difference_type length2 = thrust::distance(first2, last2);
+    
+    difference_type min_length = thrust::min(length1, length2);
+
+    unittest::UnitTestFailure f;
+    f << "[" << filename << ":" << lineno << "] ";
+
+    // check lengths
+    if (length1 != length2)
+    {
+      failure = true;
+      f << "Sequences have different sizes (" << length1 << " != " << length2 << ")\n";
+    }
+
+    // check values
+    
+    size_t mismatches = 0;
+
+    for (difference_type i = 0; i < min_length; i++)
+    {
+      if(!op(*first1, *first2))
+      {
+        if (mismatches == 0)
+        {
+          failure = true;
+          f << "Sequences are not equal [type='" << type_name<InputType>() << "']\n";
+          f << "--------------------------------\n";
+        }
+
+        mismatches++;
+
+        if(mismatches <= MAX_OUTPUT_LINES)
+        {
+          if (sizeof(InputType) == 1)
+            f << "  [" << i << "] " << *first1 + InputType() << "  " << *first2 + InputType() << "\n"; // unprintable chars are a problem
+          else
+            f << "  [" << i << "] " << *first1 << "  " << *first2 << "\n";
+        }
+      }
+
+      first1++;
+      first2++;
+    }
+
+    if (mismatches > 0)
+    {
+      if(mismatches > MAX_OUTPUT_LINES)
+          f << "  (output limit reached)\n";
+      f << "--------------------------------\n";
+      f << "Sequences differ at " << mismatches << " of " << min_length << " positions" << "\n";
+    }
+    else if (length1 != length2)
+    {
+      f << "Sequences agree through " << min_length << " positions [type='" << type_name<InputType>() << "']\n";
+    }
+
+    if (failure)
+      throw f;
+}
+
+template <typename ForwardIterator1, typename ForwardIterator2>
+void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
+    assert_equal(first1, last1, first2, last2, thrust::equal_to<InputType>(), filename, lineno);
+}
+
+
+template <typename ForwardIterator1, typename ForwardIterator2>
+void assert_almost_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
+    assert_equal(first1, last1, first2, last2, almost_equal_to<InputType>(a_tol, r_tol), filename, lineno);
+}
+
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    thrust::host_vector<T,Alloc1> B_host = B;
+    assert_equal(A, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    thrust::host_vector<T,Alloc2> A_host = A;
+    assert_equal(A_host, B, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    thrust::host_vector<T> A_host = A;
+    thrust::host_vector<T> B_host = B;
+    assert_equal(A_host, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    thrust::host_vector<T,Alloc1> B_host = B;
+    assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    thrust::host_vector<T,Alloc2> A_host = A;
+    assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    thrust::host_vector<T> A_host = A;
+    thrust::host_vector<T> B_host = B;
+    assert_almost_equal(A_host, B_host, filename, lineno, a_tol, r_tol);
+}
+
+enum threw_status
+{
+  did_not_throw
+, threw_wrong_type
+, threw_right_type_but_wrong_value
+, threw_right_type
+};
+
+void check_assert_throws(
+  threw_status s
+, std::string const& exception_name
+, std::string const& file_name = "unknown"
+, int line_number = -1
+)
+{
+  switch (s)
+  {
+    case did_not_throw:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] did not throw anything";
+      throw f;
+    }
+    case threw_wrong_type:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] did not throw an "
+        << "object of type " << exception_name;
+      throw f;
+    }
+    case threw_right_type_but_wrong_value:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] threw an object of the "
+        << "correct type (" << exception_name << ") but wrong value";
+      throw f;
+    }
+    case threw_right_type:
+      break;
+    default:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] encountered an "
+        << "unknown error";
+      throw f;
+    }
+  }
+}
+
+}; //end namespace unittest
diff --git a/thrust/testing/unittest/cuda/testframework.cu b/thrust/testing/unittest/cuda/testframework.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a8bc52ea4e04488807f740c9a83d8d46fd7c3e0e
--- /dev/null
+++ b/thrust/testing/unittest/cuda/testframework.cu
@@ -0,0 +1,219 @@
+#include <unittest/testframework.h>
+#include <unittest/cuda/testframework.h>
+#include <thrust/system/cuda/memory.h>
+#include <cuda_runtime.h>
+#include <numeric>
+
+__global__ void dummy_kernel() {}
+
+bool binary_exists_for_current_device()
+{
+  // check against the dummy_kernel
+  // if we're unable to get the attributes, then
+  // we didn't compile a binary compatible with the current device
+  cudaFuncAttributes attr;
+  cudaError_t error = cudaFuncGetAttributes(&attr, dummy_kernel);
+
+  // clear the CUDA global error state if we just set it, so that
+  // check_cuda_error doesn't complain
+  if (cudaSuccess != error) (void)cudaGetLastError();
+
+  return cudaSuccess == error;
+}
+
+void list_devices(void)
+{
+  int deviceCount;
+  cudaGetDeviceCount(&deviceCount);
+  if(deviceCount == 0)
+  {
+    std::cout << "There is no device supporting CUDA" << std::endl;
+  }
+  
+  int selected_device;
+  cudaGetDevice(&selected_device);
+  
+  for (int dev = 0; dev < deviceCount; ++dev)
+  {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, dev);
+    
+    if(dev == 0)
+    {
+      if(deviceProp.major == 9999 && deviceProp.minor == 9999)
+        std::cout << "There is no device supporting CUDA." << std::endl;
+      else if(deviceCount == 1)
+        std::cout << "There is 1 device supporting CUDA" << std:: endl;
+      else
+        std::cout << "There are " << deviceCount <<  " devices supporting CUDA" << std:: endl;
+    }
+    
+    std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\"";
+    if(dev == selected_device)
+      std::cout << "  [SELECTED]";
+    std::cout << std::endl;
+    
+    std::cout << "  Major revision number:                         " << deviceProp.major << std::endl;
+    std::cout << "  Minor revision number:                         " << deviceProp.minor << std::endl;
+    std::cout << "  Total amount of global memory:                 " << deviceProp.totalGlobalMem << " bytes" << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+// provide next, which c++03 doesn't have
+template<typename Iterator> Iterator my_next(Iterator iter)
+{
+  return ++iter;
+}
+
+
+std::vector<int> CUDATestDriver::target_devices(const ArgumentMap &kwargs)
+{
+  std::vector<int> result;
+  
+  // by default, test all devices in the system (device id -1)
+  int device_id = kwargs.count("device") ? atoi(kwargs.find("device")->second.c_str()) : -1;
+  
+  if(device_id < 0)
+  {
+    // target all devices in the system
+    int count = 0;
+    cudaGetDeviceCount(&count);
+    
+    result.resize(count);
+    std::iota(result.begin(), result.end(), 0);
+  }
+  else
+  {
+    // target the specified device
+    result = std::vector<int>(1,device_id);
+  }
+  
+  return result;
+}
+
+bool CUDATestDriver::check_cuda_error(bool concise)
+{
+  cudaError_t const error = cudaGetLastError();
+  if(cudaSuccess != error)
+  {
+    if(!concise)
+    {
+      std::cout << "[ERROR] CUDA error detected before running tests: ["
+                << std::string(cudaGetErrorName(error))
+                << ": "
+                << std::string(cudaGetErrorString(error))
+                << "]" << std::endl;
+    }
+  } 
+
+  return cudaSuccess != error;
+}
+
+bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
+{
+  cudaError_t const error = cudaDeviceSynchronize();
+  if(cudaSuccess != error)
+  {
+    if(!concise)
+    {
+      std::cout << "\t[ERROR] CUDA error detected after running " << test.name << ": ["
+                << std::string(cudaGetErrorName(error))
+                << ": "
+                << std::string(cudaGetErrorString(error))
+                << "]" << std::endl;
+    }
+  }
+
+  return cudaSuccess == error;
+}
+  
+bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwargs)
+{
+  bool verbose = kwargs.count("verbose");
+  bool concise = kwargs.count("concise");
+
+  if(verbose && concise)
+  {
+    std::cout << "--verbose and --concise cannot be used together" << std::endl;
+    exit(EXIT_FAILURE);
+    return false;
+  }
+
+  // check error status before doing anything
+  if(check_cuda_error(concise)) return false;
+  
+  bool result = true;
+
+  if(kwargs.count("verbose"))
+  {
+    list_devices();
+  }
+  
+  // figure out which devices to target
+  std::vector<int> devices = target_devices(kwargs);
+  
+  // target each device
+  for(std::vector<int>::iterator device = devices.begin();
+      device != devices.end();
+      ++device)
+  {
+    cudaDeviceSynchronize();
+
+    // set the device
+    cudaSetDevice(*device);
+
+    // check if a binary exists for this device
+    // if none exists, skip the device silently unless this is the only one we're targeting
+    if(devices.size() > 1 && !binary_exists_for_current_device())
+    {
+      // note which device we're skipping
+      cudaDeviceProp deviceProp;
+      cudaGetDeviceProperties(&deviceProp, *device);
+      
+      std::cout << "Skipping Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
+
+      continue;
+    }
+
+    if(!concise)
+    {
+      // note which device we're testing
+      cudaDeviceProp deviceProp;
+      cudaGetDeviceProperties(&deviceProp, *device);
+      
+      std::cout << "Testing Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
+    }
+
+    // check error status before running any tests
+    if(check_cuda_error(concise)) return false;
+    
+    // run tests
+    result &= UnitTestDriver::run_tests(args, kwargs);
+    
+    if(!concise && my_next(device) != devices.end())
+    {
+      // provide some separation between the output of separate tests
+      std::cout << std::endl;
+    }
+  }
+  
+  return result;
+}
+
+int CUDATestDriver::current_device_architecture() const
+{
+  int current = -1;
+  cudaGetDevice(&current);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, current);
+
+  return 100 * deviceProp.major + 10 * deviceProp.minor;
+}
+
+UnitTestDriver &driver_instance(thrust::system::cuda::tag)
+{
+  static CUDATestDriver s_instance;
+  return s_instance;
+}
+
diff --git a/thrust/testing/unittest/cuda/testframework.h b/thrust/testing/unittest/cuda/testframework.h
new file mode 100644
index 0000000000000000000000000000000000000000..953f88c1c546d9893bac28f0ec38d31f9af93031
--- /dev/null
+++ b/thrust/testing/unittest/cuda/testframework.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <unittest/testframework.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/system_error.h>
+#include <vector>
+
+class CUDATestDriver
+  : public UnitTestDriver
+{
+  public:
+    int current_device_architecture() const;
+
+  private:
+    std::vector<int> target_devices(const ArgumentMap &kwargs);
+
+    bool check_cuda_error(bool concise);
+
+    virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+
+    virtual bool run_tests(const ArgumentSet &args, const ArgumentMap &kwargs);
+};
+
+UnitTestDriver &driver_instance(thrust::system::cuda::tag);
+
diff --git a/thrust/testing/unittest/exceptions.h b/thrust/testing/unittest/exceptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f3633fd6c0c2e1b31a25af0b53cf2e4cbe9d9df
--- /dev/null
+++ b/thrust/testing/unittest/exceptions.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <string>
+#include <iostream>
+#include <sstream>
+
+namespace unittest
+{
+
+class UnitTestException 
+{
+    public:
+    std::string message;
+
+    UnitTestException() {}
+    UnitTestException(const std::string& msg) : message(msg) {}
+
+    friend std::ostream& operator<<(std::ostream& os, const UnitTestException& e)
+    { 
+        return os << e.message;  
+    }
+
+    template <typename T>
+    UnitTestException& operator<<(const T& t) 
+    {
+        std::ostringstream oss;
+        oss << t;
+        message += oss.str();
+        return *this;
+    }
+};
+
+
+class UnitTestError   : public UnitTestException 
+{
+    public:
+    UnitTestError() {}
+    UnitTestError(const std::string& msg) : UnitTestException(msg) {}
+};
+
+class UnitTestFailure : public UnitTestException
+{
+    public:
+    UnitTestFailure() {}
+    UnitTestFailure(const std::string& msg) : UnitTestException(msg) {}
+};
+
+class UnitTestKnownFailure : public UnitTestException
+{
+    public:
+    UnitTestKnownFailure() {}
+    UnitTestKnownFailure(const std::string& msg) : UnitTestException(msg) {}
+};
+
+
+}; //end namespace unittest
diff --git a/thrust/testing/unittest/meta.h b/thrust/testing/unittest/meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..39c62edb645361dcb9064b439b9dfc4d86b741e0
--- /dev/null
+++ b/thrust/testing/unittest/meta.h
@@ -0,0 +1,260 @@
+/*! \file meta.h
+ *  \brief Defines template classes
+ *         for metaprogramming in the
+ *         unit tests.
+ */
+
+#pragma once
+
+namespace unittest
+{
+
+// mark the absence of a type
+struct null_type {}; 
+
+// this type encapsulates a list of
+// up to 10 types
+template<typename T0 = null_type,
+         typename T1 = null_type,
+         typename T2 = null_type,
+         typename T3 = null_type,
+         typename T4 = null_type,
+         typename T5 = null_type,
+         typename T6 = null_type,
+         typename T7 = null_type,
+         typename T8 = null_type,
+         typename T9 = null_type,
+         typename T10 = null_type,
+         typename T11 = null_type,
+         typename T12 = null_type,
+         typename T13 = null_type,
+         typename T14 = null_type,
+         typename T15 = null_type,
+         typename T16 = null_type,
+         typename T17 = null_type,
+         typename T18 = null_type,
+         typename T19 = null_type>
+  struct type_list
+{
+  typedef T0 type_0;
+  typedef T1 type_1;
+  typedef T2 type_2;
+  typedef T3 type_3;
+  typedef T4 type_4;
+  typedef T5 type_5;
+  typedef T6 type_6;
+  typedef T7 type_7;
+  typedef T8 type_8;
+  typedef T9 type_9;
+  typedef T10 type_10;
+  typedef T11 type_11;
+  typedef T12 type_12;
+  typedef T13 type_13;
+  typedef T14 type_14;
+  typedef T15 type_15;
+  typedef T16 type_16;
+  typedef T17 type_17;
+  typedef T18 type_18;
+  typedef T19 type_19;
+};
+
+// this type provides a way of indexing
+// into a type_list
+template<typename List, unsigned int i>
+  struct get_type
+{
+  typedef null_type type;
+};
+
+template<typename List>  struct get_type<List,0> { typedef typename List::type_0 type; };
+template<typename List>  struct get_type<List,1> { typedef typename List::type_1 type; };
+template<typename List>  struct get_type<List,2> { typedef typename List::type_2 type; };
+template<typename List>  struct get_type<List,3> { typedef typename List::type_3 type; };
+template<typename List>  struct get_type<List,4> { typedef typename List::type_4 type; };
+template<typename List>  struct get_type<List,5> { typedef typename List::type_5 type; };
+template<typename List>  struct get_type<List,6> { typedef typename List::type_6 type; };
+template<typename List>  struct get_type<List,7> { typedef typename List::type_7 type; };
+template<typename List>  struct get_type<List,8> { typedef typename List::type_8 type; };
+template<typename List>  struct get_type<List,9> { typedef typename List::type_9 type; };
+template<typename List>  struct get_type<List,10> { typedef typename List::type_10 type; };
+template<typename List>  struct get_type<List,11> { typedef typename List::type_11 type; };
+template<typename List>  struct get_type<List,12> { typedef typename List::type_12 type; };
+template<typename List>  struct get_type<List,13> { typedef typename List::type_13 type; };
+template<typename List>  struct get_type<List,14> { typedef typename List::type_14 type; };
+template<typename List>  struct get_type<List,15> { typedef typename List::type_15 type; };
+template<typename List>  struct get_type<List,16> { typedef typename List::type_16 type; };
+template<typename List>  struct get_type<List,17> { typedef typename List::type_17 type; };
+template<typename List>  struct get_type<List,18> { typedef typename List::type_18 type; };
+template<typename List>  struct get_type<List,19> { typedef typename List::type_19 type; };
+
+// this type and its specialization provides a way to
+// iterate over a type_list, and
+// applying a unary function to each type
+template<typename TypeList,
+         template <typename> class Function,
+         typename T,
+         unsigned int i = 0>
+  struct for_each_type
+{
+  template<typename U>
+    void operator()(U n)
+  {
+    // run the function on type T
+    Function<T> f;
+    f(n);
+
+    // get the next type
+    typedef typename get_type<TypeList,i+1>::type next_type;
+
+    // recurse to i + 1
+    for_each_type<TypeList, Function, next_type, i + 1> loop;
+    loop(n);
+  }
+
+  void operator()(void)
+  {
+    // run the function on type T
+    Function<T> f;
+    f();
+
+    // get the next type
+    typedef typename get_type<TypeList,i+1>::type next_type;
+
+    // recurse to i + 1
+    for_each_type<TypeList, Function, next_type, i + 1> loop;
+    loop();
+  }
+};
+
+// terminal case: do nothing when encountering null_type
+template<typename TypeList,
+         template <typename> class Function,
+         unsigned int i>
+  struct for_each_type<TypeList, Function, null_type, i>
+{
+  template<typename U>
+    void operator()(U)
+  {
+    // no-op
+  }
+
+  void operator()(void)
+  {
+    // no-op
+  }
+};
+
+// this type and its specialization instantiates
+// a template by applying T to Template.
+// if T == null_type, then its result is also null_type
+template<template <typename> class Template,
+         typename T>
+  struct ApplyTemplate1
+{
+  typedef Template<T> type;
+};
+
+template<template <typename> class Template>
+  struct ApplyTemplate1<Template, null_type>
+{
+  typedef null_type type;
+};
+
+// this type and its specializations instantiates
+// a template by applying T1 & T2 to Template.
+// if either T1 or T2 == null_type, then its result
+// is also null_type
+template<template <typename,typename> class Template,
+         typename T1,
+         typename T2>
+  struct ApplyTemplate2
+{
+  typedef Template<T1,T2> type;
+};
+
+template<template <typename,typename> class Template,
+         typename T>
+  struct ApplyTemplate2<Template, T, null_type>
+{
+  typedef null_type type;
+};
+
+template<template <typename,typename> class Template,
+         typename T>
+  struct ApplyTemplate2<Template, null_type, T>
+{
+  typedef null_type type;
+};
+
+template<template <typename,typename> class Template>
+  struct ApplyTemplate2<Template, null_type, null_type>
+{
+  typedef null_type type;
+};
+
+// this type creates a new type_list by applying a Template to each of
+// the Type_list's types
+template<typename TypeList,
+         template <typename> class Template>
+  struct transform1
+{
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,0>::type>::type type_0;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,1>::type>::type type_1;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,2>::type>::type type_2;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,3>::type>::type type_3;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,4>::type>::type type_4;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,5>::type>::type type_5;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,6>::type>::type type_6;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,7>::type>::type type_7;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,8>::type>::type type_8;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,9>::type>::type type_9;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,10>::type>::type type_10;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,11>::type>::type type_11;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,12>::type>::type type_12;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,13>::type>::type type_13;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,14>::type>::type type_14;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,15>::type>::type type_15;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,16>::type>::type type_16;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,17>::type>::type type_17;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,18>::type>::type type_18;
+  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,19>::type>::type type_19;
+
+  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
+                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+};
+
+// this type creates a new type_list by applying a Template to each of
+// two type_list's types
+template<typename TypeList1,
+         typename TypeList2,
+         template <typename,typename> class Template>
+  struct transform2
+{
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,0>::type, typename get_type<TypeList2,0>::type>::type type_0;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,1>::type, typename get_type<TypeList2,1>::type>::type type_1;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,2>::type, typename get_type<TypeList2,2>::type>::type type_2;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,3>::type, typename get_type<TypeList2,3>::type>::type type_3;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,4>::type, typename get_type<TypeList2,4>::type>::type type_4;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,5>::type, typename get_type<TypeList2,5>::type>::type type_5;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,6>::type, typename get_type<TypeList2,6>::type>::type type_6;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,7>::type, typename get_type<TypeList2,7>::type>::type type_7;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,8>::type, typename get_type<TypeList2,8>::type>::type type_8;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,9>::type, typename get_type<TypeList2,9>::type>::type type_9;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,10>::type, typename get_type<TypeList2,10>::type>::type type_10;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,11>::type, typename get_type<TypeList2,11>::type>::type type_11;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,12>::type, typename get_type<TypeList2,12>::type>::type type_12;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,13>::type, typename get_type<TypeList2,13>::type>::type type_13;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,14>::type, typename get_type<TypeList2,14>::type>::type type_14;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,15>::type, typename get_type<TypeList2,15>::type>::type type_15;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,16>::type, typename get_type<TypeList2,16>::type>::type type_16;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,17>::type, typename get_type<TypeList2,17>::type>::type type_17;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,18>::type, typename get_type<TypeList2,18>::type>::type type_18;
+  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,19>::type, typename get_type<TypeList2,19>::type>::type type_19;
+  
+
+  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
+                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+};
+
+} // end unittest
+
diff --git a/thrust/testing/unittest/random.h b/thrust/testing/unittest/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..924c0f0e167e1ce67191b075cca2c4f1ea7c655e
--- /dev/null
+++ b/thrust/testing/unittest/random.h
@@ -0,0 +1,134 @@
+#pragma once
+
+#include <thrust/host_vector.h>
+#include <thrust/random.h>
+#include <thrust/detail/type_traits.h>
+
+#include <limits>
+
+namespace unittest
+{
+
+inline unsigned int hash(unsigned int a)
+{
+    a = (a+0x7ed55d16) + (a<<12);
+    a = (a^0xc761c23c) ^ (a>>19);
+    a = (a+0x165667b1) + (a<<5);
+    a = (a+0xd3a2646c) ^ (a<<9);
+    a = (a+0xfd7046c5) + (a<<3);
+    a = (a^0xb55a4f09) ^ (a>>16);
+    return a;
+}
+
+template<typename T, typename = void>
+  struct generate_random_integer;
+
+template<typename T>
+  struct generate_random_integer<T,
+    typename thrust::detail::disable_if<
+      thrust::detail::is_non_bool_arithmetic<T>::value
+    >::type
+  >
+{
+  T operator()(unsigned int i) const
+  {
+      thrust::default_random_engine rng(hash(i));
+
+      return static_cast<T>(rng());
+  }
+};
+
+template<typename T>
+  struct generate_random_integer<T,
+    typename thrust::detail::enable_if<
+      thrust::detail::is_non_bool_integral<T>::value
+    >::type
+  >
+{
+  T operator()(unsigned int i) const
+  {
+      thrust::default_random_engine rng(hash(i));
+      thrust::uniform_int_distribution<T> dist;
+
+      return static_cast<T>(dist(rng));
+  }
+};
+
+template<typename T>
+  struct generate_random_integer<T,
+    typename thrust::detail::enable_if<
+      thrust::detail::is_floating_point<T>::value
+    >::type
+  >
+{
+  T operator()(unsigned int i) const
+  {
+      T const min = std::numeric_limits<T>::min();
+      T const max = std::numeric_limits<T>::max();
+
+      thrust::default_random_engine rng(hash(i));
+      thrust::uniform_real_distribution<T> dist(min, max);
+
+      return static_cast<T>(dist(rng));
+  }
+};
+
+template<>
+  struct generate_random_integer<bool>
+{
+  bool operator()(unsigned int i) const
+  {
+      thrust::default_random_engine rng(hash(i));
+      thrust::uniform_int_distribution<unsigned int> dist(0,1);
+
+      return dist(rng) == 1;
+  }
+};
+
+
+template<typename T>
+  struct generate_random_sample
+{
+  T operator()(unsigned int i) const
+  {
+      thrust::default_random_engine rng(hash(i));
+      thrust::uniform_int_distribution<unsigned int> dist(0,20);
+
+      return static_cast<T>(dist(rng));
+  } 
+}; 
+
+
+
+template<typename T>
+thrust::host_vector<T> random_integers(const size_t N)
+{
+    thrust::host_vector<T> vec(N);
+    thrust::transform(thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                      thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                      vec.begin(),
+                      generate_random_integer<T>());
+
+    return vec;
+}
+
+template<typename T>
+T random_integer()
+{
+    return generate_random_integer<T>()(0);
+}
+
+template<typename T>
+thrust::host_vector<T> random_samples(const size_t N)
+{
+    thrust::host_vector<T> vec(N);
+    thrust::transform(thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                      thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                      vec.begin(),
+                      generate_random_sample<T>());
+
+    return vec;
+}
+
+}; //end namespace unittest
+
diff --git a/thrust/testing/unittest/runtime_static_assert.h b/thrust/testing/unittest/runtime_static_assert.h
new file mode 100644
index 0000000000000000000000000000000000000000..13d8b68a9dc94500a1d82112779ac38a0a1d05b7
--- /dev/null
+++ b/thrust/testing/unittest/runtime_static_assert.h
@@ -0,0 +1,96 @@
+#pragma once
+
+#include <string>
+
+#include <thrust/detail/static_assert.h>
+#undef THRUST_STATIC_ASSERT
+#undef THRUST_STATIC_ASSERT_MSG
+
+#define THRUST_STATIC_ASSERT(B) unittest::assert_static((B), __FILE__, __LINE__);
+#define THRUST_STATIC_ASSERT_MSG(B, msg) unittest::assert_static((B), __FILE__, __LINE__);
+
+namespace unittest
+{
+    __host__ __device__
+    void assert_static(bool condition, const char * filename, int lineno);
+}
+
+#include <thrust/device_new.h>
+#include <thrust/device_delete.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+
+#define ASSERT_STATIC_ASSERT(X) \
+    { \
+        bool triggered = false; \
+        typedef unittest::static_assert_exception ex_t; \
+        thrust::device_ptr<ex_t> device_ptr = thrust::device_new<ex_t>(); \
+        ex_t* raw_ptr = thrust::raw_pointer_cast(device_ptr); \
+        ::cudaMemcpyToSymbol(unittest::detail::device_exception, &raw_ptr, sizeof(ex_t*)); \
+        try { X; } catch (ex_t) { triggered = true; } \
+        if (!triggered) { \
+            triggered = static_cast<ex_t>(*device_ptr).triggered; \
+        } \
+        thrust::device_free(device_ptr); \
+        raw_ptr = NULL; \
+        ::cudaMemcpyToSymbol(unittest::detail::device_exception, &raw_ptr, sizeof(ex_t*)); \
+        if (!triggered) { unittest::UnitTestFailure f; f << "[" << __FILE__ << ":" << __LINE__ << "] did not trigger a THRUST_STATIC_ASSERT"; throw f; } \
+    }
+
+#else
+
+#define ASSERT_STATIC_ASSERT(X) \
+    { \
+        bool triggered = false; \
+        typedef unittest::static_assert_exception ex_t; \
+        try { X; } catch (ex_t) { triggered = true; } \
+        if (!triggered) { unittest::UnitTestFailure f; f << "[" << __FILE__ << ":" << __LINE__ << "] did not trigger a THRUST_STATIC_ASSERT"; throw f; } \
+    }
+
+#endif
+
+namespace unittest
+{
+    class static_assert_exception
+    {
+    public:
+        __host__ __device__
+        static_assert_exception() : triggered(false)
+        {
+        }
+
+        __host__ __device__
+        static_assert_exception(const char * filename, int lineno)
+            : triggered(true), filename(filename), lineno(lineno)
+        {
+        }
+
+        bool triggered;
+        const char * filename;
+        int lineno;
+    };
+
+    namespace detail
+    {
+#ifdef __clang__
+        __attribute__((used))
+#endif
+        __device__ static static_assert_exception* device_exception = NULL;
+    }
+
+    __host__ __device__
+    void assert_static(bool condition, const char * filename, int lineno)
+    {
+        if (!condition)
+        {
+            static_assert_exception ex(filename, lineno);
+
+#ifdef __CUDA_ARCH__
+            *detail::device_exception = ex;
+#else
+            throw ex;
+#endif
+        }
+    }
+}
+
diff --git a/thrust/testing/unittest/special_types.h b/thrust/testing/unittest/special_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..b046a96eec4d80ff907f84d994b8dd04a9be0506
--- /dev/null
+++ b/thrust/testing/unittest/special_types.h
@@ -0,0 +1,184 @@
+#pragma once
+
+#include <iostream>
+#include <thrust/execution_policy.h>
+
+template <typename T, unsigned int N>
+struct FixedVector
+{
+    T data[N];
+    
+    __host__ __device__
+    FixedVector()
+    {
+        for(unsigned int i = 0; i < N; i++)
+            data[i] = T();
+    }
+
+    __host__ __device__
+    FixedVector(T init)
+    {
+        for(unsigned int i = 0; i < N; i++)
+            data[i] = init;
+    }
+
+    __host__ __device__
+    FixedVector operator+(const FixedVector& bs) const
+    {
+        FixedVector output;
+        for(unsigned int i = 0; i < N; i++)
+            output.data[i] = data[i] + bs.data[i];
+        return output;
+    }
+    
+    __host__ __device__
+    bool operator<(const FixedVector& bs) const
+    {
+        for(unsigned int i = 0; i < N; i++)
+        {
+            if(data[i] < bs.data[i])
+                return true;
+            else if(bs.data[i] < data[i])
+                return false;
+        }
+        return false;
+    }
+
+    __host__ __device__
+    bool operator==(const FixedVector& bs) const
+    {
+        for(unsigned int i = 0; i < N; i++)
+        {
+            if(!(data[i] == bs.data[i]))
+                return false;
+        }
+        return true;                
+    }
+};
+
+template<typename Key, typename Value>
+  struct key_value
+{
+  typedef Key   key_type;
+  typedef Value value_type;
+
+  __host__ __device__
+  key_value(void)
+    : key(), value()
+  {}
+
+  __host__ __device__
+  key_value(key_type k, value_type v)
+    : key(k), value(v)
+  {}
+
+  __host__ __device__
+  bool operator<(const key_value &rhs) const
+  {
+    return key < rhs.key;
+  }
+
+  __host__ __device__
+  bool operator>(const key_value &rhs) const
+  {
+    return key > rhs.key;
+  }
+
+  __host__ __device__
+  bool operator==(const key_value &rhs) const
+  {
+    return key == rhs.key && value == rhs.value;
+  }
+
+  __host__ __device__
+  bool operator!=(const key_value &rhs) const
+  {
+    return !operator==(rhs);
+  }
+
+  friend std::ostream &operator<<(std::ostream &os, const key_value &kv)
+  {
+    return os << "(" << kv.key << ", " << kv.value << ")";
+  }
+
+  key_type key;
+  value_type value;
+};
+
+struct user_swappable
+{
+  inline __host__ __device__
+  user_swappable(bool swapped = false)
+    : was_swapped(swapped)
+  {}
+
+  bool was_swapped;
+};
+
+inline __host__ __device__
+bool operator==(const user_swappable &x, const user_swappable &y)
+{
+  return x.was_swapped == y.was_swapped;
+}
+
+inline __host__ __device__
+void swap(user_swappable &x, user_swappable &y)
+{
+  x.was_swapped = true;
+  y.was_swapped = false;
+}
+
+class my_system : public thrust::device_execution_policy<my_system>
+{
+  public:
+    my_system(int)
+      : correctly_dispatched(false),
+        num_copies(0)
+    {}
+
+    my_system(const my_system &other)
+      : correctly_dispatched(false),
+        num_copies(other.num_copies + 1)
+    {}
+
+    void validate_dispatch()
+    {
+      correctly_dispatched = (num_copies == 0);
+    }
+
+    bool is_valid()
+    {
+      return correctly_dispatched;
+    }
+
+  private:
+    bool correctly_dispatched;
+
+    // count the number of copies so that we can validate
+    // that dispatch does not introduce any
+    unsigned int num_copies;
+
+
+    // disallow default construction
+    my_system();
+};
+
+struct my_tag : thrust::device_execution_policy<my_tag> {};
+
+namespace unittest
+{
+
+
+using thrust::detail::int8_t;
+using thrust::detail::int16_t;
+using thrust::detail::int32_t;
+using thrust::detail::int64_t;
+
+using thrust::detail::uint8_t;
+using thrust::detail::uint16_t;
+using thrust::detail::uint32_t;
+using thrust::detail::uint64_t;
+
+  
+}
+
diff --git a/thrust/testing/unittest/system.h b/thrust/testing/unittest/system.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3552c2b321068d9a7f5eef21fed456574806f65
--- /dev/null
+++ b/thrust/testing/unittest/system.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// for demangling the result of type_info.name()
+// with msvc, type_info.name() is already demangled
+#ifdef __GNUC__
+#include <cxxabi.h>
+#endif // __GNUC__
+
+#include <string>
+#include <cstdlib>
+
+namespace unittest
+{
+
+#if __GNUC__ && !__NVCOMPILER_CUDA__
+inline std::string demangle(const char* name)
+{
+  int status = 0;
+  char* realname = abi::__cxa_demangle(name, 0, 0, &status);
+  std::string result(realname);
+  std::free(realname);
+
+  return result;
+}
+#else
+inline std::string demangle(const char* name)
+{
+  return name;
+}
+#endif
+
+} // end unittest
+
diff --git a/thrust/testing/unittest/testframework.cu b/thrust/testing/unittest/testframework.cu
new file mode 100644
index 0000000000000000000000000000000000000000..26db08a3e63ea7c8300d306c34627c6fa064924e
--- /dev/null
+++ b/thrust/testing/unittest/testframework.cu
@@ -0,0 +1,523 @@
+#include "unittest/testframework.h"
+#include "unittest/exceptions.h"
+#include <thrust/memory.h>
+
+// #include backends' testframework.h, if they exist and are required for the build
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+#include <iostream>
+#include <iomanip>
+#include <cstdlib>
+#include <algorithm>
+#include <numeric>
+#include <string>
+#include <limits>
+#include <ctime>
+#include <limits>
+
+
+const size_t standard_test_sizes[] =
+{
+  0, 1, 2, 3, 4, 5, 8, 10, 13, 16, 17, 19, 27, 30, 31, 32,
+  33, 35, 42, 53, 58, 63, 64, 65, 72, 97, 100, 127, 128, 129, 142, 183, 192, 201, 240, 255, 256,
+  257, 302, 511, 512, 513, 687, 900, 1023, 1024, 1025, 1565, 1786, 1973, 2047, 2048, 2049, 3050, 4095, 4096,
+  4097, 5030, 7791, 10000, 10027, 12345, 16384, 17354, 26255, 32768, 43718, 65533, 65536,
+  65539, 123456, 131072, 731588, 1048575, 1048576,
+  3398570, 9760840, (1 << 24) - 1, (1 << 24),
+  (1 << 24) + 1, (1 << 25) - 1, (1 << 25), (1 << 25) + 1, (1 << 26) - 1, 1 << 26,
+  (1 << 26) + 1, (1 << 27) - 1, (1 << 27)
+};
+
+        
+const size_t tiny_threshold    = 1 <<  5;  //   32
+const size_t small_threshold   = 1 <<  8;  //  256
+const size_t medium_threshold  = 1 << 12;  //   4K
+const size_t default_threshold = 1 << 16;  //  64K
+const size_t large_threshold   = 1 << 20;  //   1M
+const size_t huge_threshold    = 1 << 24;  //  16M
+const size_t epic_threshold    = 1 << 26;  //  64M
+const size_t max_threshold     = (std::numeric_limits<size_t>::max)();
+
+
+std::vector<size_t> test_sizes;
+
+
+std::vector<size_t> get_test_sizes(void)
+{
+  return test_sizes;
+}
+
+
+void set_test_sizes(const std::string& val)
+{
+  size_t threshold = 0;
+
+  if(val == "tiny")
+    threshold = tiny_threshold;
+  else if(val == "small")
+    threshold = small_threshold;
+  else if(val == "medium")
+    threshold = medium_threshold;
+  else if(val == "default")
+    threshold = default_threshold;
+  else if(val == "large")
+    threshold = large_threshold;
+  else if(val == "huge")
+    threshold = huge_threshold;
+  else if(val == "epic")
+    threshold = epic_threshold;
+  else if(val == "max")
+    threshold = max_threshold;
+  else
+  {
+    std::cerr << "invalid test size \"" << val << "\"" << std::endl;
+    exit(1);
+  }
+
+  for(size_t i = 0; i < sizeof(standard_test_sizes) / sizeof(*standard_test_sizes); i++)
+  {
+    if(standard_test_sizes[i] <= threshold)
+      test_sizes.push_back(standard_test_sizes[i]);
+  }
+}
+
+
+void UnitTestDriver::register_test(UnitTest * test)
+{
+  if(UnitTestDriver::s_driver().test_map.count(test->name) )
+  {
+    std::cout << "[WARNING] Test name \"" << test->name << " already encountered " << std::endl;
+  }
+
+  UnitTestDriver::s_driver().test_map[test->name] = test;
+}
+
+
+UnitTest::UnitTest(const char * _name) : name(_name)
+{
+  UnitTestDriver::s_driver().register_test(this);
+}
+
+
+void process_args(int argc, char ** argv,
+                  ArgumentSet& args,
+                  ArgumentMap& kwargs)
+
+{
+  for(int i = 1; i < argc; i++)
+  {
+    std::string arg(argv[i]);
+
+    // look for --key or --key=value arguments 
+    if(arg.substr(0,2) == "--")
+    {   
+      std::string::size_type n = arg.find('=',2);
+
+      if(n == std::string::npos)
+      {
+        kwargs[arg.substr(2)] = std::string();              // (key,"")
+      }
+      else
+      {
+        kwargs[arg.substr(2, n - 2)] = arg.substr(n + 1);   // (key,value)
+      }
+    }
+    else
+    {
+      args.insert(arg);
+    }
+  }
+}
+
+
+void usage(int /*argc*/, char** argv)
+{
+  std::string indent = "  ";
+  
+  std::cout << "Example Usage:\n";
+  std::cout << indent << argv[0] << "\n";
+  std::cout << indent << argv[0] << " TestName1 [TestName2 ...] \n";
+  std::cout << indent << argv[0] << " PartialTestName1* [PartialTestName2* ...] \n";
+  std::cout << indent << argv[0] << " --device=1\n";
+  std::cout << indent << argv[0] << " --sizes={tiny,small,medium,default,large,huge,epic,max}\n";
+  std::cout << indent << argv[0] << " --verbose or --concise\n";
+  std::cout << indent << argv[0] << " --list\n";
+  std::cout << indent << argv[0] << " --help\n";
+  std::cout << "\n";
+  std::cout << "Options:\n";
+  std::cout << indent << "The sizes option determines which input sizes are tested.\n";
+  std::cout << indent << indent << "--sizes=tiny    tests sizes up to " << tiny_threshold    << "\n";
+  std::cout << indent << indent << "--sizes=small   tests sizes up to " << small_threshold   << "\n";
+  std::cout << indent << indent << "--sizes=medium  tests sizes up to " << medium_threshold  << "\n";
+  std::cout << indent << indent << "--sizes=default tests sizes up to " << default_threshold << "\n";
+  std::cout << indent << indent << "--sizes=large   tests sizes up to " << large_threshold   << " (0.25 GB memory)\n";
+  std::cout << indent << indent << "--sizes=huge    tests sizes up to " << huge_threshold    << " (1.50 GB memory)\n";
+  std::cout << indent << indent << "--sizes=epic    tests sizes up to " << epic_threshold    << " (3.00 GB memory)\n";
+  std::cout << indent << indent << "--sizes=max     tests all available sizes\n";
+}
+
+
+struct TestResult
+{
+  TestStatus  status;
+  std::string name;
+  std::string message;
+  
+  // XXX use a c++11 timer result when available
+  std::clock_t elapsed;
+  
+  TestResult(const TestStatus status, std::clock_t elapsed, const UnitTest& u, const std::string& message = "")
+      : status(status), name(u.name), message(message), elapsed(elapsed)
+  {}
+  
+  bool operator<(const TestResult& tr) const
+  {
+    if(status < tr.status)
+    {
+      return true;
+    }
+    else if(tr.status < status)
+    {
+      return false;
+    }
+    else
+    {
+      return name < tr.name;
+    }
+  }
+};
+
+
+void record_result(const TestResult& test_result, std::vector< TestResult >& test_results)
+{
+  test_results.push_back(test_result);
+}
+
+
+void report_results(std::vector< TestResult >& test_results, double elapsed_minutes)
+{
+  std::cout << std::endl;
+  
+  std::string hline = "================================================================";
+  
+  std::sort(test_results.begin(), test_results.end());
+  
+  size_t num_passes = 0;
+  size_t num_failures = 0;
+  size_t num_known_failures = 0;
+  size_t num_errors = 0;
+  
+  for(size_t i = 0; i < test_results.size(); i++)
+  {
+    const TestResult& tr = test_results[i];
+    
+    if(tr.status == Pass)
+    {
+      num_passes++;
+    }
+    else
+    {
+      std::cout << hline << std::endl;
+    
+      switch(tr.status)
+      {
+        case Failure:
+          std::cout << "FAILURE";       num_failures++;       break;
+        case KnownFailure:
+          std::cout << "KNOWN FAILURE"; num_known_failures++; break;
+        case Error:
+          std::cout << "ERROR";         num_errors++;         break;
+        default:
+          break;
+      }
+    
+      std::cout << ": " << tr.name << std::endl << tr.message << std::endl;
+    }
+  }
+  
+  std::cout << hline << std::endl;
+  
+  std::cout << "Totals: ";
+  std::cout << num_failures << " failures, ";
+  std::cout << num_known_failures << " known failures, ";
+  std::cout << num_errors << " errors, and ";
+  std::cout << num_passes << " passes." << std::endl;
+  std::cout << "Time:  " << elapsed_minutes << " minutes" << std::endl;
+}
+
+
+void UnitTestDriver::list_tests(void)
+{
+  for(TestMap::iterator iter = test_map.begin(); iter != test_map.end(); iter++)
+  {
+    std::cout << iter->second->name << std::endl;
+  }
+}
+
+
+bool UnitTestDriver::post_test_sanity_check(const UnitTest &/*test*/, bool /*concise*/)
+{
+  return true;
+}
+
+
+bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const ArgumentMap& kwargs)
+{
+  std::time_t start_time = std::time(0);
+  
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN
+  bool verbose = kwargs.count("verbose");
+  bool concise = kwargs.count("concise");
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END
+  
+  std::vector< TestResult > test_results;
+  
+  if(verbose && concise)
+  {
+    std::cout << "--verbose and --concise cannot be used together" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  
+  if(!concise)
+  {
+    std::cout << "Running " << tests_to_run.size() << " unit tests." << std::endl;
+  }
+  
+  for(size_t i = 0; i < tests_to_run.size(); i++)
+  {
+     UnitTest& test = *tests_to_run[i];
+  
+     if(verbose)
+     {
+       std::cout << "Running " << test.name << "..." << std::flush;
+     }
+  
+     try
+     {
+       // time the test
+       std::clock_t start = std::clock();
+  
+       // run the test
+       test.run();
+  
+       // test passed
+       record_result(TestResult(Pass, std::clock() - start, test), test_results);
+     } 
+     catch(unittest::UnitTestFailure& f)
+     {
+       record_result(TestResult(Failure, (std::numeric_limits<std::clock_t>::max)(), test, f.message), test_results);
+     }
+     catch(unittest::UnitTestKnownFailure& f)
+     {
+       record_result(TestResult(KnownFailure, (std::numeric_limits<std::clock_t>::max)(), test, f.message), test_results);
+     }
+     catch(std::bad_alloc& e)
+     {
+       record_result(TestResult(Error, (std::numeric_limits<std::clock_t>::max)(), test, e.what()), test_results);
+     }
+     catch(unittest::UnitTestError& e)
+     {
+       record_result(TestResult(Error, (std::numeric_limits<std::clock_t>::max)(), test, e.message), test_results);
+     }
+  
+     // immediate report
+     if(!concise)
+     {
+       if(verbose)
+       {
+         switch(test_results.back().status)
+         {
+           case Pass:
+             std::cout << "\r[PASS] ";
+             std::cout << std::setw(10) << 1000.f * float(test_results.back().elapsed) / CLOCKS_PER_SEC << " ms";
+             break;
+           case Failure:
+             std::cout << "\r[FAILURE]           "; break;
+           case KnownFailure:
+             std::cout << "\r[KNOWN FAILURE]     "; break;
+           case Error:
+             std::cout << "\r[ERROR]             "; break;
+           default:
+             break;
+         }
+  
+         std::cout << " " << test.name << std::endl;
+       }
+       else
+       {
+         switch(test_results.back().status)
+         {
+           case Pass:
+             std::cout << "."; break;
+           case Failure:
+             std::cout << "F"; break;
+           case KnownFailure:
+             std::cout << "K"; break;
+           case Error:
+             std::cout << "E"; break;
+           default:
+             break;
+         }
+       }
+     }
+  
+     if(!post_test_sanity_check(test, concise))
+     {
+       return false;
+     }
+  
+     std::cout.flush();
+  }
+  
+  double elapsed_minutes = double(std::time(0) - start_time) / 60;
+  
+  // summary report
+  if(!concise)
+  {
+    report_results(test_results, elapsed_minutes);
+  }
+  
+  
+  // if any failures or errors return false
+  for(size_t i = 0; i < test_results.size(); i++)
+  {
+    if(test_results[i].status != Pass && test_results[i].status != KnownFailure)
+    {
+      return false;
+    }
+  }
+  
+  // all tests pass or are known failures
+  return true;
+}
+
+
+bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwargs)
+{
+  if(args.empty())
+  {
+    // run all tests
+    std::vector<UnitTest *> tests_to_run;
+    
+    for(TestMap::iterator iter = test_map.begin(); iter != test_map.end(); iter++)
+    {
+      tests_to_run.push_back(iter->second);
+    }
+    
+    return run_tests(tests_to_run, kwargs);
+  }
+  else
+  {
+    // all non-keyword arguments are assumed to be test names or partial test names
+  
+    typedef TestMap::iterator               TestMapIterator;
+  
+    // vector to accumulate tests
+    std::vector<UnitTest *> tests_to_run;
+  
+    for(ArgumentSet::const_iterator iter = args.begin(); iter != args.end(); iter++)
+    {
+      const std::string& arg = *iter;
+  
+      size_t len = arg.size();
+      size_t matches = 0;
+  
+      if(arg[len-1] == '*')
+      {
+        // wildcard search
+        std::string search = arg.substr(0,len-1);
+  
+        TestMapIterator lb = test_map.lower_bound(search);
+        while(lb != test_map.end())
+        {
+          if(search != lb->first.substr(0,len-1))
+          {
+            break;
+          }
+  
+          tests_to_run.push_back(lb->second); 
+          lb++;
+          matches++;
+        }
+      }
+      else
+      {
+        // non-wildcard search
+        TestMapIterator lb = test_map.find(arg);
+  
+        if(lb != test_map.end())
+        {
+          tests_to_run.push_back(lb->second); 
+          matches++;
+        }
+      }
+  
+      if(matches == 0)
+      {
+        std::cout << "[ERROR] found no test names matching the pattern: " << arg << std::endl;
+        return false;
+      }
+    }
+  
+    return run_tests(tests_to_run, kwargs);
+  }
+}
+
+
+// driver_instance maps a DeviceSystem to a singleton UnitTestDriver
+template<typename DeviceSystem>
+UnitTestDriver &driver_instance(DeviceSystem)
+{
+  static UnitTestDriver s_instance;
+  return s_instance;
+}
+
+
+// if we need a special kind of UnitTestDriver, overload
+// driver_instance in that function
+UnitTestDriver &UnitTestDriver::s_driver()
+{
+  return driver_instance(thrust::device_system_tag());
+}
+
+
+int main(int argc, char **argv)
+{
+  ArgumentSet args;
+  ArgumentMap kwargs;
+  
+  process_args(argc, argv, args, kwargs);
+  
+  if(kwargs.count("help"))
+  {
+    usage(argc, argv);
+    return 0;
+  }
+  
+  if(kwargs.count("list"))
+  {
+    UnitTestDriver::s_driver().list_tests();
+    return 0;
+  }
+  
+  if(kwargs.count("sizes"))
+  {
+    set_test_sizes(kwargs["sizes"]);
+  }
+  else
+  {
+    set_test_sizes("default");
+  }
+  
+  bool passed = UnitTestDriver::s_driver().run_tests(args, kwargs);
+  
+  if(kwargs.count("concise"))
+  {
+    std::cout << ((passed) ? "PASSED" : "FAILED") << std::endl;
+  }
+  
+  return (passed) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
diff --git a/thrust/testing/unittest/testframework.h b/thrust/testing/unittest/testframework.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec5c42bb653af8aa4295bb9a61860aafd739a3a2
--- /dev/null
+++ b/thrust/testing/unittest/testframework.h
@@ -0,0 +1,574 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include <iostream>
+
+#include <stdio.h>
+
+#include "meta.h"
+#include "util.h"
+
+#include <thrust/limits.h>
+#include <thrust/detail/integer_traits.h>
+#include <thrust/memory/detail/device_system_resource.h>
+#include <thrust/memory/detail/host_system_resource.h>
+#include <thrust/mr/allocator.h>
+
+// define some common lists of types
+typedef unittest::type_list<int,
+                            unsigned int,
+                            float> ThirtyTwoBitTypes;
+
+typedef unittest::type_list<long long,
+                            unsigned long long,
+                            double> SixtyFourBitTypes;
+
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char,
+                            short,
+                            unsigned short,
+                            int,
+                            unsigned int,
+                            long,
+                            unsigned long,
+                            long long,
+                            unsigned long long> IntegralTypes;
+
+typedef unittest::type_list<signed char,
+                            signed short,
+                            signed int,
+                            signed long,
+                            signed long long> SignedIntegralTypes;
+
+typedef unittest::type_list<unsigned char,
+                            unsigned short,
+                            unsigned int,
+                            unsigned long,
+                            unsigned long long> UnsignedIntegralTypes;
+
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char> ByteTypes;
+
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char,
+                            short,
+                            unsigned short> SmallIntegralTypes;
+
+typedef unittest::type_list<long long,
+                            unsigned long long> LargeIntegralTypes;
+
+typedef unittest::type_list<float,
+                            double> FloatingPointTypes;
+
+// A type that behaves as if it was a normal numeric type,
+// so it can be used in the same tests as "normal" numeric types.
+// NOTE: This is explicitly NOT proclaimed trivially reloctable.
+class custom_numeric
+{
+public:
+    __host__ __device__
+    custom_numeric()
+    {
+        fill(0);
+    }
+
+    __host__ __device__
+    custom_numeric(int i)
+    {
+        fill(i);
+    }
+
+    __host__ __device__
+    custom_numeric(const custom_numeric & other)
+    {
+        fill(other.value[0]);
+    }
+
+    __host__ __device__
+    custom_numeric & operator=(int val)
+    {
+        fill(val);
+        return *this;
+    }
+
+    __host__ __device__
+    custom_numeric & operator=(const custom_numeric & other)
+    {
+        fill(other.value[0]);
+        return *this;
+    }
+
+    // cast to void * instead of bool to fool overload resolution
+    // WTB C++11 explicit conversion operators
+    __host__ __device__
+    operator void *() const
+    {
+        // static cast first to avoid MSVC warning C4312
+        return reinterpret_cast<void *>(static_cast<std::size_t>(value[0]));
+    }
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric & operator op() {                                \
+        fill(op value[0]);                                          \
+        return *this;                                               \
+    }                                                               \
+    __host__ __device__                                             \
+    custom_numeric operator op(int) const {                         \
+        custom_numeric ret(*this);                                  \
+        op ret;                                                     \
+        return ret;                                                 \
+    }
+
+    DEFINE_OPERATOR(++)
+    DEFINE_OPERATOR(--)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric operator op () const                             \
+    {                                                               \
+        return custom_numeric(op value[0]);                         \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(~)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric operator op (const custom_numeric & other) const \
+    {                                                               \
+        return custom_numeric(value[0] op other.value[0]);          \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(*)
+    DEFINE_OPERATOR(/)
+    DEFINE_OPERATOR(%)
+    DEFINE_OPERATOR(<<)
+    DEFINE_OPERATOR(>>)
+    DEFINE_OPERATOR(&)
+    DEFINE_OPERATOR(|)
+    DEFINE_OPERATOR(^)
+
+#undef DEFINE_OPERATOR
+
+#define CONCAT(X, Y) X ## Y
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric & operator CONCAT(op, =) (const custom_numeric & other) \
+    {                                                               \
+        fill(value[0] op other.value[0]);                           \
+        return *this;                                               \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(*)
+    DEFINE_OPERATOR(/)
+    DEFINE_OPERATOR(%)
+    DEFINE_OPERATOR(<<)
+    DEFINE_OPERATOR(>>)
+    DEFINE_OPERATOR(&)
+    DEFINE_OPERATOR(|)
+    DEFINE_OPERATOR(^)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    friend bool operator op (const custom_numeric & lhs, const custom_numeric & rhs) \
+    {                                                               \
+        return lhs.value[0] op rhs.value[0];                        \
+    }
+
+    DEFINE_OPERATOR(==)
+    DEFINE_OPERATOR(!=)
+    DEFINE_OPERATOR(<)
+    DEFINE_OPERATOR(<=)
+    DEFINE_OPERATOR(>)
+    DEFINE_OPERATOR(>=)
+    DEFINE_OPERATOR(&&)
+    DEFINE_OPERATOR(||);
+
+
+#undef DEFINE_OPERATOR
+
+    friend std::ostream & operator<<(std::ostream & os, const custom_numeric & val)
+    {
+        return os << "custom_numeric{" << val.value[0] << "}";
+    }
+
+private:
+    int value[5];
+
+    __host__ __device__
+    void fill(int val)
+    {
+        for (int i = 0; i < 5; ++i)
+        {
+            value[i] = val;
+        }
+    }
+};
+
+namespace thrust
+{
+
+template <>
+struct numeric_limits<custom_numeric> : numeric_limits<int> {};
+
+namespace detail
+{
+
+// For random number generation
+template<>
+class integer_traits<custom_numeric>
+  : public integer_traits_base<int, INT_MIN, INT_MAX>
+{};
+
+}} // namespace thrust::detail
+
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char,
+                            short,
+                            unsigned short,
+                            int,
+                            unsigned int,
+                            long,
+                            unsigned long,
+                            long long,
+                            unsigned long long,
+                            float,
+                            double,
+                            custom_numeric> NumericTypes;
+
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char,
+                            short,
+                            unsigned short,
+                            int,
+                            unsigned int,
+                            long,
+                            unsigned long,
+                            long long,
+                            unsigned long long,
+                            float,
+                            double> BuiltinNumericTypes;
+
+inline void chop_prefix(std::string& str, const std::string& prefix)
+{
+    str.replace(str.find(prefix) == 0 ? 0 : str.size(), prefix.size(), "");
+}
+
+inline std::string base_class_name(const std::string& name)
+{
+  std::string result = name;
+
+  // if the name begins with "struct ", chop it off
+  chop_prefix(result, "struct ");
+
+  // if the name begins with "class ", chop it off
+  chop_prefix(result, "class ");
+
+  const std::size_t first_lt = result.find_first_of("<");
+
+  if (first_lt < result.size())
+      // chop everything including and after first "<"
+      return result.replace(first_lt, result.size(), "");
+  else
+      return result;
+}
+
+enum TestStatus { Pass = 0, Failure = 1, KnownFailure = 2, Error = 3, UnknownException = 4};
+
+typedef std::set<std::string>              ArgumentSet;
+typedef std::map<std::string, std::string> ArgumentMap;
+
+std::vector<size_t> get_test_sizes(void);
+void                set_test_sizes(const std::string&);
+
+class UnitTest {
+    public:
+        std::string name;
+        UnitTest() {}
+        UnitTest(const char * name);
+        virtual ~UnitTest() {}
+        virtual void run() {}
+
+        bool operator<(const UnitTest& u) const
+        {
+            return name < u.name;
+        }
+};
+
+class UnitTestDriver;
+
+class UnitTestDriver
+{
+  typedef std::map<std::string, UnitTest*> TestMap;
+
+  TestMap test_map;
+
+  bool run_tests(std::vector<UnitTest *>& tests_to_run, const ArgumentMap& kwargs);
+
+protected:
+  // executed immediately after each test
+  // \param test The UnitTest of interest
+  // \param concise Whether or not to suppress output
+  // \return true if all is well; false if the tests must be immediately aborted
+  virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+
+public:
+  inline virtual ~UnitTestDriver() {};
+
+  void register_test(UnitTest * test);
+  virtual bool run_tests(const ArgumentSet& args, const ArgumentMap& kwargs);
+  void list_tests(void);
+
+  static UnitTestDriver &s_driver();
+};
+
+// Macro to create a single unittest
+#define DECLARE_UNITTEST(TEST)                                   \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run(){                                                  \
+            TEST();                                              \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+#define DECLARE_UNITTEST_WITH_NAME(TEST, NAME)                   \
+class NAME##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    NAME##UnitTest() : UnitTest(#NAME) {}                        \
+    void run(){                                                  \
+            TEST();                                              \
+    }                                                            \
+};                                                               \
+NAME##UnitTest NAME##Instance
+
+// Macro to create host and device versions of a
+// unit test for a bunch of data types
+#define DECLARE_VECTOR_UNITTEST(VTEST)                          \
+void VTEST##Host(void) {                                        \
+    VTEST< thrust::host_vector<signed char> >();                \
+    VTEST< thrust::host_vector<short> >();                      \
+    VTEST< thrust::host_vector<int> >();                        \
+    VTEST< thrust::host_vector<float> >();                      \
+    VTEST< thrust::host_vector<custom_numeric> >();             \
+    /* MR vectors */                                            \
+    VTEST< thrust::host_vector<int,                             \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::host_memory_resource> > >();                \
+}                                                               \
+void VTEST##Device(void) {                                      \
+    VTEST< thrust::device_vector<signed char> >();              \
+    VTEST< thrust::device_vector<short> >();                    \
+    VTEST< thrust::device_vector<int> >();                      \
+    VTEST< thrust::device_vector<float> >();                    \
+    VTEST< thrust::device_vector<custom_numeric> >();           \
+    /* MR vectors */                                            \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::device_memory_resource> > >();              \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::universal_memory_resource> > >();           \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::universal_host_pinned_memory_resource> > >();\
+}                                                               \
+DECLARE_UNITTEST(VTEST##Host);                                  \
+DECLARE_UNITTEST(VTEST##Device);
+
+// Same as above, but only for integral types
+#define DECLARE_INTEGRAL_VECTOR_UNITTEST(VTEST)                 \
+void VTEST##Host(void) {                                        \
+    VTEST< thrust::host_vector<signed char> >();                \
+    VTEST< thrust::host_vector<short> >();                      \
+    VTEST< thrust::host_vector<int> >();                        \
+}                                                               \
+void VTEST##Device(void) {                                      \
+    VTEST< thrust::device_vector<signed char> >();              \
+    VTEST< thrust::device_vector<short> >();                    \
+    VTEST< thrust::device_vector<int> >();                      \
+}                                                               \
+DECLARE_UNITTEST(VTEST##Host);                                  \
+DECLARE_UNITTEST(VTEST##Device);
+
+// Macro to create instances of a test for several data types.
+#define DECLARE_GENERIC_UNITTEST(TEST)                           \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        TEST<signed char>();                                     \
+        TEST<unsigned char>();                                   \
+        TEST<short>();                                           \
+        TEST<unsigned short>();                                  \
+        TEST<int>();                                             \
+        TEST<unsigned int>();                                    \
+        TEST<float>();                                           \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+// Macro to create instances of a test for several data types and array sizes
+#define DECLARE_VARIABLE_UNITTEST(TEST)                          \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        std::vector<size_t> sizes = get_test_sizes();            \
+        for(size_t i = 0; i != sizes.size(); ++i)                \
+        {                                                        \
+            TEST<signed char>(sizes[i]);                         \
+            TEST<unsigned char>(sizes[i]);                       \
+            TEST<short>(sizes[i]);                               \
+            TEST<unsigned short>(sizes[i]);                      \
+            TEST<int>(sizes[i]);                                 \
+            TEST<unsigned int>(sizes[i]);                        \
+            TEST<float>(sizes[i]);                               \
+            TEST<double>(sizes[i]);                              \
+        }                                                        \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+#define DECLARE_INTEGRAL_VARIABLE_UNITTEST(TEST)                 \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        std::vector<size_t> sizes = get_test_sizes();            \
+        for(size_t i = 0; i != sizes.size(); ++i)                \
+        {                                                        \
+            TEST<signed char>(sizes[i]);                         \
+            TEST<unsigned char>(sizes[i]);                       \
+            TEST<short>(sizes[i]);                               \
+            TEST<unsigned short>(sizes[i]);                      \
+            TEST<int>(sizes[i]);                                 \
+            TEST<unsigned int>(sizes[i]);                        \
+        }                                                        \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+#define DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(TEST, TYPES, NAME)       \
+  ::SimpleUnitTest<TEST, TYPES> NAME##_instance(#NAME)                        \
+  /**/
+
+#define DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(TEST, TYPES, NAME) \
+  ::VariableUnitTest<TEST, TYPES> NAME##_instance(#NAME)                      \
+  /**/
+
+#define DECLARE_GENERIC_UNITTEST_WITH_TYPES(TEST, TYPES)                      \
+  ::SimpleUnitTest<TEST, TYPES> TEST##_instance(#TEST)                        \
+  /**/
+
+#define DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TEST, TYPES)                \
+  ::VariableUnitTest<TEST, TYPES> TEST##_instance(#TEST)                      \
+  /**/
+
+template<template <typename> class TestName, typename TypeList>
+  class SimpleUnitTest : public UnitTest
+{
+  public:
+    SimpleUnitTest()
+      : UnitTest(base_class_name(unittest::type_name<TestName<int> >()).c_str()) {}
+
+    SimpleUnitTest(const char * name)
+      : UnitTest(name) {}
+
+    void run()
+    {
+      // get the first type in the list
+      typedef typename unittest::get_type<TypeList,0>::type first_type;
+
+      unittest::for_each_type<TypeList,TestName,first_type,0> for_each;
+
+      // loop over the types
+      for_each();
+    }
+}; // end SimpleUnitTest
+
+
+template<template <typename> class TestName, typename TypeList>
+  class VariableUnitTest : public UnitTest
+{
+  public:
+    VariableUnitTest()
+      : UnitTest(base_class_name(unittest::type_name<TestName<int> >()).c_str()) {}
+
+    VariableUnitTest(const char * name)
+      : UnitTest(name) {}
+
+    void run()
+    {
+        std::vector<size_t> sizes = get_test_sizes();
+        for(size_t i = 0; i != sizes.size(); ++i)
+        {
+            // get the first type in the list
+            typedef typename unittest::get_type<TypeList,0>::type first_type;
+
+            unittest::for_each_type<TypeList,TestName,first_type,0> loop;
+
+            // loop over the types
+            loop(sizes[i]);
+        }
+    }
+}; // end VariableUnitTest
+
+template<template <typename> class TestName,
+         typename TypeList,
+         template <typename, typename> class Vector,
+         template <typename> class Alloc>
+  struct VectorUnitTest
+    : public UnitTest
+{
+  VectorUnitTest()
+    : UnitTest((base_class_name(unittest::type_name<TestName< Vector<int, Alloc<int> > > >()) + "<" +
+                base_class_name(unittest::type_name<Vector<int, Alloc<int> > >()) + ">").c_str())
+  { }
+
+  VectorUnitTest(const char * name)
+    : UnitTest(name) {}
+
+  void run()
+  {
+    // zip up the type list with Alloc
+    typedef typename unittest::transform1<TypeList, Alloc>::type AllocList;
+
+    // zip up the type list & alloc list with Vector
+    typedef typename unittest::transform2<TypeList, AllocList, Vector>::type VectorList;
+
+    // get the first type in the list
+    typedef typename unittest::get_type<VectorList,0>::type first_type;
+
+    unittest::for_each_type<VectorList,TestName,first_type,0> loop;
+
+    // loop over the types
+    loop(0);
+  }
+}; // end VectorUnitTest
+
diff --git a/thrust/testing/unittest/unittest.h b/thrust/testing/unittest/unittest.h
new file mode 100644
index 0000000000000000000000000000000000000000..49c9daf429ade8877027382a22712a42677e6043
--- /dev/null
+++ b/thrust/testing/unittest/unittest.h
@@ -0,0 +1,11 @@
+#pragma once
+
+// this is the only header included by unittests
+// it pulls in all the others used for unittesting
+
+#include <unittest/assertions.h>
+#include <unittest/meta.h>
+#include <unittest/random.h>
+#include <unittest/testframework.h>
+#include <unittest/special_types.h>
+
diff --git a/thrust/testing/unittest/util.h b/thrust/testing/unittest/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..02c1eb7ce3d3eeaed860daaf628aa13c7816c772
--- /dev/null
+++ b/thrust/testing/unittest/util.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <typeinfo>
+#include <unittest/system.h>
+
+#include <thrust/extrema.h>
+#include <thrust/limits.h>
+#include <thrust/detail/type_traits.h>
+
+namespace unittest
+{
+
+template<typename T>
+  std::string type_name(void)
+{
+  return demangle(typeid(T).name());
+} // end type_name()
+
+// Use this with counting_iterator to avoid generating a range larger than we
+// can represent.
+template <typename T>
+typename thrust::detail::disable_if<
+  thrust::detail::is_floating_point<T>::value
+, T
+>::type truncate_to_max_representable(std::size_t n)
+{
+  return thrust::min<std::size_t>(
+    n, static_cast<std::size_t>(thrust::numeric_limits<T>::max())
+  );
+}
+
+// TODO: This probably won't work for `half`.
+template <typename T>
+typename thrust::detail::enable_if<
+  thrust::detail::is_floating_point<T>::value
+, T
+>::type truncate_to_max_representable(std::size_t n)
+{
+  return thrust::min<T>(
+    n, thrust::numeric_limits<T>::max()
+  );
+}
+
+} // end unittest
+
+template <typename Iterator>
+void PRINT(Iterator first, Iterator last)
+{
+  size_t n = 0;
+  for (Iterator i = first; i != last; i++, n++)
+    std::cout << ">>> [" << n << "] = " << *i << std::endl;
+}
+
+template <typename Container>
+void PRINT(const Container& c)
+{
+  PRINT(c.begin(), c.end());
+}
+
+template <size_t N>
+void PRINT(const char (&c)[N])
+{
+  std::cout << std::string(c, c + N) << std::endl;
+}
+
diff --git a/thrust/testing/unittest/util_async.h b/thrust/testing/unittest/util_async.h
new file mode 100644
index 0000000000000000000000000000000000000000..984cc61c6bf433508956db64c35c541a7c301bf3
--- /dev/null
+++ b/thrust/testing/unittest/util_async.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <unittest/unittest.h>
+
+#include <thrust/future.h>
+
+#define TEST_EVENT_WAIT(e)                                                    \
+  ::unittest::test_event_wait(e, __FILE__, __LINE__)                          \
+  /**/
+
+#define TEST_FUTURE_VALUE_RETRIEVAL(f)                                        \
+  ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__)              \
+  /**/
+
+namespace unittest
+{
+
+template <typename Event>
+__host__
+void test_event_wait(
+  Event&& e, std::string const& filename = "unknown", int lineno = -1
+)
+{
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno);
+
+  e.wait();
+  e.wait();
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.ready(), filename, lineno);
+}
+
+template <typename Future>
+__host__
+auto test_future_value_retrieval(
+  Future&& f, std::string const& filename = "unknown", int lineno = -1
+) -> decltype(f.extract())
+{
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno);
+
+  auto const r0 = f.get();
+  auto const r1 = f.get();
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.ready(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r0, r1, filename, lineno);
+
+  auto const r2 = f.extract();
+
+  ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(
+    auto x = f.extract();
+    THRUST_UNUSED_VAR(x)
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_content)
+  , filename, lineno
+  );
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.ready(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_content(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r2, r1, filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r2, r0, filename, lineno);
+
+  return r2;
+}
+
+} // namespace unittest
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/testing/unittest_static_assert.cu b/thrust/testing/unittest_static_assert.cu
new file mode 100644
index 0000000000000000000000000000000000000000..02322f8d692967e7cdb532e4b6f612f9144c68a1
--- /dev/null
+++ b/thrust/testing/unittest_static_assert.cu
@@ -0,0 +1,30 @@
+#include <unittest/runtime_static_assert.h>
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+
+template<typename T>
+struct dependent_false
+{
+    enum { value = false };
+};
+
+template<typename T>
+struct static_assertion
+{
+    __host__ __device__
+    int operator()() const
+    {
+        THRUST_STATIC_ASSERT(dependent_false<T>::value);
+        return 0;
+    }
+};
+
+template<typename V>
+void TestStaticAssertAssert()
+{
+#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_OMP && THRUST_HOST_SYSTEM != THRUST_HOST_SYSTEM_OMP
+    V test(10);
+    ASSERT_STATIC_ASSERT(thrust::generate(test.begin(), test.end(), static_assertion<int>()));
+#endif
+}
+DECLARE_VECTOR_UNITTEST(TestStaticAssertAssert);
diff --git a/thrust/testing/unittest_tester.cu b/thrust/testing/unittest_tester.cu
new file mode 100644
index 0000000000000000000000000000000000000000..27e97ca91b5be7ee014d667acbb1e6d38f6f19a4
--- /dev/null
+++ b/thrust/testing/unittest_tester.cu
@@ -0,0 +1,47 @@
+#include <unittest/unittest.h>
+
+void TestAssertEqual(void)
+{
+    ASSERT_EQUAL(0, 0);
+    ASSERT_EQUAL(1, 1);
+    ASSERT_EQUAL(-15.0f, -15.0f);
+}
+DECLARE_UNITTEST(TestAssertEqual);
+
+void TestAssertLEqual(void)
+{
+    ASSERT_LEQUAL(0, 1);
+    ASSERT_LEQUAL(0, 0);
+}
+DECLARE_UNITTEST(TestAssertLEqual);
+
+void TestAssertGEqual(void)
+{
+    ASSERT_GEQUAL(1, 0);
+    ASSERT_GEQUAL(0, 0);
+}
+DECLARE_UNITTEST(TestAssertGEqual);
+
+void TestAssertLess(void)
+{
+    ASSERT_LESS(0, 1);
+}
+DECLARE_UNITTEST(TestAssertLess);
+
+void TestAssertGreater(void)
+{
+    ASSERT_GREATER(1, 0);
+}
+DECLARE_UNITTEST(TestAssertGreater);
+
+void TestTypeName(void)
+{
+    ASSERT_EQUAL(unittest::type_name<char>(),          "char");
+    ASSERT_EQUAL(unittest::type_name<signed char>(),   "signed char");
+    ASSERT_EQUAL(unittest::type_name<unsigned char>(), "unsigned char");
+    ASSERT_EQUAL(unittest::type_name<int>(),           "int");
+    ASSERT_EQUAL(unittest::type_name<float>(),         "float");
+    ASSERT_EQUAL(unittest::type_name<double>(),        "double");
+}
+DECLARE_UNITTEST(TestTypeName);
+
diff --git a/thrust/testing/vector.cu b/thrust/testing/vector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8154b01c6cc90e498e9188209f542cd2094ea08b
--- /dev/null
+++ b/thrust/testing/vector.cu
@@ -0,0 +1,800 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/sequence.h>
+#include <thrust/device_malloc_allocator.h>
+
+#include <vector>
+#include <list>
+#include <limits>
+#include <utility>
+
+template <class Vector>
+void TestVectorZeroSize(void)
+{
+    Vector v;
+    ASSERT_EQUAL(v.size(), 0lu);
+    ASSERT_EQUAL((v.begin() == v.end()), true);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorZeroSize);
+
+
+void TestVectorBool(void)
+{
+    thrust::host_vector<bool> h(3);
+    thrust::device_vector<bool> d(3);
+
+    h[0] = true; h[1] = false; h[2] = true;
+    d[0] = true; d[1] = false; d[2] = true;
+
+    ASSERT_EQUAL(h[0], true);
+    ASSERT_EQUAL(h[1], false);
+    ASSERT_EQUAL(h[2], true);
+
+    ASSERT_EQUAL(d[0], true);
+    ASSERT_EQUAL(d[1], false);
+    ASSERT_EQUAL(d[2], true);
+}
+DECLARE_UNITTEST(TestVectorBool);
+
+
+template <class Vector>
+void TestVectorFrontBack(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v(3);
+    v[0] = 0; v[1] = 1; v[2] = 2;
+
+    ASSERT_EQUAL(v.front(), T(0));
+    ASSERT_EQUAL(v.back(),  T(2));
+}
+DECLARE_VECTOR_UNITTEST(TestVectorFrontBack);
+
+
+template <class Vector>
+void TestVectorData(void)
+{
+    typedef typename Vector::pointer PointerT;
+    typedef typename Vector::const_pointer PointerConstT;
+
+    Vector v(3);
+    v[0] = 0; v[1] = 1; v[2] = 2;
+
+    ASSERT_EQUAL(0,          *v.data());
+    ASSERT_EQUAL(1,          *(v.data() + 1));
+    ASSERT_EQUAL(2,          *(v.data() + 2));
+    ASSERT_EQUAL(PointerT(&v.front()),  v.data());
+    ASSERT_EQUAL(PointerT(&*v.begin()), v.data());
+    ASSERT_EQUAL(PointerT(&v[0]),       v.data());
+
+    const Vector &c_v = v;
+
+    ASSERT_EQUAL(0,            *c_v.data());
+    ASSERT_EQUAL(1,            *(c_v.data() + 1));
+    ASSERT_EQUAL(2,            *(c_v.data() + 2));
+    ASSERT_EQUAL(PointerConstT(&c_v.front()),  c_v.data());
+    ASSERT_EQUAL(PointerConstT(&*c_v.begin()), c_v.data());
+    ASSERT_EQUAL(PointerConstT(&c_v[0]),       c_v.data());
+}
+DECLARE_VECTOR_UNITTEST(TestVectorData);
+
+
+template <class Vector>
+void TestVectorElementAssignment(void)
+{
+    Vector v(3);
+
+    v[0] = 0; v[1] = 1; v[2] = 2;
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+
+    v[0] = 10; v[1] = 11; v[2] = 12;
+
+    ASSERT_EQUAL(v[0], 10);
+    ASSERT_EQUAL(v[1], 11);
+    ASSERT_EQUAL(v[2], 12);
+
+    Vector w(3);
+    w[0] = v[0];
+    w[1] = v[1];
+    w[2] = v[2];
+
+    ASSERT_EQUAL(v, w);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorElementAssignment);
+
+
+template <class Vector>
+void TestVectorFromSTLVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    std::vector<T> stl_vector(3);
+    stl_vector[0] = 0;
+    stl_vector[1] = 1;
+    stl_vector[2] = 2;
+
+    thrust::host_vector<T> v(stl_vector);
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+
+    v = stl_vector;
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorFromSTLVector);
+
+
+template <class Vector>
+void TestVectorFillAssign(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::host_vector<T> v;
+    v.assign(3, 13);
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 13);
+    ASSERT_EQUAL(v[1], 13);
+    ASSERT_EQUAL(v[2], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorFillAssign);
+
+
+template <class Vector>
+void TestVectorAssignFromSTLVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    std::vector<T> stl_vector(3);
+    stl_vector[0] = 0;
+    stl_vector[1] = 1;
+    stl_vector[2] = 2;
+
+    thrust::host_vector<T> v;
+    v.assign(stl_vector.begin(), stl_vector.end());
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorAssignFromSTLVector);
+
+
+template <class Vector>
+void TestVectorFromBiDirectionalIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    std::list<T> stl_list;
+    stl_list.push_back(0);
+    stl_list.push_back(1);
+    stl_list.push_back(2);
+
+    Vector v(stl_list.begin(), stl_list.end());
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorFromBiDirectionalIterator);
+
+
+template <class Vector>
+void TestVectorAssignFromBiDirectionalIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    std::list<T> stl_list;
+    stl_list.push_back(0);
+    stl_list.push_back(1);
+    stl_list.push_back(2);
+
+    Vector v;
+    v.assign(stl_list.begin(), stl_list.end());
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorAssignFromBiDirectionalIterator);
+
+
+template <class Vector>
+void TestVectorAssignFromHostVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::host_vector<T> h(3);
+    h[0] = 0;
+    h[1] = 1;
+    h[2] = 2;
+
+    Vector v;
+    v.assign(h.begin(), h.end());
+
+    ASSERT_EQUAL(v, h);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorAssignFromHostVector);
+
+
+template <class Vector>
+void TestVectorToAndFromHostVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::host_vector<T> h(3);
+    h[0] = 0;
+    h[1] = 1;
+    h[2] = 2;
+
+    Vector v(h);
+
+    ASSERT_EQUAL(v, h);
+
+    THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(v = v);
+
+    ASSERT_EQUAL(v, h);
+
+    v[0] = 10;
+    v[1] = 11;
+    v[2] = 12;
+
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
+    ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
+    ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
+
+    h = v;
+
+    ASSERT_EQUAL(v, h);
+
+    h[1] = 11;
+
+    v = h;
+
+    ASSERT_EQUAL(v, h);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorToAndFromHostVector);
+
+
+template <class Vector>
+void TestVectorAssignFromDeviceVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::device_vector<T> d(3);
+    d[0] = 0;
+    d[1] = 1;
+    d[2] = 2;
+
+    Vector v;
+    v.assign(d.begin(), d.end());
+
+    ASSERT_EQUAL(v, d);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorAssignFromDeviceVector);
+
+
+template <class Vector>
+void TestVectorToAndFromDeviceVector(void)
+{
+    typedef typename Vector::value_type T;
+
+    thrust::device_vector<T> h(3);
+    h[0] = 0;
+    h[1] = 1;
+    h[2] = 2;
+
+    Vector v(h);
+
+    ASSERT_EQUAL(v, h);
+
+    THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(v = v);
+
+    ASSERT_EQUAL(v, h);
+
+    v[0] = 10;
+    v[1] = 11;
+    v[2] = 12;
+
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
+    ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
+    ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
+
+    h = v;
+
+    ASSERT_EQUAL(v, h);
+
+    h[1] = 11;
+
+    v = h;
+
+    ASSERT_EQUAL(v, h);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorToAndFromDeviceVector);
+
+
+template <class Vector>
+void TestVectorWithInitialValue(void)
+{
+    typedef typename Vector::value_type T;
+
+    const T init = 17;
+
+    Vector v(3, init);
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], init);
+    ASSERT_EQUAL(v[1], init);
+    ASSERT_EQUAL(v[2], init);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorWithInitialValue);
+
+
+template <class Vector>
+void TestVectorSwap(void)
+{
+    Vector v(3);
+    v[0] = 0; v[1] = 1; v[2] = 2;
+
+    Vector u(3);
+    u[0] = 10; u[1] = 11; u[2] = 12;
+
+    v.swap(u);
+
+    ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(u[0], 0);
+    ASSERT_EQUAL(v[1], 11); ASSERT_EQUAL(u[1], 1);
+    ASSERT_EQUAL(v[2], 12); ASSERT_EQUAL(u[2], 2);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorSwap);
+
+
+template <class Vector>
+void TestVectorErasePosition(void)
+{
+    Vector v(5);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
+
+    v.erase(v.begin() + 2);
+
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 3);
+    ASSERT_EQUAL(v[3], 4);
+
+    v.erase(v.begin() + 0);
+
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+
+    v.erase(v.begin() + 2);
+
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+
+    v.erase(v.begin() + 1);
+
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 1);
+
+    v.erase(v.begin() + 0);
+
+    ASSERT_EQUAL(v.size(), 0lu);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorErasePosition);
+
+
+template <class Vector>
+void TestVectorEraseRange(void)
+{
+    Vector v(6);
+    v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4; v[5] = 5;
+
+    v.erase(v.begin() + 1, v.begin() + 3);
+
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+    ASSERT_EQUAL(v[3], 5);
+
+    v.erase(v.begin() + 2, v.end());
+
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+
+    v.erase(v.begin() + 0, v.begin() + 1);
+
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 3);
+
+    v.erase(v.begin(), v.end());
+
+    ASSERT_EQUAL(v.size(), 0lu);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorEraseRange);
+
+
+void TestVectorEquality(void)
+{
+    thrust::host_vector<int> h_a(3);
+    thrust::host_vector<int> h_b(3);
+    thrust::host_vector<int> h_c(3);
+    h_a[0] = 0;    h_a[1] = 1;    h_a[2] = 2;
+    h_b[0] = 0;    h_b[1] = 1;    h_b[2] = 3;
+    h_b[0] = 0;    h_b[1] = 1;
+
+    thrust::device_vector<int> d_a(3);
+    thrust::device_vector<int> d_b(3);
+    thrust::device_vector<int> d_c(3);
+    d_a[0] = 0;    d_a[1] = 1;    d_a[2] = 2;
+    d_b[0] = 0;    d_b[1] = 1;    d_b[2] = 3;
+    d_b[0] = 0;    d_b[1] = 1;
+
+    std::vector<int> s_a(3);
+    std::vector<int> s_b(3);
+    std::vector<int> s_c(3);
+    s_a[0] = 0;    s_a[1] = 1;    s_a[2] = 2;
+    s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
+    s_b[0] = 0;    s_b[1] = 1;
+
+    ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true);  ASSERT_EQUAL((d_a == d_a), true);
+    ASSERT_EQUAL((h_b == h_b), true); ASSERT_EQUAL((h_b == d_b), true); ASSERT_EQUAL((d_b == h_b), true);  ASSERT_EQUAL((d_b == d_b), true);
+    ASSERT_EQUAL((h_c == h_c), true); ASSERT_EQUAL((h_c == d_c), true); ASSERT_EQUAL((d_c == h_c), true);  ASSERT_EQUAL((d_c == d_c), true);
+
+    // test vector vs device_vector
+    ASSERT_EQUAL((s_a == d_a), true); ASSERT_EQUAL((d_a == s_a), true);
+    ASSERT_EQUAL((s_b == d_b), true); ASSERT_EQUAL((d_b == s_b), true);
+    ASSERT_EQUAL((s_c == d_c), true); ASSERT_EQUAL((d_c == s_c), true);
+
+    // test vector vs host_vector
+    ASSERT_EQUAL((s_a == h_a), true); ASSERT_EQUAL((h_a == s_a), true);
+    ASSERT_EQUAL((s_b == h_b), true); ASSERT_EQUAL((h_b == s_b), true);
+    ASSERT_EQUAL((s_c == h_c), true); ASSERT_EQUAL((h_c == s_c), true);
+
+    ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false);
+    ASSERT_EQUAL((h_b == h_a), false); ASSERT_EQUAL((h_b == d_a), false); ASSERT_EQUAL((d_b == h_a), false); ASSERT_EQUAL((d_b == d_a), false);
+    ASSERT_EQUAL((h_a == h_c), false); ASSERT_EQUAL((h_a == d_c), false); ASSERT_EQUAL((d_a == h_c), false); ASSERT_EQUAL((d_a == d_c), false);
+    ASSERT_EQUAL((h_c == h_a), false); ASSERT_EQUAL((h_c == d_a), false); ASSERT_EQUAL((d_c == h_a), false); ASSERT_EQUAL((d_c == d_a), false);
+    ASSERT_EQUAL((h_b == h_c), false); ASSERT_EQUAL((h_b == d_c), false); ASSERT_EQUAL((d_b == h_c), false); ASSERT_EQUAL((d_b == d_c), false);
+    ASSERT_EQUAL((h_c == h_b), false); ASSERT_EQUAL((h_c == d_b), false); ASSERT_EQUAL((d_c == h_b), false); ASSERT_EQUAL((d_c == d_b), false);
+
+    // test vector vs device_vector
+    ASSERT_EQUAL((s_a == d_b), false); ASSERT_EQUAL((d_a == s_b), false);
+    ASSERT_EQUAL((s_b == d_a), false); ASSERT_EQUAL((d_b == s_a), false);
+    ASSERT_EQUAL((s_a == d_c), false); ASSERT_EQUAL((d_a == s_c), false);
+    ASSERT_EQUAL((s_c == d_a), false); ASSERT_EQUAL((d_c == s_a), false);
+    ASSERT_EQUAL((s_b == d_c), false); ASSERT_EQUAL((d_b == s_c), false);
+    ASSERT_EQUAL((s_c == d_b), false); ASSERT_EQUAL((d_c == s_b), false);
+
+    // test vector vs host_vector
+    ASSERT_EQUAL((s_a == h_b), false); ASSERT_EQUAL((h_a == s_b), false);
+    ASSERT_EQUAL((s_b == h_a), false); ASSERT_EQUAL((h_b == s_a), false);
+    ASSERT_EQUAL((s_a == h_c), false); ASSERT_EQUAL((h_a == s_c), false);
+    ASSERT_EQUAL((s_c == h_a), false); ASSERT_EQUAL((h_c == s_a), false);
+    ASSERT_EQUAL((s_b == h_c), false); ASSERT_EQUAL((h_b == s_c), false);
+    ASSERT_EQUAL((s_c == h_b), false); ASSERT_EQUAL((h_c == s_b), false);
+}
+DECLARE_UNITTEST(TestVectorEquality);
+
+void TestVectorInequality(void)
+{
+    thrust::host_vector<int> h_a(3);
+    thrust::host_vector<int> h_b(3);
+    thrust::host_vector<int> h_c(3);
+    h_a[0] = 0;    h_a[1] = 1;    h_a[2] = 2;
+    h_b[0] = 0;    h_b[1] = 1;    h_b[2] = 3;
+    h_b[0] = 0;    h_b[1] = 1;
+
+    thrust::device_vector<int> d_a(3);
+    thrust::device_vector<int> d_b(3);
+    thrust::device_vector<int> d_c(3);
+    d_a[0] = 0;    d_a[1] = 1;    d_a[2] = 2;
+    d_b[0] = 0;    d_b[1] = 1;    d_b[2] = 3;
+    d_b[0] = 0;    d_b[1] = 1;
+
+    std::vector<int> s_a(3);
+    std::vector<int> s_b(3);
+    std::vector<int> s_c(3);
+    s_a[0] = 0;    s_a[1] = 1;    s_a[2] = 2;
+    s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
+    s_b[0] = 0;    s_b[1] = 1;
+
+    ASSERT_EQUAL((h_a != h_a), false); ASSERT_EQUAL((h_a != d_a), false); ASSERT_EQUAL((d_a != h_a), false);  ASSERT_EQUAL((d_a != d_a), false);
+    ASSERT_EQUAL((h_b != h_b), false); ASSERT_EQUAL((h_b != d_b), false); ASSERT_EQUAL((d_b != h_b), false);  ASSERT_EQUAL((d_b != d_b), false);
+    ASSERT_EQUAL((h_c != h_c), false); ASSERT_EQUAL((h_c != d_c), false); ASSERT_EQUAL((d_c != h_c), false);  ASSERT_EQUAL((d_c != d_c), false);
+
+    // test vector vs device_vector
+    ASSERT_EQUAL((s_a != d_a), false); ASSERT_EQUAL((d_a != s_a), false);
+    ASSERT_EQUAL((s_b != d_b), false); ASSERT_EQUAL((d_b != s_b), false);
+    ASSERT_EQUAL((s_c != d_c), false); ASSERT_EQUAL((d_c != s_c), false);
+
+    // test vector vs host_vector
+    ASSERT_EQUAL((s_a != h_a), false); ASSERT_EQUAL((h_a != s_a), false);
+    ASSERT_EQUAL((s_b != h_b), false); ASSERT_EQUAL((h_b != s_b), false);
+    ASSERT_EQUAL((s_c != h_c), false); ASSERT_EQUAL((h_c != s_c), false);
+
+    ASSERT_EQUAL((h_a != h_b), true); ASSERT_EQUAL((h_a != d_b), true); ASSERT_EQUAL((d_a != h_b), true); ASSERT_EQUAL((d_a != d_b), true);
+    ASSERT_EQUAL((h_b != h_a), true); ASSERT_EQUAL((h_b != d_a), true); ASSERT_EQUAL((d_b != h_a), true); ASSERT_EQUAL((d_b != d_a), true);
+    ASSERT_EQUAL((h_a != h_c), true); ASSERT_EQUAL((h_a != d_c), true); ASSERT_EQUAL((d_a != h_c), true); ASSERT_EQUAL((d_a != d_c), true);
+    ASSERT_EQUAL((h_c != h_a), true); ASSERT_EQUAL((h_c != d_a), true); ASSERT_EQUAL((d_c != h_a), true); ASSERT_EQUAL((d_c != d_a), true);
+    ASSERT_EQUAL((h_b != h_c), true); ASSERT_EQUAL((h_b != d_c), true); ASSERT_EQUAL((d_b != h_c), true); ASSERT_EQUAL((d_b != d_c), true);
+    ASSERT_EQUAL((h_c != h_b), true); ASSERT_EQUAL((h_c != d_b), true); ASSERT_EQUAL((d_c != h_b), true); ASSERT_EQUAL((d_c != d_b), true);
+
+    // test vector vs device_vector
+    ASSERT_EQUAL((s_a != d_b), true); ASSERT_EQUAL((d_a != s_b), true);
+    ASSERT_EQUAL((s_b != d_a), true); ASSERT_EQUAL((d_b != s_a), true);
+    ASSERT_EQUAL((s_a != d_c), true); ASSERT_EQUAL((d_a != s_c), true);
+    ASSERT_EQUAL((s_c != d_a), true); ASSERT_EQUAL((d_c != s_a), true);
+    ASSERT_EQUAL((s_b != d_c), true); ASSERT_EQUAL((d_b != s_c), true);
+    ASSERT_EQUAL((s_c != d_b), true); ASSERT_EQUAL((d_c != s_b), true);
+
+    // test vector vs host_vector
+    ASSERT_EQUAL((s_a != h_b), true); ASSERT_EQUAL((h_a != s_b), true);
+    ASSERT_EQUAL((s_b != h_a), true); ASSERT_EQUAL((h_b != s_a), true);
+    ASSERT_EQUAL((s_a != h_c), true); ASSERT_EQUAL((h_a != s_c), true);
+    ASSERT_EQUAL((s_c != h_a), true); ASSERT_EQUAL((h_c != s_a), true);
+    ASSERT_EQUAL((s_b != h_c), true); ASSERT_EQUAL((h_b != s_c), true);
+    ASSERT_EQUAL((s_c != h_b), true); ASSERT_EQUAL((h_c != s_b), true);
+}
+DECLARE_UNITTEST(TestVectorInequality);
+
+
+template <class Vector>
+void TestVectorResizing(void)
+{
+    Vector v;
+
+    v.resize(3);
+
+    ASSERT_EQUAL(v.size(), 3lu);
+
+    v[0] = 0; v[1] = 1; v[2] = 2;
+
+    v.resize(5);
+
+    ASSERT_EQUAL(v.size(), 5lu);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+
+    v[3] = 3; v[4] = 4;
+
+    v.resize(4);
+
+    ASSERT_EQUAL(v.size(), 4lu);
+
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 2);
+    ASSERT_EQUAL(v[3], 3);
+
+    v.resize(0);
+
+    ASSERT_EQUAL(v.size(), 0lu);
+
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
+    // depending on sizeof(T), we will receive one
+    // of two possible exceptions
+    try
+    {
+      v.resize(std::numeric_limits<size_t>::max());
+    }
+    catch(std::length_error e) {}
+    catch(std::bad_alloc e)
+    {
+      // reset the CUDA error
+      cudaGetLastError();
+    } // end catch
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
+
+    ASSERT_EQUAL(v.size(), 0lu);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorResizing);
+
+
+
+template <class Vector>
+void TestVectorReserving(void)
+{
+    Vector v;
+
+    v.reserve(3);
+
+    ASSERT_GEQUAL(v.capacity(), 3lu);
+
+    size_t old_capacity = v.capacity();
+
+    v.reserve(0);
+
+    ASSERT_EQUAL(v.capacity(), old_capacity);
+
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
+    try
+    {
+      v.reserve(std::numeric_limits<size_t>::max());
+    }
+    catch(std::length_error e) {}
+    catch(std::bad_alloc e) {}
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
+
+    ASSERT_EQUAL(v.capacity(), old_capacity);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorReserving)
+
+
+
+template <class Vector>
+void TestVectorShrinkToFit(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector v;
+
+    v.reserve(200);
+
+    ASSERT_GEQUAL(v.capacity(), 200lu);
+
+    v.push_back(1);
+    v.push_back(2);
+    v.push_back(3);
+
+    v.shrink_to_fit();
+
+    ASSERT_EQUAL(T(1), v[0]);
+    ASSERT_EQUAL(T(2), v[1]);
+    ASSERT_EQUAL(T(3), v[2]);
+    ASSERT_EQUAL(3lu, v.size());
+    ASSERT_EQUAL(3lu, v.capacity());
+}
+DECLARE_VECTOR_UNITTEST(TestVectorShrinkToFit)
+
+template <int N>
+struct LargeStruct
+{
+  int data[N];
+
+  __host__ __device__
+  bool operator==(const LargeStruct & ls) const
+  {
+    for (int i = 0; i < N; i++)
+      if (data[i] != ls.data[i])
+        return false;
+    return true;
+  }
+};
+
+void TestVectorContainingLargeType(void)
+{
+    // Thrust issue #5
+    // http://code.google.com/p/thrust/issues/detail?id=5
+    const static int N = 100;
+    typedef LargeStruct<N> T;
+
+    thrust::device_vector<T> dv1;
+    thrust::host_vector<T>   hv1;
+
+    ASSERT_EQUAL_QUIET(dv1, hv1);
+
+    thrust::device_vector<T> dv2(20);
+    thrust::host_vector<T>   hv2(20);
+
+    ASSERT_EQUAL_QUIET(dv2, hv2);
+
+    // initialize tofirst element to something nonzero
+    T ls;
+
+    for (int i = 0; i < N; i++)
+      ls.data[i] = i;
+
+    thrust::device_vector<T> dv3(20, ls);
+    thrust::host_vector<T>   hv3(20, ls);
+
+    ASSERT_EQUAL_QUIET(dv3, hv3);
+
+    // change first element
+    ls.data[0] = -13;
+
+    dv3[2] = ls;
+    hv3[2] = ls;
+
+    ASSERT_EQUAL_QUIET(dv3, hv3);
+}
+DECLARE_UNITTEST(TestVectorContainingLargeType);
+
+
+template <typename Vector>
+void TestVectorReversed(void)
+{
+  Vector v(3);
+  v[0] = 0; v[1] = 1; v[2] = 2;
+
+  ASSERT_EQUAL(3, v.rend() - v.rbegin());
+  ASSERT_EQUAL(3, static_cast<const Vector&>(v).rend() - static_cast<const Vector&>(v).rbegin());
+  ASSERT_EQUAL(3, v.crend() - v.crbegin());
+
+  ASSERT_EQUAL(2, *v.rbegin());
+  ASSERT_EQUAL(2, *static_cast<const Vector&>(v).rbegin());
+  ASSERT_EQUAL(2, *v.crbegin());
+
+  ASSERT_EQUAL(1, *(v.rbegin() + 1));
+  ASSERT_EQUAL(0, *(v.rbegin() + 2));
+
+  ASSERT_EQUAL(0, *(v.rend() - 1));
+  ASSERT_EQUAL(1, *(v.rend() - 2));
+}
+DECLARE_VECTOR_UNITTEST(TestVectorReversed);
+
+#if THRUST_CPP_DIALECT >= 2011
+  template <class Vector>
+  void TestVectorMove(void)
+  {
+    //test move construction
+    Vector v1(3);
+    v1[0] = 0; v1[1] = 1; v1[2] = 2;
+
+    const auto ptr1 = v1.data();
+    const auto size1 = v1.size();
+
+    Vector v2(std::move(v1));
+    const auto ptr2 = v2.data();
+    const auto size2 = v2.size();
+
+    // ensure v1 was left empty
+    ASSERT_EQUAL(true, v1.empty());
+
+    // ensure v2 received the data from before
+    ASSERT_EQUAL(v2[0], 0);
+    ASSERT_EQUAL(v2[1], 1);
+    ASSERT_EQUAL(v2[2], 2);
+    ASSERT_EQUAL(size1, size2);
+
+    // ensure v2 received the pointer from before
+    ASSERT_EQUAL(ptr1, ptr2);
+
+    //test move assignment
+    Vector v3(3);
+    v3[0] = 3; v3[1] = 4; v3[2] = 5;
+
+    const auto ptr3 = v3.data();
+    const auto size3 = v3.size();
+
+    v2 = std::move(v3);
+    const auto ptr4 = v2.data();
+    const auto size4 = v2.size();
+
+    // ensure v3 was left empty
+    ASSERT_EQUAL(true, v3.empty());
+
+    // ensure v2 received the data from before
+    ASSERT_EQUAL(v2[0], 3);
+    ASSERT_EQUAL(v2[1], 4);
+    ASSERT_EQUAL(v2[2], 5);
+    ASSERT_EQUAL(size3, size4);
+
+    // ensure v2 received the pointer from before
+    ASSERT_EQUAL(ptr3, ptr4);
+  }
+  DECLARE_VECTOR_UNITTEST(TestVectorMove);
+#endif
+
diff --git a/thrust/testing/vector_allocators.cu b/thrust/testing/vector_allocators.cu
new file mode 100644
index 0000000000000000000000000000000000000000..568ea7ff672300d3aea312f53b060cf08461240c
--- /dev/null
+++ b/thrust/testing/vector_allocators.cu
@@ -0,0 +1,276 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+class stateful_allocator : public BaseAlloc
+{
+  typedef thrust::detail::allocator_traits<BaseAlloc> base_traits;
+
+public:
+    stateful_allocator(int i) : state(i)
+    {
+    }
+
+    ~stateful_allocator() {}
+
+    stateful_allocator(const stateful_allocator &other)
+        : BaseAlloc(other), state(other.state)
+    {
+    }
+
+    stateful_allocator & operator=(const stateful_allocator & other)
+    {
+        state = other.state;
+        return *this;
+    }
+
+#if THRUST_CPP_DIALECT >= 2011
+    stateful_allocator(stateful_allocator && other)
+        : BaseAlloc(std::move(other)), state(other.state)
+    {
+        other.state = 0;
+    }
+
+    stateful_allocator & operator=(stateful_allocator && other)
+    {
+        state = other.state;
+        other.state = 0;
+        return *this;
+    }
+#endif
+
+    static int last_allocated;
+    static int last_deallocated;
+
+    typedef typename base_traits::pointer pointer;
+    typedef typename base_traits::const_pointer const_pointer;
+    typedef typename base_traits::reference reference;
+    typedef typename base_traits::const_reference const_reference;
+
+    pointer allocate(std::size_t size)
+    {
+        BaseAlloc alloc;
+        last_allocated = state;
+        return base_traits::allocate(alloc, size);
+    }
+
+    void deallocate(pointer ptr, std::size_t size)
+    {
+        BaseAlloc alloc;
+        last_deallocated = state;
+        return base_traits::deallocate(alloc, ptr, size);
+    }
+
+    static void construct(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::construct(alloc, ptr);
+    }
+
+    static void destroy(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::destroy(alloc, ptr);
+    }
+
+    bool operator==(const stateful_allocator &rhs) const
+    {
+        return state == rhs.state;
+    }
+
+    bool operator!=(const stateful_allocator &rhs) const
+    {
+        return state != rhs.state;
+    }
+
+    friend std::ostream & operator<<(std::ostream &os,
+        const stateful_allocator & alloc)
+    {
+        os << "stateful_alloc(" << alloc.state << ")";
+        return os;
+    }
+
+    typedef thrust::detail::false_type is_always_equal;
+    typedef thrust::detail::true_type propagate_on_container_copy_assignment;
+    typedef thrust::detail::true_type propagate_on_container_move_assignment;
+    typedef thrust::detail::integral_constant<bool, PropagateOnSwap> propagate_on_container_swap;
+
+private:
+    int state;
+};
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+int stateful_allocator<BaseAlloc, PropagateOnSwap>::last_allocated = 0;
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+int stateful_allocator<BaseAlloc, PropagateOnSwap>::last_deallocated = 0;
+
+typedef stateful_allocator<std::allocator<int>, true> host_alloc;
+typedef stateful_allocator<thrust::device_allocator<int>, true> device_alloc;
+
+typedef thrust::host_vector<int, host_alloc> host_vector;
+typedef thrust::device_vector<int, device_alloc> device_vector;
+
+typedef stateful_allocator<std::allocator<int>, false> host_alloc_nsp;
+typedef stateful_allocator<thrust::device_allocator<int>, false> device_alloc_nsp;
+
+typedef thrust::host_vector<int, host_alloc_nsp> host_vector_nsp;
+typedef thrust::device_vector<int, device_alloc_nsp> device_vector_nsp;
+
+template<typename Vector>
+void TestVectorAllocatorConstructors()
+{
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(alloc1);
+    ASSERT_EQUAL(v1.get_allocator(), alloc1);
+
+    Vector v2(10, alloc1);
+    ASSERT_EQUAL(v2.size(), 10u);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    Alloc::last_allocated = 0;
+
+    Vector v3(10, 17, alloc1);
+    ASSERT_EQUAL((v3 == std::vector<int>(10, 17)), true);
+    ASSERT_EQUAL(v3.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    Alloc::last_allocated = 0;
+
+    Vector v4(v3, alloc2);
+    ASSERT_EQUAL((v3 == v4), true);
+    ASSERT_EQUAL(v4.get_allocator(), alloc2);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+    Alloc::last_allocated = 0;
+
+#if THRUST_CPP_DIALECT >= 2011
+    // FIXME: uncomment this after the vector_base(vector_base&&, const Alloc&)
+    // is fixed and implemented
+    // Vector v5(std::move(v3), alloc2);
+    // ASSERT_EQUAL((v4 == v5), true);
+    // ASSERT_EQUAL(v5.get_allocator(), alloc2);
+    // ASSERT_EQUAL(Alloc::last_allocated, 1);
+    // Alloc::last_allocated = 0;
+#endif
+
+    Vector v6(v4.begin(), v4.end(), alloc2);
+    ASSERT_EQUAL((v4 == v6), true);
+    ASSERT_EQUAL(v6.get_allocator(), alloc2);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+}
+
+void TestVectorAllocatorConstructorsHost()
+{
+    TestVectorAllocatorConstructors<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorConstructorsHost);
+
+void TestVectorAllocatorConstructorsDevice()
+{
+    TestVectorAllocatorConstructors<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorConstructorsDevice);
+
+template<typename Vector>
+void TestVectorAllocatorPropagateOnCopyAssignment()
+{
+    ASSERT_EQUAL(thrust::detail::allocator_traits<typename Vector::allocator_type>::propagate_on_container_copy_assignment::value, true);
+
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(10, alloc1);
+    Vector v2(15, alloc2);
+
+    v2 = v1;
+    ASSERT_EQUAL((v1 == v2), true);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    ASSERT_EQUAL(Alloc::last_deallocated, 2);
+}
+
+void TestVectorAllocatorPropagateOnCopyAssignmentHost()
+{
+    TestVectorAllocatorPropagateOnCopyAssignment<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnCopyAssignmentHost);
+
+void TestVectorAllocatorPropagateOnCopyAssignmentDevice()
+{
+    TestVectorAllocatorPropagateOnCopyAssignment<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnCopyAssignmentDevice);
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename Vector>
+void TestVectorAllocatorPropagateOnMoveAssignment()
+{
+    typedef typename Vector::allocator_type Alloc;
+    ASSERT_EQUAL(thrust::detail::allocator_traits<typename Vector::allocator_type>::propagate_on_container_copy_assignment::value, true);
+
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    {
+    Vector v1(10, alloc1);
+    Vector v2(15, alloc2);
+
+    v2 = std::move(v1);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+    ASSERT_EQUAL(Alloc::last_deallocated, 2);
+    }
+
+    ASSERT_EQUAL(Alloc::last_deallocated, 1);
+}
+
+void TestVectorAllocatorPropagateOnMoveAssignmentHost()
+{
+    TestVectorAllocatorPropagateOnMoveAssignment<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnMoveAssignmentHost);
+
+void TestVectorAllocatorPropagateOnMoveAssignmentDevice()
+{
+    TestVectorAllocatorPropagateOnMoveAssignment<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnMoveAssignmentDevice);
+#endif
+
+template<typename Vector>
+void TestVectorAllocatorPropagateOnSwap()
+{
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(10, alloc1);
+    Vector v2(17, alloc1);
+    thrust::swap(v1, v2);
+
+    ASSERT_EQUAL(v1.size(), 17u);
+    ASSERT_EQUAL(v2.size(), 10u);
+
+    Vector v3(15, alloc1);
+    Vector v4(31, alloc2);
+    ASSERT_THROWS(thrust::swap(v3, v4), thrust::detail::allocator_mismatch_on_swap);
+}
+
+void TestVectorAllocatorPropagateOnSwapHost()
+{
+    TestVectorAllocatorPropagateOnSwap<host_vector_nsp>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnSwapHost);
+
+void TestVectorAllocatorPropagateOnSwapDevice()
+{
+    TestVectorAllocatorPropagateOnSwap<device_vector_nsp>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnSwapDevice);
diff --git a/thrust/testing/vector_insert.cu b/thrust/testing/vector_insert.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a9f674aa000f13a80c61af2a4667f40c81ac8743
--- /dev/null
+++ b/thrust/testing/vector_insert.cu
@@ -0,0 +1,330 @@
+#include <unittest/unittest.h>
+#include <thrust/sequence.h>
+#include <thrust/device_malloc_allocator.h>
+
+template <class Vector>
+struct TestVectorRangeInsertSimple
+{
+    void operator()(size_t)
+    {
+        typedef typename Vector::value_type T;
+
+        Vector v1(5);
+        thrust::sequence(v1.begin(), v1.end());
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is greater than the number
+        // of displaced elements
+        Vector v2(3);
+        v2.reserve(10);
+        thrust::sequence(v2.begin(), v2.end());
+
+        size_t new_size = v2.size() + v1.size();
+        size_t insertion_size = v1.end() - v1.begin();
+        size_t num_displaced = v2.end() - (v2.begin() + 1);
+
+        ASSERT_EQUAL(true, v2.capacity()   >= new_size);
+        ASSERT_EQUAL(true, insertion_size  >  num_displaced);
+
+        v2.insert(v2.begin() + 1,
+                  v1.begin(), v1.end());
+
+        ASSERT_EQUAL(T(0), v2[0]);
+
+        ASSERT_EQUAL(T(0), v2[1]);
+        ASSERT_EQUAL(T(1), v2[2]);
+        ASSERT_EQUAL(T(2), v2[3]);
+        ASSERT_EQUAL(T(3), v2[4]);
+        ASSERT_EQUAL(T(4), v2[5]);
+
+        ASSERT_EQUAL(T(1), v2[6]);
+        ASSERT_EQUAL(T(2), v2[7]);
+        
+        ASSERT_EQUAL(8lu,  v2.size());
+        ASSERT_EQUAL(10lu, v2.capacity());
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is equal to the number
+        // of displaced elements
+        Vector v3(5);
+        v3.reserve(10);
+        thrust::sequence(v3.begin(), v3.end());
+
+        new_size = v3.size() + v1.size();
+        insertion_size = v1.end() - v1.begin();
+        num_displaced = v3.end() - v3.begin();
+
+        ASSERT_EQUAL(true, v3.capacity()   >=  new_size);
+        ASSERT_EQUAL(true, insertion_size  ==  num_displaced);
+
+        v3.insert(v3.begin(),
+                  v1.begin(), v1.end());
+
+        ASSERT_EQUAL(T(0), v3[0]);
+        ASSERT_EQUAL(T(1), v3[1]);
+        ASSERT_EQUAL(T(2), v3[2]);
+        ASSERT_EQUAL(T(3), v3[3]);
+        ASSERT_EQUAL(T(4), v3[4]);
+
+        ASSERT_EQUAL(T(0), v3[5]);
+        ASSERT_EQUAL(T(1), v3[6]);
+        ASSERT_EQUAL(T(2), v3[7]);
+        ASSERT_EQUAL(T(3), v3[8]);
+        ASSERT_EQUAL(T(4), v3[9]);
+
+        ASSERT_EQUAL(10lu, v3.size());
+        ASSERT_EQUAL(10lu, v3.capacity());
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is less than the
+        // number of displaced elements
+        Vector v4(5);
+        v4.reserve(10);
+        thrust::sequence(v4.begin(), v4.end());
+
+        new_size = v4.size() + v1.size();
+        insertion_size = (v1.begin() + 3) - v1.begin();
+        num_displaced = v4.end() - (v4.begin() + 1);
+
+        ASSERT_EQUAL(true, v4.capacity()   >=  new_size);
+        ASSERT_EQUAL(true, insertion_size  <   num_displaced);
+
+        v4.insert(v4.begin() + 1,
+                  v1.begin(), v1.begin() + 3);
+
+        ASSERT_EQUAL(T(0), v4[0]);
+
+        ASSERT_EQUAL(T(0), v4[1]);
+        ASSERT_EQUAL(T(1), v4[2]);
+        ASSERT_EQUAL(T(2), v4[3]);
+
+        ASSERT_EQUAL(T(1), v4[4]);
+        ASSERT_EQUAL(T(2), v4[5]);
+        ASSERT_EQUAL(T(3), v4[6]);
+        ASSERT_EQUAL(T(4), v4[7]);
+
+        ASSERT_EQUAL(8lu, v4.size());
+        ASSERT_EQUAL(10lu, v4.capacity());
+
+        // test when insertion range does not fit inside capacity
+        Vector v5(5);
+        thrust::sequence(v5.begin(), v5.end());
+
+        new_size = v5.size() + v1.size();
+
+        ASSERT_EQUAL(true, v5.capacity() < new_size);
+
+        v5.insert(v5.begin() + 1,
+                  v1.begin(), v1.end());
+
+        ASSERT_EQUAL(T(0), v5[0]);
+
+        ASSERT_EQUAL(T(0), v5[1]);
+        ASSERT_EQUAL(T(1), v5[2]);
+        ASSERT_EQUAL(T(2), v5[3]);
+        ASSERT_EQUAL(T(3), v5[4]);
+        ASSERT_EQUAL(T(4), v5[5]);
+
+        ASSERT_EQUAL(T(1), v5[6]);
+        ASSERT_EQUAL(T(2), v5[7]);
+        ASSERT_EQUAL(T(3), v5[8]);
+        ASSERT_EQUAL(T(4), v5[9]);
+
+        ASSERT_EQUAL(10lu, v5.size());
+    }
+}; // end TestVectorRangeInsertSimple
+VectorUnitTest<TestVectorRangeInsertSimple, NumericTypes, thrust::device_vector, thrust::device_malloc_allocator> TestVectorRangeInsertSimpleDeviceInstance;
+VectorUnitTest<TestVectorRangeInsertSimple, NumericTypes, thrust::host_vector,   std::allocator>                  TestVectorRangeInsertSimpleHostInstance;
+
+template <class T>
+struct TestVectorRangeInsert
+{
+    void operator()(size_t n)
+    {
+        thrust::host_vector<T>   h_src = unittest::random_samples<T>(n + 3);
+        thrust::host_vector<T>   h_dst = unittest::random_samples<T>(n);
+
+        thrust::device_vector<T> d_src = h_src;
+        thrust::device_vector<T> d_dst = h_dst;
+
+        // choose insertion range at random
+        size_t begin = n > 0 ? (size_t)h_src[n]   % n : 0;
+        size_t end   = n > 0 ? (size_t)h_src[n+1] % n : 0;
+        if(end < begin) thrust::swap(begin,end);
+
+        // choose insertion position at random
+        size_t position = n > 0 ? (size_t)h_src[n+2] % n : 0;
+
+        // insert on host
+        h_dst.insert(h_dst.begin() + position,
+                     h_src.begin() + begin,
+                     h_src.begin() + end);
+
+        // insert on device
+        d_dst.insert(d_dst.begin() + position,
+                     d_src.begin() + begin,
+                     d_src.begin() + end);
+
+        ASSERT_EQUAL(h_dst, d_dst);
+    }
+}; // end TestVectorRangeInsert
+VariableUnitTest<TestVectorRangeInsert, IntegralTypes> TestVectorRangeInsertInstance;
+
+template <class Vector>
+struct TestVectorFillInsertSimple
+{
+    void operator()(size_t)
+    {
+        typedef typename Vector::value_type T;
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is greater than the number
+        // of displaced elements
+        Vector v1(3);
+        v1.reserve(10);
+        thrust::sequence(v1.begin(), v1.end());
+
+        size_t insertion_size = 5;
+        size_t new_size = v1.size() + insertion_size;
+        size_t num_displaced = v1.end() - (v1.begin() + 1);
+
+        ASSERT_EQUAL(true, v1.capacity()   >= new_size);
+        ASSERT_EQUAL(true, insertion_size  >  num_displaced);
+
+        v1.insert(v1.begin() + 1, insertion_size, 13);
+
+        ASSERT_EQUAL(T(0), v1[0]);
+
+        ASSERT_EQUAL(T(13), v1[1]);
+        ASSERT_EQUAL(T(13), v1[2]);
+        ASSERT_EQUAL(T(13), v1[3]);
+        ASSERT_EQUAL(T(13), v1[4]);
+        ASSERT_EQUAL(T(13), v1[5]);
+
+        ASSERT_EQUAL(T(1), v1[6]);
+        ASSERT_EQUAL(T(2), v1[7]);
+        
+        ASSERT_EQUAL(8lu,  v1.size());
+        ASSERT_EQUAL(10lu, v1.capacity());
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is equal to the number
+        // of displaced elements
+        Vector v2(5);
+        v2.reserve(10);
+        thrust::sequence(v2.begin(), v2.end());
+
+        insertion_size = 5;
+        new_size = v2.size() + insertion_size;
+        num_displaced = v2.end() - v2.begin();
+
+        ASSERT_EQUAL(true, v2.capacity()   >=  new_size);
+        ASSERT_EQUAL(true, insertion_size  ==  num_displaced);
+
+        v2.insert(v2.begin(), insertion_size, 13);
+
+        ASSERT_EQUAL(T(13), v2[0]);
+        ASSERT_EQUAL(T(13), v2[1]);
+        ASSERT_EQUAL(T(13), v2[2]);
+        ASSERT_EQUAL(T(13), v2[3]);
+        ASSERT_EQUAL(T(13), v2[4]);
+
+        ASSERT_EQUAL(T(0), v2[5]);
+        ASSERT_EQUAL(T(1), v2[6]);
+        ASSERT_EQUAL(T(2), v2[7]);
+        ASSERT_EQUAL(T(3), v2[8]);
+        ASSERT_EQUAL(T(4), v2[9]);
+
+        ASSERT_EQUAL(10lu, v2.size());
+        ASSERT_EQUAL(10lu, v2.capacity());
+
+        // test when insertion range fits inside capacity
+        // and the size of the insertion is less than the
+        // number of displaced elements
+        Vector v3(5);
+        v3.reserve(10);
+        thrust::sequence(v3.begin(), v3.end());
+
+        insertion_size = 3;
+        new_size = v3.size() + insertion_size;
+        num_displaced = v3.end() - (v3.begin() + 1);
+
+        ASSERT_EQUAL(true, v3.capacity()   >=  new_size);
+        ASSERT_EQUAL(true, insertion_size  <   num_displaced);
+
+        v3.insert(v3.begin() + 1, insertion_size, 13);
+
+        ASSERT_EQUAL(T(0), v3[0]);
+
+        ASSERT_EQUAL(T(13), v3[1]);
+        ASSERT_EQUAL(T(13), v3[2]);
+        ASSERT_EQUAL(T(13), v3[3]);
+
+        ASSERT_EQUAL(T(1), v3[4]);
+        ASSERT_EQUAL(T(2), v3[5]);
+        ASSERT_EQUAL(T(3), v3[6]);
+        ASSERT_EQUAL(T(4), v3[7]);
+
+        ASSERT_EQUAL(8lu, v3.size());
+        ASSERT_EQUAL(10lu, v3.capacity());
+
+        // test when insertion range does not fit inside capacity
+        Vector v4(5);
+        thrust::sequence(v4.begin(), v4.end());
+
+        insertion_size = 5;
+        new_size = v4.size() + insertion_size;
+
+        ASSERT_EQUAL(true, v4.capacity() < new_size);
+
+        v4.insert(v4.begin() + 1, insertion_size, 13);
+
+        ASSERT_EQUAL(T(0), v4[0]);
+
+        ASSERT_EQUAL(T(13), v4[1]);
+        ASSERT_EQUAL(T(13), v4[2]);
+        ASSERT_EQUAL(T(13), v4[3]);
+        ASSERT_EQUAL(T(13), v4[4]);
+        ASSERT_EQUAL(T(13), v4[5]);
+
+        ASSERT_EQUAL(T(1), v4[6]);
+        ASSERT_EQUAL(T(2), v4[7]);
+        ASSERT_EQUAL(T(3), v4[8]);
+        ASSERT_EQUAL(T(4), v4[9]);
+
+        ASSERT_EQUAL(10lu, v4.size());
+    }
+}; // end TestVectorFillInsertSimple
+VectorUnitTest<TestVectorFillInsertSimple, NumericTypes, thrust::device_vector, thrust::device_malloc_allocator> TestVectorFillInsertSimpleDeviceInstance;
+VectorUnitTest<TestVectorFillInsertSimple, NumericTypes, thrust::host_vector,   std::allocator>                  TestVectorFillInsertSimpleHostInstance;
+
+template <class T>
+struct TestVectorFillInsert
+{
+    void operator()(size_t n)
+    {
+        thrust::host_vector<T>   h_dst = unittest::random_samples<T>(n + 2);
+
+        thrust::device_vector<T> d_dst = h_dst;
+
+        // choose insertion position at random
+        size_t position = n > 0 ? (size_t)h_dst[n] % n : 0;
+
+        // choose insertion size at random
+        size_t insertion_size = n > 0 ? (size_t)h_dst[n] % n : 13;
+
+        // insert on host
+        h_dst.insert(h_dst.begin() + position,
+                     insertion_size,
+                     13);
+
+        // insert on device
+        d_dst.insert(d_dst.begin() + position,
+                     insertion_size,
+                     13);
+
+        ASSERT_EQUAL(h_dst, d_dst);
+    }
+}; // end TestVectorFillInsert
+VariableUnitTest<TestVectorFillInsert, IntegralTypes> TestVectorFillInsertInstance;
+
diff --git a/thrust/testing/vector_manipulation.cu b/thrust/testing/vector_manipulation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a949b154e8029c1f287f43d22c00744babebf64b
--- /dev/null
+++ b/thrust/testing/vector_manipulation.cu
@@ -0,0 +1,102 @@
+#include <unittest/unittest.h>
+#include <thrust/device_malloc_allocator.h>
+#include <vector>
+
+template <class Vector>
+void TestVectorManipulation(size_t n)
+{
+    typedef typename Vector::iterator   Iterator;
+    typedef typename Vector::value_type T;
+
+    thrust::host_vector<T> src = unittest::random_samples<T>(n);
+    ASSERT_EQUAL(src.size(), n);
+
+    // basic initialization
+    Vector test0(n);
+    Vector test1(n, T(3));
+    ASSERT_EQUAL(test0.size(), n);
+    ASSERT_EQUAL(test1.size(), n);
+    ASSERT_EQUAL((test1 == std::vector<T>(n, T(3))), true);
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+    // XXX MSVC 2005's STL unintentionally uses adl to dispatch advance which
+    //     produces an ambiguity between std::advance & thrust::advance
+    //     don't produce a KNOWN_FAILURE, just ignore the issue
+#else
+    // initializing from other vector
+    std::vector<T> stl_vector(src.begin(), src.end());
+    Vector cpy0 = src;
+    Vector cpy1(stl_vector);
+    Vector cpy2(stl_vector.begin(), stl_vector.end());
+    ASSERT_EQUAL(cpy0, src);
+    ASSERT_EQUAL(cpy1, src);
+    ASSERT_EQUAL(cpy2, src);
+#endif
+
+    // resizing
+    Vector vec1(src);
+    vec1.resize(n + 3);
+    ASSERT_EQUAL(vec1.size(), n + 3);
+    vec1.resize(n);
+    ASSERT_EQUAL(vec1.size(), n);
+    ASSERT_EQUAL(vec1, src); 
+    
+    vec1.resize(n + 20, T(11));
+    Vector tail(vec1.begin() + n, vec1.end());
+    ASSERT_EQUAL((tail == std::vector<T>(20, T(11))), true);
+
+    // shrinking a vector should not invalidate iterators
+    Iterator first = vec1.begin();
+    vec1.resize(10);
+    ASSERT_EQUAL_QUIET(first, vec1.begin());
+
+    vec1.resize(0);
+    ASSERT_EQUAL(vec1.size(), 0lu);
+    ASSERT_EQUAL(vec1.empty(), true);
+    vec1.resize(10);
+    ASSERT_EQUAL(vec1.size(), 10lu);
+    vec1.clear();
+    ASSERT_EQUAL(vec1.size(), 0lu);
+    vec1.resize(5);
+    ASSERT_EQUAL(vec1.size(), 5lu);
+
+    // push_back
+    Vector vec2;
+    for(size_t i = 0; i < 10; ++i)
+    {
+        ASSERT_EQUAL(vec2.size(), i);
+        vec2.push_back(T(i));
+        ASSERT_EQUAL(vec2.size(), i + 1);
+        for(size_t j = 0; j <= i; j++)
+            ASSERT_EQUAL(vec2[j], T(j));
+        ASSERT_EQUAL(vec2.back(), T(i));
+    }
+
+    // pop_back
+    for(size_t i = 10; i > 0; --i)
+    {
+        ASSERT_EQUAL(vec2.size(), i);
+        ASSERT_EQUAL(vec2.back(), T(i - 1));
+        vec2.pop_back();
+        ASSERT_EQUAL(vec2.size(), i - 1);
+        for(size_t j = 0; j < i; j++)
+            ASSERT_EQUAL(vec2[j], T(j));
+    }
+
+    //TODO test swap, erase(pos), erase(begin, end)
+}
+
+template <typename T>
+void TestVectorManipulationHost(size_t n)
+{
+    TestVectorManipulation< thrust::host_vector<T> >(n);
+}
+DECLARE_VARIABLE_UNITTEST(TestVectorManipulationHost);
+
+template <typename T>
+void TestVectorManipulationDevice(size_t n)
+{
+    TestVectorManipulation< thrust::device_vector<T> >(n);
+}
+DECLARE_VARIABLE_UNITTEST(TestVectorManipulationDevice);
+
diff --git a/thrust/testing/zip_function.cu b/thrust/testing/zip_function.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a1545a1a1fbd33665ba27922bd38d97c69199dc7
--- /dev/null
+++ b/thrust/testing/zip_function.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/zip_function.h>
+
+#include <iostream>
+
+using namespace unittest;
+
+struct SumThree
+{
+  template <typename T1, typename T2, typename T3>
+  __host__ __device__
+  auto operator()(T1 x, T2 y, T3 z) const
+  THRUST_DECLTYPE_RETURNS(x + y + z)
+}; // end SumThree
+
+struct SumThreeTuple
+{
+  template <typename Tuple>
+  __host__ __device__
+  auto operator()(Tuple x) const
+  THRUST_DECLTYPE_RETURNS(thrust::get<0>(x) + thrust::get<1>(x) + thrust::get<2>(x))
+}; // end SumThreeTuple
+
+template <typename T>
+struct TestZipFunctionTransform
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+    host_vector<T> h_data2 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+    device_vector<T> d_data2 = h_data2;
+
+    host_vector<T>   h_result_tuple(n);
+    host_vector<T>   h_result_zip(n);
+    device_vector<T> d_result_zip(n);
+
+    // Tuple base case
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_tuple.begin(),
+              SumThreeTuple{});
+    // Zip Function
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_zip.begin(),
+              make_zip_function(SumThree{}));
+    transform(make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin(), d_data2.begin())),
+              make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end(),   d_data2.end())),
+              d_result_zip.begin(),
+              make_zip_function(SumThree{}));
+
+    ASSERT_EQUAL(h_result_tuple, h_result_zip);
+    ASSERT_EQUAL(h_result_tuple, d_result_zip);
+  }
+};
+VariableUnitTest<TestZipFunctionTransform, ThirtyTwoBitTypes> TestZipFunctionTransformInstance;
+
+#endif // THRUST_CPP_DIALECT
diff --git a/thrust/testing/zip_iterator.cu b/thrust/testing/zip_iterator.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3ea34b25f8c0244459c97ef3758c956f02d4ce1b
--- /dev/null
+++ b/thrust/testing/zip_iterator.cu
@@ -0,0 +1,489 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/copy.h>
+#include <thrust/transform.h>
+
+using namespace unittest;
+
+template<typename T>
+  struct TestZipIteratorManipulation
+{
+  template<typename Vector>
+  void test(void)
+  {
+    using namespace thrust;
+
+    Vector v0(4);
+    Vector v1(4);
+    Vector v2(4);
+
+    // initialize input
+    sequence(v0.begin(), v0.end());
+    sequence(v1.begin(), v1.end());
+    sequence(v2.begin(), v2.end());
+
+    typedef tuple<typename Vector::iterator, typename Vector::iterator> IteratorTuple;
+
+    IteratorTuple t = make_tuple(v0.begin(), v1.begin());
+
+    typedef zip_iterator<IteratorTuple> ZipIterator;
+
+    // test construction
+    ZipIterator iter0 = make_zip_iterator(t);
+
+    ASSERT_EQUAL_QUIET(v0.begin(), get<0>(iter0.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin(), get<1>(iter0.get_iterator_tuple()));
+
+    // test dereference
+    ASSERT_EQUAL(*v0.begin(), get<0>(*iter0));
+    ASSERT_EQUAL(*v1.begin(), get<1>(*iter0));
+
+    // test equality
+    ZipIterator iter1 = iter0;
+    ZipIterator iter2 = make_zip_iterator(make_tuple(v0.begin(), v2.begin()));
+    ZipIterator iter3 = make_zip_iterator(make_tuple(v1.begin(), v2.begin()));
+    ASSERT_EQUAL(true,  iter0 == iter1);
+    ASSERT_EQUAL(true,  iter0 == iter2);
+    ASSERT_EQUAL(false, iter0 == iter3);
+
+    // test inequality
+    ASSERT_EQUAL(false, iter0 != iter1);
+    ASSERT_EQUAL(false, iter0 != iter2);
+    ASSERT_EQUAL(true,  iter0 != iter3);
+
+    // test advance
+    ZipIterator iter4 = iter0 + 1;
+    ASSERT_EQUAL_QUIET(v0.begin() + 1, get<0>(iter4.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin() + 1, get<1>(iter4.get_iterator_tuple()));
+
+    // test pre-increment
+    ++iter4;
+    ASSERT_EQUAL_QUIET(v0.begin() + 2, get<0>(iter4.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin() + 2, get<1>(iter4.get_iterator_tuple()));
+
+    // test post-increment
+    iter4++;
+    ASSERT_EQUAL_QUIET(v0.begin() + 3, get<0>(iter4.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin() + 3, get<1>(iter4.get_iterator_tuple()));
+
+    // test pre-decrement
+    --iter4;
+    ASSERT_EQUAL_QUIET(v0.begin() + 2, get<0>(iter4.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin() + 2, get<1>(iter4.get_iterator_tuple()));
+
+    // test post-decrement
+    iter4--;
+    ASSERT_EQUAL_QUIET(v0.begin() + 1, get<0>(iter4.get_iterator_tuple()));
+    ASSERT_EQUAL_QUIET(v1.begin() + 1, get<1>(iter4.get_iterator_tuple()));
+
+    // test difference
+    ASSERT_EQUAL( 1, iter4 - iter0);
+    ASSERT_EQUAL(-1, iter0 - iter4);
+  }
+
+  void operator()(void)
+  {
+    test<   thrust::host_vector<T> >();
+    test< thrust::device_vector<T> >();
+  }
+};
+SimpleUnitTest<TestZipIteratorManipulation, type_list<int> > TestZipIteratorManipulationInstance;
+
+template <typename T>
+  struct TestZipIteratorReference
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+    // test host types
+    typedef typename host_vector<T>::iterator          Iterator1;
+    typedef typename host_vector<T>::const_iterator    Iterator2;
+    typedef tuple<Iterator1,Iterator2>                 IteratorTuple1;
+    typedef zip_iterator<IteratorTuple1> ZipIterator1;
+
+    typedef typename iterator_reference<ZipIterator1>::type zip_iterator_reference_type1;
+
+    host_vector<T> h_variable(1);
+
+    typedef tuple<T&,const T&> reference_type1;
+
+    reference_type1               ref1(*h_variable.begin(),*h_variable.cbegin());
+    zip_iterator_reference_type1 test1(*h_variable.begin(),*h_variable.cbegin());
+
+    ASSERT_EQUAL_QUIET(ref1, test1);
+    ASSERT_EQUAL( get<0>(ref1),  get<0>(test1));
+    ASSERT_EQUAL( get<1>(ref1),  get<1>(test1));
+
+
+    // test device types
+    typedef typename device_vector<T>::iterator        Iterator3;
+    typedef typename device_vector<T>::const_iterator  Iterator4;
+    typedef tuple<Iterator3,Iterator4>                 IteratorTuple2;
+    typedef zip_iterator<IteratorTuple2> ZipIterator2;
+
+    typedef typename iterator_reference<ZipIterator2>::type zip_iterator_reference_type2;
+
+    device_vector<T> d_variable(1);
+
+    typedef tuple< device_reference<T>, device_reference<const T> > reference_type2;
+
+    reference_type2               ref2(*d_variable.begin(),*d_variable.cbegin());
+    zip_iterator_reference_type2 test2(*d_variable.begin(),*d_variable.cbegin());
+
+    ASSERT_EQUAL_QUIET(ref2, test2);
+    ASSERT_EQUAL( get<0>(ref2),  get<0>(test2));
+    ASSERT_EQUAL( get<1>(ref2),  get<1>(test2));
+  } // end operator()()
+};
+SimpleUnitTest<TestZipIteratorReference, NumericTypes> TestZipIteratorReferenceInstance;
+
+
+template <typename T>
+  struct TestZipIteratorTraversal
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+#if 0
+    // test host types
+    typedef typename host_vector<T>::iterator          Iterator1;
+    typedef typename host_vector<T>::const_iterator    Iterator2;
+    typedef tuple<Iterator1,Iterator2>                 IteratorTuple1;
+    typedef zip_iterator<IteratorTuple1> ZipIterator1;
+
+    typedef typename iterator_traversal<ZipIterator1>::type zip_iterator_traversal_type1;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_traversal_type1, random_access_traversal_tag>::value) );
+
+
+#if 0
+    // test device types
+    typedef typename device_vector<T>::iterator        Iterator3;
+    typedef typename device_vector<T>::const_iterator  Iterator4;
+    typedef tuple<Iterator3,Iterator4>                 IteratorTuple2;
+    typedef zip_iterator<IteratorTuple2> ZipIterator2;
+
+    typedef typename iterator_traversal<ZipIterator2>::type zip_iterator_traversal_type2;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_traversal_type2, thrust::random_access_traversal_tag>::value) );
+  } // end operator()()
+};
+SimpleUnitTest<TestZipIteratorTraversal, NumericTypes> TestZipIteratorTraversalInstance;
+
+
+template <typename T>
+  struct TestZipIteratorSystem
+{
+  void operator()(void)
+  {
+    using namespace thrust;
+
+    // XXX these assertions complain about undefined references to integral_constant<...>::value
+
+#if 0
+    // test host types
+    typedef typename host_vector<T>::iterator          Iterator1;
+    typedef typename host_vector<T>::const_iterator    Iterator2;
+    typedef tuple<Iterator1,Iterator2>                 IteratorTuple1;
+    typedef zip_iterator<IteratorTuple1> ZipIterator1;
+
+    typedef typename iterator_system<ZipIterator1>::type zip_iterator_system_type1;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_same<zip_iterator_system_type1, experimental::space::host>::value) );
+
+
+#if 0
+    // test device types
+    typedef typename device_vector<T>::iterator        Iterator3;
+    typedef typename device_vector<T>::const_iterator  Iterator4;
+    typedef tuple<Iterator3,Iterator4>                 IteratorTuple2;
+    typedef zip_iterator<IteratorTuple1> ZipIterator2;
+
+    typedef typename iterator_system<ZipIterator2>::type zip_iterator_system_type2;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type2, experimental::space::device>::value) );
+
+
+#if 0
+    // test any
+    typedef counting_iterator<T>         Iterator5;
+    typedef counting_iterator<const T>   Iterator6;
+    typedef tuple<Iterator5, Iterator6>                IteratorTuple3;
+    typedef zip_iterator<IteratorTuple3> ZipIterator3;
+
+    typedef typename iterator_system<ZipIterator3>::type zip_iterator_system_type3;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type3, thrust::experimental::space::any>::value) );
+
+    
+#if 0
+    // test host/any
+    typedef tuple<Iterator1, Iterator5>                IteratorTuple4;
+    typedef zip_iterator<IteratorTuple4> ZipIterator4;
+
+    typedef typename iterator_system<ZipIterator4>::type zip_iterator_system_type4;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type4, thrust::host_system_tag>::value) );
+
+
+#if 0
+    // test any/host
+    typedef tuple<Iterator5, Iterator1>                IteratorTuple5;
+    typedef zip_iterator<IteratorTuple5> ZipIterator5;
+
+    typedef typename iterator_system<ZipIterator5>::type zip_iterator_system_type5;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type5, thrust::host_system_tag>::value) );
+
+
+#if 0
+    // test device/any
+    typedef tuple<Iterator3, Iterator5>                IteratorTuple6;
+    typedef zip_iterator<IteratorTuple6> ZipIterator6;
+
+    typedef typename iterator_system<ZipIterator6>::type zip_iterator_system_type6;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type6, thrust::device_system_tag>::value) );
+
+
+#if 0
+    // test any/device
+    typedef tuple<Iterator5, Iterator3>                IteratorTuple7;
+    typedef zip_iterator<IteratorTuple7> ZipIterator7;
+
+    typedef typename iterator_system<ZipIterator7>::type zip_iterator_system_type7;
+#endif
+
+    //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type7, thrust::device_system_tag>::value) );
+  } // end operator()()
+};
+SimpleUnitTest<TestZipIteratorSystem, NumericTypes> TestZipIteratorSystemInstance;
+
+
+template <typename Vector>
+void TestZipIteratorCopy(void)
+{
+  using namespace thrust;
+
+  Vector input0(4),  input1(4);
+  Vector output0(4), output1(4);
+
+  // initialize input
+  sequence(input0.begin(), input0.end(),  0);
+  sequence(input1.begin(), input1.end(), 13);
+
+  copy( make_zip_iterator(make_tuple(input0.begin(),  input1.begin())),
+        make_zip_iterator(make_tuple(input0.end(),    input1.end())),
+        make_zip_iterator(make_tuple(output0.begin(), output1.begin())));
+
+  ASSERT_EQUAL(input0, output0);
+  ASSERT_EQUAL(input1, output1);
+}
+DECLARE_VECTOR_UNITTEST(TestZipIteratorCopy);
+
+
+struct SumTwoTuple
+{
+  template<typename Tuple>
+  __host__ __device__
+  typename thrust::detail::remove_reference<typename thrust::tuple_element<0,Tuple>::type>::type
+    operator()(Tuple x) const
+  {
+    return thrust::get<0>(x) + thrust::get<1>(x);
+  }
+}; // end SumTwoTuple
+
+struct SumThreeTuple
+{
+  template<typename Tuple>
+  __host__ __device__
+  typename thrust::detail::remove_reference<typename thrust::tuple_element<0,Tuple>::type>::type
+    operator()(Tuple x) const
+  {
+    return thrust::get<0>(x) + thrust::get<1>(x) + thrust::get<2>(x);
+  }
+}; // end SumThreeTuple
+
+
+template <typename T>
+struct TestZipIteratorTransform
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+    host_vector<T> h_data2 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+    device_vector<T> d_data2 = h_data2;
+
+    host_vector<T>   h_result(n);
+    device_vector<T> d_result(n);
+
+    // Tuples with 2 elements
+    transform( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+               make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+               h_result.begin(),
+               SumTwoTuple());
+    transform( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+               make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+               d_result.begin(),
+               SumTwoTuple());
+    ASSERT_EQUAL(h_result, d_result);
+    
+    
+    // Tuples with 3 elements
+    transform( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+               make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+               h_result.begin(),
+               SumThreeTuple());
+    transform( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin(), d_data2.begin())),
+               make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end(),   d_data2.end())),
+               d_result.begin(),
+               SumThreeTuple());
+    ASSERT_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestZipIteratorTransform, ThirtyTwoBitTypes> TestZipIteratorTransformInstance;
+
+
+void TestZipIteratorCopyAoSToSoA(void)
+{
+  using namespace thrust;
+
+  const size_t n = 1;
+
+  typedef tuple<int,int> structure;
+  typedef host_vector<structure>   host_array_of_structures;
+  typedef device_vector<structure> device_array_of_structures;
+
+  typedef zip_iterator<
+    tuple<host_vector<int>::iterator, host_vector<int>::iterator>
+  > host_structure_of_arrays;
+
+  typedef zip_iterator<
+    tuple<device_vector<int>::iterator, device_vector<int>::iterator>
+  > device_structure_of_arrays;
+
+  host_array_of_structures   h_aos(n, make_tuple(7, 13) );
+  device_array_of_structures d_aos(n, make_tuple(7, 13) );
+
+
+
+  // host to host
+  host_vector<int> h_field0(n), h_field1(n);
+  host_structure_of_arrays h_soa = make_zip_iterator( make_tuple(h_field0.begin(), h_field1.begin()) );
+
+  thrust::copy(h_aos.begin(), h_aos.end(), h_soa);
+  ASSERT_EQUAL_QUIET(make_tuple(7, 13), h_soa[0]);
+
+
+
+  // host to device
+  device_vector<int> d_field0(n), d_field1(n);
+  device_structure_of_arrays d_soa = make_zip_iterator( make_tuple(d_field0.begin(), d_field1.begin()) );
+
+  thrust::copy(h_aos.begin(), h_aos.end(), d_soa);
+  ASSERT_EQUAL_QUIET(make_tuple(7, 13), d_soa[0]);
+
+
+
+  // device to device
+  thrust::fill(d_field0.begin(), d_field0.end(), 0);
+  thrust::fill(d_field1.begin(), d_field1.end(), 0);
+
+  thrust::copy(d_aos.begin(), d_aos.end(), d_soa);
+  ASSERT_EQUAL_QUIET(make_tuple(7, 13), d_soa[0]);
+
+
+  // device to host
+  thrust::fill(h_field0.begin(), h_field0.end(), 0);
+  thrust::fill(h_field1.begin(), h_field1.end(), 0);
+
+  thrust::copy(d_aos.begin(), d_aos.end(), h_soa);
+  ASSERT_EQUAL_QUIET(make_tuple(7, 13), h_soa[0]);
+};
+DECLARE_UNITTEST(TestZipIteratorCopyAoSToSoA);
+
+
+
+void TestZipIteratorCopySoAToAoS(void)
+{
+  using namespace thrust;
+
+  const size_t n = 1;
+
+  typedef tuple<int,int> structure;
+  typedef host_vector<structure>   host_array_of_structures;
+  typedef device_vector<structure> device_array_of_structures;
+
+  typedef zip_iterator<
+    tuple<host_vector<int>::iterator, host_vector<int>::iterator>
+  > host_structure_of_arrays;
+
+  typedef zip_iterator<
+    tuple<device_vector<int>::iterator, device_vector<int>::iterator>
+  > device_structure_of_arrays;
+
+  host_vector<int>   h_field0(n, 7), h_field1(n, 13);
+  device_vector<int> d_field0(n, 7), d_field1(n, 13);
+
+  host_structure_of_arrays   h_soa = make_zip_iterator(make_tuple(h_field0.begin(), h_field1.begin()));
+  device_structure_of_arrays d_soa = make_zip_iterator(make_tuple(d_field0.begin(), d_field1.begin()));
+
+  host_array_of_structures   h_aos(n);
+  device_array_of_structures d_aos(n);
+
+
+
+  // host to host
+  thrust::fill(h_aos.begin(), h_aos.end(), make_tuple(0,0));
+
+  thrust::copy(h_soa, h_soa + n, h_aos.begin());
+  ASSERT_EQUAL_QUIET(7,  get<0>(h_soa[0]));
+  ASSERT_EQUAL_QUIET(13, get<1>(h_soa[0]));
+
+
+
+  // host to device
+  thrust::fill(d_aos.begin(), d_aos.end(), make_tuple(0,0));
+
+  thrust::copy(h_soa, h_soa + n, d_aos.begin());
+  ASSERT_EQUAL_QUIET(7,  get<0>(d_soa[0]));
+  ASSERT_EQUAL_QUIET(13, get<1>(d_soa[0]));
+
+
+
+  // device to device
+  thrust::fill(d_aos.begin(), d_aos.end(), make_tuple(0,0));
+
+  thrust::copy(d_soa, d_soa + n, d_aos.begin());
+  ASSERT_EQUAL_QUIET(7,  get<0>(d_soa[0]));
+  ASSERT_EQUAL_QUIET(13, get<1>(d_soa[0]));
+
+
+
+  // device to host
+  thrust::fill(h_aos.begin(), h_aos.end(), make_tuple(0,0));
+
+  thrust::copy(d_soa, d_soa + n, h_aos.begin());
+  ASSERT_EQUAL_QUIET(7,  get<0>(h_soa[0]));
+  ASSERT_EQUAL_QUIET(13, get<1>(h_soa[0]));
+};
+DECLARE_UNITTEST(TestZipIteratorCopySoAToAoS);
+
diff --git a/thrust/testing/zip_iterator_reduce.cu b/thrust/testing/zip_iterator_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c1ad037dd32442cb957653dafb31ec553217de44
--- /dev/null
+++ b/thrust/testing/zip_iterator_reduce.cu
@@ -0,0 +1,54 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+
+using namespace unittest;
+
+template<typename Tuple>
+struct TuplePlus
+{
+  __host__ __device__
+  Tuple operator()(Tuple x, Tuple y) const
+  {
+    using namespace thrust;
+    return make_tuple(get<0>(x) + get<0>(y),
+                      get<1>(x) + get<1>(y));
+  }
+}; // end SumTuple
+
+
+template <typename T>
+struct TestZipIteratorReduce
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+
+    typedef tuple<T,T> Tuple;
+
+    // run on host
+    Tuple h_result = reduce( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+                             make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+                             make_tuple<T,T>(0,0),
+                             TuplePlus<Tuple>());
+
+    // run on device
+    Tuple d_result = reduce( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+                             make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+                             make_tuple<T,T>(0,0),
+                             TuplePlus<Tuple>());
+
+    ASSERT_EQUAL(get<0>(h_result), get<0>(d_result));
+    ASSERT_EQUAL(get<1>(h_result), get<1>(d_result));
+  }
+};
+VariableUnitTest<TestZipIteratorReduce, IntegralTypes> TestZipIteratorReduceInstance;
+
+
+
diff --git a/thrust/testing/zip_iterator_reduce_by_key.cu b/thrust/testing/zip_iterator_reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e3fc99d66a33d03c6c14ce4060dfcb82e3d3c6c0
--- /dev/null
+++ b/thrust/testing/zip_iterator_reduce_by_key.cu
@@ -0,0 +1,124 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+using namespace unittest;
+
+template<typename Tuple>
+struct TuplePlus
+{
+  __host__ __device__
+  Tuple operator()(Tuple x, Tuple y) const
+  {
+    using namespace thrust;
+    return make_tuple(get<0>(x) + get<0>(y),
+                      get<1>(x) + get<1>(y));
+  }
+}; // end TuplePlus
+
+
+template <typename T>
+struct TestZipIteratorReduceByKey
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_integers<bool>(n);
+    host_vector<T> h_data1 = unittest::random_integers<T>(n);
+    host_vector<T> h_data2 = unittest::random_integers<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+    device_vector<T> d_data2 = h_data2;
+
+    typedef tuple<T,T> Tuple;
+
+    // integer key, tuple value
+    {
+      host_vector<T> h_data3(n,0);
+      host_vector<T> h_data4(n,0);
+      host_vector<T> h_data5(n,0);
+      device_vector<T> d_data3(n,0);
+      device_vector<T> d_data4(n,0);
+      device_vector<T> d_data5(n,0);
+
+      // run on host
+      reduce_by_key
+          ( h_data0.begin(), h_data0.end(),
+            make_zip_iterator(make_tuple(h_data1.begin(), h_data2.begin())),
+            h_data3.begin(),
+            make_zip_iterator(make_tuple(h_data4.begin(), h_data5.begin())),
+            equal_to<T>(),
+            TuplePlus<Tuple>());
+
+      // run on device
+      reduce_by_key
+          ( d_data0.begin(), d_data0.end(),
+            make_zip_iterator(make_tuple(d_data1.begin(), d_data2.begin())),
+            d_data3.begin(),
+            make_zip_iterator(make_tuple(d_data4.begin(), d_data5.begin())),
+            equal_to<T>(),
+            TuplePlus<Tuple>());
+
+      ASSERT_EQUAL(h_data3, d_data3);
+      ASSERT_EQUAL(h_data4, d_data4);
+      ASSERT_EQUAL(h_data5, d_data5);
+    }
+    
+    // The tests below get miscompiled on Tesla hw for 8b types
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
+    {
+      if(typeid(T) == typeid(unittest::uint8_t) && driver->current_device_architecture() < 200)
+      {
+        KNOWN_FAILURE;
+      } // end if
+    } // end if
+#endif
+
+    // tuple key, tuple value
+    {
+      host_vector<T> h_data3(n,0);
+      host_vector<T> h_data4(n,0);
+      host_vector<T> h_data5(n,0);
+      host_vector<T> h_data6(n,0);
+      device_vector<T> d_data3(n,0);
+      device_vector<T> d_data4(n,0);
+      device_vector<T> d_data5(n,0);
+      device_vector<T> d_data6(n,0);
+
+      // run on host
+      reduce_by_key
+          ( make_zip_iterator(make_tuple(h_data0.begin(), h_data0.begin())),
+            make_zip_iterator(make_tuple(h_data0.end(),   h_data0.end())),
+            make_zip_iterator(make_tuple(h_data1.begin(), h_data2.begin())),
+            make_zip_iterator(make_tuple(h_data3.begin(), h_data4.begin())),
+            make_zip_iterator(make_tuple(h_data5.begin(), h_data6.begin())),
+            equal_to<Tuple>(),
+            TuplePlus<Tuple>());
+
+      // run on device
+      reduce_by_key
+          ( make_zip_iterator(make_tuple(d_data0.begin(), d_data0.begin())),
+            make_zip_iterator(make_tuple(d_data0.end(),   d_data0.end())),
+            make_zip_iterator(make_tuple(d_data1.begin(), d_data2.begin())),
+            make_zip_iterator(make_tuple(d_data3.begin(), d_data4.begin())),
+            make_zip_iterator(make_tuple(d_data5.begin(), d_data6.begin())),
+            equal_to<Tuple>(),
+            TuplePlus<Tuple>());
+
+      ASSERT_EQUAL(h_data3, d_data3);
+      ASSERT_EQUAL(h_data4, d_data4);
+      ASSERT_EQUAL(h_data5, d_data5);
+      ASSERT_EQUAL(h_data6, d_data6);
+    }
+  }
+};
+VariableUnitTest<TestZipIteratorReduceByKey, UnsignedIntegralTypes> TestZipIteratorReduceByKeyInstance;
+
diff --git a/thrust/testing/zip_iterator_scan.cu b/thrust/testing/zip_iterator_scan.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9fb767a6878c20f0b4a538fb7283091758710d48
--- /dev/null
+++ b/thrust/testing/zip_iterator_scan.cu
@@ -0,0 +1,112 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scan.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <unittest/cuda/testframework.h>
+#endif
+
+using namespace unittest;
+
+
+template<typename Tuple>
+struct TuplePlus
+{
+  __host__ __device__
+  Tuple operator()(Tuple x, Tuple y) const
+  {
+    using namespace thrust;
+    return make_tuple(get<0>(x) + get<0>(y),
+                      get<1>(x) + get<1>(y));
+  }
+}; // end SumTuple
+
+
+template <typename T>
+struct TestZipIteratorScan
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+
+    typedef tuple<T,T> Tuple;
+
+    host_vector<Tuple>   h_result(n);
+    device_vector<Tuple> d_result(n);
+
+    // The tests below get miscompiled on Tesla hw for 8b types
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
+    {
+      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
+      {
+        KNOWN_FAILURE;
+      } // end if
+    } // end if
+#endif
+
+    // inclusive_scan (tuple output)
+    inclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+                    make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+                    h_result.begin(),
+                    TuplePlus<Tuple>());
+    inclusive_scan( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+                    make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+                    d_result.begin(),
+                    TuplePlus<Tuple>());
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+   
+    // exclusive_scan (tuple output)
+    exclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+                    make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+                    h_result.begin(),
+                    make_tuple<T,T>(0,0),
+                    TuplePlus<Tuple>());
+    exclusive_scan( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+                    make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+                    d_result.begin(),
+                    make_tuple<T,T>(0,0),
+                    TuplePlus<Tuple>());
+    ASSERT_EQUAL_QUIET(h_result, d_result);
+
+    host_vector<T>   h_result0(n);
+    host_vector<T>   h_result1(n);
+    device_vector<T> d_result0(n);
+    device_vector<T> d_result1(n);
+    
+    // inclusive_scan (zip_iterator output)
+    inclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+                    make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+                    make_zip_iterator(make_tuple(h_result0.begin(), h_result1.begin())),
+                    TuplePlus<Tuple>());
+    inclusive_scan( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+                    make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+                    make_zip_iterator(make_tuple(d_result0.begin(), d_result1.begin())),
+                    TuplePlus<Tuple>());
+    ASSERT_EQUAL_QUIET(h_result0, d_result0);
+    ASSERT_EQUAL_QUIET(h_result1, d_result1);
+    
+    // exclusive_scan (zip_iterator output)
+    exclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
+                    make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
+                    make_zip_iterator(make_tuple(h_result0.begin(), h_result1.begin())),
+                    make_tuple<T,T>(0,0),
+                    TuplePlus<Tuple>());
+    exclusive_scan( make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin())),
+                    make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end())),
+                    make_zip_iterator(make_tuple(d_result0.begin(), d_result1.begin())),
+                    make_tuple<T,T>(0,0),
+                    TuplePlus<Tuple>());
+    ASSERT_EQUAL_QUIET(h_result0, d_result0);
+    ASSERT_EQUAL_QUIET(h_result1, d_result1);
+  }
+};
+VariableUnitTest<TestZipIteratorScan, SignedIntegralTypes> TestZipIteratorScanInstance;
+
diff --git a/thrust/testing/zip_iterator_sort.cu b/thrust/testing/zip_iterator_sort.cu
new file mode 100644
index 0000000000000000000000000000000000000000..62a2afb9f852b1c0a73e03cf8ed2c4c256dca674
--- /dev/null
+++ b/thrust/testing/zip_iterator_sort.cu
@@ -0,0 +1,31 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+template <typename T>
+  struct TestZipIteratorStableSort
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T>   h1 = unittest::random_integers<T>(n);
+    host_vector<T>   h2 = unittest::random_integers<T>(n);
+    
+    device_vector<T> d1 = h1;
+    device_vector<T> d2 = h2;
+    
+    // sort on host
+    stable_sort( make_zip_iterator(make_tuple(h1.begin(), h2.begin())),
+                 make_zip_iterator(make_tuple(h1.end(),   h2.end())) );
+
+    // sort on device
+    stable_sort( make_zip_iterator(make_tuple(d1.begin(), d2.begin())),
+                 make_zip_iterator(make_tuple(d1.end(),   d2.end())) );
+  
+    ASSERT_EQUAL_QUIET(h1, d1);
+    ASSERT_EQUAL_QUIET(h2, d2);
+  }
+};
+VariableUnitTest<TestZipIteratorStableSort, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestZipIteratorStableSortInstance;
+
diff --git a/thrust/testing/zip_iterator_sort_by_key.cu b/thrust/testing/zip_iterator_sort_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c1d7d0e8016199ed2723f14245eb49e74a908f9b
--- /dev/null
+++ b/thrust/testing/zip_iterator_sort_by_key.cu
@@ -0,0 +1,58 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+template <typename T>
+  struct TestZipIteratorStableSortByKey
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T>   h1 = unittest::random_integers<T>(n);
+    host_vector<T>   h2 = unittest::random_integers<T>(n);
+    host_vector<T>   h3 = unittest::random_integers<T>(n);
+    host_vector<T>   h4 = unittest::random_integers<T>(n);
+    
+    device_vector<T> d1 = h1;
+    device_vector<T> d2 = h2;
+    device_vector<T> d3 = h3;
+    device_vector<T> d4 = h4;
+    
+    // sort with (tuple, scalar)
+    stable_sort_by_key( make_zip_iterator(make_tuple(h1.begin(), h2.begin())),
+                        make_zip_iterator(make_tuple(h1.end(),   h2.end())),
+                        h3.begin() );
+    stable_sort_by_key( make_zip_iterator(make_tuple(d1.begin(), d2.begin())),
+                        make_zip_iterator(make_tuple(d1.end(),   d2.end())),
+                        d3.begin() );
+    
+    ASSERT_EQUAL_QUIET(h1, d1);
+    ASSERT_EQUAL_QUIET(h2, d2);
+    ASSERT_EQUAL_QUIET(h3, d3);
+    ASSERT_EQUAL_QUIET(h4, d4);
+    
+    // sort with (scalar, tuple)
+    stable_sort_by_key( h1.begin(),
+                        h1.end(),
+                        make_zip_iterator(make_tuple(h3.begin(), h4.begin())) );
+    stable_sort_by_key( d1.begin(),
+                        d1.end(),
+                        make_zip_iterator(make_tuple(d3.begin(), d4.begin())) );
+    
+    // sort with (tuple, tuple)
+    stable_sort_by_key( make_zip_iterator(make_tuple(h1.begin(), h2.begin())),
+                        make_zip_iterator(make_tuple(h1.end(),   h2.end())),
+                        make_zip_iterator(make_tuple(h3.begin(), h4.begin())) );
+    stable_sort_by_key( make_zip_iterator(make_tuple(d1.begin(), d2.begin())),
+                        make_zip_iterator(make_tuple(d1.end(),   d2.end())),
+                        make_zip_iterator(make_tuple(d3.begin(), d4.begin())) );
+  
+    ASSERT_EQUAL_QUIET(h1, d1);
+    ASSERT_EQUAL_QUIET(h2, d2);
+    ASSERT_EQUAL_QUIET(h3, d3);
+    ASSERT_EQUAL_QUIET(h4, d4);
+  }
+};
+VariableUnitTest<TestZipIteratorStableSortByKey, unittest::type_list<unittest::int8_t,unittest::int16_t,unittest::int32_t> > TestZipIteratorStableSortByKeyInstance;
+
diff --git a/thrust/thrust.vlcc b/thrust/thrust.vlcc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c860f5dafcc01a09027180380ecd8ac29d80c8
--- /dev/null
+++ b/thrust/thrust.vlcc
@@ -0,0 +1,19 @@
+# thrust component
+{
+  # Descriptive name for the component
+  "name"      : "Thrust Library",
+  # Component owner (email address)
+  "owner"     : "blelbach@nvidia.com",
+  "module"    : "CUDA - Thrust",
+
+  # Files included in this component specified with one or more paths.
+  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
+   "files"     : [ "..."           
+                 ],
+  # Output produced by this component and the installation location
+  # for each output. The install location is relative to
+  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
+  # artifact kinds.
+   "artifacts" : [ { "thrust/*"            : "cuda/${INSTALL_TARGET_DIR}/include/thrust/." }
+                 ]
+}
diff --git a/thrust/thrust/addressof.h b/thrust/thrust/addressof.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa9e41c8efadf3458f3f2ed0b0ff8e281150bc9c
--- /dev/null
+++ b/thrust/thrust/addressof.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  include <thrust/detail/memory_wrapper.h>
+#endif
+
+namespace thrust
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! Obtains the actual address of the object or function arg, even in presence of overloaded operator&.
+ */
+template <typename T>
+__host__ __device__
+T* addressof(T& arg) 
+{
+  return reinterpret_cast<T*>(
+    &const_cast<char&>(reinterpret_cast<const volatile char&>(arg))
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/adjacent_difference.h b/thrust/thrust/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..838beabe5fd62ab6bba85ec5e12319a587f9accc
--- /dev/null
+++ b/thrust/thrust/adjacent_difference.h
@@ -0,0 +1,246 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file adjacent_difference.h
+ *  \brief Compute difference between consecutive elements of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations Transformations
+ *  \{
+ */
+
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ *  range <tt>[first, last)</tt>. That is, <tt>\*first</tt> is assigned to
+ *  <tt>\*result</tt>, and, for each iterator \p i in the range
+ *  <tt>[first + 1, last)</tt>, the difference of <tt>\*i</tt> and <tt>*(i - 1)</tt>
+ *  is assigned to <tt>\*(result + (i - first))</tt>.
+ *
+ *  This version of \p adjacent_difference uses <tt>operator-</tt> to calculate
+ *  differences.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \return The iterator <tt>result + (last - first)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ *          useful for computing differences "in place".
+ *
+ *  The following code snippet demonstrates how to use \p adjacent_difference to compute
+ *  the difference between adjacent elements of a range using the \p thrust::device execution policy:
+ *
+ *  \code
+ *  #include <thrust/adjacent_difference.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ *  thrust::device_vector<int> d_data(h_data, h_data + 8);
+ *  thrust::device_vector<int> d_result(8);
+ *
+ *  thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin());
+ *
+ *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see inclusive_scan
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last, 
+                                   OutputIterator result);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ *  range <tt>[first, last)</tt>. That is, <tt>*first</tt> is assigned to
+ *  <tt>\*result</tt>, and, for each iterator \p i in the range
+ *  <tt>[first + 1, last)</tt>, <tt>binary_op(\*i, \*(i - 1))</tt> is assigned to
+ *  <tt>\*(result + (i - first))</tt>.
+ *  
+ *  This version of \p adjacent_difference uses the binary function \p binary_op to
+ *  calculate differences.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \param binary_op The binary function used to compute differences.
+ *  \return The iterator <tt>result + (last - first)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ *          useful for computing differences "in place".
+ *
+ *  The following code snippet demonstrates how to use \p adjacent_difference to compute
+ *  the sum between adjacent elements of a range using the \p thrust::device execution policy:
+ *
+ *  \code
+ *  #include <thrust/adjacent_difference.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ *  thrust::device_vector<int> d_data(h_data, h_data + 8);
+ *  thrust::device_vector<int> d_result(8);
+ *
+ *  thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
+ *
+ *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see inclusive_scan
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ *  range <tt>[first, last)</tt>. That is, <tt>\*first</tt> is assigned to
+ *  <tt>\*result</tt>, and, for each iterator \p i in the range
+ *  <tt>[first + 1, last)</tt>, the difference of <tt>\*i</tt> and <tt>*(i - 1)</tt>
+ *  is assigned to <tt>\*(result + (i - first))</tt>.
+ *
+ *  This version of \p adjacent_difference uses <tt>operator-</tt> to calculate
+ *  differences.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \return The iterator <tt>result + (last - first)</tt>
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ *          useful for computing differences "in place".
+ *
+ *  The following code snippet demonstrates how to use \p adjacent_difference to compute
+ *  the difference between adjacent elements of a range.
+ *
+ *  \code
+ *  #include <thrust/adjacent_difference.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ *  thrust::device_vector<int> d_data(h_data, h_data + 8);
+ *  thrust::device_vector<int> d_result(8);
+ *
+ *  thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin());
+ *
+ *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see inclusive_scan
+ */
+template <typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last, 
+                                   OutputIterator result);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ *  range <tt>[first, last)</tt>. That is, <tt>*first</tt> is assigned to
+ *  <tt>\*result</tt>, and, for each iterator \p i in the range
+ *  <tt>[first + 1, last)</tt>, <tt>binary_op(\*i, \*(i - 1))</tt> is assigned to
+ *  <tt>\*(result + (i - first))</tt>.
+ *  
+ *  This version of \p adjacent_difference uses the binary function \p binary_op to
+ *  calculate differences.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \param binary_op The binary function used to compute differences.
+ *  \return The iterator <tt>result + (last - first)</tt>
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ *          useful for computing differences "in place".
+ *
+ *  The following code snippet demonstrates how to use \p adjacent_difference to compute
+ *  the sum between adjacent elements of a range.
+ *
+ *  \code
+ *  #include <thrust/adjacent_difference.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ *  thrust::device_vector<int> d_data(h_data, h_data + 8);
+ *  thrust::device_vector<int> d_result(8);
+ *
+ *  thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
+ *
+ *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see inclusive_scan
+ */
+template <typename InputIterator, typename OutputIterator, typename BinaryFunction>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op);
+
+/*! \}
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/adjacent_difference.inl>
+
diff --git a/thrust/thrust/advance.h b/thrust/thrust/advance.h
new file mode 100644
index 0000000000000000000000000000000000000000..d077e04345daea987044eab83a9e722ca956f19a
--- /dev/null
+++ b/thrust/thrust/advance.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file advance.h
+ *  \brief Advance an iterator by a given distance.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \p advance(i, n) increments the iterator \p i by the distance \p n.
+ *  If <tt>n > 0</tt> it is equivalent to executing <tt>++i</tt> \p n
+ *  times, and if <tt>n < 0</tt> it is equivalent to executing <tt>--i</tt>
+ *  \p n times. If <tt>n == 0</tt>, the call has no effect.
+ *
+ *  \param i The iterator to be advanced.
+ *  \param n The distance by which to advance the iterator.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Distance is an integral type that is convertible to \p InputIterator's distance type.
+ *
+ *  \pre \p n shall be negative only for bidirectional and random access iterators.
+ *
+ *  The following code snippet demonstrates how to use \p advance to increment
+ *  an iterator a given number of times.
+ *
+ *  \code
+ *  #include <thrust/advance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator iter = vec.begin();
+ *
+ *  thrust::advance(iter, 7);
+ *
+ *  // iter - vec.begin() == 7
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/advance.html
+ */
+template <typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n);
+
+/*! \p next(i, n) returns the \p n th successor of the iterator \p i.
+ *
+ *  \param i An iterator.
+ *  \param n The number of elements to advance.
+ *
+ *  \tparam InputIterator must meet the <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">InputIterator</a>.
+ *
+ *  \pre \p n shall be negative only for bidirectional and random access iterators.
+ *
+ *  The following code snippet demonstrates how to use \p next.
+ *
+ *  \code
+ *  #include <thrust/advance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator i0 = vec.begin();
+ *
+ *  auto i1 = thrust::next(i0);
+ *
+ *  // i0 - vec.begin() == 0
+ *  // i1 - vec.begin() == 1
+ *  \endcode
+ *
+ *  \see https://en.cppreference.com/w/cpp/iterator/next
+ */
+#if 0 // Doxygen only
+template <typename InputIterator, typename Distance>
+__host__ __device__
+InputIterator next(
+  InputIterator i
+, typename iterator_traits<InputIterator>::difference_type n = 1
+);
+#endif
+
+/*! \p prev(i, n) returns the \p n th predecessor of the iterator \p i.
+ *
+ *  \param i An iterator.
+ *  \param n The number of elements to descend.
+ *
+ *  \tparam BidirectionalIterator must meet the <a href="https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator">BidirectionalIterator</a>.
+ *
+ *  The following code snippet demonstrates how to use \p prev.
+ *
+ *  \code
+ *  #include <thrust/advance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator i0 = vec.end();
+ *
+ *  auto i1 = thrust::prev(i0);
+ *
+ *  // vec.end() - i0 == 0
+ *  // vec.end() - i1 == 1
+ *  \endcode
+ *
+ *  \see https://en.cppreference.com/w/cpp/iterator/prev
+ */
+#if 0 // Doxygen only
+template <typename BidirectionalIterator, typename Distance>
+__host__ __device__
+BidirectionalIterator prev(
+  BidirectionalIterator i
+, typename iterator_traits<BidirectionalIterator>::difference_type n = 1
+);
+#endif
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
+#include <thrust/detail/advance.inl>
+
diff --git a/thrust/thrust/allocate_unique.h b/thrust/thrust/allocate_unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e67d1b18a6dd8c4e8dd27a0f78531819489d6a4
--- /dev/null
+++ b/thrust/thrust/allocate_unique.h
@@ -0,0 +1,444 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/type_deduction.h>
+#include <thrust/detail/memory_algorithms.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+
+#include <utility>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+
+// wg21.link/p0316r0
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename Allocator, typename Pointer>
+void allocator_delete_impl(
+  Allocator const& alloc, Pointer p, std::false_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::destroy(alloc_T, thrust::raw_pointer_cast(p));
+    traits::deallocate(alloc_T, p, 1);
+  }
+}
+
+template <typename Allocator, typename Pointer>
+void allocator_delete_impl(
+  Allocator const& alloc, Pointer p, std::true_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::deallocate(alloc_T, p, 1);
+  }
+}
+
+} // namespace detail
+
+template <typename T, typename Allocator, bool Uninitialized = false>
+struct allocator_delete final
+{
+  using allocator_type
+    = typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type::template rebind<T>::other;
+  using pointer = typename detail::allocator_traits<allocator_type>::pointer;
+
+  template <typename UAllocator>
+  allocator_delete(UAllocator&& other) noexcept
+    : alloc_(THRUST_FWD(other))
+  {}
+
+  template <typename U, typename UAllocator>
+  allocator_delete(
+      allocator_delete<U, UAllocator> const& other
+    ) noexcept
+    : alloc_(other.get_allocator())
+  {}
+  template <typename U, typename UAllocator>
+  allocator_delete(
+      allocator_delete<U, UAllocator>&& other
+    ) noexcept
+    : alloc_(std::move(other.get_allocator()))
+  {}
+
+  template <typename U, typename UAllocator>
+  allocator_delete& operator=(
+    allocator_delete<U, UAllocator> const& other
+  ) noexcept
+  {
+    alloc_ = other.get_allocator();
+    return *this;
+  }
+  template <typename U, typename UAllocator>
+  allocator_delete& operator=(
+    allocator_delete<U, UAllocator>&& other
+  ) noexcept
+  {
+    alloc_ = std::move(other.get_allocator());
+    return *this;
+  }
+
+  void operator()(pointer p)
+  {
+    std::integral_constant<bool, Uninitialized> ic;
+
+    detail::allocator_delete_impl(get_allocator(), p, ic);
+  }
+
+  allocator_type& get_allocator() noexcept { return alloc_; }
+  allocator_type const& get_allocator() const noexcept { return alloc_; }
+
+  void swap(allocator_delete& other) noexcept
+  {
+    using std::swap;
+    swap(alloc_, other.alloc_);
+  }
+
+private:
+  allocator_type alloc_;
+};
+
+template <typename T, typename Allocator>
+using uninitialized_allocator_delete = allocator_delete<T, Allocator, true>;
+
+namespace detail {
+
+template <typename Allocator, typename Pointer, typename Size>
+void array_allocator_delete_impl(
+  Allocator const& alloc, Pointer p, Size count, std::false_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    destroy_n(alloc_T, p, count);
+    traits::deallocate(alloc_T, p, count);
+  }
+}
+
+template <typename Allocator, typename Pointer, typename Size>
+void array_allocator_delete_impl(
+  Allocator const& alloc, Pointer p, Size count, std::true_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::deallocate(alloc_T, p, count);
+  }
+}
+
+} // namespace detail
+
+template <typename T, typename Allocator, bool Uninitialized = false>
+struct array_allocator_delete final
+{
+  using allocator_type
+    = typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type::template rebind<T>::other;
+  using pointer = typename detail::allocator_traits<allocator_type>::pointer;
+
+  template <typename UAllocator>
+  array_allocator_delete(UAllocator&& other, std::size_t n) noexcept
+    : alloc_(THRUST_FWD(other)), count_(n)
+  {}
+
+  template <typename U, typename UAllocator>
+  array_allocator_delete(
+      array_allocator_delete<U, UAllocator> const& other
+    ) noexcept
+    : alloc_(other.get_allocator()), count_(other.count_)
+  {}
+  template <typename U, typename UAllocator>
+  array_allocator_delete(
+      array_allocator_delete<U, UAllocator>&& other
+    ) noexcept
+    : alloc_(std::move(other.get_allocator())), count_(other.count_)
+  {}
+
+  template <typename U, typename UAllocator>
+  array_allocator_delete& operator=(
+    array_allocator_delete<U, UAllocator> const& other
+  ) noexcept
+  {
+    alloc_ = other.get_allocator();
+    count_ = other.count_;
+    return *this;
+  }
+  template <typename U, typename UAllocator>
+  array_allocator_delete& operator=(
+    array_allocator_delete<U, UAllocator>&& other
+  ) noexcept
+  {
+    alloc_ = std::move(other.get_allocator());
+    count_ = other.count_;
+    return *this;
+  }
+
+  void operator()(pointer p)
+  {
+    std::integral_constant<bool, Uninitialized> ic;
+
+    detail::array_allocator_delete_impl(get_allocator(), p, count_, ic);
+  }
+
+  allocator_type& get_allocator() noexcept { return alloc_; }
+  allocator_type const& get_allocator() const noexcept { return alloc_; }
+
+  void swap(array_allocator_delete& other) noexcept
+  {
+    using std::swap;
+    swap(alloc_, other.alloc_);
+    swap(count_, other.count_);
+  }
+
+private:
+  allocator_type alloc_;
+  std::size_t    count_;
+};
+  
+template <typename T, typename Allocator>
+using uninitialized_array_allocator_delete
+  = array_allocator_delete<T, Allocator, true>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Pointer, typename Lambda>
+struct tagged_deleter : Lambda
+{
+  __host__ __device__
+  tagged_deleter(Lambda&& l) : Lambda(THRUST_FWD(l)) {}
+
+  using pointer = Pointer;
+};
+
+template <typename Pointer, typename Lambda>
+__host__ __device__
+tagged_deleter<Pointer, Lambda>
+make_tagged_deleter(Lambda&& l)
+{
+  return tagged_deleter<Pointer, Lambda>(THRUST_FWD(l));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Allocator, typename... Args>
+__host__
+std::unique_ptr<
+  T,
+  allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+allocate_unique(
+  Allocator const& alloc, Args&&... args
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [&alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, 1);
+    }
+  );
+  using hold_t = std::unique_ptr<T, decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, 1), hold_deleter);
+
+  traits::construct(
+    alloc_T, thrust::raw_pointer_cast(hold.get()), THRUST_FWD(args)...
+  );
+  auto deleter = allocator_delete<T, typename traits::allocator_type>(alloc);
+  return std::unique_ptr<T, decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator>
+__host__
+std::unique_ptr<
+  T,
+  uninitialized_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+uninitialized_allocate_unique(
+  Allocator const& alloc
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [&alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, 1);
+    }
+  );
+  using hold_t = std::unique_ptr<T, decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, 1), hold_deleter);
+
+  auto deleter = uninitialized_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T);
+  return std::unique_ptr<T, decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator, typename Size, typename... Args>
+__host__
+std::unique_ptr<
+  T[],
+  array_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+allocate_unique_n(
+  Allocator const& alloc, Size n, Args&&... args
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [n, &alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, n);
+    }
+  );
+  using hold_t = std::unique_ptr<T[], decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, n), hold_deleter);
+
+  uninitialized_construct_n_with_allocator(
+    alloc_T, hold.get(), n, THRUST_FWD(args)...
+  );
+  auto deleter = array_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T, n);
+  return std::unique_ptr<T[], decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator, typename Size>
+__host__
+std::unique_ptr<
+  T[],
+  uninitialized_array_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+uninitialized_allocate_unique_n(
+  Allocator const& alloc, Size n
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [n, &alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, n);
+    }
+  );
+  using hold_t = std::unique_ptr<T[], decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, n), hold_deleter);
+
+  auto deleter = uninitialized_array_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T, n);
+  return std::unique_ptr<T[], decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/async/copy.h b/thrust/thrust/async/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6d792d55c3cfbefadc88745b84c9afc26693be5
--- /dev/null
+++ b/thrust/thrust/async/copy.h
@@ -0,0 +1,149 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/copy.h
+ *  \brief Functions for asynchronously copying a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/copy.h>
+
+#include <thrust/event.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+namespace unimplemented
+{
+
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+__host__
+event<FromPolicy>
+async_copy(
+  thrust::execution_policy<FromPolicy>& from_exec
+, thrust::execution_policy<ToPolicy>&   to_exec
+, ForwardIt first, Sentinel last, OutputIt output
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace copy_detail
+{
+
+using thrust::async::unimplemented::async_copy;
+
+struct copy_fn final
+{
+  template <
+    typename FromPolicy, typename ToPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<FromPolicy> const& from_exec
+  , thrust::detail::execution_policy_base<ToPolicy> const&   to_exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_copy(
+      thrust::detail::derived_cast(thrust::detail::strip_const(from_exec))
+    , thrust::detail::derived_cast(thrust::detail::strip_const(to_exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  THRUST_RETURNS(
+    copy_fn::call(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+      // Synthesize a suitable new execution policy, because we don't want to
+      // try and extract twice from the one we were passed.
+    , typename remove_cvref_t<
+        decltype(thrust::detail::derived_cast(thrust::detail::strip_const(exec)))
+      >::tag_type{}
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, OutputIt&& output)
+  THRUST_RETURNS(
+    copy_fn::call(
+      thrust::detail::select_system(
+        typename thrust::iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , thrust::detail::select_system(
+        typename thrust::iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace copy_detail
+
+THRUST_INLINE_CONSTANT copy_detail::copy_fn copy{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/async/for_each.h b/thrust/thrust/async/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..df8e141180bdea705f826e8d61b5bc6109fae2b3
--- /dev/null
+++ b/thrust/thrust/async/for_each.h
@@ -0,0 +1,119 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a for_each of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/for_each.h
+ *  \brief Functions for asynchronously iterating over the elements of a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/for_each.h>
+
+#include <thrust/event.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename UnaryFunction
+>
+__host__
+event<DerivedPolicy>
+async_for_each(
+  thrust::execution_policy<DerivedPolicy>&, ForwardIt, Sentinel, UnaryFunction
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+} 
+
+} // namespace unimplemented
+
+namespace for_each_detail
+{
+    
+using thrust::async::unimplemented::async_for_each;
+
+struct for_each_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename UnaryFunction
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , UnaryFunction&& f 
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_for_each(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(f)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename UnaryFunction>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, UnaryFunction&& f) 
+  THRUST_RETURNS(
+    for_each_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(f)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace for_each_detail
+
+THRUST_INLINE_CONSTANT for_each_detail::for_each_fn for_each{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/async/reduce.h b/thrust/thrust/async/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..da2b1195d0acbb2d50fff2054e82ae4a7ae03f58
--- /dev/null
+++ b/thrust/thrust/async/reduce.h
@@ -0,0 +1,441 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/reduce.h
+ *  \brief Functions for asynchronously reducing a range to a single value.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/system/detail/adl/async/reduce.h>
+
+#include <thrust/future.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+>
+__host__ 
+future<DerivedPolicy, T>
+async_reduce(
+  thrust::execution_policy<DerivedPolicy>&, ForwardIt, Sentinel, T, BinaryOp
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+} 
+
+} // namespace unimplemented
+
+namespace reduce_detail
+{
+
+using thrust::async::unimplemented::async_reduce;
+
+struct reduce_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , T&& init
+  , BinaryOp&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename T
+  >
+  __host__
+  static auto call4(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , T&& init
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__
+  static auto
+  call3(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename T, typename BinaryOp>
+  __host__
+  static auto call4(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    BinaryOp&& op,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename T>
+  __host__
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    reduce_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_fn::call4(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__ 
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace reduce_detail
+
+THRUST_INLINE_CONSTANT reduce_detail::reduce_fn reduce{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename T, typename BinaryOp
+>
+__host__
+event<DerivedPolicy>
+async_reduce_into(
+  thrust::execution_policy<DerivedPolicy>&
+, ForwardIt, Sentinel, OutputIt, T, BinaryOp
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+} 
+
+} // namespace unimplemented
+
+namespace reduce_into_detail
+{
+
+using thrust::async::unimplemented::async_reduce_into;
+
+struct reduce_into_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T, typename BinaryOp
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , BinaryOp&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T
+  >
+  __host__
+  static auto call5(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto
+  call4(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T, typename BinaryOp
+  >
+  __host__
+  static auto call5(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , BinaryOp&& op
+  , thrust::false_type
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T
+  >
+  __host__
+  static auto call4(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , thrust::false_type
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_into_fn::call4(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4, T5&& t5)
+  THRUST_RETURNS(
+    reduce_into_fn::call5(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      THRUST_FWD(t5), thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__ 
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace reduce_into_detail
+
+THRUST_INLINE_CONSTANT reduce_into_detail::reduce_into_fn reduce_into{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/async/sort.h b/thrust/thrust/async/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..c665c6467e372929efbb586a8ffa19b761601c39
--- /dev/null
+++ b/thrust/thrust/async/sort.h
@@ -0,0 +1,275 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/sort.h
+ *  \brief Functions for asynchronously sorting a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/system/detail/adl/async/sort.h>
+
+#include <thrust/event.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+__host__ 
+event<DerivedPolicy>
+async_stable_sort(
+  thrust::execution_policy<DerivedPolicy>& 
+, ForwardIt, Sentinel, StrictWeakOrdering
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+} 
+
+} // namespace unimplemented
+
+namespace stable_sort_detail
+{
+
+using thrust::async::unimplemented::async_stable_sort;
+
+struct stable_sort_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+  >
+  __host__ 
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , StrictWeakOrdering&& comp
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_stable_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__ 
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_stable_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
+  __host__ 
+  static auto call(ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp) 
+  THRUST_RETURNS(
+    stable_sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__ 
+  static auto call(ForwardIt&& first, Sentinel&& last) 
+  THRUST_RETURNS(
+    stable_sort_fn::call(
+      THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__ 
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace stable_sort_detail
+
+THRUST_INLINE_CONSTANT stable_sort_detail::stable_sort_fn stable_sort{};
+
+namespace fallback
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+__host__ 
+event<DerivedPolicy>
+async_sort(
+  thrust::execution_policy<DerivedPolicy>& exec
+, ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp
+)
+{
+  return async_stable_sort(
+    thrust::detail::derived_cast(exec)
+  , THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(comp)
+  );
+} 
+
+} // namespace fallback
+
+namespace sort_detail
+{
+
+using thrust::async::fallback::async_sort;
+
+struct sort_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+  >
+  __host__ 
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , StrictWeakOrdering&& comp
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__ 
+  static auto call3(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
+  )
+  THRUST_RETURNS(
+    sort_fn::call(
+      exec
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
+  __host__ 
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    StrictWeakOrdering&& comp,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    sort_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                   thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__ 
+  static auto call(ForwardIt&& first, Sentinel&& last) 
+  THRUST_RETURNS(
+    sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__ 
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace sort_detail
+
+THRUST_INLINE_CONSTANT sort_detail::sort_fn sort{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/async/transform.h b/thrust/thrust/async/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..89687e93ad38ed03df4638b0b98f15b78c8826d7
--- /dev/null
+++ b/thrust/thrust/async/transform.h
@@ -0,0 +1,134 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a transform of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/transform.h
+ *  \brief Functions for asynchronously transforming a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/transform.h>
+
+#include <thrust/event.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename UnaryOperation
+>
+__host__
+event<DerivedPolicy>
+async_transform(
+  thrust::execution_policy<DerivedPolicy>& exec
+, ForwardIt first, Sentinel last, OutputIt output, UnaryOperation op
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace transform_detail
+{
+
+using thrust::async::unimplemented::async_transform;
+
+struct transform_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename UnaryOperation
+  >
+  __host__
+  static auto
+  call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , UnaryOperation&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_transform(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename UnaryOperation
+  >
+  __host__
+  static auto call(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , UnaryOperation&& op
+  )
+  THRUST_RETURNS(
+    transform_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace tranform_detail
+
+THRUST_INLINE_CONSTANT transform_detail::transform_fn transform{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/binary_search.h b/thrust/thrust/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..127be16aab996b03e7290bac5ae3d1d1fce27588
--- /dev/null
+++ b/thrust/thrust/binary_search.h
@@ -0,0 +1,1902 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file binary_search.h
+ *  \brief Search for values in sorted ranges.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+    
+/*! \addtogroup algorithms
+ */
+
+
+/*! \addtogroup searching
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \addtogroup binary_search Binary Search
+ *  \ingroup searching
+ *  \{
+ */
+
+
+//////////////////////   
+// Scalar Functions //
+//////////////////////
+
+
+/*! \p lower_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the first position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p lower_bound uses <tt>operator<</tt> for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>*j < value</tt>. 
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin()
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 1
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 8); // returns input.begin() + 4
+ *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const LessThanComparable &value);
+
+
+/*! \p lower_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the first position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p lower_bound uses <tt>operator<</tt> for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>*j < value</tt>. 
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::lower_bound(input.begin(), input.end(), 0); // returns input.begin()
+ *  thrust::lower_bound(input.begin(), input.end(), 1); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 2); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 3); // returns input.begin() + 2
+ *  thrust::lower_bound(input.begin(), input.end(), 8); // returns input.begin() + 4
+ *  thrust::lower_bound(input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class LessThanComparable>
+ForwardIterator lower_bound(ForwardIterator first, 
+                            ForwardIterator last,
+                            const LessThanComparable& value);
+
+
+/*! \p lower_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the first position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p lower_bound uses function object \c comp for comparison 
+ * and returns the furthermost iterator \c i in <tt>[first, last)</tt>
+ * such that, for every iterator \c j in <tt>[first, i)</tt>, 
+ * <tt>comp(*j, value)</tt> is \c true. 
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::lower_bound(input.begin(), input.end(), 0, thrust::less<int>()); // returns input.begin()
+ *  thrust::lower_bound(input.begin(), input.end(), 1, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 2, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 3, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::lower_bound(input.begin(), input.end(), 8, thrust::less<int>()); // returns input.begin() + 4
+ *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T &value,
+                            StrictWeakOrdering comp);
+
+
+/*! \p lower_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the first position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p lower_bound uses function object \c comp for comparison 
+ * and returns the furthermost iterator \c i in <tt>[first, last)</tt>
+ * such that, for every iterator \c j in <tt>[first, i)</tt>, 
+ * <tt>comp(*j, value)</tt> is \c true. 
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::lower_bound(input.begin(), input.end(), 0, thrust::less<int>()); // returns input.begin()
+ *  thrust::lower_bound(input.begin(), input.end(), 1, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 2, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::lower_bound(input.begin(), input.end(), 3, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::lower_bound(input.begin(), input.end(), 8, thrust::less<int>()); // returns input.begin() + 4
+ *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class T, class StrictWeakOrdering>
+ForwardIterator lower_bound(ForwardIterator first,
+                            ForwardIterator last,
+                            const T& value, 
+                            StrictWeakOrdering comp);
+
+
+/*! \p upper_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the last position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p upper_bound uses <tt>operator<</tt> for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>value < *j</tt>
+ * is \c false.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelism:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin() + 1
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 2
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 8); // returns input.end()
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const LessThanComparable &value);
+
+
+/*! \p upper_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the last position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p upper_bound uses <tt>operator<</tt> for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>value < *j</tt>
+ * is \c false.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::upper_bound(input.begin(), input.end(), 0); // returns input.begin() + 1
+ *  thrust::upper_bound(input.begin(), input.end(), 1); // returns input.begin() + 1
+ *  thrust::upper_bound(input.begin(), input.end(), 2); // returns input.begin() + 2
+ *  thrust::upper_bound(input.begin(), input.end(), 3); // returns input.begin() + 2
+ *  thrust::upper_bound(input.begin(), input.end(), 8); // returns input.end()
+ *  thrust::upper_bound(input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class LessThanComparable>
+ForwardIterator upper_bound(ForwardIterator first, 
+                            ForwardIterator last,
+                            const LessThanComparable& value);
+
+
+/*! \p upper_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the last position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p upper_bound uses function object \c comp for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>comp(value, *j)</tt>
+ * is \c false.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 0, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 1, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 2, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 3, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 8, thrust::less<int>()); // returns input.end()
+ *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T &value,
+                            StrictWeakOrdering comp);
+
+/*! \p upper_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * Specifically, it returns the last position where value could be
+ * inserted without violating the ordering. This version of 
+ * \p upper_bound uses function object \c comp for comparison and returns
+ * the furthermost iterator \c i in <tt>[first, last)</tt> such that,
+ * for every iterator \c j in <tt>[first, i)</tt>, <tt>comp(value, *j)</tt>
+ * is \c false.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::upper_bound(input.begin(), input.end(), 0, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::upper_bound(input.begin(), input.end(), 1, thrust::less<int>()); // returns input.begin() + 1
+ *  thrust::upper_bound(input.begin(), input.end(), 2, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::upper_bound(input.begin(), input.end(), 3, thrust::less<int>()); // returns input.begin() + 2
+ *  thrust::upper_bound(input.begin(), input.end(), 8, thrust::less<int>()); // returns input.end()
+ *  thrust::upper_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class T, class StrictWeakOrdering>
+ForwardIterator upper_bound(ForwardIterator first,
+                            ForwardIterator last,
+                            const T& value, 
+                            StrictWeakOrdering comp);
+
+
+/*! \p binary_search is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  Specifically, this version returns \c true if and only if 
+ * there exists an iterator \c i in <tt>[first, last)</tt> such that 
+ * <tt>*i < value</tt> and <tt>value < *i</tt> are both \c false.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 0); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 1); // returns false
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 2); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 3); // returns false
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 8); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9); // returns false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   ForwardIterator first, 
+                   ForwardIterator last,
+                   const LessThanComparable& value);
+
+
+/*! \p binary_search is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  Specifically, this version returns \c true if and only if 
+ * there exists an iterator \c i in <tt>[first, last)</tt> such that 
+ * <tt>*i < value</tt> and <tt>value < *i</tt> are both \c false.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::binary_search(input.begin(), input.end(), 0); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 1); // returns false
+ *  thrust::binary_search(input.begin(), input.end(), 2); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 3); // returns false
+ *  thrust::binary_search(input.begin(), input.end(), 8); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 9); // returns false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <class ForwardIterator, class LessThanComparable>
+bool binary_search(ForwardIterator first, 
+                   ForwardIterator last,
+                   const LessThanComparable& value);
+
+
+/*! \p binary_search is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  Specifically, this version returns \c true if and only if 
+ * there exists an iterator \c i in <tt>[first, last)</tt> such that 
+ * <tt>comp(*i, value)</tt> and <tt>comp(value, *i)</tt> are both \c false.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 0, thrust::less<int>()); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 1, thrust::less<int>()); // returns false
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 2, thrust::less<int>()); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 3, thrust::less<int>()); // returns false
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 8, thrust::less<int>()); // returns true
+ *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   ForwardIterator first,
+                   ForwardIterator last,
+                   const T& value, 
+                   StrictWeakOrdering comp);
+
+
+/*! \p binary_search is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. 
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  Specifically, this version returns \c true if and only if 
+ * there exists an iterator \c i in <tt>[first, last)</tt> such that 
+ * <tt>comp(*i, value)</tt> and <tt>comp(value, *i)</tt> are both \c false.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::binary_search(input.begin(), input.end(), 0, thrust::less<int>()); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 1, thrust::less<int>()); // returns false
+ *  thrust::binary_search(input.begin(), input.end(), 2, thrust::less<int>()); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 3, thrust::less<int>()); // returns false
+ *  thrust::binary_search(input.begin(), input.end(), 8, thrust::less<int>()); // returns true
+ *  thrust::binary_search(input.begin(), input.end(), 9, thrust::less<int>()); // returns false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <class ForwardIterator, class T, class StrictWeakOrdering>
+bool binary_search(ForwardIterator first,
+                   ForwardIterator last,
+                   const T& value, 
+                   StrictWeakOrdering comp);
+
+
+/*! \p equal_range is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. The 
+ * value returned by \p equal_range is essentially a combination of
+ * the values returned by \p lower_bound and \p upper_bound: it returns
+ * a \p pair of iterators \c i and \c j such that \c i is the first
+ * position where value could be inserted without violating the 
+ * ordering and \c j is the last position where value could be inserted
+ * without violating the ordering. It follows that every element in the
+ * range <tt>[i, j)</tt> is equivalent to value, and that 
+ * <tt>[i, j)</tt> is the largest subrange of <tt>[first, last)</tt> that
+ * has this property. 
+ *
+ * This version of \p equal_range returns a \p pair of iterators 
+ * <tt>[i, j)</tt>, where \c i is the furthermost iterator in 
+ * <tt>[first, last)</tt> such that, for every iterator \c k in 
+ * <tt>[first, i)</tt>, <tt>*k < value</tt>.  \c j is the furthermost
+ * iterator in <tt>[first, last)</tt> such that, for every iterator 
+ * \c k in <tt>[first, j)</tt>, <tt>value < *k</tt> is \c false. 
+ * For every iterator \c k in <tt>[i, j)</tt>, neither 
+ * <tt>value < *k</tt> nor <tt>*k < value</tt> is \c true.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p equal_range
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end)
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable& value);
+
+
+/*! \p equal_range is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. The 
+ * value returned by \p equal_range is essentially a combination of
+ * the values returned by \p lower_bound and \p upper_bound: it returns
+ * a \p pair of iterators \c i and \c j such that \c i is the first
+ * position where value could be inserted without violating the 
+ * ordering and \c j is the last position where value could be inserted
+ * without violating the ordering. It follows that every element in the
+ * range <tt>[i, j)</tt> is equivalent to value, and that 
+ * <tt>[i, j)</tt> is the largest subrange of <tt>[first, last)</tt> that
+ * has this property. 
+ *
+ * This version of \p equal_range returns a \p pair of iterators 
+ * <tt>[i, j)</tt>, where \c i is the furthermost iterator in 
+ * <tt>[first, last)</tt> such that, for every iterator \c k in 
+ * <tt>[first, i)</tt>, <tt>*k < value</tt>.  \c j is the furthermost
+ * iterator in <tt>[first, last)</tt> such that, for every iterator 
+ * \c k in <tt>[first, j)</tt>, <tt>value < *k</tt> is \c false. 
+ * For every iterator \c k in <tt>[i, j)</tt>, neither 
+ * <tt>value < *k</tt> nor <tt>*k < value</tt> is \c true.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *
+ *  The following code snippet demonstrates how to use \p equal_range
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::equal_range(input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1)
+ *  thrust::equal_range(input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1)
+ *  thrust::equal_range(input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2)
+ *  thrust::equal_range(input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2)
+ *  thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end)
+ *  thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end)
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class LessThanComparable>
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable& value);
+
+
+/*! \p equal_range is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. The 
+ * value returned by \p equal_range is essentially a combination of
+ * the values returned by \p lower_bound and \p upper_bound: it returns
+ * a \p pair of iterators \c i and \c j such that \c i is the first
+ * position where value could be inserted without violating the 
+ * ordering and \c j is the last position where value could be inserted
+ * without violating the ordering. It follows that every element in the
+ * range <tt>[i, j)</tt> is equivalent to value, and that 
+ * <tt>[i, j)</tt> is the largest subrange of <tt>[first, last)</tt> that
+ * has this property. 
+ *
+ * This version of \p equal_range returns a \p pair of iterators 
+ * <tt>[i, j)</tt>. \c i is the furthermost iterator in 
+ * <tt>[first, last)</tt> such that, for every iterator \c k in 
+ * <tt>[first, i)</tt>, <tt>comp(*k, value)</tt> is \c true.
+ * \c j is the furthermost iterator in <tt>[first, last)</tt> such
+ * that, for every iterator \c k in <tt>[first, last)</tt>, 
+ * <tt>comp(value, *k)</tt> is \c false. For every iterator \c k 
+ * in <tt>[i, j)</tt>, neither <tt>comp(value, *k)</tt> nor 
+ * <tt>comp(*k, value)</tt> is \c true.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p equal_range
+ *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 0, thrust::less<int>()); // returns [input.begin(), input.begin() + 1)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 1, thrust::less<int>()); // returns [input.begin() + 1, input.begin() + 1)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 2, thrust::less<int>()); // returns [input.begin() + 1, input.begin() + 2)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 3, thrust::less<int>()); // returns [input.begin() + 2, input.begin() + 2)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 8, thrust::less<int>()); // returns [input.begin() + 4, input.end)
+ *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T& value,
+            StrictWeakOrdering comp);
+
+
+/*! \p equal_range is a version of binary search: it attempts to find
+ * the element value in an ordered range <tt>[first, last)</tt>. The 
+ * value returned by \p equal_range is essentially a combination of
+ * the values returned by \p lower_bound and \p upper_bound: it returns
+ * a \p pair of iterators \c i and \c j such that \c i is the first
+ * position where value could be inserted without violating the 
+ * ordering and \c j is the last position where value could be inserted
+ * without violating the ordering. It follows that every element in the
+ * range <tt>[i, j)</tt> is equivalent to value, and that 
+ * <tt>[i, j)</tt> is the largest subrange of <tt>[first, last)</tt> that
+ * has this property. 
+ *
+ * This version of \p equal_range returns a \p pair of iterators 
+ * <tt>[i, j)</tt>. \c i is the furthermost iterator in 
+ * <tt>[first, last)</tt> such that, for every iterator \c k in 
+ * <tt>[first, i)</tt>, <tt>comp(*k, value)</tt> is \c true.
+ * \c j is the furthermost iterator in <tt>[first, last)</tt> such
+ * that, for every iterator \c k in <tt>[first, last)</tt>, 
+ * <tt>comp(value, *k)</tt> is \c false. For every iterator \c k 
+ * in <tt>[i, j)</tt>, neither <tt>comp(value, *k)</tt> nor 
+ * <tt>comp(*k, value)</tt> is \c true.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param value The value to be searched.
+ *  \param comp The comparison operator.
+ *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam T is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p equal_range
+ *  to search for values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::equal_range(input.begin(), input.end(), 0, thrust::less<int>()); // returns [input.begin(), input.begin() + 1)
+ *  thrust::equal_range(input.begin(), input.end(), 1, thrust::less<int>()); // returns [input.begin() + 1, input.begin() + 1)
+ *  thrust::equal_range(input.begin(), input.end(), 2, thrust::less<int>()); // returns [input.begin() + 1, input.begin() + 2)
+ *  thrust::equal_range(input.begin(), input.end(), 3, thrust::less<int>()); // returns [input.begin() + 2, input.begin() + 2)
+ *  thrust::equal_range(input.begin(), input.end(), 8, thrust::less<int>()); // returns [input.begin() + 4, input.end)
+ *  thrust::equal_range(input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class T, class StrictWeakOrdering>
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(ForwardIterator first,
+            ForwardIterator last,
+            const T& value,
+            StrictWeakOrdering comp);
+
+
+/*! \addtogroup vectorized_binary_search Vectorized Searches
+ *  \ingroup binary_search
+ *  \{
+ */
+
+
+//////////////////////
+// Vector Functions //
+//////////////////////
+
+
+/*! \p lower_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for multiple values in a ordered range using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::lower_bound(thrust::device,
+ *                      input.begin(), input.end(),
+ *                      values.begin(), values.end(),
+ *                      output.begin());
+ *
+ *  // output is now [0, 1, 1, 2, 4, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result);
+
+
+/*! \p lower_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::lower_bound(input.begin(), input.end(),
+ *                      values.begin(), values.end(),
+ *                      output.begin());
+ *
+ *  // output is now [0, 1, 1, 2, 4, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator>
+OutputIterator lower_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result);
+
+
+/*! \p lower_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.  This version of 
+ * \p lower_bound uses function object \c comp for comparison.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::lower_bound(input.begin(), input.end(),
+ *                      values.begin(), values.end(), 
+ *                      output.begin(),
+ *                      thrust::less<int>());
+ *
+ *  // output is now [0, 1, 1, 2, 4, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result,
+                           StrictWeakOrdering comp);
+
+
+/*! \p lower_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.  This version of 
+ * \p lower_bound uses function object \c comp for comparison.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p lower_bound
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::lower_bound(input.begin(), input.end(),
+ *                      values.begin(), values.end(), 
+ *                      output.begin(),
+ *                      thrust::less<int>());
+ *
+ *  // output is now [0, 1, 1, 2, 4, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator, class StrictWeakOrdering>
+OutputIterator lower_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result,
+                           StrictWeakOrdering comp);
+
+
+/*! \p upper_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of last position where value could
+ * be inserted without violating the ordering.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for multiple values in a ordered range using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::upper_bound(thrust::device,
+ *                      input.begin(), input.end(),
+ *                      values.begin(), values.end(),
+ *                      output.begin());
+ *
+ *  // output is now [1, 1, 2, 2, 5, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result);
+
+
+/*! \p upper_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of last position where value could
+ * be inserted without violating the ordering.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::upper_bound(input.begin(), input.end(),
+ *                      values.begin(), values.end(),
+ *                      output.begin());
+ *
+ *  // output is now [1, 1, 2, 2, 5, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator>
+OutputIterator upper_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result);
+
+
+/*! \p upper_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.  This version of 
+ * \p upper_bound uses function object \c comp for comparison.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for multiple values in a ordered range using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::upper_bound(thrust::device,
+ *                      input.begin(), input.end(),
+ *                      values.begin(), values.end(), 
+ *                      output.begin(),
+ *                      thrust::less<int>());
+ *
+ *  // output is now [1, 1, 2, 2, 5, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result,
+                           StrictWeakOrdering comp);
+
+
+/*! \p upper_bound is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * Specifically, it returns the index of first position where value could
+ * be inserted without violating the ordering.  This version of 
+ * \p upper_bound uses function object \c comp for comparison.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p upper_bound
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<unsigned int> output(6);
+ *
+ *  thrust::upper_bound(input.begin(), input.end(),
+ *                      values.begin(), values.end(), 
+ *                      output.begin(),
+ *                      thrust::less<int>());
+ *
+ *  // output is now [1, 1, 2, 2, 5, 5]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see \p lower_bound
+ *  \see \p equal_range
+ *  \see \p binary_search
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator, class StrictWeakOrdering>
+OutputIterator upper_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator result,
+                           StrictWeakOrdering comp);
+
+
+/*! \p binary_search is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and bool is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for multiple values in a ordered range using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<bool> output(6);
+ *
+ *  thrust::binary_search(thrust::device,
+ *                        input.begin(), input.end(),
+ *                        values.begin(), values.end(),
+ *                        output.begin());
+ *
+ *  // output is now [true, false, true, false, true, false]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator result);
+
+
+/*! \p binary_search is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and bool is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<bool> output(6);
+ *
+ *  thrust::binary_search(input.begin(), input.end(),
+ *                        values.begin(), values.end(),
+ *                        output.begin());
+ *
+ *  // output is now [true, false, true, false, true, false]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator>
+OutputIterator binary_search(ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator result);
+
+
+/*! \p binary_search is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  This version of \p binary_search uses function object 
+ * \c comp for comparison.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and bool is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for multiple values in a ordered range using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<bool> output(6);
+ *
+ *  thrust::binary_search(thrust::device,
+ *                        input.begin(), input.end(),
+ *                        values.begin(), values.end(),
+ *                        output.begin(),
+ *                        thrust::less<T>());
+ *
+ *  // output is now [true, false, true, false, true, false]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator result,
+                             StrictWeakOrdering comp);
+
+
+/*! \p binary_search is a vectorized version of binary search: for each 
+ * iterator \c v in <tt>[values_first, values_last)</tt> it attempts to
+ * find the value <tt>*v</tt> in an ordered range <tt>[first, last)</tt>.
+ * It returns \c true if an element that is equivalent to \c value 
+ * is present in <tt>[first, last)</tt> and \c false if no such element
+ * exists.  This version of \p binary_search uses function object 
+ * \c comp for comparison.
+ *
+ *  \param first The beginning of the ordered sequence.
+ *  \param last The end of the ordered sequence.
+ *  \param values_first The beginning of the search values sequence.
+ *  \param values_last The end of the search values sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param comp The comparison operator.
+ * 
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *                        and bool is convertible to \c OutputIterator's \c value_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p binary_search
+ *  to search for multiple values in a ordered range.
+ *
+ *  \code
+ *  #include <thrust/binary_search.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<int> input(5);
+ *
+ *  input[0] = 0;
+ *  input[1] = 2;
+ *  input[2] = 5;
+ *  input[3] = 7;
+ *  input[4] = 8;
+ *
+ *  thrust::device_vector<int> values(6);
+ *  values[0] = 0; 
+ *  values[1] = 1;
+ *  values[2] = 2;
+ *  values[3] = 3;
+ *  values[4] = 8;
+ *  values[5] = 9;
+ *
+ *  thrust::device_vector<bool> output(6);
+ *
+ *  thrust::binary_search(input.begin(), input.end(),
+ *                        values.begin(), values.end(),
+ *                        output.begin(),
+ *                        thrust::less<T>());
+ *
+ *  // output is now [true, false, true, false, true, false]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see \p lower_bound
+ *  \see \p upper_bound
+ *  \see \p equal_range
+ */
+template <class ForwardIterator, class InputIterator, class OutputIterator, class StrictWeakOrdering>
+OutputIterator binary_search(ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator result,
+                             StrictWeakOrdering comp);
+
+
+/*! \} // end vectorized_binary_search
+ */
+
+
+/*! \} // end binary_search
+ */
+
+
+/*! \} // end searching
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/binary_search.inl>
+
diff --git a/thrust/thrust/cmake/FindTBB.cmake b/thrust/thrust/cmake/FindTBB.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..f0d5c8119b56036c23f930b6c7ea3f470f513d72
--- /dev/null
+++ b/thrust/thrust/cmake/FindTBB.cmake
@@ -0,0 +1,440 @@
+# - Find ThreadingBuildingBlocks include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(TBB
+#    [REQUIRED]             # Fail with error if TBB is not found
+#    )                      #
+# Once done, this will define
+#
+#  TBB_FOUND - system has TBB
+#  TBB_INCLUDE_DIRS - the TBB include directories
+#  TBB_LIBRARIES - TBB libraries to be lined, doesn't include malloc or
+#                  malloc proxy
+#  TBB::tbb - imported target for the TBB library
+#
+#  TBB_VERSION - Product Version Number ("MAJOR.MINOR")
+#  TBB_VERSION_MAJOR - Major Product Version Number
+#  TBB_VERSION_MINOR - Minor Product Version Number
+#  TBB_INTERFACE_VERSION - Engineering Focused Version Number
+#  TBB_COMPATIBLE_INTERFACE_VERSION - The oldest major interface version
+#                                     still supported. This uses the engineering
+#                                     focused interface version numbers.
+#
+#  TBB_MALLOC_FOUND - system has TBB malloc library
+#  TBB_MALLOC_INCLUDE_DIRS - the TBB malloc include directories
+#  TBB_MALLOC_LIBRARIES - The TBB malloc libraries to be lined
+#  TBB::malloc - imported target for the TBB malloc library
+#
+#  TBB_MALLOC_PROXY_FOUND - system has TBB malloc proxy library
+#  TBB_MALLOC_PROXY_INCLUDE_DIRS = the TBB malloc proxy include directories
+#  TBB_MALLOC_PROXY_LIBRARIES - The TBB malloc proxy libraries to be lined
+#  TBB::malloc_proxy - imported target for the TBB malloc proxy library
+#
+#
+# This module reads hints about search locations from variables:
+#  ENV TBB_ARCH_PLATFORM - for eg. set it to "mic" for Xeon Phi builds
+#  ENV TBB_ROOT or just TBB_ROOT - root directory of tbb installation
+#  ENV TBB_BUILD_PREFIX - specifies the build prefix for user built tbb
+#                         libraries. Should be specified with ENV TBB_ROOT
+#                         and optionally...
+#  ENV TBB_BUILD_DIR - if build directory is different than ${TBB_ROOT}/build
+#
+#
+# Modified by Robert Maynard from the original OGRE source
+#
+#-------------------------------------------------------------------
+# This file is part of the CMake build system for OGRE
+#     (Object-oriented Graphics Rendering Engine)
+# For the latest info, see http://www.ogre3d.org/
+#
+# The contents of this file are placed in the public domain. Feel
+# free to make use of it in any way you like.
+#-------------------------------------------------------------------
+#
+#=============================================================================
+# Copyright 2010-2012 Kitware, Inc.
+# Copyright 2012      Rolf Eike Beer <eike@sf-mail.de>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+#=============================================================================
+#  FindTBB helper functions and macros
+#
+
+#====================================================
+# Fix the library path in case it is a linker script
+#====================================================
+function(tbb_extract_real_library library real_library)
+  if(NOT UNIX OR NOT EXISTS ${library})
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  #Read in the first 4 bytes and see if they are the ELF magic number
+  set(_elf_magic "7f454c46")
+  file(READ ${library} _hex_data OFFSET 0 LIMIT 4 HEX)
+  if(_hex_data STREQUAL _elf_magic)
+    #we have opened a elf binary so this is what
+    #we should link to
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  file(READ ${library} _data OFFSET 0 LIMIT 1024)
+  if("${_data}" MATCHES "INPUT \\(([^(]+)\\)")
+    #extract out the .so name from REGEX MATCH command
+    set(_proper_so_name "${CMAKE_MATCH_1}")
+
+    #construct path to the real .so which is presumed to be in the same directory
+    #as the input file
+    get_filename_component(_so_dir "${library}" DIRECTORY)
+    set(${real_library} "${_so_dir}/${_proper_so_name}" PARENT_SCOPE)
+  else()
+    #unable to determine what this library is so just hope everything works
+    #and pass it unmodified.
+    set(${real_library} "${library}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+#===============================================
+# Do the final processing for the package find.
+#===============================================
+macro(findpkg_finish PREFIX TARGET_NAME)
+  if (${PREFIX}_INCLUDE_DIR AND ${PREFIX}_LIBRARY)
+    set(${PREFIX}_FOUND TRUE)
+    set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIR})
+    set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARY})
+  else ()
+    if (${PREFIX}_FIND_REQUIRED)
+      message(FATAL_ERROR "Required library ${PREFIX} not found.")
+    elseif (NOT ${PREFIX}_FIND_QUIETLY)
+      message("Library ${PREFIX} not found.")
+    endif()
+    return()
+  endif ()
+
+  if (NOT TARGET "TBB::${TARGET_NAME}")
+    if (${PREFIX}_LIBRARY_RELEASE)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_RELEASE} real_release)
+    endif ()
+    if (${PREFIX}_LIBRARY_DEBUG)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_DEBUG} real_debug)
+    endif ()
+    add_library(TBB::${TARGET_NAME} UNKNOWN IMPORTED)
+    set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${${PREFIX}_INCLUDE_DIR}")
+    if (${PREFIX}_LIBRARY_DEBUG AND ${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}"
+        IMPORTED_LOCATION_DEBUG "${real_debug}"
+        IMPORTED_LOCATION_RELEASE "${real_release}")
+    elseif (${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}")
+    elseif (${PREFIX}_LIBRARY_DEBUG)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_debug}")
+    endif ()
+  endif ()
+
+  #mark the following variables as internal variables
+  mark_as_advanced(${PREFIX}_INCLUDE_DIR
+                   ${PREFIX}_LIBRARY
+                   ${PREFIX}_LIBRARY_DEBUG
+                   ${PREFIX}_LIBRARY_RELEASE)
+endmacro()
+
+#===============================================
+# Generate debug names from given release names
+#===============================================
+macro(get_debug_names PREFIX)
+  foreach(i ${${PREFIX}})
+    set(${PREFIX}_DEBUG ${${PREFIX}_DEBUG} ${i}d ${i}D ${i}_d ${i}_D ${i}_debug ${i})
+  endforeach()
+endmacro()
+
+#===============================================
+# See if we have env vars to help us find tbb
+#===============================================
+macro(getenv_path VAR)
+   set(ENV_${VAR} $ENV{${VAR}})
+   # replace won't work if var is blank
+   if (ENV_${VAR})
+     string( REGEX REPLACE "\\\\" "/" ENV_${VAR} ${ENV_${VAR}} )
+   endif ()
+endmacro()
+
+#===============================================
+# Couple a set of release AND debug libraries
+#===============================================
+macro(make_library_set PREFIX)
+  if (${PREFIX}_RELEASE AND ${PREFIX}_DEBUG)
+    set(${PREFIX} optimized ${${PREFIX}_RELEASE} debug ${${PREFIX}_DEBUG})
+  elseif (${PREFIX}_RELEASE)
+    set(${PREFIX} ${${PREFIX}_RELEASE})
+  elseif (${PREFIX}_DEBUG)
+    set(${PREFIX} ${${PREFIX}_DEBUG})
+  endif ()
+endmacro()
+
+
+#=============================================================================
+#  Now to actually find TBB
+#
+
+# Get path, convert backslashes as ${ENV_${var}}
+getenv_path(TBB_ROOT)
+
+# initialize search paths
+set(TBB_PREFIX_PATH ${TBB_ROOT} ${ENV_TBB_ROOT})
+set(TBB_INC_SEARCH_PATH "")
+set(TBB_LIB_SEARCH_PATH "")
+
+
+# If user built from sources
+set(TBB_BUILD_PREFIX $ENV{TBB_BUILD_PREFIX})
+if (TBB_BUILD_PREFIX AND ENV_TBB_ROOT)
+  getenv_path(TBB_BUILD_DIR)
+  if (NOT ENV_TBB_BUILD_DIR)
+    set(ENV_TBB_BUILD_DIR ${ENV_TBB_ROOT}/build)
+  endif ()
+
+  # include directory under ${ENV_TBB_ROOT}/include
+  list(APPEND TBB_LIB_SEARCH_PATH
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_release
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_debug)
+endif ()
+
+
+# For Windows, let's assume that the user might be using the precompiled
+# TBB packages from the main website. These use a rather awkward directory
+# structure (at least for automatically finding the right files) depending
+# on platform and compiler, but we'll do our best to accommodate it.
+# Not adding the same effort for the precompiled linux builds, though. Those
+# have different versions for CC compiler versions and linux kernels which
+# will never adequately match the user's setup, so there is no feasible way
+# to detect the "best" version to use. The user will have to manually
+# select the right files. (Chances are the distributions are shipping their
+# custom version of tbb, anyway, so the problem is probably nonexistent.)
+if (WIN32 AND MSVC)
+  set(COMPILER_PREFIX "vc7.1")
+  if (MSVC_VERSION EQUAL 1400)
+    set(COMPILER_PREFIX "vc8")
+  elseif(MSVC_VERSION EQUAL 1500)
+    set(COMPILER_PREFIX "vc9")
+  elseif(MSVC_VERSION EQUAL 1600)
+    set(COMPILER_PREFIX "vc10")
+  elseif(MSVC_VERSION EQUAL 1700)
+    set(COMPILER_PREFIX "vc11")
+  elseif(MSVC_VERSION EQUAL 1800)
+    set(COMPILER_PREFIX "vc12")
+  elseif(MSVC_VERSION GREATER_EQUAL 1900 AND MSVC_VERSION LESS_EQUAL 1925)
+      # 1900-1925 actually spans three Visual Studio versions:
+      # 1900      = VS 14.0 (v140 toolset) a.k.a. MSVC 2015
+      # 1910-1919 = VS 15.0 (v141 toolset) a.k.a. MSVC 2017
+      # 1920-1929 = VS 16.0 (v142 toolset) a.k.a. MSVC 2019
+      #
+      # But these are binary compatible and TBB's open source distribution only
+      # ships a single vs14 lib (as of 2020.0)
+    set(COMPILER_PREFIX "vc14")
+  else()
+    # The next poor soul who finds themselves having to decode visual studio
+    # version conventions may find these helpful:
+    # - https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
+    # - https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B#Internal_version_numbering
+    message(AUTHOR_WARNING
+      "Unrecognized MSVC version. Please update FindTBB.cmake. "
+      "Some TBB_* values may need to be set manually."
+    )
+  endif ()
+
+  # for each prefix path, add ia32/64\${COMPILER_PREFIX}\lib to the lib search path
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    if (CMAKE_CL_64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia64/${COMPILER_PREFIX})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${COMPILER_PREFIX})
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${COMPILER_PREFIX})
+    endif ()
+  endforeach ()
+endif ()
+
+# For OS X binary distribution, choose libc++ based libraries for Mavericks (10.9)
+# and above and AppleClang
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+    NOT CMAKE_SYSTEM_VERSION VERSION_LESS 13.0)
+  set (USE_LIBCXX OFF)
+  cmake_policy(GET CMP0025 POLICY_VAR)
+
+  if (POLICY_VAR STREQUAL "NEW")
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+      set (USE_LIBCXX ON)
+    endif ()
+  else ()
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      set (USE_LIBCXX ON)
+    endif ()
+  endif ()
+
+  if (USE_LIBCXX)
+    foreach (dir IN LISTS TBB_PREFIX_PATH)
+      list (APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/libc++ ${dir}/libc++/lib)
+    endforeach ()
+  endif ()
+endif ()
+
+# check compiler ABI
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4)
+    list(APPEND COMPILER_PREFIX "gcc4.4")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.1")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.6)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+else() # Assume compatibility with 4.4 for other compilers
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+endif ()
+
+# if platform architecture is explicitly specified
+set(TBB_ARCH_PLATFORM $ENV{TBB_ARCH_PLATFORM})
+if (TBB_ARCH_PLATFORM)
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/${TBB_ARCH_PLATFORM}/lib)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/${TBB_ARCH_PLATFORM})
+  endforeach ()
+endif ()
+
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  foreach (prefix IN LISTS COMPILER_PREFIX)
+    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${prefix}/lib)
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${prefix}/lib)
+    endif ()
+  endforeach()
+endforeach ()
+
+# add general search paths
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib ${dir}/Lib ${dir}/lib/tbb
+    ${dir}/Libs)
+  list(APPEND TBB_INC_SEARCH_PATH ${dir}/include ${dir}/Include
+    ${dir}/include/tbb)
+endforeach ()
+
+set(TBB_LIBRARY_NAMES tbb)
+get_debug_names(TBB_LIBRARY_NAMES)
+
+
+find_path(TBB_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_LIBRARY_RELEASE
+             NAMES ${TBB_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_LIBRARY_DEBUG
+             NAMES ${TBB_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_LIBRARY)
+
+findpkg_finish(TBB tbb)
+
+#if we haven't found TBB no point on going any further
+if (NOT TBB_FOUND)
+  return()
+endif ()
+
+#=============================================================================
+# Look for TBB's malloc package
+set(TBB_MALLOC_LIBRARY_NAMES tbbmalloc)
+get_debug_names(TBB_MALLOC_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_LIBRARY)
+
+findpkg_finish(TBB_MALLOC tbbmalloc)
+
+#=============================================================================
+# Look for TBB's malloc proxy package
+set(TBB_MALLOC_PROXY_LIBRARY_NAMES tbbmalloc_proxy)
+get_debug_names(TBB_MALLOC_PROXY_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_PROXY_INCLUDE_DIR
+          NAMES tbb/tbbmalloc_proxy.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_PROXY_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_PROXY_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_PROXY_LIBRARY)
+
+findpkg_finish(TBB_MALLOC_PROXY tbbmalloc_proxy)
+
+
+#=============================================================================
+#parse all the version numbers from tbb
+if(NOT TBB_VERSION)
+
+ #only read the start of the file
+ file(STRINGS
+      "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h"
+      TBB_VERSION_CONTENTS
+      REGEX "VERSION")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MAJOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MINOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_COMPATIBLE_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_COMPATIBLE_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+
+endif()
diff --git a/thrust/thrust/cmake/README.md b/thrust/thrust/cmake/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c032411d043aafc5dff3da8eeb15a42ce796f7f3
--- /dev/null
+++ b/thrust/thrust/cmake/README.md
@@ -0,0 +1,215 @@
+# Using Thrust with CMake
+
+Thrust provides configuration files that simplify using Thrust
+from other CMake projects. Requirements:
+
+- Thrust >= 1.9.10
+- CMake >= 3.15
+
+See the [Fixing Legacy FindThrust.cmake](#fixing-legacy-findthrustcmake)
+section for solutions that work on older Thrust versions.
+
+## User Guide
+
+#### Default Configuration (CUDA)
+
+Thrust is configured using a `thrust_create_target` CMake function that
+assembles a complete interface to the Thrust library:
+
+```cmake
+find_package(Thrust REQUIRED CONFIG)
+thrust_create_target(Thrust)
+target_link_libraries(MyProgram Thrust)
+```
+
+The first argument is the name of the interface target to create, and any
+additional options will be used to configure the target. By default,
+`thrust_create_target` will configure its result to use CUDA acceleration.
+
+If desired, `thrust_create_target` may be called multiple times to build
+several unique Thrust interface targets with different configurations, as
+detailed below.
+
+**Note:** If CMake is unable to locate Thrust, specify the path to Thrust's CMake
+configuration directory (where this README file is located) as `Thrust_DIR`,
+e.g.:
+
+```
+$ cmake . -DThrust_DIR=/usr/local/cuda/include/thrust/cmake/
+```
+
+#### TBB / OpenMP
+
+To explicitly specify host/device systems, `HOST` and `DEVICE` arguments can be
+passed to `thrust_create_target`. If an explicit system is not specified, the
+target will default to using CPP for host and/or CUDA for device.
+
+```cmake
+thrust_create_target(ThrustTBB DEVICE TBB)
+thrust_create_target(ThrustOMP HOST CPP DEVICE OMP)
+```
+
+will create targets `ThrustTBB` and `ThrustOMP`. Both will use the serial `CPP`
+host system, but will find and use TBB or OpenMP for the device system.
+
+#### Configure Target from Cache Options
+
+To allow a Thrust target to be configurable easily via `cmake-gui` or
+`ccmake`, pass the `FROM_OPTIONS` flag to `thrust_create_target`. This will add
+`THRUST_HOST_SYSTEM` and `THRUST_DEVICE_SYSTEM` options to the CMake cache that
+allow selection from the systems supported by this version of Thrust.
+
+```cmake
+thrust_create_target(Thrust FROM_OPTIONS
+  [HOST_OPTION <option name>]
+  [DEVICE_OPTION <option name>]
+  [HOST_OPTION_DOC <doc string>]
+  [DEVICE_OPTION_DOC <doc string>]
+  [HOST <default host system name>]
+  [DEVICE <default device system name>]
+  [ADVANCED]
+)
+```
+
+The optional arguments have sensible defaults, but may be configured per
+`thrust_create_target` call:
+
+| Argument            | Default                 | Description                     |
+|---------------------|-------------------------|---------------------------------|
+| `HOST_OPTION`       | `THRUST_HOST_SYSTEM`    | Name of cache option for host   |
+| `DEVICE_OPTION`     | `THRUST_DEVICE_SYSTEM`  | Name of cache option for device |
+| `HOST_OPTION_DOC`   | Thrust's host system.   | Docstring for host option       |
+| `DEVICE_OPTION_DOC` | Thrust's device system. | Docstring for device option     |
+| `HOST`              | `CPP`                   | Default host system             |
+| `DEVICE`            | `CUDA`                  | Default device system           |
+| `ADVANCED`          | *N/A*                   | Mark cache options advanced     |
+
+### Specifying Thrust Version Requirements
+
+A specific version of Thrust may be required in the `find_package` call:
+
+```cmake
+find_package(Thrust 1.9.10)
+```
+
+will only consider Thrust installations with version `1.9.10.X`. An exact match
+down to the patch version can be forced by using `EXACT` matching:
+
+```cmake
+find_package(Thrust 1.9.10.1 EXACT)
+```
+
+would only match the 1.9.10.1 release.
+
+#### Using a Specific TBB or OpenMP Environment
+
+When `thrust_create_target` is called, it will lazily load the requested
+systems on-demand through internal `find_package` calls. If a project already
+uses TBB or OpenMP, it may specify a CMake target for Thrust to share instead:
+
+```cmake
+thrust_set_TBB_target(MyTBBTarget)
+thrust_set_OMP_target(MyOMPTarget)
+```
+
+These functions must be called **before** `thrust_create_target`, and will
+have no effect if the dependency is loaded as a
+`find_package(Thrust COMPONENT [...])` component.
+
+#### Testing for Systems
+
+The following functions check if a system has been found, either by lazy loading
+through `thrust_create_target` or as a `find_package` `COMPONENT` /
+`OPTIONAL_COMPONENT`:
+
+```cmake
+# Set var_name to TRUE or FALSE if an individual system has been found:
+thrust_is_cuda_system_found(<var_name>)
+thrust_is_cpp_system_found(<var_name>)
+thrust_is_tbb_system_found(<var_name>)
+thrust_is_omp_system_found(<var_name>)
+
+# Generic version that takes a component name from CUDA, CPP, TBB, OMP:
+thrust_is_system_found(<component_name> <var_name>)
+
+# Defines `THRUST_*_FOUND` variables in the current scope that reflect the
+# state of all known systems. Can be used to refresh these flags after
+# lazy system loading.
+thrust_update_system_found_flags()
+```
+
+#### Debugging
+
+Thrust will produce a detailed log describing its targets, cache options, and
+interfaces when `--log-level=VERBOSE` is passed to CMake 3.15.7 or newer:
+
+```
+$ cmake . --log-level=VERBOSE
+```
+
+This can be handy for inspecting interface and dependency information.
+
+## Fixing Legacy FindThrust.cmake
+
+A community-created `FindThrust.cmake` module exists and is necessary to find
+Thrust installations prior to Thrust 1.9.10. Its usage is discouraged whenever
+possible and the config files in this directory should be strongly preferred.
+However, projects that need to support old versions of Thrust may still need to
+use the legacy `FindThrust.cmake` with pre-1.9.10 installations.
+
+One popular flavor of this find module has a version parsing bug. Projects that
+rely on `FindThrust.cmake` should check for this and patch their copies as
+follows.
+
+Replace:
+
+```cmake
+string( REGEX MATCH "^[0-9]" major ${version} )
+string( REGEX REPLACE "^${major}00" "" version "${version}" )
+string( REGEX MATCH "^[0-9]" minor ${version} )
+string( REGEX REPLACE "^${minor}0" "" version "${version}" )
+```
+
+with:
+
+```cmake
+math(EXPR major "${version} / 100000")
+math(EXPR minor "(${version} / 100) % 1000")
+math(EXPR version "${version} % 100")
+```
+
+# Thrust Developer Documentation
+
+This portion of the file contains descriptions of Thrust's internal CMake target
+structure for Thrust developers. It should not be necessary for users
+who just want to use Thrust from their projects.
+
+## Internal Targets
+
+By default, `find_package(Thrust)` will only create a single `Thrust::Thrust`
+target that describes where the actual Thrust headers are located. It does not
+locate or create configurations for any dependencies; these are lazily loaded
+on-demand by calls to `create_thrust_target`, or when explicitly requested via
+`find_package`'s component mechanism.
+
+As mentioned, the basic Thrust interface is described by the `Thrust::Thrust`
+target.
+
+Each backend system (`CPP`, `CUDA`, `TBB`, `OMP`) is described by multiple
+targets:
+
+- `Thrust::${system}`
+  - Specifies an interface configured to build against all
+    dependencies for this backend (including `Thrust::Thrust`).
+  - For example, the `Thrust::CUDA` target is an interface
+    target that combines the interfaces of both Thrust and CUB.
+- `Thrust::${system}::Host`
+  - Configures an interface for using a specific host system.
+  - Multiple `::Host` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the host.
+- `Thrust::${system}::Device`
+  - Configures an interface for using a specific device system.
+  - Multiple `::Device` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the device.
diff --git a/thrust/thrust/cmake/thrust-config-version.cmake b/thrust/thrust/cmake/thrust-config-version.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0d7fdb943b9131a397e6c4e2d5d8222691797034
--- /dev/null
+++ b/thrust/thrust/cmake/thrust-config-version.cmake
@@ -0,0 +1,33 @@
+# Parse version information from version.h:
+file(READ "${CMAKE_CURRENT_LIST_DIR}/../version.h" THRUST_VERSION_HEADER)
+string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1})
+# Note that Thrust calls this the PATCH number, CMake calls it the TWEAK number:
+string(REGEX MATCH "#define[ \t]+THRUST_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_TWEAK ${CMAKE_MATCH_1})
+
+math(EXPR THRUST_VERSION_MAJOR "${THRUST_VERSION_FLAT} / 100000")
+math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION_FLAT} / 100) % 1000")
+math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION_FLAT} % 100") # Thrust: "subminor" CMake: "patch"
+
+# Build comparison versions:
+set(THRUST_COMPAT "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}")
+set(THRUST_EXACT "${THRUST_COMPAT}.${THRUST_VERSION_TWEAK}")
+set(FIND_COMPAT "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}")
+set(FIND_EXACT "${FIND_COMPAT}.${PACKAGE_FIND_VERSION_TWEAK}")
+
+# Set default results
+set(PACKAGE_VERSION ${THRUST_EXACT})
+set(PACKAGE_VERSION_UNSUITABLE FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_EXACT FALSE)
+
+# Test for compatibility (ignores tweak)
+if (FIND_COMPAT VERSION_EQUAL THRUST_COMPAT)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+endif()
+
+# Test for exact (does not ignore tweak)
+if (FIND_EXACT VERSION_EQUAL THRUST_EXACT)
+  set(PACKAGE_VERSION_EXACT TRUE)
+endif()
diff --git a/thrust/thrust/cmake/thrust-config.cmake b/thrust/thrust/cmake/thrust-config.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..467579d1d0b8e273b562ff7f6eb01fc31c901d10
--- /dev/null
+++ b/thrust/thrust/cmake/thrust-config.cmake
@@ -0,0 +1,652 @@
+#
+# find_package(Thrust) config file.
+#
+# Provided by NVIDIA under the same license as the associated Thrust library.
+#
+# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
+#
+# *****************************************************************************
+# **     The following is a short reference to using Thrust from CMake.      **
+# ** For more details, see the README.md in the same directory as this file. **
+# *****************************************************************************
+#
+# # General Usage:
+# find_package(Thrust REQUIRED CONFIG)
+# thrust_create_target(Thrust [options])
+# target_link_libraries(some_project_lib Thrust)
+#
+# # Create default target with: HOST=CPP DEVICE=CUDA
+# thrust_create_target(TargetName)
+#
+# # Create target with: HOST=CPP DEVICE=TBB
+# thrust_create_target(TargetName DEVICE TBB)
+#
+# # Create target with: HOST=TBB DEVICE=OMP
+# thrust_create_target(TargetName HOST TBB DEVICE OMP)
+#
+# # Create CMake cache options THRUST_[HOST|DEVICE]_SYSTEM and configure a
+# # target from them. This allows these systems to be changed by developers at
+# # configure time, per build.
+# thrust_create_target(TargetName FROM_OPTIONS
+#   [HOST_OPTION <option_name>]      # Optionally rename the host system option
+#   [DEVICE_OPTION <option_name>]    # Optionally rename the device system option
+#   [HOST_OPTION_DOC <doc_string>]   # Optionally change the cache label
+#   [DEVICE_OPTION_DOC <doc_string>] # Optionally change the cache label
+#   [HOST <default system>]          # Optionally change the default backend
+#   [DEVICE <default system>]        # Optionally change the default backend
+#   [ADVANCED]                       # Optionally mark options as advanced
+# )
+#
+# # Use a custom TBB, CUB, and/or OMP
+# # (Note that once set, these cannot be changed. This includes COMPONENT
+# # preloading and lazy lookups in thrust_create_target)
+# find_package(Thrust REQUIRED)
+# thrust_set_CUB_target(MyCUBTarget)  # MyXXXTarget contains an existing
+# thrust_set_TBB_target(MyTBBTarget)  # interface to XXX for Thrust to use.
+# thrust_set_OMP_target(MyOMPTarget)
+# thrust_create_target(ThrustWithMyCUB DEVICE CUDA)
+# thrust_create_target(ThrustWithMyTBB DEVICE TBB)
+# thrust_create_target(ThrustWithMyOMP DEVICE OMP)
+#
+# # Create target with HOST=CPP DEVICE=CUDA and some advanced flags set
+# thrust_create_target(TargetName
+#   IGNORE_DEPRECATED_CPP_DIALECT # Silence build warnings about deprecated compilers and C++ standards
+#   IGNORE_DEPRECATED_CPP_11      # Only silence deprecation warnings for C++11
+#   IGNORE_DEPRECATED_COMPILER    # Only silence deprecation warnings for old compilers
+#   IGNORE_CUB_VERSION            # Skip configure-time and compile-time CUB version checks
+# )
+#
+# # Test if a particular system has been loaded. ${var_name} is set to TRUE or
+# # FALSE to indicate if "system" is found.
+# thrust_is_system_found(<system> <var_name>)
+# thrust_is_cuda_system_found(<var_name>)
+# thrust_is_tbb_system_found(<var_name>)
+# thrust_is_omp_system_found(<var_name>)
+# thrust_is_cpp_system_found(<var_name>)
+#
+# # Define / update THRUST_${system}_FOUND flags in current scope
+# thrust_update_system_found_flags()
+#
+# # View verbose log with target and dependency information:
+# $ cmake . --log-level=VERBOSE (CMake 3.15.7 and above)
+#
+# # Print debugging output to status channel:
+# thrust_debug_internal_targets()
+# thrust_debug_target(TargetName "${THRUST_VERSION}")
+
+cmake_minimum_required(VERSION 3.15)
+
+################################################################################
+# User variables and APIs. Users can rely on these:
+#
+
+# Advertise system options:
+set(THRUST_HOST_SYSTEM_OPTIONS
+  CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust host systems."
+)
+set(THRUST_DEVICE_SYSTEM_OPTIONS
+  CUDA CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust device systems"
+)
+
+# Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+set(THRUST_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "")
+set(THRUST_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "")
+set(THRUST_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "")
+set(THRUST_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "")
+set(THRUST_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "")
+set(THRUST_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "")
+
+function(thrust_create_target target_name)
+  thrust_debug("Assembling target ${target_name}. Options: ${ARGN}" internal)
+  set(options
+    ADVANCED
+    FROM_OPTIONS
+    IGNORE_CUB_VERSION_CHECK
+    IGNORE_DEPRECATED_COMPILER
+    IGNORE_DEPRECATED_CPP_11
+    IGNORE_DEPRECATED_CPP_DIALECT
+    )
+  set(keys
+    DEVICE
+    DEVICE_OPTION
+    DEVICE_OPTION_DOC
+    HOST
+    HOST_OPTION
+    HOST_OPTION_DOC
+    )
+  cmake_parse_arguments(TCT "${options}" "${keys}" "" ${ARGN})
+  if (TCT_UNPARSED_ARGUMENTS)
+    message(AUTHOR_WARNING
+      "Unrecognized arguments passed to thrust_create_target: "
+      ${TCT_UNPARSED_ARGUMENTS}
+      )
+  endif()
+
+  # Check that the main Thrust internal target is available
+  # (functions have global scope, targets have directory scope, so this
+  # might happen)
+  if (NOT TARGET Thrust::Thrust)
+    message(AUTHOR_WARNING
+      "The `thrust_create_target` function was called outside the scope of the "
+      "thrust targets. Call find_package again to recreate targets."
+      )
+  endif()
+
+  _thrust_set_if_undefined(TCT_HOST CPP)
+  _thrust_set_if_undefined(TCT_DEVICE CUDA)
+  _thrust_set_if_undefined(TCT_HOST_OPTION THRUST_HOST_SYSTEM)
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION THRUST_DEVICE_SYSTEM)
+  _thrust_set_if_undefined(TCT_HOST_OPTION_DOC "Thrust host system.")
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION_DOC "Thrust device system.")
+
+  if (NOT TCT_HOST IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested HOST=${TCT_HOST}; must be one of ${THRUST_HOST_SYSTEM_OPTIONS}")
+  endif()
+
+  if (NOT TCT_DEVICE IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested DEVICE=${TCT_DEVICE}; must be one of ${THRUST_DEVICE_SYSTEM_OPTIONS}")
+  endif()
+
+  if (TCT_FROM_OPTIONS)
+    _thrust_create_cache_options(
+      ${TCT_HOST} ${TCT_DEVICE}
+      ${TCT_HOST_OPTION} ${TCT_DEVICE_OPTION}
+      ${TCT_HOST_OPTION_DOC} ${TCT_DEVICE_OPTION_DOC}
+      ${TCT_ADVANCED}
+    )
+    set(TCT_HOST ${${TCT_HOST_OPTION}})
+    set(TCT_DEVICE ${${TCT_DEVICE_OPTION}})
+    thrust_debug("Current option settings:" internal)
+    thrust_debug("  - ${TCT_HOST_OPTION}=${TCT_HOST}" internal)
+    thrust_debug("  - ${TCT_DEVICE_OPTION}=${TCT_DEVICE}" internal)
+  endif()
+
+  _thrust_find_backend(${TCT_HOST} REQUIRED)
+  _thrust_find_backend(${TCT_DEVICE} REQUIRED)
+
+  # We can just create an INTERFACE IMPORTED target here instead of going
+  # through _thrust_declare_interface_alias as long as we aren't hanging any
+  # Thrust/CUB include paths on ${target_name}.
+  add_library(${target_name} INTERFACE IMPORTED)
+  target_link_libraries(${target_name}
+    INTERFACE
+    Thrust::${TCT_HOST}::Host
+    Thrust::${TCT_DEVICE}::Device
+  )
+
+  # This would be nice to enforce, but breaks when using old cmake + new
+  # compiler, since cmake doesn't know what features the new compiler version
+  # supports.
+  # Leaving this here as a reminder not to add it back. Just let the
+  # compile-time checks in thrust/detail/config/cpp_dialect.h handle it.
+  #
+  #  if (NOT TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+  #    if (TCT_IGNORE_DEPRECATED_CPP_11)
+  #      target_compile_features(${target_name} INTERFACE cxx_std_11)
+  #    else()
+  #      target_compile_features(${target_name} INTERFACE cxx_std_14)
+  #    endif()
+  #  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_DIALECT")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_11)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_11")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_COMPILER)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_COMPILER")
+  endif()
+
+  if (TCT_IGNORE_CUB_VERSION_CHECK)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_CUB_VERSION_CHECK")
+  else()
+    if (("${TCT_HOST}" STREQUAL "CUDA" OR "${TCT_DEVICE}" STREQUAL "CUDA") AND
+    (NOT THRUST_VERSION VERSION_EQUAL THRUST_CUB_VERSION))
+      message(FATAL_ERROR
+        "The version of CUB found by CMake is not compatible with this release of Thrust. "
+        "CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. "
+        "Pass IGNORE_CUB_VERSION_CHECK to thrust_create_target to ignore. "
+        "(CUB ${THRUST_CUB_VERSION}, Thrust ${THRUST_VERSION})."
+        )
+    endif()
+  endif()
+
+  thrust_debug_target(${target_name} "Thrust ${THRUST_VERSION}"  internal)
+endfunction()
+
+function(thrust_is_system_found system var_name)
+  if (TARGET Thrust::${system})
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(thrust_is_cpp_system_found var_name)
+  thrust_is_system_found(CPP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_cuda_system_found var_name)
+  thrust_is_system_found(CUDA ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_tbb_system_found var_name)
+  thrust_is_system_found(TBB ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_omp_system_found var_name)
+  thrust_is_system_found(OMP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+# Since components are loaded lazily, this will refresh the
+# THRUST_${component}_FOUND flags in the current scope.
+# Alternatively, check system states individually using the
+# thrust_is_system_found functions.
+macro(thrust_update_system_found_flags)
+  set(THRUST_FOUND TRUE)
+  thrust_is_system_found(CPP  THRUST_CPP_FOUND)
+  thrust_is_system_found(CUDA THRUST_CUDA_FOUND)
+  thrust_is_system_found(TBB  THRUST_TBB_FOUND)
+  thrust_is_system_found(OMP  THRUST_OMP_FOUND)
+endmacro()
+
+function(thrust_debug msg)
+  # Use the VERBOSE channel when called internally
+  # Run `cmake . --log-level=VERBOSE` to view.
+  if ("${ARGN}" STREQUAL "internal")
+    # If CMake is too old to know about the VERBOSE channel, just be silent.
+    # Users reproduce much the same output on the STATUS channel by using:
+    # thrust_create_target(Thrust [...])
+    # thrust_debug_internal_targets()
+    # thrust_debug_target(Thrust)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.15.7")
+      set(channel VERBOSE)
+    else()
+      return()
+    endif()
+  else()
+    set(channel STATUS)
+  endif()
+
+  message(${channel} "Thrust: ${msg}")
+endfunction()
+
+# Print details of the specified target.
+function(thrust_debug_target target_name version)
+  if (NOT TARGET ${target_name})
+    return()
+  endif()
+
+  set(is_internal "${ARGN}")
+
+  if (version)
+    set(version "(${version})")
+  endif()
+
+  thrust_debug("TargetInfo: ${target_name}: ${version}" ${is_internal})
+
+  function(_thrust_print_prop_if_set target_name prop)
+    get_target_property(value ${target_name} ${prop})
+    if (value)
+      thrust_debug("TargetInfo: ${target_name} > ${prop}: ${value}" ${is_internal})
+    endif()
+  endfunction()
+
+  function(_thrust_print_imported_prop_if_set target_name prop)
+    get_target_property(imported ${target_name} IMPORTED)
+    get_target_property(type ${target_name} TYPE)
+    if (imported AND NOT ${type} STREQUAL "INTERFACE_LIBRARY")
+      _thrust_print_prop_if_set(${target_name} ${prop})
+    endif()
+  endfunction()
+
+  _thrust_print_prop_if_set(${target_name} ALIASED_TARGET)
+  _thrust_print_prop_if_set(${target_name} IMPORTED)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_DEFINITIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_FEATURES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DEPENDS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_LIBRARIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_HOST)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_DEVICE)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_DEBUG)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_RELEASE)
+endfunction()
+
+function(thrust_debug_internal_targets)
+  function(_thrust_debug_backend_targets backend version)
+    thrust_debug_target(Thrust::${backend} "${version}")
+    thrust_debug_target(Thrust::${backend}::Host "${version}")
+    thrust_debug_target(Thrust::${backend}::Device "${version}")
+  endfunction()
+
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(CPP "Thrust ${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(CUDA "CUB ${THRUST_CUB_VERSION}")
+  thrust_debug_target(CUB::CUB "${THRUST_CUB_VERSION}")
+
+  _thrust_debug_backend_targets(TBB "${THRUST_TBB_VERSION}")
+  thrust_debug_target(TBB:tbb "${THRUST_TBB_VERSION}")
+
+  _thrust_debug_backend_targets(OMP "${THRUST_OMP_VERSION}")
+  thrust_debug_target(OpenMP::OpenMP_CXX "${THRUST_OMP_VERSION}")
+endfunction()
+
+################################################################################
+# Internal utilities. Subject to change.
+#
+
+function(_thrust_set_if_undefined var)
+  if (NOT DEFINED ${var})
+    set(${var} ${ARGN} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_declare_interface_alias alias_name ugly_name)
+  # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
+  # 2) When an IMPORTED library is linked to another target, its include
+  #    directories are treated as SYSTEM includes.
+  # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
+  #    system includes. This means that the Toolkit Thrust will *always* be used
+  #    during compilation, and the include paths of an IMPORTED Thrust::Thrust
+  #    target will never have any effect.
+  # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
+  #    on EVERY target that links to Thrust::Thrust. This would be a burden and a
+  #    footgun for our users. Forgetting this would silently pull in the wrong thrust!
+  # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
+  #    configure it, and then ALIAS it into the namespace (or ALIAS and then
+  #    configure, that seems to work too).
+  add_library(${ugly_name} INTERFACE)
+  add_library(${alias_name} ALIAS ${ugly_name})
+endfunction()
+
+# Create cache options for selecting the user/device systems with ccmake/cmake-gui.
+function(_thrust_create_cache_options host device host_option device_option host_doc device_doc advanced)
+  thrust_debug("Creating system cache options: (advanced=${advanced})" internal)
+  thrust_debug("  - Host Option=${host_option} Default=${host} Doc='${host_doc}'" internal)
+  thrust_debug("  - Device Option=${device_option} Default=${device} Doc='${device_doc}'" internal)
+  set(${host_option} ${host} CACHE STRING "${host_doc}")
+  set_property(CACHE ${host_option} PROPERTY STRINGS ${THRUST_HOST_SYSTEM_OPTIONS})
+  set(${device_option} ${device} CACHE STRING "${device_doc}")
+  set_property(CACHE ${device_option} PROPERTY STRINGS ${THRUST_DEVICE_SYSTEM_OPTIONS})
+  if (advanced)
+    mark_as_advanced(${host_option} ${device_option})
+  endif()
+endfunction()
+
+# Create Thrust::${backend}::Host and Thrust::${backend}::Device targets.
+# Assumes that `Thrust::${backend}` and `_Thrust_${backend}` have been created
+# by _thrust_declare_interface_alias and configured to bring in system
+# dependency interfaces (including Thrust::Thrust).
+function(_thrust_setup_system backend)
+  set(backend_target_alias "Thrust::${backend}")
+
+  if (backend IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    set(host_target "_Thrust_${backend}_Host")
+    set(host_target_alias "Thrust::${backend}::Host")
+    if (NOT TARGET ${host_target_alias})
+      _thrust_declare_interface_alias(${host_target_alias} ${host_target})
+      target_compile_definitions(${host_target} INTERFACE
+        "THRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_${backend}")
+      target_link_libraries(${host_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${host_target} PROPERTY INTERFACE_THRUST_HOST ${backend})
+      set_property(TARGET ${host_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_HOST)
+      thrust_debug_target(${host_target_alias} "" internal)
+    endif()
+  endif()
+
+  if (backend IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    set(device_target "_Thrust_${backend}_Device")
+    set(device_target_alias "Thrust::${backend}::Device")
+    if (NOT TARGET ${device_target_alias})
+      _thrust_declare_interface_alias(${device_target_alias} ${device_target})
+      target_compile_definitions(${device_target} INTERFACE
+        "THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${backend}")
+      target_link_libraries(${device_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${device_target} PROPERTY INTERFACE_THRUST_DEVICE ${backend})
+      set_property(TARGET ${device_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_DEVICE)
+      thrust_debug_target(${device_target_alias} "" internal)
+    endif()
+  endif()
+endfunction()
+
+# Use the provided cub_target for the CUDA backend. If Thrust::CUDA already
+# exists, this call has no effect.
+function(thrust_set_CUB_target cub_target)
+  if (NOT TARGET Thrust::CUDA)
+    thrust_debug("Setting CUB target to ${cub_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_CUB_VERSION ${CUB_VERSION} CACHE INTERNAL "CUB version used by Thrust")
+    _thrust_declare_interface_alias(Thrust::CUDA _Thrust_CUDA)
+    target_link_libraries(_Thrust_CUDA INTERFACE Thrust::Thrust ${cub_target})
+    thrust_debug_target(${cub_target} "${THRUST_CUB_VERSION}" internal)
+    thrust_debug_target(Thrust::CUDA "CUB ${THRUST_CUB_VERSION}" internal)
+    _thrust_setup_system(CUDA)
+  endif()
+endfunction()
+
+# Use the provided tbb_target for the TBB backend. If Thrust::TBB already
+# exists, this call has no effect.
+function(thrust_set_TBB_target tbb_target)
+  if (NOT TARGET Thrust::TBB)
+    thrust_debug("Setting TBB target to ${tbb_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_TBB_VERSION ${TBB_VERSION} CACHE INTERNAL "TBB version used by Thrust")
+    _thrust_declare_interface_alias(Thrust::TBB _Thrust_TBB)
+    target_link_libraries(_Thrust_TBB INTERFACE Thrust::Thrust ${tbb_target})
+    thrust_debug_target(${tbb_target} "${THRUST_TBB_VERSION}" internal)
+    thrust_debug_target(Thrust::TBB "${THRUST_TBB_VERSION}" internal)
+    _thrust_setup_system(TBB)
+  endif()
+endfunction()
+
+# Use the provided omp_target for the OMP backend. If Thrust::OMP already
+# exists, this call has no effect.
+function(thrust_set_OMP_target omp_target)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Setting OMP target to ${omp_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_OMP_VERSION ${OpenMP_CXX_VERSION} CACHE INTERNAL "OpenMP version used by Thrust")
+    _thrust_declare_interface_alias(Thrust::OMP _Thrust_OMP)
+    target_link_libraries(_Thrust_OMP INTERFACE Thrust::Thrust ${omp_target})
+    thrust_debug_target(${omp_target} "${THRUST_OMP_VERSION}" internal)
+    thrust_debug_target(Thrust::OMP "${THRUST_OMP_VERSION}" internal)
+    _thrust_setup_system(OMP)
+  endif()
+endfunction()
+
+function(_thrust_find_CPP required)
+  if (NOT TARGET Thrust::CPP)
+    thrust_debug("Generating CPP targets." internal)
+    _thrust_declare_interface_alias(Thrust::CPP _Thrust_CPP)
+    target_link_libraries(_Thrust_CPP INTERFACE Thrust::Thrust)
+    thrust_debug_target(Thrust::CPP "Thrust ${THRUST_VERSION}" internal)
+    _thrust_setup_system(CPP)
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_CUDA required)
+  if (NOT TARGET Thrust::CUDA)
+    thrust_debug("Searching for CUB ${required}" internal)
+    find_package(CUB CONFIG
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+      NO_DEFAULT_PATH # Only check the explicit HINTS below:
+      HINTS
+        "${_THRUST_INCLUDE_DIR}/dependencies/cub" # Source layout
+        "${_THRUST_INCLUDE_DIR}"                  # Install layout
+    )
+
+    if (TARGET CUB::CUB)
+      thrust_set_CUB_target(CUB::CUB)
+    else()
+      thrust_debug("CUB not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like TBB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_TBB required)
+  if(NOT TARGET Thrust::TBB)
+    thrust_debug("Searching for TBB ${required}" internal)
+    # Swap in a temporary module path to make sure we use our FindTBB.cmake
+    set(_THRUST_STASH_MODULE_PATH "${CMAKE_MODULE_PATH}")
+    set(CMAKE_MODULE_PATH "${_THRUST_CMAKE_DIR}")
+
+    # Push policy CMP0074 to silence warnings about TBB_ROOT being set. This
+    # var is used unconventionally in this FindTBB.cmake module.
+    # Someday we'll have a suitable TBB cmake configuration and can avoid this.
+    cmake_policy(PUSH)
+    cmake_policy(SET CMP0074 OLD)
+    set(THRUST_TBB_ROOT "" CACHE PATH "Path to the root of the TBB installation.")
+    if (TBB_ROOT AND NOT THRUST_TBB_ROOT)
+      message(
+        "Warning: TBB_ROOT is set. "
+        "Thrust uses THRUST_TBB_ROOT to avoid issues with CMake Policy CMP0074. "
+        "Please set this variable instead when using Thrust with TBB."
+      )
+    endif()
+    set(TBB_ROOT "${THRUST_TBB_ROOT}")
+    set(_THRUST_STASH_TBB_ROOT "${TBB_ROOT}")
+
+    find_package(TBB
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+    )
+
+    cmake_policy(POP)
+    set(TBB_ROOT "${_THRUST_STASH_TBB_ROOT}")
+    set(CMAKE_MODULE_PATH "${_THRUST_STASH_MODULE_PATH}")
+
+    if (TARGET TBB::tbb)
+      thrust_set_TBB_target(TBB::tbb)
+    else()
+      thrust_debug("TBB not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# Wrap the OpenMP flags for CUDA targets
+function(thrust_fixup_omp_target omp_target)
+  get_target_property(opts ${omp_target} INTERFACE_COMPILE_OPTIONS)
+  if (opts MATCHES "\\$<\\$<COMPILE_LANGUAGE:CXX>:([^>]*)>")
+    target_compile_options(${omp_target} INTERFACE
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${CMAKE_MATCH_1}>
+    )
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like OpenMP_CXX_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_OMP required)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Searching for OMP ${required}" internal)
+    find_package(OpenMP
+      ${_THRUST_QUIET_FLAG}
+      ${_THRUST_REQUIRED_FLAG_OMP}
+      COMPONENTS CXX
+    )
+
+    if (TARGET OpenMP::OpenMP_CXX)
+      thrust_fixup_omp_target(OpenMP::OpenMP_CXX)
+      thrust_set_OMP_target(OpenMP::OpenMP_CXX)
+    else()
+      thrust_debug("OpenMP::OpenMP_CXX not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_backend backend required)
+  # Unfortunately, _thrust_find_${backend}(req) is not valid CMake syntax. Hence
+  # why this function exists.
+  if ("${backend}" STREQUAL "CPP")
+    _thrust_find_CPP("${required}")
+  elseif ("${backend}" STREQUAL "CUDA")
+    _thrust_find_CUDA("${required}")
+  elseif ("${backend}" STREQUAL "TBB")
+    _thrust_find_TBB("${required}")
+  elseif ("${backend}" STREQUAL "OMP")
+    _thrust_find_OMP("${required}")
+  else()
+    message(FATAL_ERROR "_thrust_find_backend: Invalid system: ${backend}")
+  endif()
+endmacro()
+
+################################################################################
+# Initialization. Executed inside find_package(Thrust) call.
+#
+
+if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
+  set(_THRUST_QUIET ON CACHE INTERNAL "Quiet mode enabled for Thrust find_package calls.")
+  set(_THRUST_QUIET_FLAG "QUIET" CACHE INTERNAL "")
+else()
+  unset(_THRUST_QUIET CACHE)
+  unset(_THRUST_QUIET_FLAG CACHE)
+endif()
+
+set(_THRUST_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" CACHE INTERNAL "Location of thrust-config.cmake")
+
+# Internal target that actually holds the Thrust interface. Used by all other Thrust targets.
+if (NOT TARGET Thrust::Thrust)
+  _thrust_declare_interface_alias(Thrust::Thrust _Thrust_Thrust)
+  # Strip out the 'thrust/cmake/' from '[thrust_include_path]/thrust/cmake/':
+  get_filename_component(_THRUST_INCLUDE_DIR "../.." ABSOLUTE BASE_DIR "${_THRUST_CMAKE_DIR}")
+  set(_THRUST_INCLUDE_DIR "${_THRUST_INCLUDE_DIR}"
+    CACHE INTERNAL "Location of thrust headers."
+  )
+  target_include_directories(_Thrust_Thrust INTERFACE "${_THRUST_INCLUDE_DIR}")
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}" internal)
+endif()
+
+# Handle find_package COMPONENT requests:
+foreach(component ${${CMAKE_FIND_PACKAGE_NAME}_FIND_COMPONENTS})
+  if (NOT component IN_LIST THRUST_HOST_SYSTEM_OPTIONS AND
+      NOT component IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR "Invalid component requested: '${component}'")
+  endif()
+
+  unset(req)
+  if (${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED_${component})
+    set(req "REQUIRED")
+  endif()
+
+  thrust_debug("Preloading COMPONENT '${component}' ${req}" internal)
+  _thrust_find_backend(${component} "${req}")
+endforeach()
+
+thrust_update_system_found_flags()
diff --git a/thrust/thrust/complex.h b/thrust/thrust/complex.h
new file mode 100644
index 0000000000000000000000000000000000000000..badacb467976415e8f1e922d31c35027aeabf02b
--- /dev/null
+++ b/thrust/thrust/complex.h
@@ -0,0 +1,1042 @@
+/*
+ *  Copyright 2008-2019 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file complex.h
+ *  \brief Complex numbers
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <cmath>
+#include <complex>
+#include <sstream>
+#include <thrust/detail/type_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  define THRUST_STD_COMPLEX_REAL(z) \
+    reinterpret_cast< \
+      const typename thrust::detail::remove_reference<decltype(z)>::type::value_type (&)[2] \
+    >(z)[0]
+#  define THRUST_STD_COMPLEX_IMAG(z) \
+    reinterpret_cast< \
+      const typename thrust::detail::remove_reference<decltype(z)>::type::value_type (&)[2] \
+    >(z)[1]
+#  define THRUST_STD_COMPLEX_DEVICE __device__
+#else
+#  define THRUST_STD_COMPLEX_REAL(z) (z).real()
+#  define THRUST_STD_COMPLEX_IMAG(z) (z).imag()
+#  define THRUST_STD_COMPLEX_DEVICE
+#endif
+
+namespace thrust
+{
+
+/*
+ *  Calls to the standard math library from inside the thrust namespace
+ *  with real arguments require explicit scope otherwise they will fail
+ *  to resolve as it will find the equivalent complex function but then
+ *  fail to match the template, and give up looking for other scopes.
+ */
+
+
+/*! \addtogroup numerics
+ *  \{
+ */
+
+/*! \addtogroup complex_numbers Complex Numbers
+ *  \{
+ */
+
+namespace detail
+{
+  
+template <typename T, std::size_t Align>
+struct complex_storage;
+
+#if THRUST_CPP_DIALECT >= 2011                                                    \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                       \
+  && (THRUST_GCC_VERSION >= 40800)
+  // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    struct alignas(Align) type { T x; T y; };
+  };
+#elif  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)                    \
+    || (   (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                 \
+        && (THRUST_GCC_VERSION < 40600))
+  // C++03 implementation for MSVC and GCC <= 4.5.
+  // 
+  // We have to implement `aligned_type` with specializations for MSVC
+  // and GCC 4.2 and older because they require literals as arguments to 
+  // their alignment attribute.
+
+  #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+    // MSVC implementation.
+    #define THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(X)                   \
+      template <typename T>                                                   \
+      struct complex_storage<T, X>                                            \
+      {                                                                       \
+        __declspec(align(X)) struct type { T x; T y; };                       \
+      };                                                                      \
+      /**/
+  #else
+    // GCC <= 4.2 implementation.
+    #define THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(X)                   \
+      template <typename T>                                                   \
+      struct complex_storage<T, X>                                            \
+      {                                                                       \
+        struct type { T x; T y; } __attribute__((aligned(X)));                \
+      };                                                                      \
+      /**/
+  #endif
+
+  // The primary template is a fallback, which doesn't specify any alignment.
+  // It's only used when T is very large and we're using an older compilers
+  // which we have to fully specialize each alignment case.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    T x; T y;
+  };
+  
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(1);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(2);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(4);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(8);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(16);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(32);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(64);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(128);
+
+  #undef THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION
+#else
+  // C++03 implementation for GCC > 4.5, Clang, PGI, ICPC, and xlC.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    struct type { T x; T y; } __attribute__((aligned(Align)));
+  };
+#endif
+
+} // end namespace detail
+
+  /*! \p complex is the Thrust equivalent to <tt>std::complex</tt>. It is
+   *  functionally identical to it, but can also be used in device code which
+   *  <tt>std::complex</tt> currently cannot.
+   *
+   *  \tparam T The type used to hold the real and imaginary parts. Should be
+   *  <tt>float</tt> or <tt>double</tt>. Others types are not supported.
+   *
+   */
+template <typename T>
+struct complex
+{
+public:
+
+  /*! \p value_type is the type of \p complex's real and imaginary parts.
+   */
+  typedef T value_type;
+
+
+
+  /* --- Constructors --- */
+
+  /*! Construct a complex number with an imaginary part of 0.
+   *
+   *  \param re The real part of the number.
+   */
+  __host__ __device__
+  complex(const T& re);
+
+  /*! Construct a complex number from its real and imaginary parts.
+   *
+   *  \param re The real part of the number.
+   *  \param im The imaginary part of the number.
+   */
+  __host__ __device__
+  complex(const T& re, const T& im);
+
+#if THRUST_CPP_DIALECT >= 2011
+  /*! Default construct a complex number.
+   */
+  complex() = default;
+
+  /*! This copy constructor copies from a \p complex with a type that is
+   *  convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  complex(const complex<T>& z) = default;
+#else
+  /*! Default construct a complex number.
+   */
+  __host__ __device__
+  complex();
+
+  /*! This copy constructor copies from a \p complex with a type that is
+   *  convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ __device__
+  complex(const complex<T>& z);
+#endif
+
+  /*! This converting copy constructor copies from a \p complex with a type
+   *  that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex(const complex<U>& z);
+
+  /*! This converting copy constructor copies from a <tt>std::complex</tt> with
+   *  a type that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex(const std::complex<T>& z);
+
+  /*! This converting copy constructor copies from a <tt>std::complex</tt> with
+   *  a type that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex(const std::complex<U>& z);
+
+
+
+  /* --- Assignment Operators --- */
+
+  /*! Assign `re` to the real part of this \p complex and set the imaginary part
+   *  to 0.
+   *
+   *  \param re The real part of the number.
+   */
+  __host__ __device__
+  complex& operator=(const T& re);
+
+#if THRUST_CPP_DIALECT >= 2011
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  complex& operator=(const complex<T>& z) = default;
+#else
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ __device__
+  complex& operator=(const complex<T>& z);
+#endif
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex& operator=(const complex<U>& z);
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex& operator=(const std::complex<T>& z);
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex& operator=(const std::complex<U>& z);
+
+
+  /* --- Compound Assignment Operators --- */
+
+  /*! Adds a \p complex to this \p complex and assigns the result to this
+   *  \p complex.
+   *
+   *  \param z The \p complex to be added.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator+=(const complex<U>& z);
+
+  /*! Subtracts a \p complex from this \p complex and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The \p complex to be subtracted.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator-=(const complex<U>& z);
+
+  /*! Multiplies this \p complex by another \p complex and assigns the result
+   *  to this \p complex.
+   *
+   *  \param z The \p complex to be multiplied.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator*=(const complex<U>& z);
+
+  /*! Divides this \p complex by another \p complex and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The \p complex to be divided.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator/=(const complex<U>& z);
+
+  /*! Adds a scalar to this \p complex and assigns the result to this
+   *  \p complex.
+   *
+   *  \param z The \p complex to be added.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator+=(const U& z);
+
+  /*! Subtracts a scalar from this \p complex and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The scalar to be subtracted.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator-=(const U& z);
+
+  /*! Multiplies this \p complex by a scalar and assigns the result
+   *  to this \p complex.
+   *
+   *  \param z The scalar to be multiplied.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator*=(const U& z);
+
+  /*! Divides this \p complex by a scalar and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The scalar to be divided.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator/=(const U& z);
+
+
+
+  /* --- Getter functions ---
+   * The volatile ones are there to help for example
+   * with certain reductions optimizations
+   */
+
+  /*! Returns the real part of this \p complex.
+   */
+  __host__ __device__
+  T real() const volatile { return data.x; }
+
+  /*! Returns the imaginary part of this \p complex.
+   */
+  __host__ __device__
+  T imag() const volatile { return data.y; }
+
+  /*! Returns the real part of this \p complex.
+   */
+  __host__ __device__
+  T real() const { return data.x; }
+
+  /*! Returns the imaginary part of this \p complex.
+   */
+  __host__ __device__
+  T imag() const { return data.y; }
+
+
+
+  /* --- Setter functions ---
+   * The volatile ones are there to help for example
+   * with certain reductions optimizations
+   */
+
+  /*! Sets the real part of this \p complex.
+   *
+   *  \param re The new real part of this \p complex.
+   */
+  __host__ __device__
+  void real(T re) volatile { data.x = re; }
+
+  /*! Sets the imaginary part of this \p complex.
+   *
+   *  \param im The new imaginary part of this \p complex.e
+   */
+  __host__ __device__
+  void imag(T im) volatile { data.y = im; }
+
+  /*! Sets the real part of this \p complex.
+   *
+   *  \param re The new real part of this \p complex.
+   */
+  __host__ __device__
+  void real(T re) { data.x = re; }
+
+  /*! Sets the imaginary part of this \p complex.
+   *
+   *  \param im The new imaginary part of this \p complex.
+   */
+  __host__ __device__
+  void imag(T im) { data.y = im; }
+
+
+
+  /* --- Casting functions --- */
+
+  /*! Casts this \p complex to a <tt>std::complex</tt> of the same type.
+   */
+  __host__
+  operator std::complex<T>() const { return std::complex<T>(real(), imag()); }
+
+private:
+  typename detail::complex_storage<T, sizeof(T) * 2>::type data;
+};
+
+
+/* --- General Functions --- */
+
+/*! Returns the magnitude (also known as absolute value) of a \p complex.
+ *
+ *  \param z The \p complex from which to calculate the absolute value.
+ */
+template<typename T>
+__host__ __device__
+T abs(const complex<T>& z);
+
+/*! Returns the phase angle (also known as argument) in radians of a \p complex.
+ *
+ *  \param z The \p complex from which to calculate the phase angle.
+ */
+template <typename T>
+__host__ __device__
+T arg(const complex<T>& z);
+
+/*! Returns the square of the magnitude of a \p complex.
+ *
+ *  \param z The \p complex from which to calculate the norm.
+ */
+template <typename T>
+__host__ __device__
+T norm(const complex<T>& z);
+
+/*! Returns the complex conjugate of a \p complex.
+ *
+ *  \param z The \p complex from which to calculate the complex conjugate.
+ */
+template <typename T>
+__host__ __device__
+complex<T> conj(const complex<T>& z);
+
+/*! Returns a \p complex with the specified magnitude and phase.
+ *
+ *  \param m The magnitude of the returned \p complex.
+ *  \param theta The phase of the returned \p complex in radians.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+polar(const T0& m, const T1& theta = T1());
+
+/*! Returns the projection of a \p complex on the Riemann sphere.
+ *  For all finite \p complex it returns the argument. For \p complexs
+ *  with a non finite part returns (INFINITY,+/-0) where the sign of
+ *  the zero matches the sign of the imaginary part of the argument.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> proj(const T& z);
+
+
+
+/* --- Binary Arithmetic operators --- */
+
+/*! Adds two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const complex<T1>& y);
+
+/*! Adds a scalar to a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const T1& y);
+
+/*! Adds a \p complex number to a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The scalar.
+ *  \param y The \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const T0& x, const complex<T1>& y);
+
+/*! Subtracts two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The first \p complex (minuend).
+ *  \param y The second \p complex (subtrahend).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const complex<T1>& y);
+
+/*! Subtracts a scalar from a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The \p complex (minuend).
+ *  \param y The scalar (subtrahend).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const T1& y);
+
+/*! Subtracts a \p complex number from a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The scalar (minuend).
+ *  \param y The \p complex (subtrahend).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const T0& x, const complex<T1>& y);
+
+/*! Multiplies two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const complex<T1>& y);
+
+/*! Multiplies a \p complex number by a scalar.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const T1& y);
+
+/*! Multiplies a scalar by a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The scalar.
+ *  \param y The \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const T0& x, const complex<T1>& y);
+
+/*! Divides two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The numerator (dividend).
+ *  \param y The denomimator (divisor).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const complex<T1>& y);
+
+/*! Divides a \p complex number by a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The complex numerator (dividend).
+ *  \param y The scalar denomimator (divisor).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const T1& y);
+
+/*! Divides a scalar by a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The scalar numerator (dividend).
+ *  \param y The complex denomimator (divisor).
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const T0& x, const complex<T1>& y);
+
+
+
+/* --- Unary Arithmetic operators --- */
+
+/*! Unary plus, returns its \p complex argument.
+ *
+ *  \param y The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T>
+operator+(const complex<T>& y);
+
+/*! Unary minus, returns the additive inverse (negation) of its \p complex
+ * argument.
+ *
+ *  \param y The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T>
+operator-(const complex<T>& y);
+
+
+
+/* --- Exponential Functions --- */
+
+/*! Returns the complex exponential of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> exp(const complex<T>& z);
+
+/*! Returns the complex natural logarithm of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> log(const complex<T>& z);
+
+/*! Returns the complex base 10 logarithm of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> log10(const complex<T>& z);
+
+
+
+/* --- Power Functions --- */
+
+/*! Returns a \p complex number raised to another.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The base.
+ *  \param y The exponent.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const complex<T1>& y);
+
+/*! Returns a \p complex number raised to a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The base.
+ *  \param y The exponent.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const T1& y);
+
+/*! Returns a scalar raised to a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
+ *
+ *  \param x The base.
+ *  \param y The exponent.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const T0& x, const complex<T1>& y);
+
+/*! Returns the complex square root of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> sqrt(const complex<T>& z);
+
+
+/* --- Trigonometric Functions --- */
+
+/*! Returns the complex cosine of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> cos(const complex<T>& z);
+
+/*! Returns the complex sine of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> sin(const complex<T>& z);
+
+/*! Returns the complex tangent of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> tan(const complex<T>& z);
+
+
+
+/* --- Hyperbolic Functions --- */
+
+/*! Returns the complex hyperbolic cosine of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> cosh(const complex<T>& z);
+
+/*! Returns the complex hyperbolic sine of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> sinh(const complex<T>& z);
+
+/*! Returns the complex hyperbolic tangent of a \p complex number.
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> tanh(const complex<T>& z);
+
+
+
+/* --- Inverse Trigonometric Functions --- */
+
+/*! Returns the complex arc cosine of a \p complex number.
+ *
+ *  The range of the real part of the result is [0, Pi] and
+ *  the range of the imaginary part is [-inf, +inf]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> acos(const complex<T>& z);
+
+/*! Returns the complex arc sine of a \p complex number.
+ *
+ *  The range of the real part of the result is [-Pi/2, Pi/2] and
+ *  the range of the imaginary part is [-inf, +inf]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> asin(const complex<T>& z);
+
+/*! Returns the complex arc tangent of a \p complex number.
+ *
+ *  The range of the real part of the result is [-Pi/2, Pi/2] and
+ *  the range of the imaginary part is [-inf, +inf]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> atan(const complex<T>& z);
+
+
+
+/* --- Inverse Hyperbolic Functions --- */
+
+/*! Returns the complex inverse hyperbolic cosine of a \p complex number.
+ *
+ *  The range of the real part of the result is [0, +inf] and
+ *  the range of the imaginary part is [-Pi, Pi]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> acosh(const complex<T>& z);
+
+/*! Returns the complex inverse hyperbolic sine of a \p complex number.
+ *
+ *  The range of the real part of the result is [-inf, +inf] and
+ *  the range of the imaginary part is [-Pi/2, Pi/2]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> asinh(const complex<T>& z);
+
+/*! Returns the complex inverse hyperbolic tangent of a \p complex number.
+ *
+ *  The range of the real part of the result is [-inf, +inf] and
+ *  the range of the imaginary part is [-Pi/2, Pi/2]
+ *
+ *  \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__
+complex<T> atanh(const complex<T>& z);
+
+
+
+/* --- Stream Operators --- */
+
+/*! Writes to an output stream a \p complex number in the form (real, imaginary).
+ *
+ *  \param os The output stream.
+ *  \param z The \p complex number to output.
+ */
+template <typename T, typename CharT, typename Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const complex<T>& z);
+
+/*! Reads a \p complex number from an input stream.
+ *
+ *  The recognized formats are:
+ * - real
+ * - (real)
+ * - (real, imaginary)
+ *
+ * The values read must be convertible to the \p complex's \c value_type
+ *
+ *  \param is The input stream.
+ *  \param z The \p complex number to set.
+ */
+template <typename T, typename CharT, typename Traits>
+__host__
+std::basic_istream<CharT, Traits>&
+operator>>(std::basic_istream<CharT, Traits>& is, complex<T>& z);
+
+
+
+/* --- Equality Operators --- */
+
+/*! Returns true if two \p complex numbers are equal and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const complex<T1>& y);
+
+/*! Returns true if two \p complex numbers are equal and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const complex<T0>& x, const std::complex<T1>& y);
+
+/*! Returns true if two \p complex numbers are equal and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const std::complex<T0>& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is zero and
+ *  the real part is equal to the scalar. Returns false otherwise.
+ *
+ *  \param x The scalar.
+ *  \param y The \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const T0& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is zero and
+ *  the real part is equal to the scalar. Returns false otherwise.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const T1& y);
+
+/*! Returns true if two \p complex numbers are different and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const complex<T1>& y);
+
+/*! Returns true if two \p complex numbers are different and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const complex<T0>& x, const std::complex<T1>& y);
+
+/*! Returns true if two \p complex numbers are different and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const std::complex<T0>& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is not zero or
+ *  the real part is different from the scalar. Returns false otherwise.
+ *
+ *  \param x The scalar.
+ *  \param y The \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const T0& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is not zero or
+ *  the real part is different from the scalar. Returns false otherwise.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const T1& y);
+
+} // end namespace thrust
+
+#include <thrust/detail/complex/complex.inl>
+
+#undef THRUST_STD_COMPLEX_REAL
+#undef THRUST_STD_COMPLEX_IMAG
+#undef THRUST_STD_COMPLEX_DEVICE
+
+/*! \} // complex_numbers
+ */
+
+/*! \} // numerics
+ */
+
diff --git a/thrust/thrust/copy.h b/thrust/thrust/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..23365875d661a947a6e639c3c36522402282bdbb
--- /dev/null
+++ b/thrust/thrust/copy.h
@@ -0,0 +1,513 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/copy.h
+ *  \brief Copies elements from one range to another
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup copying
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p copy copies elements from the range [\p first, \p last) to the range
+ *  [\p result, \p result + (\p last - \p first)). That is, it performs
+ *  the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1),
+ *  and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy
+ *  performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike
+ *  \c std::copy, \p copy offers no guarantee on order of operation.  As a result,
+ *  calling \p copy with overlapping source and destination ranges has undefined
+ *  behavior.
+ *
+ *  The return value is \p result + (\p last - \p first).
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to copy.
+ *  \param last The end of the sequence to copy.
+ *  \param result The destination sequence.
+ *  \return The end of the destination sequence.
+ *  \see http://www.sgi.com/tech/stl/copy.html
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p copy
+ *  to copy from one range to another using the \p thrust::device parallelization policy:
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  thrust::device_vector<int> vec0(100);
+ *  thrust::device_vector<int> vec1(100);
+ *  ...
+ *
+ *  thrust::copy(thrust::device, vec0.begin(), vec0.end(), vec1.begin());
+ *
+ *  // vec1 is now a copy of vec0
+ *  \endcode
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result);
+
+
+/*! \p copy_n copies elements from the range <tt>[first, first + n)</tt> to the range
+ *  <tt>[result, result + n)</tt>. That is, it performs the assignments <tt>*result = *first, *(result + 1) = *(first + 1)</tt>,
+ *  and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy
+ *  performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike
+ *  \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result,
+ *  calling \p copy_n with overlapping source and destination ranges has undefined
+ *  behavior.
+ *
+ *  The return value is \p result + \p n.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range to copy.
+ *  \param n The number of elements to copy.
+ *  \param result The beginning destination range.
+ *  \return The end of the destination range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam Size is an integral type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p copy
+ *  to copy from one range to another using the \p thrust::device parallelization policy:
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  size_t n = 100;
+ *  thrust::device_vector<int> vec0(n);
+ *  thrust::device_vector<int> vec1(n);
+ *  ...
+ *  thrust::copy_n(thrust::device, vec0.begin(), n, vec1.begin());
+ *
+ *  // vec1 is now a copy of vec0
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see thrust::copy
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Size, typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+
+	
+/*! \p copy copies elements from the range [\p first, \p last) to the range
+ *  [\p result, \p result + (\p last - \p first)). That is, it performs
+ *  the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1),
+ *  and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy
+ *  performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike
+ *  \c std::copy, \p copy offers no guarantee on order of operation.  As a result,
+ *  calling \p copy with overlapping source and destination ranges has undefined
+ *  behavior.
+ *
+ *  The return value is \p result + (\p last - \p first).
+ *
+ *  \param first The beginning of the sequence to copy.
+ *  \param last The end of the sequence to copy.
+ *  \param result The destination sequence.
+ *  \return The end of the destination sequence.
+ *  \see http://www.sgi.com/tech/stl/copy.html
+ *
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p copy
+ *  to copy from one range to another.
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *
+ *  thrust::device_vector<int> vec0(100);
+ *  thrust::device_vector<int> vec1(100);
+ *  ...
+ *
+ *  thrust::copy(vec0.begin(), vec0.end(),
+ *               vec1.begin());
+ *
+ *  // vec1 is now a copy of vec0
+ *  \endcode
+ */
+template<typename InputIterator, typename OutputIterator>
+  OutputIterator copy(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result);
+
+/*! \p copy_n copies elements from the range <tt>[first, first + n)</tt> to the range
+ *  <tt>[result, result + n)</tt>. That is, it performs the assignments <tt>*result = *first, *(result + 1) = *(first + 1)</tt>,
+ *  and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy
+ *  performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike
+ *  \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result,
+ *  calling \p copy_n with overlapping source and destination ranges has undefined
+ *  behavior.
+ *
+ *  The return value is \p result + \p n.
+ *
+ *  \param first The beginning of the range to copy.
+ *  \param n The number of elements to copy.
+ *  \param result The beginning destination range.
+ *  \return The end of the destination range.
+ *
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam Size is an integral type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p copy
+ *  to copy from one range to another.
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  size_t n = 100;
+ *  thrust::device_vector<int> vec0(n);
+ *  thrust::device_vector<int> vec1(n);
+ *  ...
+ *  thrust::copy_n(vec0.begin(), n, vec1.begin());
+ *
+ *  // vec1 is now a copy of vec0
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see thrust::copy
+ */
+template<typename InputIterator, typename Size, typename OutputIterator>
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+/*! \} // end copying
+ */
+
+/*! \addtogroup stream_compaction
+ *  \{
+ */
+
+
+/*! This version of \p copy_if copies elements from the range <tt>[first,last)</tt>
+ *  to a range beginning at \p result, except that any element which causes \p pred
+ *  to be \c false is not copied. \p copy_if is stable, meaning that the relative
+ *  order of elements that are copied is unchanged.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p copy_if performs the assignment <tt>*result = *(first+n)</tt> and \p result
+ *  is advanced one position if <tt>pred(*(first+n))</tt>. Otherwise, no assignment
+ *  occurs and \p result is not advanced.
+ *
+ *  The algorithm's execution is parallelized as determined by \p system.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence from which to copy.
+ *  \param last The end of the sequence from which to copy.
+ *  \param result The beginning of the sequence into which to copy.
+ *  \param pred The predicate to test on every value of the range <tt>[first, last)</tt>.
+ *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
+ *          evaluated to \c true in the range <tt>[first, last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p copy_if to perform stream compaction
+ *  to copy even numbers to an output range using the \p thrust::host parallelization policy:
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[4];
+ *
+ *  thrust::copy_if(thrust::host, V, V + N, result, is_even());
+ *
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-2, 0, 0, 2}
+ *  \endcode
+ *
+ *  \see \c remove_copy_if
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+
+/*! This version of \p copy_if copies elements from the range <tt>[first,last)</tt>
+ *  to a range beginning at \p result, except that any element which causes \p pred
+ *  to \c false is not copied. \p copy_if is stable, meaning that the relative
+ *  order of elements that are copied is unchanged.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p copy_if performs the assignment <tt>*result = *(first+n)</tt> and \p result
+ *  is advanced one position if <tt>pred(*(first+n))</tt>. Otherwise, no assignment
+ *  occurs and \p result is not advanced.
+ *
+ *  \param first The beginning of the sequence from which to copy.
+ *  \param last The end of the sequence from which to copy.
+ *  \param result The beginning of the sequence into which to copy.
+ *  \param pred The predicate to test on every value of the range <tt>[first, last)</tt>.
+ *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
+ *          evaluated to \c true in the range <tt>[first, last)</tt>.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p copy_if to perform stream compaction
+ *  to copy even numbers to an output range.
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[4];
+ *
+ *  thrust::copy_if(V, V + N, result, is_even());
+ *
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-2, 0, 0, 2}
+ *  \endcode
+ *
+ *  \see \c remove_copy_if
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+/*! This version of \p copy_if copies elements from the range <tt>[first,last)</tt>
+ *  to a range beginning at \p result, except that any element whose corresponding stencil
+ *  element causes \p pred to be \c false is not copied. \p copy_if is stable, meaning
+ *  that the relative order of elements that are copied is unchanged.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p copy_if performs the assignment <tt>*result = *(first+n)</tt> and \p result
+ *  is advanced one position if <tt>pred(*(stencil+n))</tt>. Otherwise, no assignment
+ *  occurs and \p result is not advanced.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence from which to copy.
+ *  \param last The end of the sequence from which to copy.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the sequence into which to copy.
+ *  \param pred The predicate to test on every value of the range <tt>[stencil, stencil + (last-first))</tt>.
+ *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
+ *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p copy_if to perform stream compaction
+ *  to copy numbers to an output range when corresponding stencil elements are even using the \p thrust::host execution policy:
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int N = 6;
+ *  int data[N]    = { 0, 1,  2, 3, 4, 5};
+ *  int stencil[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[4];
+ *
+ *  thrust::copy_if(thrust::host, data, data + N, stencil, result, is_even());
+ *
+ *  // data remains    = { 0, 1,  2, 3, 4, 5};
+ *  // stencil remains = {-2, 0, -1, 0, 1, 2};
+ *  // result is now     { 0, 1,  3, 5}
+ *  \endcode
+ *
+ *  \see \c remove_copy_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+/*! This version of \p copy_if copies elements from the range <tt>[first,last)</tt>
+ *  to a range beginning at \p result, except that any element whose corresponding stencil
+ *  element causes \p pred to be \c false is not copied. \p copy_if is stable, meaning
+ *  that the relative order of elements that are copied is unchanged.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p copy_if performs the assignment <tt>*result = *(first+n)</tt> and \p result
+ *  is advanced one position if <tt>pred(*(stencil+n))</tt>. Otherwise, no assignment
+ *  occurs and \p result is not advanced.
+ *
+ *  \param first The beginning of the sequence from which to copy.
+ *  \param last The end of the sequence from which to copy.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the sequence into which to copy.
+ *  \param pred The predicate to test on every value of the range <tt>[stencil, stencil + (last-first))</tt>.
+ *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
+ *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p copy_if to perform stream compaction
+ *  to copy numbers to an output range when corresponding stencil elements are even:
+ *
+ *  \code
+ *  #include <thrust/copy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int N = 6;
+ *  int data[N]    = { 0, 1,  2, 3, 4, 5};
+ *  int stencil[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[4];
+ *
+ *  thrust::copy_if(data, data + N, stencil, result, is_even());
+ *
+ *  // data remains    = { 0, 1,  2, 3, 4, 5};
+ *  // stencil remains = {-2, 0, -1, 0, 1, 2};
+ *  // result is now     { 0, 1,  3, 5}
+ *  \endcode
+ *
+ *  \see \c remove_copy_if
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+/*! \} // end stream_compaction
+ */
+	
+} // end namespace thrust
+
+#include <thrust/detail/copy.h>
+#include <thrust/detail/copy_if.h>
+
diff --git a/thrust/thrust/count.h b/thrust/thrust/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..9225bc6a757ad6323af12c41c173b2c11bc0720d
--- /dev/null
+++ b/thrust/thrust/count.h
@@ -0,0 +1,235 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file count.h
+ *  \brief Counting elements in a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup reductions
+ *  \ingroup algorithms
+ *  \{
+ */
+
+/*! \addtogroup counting
+ *  \ingroup reductions
+ *  \{
+ */
+
+
+/*! \p count finds the number of elements in <tt>[first,last)</tt> that are equal
+ *  to \p value. More precisely, \p count returns the number of iterators \c i in
+ *  <tt>[first, last)</tt> such that <tt>*i == value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param value The value to be counted.
+ *  \return The number of elements equal to \p value.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *
+ *  The following code snippet demonstrates how to use \p count to 
+ *  count the number of instances in a range of a value of interest using the \p thrust::device execution policy:
+ *
+ *  \code
+ *  #include <thrust/count.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  // put 3 1s in a device_vector
+ *  thrust::device_vector<int> vec(5,0);
+ *  vec[1] = 1;
+ *  vec[3] = 1;
+ *  vec[4] = 1;
+ *  
+ *  // count the 1s
+ *  int result = thrust::count(thrust::device, vec.begin(), vec.end(), 1);
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/count.html
+ */
+template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value);
+
+
+
+/*! \p count finds the number of elements in <tt>[first,last)</tt> that are equal
+ *  to \p value. More precisely, \p count returns the number of iterators \c i in
+ *  <tt>[first, last)</tt> such that <tt>*i == value</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param value The value to be counted.
+ *  \return The number of elements equal to \p value.
+ *
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *
+ *  The following code snippet demonstrates how to use \p count to 
+ *  count the number of instances in a range of a value of interest.
+ *  \code
+ *  #include <thrust/count.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  // put 3 1s in a device_vector
+ *  thrust::device_vector<int> vec(5,0);
+ *  vec[1] = 1;
+ *  vec[3] = 1;
+ *  vec[4] = 1;
+ *  
+ *  // count the 1s
+ *  int result = thrust::count(vec.begin(), vec.end(), 1);
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/count.html
+ */
+template <typename InputIterator, typename EqualityComparable>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(InputIterator first, InputIterator last, const EqualityComparable& value);
+
+
+/*! \p count_if finds the number of elements in <tt>[first,last)</tt> for which 
+ *  a predicate is \c true. More precisely, \p count_if returns the number of iterators
+ *  \c i in <tt>[first, last)</tt> such that <tt>pred(*i) == true</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param pred The predicate.
+ *  \return The number of elements where \p pred is \c true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p count to
+ *  count the number of odd numbers in a range using the \p thrust::device execution policy:
+ *
+ *  \code
+ *  #include <thrust/count.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_odd
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int &x)
+ *    {
+ *      return x & 1;
+ *    }
+ *  };
+ *  ...
+ *  // fill a device_vector with even & odd numbers
+ *  thrust::device_vector<int> vec(5);
+ *  vec[0] = 0;
+ *  vec[1] = 1;
+ *  vec[2] = 2;
+ *  vec[3] = 3;
+ *  vec[4] = 4;
+ *
+ *  // count the odd elements in vec
+ *  int result = thrust::count_if(thrust::device, vec.begin(), vec.end(), is_odd());
+ *  // result == 2
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/count.html
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \p count_if finds the number of elements in <tt>[first,last)</tt> for which 
+ *  a predicate is \c true. More precisely, \p count_if returns the number of iterators
+ *  \c i in <tt>[first, last)</tt> such that <tt>pred(*i) == true</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param pred The predicate.
+ *  \return The number of elements where \p pred is \c true.
+ *
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p count to
+ *  count the number of odd numbers in a range.
+ *  \code
+ *  #include <thrust/count.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  struct is_odd
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int &x)
+ *    {
+ *      return x & 1;
+ *    }
+ *  };
+ *  ...
+ *  // fill a device_vector with even & odd numbers
+ *  thrust::device_vector<int> vec(5);
+ *  vec[0] = 0;
+ *  vec[1] = 1;
+ *  vec[2] = 2;
+ *  vec[3] = 3;
+ *  vec[4] = 4;
+ *
+ *  // count the odd elements in vec
+ *  int result = thrust::count_if(vec.begin(), vec.end(), is_odd());
+ *  // result == 2
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/count.html
+ */
+template <typename InputIterator, typename Predicate>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \} // end counting
+ *  \} // end reductions
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/count.inl>
+
diff --git a/thrust/thrust/detail/adjacent_difference.inl b/thrust/thrust/detail/adjacent_difference.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f8099450fd4f161c117246358648304bb66ede2d
--- /dev/null
+++ b/thrust/thrust/detail/adjacent_difference.inl
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file adjacent_difference.inl
+ *  \brief Inline file for adjacent_difference.h
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/adjacent_difference.h>
+#include <thrust/system/detail/adl/adjacent_difference.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last, 
+                                   OutputIterator result)
+{
+  using thrust::system::detail::generic::adjacent_difference;
+
+  return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end adjacent_difference()
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last, 
+                                   OutputIterator result,
+                                   BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::adjacent_difference;
+
+  return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op);
+} // end adjacent_difference()
+
+
+template <typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last, 
+                                   OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::adjacent_difference(select_system(system1, system2), first, last, result);
+} // end adjacent_difference()
+
+
+template <typename InputIterator, typename OutputIterator, typename BinaryFunction>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::adjacent_difference(select_system(system1, system2), first, last, result, binary_op);
+} // end adjacent_difference()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/advance.inl b/thrust/thrust/detail/advance.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2694a7ec69e4cb14c381bfaa15357f55145c8e44
--- /dev/null
+++ b/thrust/thrust/detail/advance.inl
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file advance.inl
+ *  \brief Inline file for advance.h
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/advance.h>
+#include <thrust/system/detail/generic/advance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+namespace thrust
+{
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type)
+
+template <typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n)
+{
+  thrust::system::detail::generic::advance(i, n);
+}
+
+template <typename InputIterator>
+__host__ __device__
+InputIterator next(
+  InputIterator i
+, typename iterator_traits<InputIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, n);
+  return i;
+}
+
+template <typename BidirectionalIterator>
+__host__ __device__
+BidirectionalIterator prev(
+  BidirectionalIterator i
+, typename iterator_traits<BidirectionalIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, -n);
+  return i;
+}
+
+template <typename BidirectionalIterator>
+__host__ __device__
+typename detail::disable_if<
+  has_difference_type<iterator_traits<BidirectionalIterator> >::value
+, BidirectionalIterator
+>::type prev(
+  BidirectionalIterator i
+, typename detail::pointer_traits<BidirectionalIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, -n);
+  return i;
+}
+
+} // namespace thrust
+
diff --git a/thrust/thrust/detail/algorithm_wrapper.h b/thrust/thrust/detail/algorithm_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..c09b9a0a0b4dcc52924425d1549093668e5d2952
--- /dev/null
+++ b/thrust/thrust/detail/algorithm_wrapper.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <algorithm>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/thrust/detail/alignment.h b/thrust/thrust/detail/alignment.h
new file mode 100644
index 0000000000000000000000000000000000000000..89c8afcd8c9fc6a160c7fca0801a1d73a71adbc4
--- /dev/null
+++ b/thrust/thrust/detail/alignment.h
@@ -0,0 +1,230 @@
+/*
+ *  Copyright 2017 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file alignment.h
+ *  \brief Type-alignment utilities.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h> // For `integral_constant`.
+
+#include <cstddef> // For `std::size_t` and `std::max_align_t`.
+
+#if THRUST_CPP_DIALECT >= 2011
+    #include <type_traits> // For `std::alignment_of` and `std::aligned_storage`.
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+/// \p THRUST_ALIGNOF is a macro that takes a single type-id as a parameter,
+/// and returns the alignment requirement of the type in bytes.
+/// 
+/// It is an approximation of C++11's `alignof` operator.
+///
+/// Note: MSVC does not allow the builtin used to implement this to be placed
+/// inside of a `__declspec(align(#))` attribute. As a workaround, you can
+/// assign the result of \p THRUST_ALIGNOF to a variable and pass the variable
+/// as the argument to `__declspec(align(#))`.
+#if THRUST_CPP_DIALECT >= 2011
+    #define THRUST_ALIGNOF(x) alignof(x) 
+#else
+    #define THRUST_ALIGNOF(x) __alignof(x)
+#endif
+
+/// \p alignment_of provides the member constant `value` which is equal to the
+/// alignment requirement of the type `T`, as if obtained by a C++11 `alignof`
+/// expression.
+/// 
+/// It is an implementation of C++11's \p std::alignment_of.
+#if THRUST_CPP_DIALECT >= 2011
+    template <typename T>
+    using alignment_of = std::alignment_of<T>;
+#else
+    template <typename T>
+    struct alignment_of;
+
+    template <typename T, std::size_t size_diff>
+    struct alignment_of_helper
+    {
+        static const std::size_t value =
+            integral_constant<std::size_t, size_diff>::value;
+    };
+
+    template <typename T>
+    struct alignment_of_helper<T, 0>
+    {
+        static const std::size_t value = alignment_of<T>::value;
+    };
+
+    template <typename T>
+    struct alignment_of
+    {
+      private:
+        struct impl
+        {
+            T    x;
+            char c;
+        };
+
+      public:
+        static const std::size_t value =
+            alignment_of_helper<impl, sizeof(impl) - sizeof(T)>::value;
+    };
+#endif
+
+/// \p aligned_type provides the nested type `type`, which is a trivial
+/// type whose alignment requirement is a divisor of `Align`.
+///
+/// The behavior is undefined if `Align` is not a power of 2.
+template <std::size_t Align>
+struct aligned_type;
+
+#if THRUST_CPP_DIALECT >= 2011                                                     \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
+  && (THRUST_GCC_VERSION >= 40800)
+    // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
+    template <std::size_t Align>
+    struct aligned_type
+    {
+        struct alignas(Align) type {};
+    };
+#elif  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)                    \
+    || (   (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                 \
+        && (THRUST_GCC_VERSION < 40600))
+    // C++03 implementation for MSVC and GCC <= 4.5.
+    // 
+    // We have to implement `aligned_type` with specializations for MSVC
+    // and GCC 4.2.x and older because they require literals as arguments to 
+    // their alignment attribute.
+
+    #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+        // MSVC implementation.
+        #define THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(X)                  \
+            template <>                                                       \
+            struct aligned_type<X>                                            \
+            {                                                                 \
+                __declspec(align(X)) struct type {};                          \
+            };                                                                \
+            /**/
+    #else
+        // GCC <= 4.2 implementation.
+        #define THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(X)                  \
+            template <>                                                       \
+            struct aligned_type<X>                                            \
+            {                                                                 \
+                struct type {} __attribute__((aligned(X)));                   \
+            };                                                                \
+            /**/
+    #endif
+    
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(1);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(2);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(4);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(8);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(16);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(32);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(64);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(128);
+
+    #undef THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION
+#else
+    // C++03 implementation for GCC > 4.5, Clang, PGI, ICPC, and xlC.
+    template <std::size_t Align>
+    struct aligned_type
+    {
+        struct type {} __attribute__((aligned(Align)));
+    };
+#endif
+
+/// \p aligned_storage provides the nested type `type`, which is a trivial type
+/// suitable for use as uninitialized storage for any object whose size is at
+/// most `Len` bytes and whose alignment requirement is a divisor of `Align`.
+/// 
+/// The behavior is undefined if `Len` is 0 or `Align` is not a power of 2.
+///
+/// It is an implementation of C++11's \p std::aligned_storage.
+#if THRUST_CPP_DIALECT >= 2011
+    template <std::size_t Len, std::size_t Align>
+    using aligned_storage = std::aligned_storage<Len, Align>;
+#else
+    template <std::size_t Len, std::size_t Align>
+    struct aligned_storage
+    {
+        union type
+        {
+            unsigned char data[Len];
+            // We put this into the union in case the alignment requirement of
+            // an array of `unsigned char` of length `Len` is greater than
+            // `Align`.
+
+            typename aligned_type<Align>::type align;
+        };
+    };
+#endif
+
+/// \p max_align_t is a trivial type whose alignment requirement is at least as
+/// strict (as large) as that of every scalar type.
+///
+/// It is an implementation of C++11's \p std::max_align_t.
+#if THRUST_CPP_DIALECT >= 2011                                                     \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
+  && (THRUST_GCC_VERSION >= 40900)
+    // GCC 4.7 and 4.8 don't have `std::max_align_t`.
+    using max_align_t = std::max_align_t;
+#else
+    union max_align_t
+    {
+        // These cannot be private because C++03 POD types cannot have private
+        // data members.
+        char c;
+        short s;
+        int i;
+        long l;
+        float f;
+        double d;
+        long long ll;
+        long double ld;
+        void* p;
+    };
+#endif
+
+/// \p aligned_reinterpret_cast `reinterpret_cast`s \p u of type \p U to `void*`
+/// and then `reinterpret_cast`s the result to \p T. The indirection through
+/// `void*` suppresses compiler warnings when the alignment requirement of \p *u
+/// is less than the alignment requirement of \p *t. The caller of
+/// \p aligned_reinterpret_cast is responsible for ensuring that the alignment
+/// requirements are actually satisified.
+template <typename T, typename U>
+__host__ __device__
+T aligned_reinterpret_cast(U u)
+{
+  return reinterpret_cast<T>(reinterpret_cast<void*>(u));
+}
+
+__host__ __device__
+inline std::size_t aligned_storage_size(std::size_t n, std::size_t align)
+{
+  return ((n + align - 1) / align) * align;
+}
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/allocator/allocator_traits.h b/thrust/thrust/detail/allocator/allocator_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2557b57efa55a1538f58bf5abc790cff5a360a3
--- /dev/null
+++ b/thrust/thrust/detail/allocator/allocator_traits.h
@@ -0,0 +1,422 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// allocator_traits::rebind_alloc and allocator::rebind_traits are from libc++,
+// dual licensed under the MIT and the University of Illinois Open Source
+// Licenses.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/detail/type_traits/has_member_function.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// forward declaration for has_member_system
+template<typename Alloc> struct allocator_system;
+
+
+namespace allocator_traits_detail
+{
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_value_type, value_type)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_pointer, pointer)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_pointer, const_pointer)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_reference, reference)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_reference, const_reference)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_void_pointer, void_pointer)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_void_pointer, const_void_pointer)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_size_type, size_type)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_copy_assignment, propagate_on_container_copy_assignment)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_move_assignment, propagate_on_container_move_assignment)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_swap, propagate_on_container_swap)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_system_type, system_type)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_is_always_equal, is_always_equal)
+__THRUST_DEFINE_HAS_MEMBER_FUNCTION(has_member_system_impl, system)
+
+template<typename Alloc, typename U>
+  struct has_rebind
+{
+  typedef char yes_type;
+  typedef int  no_type;
+
+  template<typename S>
+  static yes_type test(typename S::template rebind<U>::other*);
+  template<typename S>
+  static no_type  test(...);
+
+  static bool const value = sizeof(test<U>(0)) == sizeof(yes_type);
+
+  typedef thrust::detail::integral_constant<bool, value> type;
+};
+
+template<typename T>
+  struct nested_pointer
+{
+  typedef typename T::pointer type;
+};
+
+template<typename T>
+  struct nested_const_pointer
+{
+  typedef typename T::const_pointer type;
+};
+
+template<typename T>
+  struct nested_reference
+{
+  typedef typename T::reference type;
+};
+
+template<typename T>
+  struct nested_const_reference
+{
+  typedef typename T::const_reference type;
+};
+
+template<typename T>
+  struct nested_void_pointer
+{
+  typedef typename T::void_pointer type;
+};
+
+template<typename T>
+  struct nested_const_void_pointer
+{
+  typedef typename T::const_void_pointer type;
+};
+
+template<typename T>
+  struct nested_difference_type
+{
+  typedef typename T::difference_type type;
+};
+
+template<typename T>
+  struct nested_size_type
+{
+  typedef typename T::size_type type;
+};
+
+template<typename T>
+  struct nested_propagate_on_container_copy_assignment
+{
+  typedef typename T::propagate_on_container_copy_assignment type;
+};
+
+template<typename T>
+  struct nested_propagate_on_container_move_assignment
+{
+  typedef typename T::propagate_on_container_move_assignment type;
+};
+
+template<typename T>
+  struct nested_propagate_on_container_swap
+{
+  typedef typename T::propagate_on_container_swap type;
+};
+
+template<typename T>
+  struct nested_is_always_equal
+{
+  typedef typename T::is_always_equal type;
+};
+
+template<typename T>
+  struct nested_system_type
+{
+  typedef typename T::system_type type;
+};
+
+template<typename Alloc>
+  struct has_member_system
+{
+  typedef typename allocator_system<Alloc>::type system_type;
+
+  typedef typename has_member_system_impl<Alloc, system_type&(void)>::type type;
+  static const bool value = type::value;
+};
+
+template<class Alloc, class U, bool = has_rebind<Alloc, U>::value>
+  struct rebind_alloc
+{
+    typedef typename Alloc::template rebind<U>::other type;
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template<template<typename, typename...> class Alloc,
+         typename T, typename... Args, typename U>
+  struct rebind_alloc<Alloc<T, Args...>, U, true>
+{
+    typedef typename Alloc<T, Args...>::template rebind<U>::other type;
+};
+
+template<template<typename, typename...> class Alloc,
+         typename T, typename... Args, typename U>
+  struct rebind_alloc<Alloc<T, Args...>, U, false>
+{
+    typedef Alloc<U, Args...> type;
+};
+#else // C++03
+template <template <typename> class Alloc, typename T, typename U>
+  struct rebind_alloc<Alloc<T>, U, true>
+{
+    typedef typename Alloc<T>::template rebind<U>::other type;
+};
+
+template <template <typename> class Alloc, typename T, typename U>
+  struct rebind_alloc<Alloc<T>, U, false>
+{
+    typedef Alloc<U> type;
+};
+
+template<template<typename, typename> class Alloc,
+         typename T, typename A0, typename U>
+  struct rebind_alloc<Alloc<T, A0>, U, true>
+{
+    typedef typename Alloc<T, A0>::template rebind<U>::other type;
+};
+
+template<template<typename, typename> class Alloc,
+         typename T, typename A0, typename U>
+  struct rebind_alloc<Alloc<T, A0>, U, false>
+{
+    typedef Alloc<U, A0> type;
+};
+
+template<template<typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1>, U, true>
+{
+    typedef typename Alloc<T, A0, A1>::template rebind<U>::other type;
+};
+
+template<template<typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1>, U, false>
+{
+    typedef Alloc<U, A0, A1> type;
+};
+
+template<template<typename, typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename A2, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1, A2>, U, true>
+{
+    typedef typename Alloc<T, A0, A1, A2>::template rebind<U>::other type;
+};
+
+template<template<typename, typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename A2, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1, A2>, U, false>
+{
+    typedef Alloc<U, A0, A1, A2> type;
+};
+#endif
+
+} // end allocator_traits_detail
+
+
+template<typename Alloc>
+  struct allocator_traits
+{
+  typedef Alloc allocator_type;
+
+  typedef typename allocator_type::value_type value_type;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_pointer<allocator_type>::value,
+    allocator_traits_detail::nested_pointer<allocator_type>,
+    identity_<value_type*>
+  >::type pointer;
+
+  private:
+    template<typename T>
+      struct rebind_pointer
+    {
+      typedef typename pointer_traits<pointer>::template rebind<T>::other type;
+    };
+
+  public:
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_const_pointer<allocator_type>::value,
+    allocator_traits_detail::nested_const_pointer<allocator_type>,
+    rebind_pointer<const value_type>
+  >::type const_pointer;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_void_pointer<allocator_type>::value,
+    allocator_traits_detail::nested_void_pointer<allocator_type>,
+    rebind_pointer<void>
+  >::type void_pointer;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_const_void_pointer<allocator_type>::value,
+    allocator_traits_detail::nested_const_void_pointer<allocator_type>,
+    rebind_pointer<const void>
+  >::type const_void_pointer;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_difference_type<allocator_type>::value,
+    allocator_traits_detail::nested_difference_type<allocator_type>,
+    pointer_difference<pointer>
+  >::type difference_type;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_size_type<allocator_type>::value,
+    allocator_traits_detail::nested_size_type<allocator_type>,
+    make_unsigned<difference_type>
+  >::type size_type;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_propagate_on_container_copy_assignment<allocator_type>::value,
+    allocator_traits_detail::nested_propagate_on_container_copy_assignment<allocator_type>,
+    identity_<false_type>
+  >::type propagate_on_container_copy_assignment;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_propagate_on_container_move_assignment<allocator_type>::value,
+    allocator_traits_detail::nested_propagate_on_container_move_assignment<allocator_type>,
+    identity_<false_type>
+  >::type propagate_on_container_move_assignment;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_propagate_on_container_swap<allocator_type>::value,
+    allocator_traits_detail::nested_propagate_on_container_swap<allocator_type>,
+    identity_<false_type>
+  >::type propagate_on_container_swap;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_is_always_equal<allocator_type>::value,
+    allocator_traits_detail::nested_is_always_equal<allocator_type>,
+    is_empty<allocator_type>
+  >::type is_always_equal;
+
+  typedef typename eval_if<
+    allocator_traits_detail::has_system_type<allocator_type>::value,
+    allocator_traits_detail::nested_system_type<allocator_type>,
+    thrust::iterator_system<pointer>
+  >::type system_type;
+
+  // XXX rebind and rebind_traits are alias templates
+  //     and so are omitted while c++11 is unavailable
+
+#if THRUST_CPP_DIALECT >= 2011
+  template <typename U>
+  using rebind_alloc =
+    typename allocator_traits_detail::rebind_alloc<allocator_type, U>::type;
+
+  template <typename U>
+  using rebind_traits = allocator_traits<rebind_alloc<U>>;
+
+  // We define this nested type alias for compatibility with the C++03-style
+  // rebind_* mechanisms.
+  using other = allocator_traits;
+#else
+  template <typename U>
+  struct rebind_alloc
+  {
+    typedef typename
+      allocator_traits_detail::rebind_alloc<allocator_type, U>::type other;
+  };
+  template <typename U>
+  struct rebind_traits
+  {
+    typedef allocator_traits<typename rebind_alloc<U>::other> other;
+  };
+#endif
+
+  // Deprecated std::allocator typedefs that we need:
+  typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
+  typedef typename thrust::detail::pointer_traits<const_pointer>::reference const_reference;
+
+  inline __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n);
+
+  inline __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint);
+
+  inline __host__ __device__
+  static void deallocate(allocator_type &a, pointer p, size_type n);
+
+  // XXX should probably change T* to pointer below and then relax later
+
+  template<typename T>
+  inline __host__ __device__ static void construct(allocator_type &a, T *p);
+  
+  template<typename T, typename Arg1>
+  inline __host__ __device__ static void construct(allocator_type &a, T *p, const Arg1 &arg1);
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename... Args>
+  inline __host__ __device__ static void construct(allocator_type &a, T *p, Args&&... args);
+#endif
+
+  template<typename T>
+  inline __host__ __device__ static void destroy(allocator_type &a, T *p);
+
+  inline __host__ __device__
+  static size_type max_size(const allocator_type &a);
+}; // end allocator_traits
+
+
+// we consider a type an allocator if T::value_type exists
+// it doesn't make much sense (containers, which are not allocators, will fulfill this requirement),
+// but allocator_traits is specified to work for any type with that nested typedef
+template<typename T>
+  struct is_allocator
+    : allocator_traits_detail::has_value_type<T>
+{};
+
+
+// XXX consider moving this non-standard functionality inside allocator_traits
+template<typename Alloc>
+  struct allocator_system
+{
+  // the type of the allocator's system
+  typedef typename eval_if<
+    allocator_traits_detail::has_system_type<Alloc>::value,
+    allocator_traits_detail::nested_system_type<Alloc>,
+    thrust::iterator_system<
+      typename allocator_traits<Alloc>::pointer
+    >
+  >::type type;
+
+  // the type that get returns
+  typedef typename eval_if<
+    allocator_traits_detail::has_member_system<Alloc>::value, // if Alloc.system() exists
+    add_reference<type>,                                      // then get() needs to return a reference
+    identity_<type>                                           // else get() needs to return a value
+  >::type get_result_type;
+
+  __host__ __device__
+  inline static get_result_type get(Alloc &a);
+};
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/allocator_traits.inl>
+
diff --git a/thrust/thrust/detail/allocator/allocator_traits.inl b/thrust/thrust/detail/allocator/allocator_traits.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0818941f696cfbd58ce17ddf95fc6e9118984d50
--- /dev/null
+++ b/thrust/thrust/detail/allocator/allocator_traits.inl
@@ -0,0 +1,464 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/type_traits/is_call_possible.h>
+#include <thrust/detail/integer_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/type_deduction.h>
+#endif
+
+#include <thrust/detail/memory_wrapper.h>
+#include <new>
+
+namespace thrust
+{
+namespace detail
+{
+
+#if THRUST_CPP_DIALECT >= 2011
+
+// std::allocator's member functions are deprecated in C++17 and removed in
+// C++20, so we can't just use the generic implementation for allocator_traits
+// that calls the allocator's member functions.
+// Instead, specialize allocator_traits for std::allocator and defer to
+// std::allocator_traits<std::allocator> and let the STL do whatever it needs
+// to for the current c++ version. Manually forward the calls to suppress
+// host/device warnings.
+template <typename T>
+struct allocator_traits<std::allocator<T>>
+  : public std::allocator_traits<std::allocator<T>>
+{
+private:
+  using superclass = std::allocator_traits<std::allocator<T>>;
+
+public:
+  using allocator_type = typename superclass::allocator_type;
+  using value_type = typename superclass::value_type;
+  using pointer = typename superclass::pointer;
+  using const_pointer = typename superclass::const_pointer;
+  using void_pointer = typename superclass::void_pointer;
+  using const_void_pointer = typename superclass::const_void_pointer;
+  using difference_type = typename superclass::difference_type;
+  using size_type = typename superclass::size_type;
+  using propagate_on_container_swap = typename superclass::propagate_on_container_swap;
+  using propagate_on_container_copy_assignment =
+    typename superclass::propagate_on_container_copy_assignment;
+  using propagate_on_container_move_assignment =
+    typename superclass::propagate_on_container_move_assignment;
+
+  // std::allocator_traits added this in C++17, but thrust::allocator_traits defines
+  // it unconditionally.
+  using is_always_equal = typename eval_if<
+      allocator_traits_detail::has_is_always_equal<allocator_type>::value,
+      allocator_traits_detail::nested_is_always_equal<allocator_type>,
+      is_empty<allocator_type>
+    >::type;
+
+  // std::allocator_traits doesn't provide these, but
+  // thrust::detail::allocator_traits does. These used to be part of the
+  // std::allocator API but were deprecated in C++17.
+  using reference = typename thrust::detail::pointer_traits<pointer>::reference;
+  using const_reference = typename thrust::detail::pointer_traits<const_pointer>::reference;
+
+  template <typename U>
+  using rebind_alloc = std::allocator<U>;
+  template <typename U>
+  using rebind_traits = allocator_traits<std::allocator<U>>;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n)
+  {
+    return superclass::allocate(a, n);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint)
+  {
+    return superclass::allocate(a, n, hint);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static void deallocate(allocator_type &a, pointer p, size_type n)
+  {
+    superclass::deallocate(a, p, n);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U, typename ...Args>
+  __host__ __device__
+  static void construct(allocator_type &a, U *p, Args&&... args)
+  {
+    superclass::construct(a, p, THRUST_FWD(args)...);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U>
+  __host__ __device__
+  static void destroy(allocator_type &a, U *p)
+  {
+    superclass::destroy(a, p);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static size_type max_size(const allocator_type &a)
+  {
+    return superclass::max_size(a);
+  }
+};
+
+#endif //  C++11
+
+namespace allocator_traits_detail
+{
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_allocate_with_hint_impl, allocate)
+
+template<typename Alloc>
+  class has_member_allocate_with_hint
+{
+  typedef typename allocator_traits<Alloc>::pointer            pointer;
+  typedef typename allocator_traits<Alloc>::size_type          size_type;
+  typedef typename allocator_traits<Alloc>::const_void_pointer const_void_pointer;
+
+  public:
+    typedef typename has_member_allocate_with_hint_impl<Alloc, pointer(size_type,const_void_pointer)>::type type;
+    static const bool value = type::value;
+};
+
+template<typename Alloc>
+__host__ __device__
+  typename enable_if<
+    has_member_allocate_with_hint<Alloc>::value,
+    typename allocator_traits<Alloc>::pointer
+  >::type
+    allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n, typename allocator_traits<Alloc>::const_void_pointer hint)
+{
+  return a.allocate(n,hint);
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename disable_if<
+    has_member_allocate_with_hint<Alloc>::value,
+    typename allocator_traits<Alloc>::pointer
+  >::type
+    allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n, typename allocator_traits<Alloc>::const_void_pointer)
+{
+  return a.allocate(n);
+}
+
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct1_impl, construct)
+
+template<typename Alloc, typename T>
+  struct has_member_construct1
+    : has_member_construct1_impl<Alloc, void(T*)>
+{};
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T>
+  inline __host__ __device__
+    typename enable_if<
+      has_member_construct1<Alloc,T>::value
+    >::type
+      construct(Alloc &a, T *p)
+{
+  a.construct(p);
+}
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T>
+  inline __host__ __device__
+    typename disable_if<
+      has_member_construct1<Alloc,T>::value
+    >::type
+      construct(Alloc &, T *p)
+{
+  ::new(static_cast<void*>(p)) T();
+}
+
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct2_impl, construct)
+
+template<typename Alloc, typename T, typename Arg1>
+  struct has_member_construct2
+    : has_member_construct2_impl<Alloc, void(T*,const Arg1 &)>
+{};
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename Arg1>
+  inline __host__ __device__
+    typename enable_if<
+      has_member_construct2<Alloc,T,Arg1>::value
+    >::type
+      construct(Alloc &a, T *p, const Arg1 &arg1)
+{
+  a.construct(p,arg1);
+}
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename Arg1>
+  inline __host__ __device__
+    typename disable_if<
+      has_member_construct2<Alloc,T,Arg1>::value
+    >::type
+      construct(Alloc &, T *p, const Arg1 &arg1)
+{
+  ::new(static_cast<void*>(p)) T(arg1);
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_constructN_impl, construct)
+
+template<typename Alloc, typename T, typename... Args>
+  struct has_member_constructN
+    : has_member_constructN_impl<Alloc, void(T*, Args...)>
+{};
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename... Args>
+  inline __host__ __device__
+    typename enable_if<
+      has_member_constructN<Alloc, T, Args...>::value
+    >::type
+      construct(Alloc &a, T* p, Args&&... args)
+{
+  a.construct(p, THRUST_FWD(args)...);
+}
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename... Args>
+  inline __host__ __device__
+    typename disable_if<
+      has_member_constructN<Alloc, T, Args...>::value
+    >::type
+      construct(Alloc &, T* p, Args&&... args)
+{
+  ::new(static_cast<void*>(p)) T(THRUST_FWD(args)...);
+}
+
+#endif
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_destroy_impl, destroy)
+
+template<typename Alloc, typename T>
+  struct has_member_destroy
+    : has_member_destroy_impl<Alloc, void(T*)>
+{};
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T>
+  inline __host__ __device__
+    typename enable_if<
+      has_member_destroy<Alloc,T>::value
+    >::type
+      destroy(Alloc &a, T *p)
+{
+  a.destroy(p);
+}
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T>
+  inline __host__ __device__
+    typename disable_if<
+      has_member_destroy<Alloc,T>::value
+    >::type
+      destroy(Alloc &, T *p)
+{
+  p->~T();
+}
+
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_max_size_impl, max_size)
+
+template<typename Alloc>
+  class has_member_max_size
+{
+  typedef typename allocator_traits<Alloc>::size_type size_type;
+
+  public:
+    typedef typename has_member_max_size_impl<Alloc, size_type(void)>::type type;
+    static const bool value = type::value;
+};
+
+template<typename Alloc>
+__host__ __device__
+  typename enable_if<
+    has_member_max_size<Alloc>::value,
+    typename allocator_traits<Alloc>::size_type
+  >::type
+    max_size(const Alloc &a)
+{
+  return a.max_size();
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename disable_if<
+    has_member_max_size<Alloc>::value,
+    typename allocator_traits<Alloc>::size_type
+  >::type
+    max_size(const Alloc &)
+{
+  typedef typename allocator_traits<Alloc>::size_type size_type;
+  return thrust::detail::integer_traits<size_type>::const_max;
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename enable_if<
+    has_member_system<Alloc>::value,
+    typename allocator_system<Alloc>::type &
+  >::type
+    system(Alloc &a)
+{
+  // return the allocator's system
+  return a.system();
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename disable_if<
+    has_member_system<Alloc>::value,
+    typename allocator_system<Alloc>::type
+  >::type
+    system(Alloc &)
+{
+  // return a copy of a value-initialized system
+  return typename allocator_system<Alloc>::type();
+}
+
+
+} // end allocator_traits_detail
+
+
+template<typename Alloc>
+__host__ __device__
+  typename allocator_traits<Alloc>::pointer
+    allocator_traits<Alloc>
+      ::allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n)
+{
+  struct workaround_warnings
+  {
+    __thrust_exec_check_disable__
+    static __host__ __device__ 
+    typename allocator_traits<Alloc>::pointer
+      allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n)
+    {
+      return a.allocate(n);
+    }
+  };
+
+  return workaround_warnings::allocate(a, n);
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename allocator_traits<Alloc>::pointer
+    allocator_traits<Alloc>
+      ::allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n, typename allocator_traits<Alloc>::const_void_pointer hint)
+{
+  return allocator_traits_detail::allocate(a, n, hint);
+}
+
+template<typename Alloc>
+__host__ __device__
+  void allocator_traits<Alloc>
+    ::deallocate(Alloc &a, typename allocator_traits<Alloc>::pointer p, typename allocator_traits<Alloc>::size_type n)
+{
+  struct workaround_warnings
+  {
+    __thrust_exec_check_disable__
+    static __host__ __device__
+    void deallocate(Alloc &a, typename allocator_traits<Alloc>::pointer p, typename allocator_traits<Alloc>::size_type n)
+    {
+      return a.deallocate(p,n);
+    }
+  };
+
+  return workaround_warnings::deallocate(a,p,n);
+}
+
+template<typename Alloc>
+  template<typename T>
+  __host__ __device__
+    void allocator_traits<Alloc>
+      ::construct(allocator_type &a, T *p)
+{
+  return allocator_traits_detail::construct(a,p);
+}
+
+template<typename Alloc>
+  template<typename T, typename Arg1>
+  __host__ __device__
+    void allocator_traits<Alloc>
+      ::construct(allocator_type &a, T *p, const Arg1 &arg1)
+{
+  return allocator_traits_detail::construct(a,p,arg1);
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename Alloc>
+  template<typename T, typename... Args>
+  __host__ __device__
+    void allocator_traits<Alloc>
+      ::construct(allocator_type &a, T *p, Args&&... args)
+{
+  return allocator_traits_detail::construct(a, p, THRUST_FWD(args)...);
+}
+
+#endif
+
+template<typename Alloc>
+  template<typename T>
+  __host__ __device__
+    void allocator_traits<Alloc>
+      ::destroy(allocator_type &a, T *p)
+{
+  return allocator_traits_detail::destroy(a,p);
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename allocator_traits<Alloc>::size_type
+    allocator_traits<Alloc>
+      ::max_size(const allocator_type &a)
+{
+  return allocator_traits_detail::max_size(a);
+}
+
+template<typename Alloc>
+__host__ __device__
+  typename allocator_system<Alloc>::get_result_type
+    allocator_system<Alloc>
+      ::get(Alloc &a)
+{
+  return allocator_traits_detail::system(a);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/copy_construct_range.h b/thrust/thrust/detail/allocator/copy_construct_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..491c8ef411ec7a3c035067708b947aa42d71ec11
--- /dev/null
+++ b/thrust/thrust/detail/allocator/copy_construct_range.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename System, typename Allocator, typename InputIterator, typename Pointer>
+__host__ __device__
+  Pointer copy_construct_range(thrust::execution_policy<System> &from_system,
+                               Allocator &a,
+                               InputIterator first,
+                               InputIterator last,
+                               Pointer result);
+
+template<typename System, typename Allocator, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  Pointer copy_construct_range_n(thrust::execution_policy<System> &from_system,
+                                 Allocator &a,
+                                 InputIterator first,
+                                 Size n,
+                                 Pointer result);
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/copy_construct_range.inl>
+
diff --git a/thrust/thrust/detail/allocator/copy_construct_range.inl b/thrust/thrust/detail/allocator/copy_construct_range.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2f0f03c3614d7dd8d39743352e9154dc0fd63c59
--- /dev/null
+++ b/thrust/thrust/detail/allocator/copy_construct_range.inl
@@ -0,0 +1,309 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/copy.h>
+#include <thrust/tuple.h>
+#include <thrust/advance.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/for_each.h>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace allocator_traits_detail
+{
+
+
+template<typename Allocator, typename InputType, typename OutputType>
+  struct copy_construct_with_allocator
+{
+  Allocator &a;
+
+  __host__ __device__
+  copy_construct_with_allocator(Allocator &a)
+    : a(a)
+  {}
+
+  template<typename Tuple>
+  inline __host__ __device__
+  void operator()(Tuple t)
+  {
+    const InputType &in = thrust::get<0>(t);
+    OutputType &out = thrust::get<1>(t);
+
+    allocator_traits<Allocator>::construct(a, &out, in);
+  }
+};
+
+
+// we need to use allocator_traits<Allocator>::construct() to
+// copy construct a T if either:
+// 1. Allocator has a 2-argument construct() member or
+// 2. T has a non-trivial copy constructor
+template<typename Allocator, typename T>
+  struct needs_copy_construct_via_allocator
+    : integral_constant<
+        bool,
+        (has_member_construct2<Allocator,T,T>::value || !has_trivial_copy_constructor<T>::value)
+      >
+{};
+
+
+// we know that std::allocator::construct's only effect is to call T's
+// copy constructor, so we needn't consider or use its construct() member for copy construction
+template<typename U, typename T>
+  struct needs_copy_construct_via_allocator<std::allocator<U>, T>
+    : integral_constant<
+        bool,
+        !has_trivial_copy_constructor<T>::value
+      >
+{};
+
+
+// XXX it's regrettable that this implementation is copied almost
+//     exactly from system::detail::generic::uninitialized_copy
+//     perhaps generic::uninitialized_copy could call this routine
+//     with a default allocator
+template<typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Pointer>
+__host__ __device__
+  typename enable_if_convertible<
+    FromSystem,
+    ToSystem,
+    Pointer
+  >::type
+    uninitialized_copy_with_allocator(Allocator &a,
+                                      const thrust::execution_policy<FromSystem> &,
+                                      const thrust::execution_policy<ToSystem> &to_system,
+                                      InputIterator first,
+                                      InputIterator last,
+                                      Pointer result)
+{
+  // zip up the iterators
+  typedef thrust::tuple<InputIterator,Pointer> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple>  ZipIterator;
+
+  ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result));
+  ZipIterator end = begin;
+
+  // get a zip_iterator pointing to the end
+  const typename thrust::iterator_difference<InputIterator>::type n = thrust::distance(first,last);
+  thrust::advance(end,n);
+
+  // create a functor
+  typedef typename iterator_traits<InputIterator>::value_type InputType;
+  typedef typename iterator_traits<Pointer>::value_type       OutputType;
+
+  // do the for_each
+  // note we use to_system to dispatch the for_each
+  thrust::for_each(to_system, begin, end, copy_construct_with_allocator<Allocator,InputType,OutputType>(a));
+
+  // return the end of the output range
+  return thrust::get<1>(end.get_iterator_tuple());
+}
+
+
+// XXX it's regrettable that this implementation is copied almost
+//     exactly from system::detail::generic::uninitialized_copy_n
+//     perhaps generic::uninitialized_copy_n could call this routine
+//     with a default allocator
+template<typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  typename enable_if_convertible<
+    FromSystem,
+    ToSystem,
+    Pointer
+  >::type
+    uninitialized_copy_with_allocator_n(Allocator &a,
+                                        const thrust::execution_policy<FromSystem> &,
+                                        const thrust::execution_policy<ToSystem> &to_system,
+                                        InputIterator first,
+                                        Size n,
+                                        Pointer result)
+{
+  // zip up the iterators
+  typedef thrust::tuple<InputIterator,Pointer> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple>  ZipIterator;
+
+  ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result));
+
+  // create a functor
+  typedef typename iterator_traits<InputIterator>::value_type InputType;
+  typedef typename iterator_traits<Pointer>::value_type       OutputType;
+
+  // do the for_each_n
+  // note we use to_system to dispatch the for_each_n
+  ZipIterator end = thrust::for_each_n(to_system, begin, n, copy_construct_with_allocator<Allocator,InputType,OutputType>(a));
+
+  // return the end of the output range
+  return thrust::get<1>(end.get_iterator_tuple());
+}
+
+
+template<typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Pointer>
+__host__ __device__
+  typename disable_if_convertible<
+    FromSystem,
+    ToSystem,
+    Pointer
+  >::type
+    uninitialized_copy_with_allocator(Allocator &,
+                                      const thrust::execution_policy<FromSystem> &from_system,
+                                      const thrust::execution_policy<ToSystem> &to_system,
+                                      InputIterator first,
+                                      InputIterator last,
+                                      Pointer result)
+{
+  // the systems aren't trivially interoperable
+  // just call two_system_copy and hope for the best
+  return thrust::detail::two_system_copy(from_system, to_system, first, last, result);
+} // end uninitialized_copy_with_allocator()
+
+
+template<typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  typename disable_if_convertible<
+    FromSystem,
+    ToSystem,
+    Pointer
+  >::type
+    uninitialized_copy_with_allocator_n(Allocator &,
+                                        const thrust::execution_policy<FromSystem> &from_system,
+                                        const thrust::execution_policy<ToSystem> &to_system,
+                                        InputIterator first,
+                                        Size n,
+                                        Pointer result)
+{
+  // the systems aren't trivially interoperable
+  // just call two_system_copy_n and hope for the best
+  return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result);
+} // end uninitialized_copy_with_allocator_n()
+
+
+template<typename FromSystem, typename Allocator, typename InputIterator, typename Pointer>
+__host__ __device__
+  typename disable_if<
+    needs_copy_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value,
+    Pointer
+  >::type
+    copy_construct_range(thrust::execution_policy<FromSystem> &from_system,
+                         Allocator &a,
+                         InputIterator first,
+                         InputIterator last,
+                         Pointer result)
+{
+  // just call two_system_copy
+  return thrust::detail::two_system_copy(from_system, allocator_system<Allocator>::get(a), first, last, result);
+}
+
+
+template<typename FromSystem, typename Allocator, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  typename disable_if<
+    needs_copy_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value,
+    Pointer
+  >::type
+    copy_construct_range_n(thrust::execution_policy<FromSystem> &from_system,
+                           Allocator &a,
+                           InputIterator first,
+                           Size n,
+                           Pointer result)
+{
+  // just call two_system_copy_n
+  return thrust::detail::two_system_copy_n(from_system, allocator_system<Allocator>::get(a), first, n, result);
+}
+
+
+template<typename FromSystem, typename Allocator, typename InputIterator, typename Pointer>
+__host__ __device__
+  typename enable_if<
+    needs_copy_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value,
+    Pointer
+  >::type
+    copy_construct_range(thrust::execution_policy<FromSystem> &from_system,
+                         Allocator &a,
+                         InputIterator first,
+                         InputIterator last,
+                         Pointer result)
+{
+  return uninitialized_copy_with_allocator(a, from_system, allocator_system<Allocator>::get(a), first, last, result);
+}
+
+
+template<typename FromSystem, typename Allocator, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  typename enable_if<
+    needs_copy_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value,
+    Pointer
+  >::type
+    copy_construct_range_n(thrust::execution_policy<FromSystem> &from_system,
+                           Allocator &a,
+                           InputIterator first,
+                           Size n,
+                           Pointer result)
+{
+  return uninitialized_copy_with_allocator_n(a, from_system, allocator_system<Allocator>::get(a), first, n, result);
+}
+
+
+} // end allocator_traits_detail
+
+
+template<typename System, typename Allocator, typename InputIterator, typename Pointer>
+__host__ __device__
+  Pointer copy_construct_range(thrust::execution_policy<System> &from_system,
+                               Allocator &a,
+                               InputIterator first,
+                               InputIterator last,
+                               Pointer result)
+{
+  return allocator_traits_detail::copy_construct_range(from_system, a, first, last, result);
+}
+
+
+template<typename System, typename Allocator, typename InputIterator, typename Size, typename Pointer>
+__host__ __device__
+  Pointer copy_construct_range_n(thrust::execution_policy<System> &from_system,
+                                 Allocator &a,
+                                 InputIterator first,
+                                 Size n,
+                                 Pointer result)
+{
+  return allocator_traits_detail::copy_construct_range_n(from_system, a, first, n, result);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/default_construct_range.h b/thrust/thrust/detail/allocator/default_construct_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c3856c142990a3230c3bc4f805c0cb0a5fbcb73
--- /dev/null
+++ b/thrust/thrust/detail/allocator/default_construct_range.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+inline void default_construct_range(Allocator &a, Pointer p, Size n);
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/default_construct_range.inl>
+
+
diff --git a/thrust/thrust/detail/allocator/default_construct_range.inl b/thrust/thrust/detail/allocator/default_construct_range.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0f65d4806ac63fb29647ec0588f750ba5403b82f
--- /dev/null
+++ b/thrust/thrust/detail/allocator/default_construct_range.inl
@@ -0,0 +1,111 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/for_each.h>
+#include <thrust/uninitialized_fill.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace allocator_traits_detail
+{
+
+
+template<typename Allocator>
+  struct construct1_via_allocator
+{
+  Allocator &a;
+
+  __host__ __device__
+  construct1_via_allocator(Allocator &a)
+    : a(a)
+  {}
+
+  template<typename T>
+  inline __host__ __device__
+  void operator()(T &x)
+  {
+    allocator_traits<Allocator>::construct(a, &x);
+  }
+};
+
+
+// we need to construct T via the allocator if...
+template<typename Allocator, typename T>
+  struct needs_default_construct_via_allocator
+    : thrust::detail::or_<
+        has_member_construct1<Allocator,T>,               // if the Allocator does something interesting
+        thrust::detail::not_<has_trivial_constructor<T> > // or if T's default constructor does something interesting
+      >
+{};
+
+
+// we know that std::allocator::construct's only effect is to call T's 
+// default constructor, so we needn't use it for default construction
+// unless T's constructor does something interesting
+template<typename U, typename T>
+  struct needs_default_construct_via_allocator<std::allocator<U>, T>
+    : thrust::detail::not_<has_trivial_constructor<T> >
+{};
+
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  typename enable_if<
+    needs_default_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value
+  >::type
+    default_construct_range(Allocator &a, Pointer p, Size n)
+{
+  thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, construct1_via_allocator<Allocator>(a));
+}
+
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  typename disable_if<
+    needs_default_construct_via_allocator<
+      Allocator,
+      typename pointer_element<Pointer>::type
+    >::value
+  >::type
+    default_construct_range(Allocator &a, Pointer p, Size n)
+{
+  thrust::uninitialized_fill_n(allocator_system<Allocator>::get(a), p, n, typename pointer_element<Pointer>::type());
+}
+
+
+} // end allocator_traits_detail
+
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  void default_construct_range(Allocator &a, Pointer p, Size n)
+{
+  return allocator_traits_detail::default_construct_range(a,p,n);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/destroy_range.h b/thrust/thrust/detail/allocator/destroy_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf00037cecb06d17aef1125138fdfcbbcc242655
--- /dev/null
+++ b/thrust/thrust/detail/allocator/destroy_range.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  inline void destroy_range(Allocator &a, Pointer p, Size n);
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/destroy_range.inl>
+
diff --git a/thrust/thrust/detail/allocator/destroy_range.inl b/thrust/thrust/detail/allocator/destroy_range.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f34159dc3d4184743f52f28f97dcb5696a830755
--- /dev/null
+++ b/thrust/thrust/detail/allocator/destroy_range.inl
@@ -0,0 +1,164 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/allocator/destroy_range.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/for_each.h>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace allocator_traits_detail
+{
+
+
+// destroy_range has three cases:
+// if Allocator has an effectful member function destroy:
+//   1. destroy via the allocator
+// else
+//   2. if T has a non-trivial destructor, destroy the range without using the allocator
+//   3. if T has a trivial destructor, do a no-op
+
+template<typename Allocator, typename T>
+  struct has_effectful_member_destroy
+    : has_member_destroy<Allocator,T>
+{};
+
+// std::allocator::destroy's only effect is to invoke its argument's destructor
+template<typename U, typename T>
+  struct has_effectful_member_destroy<std::allocator<U>, T>
+    : thrust::detail::false_type
+{};
+
+// case 1: Allocator has an effectful 1-argument member function "destroy"
+template<typename Allocator, typename Pointer>
+  struct enable_if_destroy_range_case1
+    : thrust::detail::enable_if<
+        has_effectful_member_destroy<
+          Allocator,
+          typename pointer_element<Pointer>::type
+        >::value
+      >
+{};
+
+// case 2: Allocator has no member function "destroy", but T has a non-trivial destructor
+template<typename Allocator, typename Pointer>
+  struct enable_if_destroy_range_case2
+    : thrust::detail::enable_if<
+        !has_effectful_member_destroy<
+          Allocator,
+          typename pointer_element<Pointer>::type
+        >::value &&
+        !has_trivial_destructor<
+          typename pointer_element<Pointer>::type
+        >::value
+      >
+{};
+
+// case 3: Allocator has no member function "destroy", and T has a trivial destructor
+template<typename Allocator, typename Pointer>
+  struct enable_if_destroy_range_case3
+    : thrust::detail::enable_if<
+        !has_effectful_member_destroy<
+          Allocator,
+          typename pointer_element<Pointer>::type
+        >::value &&
+        has_trivial_destructor<
+          typename pointer_element<Pointer>::type
+        >::value
+      >
+{};
+
+
+
+template<typename Allocator>
+  struct destroy_via_allocator
+{
+  Allocator &a;
+
+  __host__ __device__
+  destroy_via_allocator(Allocator &a)
+    : a(a)
+  {}
+
+  template<typename T>
+  inline __host__ __device__
+  void operator()(T &x)
+  {
+    allocator_traits<Allocator>::destroy(a, &x);
+  }
+};
+
+
+// destroy_range case 1: destroy via allocator
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  typename enable_if_destroy_range_case1<Allocator,Pointer>::type
+    destroy_range(Allocator &a, Pointer p, Size n)
+{
+  thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, destroy_via_allocator<Allocator>(a));
+}
+
+
+// we must prepare for His coming
+struct gozer
+{
+  __thrust_exec_check_disable__
+  template<typename T>
+  inline __host__ __device__
+  void operator()(T &x)
+  {
+    x.~T();
+  }
+};
+
+// destroy_range case 2: destroy without the allocator
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  typename enable_if_destroy_range_case2<Allocator,Pointer>::type
+    destroy_range(Allocator &a, Pointer p, Size n)
+{
+  thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, gozer());
+}
+
+
+// destroy_range case 3: no-op
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  typename enable_if_destroy_range_case3<Allocator,Pointer>::type
+    destroy_range(Allocator &, Pointer, Size)
+{
+  // no op
+}
+
+
+} // end allocator_traits_detail
+
+
+template<typename Allocator, typename Pointer, typename Size>
+__host__ __device__
+  void destroy_range(Allocator &a, Pointer p, Size n)
+{
+  return allocator_traits_detail::destroy_range(a,p,n);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/fill_construct_range.h b/thrust/thrust/detail/allocator/fill_construct_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..9de0f7bcbb86b8ed895ca597d75242578ce125f5
--- /dev/null
+++ b/thrust/thrust/detail/allocator/fill_construct_range.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename Allocator, typename Pointer, typename Size, typename T>
+__host__ __device__
+inline void fill_construct_range(Allocator &a, Pointer p, Size n, const T &value);
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/fill_construct_range.inl>
+
diff --git a/thrust/thrust/detail/allocator/fill_construct_range.inl b/thrust/thrust/detail/allocator/fill_construct_range.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7f2adafc7f9be151758480acea852dd0e0646f6d
--- /dev/null
+++ b/thrust/thrust/detail/allocator/fill_construct_range.inl
@@ -0,0 +1,113 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/for_each.h>
+#include <thrust/uninitialized_fill.h>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace allocator_traits_detail
+{
+
+// fill_construct_range has 2 cases:
+// if Allocator has an effectful member function construct:
+//   1. construct via the allocator
+// else
+//   2. construct via uninitialized_fill
+
+template<typename Allocator, typename T, typename Arg1>
+  struct has_effectful_member_construct2
+    : has_member_construct2<Allocator,T,Arg1>
+{};
+
+// std::allocator::construct's only effect is to invoke placement new
+template<typename U, typename T, typename Arg1>
+  struct has_effectful_member_construct2<std::allocator<U>,T,Arg1>
+    : thrust::detail::false_type
+{};
+
+
+template<typename Allocator, typename Arg1>
+  struct construct2_via_allocator
+{
+  Allocator &a;
+  Arg1 arg;
+
+  __host__ __device__
+  construct2_via_allocator(Allocator &a, const Arg1 &arg)
+    : a(a), arg(arg)
+  {}
+
+  template<typename T>
+  inline __host__ __device__
+  void operator()(T &x)
+  {
+    allocator_traits<Allocator>::construct(a, &x, arg);
+  }
+};
+
+
+template<typename Allocator, typename Pointer, typename Size, typename T>
+__host__ __device__
+  typename enable_if<
+    has_effectful_member_construct2<
+      Allocator,
+      typename pointer_element<Pointer>::type,
+      T
+    >::value
+  >::type
+    fill_construct_range(Allocator &a, Pointer p, Size n, const T &value)
+{
+  thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, construct2_via_allocator<Allocator,T>(a, value));
+}
+
+
+template<typename Allocator, typename Pointer, typename Size, typename T>
+__host__ __device__
+  typename disable_if<
+    has_effectful_member_construct2<
+      Allocator,
+      typename pointer_element<Pointer>::type,
+      T
+    >::value
+  >::type
+    fill_construct_range(Allocator &a, Pointer p, Size n, const T &value)
+{
+  thrust::uninitialized_fill_n(allocator_system<Allocator>::get(a), p, n, value);
+}
+
+
+} // end allocator_traits_detail
+
+
+template<typename Alloc, typename Pointer, typename Size, typename T>
+__host__ __device__
+  void fill_construct_range(Alloc &a, Pointer p, Size n, const T &value)
+{
+  return allocator_traits_detail::fill_construct_range(a,p,n,value);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/malloc_allocator.h b/thrust/thrust/detail/allocator/malloc_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c01c66bd0a8e5f6f689c580e6d79df5d4e3a45c
--- /dev/null
+++ b/thrust/thrust/detail/allocator/malloc_allocator.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/allocator/tagged_allocator.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename T, typename System, typename Pointer>
+  class malloc_allocator
+    : public thrust::detail::tagged_allocator<
+               T, System, Pointer
+             >
+{
+  private:
+    typedef thrust::detail::tagged_allocator<
+      T, System, Pointer
+    > super_t;
+
+  public:
+    typedef typename super_t::pointer   pointer;
+    typedef typename super_t::size_type size_type;
+
+    pointer allocate(size_type cnt);
+
+    void deallocate(pointer p, size_type n);
+};
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/malloc_allocator.inl>
+
diff --git a/thrust/thrust/detail/allocator/malloc_allocator.inl b/thrust/thrust/detail/allocator/malloc_allocator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e7b7503ba8df998a72b70ce2c80c57a0861b3203
--- /dev/null
+++ b/thrust/thrust/detail/allocator/malloc_allocator.inl
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/malloc_allocator.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/bad_alloc.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/malloc_and_free.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename T, typename System, typename Pointer>
+  typename malloc_allocator<T,System,Pointer>::pointer
+    malloc_allocator<T,System,Pointer>
+      ::allocate(typename malloc_allocator<T,System,Pointer>::size_type cnt)
+{
+  using thrust::system::detail::generic::select_system;
+
+  // XXX should use a hypothetical thrust::static_pointer_cast here
+  System system;
+
+  pointer result = thrust::malloc<T>(select_system(system), cnt);
+
+  if(result.get() == 0)
+  {
+    throw thrust::system::detail::bad_alloc("malloc_allocator::allocate: malloc failed");
+  } // end if
+
+  return result;
+} // end malloc_allocator::allocate()
+
+
+template<typename T, typename System, typename Pointer>
+  void malloc_allocator<T,System,Pointer>
+    ::deallocate(typename malloc_allocator<T,System,Pointer>::pointer p, typename malloc_allocator<T,System,Pointer>::size_type)
+{
+  using thrust::system::detail::generic::select_system;
+
+  System system;
+  thrust::free(select_system(system), p);
+} // end malloc_allocator
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/no_throw_allocator.h b/thrust/thrust/detail/allocator/no_throw_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba8c3d852988e9add8659236293a424682701489
--- /dev/null
+++ b/thrust/thrust/detail/allocator/no_throw_allocator.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename BaseAllocator>
+  struct no_throw_allocator : BaseAllocator
+{
+  private:
+    typedef BaseAllocator super_t;
+  
+  public:
+    inline __host__ __device__
+    no_throw_allocator(const BaseAllocator &other = BaseAllocator())
+      : super_t(other)
+    {}
+
+    template<typename U>
+      struct rebind
+    {
+      typedef no_throw_allocator<typename super_t::template rebind<U>::other> other;
+    }; // end rebind
+
+    __host__ __device__
+    void deallocate(typename super_t::pointer p, typename super_t::size_type n)
+    {
+#ifndef __CUDA_ARCH__
+      try
+      {
+        super_t::deallocate(p, n);
+      } // end try
+      catch(...)
+      {
+        // catch anything
+      } // end catch
+#else
+      super_t::deallocate(p, n);
+#endif
+    } // end deallocate()
+
+    inline __host__ __device__
+    bool operator==(no_throw_allocator const &other) { return super_t::operator==(other); }
+
+    inline __host__ __device__
+    bool operator!=(no_throw_allocator const &other) { return super_t::operator!=(other); }
+}; // end no_throw_allocator
+
+} // end detail
+} // end thrust
+
+
diff --git a/thrust/thrust/detail/allocator/tagged_allocator.h b/thrust/thrust/detail/allocator/tagged_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..a29115c6c1bf37de7b1515b2485620335cc3b473
--- /dev/null
+++ b/thrust/thrust/detail/allocator/tagged_allocator.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename T, typename Tag, typename Pointer> class tagged_allocator;
+
+template<typename Tag, typename Pointer>
+  class tagged_allocator<void, Tag, Pointer>
+{
+  public:
+    typedef void                                                                                 value_type;
+    typedef typename thrust::detail::pointer_traits<Pointer>::template rebind<void>::other       pointer;
+    typedef typename thrust::detail::pointer_traits<Pointer>::template rebind<const void>::other const_pointer;
+    typedef std::size_t                                                                          size_type;
+    typedef typename thrust::detail::pointer_traits<Pointer>::difference_type                    difference_type;
+    typedef Tag                                                                                  system_type;
+
+    template<typename U>
+      struct rebind
+    {
+      typedef tagged_allocator<U,Tag,Pointer> other;
+    }; // end rebind
+};
+
+template<typename T, typename Tag, typename Pointer>
+  class tagged_allocator
+{
+  public:
+    typedef T                                                                                 value_type;
+    typedef typename thrust::detail::pointer_traits<Pointer>::template rebind<T>::other       pointer;
+    typedef typename thrust::detail::pointer_traits<Pointer>::template rebind<const T>::other const_pointer;
+    typedef typename thrust::iterator_reference<pointer>::type                                reference;
+    typedef typename thrust::iterator_reference<const_pointer>::type                          const_reference;
+    typedef std::size_t                                                                       size_type;
+    typedef typename thrust::detail::pointer_traits<pointer>::difference_type                 difference_type;
+    typedef Tag                                                                               system_type;
+
+    template<typename U>
+      struct rebind
+    {
+      typedef tagged_allocator<U,Tag,Pointer> other;
+    }; // end rebind
+
+    __host__ __device__
+    inline tagged_allocator();
+
+    __host__ __device__
+    inline tagged_allocator(const tagged_allocator &);
+
+    template<typename U, typename OtherPointer>
+    __host__ __device__
+    inline tagged_allocator(const tagged_allocator<U, Tag, OtherPointer> &);
+
+    __host__ __device__
+    inline ~tagged_allocator();
+
+    __host__ __device__
+    pointer address(reference x) const;
+
+    __host__ __device__
+    const_pointer address(const_reference x) const;
+
+    size_type max_size() const;
+};
+
+template<typename T1, typename Pointer1, typename T2, typename Pointer2, typename Tag>
+__host__ __device__
+bool operator==(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &);
+
+template<typename T1, typename Pointer1, typename T2, typename Pointer2, typename Tag>
+__host__ __device__
+bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &);
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/tagged_allocator.inl>
+
diff --git a/thrust/thrust/detail/allocator/tagged_allocator.inl b/thrust/thrust/detail/allocator/tagged_allocator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5f4ed95968108420f5229567e2a41f9a7c7a6bbb
--- /dev/null
+++ b/thrust/thrust/detail/allocator/tagged_allocator.inl
@@ -0,0 +1,103 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/tagged_allocator.h>
+#include <limits>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
+  tagged_allocator<T,Tag,Pointer>
+    ::tagged_allocator()
+{}
+
+
+template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
+  tagged_allocator<T,Tag,Pointer>
+    ::tagged_allocator(const tagged_allocator<T,Tag,Pointer> &)
+{}
+
+
+template<typename T, typename Tag, typename Pointer>
+  template<typename U, typename OtherPointer>
+    __host__ __device__
+    tagged_allocator<T,Tag,Pointer>
+      ::tagged_allocator(const tagged_allocator<U,Tag,OtherPointer> &)
+{}
+
+
+template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
+  tagged_allocator<T,Tag,Pointer>
+    ::~tagged_allocator()
+{}
+
+
+template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
+  typename tagged_allocator<T,Tag,Pointer>::pointer
+    tagged_allocator<T,Tag,Pointer>
+      ::address(reference x) const
+{
+  return &x;
+}
+
+
+template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
+  typename tagged_allocator<T,Tag,Pointer>::const_pointer
+    tagged_allocator<T,Tag,Pointer>
+      ::address(const_reference x) const
+{
+  return &x;
+}
+
+
+template<typename T, typename Tag, typename Pointer>
+  typename tagged_allocator<T,Tag,Pointer>::size_type
+    tagged_allocator<T,Tag,Pointer>
+      ::max_size() const
+{
+  return (std::numeric_limits<size_type>::max)() / sizeof(T);
+}
+
+
+template<typename T1, typename Pointer1, typename T2, typename Pointer2, typename Tag>
+__host__ __device__
+bool operator==(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &)
+{
+  return true;
+}
+
+
+template<typename T1, typename Pointer1, typename T2, typename Pointer2, typename Tag>
+__host__ __device__
+bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &)
+{
+  return false;
+}
+    
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator/temporary_allocator.h b/thrust/thrust/detail/allocator/temporary_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d2ac429c9b32a05e6470e5e38def2c1ada43efa
--- /dev/null
+++ b/thrust/thrust/detail/allocator/temporary_allocator.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/tagged_allocator.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/pair.h>
+#include <thrust/memory.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// XXX the pointer parameter given to tagged_allocator should be related to
+//     the type of the expression get_temporary_buffer(system, n).first
+//     without decltype, compromise on pointer<T,System>
+template<typename T, typename System>
+  class temporary_allocator
+    : public thrust::detail::tagged_allocator<
+               T, System, thrust::pointer<T,System>
+             >
+{
+  private:
+    typedef thrust::detail::tagged_allocator<
+      T, System, thrust::pointer<T,System>
+    > super_t;
+
+    System &m_system;
+
+  public:
+    typedef typename super_t::pointer   pointer;
+    typedef typename super_t::size_type size_type;
+
+    inline __host__ __device__
+    temporary_allocator(const temporary_allocator &other) :
+      super_t(),
+      m_system(other.m_system)
+    {}
+
+    inline __host__ __device__
+    explicit temporary_allocator(thrust::execution_policy<System> &system) :
+      super_t(),
+      m_system(thrust::detail::derived_cast(system))
+    {}
+
+    __host__ __device__
+    pointer allocate(size_type cnt);
+
+    __host__ __device__
+    void deallocate(pointer p, size_type n);
+
+    __host__ __device__
+    inline System &system()
+    {
+      return m_system;
+    } // end system()
+
+  private:
+    typedef thrust::pair<pointer, size_type> pointer_and_size;
+}; // end temporary_allocator
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/allocator/temporary_allocator.inl>
+
diff --git a/thrust/thrust/detail/allocator/temporary_allocator.inl b/thrust/thrust/detail/allocator/temporary_allocator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..673ed272f99080cb17aad245d44e9fee263aaddb
--- /dev/null
+++ b/thrust/thrust/detail/allocator/temporary_allocator.inl
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator/temporary_allocator.h>
+#include <thrust/detail/temporary_buffer.h>
+#include <thrust/system/detail/bad_alloc.h>
+#include <cassert>
+
+#if (defined(__NVCOMPILER_CUDA__) || defined(__CUDA_ARCH__)) && \
+    THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <thrust/system/cuda/detail/terminate.h>
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename T, typename System>
+__host__ __device__
+  typename temporary_allocator<T,System>::pointer
+    temporary_allocator<T,System>
+      ::allocate(typename temporary_allocator<T,System>::size_type cnt)
+{
+  pointer_and_size result = thrust::get_temporary_buffer<T>(system(), cnt);
+
+  // handle failure
+  if(result.second < cnt)
+  {
+    // deallocate and throw
+    // note that we pass cnt to deallocate, not a value derived from result.second
+    deallocate(result.first, cnt);
+
+    if (THRUST_IS_HOST_CODE) {
+      #if THRUST_INCLUDE_HOST_CODE
+        throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+      #endif
+    } else {
+      #if THRUST_INCLUDE_DEVICE_CODE && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+      #endif
+    }
+  } // end if
+
+  return result.first;
+} // end temporary_allocator::allocate()
+
+
+template<typename T, typename System>
+__host__ __device__
+  void temporary_allocator<T,System>
+    ::deallocate(typename temporary_allocator<T,System>::pointer p, typename temporary_allocator<T,System>::size_type n)
+{
+  return thrust::return_temporary_buffer(system(), p, n);
+} // end temporary_allocator
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/allocator_aware_execution_policy.h b/thrust/thrust/detail/allocator_aware_execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..28fd54f9b73d45ba01e10797e392151e70de690c
--- /dev/null
+++ b/thrust/thrust/detail/allocator_aware_execution_policy.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execute_with_allocator_fwd.h>
+#include <thrust/detail/alignment.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <type_traits>
+#endif
+
+namespace thrust
+{
+
+namespace mr
+{
+
+template<typename T, class MR>
+class allocator;
+
+}
+
+namespace detail
+{
+
+template<template <typename> class ExecutionPolicyCRTPBase>
+struct allocator_aware_execution_policy
+{
+  template<typename MemoryResource>
+  struct execute_with_memory_resource_type
+  {
+    typedef thrust::detail::execute_with_allocator<
+      thrust::mr::allocator<
+        thrust::detail::max_align_t,
+        MemoryResource
+      >,
+      ExecutionPolicyCRTPBase
+    > type;
+  };
+
+  template<typename Allocator>
+  struct execute_with_allocator_type
+  {
+      typedef thrust::detail::execute_with_allocator<
+        Allocator,
+        ExecutionPolicyCRTPBase
+      > type;
+  };
+
+  template<typename MemoryResource>
+    typename execute_with_memory_resource_type<MemoryResource>::type
+      operator()(MemoryResource * mem_res) const
+  {
+    return typename execute_with_memory_resource_type<MemoryResource>::type(mem_res);
+  }
+
+  template<typename Allocator>
+    typename execute_with_allocator_type<Allocator&>::type
+      operator()(Allocator &alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator&>::type(alloc);
+  }
+
+  template<typename Allocator>
+    typename execute_with_allocator_type<Allocator>::type
+      operator()(const Allocator &alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator>::type(alloc);
+  }
+
+#if THRUST_CPP_DIALECT >= 2011
+  // just the rvalue overload
+  // perfect forwarding doesn't help, because a const reference has to be turned
+  // into a value by copying for the purpose of storing it in execute_with_allocator
+  template<typename Allocator,
+      typename std::enable_if<!std::is_lvalue_reference<Allocator>::value>::type * = nullptr>
+    typename execute_with_allocator_type<Allocator>::type
+      operator()(Allocator &&alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator>::type(std::move(alloc));
+  }
+#endif
+};
+
+}
+}
diff --git a/thrust/thrust/detail/binary_search.inl b/thrust/thrust/detail/binary_search.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5703226dc825ade5c0ee6d4dc277c1fd32674ffa
--- /dev/null
+++ b/thrust/thrust/detail/binary_search.inl
@@ -0,0 +1,486 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file binary_search.inl
+ *  \brief Inline file for binary_search.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/binary_search.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/binary_search.h>
+#include <thrust/system/detail/adl/binary_search.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const LessThanComparable &value)
+{
+    using thrust::system::detail::generic::lower_bound;
+    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+}
+
+
+__thrust_exec_check_disable__ 
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T &value,
+                            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::lower_bound;
+    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const LessThanComparable &value)
+{
+    using thrust::system::detail::generic::upper_bound;
+    return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+}
+
+
+__thrust_exec_check_disable__ 
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T &value,
+                            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::upper_bound;
+    return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   ForwardIterator first, 
+                   ForwardIterator last,
+                   const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::binary_search;
+    return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   ForwardIterator first,
+                   ForwardIterator last,
+                   const T& value, 
+                   StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::binary_search;
+    return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T& value,
+            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::equal_range;
+    return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::equal_range;
+    return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output)
+{
+    using thrust::system::detail::generic::lower_bound;
+    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::lower_bound;
+    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output)
+{
+    using thrust::system::detail::generic::upper_bound;
+    return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::upper_bound;
+    return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator output)
+{
+    using thrust::system::detail::generic::binary_search;
+    return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output);
+}
+
+
+__thrust_exec_check_disable__ 
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator output,
+                             StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::binary_search;
+    return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp);
+}
+
+
+//////////////////////
+// Scalar Functions //
+//////////////////////
+
+template <typename ForwardIterator, typename LessThanComparable>
+ForwardIterator lower_bound(ForwardIterator first, 
+                            ForwardIterator last,
+                            const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+
+    System system;
+
+    return thrust::lower_bound(select_system(system), first, last, value);
+}
+
+template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
+ForwardIterator lower_bound(ForwardIterator first,
+                            ForwardIterator last,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+
+    System system;
+
+    return thrust::lower_bound(select_system(system), first, last, value, comp);
+}
+
+template <typename ForwardIterator, typename LessThanComparable>
+ForwardIterator upper_bound(ForwardIterator first, 
+                            ForwardIterator last,
+                            const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::upper_bound(select_system(system), first, last, value);
+}
+
+template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
+ForwardIterator upper_bound(ForwardIterator first,
+                            ForwardIterator last,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::upper_bound(select_system(system), first, last, value, comp);
+}
+
+template <typename ForwardIterator, typename LessThanComparable>
+bool binary_search(ForwardIterator first, 
+                   ForwardIterator last,
+                   const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::binary_search(select_system(system), first, last, value);
+}
+
+template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
+bool binary_search(ForwardIterator first,
+                   ForwardIterator last,
+                   const T& value, 
+                   StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::binary_search(select_system(system), first, last, value, comp);
+}
+
+template <typename ForwardIterator, typename LessThanComparable>
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable& value)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::equal_range(select_system(system), first, last, value);
+}
+
+template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
+thrust::pair<ForwardIterator, ForwardIterator>
+equal_range(ForwardIterator first,
+            ForwardIterator last,
+            const T& value,
+            StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+    System system;
+
+    return thrust::equal_range(select_system(system), first, last, value, comp);
+}
+
+//////////////////////
+// Vector Functions //
+//////////////////////
+
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator lower_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output);
+}
+
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+OutputIterator lower_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
+}
+    
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator upper_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output);
+}
+
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+OutputIterator upper_bound(ForwardIterator first, 
+                           ForwardIterator last,
+                           InputIterator values_first, 
+                           InputIterator values_last,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
+}
+
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator binary_search(ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator output)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output);
+}
+
+template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+OutputIterator binary_search(ForwardIterator first, 
+                             ForwardIterator last,
+                             InputIterator values_first, 
+                             InputIterator values_last,
+                             OutputIterator output,
+                             StrictWeakOrdering comp)
+{
+    using thrust::system::detail::generic::select_system;
+
+    typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+    typedef typename thrust::iterator_system<InputIterator>::type   System2;
+    typedef typename thrust::iterator_system<OutputIterator>::type  System3;
+
+    System1 system1;
+    System2 system2;
+    System3 system3;
+
+    return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
+}
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/caching_allocator.h b/thrust/thrust/detail/caching_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb98f815f70aae67f1a89c98b548b420331c1062
--- /dev/null
+++ b/thrust/thrust/detail/caching_allocator.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/disjoint_tls_pool.h>
+#include <thrust/mr/new.h>
+#include <thrust/memory/detail/device_system_resource.h>
+
+namespace thrust
+{
+namespace detail
+{
+inline
+thrust::mr::allocator<
+    char,
+    thrust::mr::disjoint_unsynchronized_pool_resource<
+        thrust::device_memory_resource,
+        thrust::mr::new_delete_resource
+    >
+> single_device_tls_caching_allocator()
+{
+    return {
+        &thrust::mr::tls_disjoint_pool(
+            thrust::mr::get_global_resource<thrust::device_memory_resource>(),
+            thrust::mr::get_global_resource<thrust::mr::new_delete_resource>()
+        )
+    };
+}
+}
+}
diff --git a/thrust/thrust/detail/complex/arithmetic.h b/thrust/thrust/detail/complex/arithmetic.h
new file mode 100644
index 0000000000000000000000000000000000000000..448166e98b1e398e6762a9a6f2e8f399c8d375ff
--- /dev/null
+++ b/thrust/thrust/detail/complex/arithmetic.h
@@ -0,0 +1,300 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/complex.h>
+#include <cfloat>
+#include <cmath>
+#include <thrust/detail/complex/c99math.h>
+
+namespace thrust
+{
+
+  /* --- Binary Arithmetic Operators --- */
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() + y.real(), x.imag() + y.imag());
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() + y, x.imag());
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x + y.real(), y.imag());
+}
+
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() - y.real(), x.imag() - y.imag());
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() - y, x.imag());
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x - y.real(), -y.imag());
+}
+
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>( x.real() * y.real() - x.imag() * y.imag()
+			             , x.real() * y.imag() + x.imag() * y.real());
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() * y, x.imag() * y);
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x * y.real(), x * y.imag());
+}
+
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+
+  // Find `abs` by ADL.
+  using std::abs;
+
+  T s = abs(y.real()) + abs(y.imag());
+
+  T oos = T(1.0) / s;
+
+  T ars = x.real() * oos;
+  T ais = x.imag() * oos;
+  T brs = y.real() * oos;
+  T bis = y.imag() * oos;
+
+  s = (brs * brs) + (bis * bis);
+
+  oos = T(1.0) / s;
+
+  complex<T> quot( ((ars * brs) + (ais * bis)) * oos
+                 , ((ais * brs) - (ars * bis)) * oos);
+  return quot;
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() / y, x.imag() / y);
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x) / y;
+}
+
+
+
+/* --- Unary Arithmetic Operators --- */
+
+template <typename T> 
+__host__ __device__
+complex<T> operator+(const complex<T>& y)
+{
+  return y;
+}
+
+template <typename T> 
+__host__ __device__
+complex<T> operator-(const complex<T>& y)
+{
+  return y * -T(1);
+}
+
+
+/* --- Other Basic Arithmetic Functions --- */
+
+// As std::hypot is only C++11 we have to use the C interface
+template <typename T>
+__host__ __device__
+T abs(const complex<T>& z)
+{
+  return hypot(z.real(), z.imag());
+}
+
+// XXX Why are we specializing here?
+namespace detail {
+namespace complex {	
+
+__host__ __device__
+inline float abs(const thrust::complex<float>& z)
+{
+  return hypotf(z.real(),z.imag());
+}
+
+__host__ __device__
+inline double abs(const thrust::complex<double>& z)
+{
+  return hypot(z.real(),z.imag());
+}
+
+} // end namespace complex
+} // end namespace detail
+
+template <>
+__host__ __device__
+inline float abs(const complex<float>& z)
+{
+  return detail::complex::abs(z);
+}
+
+template <>
+__host__ __device__
+inline double abs(const complex<double>& z)
+{
+  return detail::complex::abs(z);
+}
+
+
+template <typename T>
+__host__ __device__
+T arg(const complex<T>& z)
+{
+  // Find `atan2` by ADL.
+  using std::atan2;
+  return atan2(z.imag(), z.real());
+}
+
+
+template <typename T>
+__host__ __device__
+complex<T> conj(const complex<T>& z)
+{
+  return complex<T>(z.real(), -z.imag());
+}
+
+
+template <typename T>
+__host__ __device__
+T norm(const complex<T>& z)
+{
+  return z.real() * z.real() + z.imag() * z.imag();
+}
+
+// XXX Why specialize these, we could just rely on ADL.
+template <>
+__host__ __device__
+inline float norm(const complex<float>& z)
+{
+  // Find `abs` and `sqrt` by ADL.
+  using std::abs;
+  using std::sqrt;
+
+  if (abs(z.real()) < sqrt(FLT_MIN) && abs(z.imag()) < sqrt(FLT_MIN))
+  {
+    float a = z.real() * 4.0f;
+    float b = z.imag() * 4.0f;
+    return (a * a + b * b) / 16.0f;
+  } 
+
+  return z.real() * z.real() + z.imag() * z.imag();
+}
+
+template <>
+__host__ __device__
+inline double norm(const complex<double>& z)
+{
+  // Find `abs` and `sqrt` by ADL.
+  using std::abs;
+  using std::sqrt;
+
+  if (abs(z.real()) < sqrt(DBL_MIN) && abs(z.imag()) < sqrt(DBL_MIN))
+  {
+    double a = z.real() * 4.0;
+    double b = z.imag() * 4.0;
+    return (a * a + b * b) / 16.0;
+  } 
+
+  return z.real() * z.real() + z.imag() * z.imag();
+}
+
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+polar(const T0& m, const T1& theta)
+{ 
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+
+  // Find `cos` and `sin` by ADL.
+  using std::cos;
+  using std::sin;
+
+  return complex<T>(m * cos(theta), m * sin(theta));
+}
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/complex/c99math.h b/thrust/thrust/detail/complex/c99math.h
new file mode 100644
index 0000000000000000000000000000000000000000..7609ccf993c18c481b8582f3384d82a89124b2ab
--- /dev/null
+++ b/thrust/thrust/detail/complex/c99math.h
@@ -0,0 +1,196 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+#pragma once
+
+#include <math.h>
+#include <cmath>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace complex
+{
+
+// Define basic arithmetic functions so we can use them without explicit scope
+// keeping the code as close as possible to FreeBSDs for ease of maintenance.
+// It also provides an easy way to support compilers with missing C99 functions.
+// When possible, just use the names in the global scope.
+// Some platforms define these as macros, others as free functions.
+// Avoid using the std:: form of these as nvcc may treat std::foo() as __host__ functions.
+
+using ::log;
+using ::acos;
+using ::asin;
+using ::sqrt;
+using ::sinh;
+using ::tan;
+using ::cos;
+using ::sin;
+using ::exp;
+using ::cosh;
+using ::atan;
+
+template <typename T>
+inline __host__ __device__ T infinity();
+
+template <>
+inline __host__ __device__ float infinity<float>()
+{
+  float res;
+  set_float_word(res, 0x7f800000);
+  return res;
+}
+
+
+template <>
+inline __host__ __device__ double infinity<double>()
+{
+  double res;
+  insert_words(res, 0x7ff00000,0);
+  return res;
+}
+
+#if defined _MSC_VER
+__host__ __device__ inline int isinf(float x){
+  return std::abs(x) == infinity<float>();
+}
+
+__host__ __device__ inline int isinf(double x){
+  return std::abs(x) == infinity<double>();
+}
+
+__host__ __device__ inline int isnan(float x){
+  return x != x;
+}
+
+__host__ __device__ inline int isnan(double x){
+  return x != x;
+}
+
+__host__ __device__ inline int signbit(float x){
+  return (*((uint32_t *)&x)) & 0x80000000;
+}
+
+__host__ __device__ inline int signbit(double x){
+  return (*((uint32_t *)&x)) & 0x80000000;
+}
+
+__host__ __device__ inline int isfinite(float x){
+  return !isnan(x) && !isinf(x);
+}
+
+__host__ __device__ inline int isfinite(double x){
+  return !isnan(x) && !isinf(x);
+}
+
+#else
+
+#  if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(__NVCOMPILER_CUDA__)
+// NVCC implements at least some signature of these as functions not macros.
+using ::isinf;
+using ::isnan;
+using ::signbit;
+using ::isfinite;
+#  else
+// Some compilers do not provide these in the global scope, because they are
+// supposed to be macros. The versions in `std` are supposed to be functions.
+// Since we're not compiling with nvcc, it's safe to use the functions in std::
+using std::isinf;
+using std::isnan;
+using std::signbit;
+using std::isfinite;
+#  endif // __CUDACC__
+#endif // _MSC_VER
+
+using ::atanh;
+
+#if defined _MSC_VER
+
+__host__ __device__ inline double copysign(double x, double y){
+  uint32_t hx,hy;
+  get_high_word(hx,x);
+  get_high_word(hy,y);
+  set_high_word(x,(hx&0x7fffffff)|(hy&0x80000000));
+  return x;
+}
+
+__host__ __device__ inline float copysignf(float x, float y){
+  uint32_t ix,iy;
+  get_float_word(ix,x);
+  get_float_word(iy,y);
+  set_float_word(x,(ix&0x7fffffff)|(iy&0x80000000));
+  return x;
+}
+
+
+
+#ifndef __CUDACC__
+
+// Simple approximation to log1p as Visual Studio is lacking one
+inline double log1p(double x){
+  double u = 1.0+x;
+  if(u == 1.0){
+    return x;
+  }else{
+    if(u > 2.0){
+      // Use normal log for large arguments
+      return log(u);
+    }else{
+      return log(u)*(x/(u-1.0));
+    }
+  }
+}
+
+inline float log1pf(float x){
+  float u = 1.0f+x;
+  if(u == 1.0f){
+    return x;
+  }else{
+    if(u > 2.0f){
+      // Use normal log for large arguments
+      return logf(u);
+    }else{
+      return logf(u)*(x/(u-1.0f));
+    }
+  }
+}
+
+#if _MSV_VER <= 1500
+#include <complex>
+
+inline float hypotf(float x, float y){
+	return abs(std::complex<float>(x,y));
+}
+
+inline double hypot(double x, double y){
+	return _hypot(x,y);
+}
+
+#endif // _MSC_VER <= 1500
+
+#endif // __CUDACC__
+
+#endif // _MSC_VER
+
+} // namespace complex
+
+} // namespace detail
+
+} // namespace thrust
+
diff --git a/thrust/thrust/detail/complex/catrig.h b/thrust/thrust/detail/complex/catrig.h
new file mode 100644
index 0000000000000000000000000000000000000000..6549fbb2eea699078da00b5d93e346ac93c6f73e
--- /dev/null
+++ b/thrust/thrust/detail/complex/catrig.h
@@ -0,0 +1,785 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *    freebsd/lib/msun/src/catrig.c
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cfloat>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__
+inline void raise_inexact(){
+  const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */ 
+  // needs the volatile to prevent compiler from ignoring it
+  volatile float junk = 1 + tiny;
+  (void)junk;
+}
+
+__host__ __device__ inline complex<double> clog_for_large_values(complex<double> z);
+  
+/*
+ * Testing indicates that all these functions are accurate up to 4 ULP.
+ * The functions casin(h) and cacos(h) are about 2.5 times slower than asinh.
+ * The functions catan(h) are a little under 2 times slower than atanh.
+ *
+ * The code for casinh, casin, cacos, and cacosh comes first.  The code is
+ * rather complicated, and the four functions are highly interdependent.
+ *
+ * The code for catanh and catan comes at the end.  It is much simpler than
+ * the other functions, and the code for these can be disconnected from the
+ * rest of the code.
+ */
+
+/*
+ *			================================
+ *			| casinh, casin, cacos, cacosh |
+ *			================================
+ */
+
+/*
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * Throughout we use the convention z = x + I*y.
+ *
+ * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
+ * where
+ * A = (|z+I| + |z-I|) / 2
+ * B = (|z+I| - |z-I|) / 2 = y/A
+ *
+ * These formulas become numerically unstable:
+ *   (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that
+ *       is, Re(casinh(z)) is close to 0);
+ *   (b) for Im(casinh(z)) when z is close to either of the intervals
+ *       [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is
+ *       close to PI/2).
+ *
+ * These numerical problems are overcome by defining
+ * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
+ * Then if A < A_crossover, we use
+ *   log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
+ *   A-1 = f(x, 1+y) + f(x, 1-y)
+ * and if B > B_crossover, we use
+ *   asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y)))
+ *   A-y = f(x, y+1) + f(x, y-1)
+ * where without loss of generality we have assumed that x and y are
+ * non-negative.
+ *
+ * Much of the difficulty comes because the intermediate computations may
+ * produce overflows or underflows.  This is dealt with in the paper by Hull
+ * et al by using exception handling.  We do this by detecting when
+ * computations risk underflow or overflow.  The hardest part is handling the
+ * underflows when computing f(a, b).
+ *
+ * Note that the function f(a, b) does not appear explicitly in the paper by
+ * Hull et al, but the idea may be found on pages 308 and 309.  Introducing the
+ * function f(a, b) allows us to concentrate many of the clever tricks in this
+ * paper into one function.
+ */
+
+/*
+ * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
+ * Pass hypot(a, b) as the third argument.
+ */
+__host__ __device__
+inline double
+f(double a, double b, double hypot_a_b)
+{
+  if (b < 0)
+    return ((hypot_a_b - b) / 2);
+  if (b == 0)
+    return (a / 2);
+  return (a * a / (hypot_a_b + b) / 2);
+}
+  
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ */
+__host__ __device__
+inline void
+do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
+			double *sqrt_A2my2, double *new_y)
+{
+  double R, S, A; /* A, B, R, and S are as in Hull et al. */
+  double Am1, Amy; /* A-1, A-y. */
+  const double A_crossover = 10; /* Hull et al suggest 1.5, but 10 works better */
+  const double FOUR_SQRT_MIN = 5.966672584960165394632772e-154; /* =0x1p-509; >= 4 * sqrt(DBL_MIN) */
+  const double B_crossover = 0.6417; /* suggested by Hull et al */
+  
+  R = hypot(x, y + 1);		/* |z+I| */
+  S = hypot(x, y - 1);		/* |z-I| */
+  
+  /* A = (|z+I| + |z-I|) / 2 */
+  A = (R + S) / 2;
+  /*
+   * Mathematically A >= 1.  There is a small chance that this will not
+   * be so because of rounding errors.  So we will make certain it is
+   * so.
+   */
+  if (A < 1)
+    A = 1;
+  
+  if (A < A_crossover) {
+    /*
+     * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
+     * rx = log1p(Am1 + sqrt(Am1*(A+1)))
+     */
+    if (y == 1 && x < DBL_EPSILON * DBL_EPSILON / 128) {
+      /*
+       * fp is of order x^2, and fm = x/2.
+       * A = 1 (inexactly).
+       */
+      *rx = sqrt(x);
+    } else if (x >= DBL_EPSILON * fabs(y - 1)) {
+      /*
+       * Underflow will not occur because
+       * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN
+       */
+      Am1 = f(x, 1 + y, R) + f(x, 1 - y, S);
+      *rx = log1p(Am1 + sqrt(Am1 * (A + 1)));
+    } else if (y < 1) {
+      /*
+       * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
+       * A = 1 (inexactly).
+       */
+      *rx = x / sqrt((1 - y) * (1 + y));
+    } else {		/* if (y > 1) */
+      /*
+       * A-1 = y-1 (inexactly).
+       */
+      *rx = log1p((y - 1) + sqrt((y - 1) * (y + 1)));
+    }
+  } else {
+    *rx = log(A + sqrt(A * A - 1));
+  }
+  
+  *new_y = y;
+  
+  if (y < FOUR_SQRT_MIN) {
+    /*
+     * Avoid a possible underflow caused by y/A.  For casinh this
+     * would be legitimate, but will be picked up by invoking atan2
+     * later on.  For cacos this would not be legitimate.
+     */
+    *B_is_usable = 0;
+    *sqrt_A2my2 = A * (2 / DBL_EPSILON);
+    *new_y = y * (2 / DBL_EPSILON);
+    return;
+  }
+  
+  /* B = (|z+I| - |z-I|) / 2 = y/A */
+  *B = y / A;
+  *B_is_usable = 1;
+  
+  if (*B > B_crossover) {
+    *B_is_usable = 0;
+    /*
+     * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1).
+     * sqrt_A2my2 = sqrt(Amy*(A+y))
+     */
+    if (y == 1 && x < DBL_EPSILON / 128) {
+      /*
+       * fp is of order x^2, and fm = x/2.
+       * A = 1 (inexactly).
+       */
+      *sqrt_A2my2 = sqrt(x) * sqrt((A + y) / 2);
+    } else if (x >= DBL_EPSILON * fabs(y - 1)) {
+      /*
+       * Underflow will not occur because
+       * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN
+       * and
+       * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN
+       */
+      Amy = f(x, y + 1, R) + f(x, y - 1, S);
+      *sqrt_A2my2 = sqrt(Amy * (A + y));
+    } else if (y > 1) {
+      /*
+       * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
+       * A = y (inexactly).
+       *
+       * y < RECIP_EPSILON.  So the following
+       * scaling should avoid any underflow problems.
+       */
+      *sqrt_A2my2 = x * (4 / DBL_EPSILON / DBL_EPSILON) * y /
+	sqrt((y + 1) * (y - 1));
+      *new_y = y * (4 / DBL_EPSILON / DBL_EPSILON);
+    } else {		/* if (y < 1) */
+      /*
+       * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and
+       * A = 1 (inexactly).
+       */
+      *sqrt_A2my2 = sqrt((1 - y) * (1 + y));
+    }
+  }
+}
+  
+/*
+ * casinh(z) = z + O(z^3)   as z -> 0
+ *
+ * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2)   as z -> infinity
+ * The above formula works for the imaginary part as well, because
+ * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3)
+ *    as z -> infinity, uniformly in y
+ */
+__host__ __device__ inline
+complex<double> casinh(complex<double> z)
+{
+  double x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y;
+  int B_is_usable;
+  complex<double> w;
+  const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+  const double m_ln2 = 6.9314718055994531e-1; /*  0x162e42fefa39ef.0p-53 */
+  x = z.real();
+  y = z.imag();
+  ax = fabs(x);
+  ay = fabs(y);
+  
+  if (isnan(x) || isnan(y)) {
+    /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+    if (isinf(x))
+      return (complex<double>(x, y + y));
+    /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+    if (isinf(y))
+      return (complex<double>(y, x + x));
+    /* casinh(NaN + I*0) = NaN + I*0 */
+    if (y == 0)
+      return (complex<double>(x + x, y));
+    /*
+     * All other cases involving NaN return NaN + I*NaN.
+     * C99 leaves it optional whether to raise invalid if one of
+     * the arguments is not NaN, so we opt not to raise it.
+     */
+    return (complex<double>(x + 0.0 + (y + 0.0), x + 0.0 + (y + 0.0)));
+  }
+
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+    /* clog...() will raise inexact unless x or y is infinite. */
+    if (signbit(x) == 0)
+      w = clog_for_large_values(z) + m_ln2;
+    else
+      w = clog_for_large_values(-z) + m_ln2;
+    return (complex<double>(copysign(w.real(), x), copysign(w.imag(), y)));
+  }
+
+  /* Avoid spuriously raising inexact for z = 0. */
+  if (x == 0 && y == 0)
+    return (z);
+
+  /* All remaining cases are inexact. */
+  raise_inexact();
+
+  const double SQRT_6_EPSILON = 3.6500241499888571e-8; /*  0x13988e1409212e.0p-77 */
+  if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
+    return (z);
+
+  do_hard_work(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y);
+  if (B_is_usable)
+    ry = asin(B);
+  else
+    ry = atan2(new_y, sqrt_A2my2);
+  return (complex<double>(copysign(rx, x), copysign(ry, y)));
+}
+
+/*
+ * casin(z) = reverse(casinh(reverse(z)))
+ * where reverse(x + I*y) = y + I*x = I*conj(z).
+ */
+__host__ __device__ inline
+complex<double> casin(complex<double> z)
+{
+  complex<double> w = casinh(complex<double>(z.imag(), z.real()));
+  
+  return (complex<double>(w.imag(), w.real()));
+}
+  
+/*
+ * cacos(z) = PI/2 - casin(z)
+ * but do the computation carefully so cacos(z) is accurate when z is
+ * close to 1.
+ *
+ * cacos(z) = PI/2 - z + O(z^3)   as z -> 0
+ *
+ * cacos(z) = -sign(y)*I*clog(z) + O(1/z^2)   as z -> infinity
+ * The above formula works for the real part as well, because
+ * Re(cacos(z)) = atan2(fabs(y), x) + O(y/z^3)
+ *    as z -> infinity, uniformly in y
+ */
+__host__ __device__ inline
+complex<double> cacos(complex<double> z)
+{
+  double x, y, ax, ay, rx, ry, B, sqrt_A2mx2, new_x;
+  int sx, sy;
+  int B_is_usable;
+  complex<double> w;
+  const double pio2_hi = 1.5707963267948966e0; /*  0x1921fb54442d18.0p-52 */
+  const volatile double pio2_lo = 6.1232339957367659e-17;	/*  0x11a62633145c07.0p-106 */
+  const double m_ln2 = 6.9314718055994531e-1; /*  0x162e42fefa39ef.0p-53 */
+
+  x = z.real();
+  y = z.imag();
+  sx = signbit(x);
+  sy = signbit(y);
+  ax = fabs(x);
+  ay = fabs(y);
+
+  if (isnan(x) || isnan(y)) {
+    /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+    if (isinf(x))
+      return (complex<double>(y + y, -infinity<double>()));
+    /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
+    if (isinf(y))
+      return (complex<double>(x + x, -y));
+    /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+    if (x == 0)
+      return (complex<double>(pio2_hi + pio2_lo, y + y));
+    /*
+     * All other cases involving NaN return NaN + I*NaN.
+     * C99 leaves it optional whether to raise invalid if one of
+     * the arguments is not NaN, so we opt not to raise it.
+     */
+    return (complex<double>(x + 0.0 + (y + 0), x + 0.0 + (y + 0)));
+  }
+
+  const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+    /* clog...() will raise inexact unless x or y is infinite. */
+    w = clog_for_large_values(z);
+    rx = fabs(w.imag());
+    ry = w.real() + m_ln2;
+    if (sy == 0)
+      ry = -ry;
+    return (complex<double>(rx, ry));
+  }
+
+  /* Avoid spuriously raising inexact for z = 1. */
+  if (x == 1.0 && y == 0.0)
+    return (complex<double>(0, -y));
+
+  /* All remaining cases are inexact. */
+  raise_inexact();
+
+  const double SQRT_6_EPSILON = 3.6500241499888571e-8; /*  0x13988e1409212e.0p-77 */
+  if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
+    return (complex<double>(pio2_hi - (x - pio2_lo), -y));
+
+  do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
+  if (B_is_usable) {
+    if (sx == 0)
+      rx = acos(B);
+    else
+      rx = acos(-B);
+  } else {
+    if (sx == 0)
+      rx = atan2(sqrt_A2mx2, new_x);
+    else
+      rx = atan2(sqrt_A2mx2, -new_x);
+  }
+  if (sy == 0)
+    ry = -ry;
+  return (complex<double>(rx, ry));
+}
+
+/*
+ * cacosh(z) = I*cacos(z) or -I*cacos(z)
+ * where the sign is chosen so Re(cacosh(z)) >= 0.
+ */
+__host__ __device__ inline
+complex<double> cacosh(complex<double> z)
+{
+  complex<double> w;
+  double rx, ry;
+  
+  w = cacos(z);
+  rx = w.real();
+  ry = w.imag();
+  /* cacosh(NaN + I*NaN) = NaN + I*NaN */
+  if (isnan(rx) && isnan(ry))
+    return (complex<double>(ry, rx));
+  /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */
+  /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */
+  if (isnan(rx))
+    return (complex<double>(fabs(ry), rx));
+  /* cacosh(0 + I*NaN) = NaN + I*NaN */
+  if (isnan(ry))
+    return (complex<double>(ry, ry));
+  return (complex<double>(fabs(ry), copysign(rx, z.imag())));
+}
+
+/*
+ * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
+ */
+__host__ __device__ inline
+complex<double> clog_for_large_values(complex<double> z)
+{
+  double x, y;
+  double ax, ay, t;
+  const double m_e = 2.7182818284590452e0; /*  0x15bf0a8b145769.0p-51 */
+  
+  x = z.real();
+  y = z.imag();
+  ax = fabs(x);
+  ay = fabs(y);
+  if (ax < ay) {
+    t = ax;
+    ax = ay;
+    ay = t;
+  }
+  
+  /*
+   * Avoid overflow in hypot() when x and y are both very large.
+   * Divide x and y by E, and then add 1 to the logarithm.  This depends
+   * on E being larger than sqrt(2).
+   * Dividing by E causes an insignificant loss of accuracy; however
+   * this method is still poor since it is uneccessarily slow.
+   */
+  if (ax > DBL_MAX / 2)
+    return (complex<double>(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
+  
+  /*
+   * Avoid overflow when x or y is large.  Avoid underflow when x or
+   * y is small.
+   */
+  const double QUARTER_SQRT_MAX = 5.966672584960165394632772e-154; /* = 0x1p509; <= sqrt(DBL_MAX) / 4 */
+  const double SQRT_MIN =	1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
+  if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
+    return (complex<double>(log(hypot(x, y)), atan2(y, x)));
+  
+  return (complex<double>(log(ax * ax + ay * ay) / 2, atan2(y, x)));
+}
+  
+/*
+ *				=================
+ *				| catanh, catan |
+ *				=================
+ */
+  
+/*
+   * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+   * Assumes x*x and y*y will not overflow.
+   * Assumes x and y are finite.
+   * Assumes y is non-negative.
+   * Assumes fabs(x) >= DBL_EPSILON.
+   */
+__host__ __device__
+inline double sum_squares(double x, double y)
+{
+  const double SQRT_MIN =	1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
+  /* Avoid underflow when y is small. */
+  if (y < SQRT_MIN)
+    return (x * x);
+  
+  return (x * x + y * y);
+}
+  
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON.  We avoid unwarranted underflow.  It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+__host__ __device__
+inline double real_part_reciprocal(double x, double y)
+{
+  double scale;
+  uint32_t hx, hy;
+  int32_t ix, iy;
+  
+  /*
+   * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+   * example 2.
+   */
+  get_high_word(hx, x);
+  ix = hx & 0x7ff00000;
+  get_high_word(hy, y);
+  iy = hy & 0x7ff00000;
+  //#define	BIAS	(DBL_MAX_EXP - 1)
+  const int BIAS = DBL_MAX_EXP - 1;
+  /* XXX more guard digits are useful iff there is extra precision. */
+  //#define	CUTOFF	(DBL_MANT_DIG / 2 + 1)	/* just half or 1 guard digit */
+  const int CUTOFF = (DBL_MANT_DIG / 2 + 1);
+  if (ix - iy >= CUTOFF << 20 || isinf(x))
+    return (1 / x);		/* +-Inf -> +-0 is special */
+  if (iy - ix >= CUTOFF << 20)
+    return (x / y / y);	/* should avoid double div, but hard */
+  if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20)
+    return (x / (x * x + y * y));
+  scale = 1;
+  set_high_word(scale, 0x7ff00000 - ix);	/* 2**(1-ilogb(x)) */
+  x *= scale;
+  y *= scale;
+  return (x / (x * x + y * y) * scale);
+}
+  
+  
+/*
+ * catanh(z) = log((1+z)/(1-z)) / 2
+ *           = log1p(4*x / |z-1|^2) / 4
+ *             + I * atan2(2*y, (1-x)*(1+x)-y*y) / 2
+ *
+ * catanh(z) = z + O(z^3)   as z -> 0
+ *
+ * catanh(z) = 1/z + sign(y)*I*PI/2 + O(1/z^3)   as z -> infinity
+ * The above formula works for the real part as well, because
+ * Re(catanh(z)) = x/|z|^2 + O(x/z^4)
+ *    as z -> infinity, uniformly in x
+ */
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+__host__ __device__ inline
+complex<double> catanh(complex<double> z)
+{
+  double x, y, ax, ay, rx, ry;
+  const volatile double pio2_lo = 6.1232339957367659e-17; /*  0x11a62633145c07.0p-106 */
+  const double pio2_hi = 1.5707963267948966e0;/*  0x1921fb54442d18.0p-52 */
+  
+  
+  x = z.real();
+  y = z.imag();
+  ax = fabs(x);
+  ay = fabs(y);
+
+  /* This helps handle many cases. */
+  if (y == 0 && ax <= 1)
+    return (complex<double>(atanh(x), y));
+  
+  /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+  if (x == 0)
+    return (complex<double>(x, atan(y)));
+  
+  if (isnan(x) || isnan(y)) {
+    /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+    if (isinf(x))
+      return (complex<double>(copysign(0.0, x), y + y));
+    /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+    if (isinf(y))
+      return (complex<double>(copysign(0.0, x),
+			      copysign(pio2_hi + pio2_lo, y)));
+    /*
+     * All other cases involving NaN return NaN + I*NaN.
+     * C99 leaves it optional whether to raise invalid if one of
+     * the arguments is not NaN, so we opt not to raise it.
+     */
+    return (complex<double>(x + 0.0 + (y + 0), x + 0.0 + (y + 0)));
+  }
+  
+  const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
+    return (complex<double>(real_part_reciprocal(x, y),
+			    copysign(pio2_hi + pio2_lo, y)));
+  
+  const double SQRT_3_EPSILON = 2.5809568279517849e-8; /*  0x1bb67ae8584caa.0p-78 */
+  if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+    /*
+     * z = 0 was filtered out above.  All other cases must raise
+     * inexact, but this is the only only that needs to do it
+     * explicitly.
+     */
+    raise_inexact();
+    return (z);
+  }
+  
+  const double m_ln2 = 6.9314718055994531e-1; /*  0x162e42fefa39ef.0p-53 */
+  if (ax == 1 && ay < DBL_EPSILON)
+    rx = (m_ln2 - log(ay)) / 2;
+  else
+    rx = log1p(4 * ax / sum_squares(ax - 1, ay)) / 4;
+  
+  if (ax == 1)
+    ry = atan2(2.0, -ay) / 2;
+  else if (ay < DBL_EPSILON)
+    ry = atan2(2 * ay, (1 - ax) * (1 + ax)) / 2;
+  else
+    ry = atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+  
+  return (complex<double>(copysign(rx, x), copysign(ry, y)));
+}
+  
+/*
+ * catan(z) = reverse(catanh(reverse(z)))
+ * where reverse(x + I*y) = y + I*x = I*conj(z).
+ */
+__host__ __device__ inline
+complex<double>catan(complex<double> z)
+{
+  complex<double> w = catanh(complex<double>(z.imag(), z.real()));
+  return (complex<double>(w.imag(), w.real()));
+}
+
+#endif
+
+} // namespace complex
+
+} // namespace detail
+
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> acos(const complex<ValueType>& z){
+  const complex<ValueType> ret = thrust::asin(z);
+  const ValueType pi = ValueType(3.14159265358979323846);
+  return complex<ValueType>(pi/2 - ret.real(),-ret.imag());
+}
+
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> asin(const complex<ValueType>& z){
+  const complex<ValueType> i(0,1);
+  return -i*asinh(i*z);
+}
+  
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> atan(const complex<ValueType>& z){
+  const complex<ValueType> i(0,1);
+  return -i*thrust::atanh(i*z);
+}
+  
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> acosh(const complex<ValueType>& z){
+  thrust::complex<ValueType> ret((z.real() - z.imag()) * (z.real() + z.imag()) - ValueType(1.0),
+				 ValueType(2.0) * z.real() * z.imag());    
+  ret = thrust::sqrt(ret);
+  if (z.real() < ValueType(0.0)){
+    ret = -ret;
+  }
+  ret += z;
+  ret = thrust::log(ret);
+  if (ret.real() < ValueType(0.0)){
+    ret = -ret;
+  }
+  return ret;
+}
+  
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> asinh(const complex<ValueType>& z){
+  return thrust::log(thrust::sqrt(z*z+ValueType(1))+z);
+}
+  
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> atanh(const complex<ValueType>& z){
+  ValueType imag2 = z.imag() *  z.imag();   
+  ValueType n = ValueType(1.0) + z.real();
+  n = imag2 + n * n;
+  
+  ValueType d = ValueType(1.0) - z.real();
+  d = imag2 + d * d;
+  complex<ValueType> ret(ValueType(0.25) * (std::log(n) - std::log(d)),0);
+  
+  d = ValueType(1.0) -  z.real() * z.real() - imag2;
+  
+  ret.imag(ValueType(0.5) * std::atan2(ValueType(2.0) * z.imag(), d));
+  return ret;
+}
+  
+template <>
+__host__ __device__
+inline complex<double> acos(const complex<double>& z){
+  return detail::complex::cacos(z);
+}
+  
+template <>
+__host__ __device__
+inline complex<double> asin(const complex<double>& z){
+  return detail::complex::casin(z);
+}
+  
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+template <>
+__host__ __device__
+inline complex<double> atan(const complex<double>& z){
+  return detail::complex::catan(z);
+}
+#endif
+
+template <>
+__host__ __device__
+inline complex<double> acosh(const complex<double>& z){
+  return detail::complex::cacosh(z);
+}
+
+
+template <>
+__host__ __device__
+inline complex<double> asinh(const complex<double>& z){
+  return detail::complex::casinh(z);
+}
+  
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+template <>
+__host__ __device__
+inline complex<double> atanh(const complex<double>& z){
+  return detail::complex::catanh(z);
+}
+#endif
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/catrigf.h b/thrust/thrust/detail/complex/catrigf.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa924717a7c30e380dcbaf8fe9d1a69b52c4f27e
--- /dev/null
+++ b/thrust/thrust/detail/complex/catrigf.h
@@ -0,0 +1,500 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *    freebsd/lib/msun/src/catrig.c
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <thrust/detail/config.h>
+#include <cfloat>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+  
+__host__ __device__ inline
+      complex<float> clog_for_large_values(complex<float> z);
+
+/*
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * See catrig.c for complete comments.
+ *
+ * XXX comments were removed automatically, and even short ones on the right
+ * of statements were removed (all of them), contrary to normal style.  Only
+ * a few comments on the right of declarations remain.
+ */
+
+__host__ __device__
+inline float
+f(float a, float b, float hypot_a_b)
+{
+  if (b < 0.0f)
+    return ((hypot_a_b - b) / 2.0f);
+  if (b == 0.0f)
+    return (a / 2.0f);
+  return (a * a / (hypot_a_b + b) / 2.0f);
+}
+
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ */
+__host__ __device__ 
+inline void
+do_hard_work(float x, float y, float *rx, int *B_is_usable, float *B,
+	     float *sqrt_A2my2, float *new_y)
+{
+  float R, S, A; /* A, B, R, and S are as in Hull et al. */
+  float Am1, Amy; /* A-1, A-y. */
+  const float A_crossover = 10; /* Hull et al suggest 1.5, but 10 works better */
+  const float FOUR_SQRT_MIN = 4.336808689942017736029811e-19f;; /* =0x1p-61; >= 4 * sqrt(FLT_MIN) */
+  const float B_crossover = 0.6417f; /* suggested by Hull et al */
+  R = hypotf(x, y + 1);
+  S = hypotf(x, y - 1);
+
+  A = (R + S) / 2;
+  if (A < 1)
+    A = 1;
+
+  if (A < A_crossover) {
+    if (y == 1 && x < FLT_EPSILON * FLT_EPSILON / 128) {
+      *rx = sqrtf(x);
+    } else if (x >= FLT_EPSILON * fabsf(y - 1)) {
+      Am1 = f(x, 1 + y, R) + f(x, 1 - y, S);
+      *rx = log1pf(Am1 + sqrtf(Am1 * (A + 1)));
+    } else if (y < 1) {
+      *rx = x / sqrtf((1 - y) * (1 + y));
+    } else {
+      *rx = log1pf((y - 1) + sqrtf((y - 1) * (y + 1)));
+    }
+  } else {
+    *rx = logf(A + sqrtf(A * A - 1));
+  }
+
+  *new_y = y;
+
+  if (y < FOUR_SQRT_MIN) {
+    *B_is_usable = 0;
+    *sqrt_A2my2 = A * (2 / FLT_EPSILON);
+    *new_y = y * (2 / FLT_EPSILON);
+    return;
+  }
+
+  *B = y / A;
+  *B_is_usable = 1;
+
+  if (*B > B_crossover) {
+    *B_is_usable = 0;
+    if (y == 1 && x < FLT_EPSILON / 128) {
+      *sqrt_A2my2 = sqrtf(x) * sqrtf((A + y) / 2);
+    } else if (x >= FLT_EPSILON * fabsf(y - 1)) {
+      Amy = f(x, y + 1, R) + f(x, y - 1, S);
+      *sqrt_A2my2 = sqrtf(Amy * (A + y));
+    } else if (y > 1) {
+      *sqrt_A2my2 = x * (4 / FLT_EPSILON / FLT_EPSILON) * y /
+	sqrtf((y + 1) * (y - 1));
+      *new_y = y * (4 / FLT_EPSILON / FLT_EPSILON);
+    } else {
+      *sqrt_A2my2 = sqrtf((1 - y) * (1 + y));
+    }
+  }
+
+}
+
+__host__ __device__ inline
+complex<float>
+casinhf(complex<float> z)
+{
+  float x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y;
+  int B_is_usable;
+  complex<float> w;
+  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  const float m_ln2 = 6.9314718055994531e-1f; /*  0x162e42fefa39ef.0p-53 */
+  x = z.real();
+  y = z.imag();
+  ax = fabsf(x);
+  ay = fabsf(y);
+
+  if (isnan(x) || isnan(y)) {
+    if (isinf(x))
+      return (complex<float>(x, y + y));
+    if (isinf(y))
+      return (complex<float>(y, x + x));
+    if (y == 0)
+      return (complex<float>(x + x, y));
+    return (complex<float>(x + 0.0f + (y + 0), x + 0.0f + (y + 0)));
+  }
+
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+    if (signbit(x) == 0)
+      w = clog_for_large_values(z) + m_ln2;
+    else
+      w = clog_for_large_values(-z) + m_ln2;
+    return (complex<float>(copysignf(w.real(), x),
+			   copysignf(w.imag(), y)));
+  }
+
+  if (x == 0 && y == 0)
+    return (z);
+
+  raise_inexact();
+
+  const float SQRT_6_EPSILON = 8.4572793338e-4f;	/*  0xddb3d7.0p-34 */
+  if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
+    return (z);
+
+  do_hard_work(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y);
+  if (B_is_usable)
+    ry = asinf(B);
+  else
+    ry = atan2f(new_y, sqrt_A2my2);
+  return (complex<float>(copysignf(rx, x), copysignf(ry, y)));
+}
+
+__host__ __device__ inline
+complex<float> casinf(complex<float> z)
+{
+  complex<float> w = casinhf(complex<float>(z.imag(), z.real()));
+
+  return (complex<float>(w.imag(), w.real()));
+}
+
+__host__ __device__ inline
+complex<float> cacosf(complex<float> z)
+{
+  float x, y, ax, ay, rx, ry, B, sqrt_A2mx2, new_x;
+  int sx, sy;
+  int B_is_usable;
+  complex<float> w;
+  const float pio2_hi = 1.5707963267948966e0f; /*  0x1921fb54442d18.0p-52 */
+  const volatile float pio2_lo = 6.1232339957367659e-17f;	/*  0x11a62633145c07.0p-106 */
+  const float m_ln2 = 6.9314718055994531e-1f; /*  0x162e42fefa39ef.0p-53 */
+
+  x = z.real();
+  y = z.imag();
+  sx = signbit(x);
+  sy = signbit(y);
+  ax = fabsf(x);
+  ay = fabsf(y);
+
+  if (isnan(x) || isnan(y)) {
+    if (isinf(x))
+      return (complex<float>(y + y, -infinity<float>()));
+    if (isinf(y))
+      return (complex<float>(x + x, -y));
+    if (x == 0)
+      return (complex<float>(pio2_hi + pio2_lo, y + y));
+    return (complex<float>(x + 0.0f + (y + 0), x + 0.0f + (y + 0)));
+  }
+
+  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+    w = clog_for_large_values(z);
+    rx = fabsf(w.imag());
+    ry = w.real() + m_ln2;
+    if (sy == 0)
+      ry = -ry;
+    return (complex<float>(rx, ry));
+  }
+
+  if (x == 1 && y == 0)
+    return (complex<float>(0, -y));
+
+  raise_inexact();
+
+  const float SQRT_6_EPSILON = 8.4572793338e-4f;	/*  0xddb3d7.0p-34 */
+  if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
+    return (complex<float>(pio2_hi - (x - pio2_lo), -y));
+
+  do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
+  if (B_is_usable) {
+    if (sx == 0)
+      rx = acosf(B);
+    else
+      rx = acosf(-B);
+  } else {
+    if (sx == 0)
+      rx = atan2f(sqrt_A2mx2, new_x);
+    else
+      rx = atan2f(sqrt_A2mx2, -new_x);
+  }
+  if (sy == 0)
+    ry = -ry;
+  return (complex<float>(rx, ry));
+}
+
+__host__ __device__ inline
+complex<float> cacoshf(complex<float> z)
+{
+  complex<float> w;
+  float rx, ry;
+
+  w = cacosf(z);
+  rx = w.real();
+  ry = w.imag();
+  /* cacosh(NaN + I*NaN) = NaN + I*NaN */
+  if (isnan(rx) && isnan(ry))
+    return (complex<float>(ry, rx));
+  /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */
+  /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */
+  if (isnan(rx))
+    return (complex<float>(fabsf(ry), rx));
+  /* cacosh(0 + I*NaN) = NaN + I*NaN */
+  if (isnan(ry))
+    return (complex<float>(ry, ry));
+  return (complex<float>(fabsf(ry), copysignf(rx, z.imag())));
+}
+
+  /*
+   * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
+   */
+__host__ __device__ inline
+complex<float> clog_for_large_values(complex<float> z)
+{
+  float x, y;
+  float ax, ay, t;
+  const float m_e = 2.7182818284590452e0f; /*  0x15bf0a8b145769.0p-51 */
+
+  x = z.real();
+  y = z.imag();
+  ax = fabsf(x);
+  ay = fabsf(y);
+  if (ax < ay) {
+    t = ax;
+    ax = ay;
+    ay = t;
+  }
+
+  if (ax > FLT_MAX / 2)
+    return (complex<float>(logf(hypotf(x / m_e, y / m_e)) + 1,
+			   atan2f(y, x)));
+
+  const float QUARTER_SQRT_MAX = 2.3058430092136939520000000e+18f; /* = 0x1p61; <= sqrt(FLT_MAX) / 4 */
+  const float SQRT_MIN =	1.084202172485504434007453e-19f; /* 0x1p-63; >= sqrt(FLT_MIN) */
+  if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
+    return (complex<float>(logf(hypotf(x, y)), atan2f(y, x)));
+
+  return (complex<float>(logf(ax * ax + ay * ay) / 2, atan2f(y, x)));
+}
+
+/*
+ *				=================
+ *				| catanh, catan |
+ *				=================
+ */
+
+/*
+ * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+ * Assumes x*x and y*y will not overflow.
+ * Assumes x and y are finite.
+ * Assumes y is non-negative.
+ * Assumes fabsf(x) >= FLT_EPSILON.
+ */
+__host__ __device__
+inline float sum_squares(float x, float y)
+{
+  const float SQRT_MIN =	1.084202172485504434007453e-19f; /* 0x1p-63; >= sqrt(FLT_MIN) */
+  /* Avoid underflow when y is small. */
+  if (y < SQRT_MIN)
+    return (x * x);
+
+  return (x * x + y * y);
+}
+
+__host__ __device__
+inline float real_part_reciprocal(float x, float y)
+{
+  float scale;
+  uint32_t hx, hy;
+  int32_t ix, iy;
+
+  get_float_word(hx, x);
+  ix = hx & 0x7f800000;
+  get_float_word(hy, y);
+  iy = hy & 0x7f800000;
+  //#define	BIAS	(FLT_MAX_EXP - 1)
+  const int BIAS = FLT_MAX_EXP - 1;
+  //#define	CUTOFF	(FLT_MANT_DIG / 2 + 1)
+  const int CUTOFF = (FLT_MANT_DIG / 2 + 1);
+  if (ix - iy >= CUTOFF << 23 || isinf(x))
+    return (1 / x);
+  if (iy - ix >= CUTOFF << 23)
+    return (x / y / y);
+  if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23)
+    return (x / (x * x + y * y));
+  set_float_word(scale, 0x7f800000 - ix);
+  x *= scale;
+  y *= scale;
+  return (x / (x * x + y * y) * scale);
+}
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+__host__ __device__ inline
+complex<float> catanhf(complex<float> z)
+{
+  float x, y, ax, ay, rx, ry;
+  const volatile float pio2_lo = 6.1232339957367659e-17f; /*  0x11a62633145c07.0p-106 */
+  const float pio2_hi = 1.5707963267948966e0f;/*  0x1921fb54442d18.0p-52 */
+
+
+  x = z.real();
+  y = z.imag();
+  ax = fabsf(x);
+  ay = fabsf(y);
+
+
+  if (y == 0 && ax <= 1)
+    return (complex<float>(atanhf(x), y));
+
+  if (x == 0)
+    return (complex<float>(x, atanf(y)));
+
+  if (isnan(x) || isnan(y)) {
+    if (isinf(x))
+      return (complex<float>(copysignf(0, x), y + y));
+    if (isinf(y))
+      return (complex<float>(copysignf(0, x),
+			     copysignf(pio2_hi + pio2_lo, y)));
+    return (complex<float>(x + 0.0f + (y + 0.0f), x + 0.0f + (y + 0.0f)));
+  }
+
+  const float RECIP_EPSILON = 1.0f / FLT_EPSILON;
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
+    return (complex<float>(real_part_reciprocal(x, y),
+			   copysignf(pio2_hi + pio2_lo, y)));
+
+  const float SQRT_3_EPSILON = 5.9801995673e-4f; /*  0x9cc471.0p-34 */
+  if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+    raise_inexact();
+    return (z);
+  }
+
+  const float m_ln2 = 6.9314718056e-1f; /*  0xb17218.0p-24 */
+  if (ax == 1 && ay < FLT_EPSILON)
+    rx = (m_ln2 - logf(ay)) / 2;
+  else
+    rx = log1pf(4 * ax / sum_squares(ax - 1, ay)) / 4;
+
+  if (ax == 1)
+    ry = atan2f(2, -ay) / 2;
+  else if (ay < FLT_EPSILON)
+    ry = atan2f(2 * ay, (1 - ax) * (1 + ax)) / 2;
+  else
+    ry = atan2f(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+
+  return (complex<float>(copysignf(rx, x), copysignf(ry, y)));
+}
+
+__host__ __device__ inline
+complex<float>catanf(complex<float> z){
+  complex<float> w = catanhf(complex<float>(z.imag(), z.real()));
+  return (complex<float>(w.imag(), w.real()));
+}
+#endif
+
+} // namespace complex
+
+} // namespace detail
+
+
+template <>
+__host__ __device__
+inline complex<float> acos(const complex<float>& z){
+  return detail::complex::cacosf(z);
+}
+
+template <>
+__host__ __device__
+inline complex<float> asin(const complex<float>& z){
+  return detail::complex::casinf(z);
+}
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+template <>
+__host__ __device__
+inline complex<float> atan(const complex<float>& z){
+  return detail::complex::catanf(z);
+}
+#endif
+
+template <>
+__host__ __device__
+inline complex<float> acosh(const complex<float>& z){
+  return detail::complex::cacoshf(z);
+}
+
+
+template <>
+__host__ __device__
+inline complex<float> asinh(const complex<float>& z){
+  return detail::complex::casinhf(z);
+}
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+template <>
+__host__ __device__
+inline complex<float> atanh(const complex<float>& z){
+  return detail::complex::catanhf(z);
+}
+#endif
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/ccosh.h b/thrust/thrust/detail/complex/ccosh.h
new file mode 100644
index 0000000000000000000000000000000000000000..300f08afc306bdfa62b1bc105efadce86179cca0
--- /dev/null
+++ b/thrust/thrust/detail/complex/ccosh.h
@@ -0,0 +1,213 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_ccosh.c
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+/*
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ *         = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+      
+__host__ __device__ inline
+thrust::complex<double> ccosh(const thrust::complex<double>& z){
+  
+
+  const double huge = 8.98846567431157953864652595395e+307; // 0x1p1023
+  double x, y, h;
+  uint32_t hx, hy, ix, iy, lx, ly;
+
+  x = z.real();
+  y = z.imag();
+
+  extract_words(hx, lx, x);
+  extract_words(hy, ly, y);
+
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+
+  /* Handle the nearly-non-exceptional cases where x and y are finite. */
+  if (ix < 0x7ff00000 && iy < 0x7ff00000) {
+    if ((iy | ly) == 0)
+      return (thrust::complex<double>(::cosh(x), x * y));
+    if (ix < 0x40360000)	/* small x: normal case */
+      return (thrust::complex<double>(::cosh(x) * ::cos(y), ::sinh(x) * ::sin(y)));
+
+    /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x40862e42) {
+      /* x < 710: exp(|x|) won't overflow */
+      h = ::exp(::fabs(x)) * 0.5;
+      return (thrust::complex<double>(h * cos(y), copysign(h, x) * sin(y)));
+    } else if (ix < 0x4096bbaa) {
+      /* x < 1455: scale to avoid overflow */
+      thrust::complex<double> z_;
+      z_ = ldexp_cexp(thrust::complex<double>(fabs(x), y), -1);
+      return (thrust::complex<double>(z_.real(), z_.imag() * copysign(1.0, x)));
+    } else {
+      /* x >= 1455: the result always overflows */
+      h = huge * x;
+      return (thrust::complex<double>(h * h * cos(y), h * sin(y)));
+    }
+  }
+
+  /*
+   * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as dNaN.  Raise the invalid floating-point exception.
+   *
+   * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as d(NaN).
+   */
+  if ((ix | lx) == 0 && iy >= 0x7ff00000)
+    return (thrust::complex<double>(y - y, copysign(0.0, x * (y - y))));
+
+  /*
+   * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+   *
+   * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+   * The sign of 0 in the result is unspecified.
+   */
+  if ((iy | ly) == 0 && ix >= 0x7ff00000) {
+    if (((hx & 0xfffff) | lx) == 0)
+      return (thrust::complex<double>(x * x, copysign(0.0, x) * y));
+    return (thrust::complex<double>(x * x, copysign(0.0, (x + x) * y)));
+  }
+
+  /*
+   * cosh(x +- I Inf) = dNaN + I dNaN.
+   * Raise the invalid floating-point exception for finite nonzero x.
+   *
+   * cosh(x + I NaN) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero x.  Choice = don't raise (except for signaling NaNs).
+   */
+  if (ix < 0x7ff00000 && iy >= 0x7ff00000)
+    return (thrust::complex<double>(y - y, x * (y - y)));
+
+  /*
+   * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+   *
+   * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+   * The sign of Inf in the result is unspecified.  Choice = always +.
+   * Raise the invalid floating-point exception.
+   *
+   * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+   */
+  if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) {
+    if (iy >= 0x7ff00000)
+      return (thrust::complex<double>(x * x, x * (y - y)));
+    return (thrust::complex<double>((x * x) * cos(y), x * sin(y)));
+  }
+
+  /*
+   * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+   *
+   * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception.
+   * Choice = raise.
+   *
+   * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero y.  Choice = don't raise (except for signaling NaNs).
+   */
+  return (thrust::complex<double>((x * x) * (y - y), (x + x) * (y - y)));
+}
+
+
+__host__ __device__ inline
+thrust::complex<double> ccos(const thrust::complex<double>& z){	
+  /* ccos(z) = ccosh(I * z) */
+  return (ccosh(thrust::complex<double>(-z.imag(), z.real())));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> cos(const complex<ValueType>& z){
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(std::cos(re) * std::cosh(im), 
+			    -std::sin(re) * std::sinh(im));
+}
+  
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> cosh(const complex<ValueType>& z){
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(std::cosh(re) * std::cos(im), 
+			    std::sinh(re) * std::sin(im));
+}
+
+template <>
+__host__ __device__
+inline thrust::complex<double> cos(const thrust::complex<double>& z){
+  return detail::complex::ccos(z);
+}
+
+template <>
+__host__ __device__
+inline thrust::complex<double> cosh(const thrust::complex<double>& z){
+  return detail::complex::ccosh(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/ccoshf.h b/thrust/thrust/detail/complex/ccoshf.h
new file mode 100644
index 0000000000000000000000000000000000000000..d33af7c4c765afb6187df8524fe9ee541e86e0cb
--- /dev/null
+++ b/thrust/thrust/detail/complex/ccoshf.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_ccoshf.c
+ */
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+      
+__host__ __device__ inline
+complex<float> ccoshf(const complex<float>& z){
+  float x, y, h;
+  uint32_t hx, hy, ix, iy;
+  const float huge = 1.70141183460469231731687303716e+38; //0x1p127;	
+  
+  
+  x = z.real();
+  y = z.imag();
+  
+  get_float_word(hx, x);
+  get_float_word(hy, y);
+  
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+  if (ix < 0x7f800000 && iy < 0x7f800000) {
+    if (iy == 0){
+      return (complex<float>(coshf(x), x * y));
+    }
+    if (ix < 0x41100000){	/* small x: normal case */
+      return (complex<float>(coshf(x) * cosf(y), sinhf(x) * sinf(y)));
+    }
+    /* |x| >= 9, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x42b17218) {
+      /* x < 88.7: expf(|x|) won't overflow */
+      h = expf(fabsf(x)) * 0.5f;
+      return (complex<float>(h * cosf(y), copysignf(h, x) * sinf(y)));
+    } else if (ix < 0x4340b1e7) {
+      /* x < 192.7: scale to avoid overflow */
+      thrust::complex<float> z_;
+      z_ = ldexp_cexpf(complex<float>(fabsf(x), y), -1);
+      return (complex<float>(z_.real(), z_.imag() * copysignf(1.0f, x)));
+    } else {
+      /* x >= 192.7: the result always overflows */
+      h = huge * x;
+      return (complex<float>(h * h * cosf(y), h * sinf(y)));
+    }
+  }
+  
+  if (ix == 0 && iy >= 0x7f800000){
+    return (complex<float>(y - y, copysignf(0.0f, x * (y - y))));
+  }
+  if (iy == 0 && ix >= 0x7f800000) {
+    if ((hx & 0x7fffff) == 0)
+      return (complex<float>(x * x, copysignf(0.0f, x) * y));
+    return (complex<float>(x * x, copysignf(0.0f, (x + x) * y)));
+  }
+  
+  if (ix < 0x7f800000 && iy >= 0x7f800000){
+    return (complex<float>(y - y, x * (y - y)));
+  }
+  
+  if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) {
+    if (iy >= 0x7f800000)
+      return (complex<float>(x * x, x * (y - y)));
+    return (complex<float>((x * x) * cosf(y), x * sinf(y)));
+  }
+  return (complex<float>((x * x) * (y - y), (x + x) * (y - y)));
+}
+  
+__host__ __device__ inline
+complex<float> ccosf(const complex<float>& z){	
+  return (ccoshf(complex<float>(-z.imag(), z.real())));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__
+inline complex<float> cos(const complex<float>& z){
+  return detail::complex::ccosf(z);
+}
+  
+template <>
+__host__ __device__
+inline complex<float> cosh(const complex<float>& z){
+  return detail::complex::ccoshf(z);
+}
+  
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/cexp.h b/thrust/thrust/detail/complex/cexp.h
new file mode 100644
index 0000000000000000000000000000000000000000..151df397bd6cd2839cc01f0a55db0b96a54d520c
--- /dev/null
+++ b/thrust/thrust/detail/complex/cexp.h
@@ -0,0 +1,183 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2011 David Schultz <das@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_cexp.c
+ *    lib/msun/src/k_exp.c
+ *
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+/*
+ * Compute exp(x), scaled to avoid spurious overflow.  An exponent is
+ * returned separately in 'expt'.
+ *
+ * Input:  ln(DBL_MAX) <= x < ln(2 * DBL_MAX / DBL_MIN_DENORM) ~= 1454.91
+ * Output: 2**1023 <= y < 2**1024
+ */
+__host__ __device__ inline
+	double frexp_exp(double x, int *expt){
+  const uint32_t k = 1799;		/* constant for reduction */
+  const double kln2 =  1246.97177782734161156;	/* k * ln2 */
+	
+  double exp_x;
+  uint32_t hx;
+	
+  /*
+   * We use exp(x) = exp(x - kln2) * 2**k, carefully chosen to
+   * minimize |exp(kln2) - 2**k|.  We also scale the exponent of
+   * exp_x to MAX_EXP so that the result can be multiplied by
+   * a tiny number without losing accuracy due to denormalization.
+   */
+  exp_x = exp(x - kln2);
+  get_high_word(hx, exp_x);
+  *expt = (hx >> 20) - (0x3ff + 1023) + k;
+  set_high_word(exp_x, (hx & 0xfffff) | ((0x3ff + 1023) << 20));
+  return (exp_x);
+}
+      
+      
+__host__ __device__ inline
+complex<double>	ldexp_cexp(complex<double> z, int expt){
+  double x, y, exp_x, scale1, scale2;
+  int ex_expt, half_expt;
+	
+  x = z.real();
+  y = z.imag();
+  exp_x = frexp_exp(x, &ex_expt);
+  expt += ex_expt;
+	
+  /*
+   * Arrange so that scale1 * scale2 == 2**expt.  We use this to
+   * compensate for scalbn being horrendously slow.
+   */
+  half_expt = expt / 2;
+  insert_words(scale1, (0x3ff + half_expt) << 20, 0);
+  half_expt = expt - half_expt;
+  insert_words(scale2, (0x3ff + half_expt) << 20, 0);
+	
+  return (complex<double>(cos(y) * exp_x * scale1 * scale2,
+			  sin(y) * exp_x * scale1 * scale2));
+}
+	
+
+__host__ __device__ inline
+complex<double> cexp(const complex<double>& z){
+  double x, y, exp_x;
+  uint32_t hx, hy, lx, ly;
+
+  const uint32_t
+    exp_ovfl  = 0x40862e42,			/* high bits of MAX_EXP * ln2 ~= 710 */
+    cexp_ovfl = 0x4096b8e4;			/* (MAX_EXP - MIN_DENORM_EXP) * ln2 */
+
+	  
+  x = z.real();
+  y = z.imag();
+	  
+  extract_words(hy, ly, y);
+  hy &= 0x7fffffff;
+	  
+  /* cexp(x + I 0) = exp(x) + I 0 */
+  if ((hy | ly) == 0)
+    return (complex<double>(exp(x), y));
+  extract_words(hx, lx, x);
+  /* cexp(0 + I y) = cos(y) + I sin(y) */
+  if (((hx & 0x7fffffff) | lx) == 0)
+    return (complex<double>(cos(y), sin(y)));
+	  
+  if (hy >= 0x7ff00000) {
+    if (lx != 0 || (hx & 0x7fffffff) != 0x7ff00000) {
+      /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
+      return (complex<double>(y - y, y - y));
+    } else if (hx & 0x80000000) {
+      /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
+      return (complex<double>(0.0, 0.0));
+    } else {
+      /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
+      return (complex<double>(x, y - y));
+    }
+  }
+	  
+  if (hx >= exp_ovfl && hx <= cexp_ovfl) {
+    /*
+     * x is between 709.7 and 1454.3, so we must scale to avoid
+     * overflow in exp(x).
+     */
+    return (ldexp_cexp(z, 0));
+  } else {
+    /*
+     * Cases covered here:
+     *  -  x < exp_ovfl and exp(x) won't overflow (common case)
+     *  -  x > cexp_ovfl, so exp(x) * s overflows for all s > 0
+     *  -  x = +-Inf (generated by exp())
+     *  -  x = NaN (spurious inexact exception from y)
+     */
+    exp_x = std::exp(x);
+    return (complex<double>(exp_x * cos(y), exp_x * sin(y)));
+  }
+}
+	
+} // namespace complex
+ 
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> exp(const complex<ValueType>& z){    
+  return polar(std::exp(z.real()),z.imag());
+}
+
+template <>
+__host__ __device__
+inline complex<double> exp(const complex<double>& z){    
+  return detail::complex::cexp(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/cexpf.h b/thrust/thrust/detail/complex/cexpf.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d85c45ed83a6d1489f81cb2ba3dc769f93e0a10
--- /dev/null
+++ b/thrust/thrust/detail/complex/cexpf.h
@@ -0,0 +1,161 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2011 David Schultz <das@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_cexpf.c
+ *    lib/msun/src/k_exp.c
+ *
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{
+
+__host__ __device__ inline
+float frexp_expf(float x, int *expt){
+  const uint32_t k = 235;                 /* constant for reduction */
+  const float kln2 =  162.88958740F;       /* k * ln2 */
+	
+  // should this be a double instead?
+  float exp_x;
+  uint32_t hx;
+	
+  exp_x = expf(x - kln2);
+  get_float_word(hx, exp_x);
+  *expt = (hx >> 23) - (0x7f + 127) + k;
+  set_float_word(exp_x, (hx & 0x7fffff) | ((0x7f + 127) << 23));
+  return (exp_x);
+}
+      
+__host__ __device__ inline
+complex<float> 
+ldexp_cexpf(complex<float> z, int expt)
+{
+  float x, y, exp_x, scale1, scale2;
+  int ex_expt, half_expt;
+	
+  x = z.real();
+  y = z.imag();
+  exp_x = frexp_expf(x, &ex_expt);
+  expt += ex_expt;
+	
+  half_expt = expt / 2;
+  set_float_word(scale1, (0x7f + half_expt) << 23);
+  half_expt = expt - half_expt;
+  set_float_word(scale2, (0x7f + half_expt) << 23);
+	
+  return (complex<float>(std::cos(y) * exp_x * scale1 * scale2,
+			 std::sin(y) * exp_x * scale1 * scale2));
+}
+      
+__host__ __device__ inline
+complex<float> cexpf(const complex<float>& z){
+  float x, y, exp_x;
+  uint32_t hx, hy;
+
+  const uint32_t
+    exp_ovfl  = 0x42b17218,		/* MAX_EXP * ln2 ~= 88.722839355 */
+    cexp_ovfl = 0x43400074;		/* (MAX_EXP - MIN_DENORM_EXP) * ln2 */
+
+  x = z.real();
+  y = z.imag();
+
+  get_float_word(hy, y);
+  hy &= 0x7fffffff;
+
+  /* cexp(x + I 0) = exp(x) + I 0 */
+  if (hy == 0)
+    return (complex<float>(std::exp(x), y));
+  get_float_word(hx, x);
+  /* cexp(0 + I y) = cos(y) + I sin(y) */
+  if ((hx & 0x7fffffff) == 0){
+    return (complex<float>(std::cos(y), std::sin(y)));
+  }
+  if (hy >= 0x7f800000) {
+    if ((hx & 0x7fffffff) != 0x7f800000) {
+      /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
+      return (complex<float>(y - y, y - y));
+    } else if (hx & 0x80000000) {
+      /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
+      return (complex<float>(0.0, 0.0));
+    } else {
+      /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
+      return (complex<float>(x, y - y));
+    }
+  }
+
+  if (hx >= exp_ovfl && hx <= cexp_ovfl) {
+    /*
+     * x is between 88.7 and 192, so we must scale to avoid
+     * overflow in expf(x).
+     */
+    return (ldexp_cexpf(z, 0));
+  } else {
+    /*
+     * Cases covered here:
+     *  -  x < exp_ovfl and exp(x) won't overflow (common case)
+     *  -  x > cexp_ovfl, so exp(x) * s overflows for all s > 0
+     *  -  x = +-Inf (generated by exp())
+     *  -  x = NaN (spurious inexact exception from y)
+     */
+    exp_x = std::exp(x);
+    return (complex<float>(exp_x * std::cos(y), exp_x * std::sin(y)));
+  }
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__
+inline complex<float> exp(const complex<float>& z){    
+  return detail::complex::cexpf(z);
+}    
+  
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/clog.h b/thrust/thrust/detail/complex/clog.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d288df0240e7fc2562ad415ff4f4fa2de1048c2
--- /dev/null
+++ b/thrust/thrust/detail/complex/clog.h
@@ -0,0 +1,212 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSDs msun:*/
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{
+
+using thrust::complex;
+
+/* round down to 18 = 54/3 bits */
+__host__ __device__ inline
+double trim(double x){
+  uint32_t hi;    
+  get_high_word(hi, x);
+  insert_words(x, hi &0xfffffff8, 0);
+  return x;
+}
+
+
+__host__ __device__ inline
+complex<double> clog(const complex<double>& z){
+
+  // Adapted from FreeBSDs msun
+  double x, y;
+  double ax, ay;
+  double x0, y0, x1, y1, x2, y2, t, hm1;
+  double val[12];
+  int i, sorted;
+  const double e = 2.7182818284590452354;
+
+  x = z.real();
+  y = z.imag();
+
+  /* Handle NaNs using the general formula to mix them right. */
+  if (x != x || y != y){
+    return (complex<double>(std::log(norm(z)), std::atan2(y, x)));
+  }
+
+  ax = std::abs(x);
+  ay = std::abs(y);
+  if (ax < ay) {
+    t = ax;
+    ax = ay;
+    ay = t;
+  }
+
+  /*
+   * To avoid unnecessary overflow, if x and y are very large, divide x
+   * and y by M_E, and then add 1 to the logarithm.  This depends on
+   * M_E being larger than sqrt(2).
+   * There is a potential loss of accuracy caused by dividing by M_E,
+   * but this case should happen extremely rarely.
+   */
+  //    if (ay > 5e307){
+  // For high values of ay -> hypotf(DBL_MAX,ay) = inf
+  // We expect that for values at or below ay = 5e307 this should not happen
+  if (ay > 5e307){
+    return (complex<double>(std::log(hypot(x / e, y / e)) + 1.0, std::atan2(y, x)));
+  }
+  if (ax == 1.) {
+    if (ay < 1e-150){
+      return (complex<double>((ay * 0.5) * ay, std::atan2(y, x)));
+    }
+    return (complex<double>(log1p(ay * ay) * 0.5, std::atan2(y, x)));
+  }
+
+  /*
+   * Because atan2 and hypot conform to C99, this also covers all the
+   * edge cases when x or y are 0 or infinite.
+   */
+  if (ax < 1e-50 || ay < 1e-50 || ax > 1e50 || ay > 1e50){
+    return (complex<double>(std::log(hypot(x, y)), std::atan2(y, x)));
+  }
+
+  /* 
+   * From this point on, we don't need to worry about underflow or
+   * overflow in calculating ax*ax or ay*ay.
+   */
+
+  /* Some easy cases. */
+
+  if (ax >= 1.0){
+    return (complex<double>(log1p((ax-1)*(ax+1) + ay*ay) * 0.5, atan2(y, x)));
+  }
+
+  if (ax*ax + ay*ay <= 0.7){
+    return (complex<double>(std::log(ax*ax + ay*ay) * 0.5, std::atan2(y, x)));
+  }
+
+  /*
+   * Take extra care so that ULP of real part is small if hypot(x,y) is
+   * moderately close to 1.
+   */
+
+
+  x0 = trim(ax);
+  ax = ax-x0;
+  x1 = trim(ax);
+  x2 = ax-x1;
+  y0 = trim(ay);
+  ay = ay-y0;
+  y1 = trim(ay);
+  y2 = ay-y1;
+
+  val[0] = x0*x0;
+  val[1] = y0*y0;
+  val[2] = 2*x0*x1;
+  val[3] = 2*y0*y1;
+  val[4] = x1*x1;
+  val[5] = y1*y1;
+  val[6] = 2*x0*x2;
+  val[7] = 2*y0*y2;
+  val[8] = 2*x1*x2;
+  val[9] = 2*y1*y2;
+  val[10] = x2*x2;
+  val[11] = y2*y2;
+
+  /* Bubble sort. */
+
+  do {
+    sorted = 1;
+    for (i=0;i<11;i++) {
+      if (val[i] < val[i+1]) {
+	sorted = 0;
+	t = val[i];
+	val[i] = val[i+1];
+	val[i+1] = t;
+      }
+    }
+  } while (!sorted);
+
+  hm1 = -1;
+  for (i=0;i<12;i++){
+    hm1 += val[i];
+  }
+  return (complex<double>(0.5 * log1p(hm1), atan2(y, x)));
+}
+  
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> log(const complex<ValueType>& z){
+  return complex<ValueType>(std::log(thrust::abs(z)),thrust::arg(z));
+}
+
+template <>
+__host__ __device__
+inline complex<double> log(const complex<double>& z){
+  return detail::complex::clog(z);
+}
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> log10(const complex<ValueType>& z){ 
+  // Using the explicit literal prevents compile time warnings in
+  // devices that don't support doubles 
+  return thrust::log(z)/ValueType(2.30258509299404568402);
+}
+
+} // namespace thrust
+    
diff --git a/thrust/thrust/detail/complex/clogf.h b/thrust/thrust/detail/complex/clogf.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f3314ed2635c28ff5627235525da9c1fa8709ad
--- /dev/null
+++ b/thrust/thrust/detail/complex/clogf.h
@@ -0,0 +1,198 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSDs msun:*/
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{
+
+using thrust::complex;
+
+/* round down to 8 = 24/3 bits */
+__host__ __device__ inline
+float trim(float x){
+  uint32_t hx;
+  get_float_word(hx, x);
+  hx &= 0xffff0000;
+  float ret;
+  set_float_word(ret,hx);
+  return ret;
+}
+
+
+__host__ __device__ inline
+complex<float> clogf(const complex<float>& z){
+
+  // Adapted from FreeBSDs msun
+  float x, y;
+  float ax, ay;
+  float x0, y0, x1, y1, x2, y2, t, hm1;
+  float val[12];
+  int i, sorted;	
+  const float e = 2.7182818284590452354f;
+
+  x = z.real();
+  y = z.imag();
+
+  /* Handle NaNs using the general formula to mix them right. */
+  if (x != x || y != y){
+    return (complex<float>(std::log(norm(z)), std::atan2(y, x)));
+  }
+
+  ax = std::abs(x);
+  ay = std::abs(y);
+  if (ax < ay) {
+    t = ax;
+    ax = ay;
+    ay = t;
+  }
+
+  /*
+   * To avoid unnecessary overflow, if x and y are very large, divide x
+   * and y by M_E, and then add 1 to the logarithm.  This depends on
+   * M_E being larger than sqrt(2).
+   * There is a potential loss of accuracy caused by dividing by M_E,
+   * but this case should happen extremely rarely.
+   */
+  // For high values of ay -> hypotf(FLT_MAX,ay) = inf
+  // We expect that for values at or below ay = 1e34f this should not happen
+  if (ay > 1e34f){ 
+    return (complex<float>(std::log(hypotf(x / e, y / e)) + 1.0f, std::atan2(y, x)));
+  }
+  if (ax == 1.f) {
+    if (ay < 1e-19f){
+      return (complex<float>((ay * 0.5f) * ay, std::atan2(y, x)));
+    }
+    return (complex<float>(log1pf(ay * ay) * 0.5f, std::atan2(y, x)));
+  }
+
+  /*
+   * Because atan2 and hypot conform to C99, this also covers all the
+   * edge cases when x or y are 0 or infinite.
+   */
+  if (ax < 1e-6f || ay < 1e-6f || ax > 1e6f || ay > 1e6f){
+    return (complex<float>(std::log(hypotf(x, y)), std::atan2(y, x)));
+  }
+
+  /* 
+   * From this point on, we don't need to worry about underflow or
+   * overflow in calculating ax*ax or ay*ay.
+   */
+
+  /* Some easy cases. */
+
+  if (ax >= 1.0f){
+    return (complex<float>(log1pf((ax-1.f)*(ax+1.f) + ay*ay) * 0.5f, atan2(y, x)));
+  }
+
+  if (ax*ax + ay*ay <= 0.7f){
+    return (complex<float>(std::log(ax*ax + ay*ay) * 0.5f, std::atan2(y, x)));
+  }
+
+  /*
+   * Take extra care so that ULP of real part is small if hypot(x,y) is
+   * moderately close to 1.
+   */
+
+
+  x0 = trim(ax);
+  ax = ax-x0;
+  x1 = trim(ax);
+  x2 = ax-x1;
+  y0 = trim(ay);
+  ay = ay-y0;
+  y1 = trim(ay);
+  y2 = ay-y1;
+
+  val[0] = x0*x0;
+  val[1] = y0*y0;
+  val[2] = 2*x0*x1;
+  val[3] = 2*y0*y1;
+  val[4] = x1*x1;
+  val[5] = y1*y1;
+  val[6] = 2*x0*x2;
+  val[7] = 2*y0*y2;
+  val[8] = 2*x1*x2;
+  val[9] = 2*y1*y2;
+  val[10] = x2*x2;
+  val[11] = y2*y2;
+
+  /* Bubble sort. */
+
+  do {
+    sorted = 1;
+    for (i=0;i<11;i++) {
+      if (val[i] < val[i+1]) {
+	sorted = 0;
+	t = val[i];
+	val[i] = val[i+1];
+	val[i+1] = t;
+      }
+    }
+  } while (!sorted);
+
+  hm1 = -1;
+  for (i=0;i<12;i++){
+    hm1 += val[i];
+  }
+  return (complex<float>(0.5f * log1pf(hm1), atan2(y, x)));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__
+inline complex<float> log(const complex<float>& z){
+  return detail::complex::clogf(z);
+}
+
+} // namespace thrust
+    
diff --git a/thrust/thrust/detail/complex/complex.inl b/thrust/thrust/detail/complex/complex.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2e2a106bc8f32dee406f41452d265b348b95f6db
--- /dev/null
+++ b/thrust/thrust/detail/complex/complex.inl
@@ -0,0 +1,353 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/complex.h>
+
+#include <thrust/type_traits/is_trivially_relocatable.h>
+
+namespace thrust
+{
+
+/* --- Constructors --- */
+
+#if THRUST_CPP_DIALECT < 2011
+template <typename T>
+__host__ __device__
+complex<T>::complex()
+{
+  real(T());
+  imag(T());
+}
+#endif
+
+template <typename T>
+__host__ __device__
+complex<T>::complex(const T& re)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{re, T()}
+{}
+#else
+{
+  real(re);
+  imag(T());
+}
+#endif
+
+
+template <typename T>
+__host__ __device__
+complex<T>::complex(const T& re, const T& im)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{re, im}
+{}
+#else
+{
+  real(re);
+  imag(im);
+}
+#endif
+
+#if THRUST_CPP_DIALECT < 2011
+template <typename T>
+__host__ __device__
+complex<T>::complex(const complex<T>& z)
+{
+  real(z.real());
+  imag(z.imag());
+}
+#endif
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>::complex(const complex<U>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  // We do a functional-style cast here to suppress conversion warnings.
+  : data{T(z.real()), T(z.imag())}
+{}
+#else
+{
+  real(T(z.real()));
+  imag(T(z.imag()));
+}
+#endif
+
+template <typename T>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>::complex(const std::complex<T>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{THRUST_STD_COMPLEX_REAL(z), THRUST_STD_COMPLEX_IMAG(z)}
+{}
+#else
+{
+  real(THRUST_STD_COMPLEX_REAL(z));
+  imag(THRUST_STD_COMPLEX_IMAG(z));
+}
+#endif
+
+template <typename T>
+template <typename U>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>::complex(const std::complex<U>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  // We do a functional-style cast here to suppress conversion warnings.
+  : data{T(THRUST_STD_COMPLEX_REAL(z)), T(THRUST_STD_COMPLEX_IMAG(z))}
+{}
+#else
+{
+  real(T(THRUST_STD_COMPLEX_REAL(z)));
+  imag(T(THRUST_STD_COMPLEX_IMAG(z)));
+}
+#endif
+
+
+
+/* --- Assignment Operators --- */
+
+template <typename T>
+__host__ __device__
+complex<T>& complex<T>::operator=(const T& re)
+{
+  real(re);
+  imag(T());
+  return *this;
+}
+
+#if THRUST_CPP_DIALECT < 2011
+template <typename T>
+__host__ __device__
+complex<T>& complex<T>::operator=(const complex<T>& z)
+{
+  real(z.real());
+  imag(z.imag());
+  return *this;
+}
+#endif
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator=(const complex<U>& z)
+{
+  real(T(z.real()));
+  imag(T(z.imag()));
+  return *this;
+}
+
+template <typename T>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>& complex<T>::operator=(const std::complex<T>& z)
+{
+  real(THRUST_STD_COMPLEX_REAL(z));
+  imag(THRUST_STD_COMPLEX_IMAG(z));
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>& complex<T>::operator=(const std::complex<U>& z)
+{
+  real(T(THRUST_STD_COMPLEX_REAL(z)));
+  imag(T(THRUST_STD_COMPLEX_IMAG(z)));
+  return *this;
+}
+
+
+
+/* --- Compound Assignment Operators --- */
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator+=(const complex<U>& z)
+{
+  *this = *this + z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator-=(const complex<U>& z)
+{
+  *this = *this - z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator*=(const complex<U>& z)
+{
+  *this = *this * z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator/=(const complex<U>& z)
+{
+  *this = *this / z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator+=(const U& z)
+{
+  *this = *this + z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator-=(const U& z)
+{
+  *this = *this - z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator*=(const U& z)
+{
+  *this = *this * z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator/=(const U& z)
+{
+  *this = *this / z;
+  return *this;
+}
+
+
+
+/* --- Equality Operators --- */
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const complex<T1>& y)
+{
+  return x.real() == y.real() && x.imag() == y.imag();
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const complex<T0>& x, const std::complex<T1>& y)
+{
+  return x.real() == THRUST_STD_COMPLEX_REAL(y) && x.imag() == THRUST_STD_COMPLEX_IMAG(y);
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const std::complex<T0>& x, const complex<T1>& y)
+{
+  return THRUST_STD_COMPLEX_REAL(x) == y.real() && THRUST_STD_COMPLEX_IMAG(x) == y.imag();
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const T0& x, const complex<T1>& y)
+{
+  return x == y.real() && y.imag() == T1();
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const T1& y)
+{
+  return x.real() == y && x.imag() == T1();
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const complex<T1>& y)
+{
+  return !(x == y);
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const complex<T0>& x, const std::complex<T1>& y)
+{
+  return !(x == y);
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const std::complex<T0>& x, const complex<T1>& y)
+{
+  return !(x == y);
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const T0& x, const complex<T1>& y)
+{
+  return !(x == y);
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const T1& y)
+{
+  return !(x == y);
+}
+
+template <typename T>
+struct proclaim_trivially_relocatable<complex<T> > : thrust::true_type {};
+
+} // end namespace thrust
+
+#include <thrust/detail/complex/arithmetic.h>
+#include <thrust/detail/complex/cproj.h>
+#include <thrust/detail/complex/cexp.h>
+#include <thrust/detail/complex/cexpf.h>
+#include <thrust/detail/complex/clog.h>
+#include <thrust/detail/complex/clogf.h>
+#include <thrust/detail/complex/cpow.h>
+#include <thrust/detail/complex/ccosh.h>
+#include <thrust/detail/complex/ccoshf.h>
+#include <thrust/detail/complex/csinh.h>
+#include <thrust/detail/complex/csinhf.h>
+#include <thrust/detail/complex/ctanh.h>
+#include <thrust/detail/complex/ctanhf.h>
+#include <thrust/detail/complex/csqrt.h>
+#include <thrust/detail/complex/csqrtf.h>
+#include <thrust/detail/complex/catrig.h>
+#include <thrust/detail/complex/catrigf.h>
+#include <thrust/detail/complex/stream.h>
+
diff --git a/thrust/thrust/detail/complex/cpow.h b/thrust/thrust/detail/complex/cpow.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d6ad051eb18b47cb628a1673e64ba6584d52de8
--- /dev/null
+++ b/thrust/thrust/detail/complex/cpow.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust {
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return exp(log(complex<T>(x)) * complex<T>(y));
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return exp(log(complex<T>(x)) * T(y));
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  // Find `log` by ADL.
+  using std::log;
+  return exp(log(T(x)) * complex<T>(y));
+}
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/complex/cproj.h b/thrust/thrust/detail/complex/cproj.h
new file mode 100644
index 0000000000000000000000000000000000000000..563c92f69764323f98066c37d06227a14a50a3b4
--- /dev/null
+++ b/thrust/thrust/detail/complex/cproj.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{	 
+__host__ __device__
+inline complex<float> cprojf(const complex<float>& z){
+  if(!isinf(z.real()) && !isinf(z.imag())){
+    return z;
+  }else{
+    // std::numeric_limits<T>::infinity() doesn't run on the GPU
+    return complex<float>(infinity<float>(), copysignf(0.0, z.imag()));
+  }
+}
+  
+__host__ __device__
+inline complex<double> cproj(const complex<double>& z){
+  if(!isinf(z.real()) && !isinf(z.imag())){
+    return z;
+  }else{
+    // std::numeric_limits<T>::infinity() doesn't run on the GPU
+    return complex<double>(infinity<double>(), copysign(0.0, z.imag()));
+  }
+}
+
+}
+ 
+}
+
+template <typename T>
+__host__ __device__
+inline thrust::complex<T> proj(const thrust::complex<T>& z){
+  return detail::complex::cproj(z);
+}
+  
+
+template <>
+__host__ __device__
+inline thrust::complex<double> proj(const thrust::complex<double>& z){
+  return detail::complex::cproj(z);
+}
+  
+template <>
+__host__ __device__
+inline thrust::complex<float> proj(const thrust::complex<float>& z){
+  return detail::complex::cprojf(z);
+}
+
+}
+
diff --git a/thrust/thrust/detail/complex/csinh.h b/thrust/thrust/detail/complex/csinh.h
new file mode 100644
index 0000000000000000000000000000000000000000..869f367f2657faf53c9b7b1d99f5498cd8619cd3
--- /dev/null
+++ b/thrust/thrust/detail/complex/csinh.h
@@ -0,0 +1,205 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_csinh.c
+ */
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<double> csinh(const complex<double>& z){
+  double x, y, h;
+  uint32_t hx, hy, ix, iy, lx, ly;
+  const double huge = 8.98846567431157953864652595395e+307; // 0x1p1023;
+
+  x = z.real();
+  y = z.imag();
+
+  extract_words(hx, lx, x);
+  extract_words(hy, ly, y);
+
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+
+  /* Handle the nearly-non-exceptional cases where x and y are finite. */
+  if (ix < 0x7ff00000 && iy < 0x7ff00000) {
+    if ((iy | ly) == 0)
+      return (complex<double>(sinh(x), y));
+    if (ix < 0x40360000)	/* small x: normal case */
+      return (complex<double>(sinh(x) * cos(y), cosh(x) * sin(y)));
+
+    /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x40862e42) {
+      /* x < 710: exp(|x|) won't overflow */
+      h = exp(fabs(x)) * 0.5;
+      return (complex<double>(copysign(h, x) * cos(y), h * sin(y)));
+    } else if (ix < 0x4096bbaa) {
+      /* x < 1455: scale to avoid overflow */
+      complex<double> z_ = ldexp_cexp(complex<double>(fabs(x), y), -1);
+      return (complex<double>(z_.real() * copysign(1.0, x), z_.imag()));
+    } else {
+      /* x >= 1455: the result always overflows */
+      h = huge * x;
+      return (complex<double>(h * cos(y), h * h * sin(y)));
+    }
+  }
+
+  /*
+   * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as dNaN.  Raise the invalid floating-point exception.
+   *
+   * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as d(NaN).
+   */
+  if ((ix | lx) == 0 && iy >= 0x7ff00000)
+    return (complex<double>(copysign(0.0, x * (y - y)), y - y));
+
+  /*
+   * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+   *
+   * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+   */
+  if ((iy | ly) == 0 && ix >= 0x7ff00000) {
+    if (((hx & 0xfffff) | lx) == 0)
+      return (complex<double>(x, y));
+    return (complex<double>(x, copysign(0.0, y)));
+  }
+
+  /*
+   * sinh(x +- I Inf) = dNaN + I dNaN.
+   * Raise the invalid floating-point exception for finite nonzero x.
+   *
+   * sinh(x + I NaN) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero x.  Choice = don't raise (except for signaling NaNs).
+   */
+  if (ix < 0x7ff00000 && iy >= 0x7ff00000)
+    return (complex<double>(y - y, x * (y - y)));
+
+  /*
+   * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+   * The sign of Inf in the result is unspecified.  Choice = normally
+   * the same as d(NaN).
+   *
+   * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+   * The sign of Inf in the result is unspecified.  Choice = always +.
+   * Raise the invalid floating-point exception.
+   *
+   * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+   */
+  if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) {
+    if (iy >= 0x7ff00000)
+      return (complex<double>(x * x, x * (y - y)));
+    return (complex<double>(x * cos(y), infinity<double>() * sin(y)));
+  }
+
+  /*
+   * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+   *
+   * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception.
+   * Choice = raise.
+   *
+   * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero y.  Choice = don't raise (except for signaling NaNs).
+   */
+  return (complex<double>((x * x) * (y - y), (x + x) * (y - y)));
+}
+
+__host__ __device__ inline
+complex<double> csin(complex<double> z){
+  /* csin(z) = -I * csinh(I * z) */
+  z = csinh(complex<double>(-z.imag(), z.real()));
+  return (complex<double>(z.imag(), -z.real()));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> sin(const complex<ValueType>& z){
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(std::sin(re) * std::cosh(im), 
+			    std::cos(re) * std::sinh(im));
+}
+
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> sinh(const complex<ValueType>& z){
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(std::sinh(re) * std::cos(im), 
+			    std::cosh(re) * std::sin(im));
+}
+
+template <>
+__host__ __device__
+inline complex<double> sin(const complex<double>& z){
+  return detail::complex::csin(z);
+}
+
+template <>
+__host__ __device__
+inline complex<double> sinh(const complex<double>& z){
+  return detail::complex::csinh(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/csinhf.h b/thrust/thrust/detail/complex/csinhf.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf4fb0816478f9882fdbe9082b8f4c266d713206
--- /dev/null
+++ b/thrust/thrust/detail/complex/csinhf.h
@@ -0,0 +1,142 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/s_csinhf.c
+ */
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<float> csinhf(const complex<float>& z){
+
+  float x, y, h;
+  uint32_t hx, hy, ix, iy;
+
+  const float huge = 1.70141183460469231731687303716e+38; //0x1p127;
+
+  x = z.real();
+  y = z.imag();
+
+  get_float_word(hx, x);
+  get_float_word(hy, y);
+
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+
+  if (ix < 0x7f800000 && iy < 0x7f800000) {
+    if (iy == 0)
+      return (complex<float>(sinhf(x), y));
+    if (ix < 0x41100000)	/* small x: normal case */
+      return (complex<float>(sinhf(x) * cosf(y), coshf(x) * sinf(y)));
+
+    /* |x| >= 9, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x42b17218) {
+      /* x < 88.7: expf(|x|) won't overflow */
+      h = expf(fabsf(x)) * 0.5f;
+      return (complex<float>(copysignf(h, x) * cosf(y), h * sinf(y)));
+    } else if (ix < 0x4340b1e7) {
+      /* x < 192.7: scale to avoid overflow */
+      complex<float> z_ = ldexp_cexpf(complex<float>(fabsf(x), y), -1);
+      return (complex<float>(z_.real() * copysignf(1.0f, x), z_.imag()));
+    } else {
+      /* x >= 192.7: the result always overflows */
+      h = huge * x;
+      return (complex<float>(h * cosf(y), h * h * sinf(y)));
+    }
+  }
+
+  if (ix == 0 && iy >= 0x7f800000)
+    return (complex<float>(copysignf(0, x * (y - y)), y - y));
+
+  if (iy == 0 && ix >= 0x7f800000) {
+    if ((hx & 0x7fffff) == 0)
+      return (complex<float>(x, y));
+    return (complex<float>(x, copysignf(0.0f, y)));
+  }
+
+  if (ix < 0x7f800000 && iy >= 0x7f800000)
+    return (complex<float>(y - y, x * (y - y)));
+
+  if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) {
+    if (iy >= 0x7f800000)
+      return (complex<float>(x * x, x * (y - y)));
+    return (complex<float>(x * cosf(y), infinity<float>() * sinf(y)));
+  }
+
+  return (complex<float>((x * x) * (y - y), (x + x) * (y - y)));
+}
+
+__host__ __device__ inline
+complex<float> csinf(complex<float> z){
+  z = csinhf(complex<float>(-z.imag(), z.real()));
+  return (complex<float>(z.imag(), -z.real()));
+}
+      
+} // namespace complex
+
+} // namespace detail
+  
+template <>
+__host__ __device__
+inline complex<float> sin(const complex<float>& z){
+  return detail::complex::csinf(z);
+}
+
+template <>
+__host__ __device__
+inline complex<float> sinh(const complex<float>& z){
+  return detail::complex::csinhf(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/csqrt.h b/thrust/thrust/detail/complex/csqrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcffbee9540d85b7b1c226d6ad3d332876533f8f
--- /dev/null
+++ b/thrust/thrust/detail/complex/csqrt.h
@@ -0,0 +1,152 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2007 David Schultz <das@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *    freebsd/lib/msun/src/s_csqrt.c
+ */
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<double> csqrt(const complex<double>& z){
+  complex<double> result;
+  double a, b;
+  double t;
+  int scale;
+
+  /* We risk spurious overflow for components >= DBL_MAX / (1 + sqrt(2)). */
+  const double THRESH = 7.446288774449766337959726e+307;
+
+  a = z.real();
+  b = z.imag();
+
+  /* Handle special cases. */
+  if (z == 0.0)
+    return (complex<double>(0.0, b));
+  if (isinf(b))
+    return (complex<double>(infinity<double>(), b));
+  if (isnan(a)) {
+    t = (b - b) / (b - b);	/* raise invalid if b is not a NaN */
+    return (complex<double>(a, t));	/* return NaN + NaN i */
+  }
+  if (isinf(a)) {
+    /*
+     * csqrt(inf + NaN i)  = inf +  NaN i
+     * csqrt(inf + y i)    = inf +  0 i
+     * csqrt(-inf + NaN i) = NaN +- inf i
+     * csqrt(-inf + y i)   = 0   +  inf i
+     */
+    if (signbit(a))
+      return (complex<double>(fabs(b - b), copysign(a, b)));
+    else
+      return (complex<double>(a, copysign(b - b, b)));
+  }
+  /*
+   * The remaining special case (b is NaN) is handled just fine by
+   * the normal code path below.
+   */
+
+  // DBL_MIN*2
+  const double low_thresh = 4.450147717014402766180465e-308;
+  scale = 0;
+
+  if (fabs(a) >= THRESH || fabs(b) >= THRESH) {
+    /* Scale to avoid overflow. */
+    a *= 0.25;
+    b *= 0.25;
+    scale = 1;
+  }else if (fabs(a) <= low_thresh && fabs(b) <= low_thresh) {
+    /* Scale to avoid underflow. */
+    a *= 4.0;
+    b *= 4.0;
+    scale = 2;
+  }
+	
+
+  /* Algorithm 312, CACM vol 10, Oct 1967. */
+  if (a >= 0.0) {
+    t = sqrt((a + hypot(a, b)) * 0.5);
+    result = complex<double>(t, b / (2 * t));
+  } else {
+    t = sqrt((-a + hypot(a, b)) * 0.5);
+    result = complex<double>(fabs(b) / (2 * t), copysign(t, b));
+  }
+
+  /* Rescale. */
+  if (scale == 1)
+    return (result * 2.0);
+  else if (scale == 2)
+    return (result * 0.5);
+  else
+    return (result);
+}
+      
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> sqrt(const complex<ValueType>& z){
+  return thrust::polar(std::sqrt(thrust::abs(z)),thrust::arg(z)/ValueType(2));
+}
+
+template <>
+__host__ __device__
+inline complex<double> sqrt(const complex<double>& z){
+  return detail::complex::csqrt(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/csqrtf.h b/thrust/thrust/detail/complex/csqrtf.h
new file mode 100644
index 0000000000000000000000000000000000000000..125d4b60d72b2345cce4babcb9b84cc2b7122110
--- /dev/null
+++ b/thrust/thrust/detail/complex/csqrtf.h
@@ -0,0 +1,147 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2007 David Schultz <das@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *    freebsd/lib/msun/src/s_csqrt.c
+ */
+
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<float> csqrtf(const complex<float>& z){
+  float a = z.real(), b = z.imag();
+  float t;
+  int scale;
+  complex<float> result;
+
+  /* We risk spurious overflow for components >= FLT_MAX / (1 + sqrt(2)). */
+  const float THRESH = 1.40949553037932e+38f;
+
+  /* Handle special cases. */
+  if (z == 0.0f)
+    return (complex<float>(0, b));
+  if (isinf(b))
+    return (complex<float>(infinity<float>(), b));
+  if (isnan(a)) {
+    t = (b - b) / (b - b);	/* raise invalid if b is not a NaN */
+    return (complex<float>(a, t));	/* return NaN + NaN i */
+  }
+  if (isinf(a)) {
+    /*
+     * csqrtf(inf + NaN i)  = inf +  NaN i
+     * csqrtf(inf + y i)    = inf +  0 i
+     * csqrtf(-inf + NaN i) = NaN +- inf i
+     * csqrtf(-inf + y i)   = 0   +  inf i
+     */
+    if (signbit(a))
+      return (complex<float>(fabsf(b - b), copysignf(a, b)));
+    else
+      return (complex<float>(a, copysignf(b - b, b)));
+  }
+  /*
+   * The remaining special case (b is NaN) is handled just fine by
+   * the normal code path below.
+   */
+
+  /* 
+   * Unlike in the FreeBSD code we'll avoid using double precision as
+   * not all hardware supports it.
+   */
+
+  // FLT_MIN*2
+  const float low_thresh = 2.35098870164458e-38f;
+  scale = 0;
+
+  if (fabsf(a) >= THRESH || fabsf(b) >= THRESH) {
+    /* Scale to avoid overflow. */
+    a *= 0.25f;
+    b *= 0.25f;
+    scale = 1;
+  }else if (fabsf(a) <= low_thresh && fabsf(b) <= low_thresh) {
+    /* Scale to avoid underflow. */
+    a *= 4.f;
+    b *= 4.f;
+    scale = 2;
+  }
+
+  /* Algorithm 312, CACM vol 10, Oct 1967. */
+  if (a >= 0.0f) {
+    t = sqrtf((a + hypotf(a, b)) * 0.5f);
+    result = complex<float>(t, b / (2.0f * t));
+  } else {
+    t = sqrtf((-a + hypotf(a, b)) * 0.5f);
+    result = complex<float>(fabsf(b) / (2.0f * t), copysignf(t, b));
+  }
+
+  /* Rescale. */
+  if (scale == 1)
+    return (result * 2.0f);
+  else if (scale == 2)
+    return (result * 0.5f);
+  else
+    return (result);
+}      
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__
+inline complex<float> sqrt(const complex<float>& z){
+  return detail::complex::csqrtf(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/ctanh.h b/thrust/thrust/detail/complex/ctanh.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ef1590920db65c50582787fc953af1ffb582099
--- /dev/null
+++ b/thrust/thrust/detail/complex/ctanh.h
@@ -0,0 +1,200 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2011 David Schultz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *    freebsd/lib/msun/src/s_ctanh.c
+ */
+
+/*
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(x)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(x) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<double> ctanh(const complex<double>& z){
+  double x, y;
+  double t, beta, s, rho, denom;
+  uint32_t hx, ix, lx;
+
+  x = z.real();
+  y = z.imag();
+
+  extract_words(hx, lx, x);
+  ix = hx & 0x7fffffff;
+
+  /*
+   * ctanh(NaN + i 0) = NaN + i 0
+   *
+   * ctanh(NaN + i y) = NaN + i NaN		for y != 0
+   *
+   * The imaginary part has the sign of x*sin(2*y), but there's no
+   * special effort to get this right.
+   *
+   * ctanh(+-Inf +- i Inf) = +-1 +- 0
+   *
+   * ctanh(+-Inf + i y) = +-1 + 0 sin(2y)		for y finite
+   *
+   * The imaginary part of the sign is unspecified.  This special
+   * case is only needed to avoid a spurious invalid exception when
+   * y is infinite.
+   */
+  if (ix >= 0x7ff00000) {
+    if ((ix & 0xfffff) | lx)	/* x is NaN */
+      return (complex<double>(x, (y == 0 ? y : x * y)));
+    set_high_word(x, hx - 0x40000000);	/* x = copysign(1, x) */
+    return (complex<double>(x, copysign(0.0, isinf(y) ? y : sin(y) * cos(y))));
+  }
+
+  /*
+   * ctanh(x + i NAN) = NaN + i NaN
+   * ctanh(x +- i Inf) = NaN + i NaN
+   */
+  if (!isfinite(y))
+    return (complex<double>(y - y, y - y));
+
+  /*
+   * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+   * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+   * We use a modified formula to avoid spurious overflow.
+   */
+  if (ix >= 0x40360000) {	/* x >= 22 */
+    double exp_mx = exp(-fabs(x));
+    return (complex<double>(copysign(1.0, x),
+			    4.0 * sin(y) * cos(y) * exp_mx * exp_mx));
+  }
+
+  /* Kahan's algorithm */
+  t = tan(y);
+  beta = 1.0 + t * t;	/* = 1 / cos^2(y) */
+  s = sinh(x);
+  rho = sqrt(1.0 + s * s);	/* = cosh(x) */
+  denom = 1.0 + beta * s * s;
+  return (complex<double>((beta * rho * s) / denom, t / denom));
+}
+
+__host__ __device__ inline
+complex<double> ctan(complex<double> z){
+  /* ctan(z) = -I * ctanh(I * z) */
+  z = ctanh(complex<double>(-z.imag(), z.real()));
+  return (complex<double>(z.imag(), -z.real()));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> tan(const complex<ValueType>& z){
+  return sin(z)/cos(z);
+}
+
+template <typename ValueType>
+__host__ __device__
+inline complex<ValueType> tanh(const complex<ValueType>& z){
+  // This implementation seems better than the simple sin/cos
+  return (thrust::exp(ValueType(2)*z)-ValueType(1))/
+    (thrust::exp(ValueType(2)*z)+ValueType(1));
+}
+
+template <>
+__host__ __device__
+inline complex<double> tan(const complex<double>& z){
+  return detail::complex::ctan(z);
+}
+  
+template <>
+__host__ __device__
+inline complex<double> tanh(const complex<double>& z){
+  return detail::complex::ctanh(z);
+}
+  
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/ctanhf.h b/thrust/thrust/detail/complex/ctanhf.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6923d1df6d723092fc7522dd197bb66fa7f3fa4
--- /dev/null
+++ b/thrust/thrust/detail/complex/ctanhf.h
@@ -0,0 +1,124 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2011 David Schultz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia, filipe.c.maia@gmail.com:
+ *    freebsd/lib/msun/src/s_ctanhf.c
+ */
+
+/*
+ * Hyperbolic tangent of a complex argument z.  See ctanh.c for details.
+ */
+
+#pragma once
+
+#include <thrust/complex.h>
+#include <thrust/detail/complex/math_private.h>
+#include <cmath>
+
+namespace thrust{
+namespace detail{
+namespace complex{		      	
+
+using thrust::complex;
+
+__host__ __device__ inline
+complex<float> ctanhf(const complex<float>& z){
+  float x, y;
+  float t, beta, s, rho, denom;
+  uint32_t hx, ix;
+
+  x = z.real();
+  y = z.imag();
+
+  get_float_word(hx, x);
+  ix = hx & 0x7fffffff;
+
+  if (ix >= 0x7f800000) {
+    if (ix & 0x7fffff)
+      return (complex<float>(x, (y == 0.0f ? y : x * y)));
+    set_float_word(x, hx - 0x40000000);
+    return (complex<float>(x,
+			   copysignf(0, isinf(y) ? y : sinf(y) * cosf(y))));
+  }
+
+  if (!isfinite(y))
+    return (complex<float>(y - y, y - y));
+
+  if (ix >= 0x41300000) {	/* x >= 11 */
+    float exp_mx = expf(-fabsf(x));
+    return (complex<float>(copysignf(1.0f, x),
+			   4.0f * sinf(y) * cosf(y) * exp_mx * exp_mx));
+  }
+
+  t = tanf(y);
+  beta = 1.0f + t * t;
+  s = sinhf(x);
+  rho = sqrtf(1.0f + s * s);
+  denom = 1.0f + beta * s * s;
+  return (complex<float>((beta * rho * s) / denom, t / denom));
+}
+
+  __host__ __device__ inline
+  complex<float> ctanf(complex<float> z){
+    z = ctanhf(complex<float>(-z.imag(), z.real()));
+    return (complex<float>(z.imag(), -z.real()));
+  }
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__
+inline complex<float> tan(const complex<float>& z){
+  return detail::complex::ctanf(z);
+}
+
+template <>
+__host__ __device__
+inline complex<float> tanh(const complex<float>& z){
+  return detail::complex::ctanhf(z);
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/complex/math_private.h b/thrust/thrust/detail/complex/math_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc2d6357f2c169ee7e4e60f466dc09f4ed4b30d2
--- /dev/null
+++ b/thrust/thrust/detail/complex/math_private.h
@@ -0,0 +1,136 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* adapted from FreeBSD:
+ *    lib/msun/src/math_private.h
+ */
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/complex.h>
+#include <thrust/detail/cstdint.h>
+
+namespace thrust{
+namespace detail{
+namespace complex{
+
+using thrust::complex;
+
+typedef union
+{
+  float value;
+  uint32_t word;
+} ieee_float_shape_type;
+  
+__host__ __device__
+inline void get_float_word(uint32_t & i, float d){
+  ieee_float_shape_type gf_u;
+  gf_u.value = (d);
+  (i) = gf_u.word;
+}
+
+__host__ __device__
+inline void get_float_word(int32_t & i, float d){
+  ieee_float_shape_type gf_u;
+  gf_u.value = (d);
+  (i) = gf_u.word;
+}
+
+__host__ __device__
+inline void set_float_word(float & d, uint32_t i){
+  ieee_float_shape_type sf_u;
+  sf_u.word = (i);
+  (d) = sf_u.value;
+}
+
+// Assumes little endian ordering
+typedef union
+{
+  double value;
+  struct
+  {
+    uint32_t lsw;
+    uint32_t msw;
+  } parts;
+  struct
+  {
+    uint64_t w;
+  } xparts;
+} ieee_double_shape_type;
+  
+__host__ __device__ inline
+void get_high_word(uint32_t & i,double d){
+  ieee_double_shape_type gh_u;
+  gh_u.value = (d);
+  (i) = gh_u.parts.msw;                                   
+}
+  
+/* Set the more significant 32 bits of a double from an int.  */
+__host__ __device__ inline
+void set_high_word(double & d, uint32_t v){
+  ieee_double_shape_type sh_u;
+  sh_u.value = (d);
+  sh_u.parts.msw = (v);
+  (d) = sh_u.value;
+}
+  
+  
+__host__ __device__ inline 
+void  insert_words(double & d, uint32_t ix0, uint32_t ix1){
+  ieee_double_shape_type iw_u;
+  iw_u.parts.msw = (ix0);
+  iw_u.parts.lsw = (ix1);
+  (d) = iw_u.value;
+}
+  
+/* Get two 32 bit ints from a double.  */
+__host__ __device__ inline
+void  extract_words(uint32_t & ix0,uint32_t & ix1, double d){
+  ieee_double_shape_type ew_u;
+  ew_u.value = (d);
+  (ix0) = ew_u.parts.msw;
+  (ix1) = ew_u.parts.lsw;
+}
+  
+/* Get two 32 bit ints from a double.  */
+__host__ __device__ inline
+void  extract_words(int32_t & ix0,int32_t & ix1, double d){
+  ieee_double_shape_type ew_u;
+  ew_u.value = (d);
+  (ix0) = ew_u.parts.msw;
+  (ix1) = ew_u.parts.lsw;
+}
+  
+} // namespace complex
+
+} // namespace detail
+
+} // namespace thrust
+
+
+#include <thrust/detail/complex/c99math.h>
diff --git a/thrust/thrust/detail/complex/stream.h b/thrust/thrust/detail/complex/stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d87bbd548974a745da11521302d27524703f4a0
--- /dev/null
+++ b/thrust/thrust/detail/complex/stream.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2013 Filipe RNC Maia
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/complex.h>
+
+namespace thrust
+{
+template<typename ValueType,class charT, class traits>
+std::basic_ostream<charT, traits>& operator<<(std::basic_ostream<charT, traits>& os, const complex<ValueType>& z)
+{
+  os << '(' << z.real() << ',' << z.imag() << ')';
+  return os;
+}
+  
+template<typename ValueType, typename charT, class traits>
+std::basic_istream<charT, traits>&
+operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z)
+{
+  ValueType re, im;
+    
+  charT ch;
+  is >> ch;
+    
+  if(ch == '(')
+    {
+      is >> re >> ch;
+      if (ch == ',')
+        {
+          is >> im >> ch;
+          if (ch == ')')
+	    {
+	      z = complex<ValueType>(re, im);
+	    }
+          else
+	    {
+	      is.setstate(std::ios_base::failbit);
+	    }
+        }
+      else if (ch == ')')
+        {
+          z = re;
+        }
+      else
+        {
+          is.setstate(std::ios_base::failbit);
+        }
+    }
+  else
+    {
+      is.putback(ch);
+      is >> re;
+      z = re;
+    }
+  return is;
+}
+
+} // namespace thrust
diff --git a/thrust/thrust/detail/config.h b/thrust/thrust/detail/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a5573a410e6ee8ec7b062ee4bd330390fb37e9b
--- /dev/null
+++ b/thrust/thrust/detail/config.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+/*! \file config.h
+ *  \brief Defines platform configuration.
+ */
+
+#pragma once
+
+#include <thrust/version.h>
+#include <thrust/detail/config/config.h>
+
diff --git a/thrust/thrust/detail/config/compiler.h b/thrust/thrust/detail/config/compiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..644db93d4e00c4e81cd32f38eab017a6637ca9dd
--- /dev/null
+++ b/thrust/thrust/detail/config/compiler.h
@@ -0,0 +1,186 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file compiler.h
+ *  \brief Compiler-specific configuration
+ */
+
+#pragma once
+
+// enumerate host compilers we know about
+#define THRUST_HOST_COMPILER_UNKNOWN 0
+#define THRUST_HOST_COMPILER_MSVC    1
+#define THRUST_HOST_COMPILER_GCC     2
+#define THRUST_HOST_COMPILER_CLANG   3
+
+// enumerate device compilers we know about
+#define THRUST_DEVICE_COMPILER_UNKNOWN 0
+#define THRUST_DEVICE_COMPILER_MSVC    1
+#define THRUST_DEVICE_COMPILER_GCC     2
+#define THRUST_DEVICE_COMPILER_NVCC    3
+#define THRUST_DEVICE_COMPILER_CLANG   4
+
+// figure out which host compiler we're using
+// XXX we should move the definition of THRUST_DEPRECATED out of this logic
+#if   defined(_MSC_VER)
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC
+#define THRUST_MSVC_VERSION _MSC_VER
+#define THRUST_MSVC_VERSION_FULL _MSC_FULL_VER
+#elif defined(__clang__)
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_CLANG
+#define THRUST_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
+#elif defined(__GNUC__)
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_GCC
+#define THRUST_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if (THRUST_GCC_VERSION >= 50000)
+#define THRUST_MODERN_GCC
+#else
+#define THRUST_LEGACY_GCC
+#endif
+#else
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_UNKNOWN
+#endif // THRUST_HOST_COMPILER
+
+// figure out which device compiler we're using
+#if defined(__CUDACC__)
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_GCC
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+// CUDA-capable clang should behave similar to NVCC.
+#if defined(__CUDA__)
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC
+#else
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_CLANG
+#endif
+#else
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_UNKNOWN
+#endif
+
+// is the device compiler capable of compiling omp?
+#ifdef _OPENMP
+#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_TRUE
+#else
+#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_FALSE
+#endif // _OPENMP
+
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && !defined(__CUDA_ARCH__)
+  #define THRUST_DISABLE_MSVC_WARNING_BEGIN(x)                                \
+    __pragma(warning(push))                                                   \
+    __pragma(warning(disable : x))                                            \
+    /**/
+  #define THRUST_DISABLE_MSVC_WARNING_END(x)                                  \
+    __pragma(warning(pop))                                                    \
+    /**/
+#else
+  #define THRUST_DISABLE_MSVC_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_MSVC_WARNING_END(x)
+#endif
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) && !defined(__CUDA_ARCH__)
+  #define THRUST_IGNORE_CLANG_WARNING_IMPL(x)                                 \
+    THRUST_PP_STRINGIZE(clang diagnostic ignored x)                           \
+    /**/
+  #define THRUST_IGNORE_CLANG_WARNING(x)                                      \
+    THRUST_IGNORE_CLANG_WARNING_IMPL(THRUST_PP_STRINGIZE(x))                  \
+    /**/
+
+  #define THRUST_DISABLE_CLANG_WARNING_BEGIN(x)                               \
+    _Pragma("clang diagnostic push")                                          \
+    _Pragma(THRUST_IGNORE_CLANG_WARNING(x))                                   \
+    /**/
+  #define THRUST_DISABLE_CLANG_WARNING_END(x)                                 \
+    _Pragma("clang diagnostic pop")                                           \
+    /**/
+#else
+  #define THRUST_DISABLE_CLANG_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_CLANG_WARNING_END(x)
+#endif
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && !defined(__CUDA_ARCH__)
+  #define THRUST_IGNORE_GCC_WARNING_IMPL(x)                                   \
+    THRUST_PP_STRINGIZE(GCC diagnostic ignored x)                             \
+    /**/
+  #define THRUST_IGNORE_GCC_WARNING(x)                                        \
+    THRUST_IGNORE_GCC_WARNING_IMPL(THRUST_PP_STRINGIZE(x))                    \
+    /**/
+
+  #define THRUST_DISABLE_GCC_WARNING_BEGIN(x)                                 \
+    _Pragma("GCC diagnostic push")                                            \
+    _Pragma(THRUST_IGNORE_GCC_WARNING(x))                                     \
+    /**/
+  #define THRUST_DISABLE_GCC_WARNING_END(x)                                   \
+    _Pragma("GCC diagnostic pop")                                             \
+    /**/
+#else
+  #define THRUST_DISABLE_GCC_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_GCC_WARNING_END(x)
+#endif
+
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN               \
+  THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267)                                \
+  /**/
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END                 \
+  THRUST_DISABLE_MSVC_WARNING_END(4244 4267)                                  \
+  /**/
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(x)                  \
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN                     \
+  x;                                                                          \
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END                       \
+  /**/
+
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN               \
+  THRUST_DISABLE_MSVC_WARNING_BEGIN(4800)                                     \
+  /**/
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END                 \
+  THRUST_DISABLE_MSVC_WARNING_END(4800)                                       \
+  /**/
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING(x)                  \
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN                     \
+  x;                                                                          \
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END                       \
+  /**/
+
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_BEGIN                    \
+  THRUST_DISABLE_CLANG_WARNING_BEGIN(-Wself-assign)                           \
+  /**/
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_END                      \
+  THRUST_DISABLE_CLANG_WARNING_END(-Wself-assign)                             \
+  /**/
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(x)                       \
+  THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_BEGIN                          \
+  x;                                                                          \
+  THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_END                            \
+  /**/
+
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN     \
+  THRUST_DISABLE_CLANG_WARNING_BEGIN(-Wreorder)                               \
+  THRUST_DISABLE_GCC_WARNING_BEGIN(-Wreorder)                                 \
+  /**/
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END       \
+  THRUST_DISABLE_CLANG_WARNING_END(-Wreorder)                                 \
+  THRUST_DISABLE_GCC_WARNING_END(-Wreorder)                                   \
+  /**/
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING(x)        \
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN           \
+  x;                                                                          \
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END             \
+  /**/
+
+
diff --git a/thrust/thrust/detail/config/compiler_fence.h b/thrust/thrust/detail/config/compiler_fence.h
new file mode 100644
index 0000000000000000000000000000000000000000..c379abaf364b460031a93a0ad6d4ee3d8419ab78
--- /dev/null
+++ b/thrust/thrust/detail/config/compiler_fence.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/preprocessor.h>
+
+// TODO: Enable this or remove this file once nvGRAPH/CUSP migrates off of it.
+//#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+//  #pragma message("warning: The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 or C11 atomics instead.")
+//#else
+//  #warning The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 or C11 atomics instead.
+//#endif
+
+// msvc case
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+
+#ifndef _DEBUG
+
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+#define __thrust_compiler_fence() _ReadWriteBarrier()
+#else
+
+#define __thrust_compiler_fence() do {} while (0)
+
+#endif // _DEBUG
+
+// gcc case
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+
+#if THRUST_GCC_VERSION >= 40200 // atomic built-ins were introduced ~4.2
+#define __thrust_compiler_fence() __sync_synchronize()
+#else
+// allow the code to compile without any guarantees
+#define __thrust_compiler_fence() do {} while (0)
+#endif // THRUST_GCC_VERSION
+
+// unknown case
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+#define __thrust_compiler_fence() __sync_synchronize()
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_UNKNOWN
+
+// allow the code to compile without any guarantees
+#define __thrust_compiler_fence() do {} while (0)
+
+#endif
+
diff --git a/thrust/thrust/detail/config/config.h b/thrust/thrust/detail/config/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..800bc4c51a8bedd5dc922da8a980dc62f02c62aa
--- /dev/null
+++ b/thrust/thrust/detail/config/config.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file config.h
+ *  \brief Defines platform configuration.
+ */
+
+#pragma once
+
+// NOTE: The order of these #includes matters.
+
+#include <thrust/detail/config/simple_defines.h>
+#include <thrust/detail/config/compiler.h>
+#include <thrust/detail/config/cpp_dialect.h>
+#include <thrust/detail/config/cpp_compatibility.h>
+#include <thrust/detail/config/deprecated.h>
+// host_system.h & device_system.h must be #included as early as possible
+// because other config headers depend on it
+#include <thrust/detail/config/host_system.h>
+#include <thrust/detail/config/device_system.h>
+#include <thrust/detail/config/host_device.h>
+#include <thrust/detail/config/debug.h>
+#include <thrust/detail/config/forceinline.h>
+#include <thrust/detail/config/exec_check_disable.h>
+#include <thrust/detail/config/global_workarounds.h>
+
diff --git a/thrust/thrust/detail/config/cpp_compatibility.h b/thrust/thrust/detail/config/cpp_compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..646f57504d202adb9263ccd2b0e92e73e8c82921
--- /dev/null
+++ b/thrust/thrust/detail/config/cpp_compatibility.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#include <cstddef>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  ifndef __has_cpp_attribute
+#    define __has_cpp_attribute(X) 0
+#  endif
+
+#  if __has_cpp_attribute(nodiscard)
+#    define THRUST_NODISCARD [[nodiscard]]
+#  endif
+
+#  define THRUST_CONSTEXPR constexpr
+#  define THRUST_OVERRIDE override
+#  define THRUST_DEFAULT = default;
+#  define THRUST_NOEXCEPT noexcept
+#  define THRUST_FINAL final
+#else
+#  define THRUST_CONSTEXPR
+#  define THRUST_OVERRIDE
+#  define THRUST_DEFAULT {}
+#  define THRUST_NOEXCEPT throw()
+#  define THRUST_FINAL
+#endif
+
+#ifndef THRUST_NODISCARD
+#  define THRUST_NODISCARD
+#endif
+
+// FIXME: Combine THRUST_INLINE_CONSTANT and
+// THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly
+// supports `constexpr` globals in host and device code.
+#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__)
+// FIXME: Add this when NVCC supports inline variables.
+//#  if   THRUST_CPP_DIALECT >= 2017
+//#    define THRUST_INLINE_CONSTANT                 inline constexpr
+//#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
+#  if THRUST_CPP_DIALECT >= 2011
+#    define THRUST_INLINE_CONSTANT                 static const __device__
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
+#  else
+#    define THRUST_INLINE_CONSTANT                 static const __device__
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static const
+#  endif
+#else
+// FIXME: Add this when NVCC supports inline variables.
+//#  if   THRUST_CPP_DIALECT >= 2017
+//#    define THRUST_INLINE_CONSTANT                 inline constexpr
+//#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
+#  if THRUST_CPP_DIALECT >= 2011
+#    define THRUST_INLINE_CONSTANT                 static constexpr
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
+#  else
+#    define THRUST_INLINE_CONSTANT                 static const
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static const
+#  endif
+#endif
+
+#if defined(__NVCOMPILER_CUDA__)
+#  define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
+#  define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
+#  define THRUST_INCLUDE_DEVICE_CODE 1
+#  define THRUST_INCLUDE_HOST_CODE 1
+#elif defined(__CUDA_ARCH__)
+#  define THRUST_IS_DEVICE_CODE 1
+#  define THRUST_IS_HOST_CODE 0
+#  define THRUST_INCLUDE_DEVICE_CODE 1
+#  define THRUST_INCLUDE_HOST_CODE 0
+#else
+#  define THRUST_IS_DEVICE_CODE 0
+#  define THRUST_IS_HOST_CODE 1
+#  define THRUST_INCLUDE_DEVICE_CODE 0
+#  define THRUST_INCLUDE_HOST_CODE 1
+#endif
+
diff --git a/thrust/thrust/detail/config/cpp_dialect.h b/thrust/thrust/detail/config/cpp_dialect.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b7ecc2ebe1f3c525c08bc0691e82d5650f29423
--- /dev/null
+++ b/thrust/thrust/detail/config/cpp_dialect.h
@@ -0,0 +1,124 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cpp_dialect.h
+ *  \brief Detect the version of the C++ standard used by the compiler.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+
+// Deprecation warnings may be silenced by defining the following macros. These
+// may be combined.
+// - THRUST_IGNORE_DEPRECATED_CPP_DIALECT:
+//   Ignore all deprecated C++ dialects and outdated compilers.
+// - THRUST_IGNORE_DEPRECATED_CPP_11:
+//   Ignore deprecation warnings when compiling with C++11. C++03 and outdated
+//   compilers will still issue warnings.
+// - THRUST_IGNORE_DEPRECATED_COMPILER
+//   Ignore deprecation warnings when using deprecated compilers. Compiling
+//   with C++03 and C++11 will still issue warnings.
+
+// Check for the CUB opt-outs as well:
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_11) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_11)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_11
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_COMPILER) && \
+     defined(CUB_IGNORE_DEPRECATED_COMPILER)
+#  define    THRUST_IGNORE_DEPRECATED_COMPILER
+#endif
+
+#ifdef THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#  define THRUST_IGNORE_DEPRECATED_CPP_11
+#  define THRUST_IGNORE_DEPRECATED_COMPILER
+#endif
+
+// Define this to override the built-in detection.
+#ifndef THRUST_CPP_DIALECT
+
+// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
+// This macro is only defined in MSVC 2015U3+.
+#  ifdef _MSVC_LANG // Do not replace with THRUST_HOST_COMPILER test (see above)
+// MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11.
+#    if THRUST_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */
+#      define THRUST_CPLUSPLUS 201103L /* Fix to 2011 */
+#    else
+#      define THRUST_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */
+#    endif // MSVC 2015 C++14 fix
+#  else
+#    define THRUST_CPLUSPLUS __cplusplus
+#  endif
+
+// Detect current dialect:
+#  if THRUST_CPLUSPLUS < 201103L
+#    define THRUST_CPP_DIALECT 2003
+#  elif THRUST_CPLUSPLUS < 201402L
+#    define THRUST_CPP_DIALECT 2011
+#  elif THRUST_CPLUSPLUS < 201703L
+#    define THRUST_CPP_DIALECT 2014
+#  elif THRUST_CPLUSPLUS == 201703L
+#    define THRUST_CPP_DIALECT 2017
+#  elif THRUST_CPLUSPLUS > 201703L // unknown, but is higher than 2017.
+#    define THRUST_CPP_DIALECT 2020
+#  endif
+
+#  undef THRUST_CPLUSPLUS // cleanup
+
+#endif // !THRUST_CPP_DIALECT
+
+// Define THRUST_COMPILER_DEPRECATION macro:
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_COMP_DEPR_IMPL(msg) \
+    __pragma(message(__FILE__ ":" THRUST_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg))
+#  define THRUST_COMP_DEPR_IMPL0(x) THRUST_COMP_DEPR_IMPL1(x)
+#  define THRUST_COMP_DEPR_IMPL1(x) #x
+#else // clang / gcc:
+#  define THRUST_COMP_DEPR_IMPL(msg) THRUST_COMP_DEPR_IMPL0(GCC warning #msg)
+#  define THRUST_COMP_DEPR_IMPL0(expr) _Pragma(#expr)
+#  define THRUST_COMP_DEPR_IMPL1 /* intentionally blank */
+#endif
+
+#define THRUST_COMPILER_DEPRECATION(REQ, FIX) \
+  THRUST_COMP_DEPR_IMPL(Thrust requires REQ. Please FIX. Define THRUST_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+// Minimum required compiler checks:
+#ifndef THRUST_IGNORE_DEPRECATED_COMPILER
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && THRUST_GCC_VERSION < 50000
+     THRUST_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler);
+#  endif
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG && THRUST_CLANG_VERSION < 60000
+     THRUST_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler);
+#  endif
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && THRUST_MSVC_VERSION < 1910
+     THRUST_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler);
+#  endif
+#endif
+
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) && THRUST_CPP_DIALECT < 2014 && \
+    (THRUST_CPP_DIALECT != 2011 || !defined(THRUST_IGNORE_DEPRECATED_CPP_11))
+  THRUST_COMPILER_DEPRECATION(C++14, pass -std=c++14 to your compiler);
+#endif
+
+#undef THRUST_COMPILER_DEPRECATION
+#undef THRUST_COMP_DEPR_IMPL
+#undef THRUST_COMP_DEPR_IMPL0
+#undef THRUST_COMP_DEPR_IMPL1
diff --git a/thrust/thrust/detail/config/debug.h b/thrust/thrust/detail/config/debug.h
new file mode 100644
index 0000000000000000000000000000000000000000..16f65d67c9054f4a32a6c7a4e437b1cdb16a6c30
--- /dev/null
+++ b/thrust/thrust/detail/config/debug.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#ifndef THRUST_DEBUG
+#  ifndef NDEBUG
+#    if defined(DEBUG) || defined(_DEBUG)
+#      define THRUST_DEBUG 1
+#    endif // (DEBUG || _DEBUG)
+#  endif // NDEBUG
+#endif // THRUST_DEBUG
+
+#if THRUST_DEBUG
+#  ifndef __THRUST_SYNCHRONOUS
+#    define __THRUST_SYNCHRONOUS 1
+#  endif // __THRUST_SYNCHRONOUS
+#endif // THRUST_DEBUG
+
diff --git a/thrust/thrust/detail/config/deprecated.h b/thrust/thrust/detail/config/deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd18f3ac9282e9001308cc911cc5811b7093e560
--- /dev/null
+++ b/thrust/thrust/detail/config/deprecated.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file deprecated.h
+ *  \brief Defines the THRUST_DEPRECATED macro
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_DEPRECATED __declspec(deprecated)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#else
+#  define THRUST_DEPRECATED
+#endif
diff --git a/thrust/thrust/detail/config/device_system.h b/thrust/thrust/detail/config/device_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4106d3fbb744186a325c07dcd30651394365d0c
--- /dev/null
+++ b/thrust/thrust/detail/config/device_system.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// reserve 0 for undefined
+#define THRUST_DEVICE_SYSTEM_CUDA    1
+#define THRUST_DEVICE_SYSTEM_OMP     2
+#define THRUST_DEVICE_SYSTEM_TBB     3
+#define THRUST_DEVICE_SYSTEM_CPP     4
+
+#ifndef THRUST_DEVICE_SYSTEM
+#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA
+#endif // THRUST_DEVICE_SYSTEM
+
+// XXX make the use of THRUST_DEVICE_BACKEND an error in Thrust 1.7
+// XXX eliminate the following in Thrust 1.7
+
+#define THRUST_DEVICE_BACKEND_CUDA THRUST_DEVICE_SYSTEM_CUDA
+#define THRUST_DEVICE_BACKEND_OMP  THRUST_DEVICE_SYSTEM_OMP
+#define THRUST_DEVICE_BACKEND_TBB  THRUST_DEVICE_SYSTEM_TBB
+
+#ifdef THRUST_DEVICE_BACKEND
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#    pragma message("----------------------------------------------------------------------------------")
+#    pragma message("| WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |")
+#    pragma message("----------------------------------------------------------------------------------")
+#  else
+#    warning ----------------------------------------------------------------------------------
+#    warning | WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |
+#    warning ----------------------------------------------------------------------------------
+#  endif // THRUST_HOST_COMPILER
+#  undef THRUST_DEVICE_SYSTEM
+#  define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND
+#endif // THRUST_DEVICE_BACKEND
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#define __THRUST_DEVICE_SYSTEM_NAMESPACE cuda
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
+#define __THRUST_DEVICE_SYSTEM_NAMESPACE omp
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
+#define __THRUST_DEVICE_SYSTEM_NAMESPACE tbb
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
+#define __THRUST_DEVICE_SYSTEM_NAMESPACE cpp
+#endif
+
+#define __THRUST_DEVICE_SYSTEM_ROOT thrust/system/__THRUST_DEVICE_SYSTEM_NAMESPACE
+
diff --git a/thrust/thrust/detail/config/exec_check_disable.h b/thrust/thrust/detail/config/exec_check_disable.h
new file mode 100644
index 0000000000000000000000000000000000000000..114ca3853a9e148a5b52161c4409d52873dc5b3d
--- /dev/null
+++ b/thrust/thrust/detail/config/exec_check_disable.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file exec_check_disable.h
+ *  \brief Defines __thrust_exec_check_disable__
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #pragma nv_exec_check_disable is only recognized by NVCC.  Having a macro
+// expand to a #pragma (rather than _Pragma) only works with NVCC's compilation
+// model, not with other compilers.
+#if defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) && \
+    !(defined(__CUDA__) && defined(__clang__))
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#define __thrust_exec_check_disable__ __pragma("nv_exec_check_disable")
+#else // MSVC
+#define __thrust_exec_check_disable__ _Pragma("nv_exec_check_disable")
+#endif // MSVC
+
+#else
+
+#define __thrust_exec_check_disable__
+
+#endif
+
+
diff --git a/thrust/thrust/detail/config/forceinline.h b/thrust/thrust/detail/config/forceinline.h
new file mode 100644
index 0000000000000000000000000000000000000000..6641304258aa9229df152fc8c6a137ec52df2302
--- /dev/null
+++ b/thrust/thrust/detail/config/forceinline.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file forceinline.h
+ *  \brief Defines __thrust_forceinline__
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if defined(__CUDACC__)
+
+#define __thrust_forceinline__ __forceinline__
+
+#else
+
+// TODO add 
+
+#define __thrust_forceinline__
+
+#endif
+
diff --git a/thrust/thrust/detail/config/global_workarounds.h b/thrust/thrust/detail/config/global_workarounds.h
new file mode 100644
index 0000000000000000000000000000000000000000..9800f03593ac253914c149815a497bf01baee4b2
--- /dev/null
+++ b/thrust/thrust/detail/config/global_workarounds.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+
+// XXX workaround gcc 4.8+'s complaints about unused local typedefs by silencing them globally
+#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION >= 40800)
+#  if defined(__NVCC__) && (CUDART_VERSION >= 6000)
+#    pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#  endif // nvcc & cuda 6+
+#endif // gcc 4.8
+
diff --git a/thrust/thrust/detail/config/host_device.h b/thrust/thrust/detail/config/host_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..5540f91260d807bfb2ef06064767aeaccea2fc1a
--- /dev/null
+++ b/thrust/thrust/detail/config/host_device.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file host_device.h
+ *  \brief Defines __host__ and __device__
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// since nvcc defines __host__ and __device__ for us,
+// and only nvcc knows what to do with __host__ and __device__,
+// define them to be the empty string for other compilers
+
+#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
+
+// since __host__ & __device__ might have already be defined, only
+// #define them if not defined already
+// XXX this will break if the client does #include <host_defines.h> later
+
+#ifndef __host__
+#define __host__
+#endif // __host__
+
+#ifndef __device__
+#define __device__
+#endif // __device__
+
+#endif
+
diff --git a/thrust/thrust/detail/config/host_system.h b/thrust/thrust/detail/config/host_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c13878032206d560e0ea5115b014af3841dc7b3
--- /dev/null
+++ b/thrust/thrust/detail/config/host_system.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// reserve 0 for undefined
+#define THRUST_HOST_SYSTEM_CPP    1
+#define THRUST_HOST_SYSTEM_OMP    2
+#define THRUST_HOST_SYSTEM_TBB    3
+
+#ifndef THRUST_HOST_SYSTEM
+#define THRUST_HOST_SYSTEM THRUST_HOST_SYSTEM_CPP
+#endif // THRUST_HOST_SYSTEM
+
+// XXX make the use of THRUST_HOST_BACKEND an error in Thrust 1.7
+// XXX eliminate the following in Thrust 1.7
+
+#define THRUST_HOST_BACKEND_CPP THRUST_HOST_SYSTEM_CPP
+#define THRUST_HOST_BACKEND_OMP THRUST_HOST_SYSTEM_OMP
+#define THRUST_HOST_BACKEND_TBB THRUST_HOST_SYSTEM_TBB
+
+#ifdef THRUST_HOST_BACKEND
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#    pragma message("------------------------------------------------------------------------------")
+#    pragma message("| WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |")
+#    pragma message("------------------------------------------------------------------------------")
+#  else
+#    warning ------------------------------------------------------------------------------
+#    warning | WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |
+#    warning ------------------------------------------------------------------------------
+#  endif // THRUST_HOST_COMPILER
+#  undef THRUST_HOST_SYSTEM
+#  define THRUST_HOST_SYSTEM THRUST_HOST_BACKEND
+#endif // THRUST_HOST_BACKEND
+
+#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
+#define __THRUST_HOST_SYSTEM_NAMESPACE cpp
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP
+#define __THRUST_HOST_SYSTEM_NAMESPACE omp
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB
+#define __THRUST_HOST_SYSTEM_NAMESPACE tbb
+#endif
+
+#define __THRUST_HOST_SYSTEM_ROOT thrust/system/__THRUST_HOST_SYSTEM_NAMESPACE
+
diff --git a/thrust/thrust/detail/config/simple_defines.h b/thrust/thrust/detail/config/simple_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3ea2eb64e766ea2147ecf1de308a454a739d88e
--- /dev/null
+++ b/thrust/thrust/detail/config/simple_defines.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file simple_defines.h
+ *  \brief Primitive macros without dependencies.
+ */
+
+#pragma once
+
+#define THRUST_UNKNOWN 0
+#define THRUST_FALSE   0
+#define THRUST_TRUE    1
+
+#define THRUST_UNUSED_VAR(expr) do { (void)(expr); } while (0)
+
+#define THRUST_PREVENT_MACRO_SUBSTITUTION
+
diff --git a/thrust/thrust/detail/contiguous_storage.h b/thrust/thrust/detail/contiguous_storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..a128223a992bafbf9f15903f48936d4258e647fe
--- /dev/null
+++ b/thrust/thrust/detail/contiguous_storage.h
@@ -0,0 +1,236 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/detail/normal_iterator.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+struct copy_allocator_t {};
+
+// XXX parameter T is redundant with parameter Alloc
+template<typename T, typename Alloc>
+  class contiguous_storage
+{
+  private:
+    typedef thrust::detail::allocator_traits<Alloc> alloc_traits;
+
+  public:
+    typedef Alloc                                      allocator_type;
+    typedef T                                          value_type;
+    typedef typename alloc_traits::pointer             pointer;
+    typedef typename alloc_traits::const_pointer       const_pointer;
+    typedef typename alloc_traits::size_type           size_type;
+    typedef typename alloc_traits::difference_type     difference_type;
+    typedef typename alloc_traits::reference           reference;
+    typedef typename alloc_traits::const_reference     const_reference;
+
+    typedef thrust::detail::normal_iterator<pointer>       iterator;
+    typedef thrust::detail::normal_iterator<const_pointer> const_iterator;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit contiguous_storage(const allocator_type &alloc = allocator_type());
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit contiguous_storage(size_type n, const allocator_type &alloc = allocator_type());
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit contiguous_storage(copy_allocator_t, const contiguous_storage &other);
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit contiguous_storage(copy_allocator_t, const contiguous_storage &other, size_type n);
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    ~contiguous_storage();
+
+    __host__ __device__
+    size_type size() const;
+
+    __host__ __device__
+    size_type max_size() const;
+
+    __host__ __device__
+    pointer data();
+
+    __host__ __device__
+    const_pointer data() const;
+
+    __host__ __device__
+    iterator begin();
+
+    __host__ __device__
+    const_iterator begin() const;
+
+    __host__ __device__
+    iterator end();
+
+    __host__ __device__
+    const_iterator end() const;
+
+    __host__ __device__
+    reference operator[](size_type n);
+
+    __host__ __device__
+    const_reference operator[](size_type n) const;
+
+    __host__ __device__
+    allocator_type get_allocator() const;
+
+    // note that allocate does *not* automatically call deallocate
+    __host__ __device__
+    void allocate(size_type n);
+
+    __host__ __device__
+    void deallocate();
+
+    __host__ __device__
+    void swap(contiguous_storage &x);
+
+    __host__ __device__
+    void default_construct_n(iterator first, size_type n);
+
+    __host__ __device__
+    void uninitialized_fill_n(iterator first, size_type n, const value_type &value);
+
+    template<typename InputIterator>
+    __host__ __device__
+    iterator uninitialized_copy(InputIterator first, InputIterator last, iterator result);
+
+    template<typename System, typename InputIterator>
+    __host__ __device__
+    iterator uninitialized_copy(thrust::execution_policy<System> &from_system,
+                                InputIterator first,
+                                InputIterator last,
+                                iterator result);
+
+    template<typename InputIterator, typename Size>
+    __host__ __device__
+    iterator uninitialized_copy_n(InputIterator first, Size n, iterator result);
+
+    template<typename System, typename InputIterator, typename Size>
+    __host__ __device__
+    iterator uninitialized_copy_n(thrust::execution_policy<System> &from_system,
+                                  InputIterator first,
+                                  Size n,
+                                  iterator result);
+
+    __host__ __device__
+    void destroy(iterator first, iterator last);
+
+    __host__ __device__
+    void deallocate_on_allocator_mismatch(const contiguous_storage &other);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch(const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void set_allocator(const allocator_type &alloc);
+
+    __host__ __device__
+    bool is_allocator_not_equal(const allocator_type &alloc) const;
+
+    __host__ __device__
+    bool is_allocator_not_equal(const contiguous_storage &other) const;
+
+    __host__ __device__
+    void propagate_allocator(const contiguous_storage &other);
+
+#if THRUST_CPP_DIALECT >= 2011
+    __host__ __device__
+    void propagate_allocator(contiguous_storage &other);
+
+    // allow move assignment for a sane implementation of allocator propagation
+    // on move assignment
+    __host__ __device__
+    contiguous_storage &operator=(contiguous_storage &&other);
+#endif
+
+  private:
+    // XXX we could inherit from this to take advantage of empty base class optimization
+    allocator_type m_allocator;
+
+    iterator m_begin;
+
+    size_type m_size;
+
+    // disallow assignment
+    contiguous_storage &operator=(const contiguous_storage &x);
+
+    __host__ __device__
+    void swap_allocators(true_type, const allocator_type &);
+
+    __host__ __device__
+    void swap_allocators(false_type, allocator_type &);
+
+    __host__ __device__
+    bool is_allocator_not_equal_dispatch(true_type, const allocator_type &) const;
+
+    __host__ __device__
+    bool is_allocator_not_equal_dispatch(false_type, const allocator_type &) const;
+
+    __host__ __device__
+    void deallocate_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void deallocate_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(true_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(false_type, const contiguous_storage &other);
+
+#if THRUST_CPP_DIALECT >= 2011
+    __host__ __device__
+    void propagate_allocator_dispatch(true_type, contiguous_storage &other);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(false_type, contiguous_storage &other);
+#endif
+}; // end contiguous_storage
+
+} // end detail
+
+template<typename T, typename Alloc>
+__host__ __device__
+void swap(detail::contiguous_storage<T,Alloc> &lhs, detail::contiguous_storage<T,Alloc> &rhs);
+
+} // end thrust
+
+#include <thrust/detail/contiguous_storage.inl>
+
diff --git a/thrust/thrust/detail/contiguous_storage.inl b/thrust/thrust/detail/contiguous_storage.inl
new file mode 100644
index 0000000000000000000000000000000000000000..89f78e0b22d3aad17354b022d9da84ff6c57849a
--- /dev/null
+++ b/thrust/thrust/detail/contiguous_storage.inl
@@ -0,0 +1,553 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/contiguous_storage.h>
+#include <thrust/detail/swap.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/allocator/copy_construct_range.h>
+#include <thrust/detail/allocator/default_construct_range.h>
+#include <thrust/detail/allocator/destroy_range.h>
+#include <thrust/detail/allocator/fill_construct_range.h>
+
+#include <stdexcept> // for std::runtime_error
+#include <utility> // for use of std::swap in the WAR below
+
+namespace thrust
+{
+
+namespace detail
+{
+
+class allocator_mismatch_on_swap : public std::runtime_error
+{
+public:
+  allocator_mismatch_on_swap()
+    :std::runtime_error("swap called on containers with allocators that propagate on swap, but compare non-equal")
+  {
+  }
+};
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(const Alloc &alloc)
+      :m_allocator(alloc),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+  ;
+} // end contiguous_storage::contiguous_storage()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(size_type n, const Alloc &alloc)
+      :m_allocator(alloc),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+  allocate(n);
+} // end contiguous_storage::contiguous_storage()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(copy_allocator_t,
+        const contiguous_storage &other)
+      :m_allocator(other.m_allocator),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+} // end contiguous_storage::contiguous_storage()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(copy_allocator_t,
+        const contiguous_storage &other, size_type n)
+      :m_allocator(other.m_allocator),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+  allocate(n);
+} // end contiguous_storage::contiguous_storage()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::~contiguous_storage()
+{
+  deallocate();
+} // end contiguous_storage::~contiguous_storage()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::size_type
+    contiguous_storage<T,Alloc>
+      ::size() const
+{
+  return m_size;
+} // end contiguous_storage::size()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::size_type
+    contiguous_storage<T,Alloc>
+      ::max_size() const
+{
+  return alloc_traits::max_size(m_allocator);
+} // end contiguous_storage::max_size()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::iterator
+    contiguous_storage<T,Alloc>
+      ::begin()
+{
+  return m_begin;
+} // end contiguous_storage::begin()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::const_iterator
+    contiguous_storage<T,Alloc>
+      ::begin() const
+{
+  return m_begin;
+} // end contiguous_storage::begin()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::iterator
+    contiguous_storage<T,Alloc>
+      ::end()
+{
+  return m_begin + size();
+} // end contiguous_storage::end()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::const_iterator
+    contiguous_storage<T,Alloc>
+      ::end() const
+{
+  return m_begin + size();
+} // end contiguous_storage::end()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::pointer
+    contiguous_storage<T,Alloc>
+      ::data()
+{
+  return &*m_begin;
+} // end contiguous_storage::data()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::const_pointer
+    contiguous_storage<T,Alloc>
+      ::data() const
+{
+  return &*m_begin;
+} // end contiguous_storage::data()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::reference
+    contiguous_storage<T,Alloc>
+      ::operator[](size_type n)
+{
+  return m_begin[n];
+} // end contiguous_storage::operator[]()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::const_reference
+    contiguous_storage<T,Alloc>
+      ::operator[](size_type n) const
+{
+  return m_begin[n];
+} // end contiguous_storage::operator[]()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::allocator_type
+    contiguous_storage<T,Alloc>
+      ::get_allocator() const
+{
+  return m_allocator;
+} // end contiguous_storage::get_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::allocate(size_type n)
+{
+  if(n > 0)
+  {
+    m_begin = iterator(alloc_traits::allocate(m_allocator,n));
+    m_size = n;
+  } // end if
+  else
+  {
+    m_begin = iterator(pointer(static_cast<T*>(0)));
+    m_size = 0;
+  } // end else
+} // end contiguous_storage::allocate()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate()
+{
+  if(size() > 0)
+  {
+    alloc_traits::deallocate(m_allocator,m_begin.base(), size());
+    m_begin = iterator(pointer(static_cast<T*>(0)));
+    m_size = 0;
+  } // end if
+} // end contiguous_storage::deallocate()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::swap(contiguous_storage &x)
+{
+  thrust::swap(m_begin, x.m_begin);
+  thrust::swap(m_size, x.m_size);
+
+  swap_allocators(
+    integral_constant<
+      bool,
+      allocator_traits<Alloc>::propagate_on_container_swap::value
+    >(),
+    x.m_allocator);
+
+  thrust::swap(m_allocator, x.m_allocator);
+} // end contiguous_storage::swap()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::default_construct_n(iterator first, size_type n)
+{
+  default_construct_range(m_allocator, first.base(), n);
+} // end contiguous_storage::default_construct_n()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::uninitialized_fill_n(iterator first, size_type n, const value_type &x)
+{
+  fill_construct_range(m_allocator, first.base(), n, x);
+} // end contiguous_storage::uninitialized_fill()
+
+template<typename T, typename Alloc>
+  template<typename System, typename InputIterator>
+  __host__ __device__
+    typename contiguous_storage<T,Alloc>::iterator
+      contiguous_storage<T,Alloc>
+        ::uninitialized_copy(thrust::execution_policy<System> &from_system, InputIterator first, InputIterator last, iterator result)
+{
+  return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base()));
+} // end contiguous_storage::uninitialized_copy()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+  __host__ __device__
+    typename contiguous_storage<T,Alloc>::iterator
+      contiguous_storage<T,Alloc>
+        ::uninitialized_copy(InputIterator first, InputIterator last, iterator result)
+{
+  // XXX assumes InputIterator's associated System is default-constructible
+  typename thrust::iterator_system<InputIterator>::type from_system;
+
+  return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base()));
+} // end contiguous_storage::uninitialized_copy()
+
+template<typename T, typename Alloc>
+  template<typename System, typename InputIterator, typename Size>
+  __host__ __device__
+    typename contiguous_storage<T,Alloc>::iterator
+      contiguous_storage<T,Alloc>
+        ::uninitialized_copy_n(thrust::execution_policy<System> &from_system, InputIterator first, Size n, iterator result)
+{
+  return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base()));
+} // end contiguous_storage::uninitialized_copy_n()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator, typename Size>
+  __host__ __device__
+    typename contiguous_storage<T,Alloc>::iterator
+      contiguous_storage<T,Alloc>
+        ::uninitialized_copy_n(InputIterator first, Size n, iterator result)
+{
+  // XXX assumes InputIterator's associated System is default-constructible
+  typename thrust::iterator_system<InputIterator>::type from_system;
+
+  return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base()));
+} // end contiguous_storage::uninitialized_copy_n()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy(iterator first, iterator last)
+{
+  destroy_range(m_allocator, first.base(), last - first);
+} // end contiguous_storage::destroy()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch(const contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  deallocate_on_allocator_mismatch_dispatch(c, other);
+} // end contiguous_storage::deallocate_on_allocator_mismatch
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch(const contiguous_storage &other,
+        iterator first, iterator last)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  destroy_on_allocator_mismatch_dispatch(c, other, first, last);
+} // end contiguous_storage::destroy_on_allocator_mismatch
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::set_allocator(const Alloc &alloc)
+{
+  m_allocator = alloc;
+} // end contiguous_storage::set_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal(const Alloc &alloc) const
+{
+  return is_allocator_not_equal_dispatch(
+    integral_constant<
+      bool,
+      allocator_traits<Alloc>::is_always_equal::value
+    >(),
+    alloc);
+} // end contiguous_storage::is_allocator_not_equal()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal(const contiguous_storage<T,Alloc> &other) const
+{
+  return is_allocator_not_equal(m_allocator, other.m_allocator);
+} // end contiguous_storage::is_allocator_not_equal()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator(const contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  propagate_allocator_dispatch(c, other);
+} // end contiguous_storage::propagate_allocator()
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator(contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_move_assignment::value
+  > c;
+
+  propagate_allocator_dispatch(c, other);
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc> &contiguous_storage<T,Alloc>
+    ::operator=(contiguous_storage &&other)
+{
+  if (size() > 0)
+  {
+    deallocate();
+  }
+  propagate_allocator(other);
+  m_begin = std::move(other.m_begin);
+  m_size = std::move(other.m_size);
+
+  other.m_begin = pointer(static_cast<T*>(0));
+  other.m_size = 0;
+
+  return *this;
+} // end contiguous_storage::propagate_allocator()
+#endif
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::swap_allocators(true_type, const Alloc &)
+{
+} // end contiguous_storage::swap_allocators()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::swap_allocators(false_type, Alloc &other)
+{
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      // allocators must be equal when swapping containers with allocators that propagate on swap
+      assert(!is_allocator_not_equal(other));
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      if (is_allocator_not_equal(other))
+      {
+        throw allocator_mismatch_on_swap();
+      }
+    #endif
+  }
+  thrust::swap(m_allocator, other);
+} // end contiguous_storage::swap_allocators()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal_dispatch(true_type /*is_always_equal*/, const Alloc &) const
+{
+  return false;
+} // end contiguous_storage::is_allocator_not_equal_dispatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal_dispatch(false_type /*!is_always_equal*/, const Alloc& other) const
+{
+  return m_allocator != other;
+} // end contiguous_storage::is_allocator_not_equal_dispatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other)
+{
+  if (m_allocator != other.m_allocator)
+  {
+    deallocate();
+  }
+} // end contiguous_storage::deallocate_on_allocator_mismatch()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &)
+{
+} // end contiguous_storage::deallocate_on_allocator_mismatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other,
+        iterator first, iterator last)
+{
+  if (m_allocator != other.m_allocator)
+  {
+    destroy(first, last);
+  }
+} // end contiguous_storage::destroy_on_allocator_mismatch()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &,
+        iterator, iterator)
+{
+} // end contiguous_storage::destroy_on_allocator_mismatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(true_type, const contiguous_storage &other)
+{
+  m_allocator = other.m_allocator;
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(false_type, const contiguous_storage &)
+{
+} // end contiguous_storage::propagate_allocator()
+
+#if THRUST_CPP_DIALECT >= 2011
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(true_type, contiguous_storage &other)
+{
+  m_allocator = std::move(other.m_allocator);
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(false_type, contiguous_storage &)
+{
+} // end contiguous_storage::propagate_allocator()
+#endif
+
+} // end detail
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void swap(detail::contiguous_storage<T,Alloc> &lhs, detail::contiguous_storage<T,Alloc> &rhs)
+{
+  lhs.swap(rhs);
+} // end swap()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/copy.h b/thrust/thrust/detail/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e9feb0f90c8773be2db8ddf74600e79fd988b5f
--- /dev/null
+++ b/thrust/thrust/detail/copy.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+template<typename System,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(const thrust::detail::execution_policy_base<System> &system,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result);
+
+template<typename System,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(const thrust::detail::execution_policy_base<System> &system,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result);
+
+template<typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+
+namespace detail
+{
+
+
+template<typename FromSystem,
+         typename ToSystem,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator two_system_copy(const thrust::execution_policy<FromSystem> &from_system,
+                                 const thrust::execution_policy<ToSystem>   &two_system,
+                                 InputIterator first,
+                                 InputIterator last,
+                                 OutputIterator result);
+
+
+template<typename FromSystem,
+         typename ToSystem,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator two_system_copy_n(const thrust::execution_policy<FromSystem> &from_system,
+                                   const thrust::execution_policy<ToSystem>   &two_system,
+                                   InputIterator first,
+                                   Size n,
+                                   OutputIterator result);
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/copy.inl>
+
diff --git a/thrust/thrust/detail/copy.inl b/thrust/thrust/detail/copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..85701fde72838764e6aea1a2398f2f838c134af1
--- /dev/null
+++ b/thrust/thrust/detail/copy.inl
@@ -0,0 +1,132 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/copy.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/copy.h>
+#include <thrust/system/detail/adl/copy.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result)
+{
+  using thrust::system::detail::generic::copy;
+  return copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Size, typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result)
+{
+  using thrust::system::detail::generic::copy_n;
+  return copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result);
+} // end copy_n()
+
+
+namespace detail
+{
+
+
+__thrust_exec_check_disable__ // because we might call e.g. std::ostream_iterator's constructor
+template<typename System1,
+         typename System2,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator two_system_copy(const thrust::execution_policy<System1> &system1,
+                                 const thrust::execution_policy<System2> &system2,
+                                 InputIterator first,
+                                 InputIterator last,
+                                 OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  return thrust::copy(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, last, result);
+} // end two_system_copy()
+
+
+__thrust_exec_check_disable__ // because we might call e.g. std::ostream_iterator's constructor
+template<typename System1,
+         typename System2,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator two_system_copy_n(const thrust::execution_policy<System1> &system1,
+                                   const thrust::execution_policy<System2> &system2,
+                                   InputIterator first,
+                                   Size n,
+                                   OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  return thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, n, result);
+} // end two_system_copy_n()
+
+
+} // end detail
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result)
+{
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::detail::two_system_copy(system1, system2, first, last, result);
+} // end copy()
+
+
+template<typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result)
+{
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::detail::two_system_copy_n(system1, system2, first, n, result);
+} // end copy_n()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/copy_if.h b/thrust/thrust/detail/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..563623c889b4c7abb19c6140488bf0c15e6e1af0
--- /dev/null
+++ b/thrust/thrust/detail/copy_if.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+} // end thrust
+
+#include <thrust/detail/copy_if.inl>
+
diff --git a/thrust/thrust/detail/copy_if.inl b/thrust/thrust/detail/copy_if.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f4c22f8a502f5e6748889100005923f8795d4e4b
--- /dev/null
+++ b/thrust/thrust/detail/copy_if.inl
@@ -0,0 +1,109 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/copy_if.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/copy_if.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/adl/copy_if.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  using thrust::system::detail::generic::copy_if;
+  return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred);
+} // end copy_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  using thrust::system::detail::generic::copy_if;
+  return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred);
+} // end copy_if()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::copy_if(select_system(system1,system2), first, last, result, pred);
+} // end copy_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred);
+} // end copy_if()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/count.inl b/thrust/thrust/detail/count.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f7ba7a54e00da634bdfbf87fdf06a39518515310
--- /dev/null
+++ b/thrust/thrust/detail/count.inl
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file count.inl
+ *  \brief Inline file for count.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/count.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/count.h>
+#include <thrust/system/detail/adl/count.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value)
+{
+  using thrust::system::detail::generic::count;
+  return count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+} // end count()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::count_if;
+  return count_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end count_if()
+
+
+template <typename InputIterator, typename EqualityComparable>
+typename thrust::iterator_traits<InputIterator>::difference_type
+count(InputIterator first, InputIterator last, const EqualityComparable& value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::count(select_system(system), first, last, value);
+} // end count()
+
+
+template <typename InputIterator, typename Predicate>
+typename thrust::iterator_traits<InputIterator>::difference_type
+count_if(InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::count_if(select_system(system), first, last, pred);
+} // end count_if()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/cpp11_required.h b/thrust/thrust/detail/cpp11_required.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7fb4fb12cddade3e9a1e83b5310b6cf1a0a5a1d
--- /dev/null
+++ b/thrust/thrust/detail/cpp11_required.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#ifndef THRUST_CPP11_REQUIRED_NO_ERROR
+#  if THRUST_CPP_DIALECT < 2011 
+#    error C++11 is required for this Thrust feature; please upgrade your compiler or pass the appropriate -std=c++XX flag to it.
+#  endif
+#endif
+
diff --git a/thrust/thrust/detail/cpp14_required.h b/thrust/thrust/detail/cpp14_required.h
new file mode 100644
index 0000000000000000000000000000000000000000..083c8a1ad478f18e8bc5151385362e87ec933962
--- /dev/null
+++ b/thrust/thrust/detail/cpp14_required.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#ifndef THRUST_CPP14_REQUIRED_NO_ERROR
+#  if THRUST_CPP_DIALECT < 2014
+#    error C++14 is required for this Thrust feature; please upgrade your compiler or pass the appropriate -std=c++14 flag to it.
+#  endif
+#endif
+
diff --git a/thrust/thrust/detail/cstdint.h b/thrust/thrust/detail/cstdint.h
new file mode 100644
index 0000000000000000000000000000000000000000..248390a528d5885a2a6f00e6a34cec5185cfbdcf
--- /dev/null
+++ b/thrust/thrust/detail/cstdint.h
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG)
+#include <stdint.h>
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+
+#if (_MSC_VER < 1300)
+   typedef signed   char     int8_t;
+   typedef signed   short    int16_t;
+   typedef signed   int      int32_t;
+   typedef unsigned char     uint8_t;
+   typedef unsigned short    uint16_t;
+   typedef unsigned int      uint32_t;
+#else
+   typedef signed   __int8   int8_t;
+   typedef signed   __int16  int16_t;
+   typedef signed   __int32  int32_t;
+   typedef unsigned __int8   uint8_t;
+   typedef unsigned __int16  uint16_t;
+   typedef unsigned __int32  uint32_t;
+#endif
+typedef signed   __int64     int64_t;
+typedef unsigned __int64     uint64_t;
+
+#else
+
+typedef ::int8_t   int8_t;
+typedef ::int16_t  int16_t;
+typedef ::int32_t  int32_t;
+typedef ::int64_t  int64_t;
+typedef ::uint8_t  uint8_t;
+typedef ::uint16_t uint16_t;
+typedef ::uint32_t uint32_t;
+typedef ::uint64_t uint64_t;
+
+#endif
+
+
+// an oracle to tell us how to define intptr_t
+template<int word_size = sizeof(void*)> struct divine_intptr_t;
+template<int word_size = sizeof(void*)> struct divine_uintptr_t;
+
+// 32b platforms
+template<>  struct divine_intptr_t<4>  {  typedef thrust::detail::int32_t  type; };
+template<>  struct divine_uintptr_t<4> {  typedef thrust::detail::uint32_t type; };
+
+// 64b platforms
+template<>  struct divine_intptr_t<8>  { typedef thrust::detail::int64_t  type; };
+template<>  struct divine_uintptr_t<8> { typedef thrust::detail::uint64_t type; };
+
+typedef divine_intptr_t<>::type   intptr_t;
+typedef divine_uintptr_t<>::type  uintptr_t;
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/dependencies_aware_execution_policy.h b/thrust/thrust/detail/dependencies_aware_execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..1806276f9d65c37d24ec7a3c83c6e9d32735117f
--- /dev/null
+++ b/thrust/thrust/detail/dependencies_aware_execution_policy.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <tuple>
+
+#include <thrust/detail/execute_with_dependencies.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<template<typename> class ExecutionPolicyCRTPBase>
+struct dependencies_aware_execution_policy
+{
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(Dependencies&& ...dependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(dependencies))... };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(std::tuple<Dependencies...>& dependencies) const
+    {
+        return { capture_as_dependency(dependencies) };
+    }
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(std::tuple<Dependencies...>&& dependencies) const
+    {
+        return { capture_as_dependency(std::move(dependencies)) };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(Dependencies&& ...dependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(dependencies))... };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(std::tuple<Dependencies...>& dependencies) const
+    {
+        return { capture_as_dependency(dependencies) };
+    }
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(std::tuple<Dependencies...>&& dependencies) const
+    {
+        return { capture_as_dependency(std::move(dependencies)) };
+    }
+};
+
+} // end detail
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/detail/device_delete.inl b/thrust/thrust/detail/device_delete.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f1a67f91bef0af6abff72c5c34b40c88c3219f2e
--- /dev/null
+++ b/thrust/thrust/detail/device_delete.inl
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_delete.inl
+ *  \brief Inline file for device_delete.h.
+ */
+
+#include <thrust/device_delete.h>
+#include <thrust/device_free.h>
+#include <thrust/detail/allocator/destroy_range.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+// define an empty allocator class to use below
+struct device_delete_allocator {};
+
+}
+
+template<typename T>
+  void device_delete(device_ptr<T> ptr,
+                     const size_t n)
+{
+  // we can use device_allocator to destroy the range
+  thrust::detail::device_delete_allocator a;
+  thrust::detail::destroy_range(a, ptr, n);
+  thrust::device_free(ptr);
+} // end device_delete()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/device_free.inl b/thrust/thrust/detail/device_free.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7a1b6c1239ff86a80109477c0377edb21dd6210a
--- /dev/null
+++ b/thrust/thrust/detail/device_free.inl
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_free.inl
+ *  \brief Inline file for device_free.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/device_free.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/detail/malloc_and_free.h>
+
+namespace thrust
+{
+
+void device_free(thrust::device_ptr<void> ptr)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef thrust::iterator_system< thrust::device_ptr<void> >::type system;
+
+  // XXX lower to select_system(system) here
+  system s;
+
+  thrust::free(s, ptr);
+} // end device_free()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/device_malloc.inl b/thrust/thrust/detail/device_malloc.inl
new file mode 100644
index 0000000000000000000000000000000000000000..938c3c807cab3ce6f8ff5e747acd1ef6fd8c9556
--- /dev/null
+++ b/thrust/thrust/detail/device_malloc.inl
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_malloc.inl
+ *  \brief Inline file for device_malloc.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/device_malloc.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/detail/malloc_and_free.h>
+
+namespace thrust
+{
+
+
+thrust::device_ptr<void> device_malloc(const std::size_t n)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef thrust::iterator_system< thrust::device_ptr<void> >::type system;
+
+  // XXX lower to select_system(system) here
+  system s;
+
+  return thrust::device_ptr<void>(thrust::malloc(s, n).get());
+} // end device_malloc()
+
+
+template<typename T>
+  thrust::device_ptr<T> device_malloc(const std::size_t n)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef thrust::iterator_system< thrust::device_ptr<void> >::type system;
+
+  // XXX lower to select_system(system) here
+  system s;
+
+  return thrust::device_ptr<T>(thrust::malloc<T>(s,n).get());
+} // end device_malloc()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/device_new.inl b/thrust/thrust/detail/device_new.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2551badb4362f26221225e449dea690deceab748
--- /dev/null
+++ b/thrust/thrust/detail/device_new.inl
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_new.inl
+ *  \brief Inline file for device_new.h.
+ */
+
+#include <thrust/device_new.h>
+#include <thrust/device_malloc.h>
+#include <thrust/uninitialized_fill.h>
+
+namespace thrust
+{
+
+template<typename T>
+  device_ptr<T> device_new(device_ptr<void> p,
+                           const size_t n)
+{
+  // XXX TODO dispatch n null device constructors at p here
+  // in the meantime, dispatch 1 null host constructor here
+  // and dispatch n copy constructors
+  return device_new<T>(p, T(), n);
+} // end device_new()
+
+template<typename T>
+  device_ptr<T> device_new(device_ptr<void> p,
+                           const T &exemplar,
+                           const size_t n)
+{
+  device_ptr<T> result(reinterpret_cast<T*>(p.get()));
+
+  // run copy constructors at p here
+  thrust::uninitialized_fill(result, result + n, exemplar);
+  
+  return result;
+} // end device_new()
+
+template<typename T>
+  device_ptr<T> device_new(const size_t n)
+{
+  // call placement new
+  return device_new<T>(thrust::device_malloc<T>(n));
+} // end device_new()
+
+} // thrust
+
diff --git a/thrust/thrust/detail/device_ptr.inl b/thrust/thrust/detail/device_ptr.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d1058ca6aa13c310249dba1bf198da2aade2c013
--- /dev/null
+++ b/thrust/thrust/detail/device_ptr.inl
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_ptr.inl
+ *  \brief Inline file for device_ptr.h.
+ */
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_reference.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+template<typename T>
+  __host__ __device__
+  device_ptr<T> device_pointer_cast(T *ptr)
+{
+  return device_ptr<T>(ptr);
+} // end device_pointer_cast()
+
+template<typename T>
+  __host__ __device__
+  device_ptr<T> device_pointer_cast(const device_ptr<T> &ptr)
+{
+  return ptr;
+} // end device_pointer_cast()
+
+
+namespace detail
+{
+
+template<typename T>
+  struct is_device_ptr< thrust::device_ptr<T> >
+    : public true_type
+{
+}; // end is_device_ptr
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+// XXX WAR MSVC 2005 problem with correctly implementing
+//     pointer_raw_pointer for device_ptr by specializing it here
+template<typename T>
+  struct pointer_raw_pointer< thrust::device_ptr<T> >
+{
+  typedef typename device_ptr<T>::raw_pointer type;
+}; // end pointer_raw_pointer
+#endif
+
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/device_reference.inl b/thrust/thrust/detail/device_reference.inl
new file mode 100644
index 0000000000000000000000000000000000000000..07f6af726cf8218dde22170efa7ba43e18c95b1d
--- /dev/null
+++ b/thrust/thrust/detail/device_reference.inl
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_reference.inl
+ *  \brief Inline file for device_reference.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/device_reference.h>
+
+namespace thrust
+{
+
+template<typename T>
+  template<typename OtherT>
+    __host__ __device__
+    device_reference<T> &
+      device_reference<T>
+        ::operator=(const device_reference<OtherT> &other)
+{
+  return super_t::operator=(other);
+} // end operator=()
+
+template<typename T>
+  __host__ __device__
+  device_reference<T> &
+    device_reference<T>
+      ::operator=(const value_type &x)
+{
+  return super_t::operator=(x);
+} // end operator=()
+
+template<typename T>
+__host__ __device__
+void swap(device_reference<T> a, device_reference<T> b)
+{
+  a.swap(b);
+} // end swap()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/device_vector.inl b/thrust/thrust/detail/device_vector.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e59b5670e255b027919278359ed8ce68eee8f1f1
--- /dev/null
+++ b/thrust/thrust/detail/device_vector.inl
@@ -0,0 +1,38 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_vector.inl
+ *  \brief Inline file for device_vector.h.
+ */
+
+#include <thrust/host_vector.h>
+
+namespace thrust
+{
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    __host__
+    device_vector<T,Alloc>
+      ::device_vector(const host_vector<OtherT,OtherAlloc> &v)
+        :Parent(v)
+{
+  ;
+} // end device_vector::device_vector()
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/distance.inl b/thrust/thrust/detail/distance.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f12ef204cc24a0b90a99f00a0949229e7bef753b
--- /dev/null
+++ b/thrust/thrust/detail/distance.inl
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file distance.inl
+ *  \brief Inline file for distance.h
+ */
+
+#include <thrust/advance.h>
+#include <thrust/system/detail/generic/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last)
+{
+  return thrust::system::detail::generic::distance(first, last);
+} // end distance()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/equal.inl b/thrust/thrust/detail/equal.inl
new file mode 100644
index 0000000000000000000000000000000000000000..08bfbab0be0e8c37090e84cfbd48c398e3e583b0
--- /dev/null
+++ b/thrust/thrust/detail/equal.inl
@@ -0,0 +1,86 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file equal.inl
+ *  \brief Inline file for equal.h.
+ */
+
+#include <thrust/equal.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/equal.h>
+#include <thrust/system/detail/adl/equal.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename System, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+bool equal(const thrust::detail::execution_policy_base<System> &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2)
+{
+  using thrust::system::detail::generic::equal;
+  return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2);
+} // end equal()
+
+
+__thrust_exec_check_disable__
+template<typename System, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+bool equal(const thrust::detail::execution_policy_base<System> &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::equal;
+  return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2, binary_pred);
+} // end equal()
+
+
+template <typename InputIterator1, typename InputIterator2>
+bool equal(InputIterator1 first1, InputIterator1 last1,
+           InputIterator2 first2)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::equal(select_system(system1,system2), first1, last1, first2);
+}
+
+
+template <typename InputIterator1, typename InputIterator2, 
+          typename BinaryPredicate>
+bool equal(InputIterator1 first1, InputIterator1 last1,
+           InputIterator2 first2, BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::equal(select_system(system1,system2), first1, last1, first2, binary_pred);
+}
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/event_error.h b/thrust/thrust/detail/event_error.h
new file mode 100644
index 0000000000000000000000000000000000000000..114d4763f116ef20966572a86ca52076b837f1cc
--- /dev/null
+++ b/thrust/thrust/detail/event_error.h
@@ -0,0 +1,166 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/// \file thrust/detail/event_error.h
+/// \brief \c thrust::future and thrust::future error handling types and codes.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/error_code.h>
+
+#include <stdexcept>
+
+namespace thrust
+{
+
+enum class event_errc
+{
+  unknown_event_error
+, no_state
+, no_content
+, last_event_error
+};
+
+/// \return <tt>error_code(static_cast<int>(e), event_category())</tt>
+inline error_code make_error_code(event_errc e);
+
+/// \return <tt>error_condition(static_cast<int>(e), event_category())</tt>.
+inline error_condition make_error_condition(event_errc e);
+
+struct event_error_category : error_category
+{
+  event_error_category() = default;
+
+  virtual char const* name() const
+  {
+    return "event";
+  }
+
+  virtual std::string message(int ev) const
+  {
+    switch (static_cast<event_errc>(ev))
+    {
+      case event_errc::no_state:
+      {
+        return "no_state: an operation that requires an event or future to have "
+               "a stream or content has been performed on a event or future "
+               "without either, e.g. a moved-from or default constructed event "
+               "or future (an event or future may have been consumed more than "
+               "once)";
+      }
+      case event_errc::no_content:
+      {
+        return "no_content: an operation that requires a future to have content "
+               "has been performed on future without any, e.g. a moved-from, "
+               "default constructed, or `thrust::new_stream` constructed future "
+               "(a future may have been consumed more than once)";
+      }
+      default:
+      {
+        return "unknown_event_error: an unknown error with a future "
+               "object has occurred";
+      }
+    };
+  }
+
+  virtual error_condition default_error_condition(int ev) const
+  {
+    if (
+         event_errc::last_event_error
+         >
+         static_cast<event_errc>(ev)
+       )
+      return make_error_condition(static_cast<event_errc>(ev));
+
+    return system_category().default_error_condition(ev);
+  }
+};
+
+/// Obtains a reference to the static error category object for the errors
+/// related to futures and promises. The object is required to override the
+/// virtual function error_category::name() to return a pointer to the string
+/// "event". It is used to identify error codes provided in the
+/// exceptions of type event_error.
+inline error_category const& event_category()
+{
+  static const event_error_category result;
+  return result;
+}
+
+namespace system
+{
+/// Specialization of \p is_error_code_enum for \p event_errc.
+template<> struct is_error_code_enum<event_errc> : true_type {};
+} // end system
+
+/// \return <tt>error_code(static_cast<int>(e), event_category())</tt>
+inline error_code make_error_code(event_errc e)
+{
+  return error_code(static_cast<int>(e), event_category());
+}
+
+/// \return <tt>error_condition(static_cast<int>(e), event_category())</tt>.
+inline error_condition make_error_condition(event_errc e)
+{
+  return error_condition(static_cast<int>(e), event_category());
+}
+
+struct event_error : std::logic_error
+{
+  __host__
+  explicit event_error(error_code ec)
+    : std::logic_error(ec.message()), ec_(ec)
+  {}
+
+  __host__
+  explicit event_error(event_errc e)
+    : event_error(make_error_code(e))
+  {}
+
+  __host__
+  error_code const& code() const noexcept
+  {
+    return ec_;
+  }
+
+  __host__
+  virtual ~event_error() noexcept {}
+
+private:
+  error_code ec_;
+};
+
+inline bool operator==(event_error const& lhs, event_error const& rhs) noexcept
+{
+  return lhs.code() == rhs.code();
+}
+
+inline bool operator<(event_error const& lhs, event_error const& rhs) noexcept
+{
+  return lhs.code() < rhs.code();
+}
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/detail/execute_with_allocator.h b/thrust/thrust/detail/execute_with_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..93dee663cc13bf44b2dc0cdf91c0aea5348e9ccb
--- /dev/null
+++ b/thrust/thrust/detail/execute_with_allocator.h
@@ -0,0 +1,148 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/execute_with_allocator_fwd.h>
+#include <thrust/pair.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/integer_math.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template <
+    typename T
+  , typename Allocator
+  , template <typename> class BaseSystem
+>
+__host__
+thrust::pair<T*, std::ptrdiff_t>
+get_temporary_buffer(
+    thrust::detail::execute_with_allocator<Allocator, BaseSystem>& system
+  , std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::void_pointer                        void_pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+
+  // How many elements of type value_type do we need to accommodate n elements
+  // of type T?
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements);
+
+  // Return the pointer and the number of elements of type T allocated.
+  return thrust::make_pair(thrust::reinterpret_pointer_cast<T*>(ptr),n);
+}
+
+template <
+    typename Pointer
+  , typename Allocator
+  , template <typename> class BaseSystem
+>
+__host__
+void
+return_temporary_buffer(
+    thrust::detail::execute_with_allocator<Allocator, BaseSystem>& system
+  , Pointer p
+  , std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template <
+    typename T,
+    template <typename> class BaseSystem,
+    typename Allocator,
+    typename ...Dependencies
+>
+__host__
+thrust::pair<T*, std::ptrdiff_t>
+get_temporary_buffer(
+    thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system,
+    std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::void_pointer                        void_pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+
+  // How many elements of type value_type do we need to accommodate n elements
+  // of type T?
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements);
+
+  // Return the pointer and the number of elements of type T allocated.
+  return thrust::make_pair(thrust::reinterpret_pointer_cast<T*>(ptr),n);
+}
+
+template <
+    typename Pointer,
+    template <typename> class BaseSystem,
+    typename Allocator,
+    typename ...Dependencies
+>
+__host__
+void
+return_temporary_buffer(
+    thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system,
+    Pointer p,
+    std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
+}
+
+#endif
+
+}} // namespace thrust::detail
+
diff --git a/thrust/thrust/detail/execute_with_allocator_fwd.h b/thrust/thrust/detail/execute_with_allocator_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..22d78fdd6b06dff998027737ab321f94607d7897
--- /dev/null
+++ b/thrust/thrust/detail/execute_with_allocator_fwd.h
@@ -0,0 +1,104 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/execute_with_dependencies.h>
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+template <typename Allocator, template <typename> class BaseSystem>
+struct execute_with_allocator
+  : BaseSystem<execute_with_allocator<Allocator, BaseSystem> >
+{
+private:
+  typedef BaseSystem<execute_with_allocator<Allocator, BaseSystem> > super_t;
+
+  Allocator alloc;
+
+public:
+  __host__ __device__
+  execute_with_allocator(super_t const& super, Allocator alloc_)
+    : super_t(super), alloc(alloc_)
+  {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  execute_with_allocator(Allocator alloc_)
+    : alloc(alloc_)
+  {}
+
+  typename remove_reference<Allocator>::type& get_allocator() { return alloc; }
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(Dependencies&& ...dependencies) const
+  {
+    return { alloc, capture_as_dependency(THRUST_FWD(dependencies))... };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(std::tuple<Dependencies...>& dependencies) const
+  {
+      return { alloc, capture_as_dependency(dependencies) };
+  }
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(std::tuple<Dependencies...>&& dependencies) const
+  {
+      return { alloc, capture_as_dependency(std::move(dependencies)) };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(Dependencies&& ...dependencies) const
+  {
+    return { alloc, capture_as_dependency(THRUST_FWD(dependencies))... };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(std::tuple<Dependencies...>& dependencies) const
+  {
+      return { alloc, capture_as_dependency(dependencies) };
+  }
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(std::tuple<Dependencies...>&& dependencies) const
+  {
+      return { alloc, capture_as_dependency(std::move(dependencies)) };
+  }
+#endif
+};
+
+}} // namespace thrust::detail
diff --git a/thrust/thrust/detail/execute_with_dependencies.h b/thrust/thrust/detail/execute_with_dependencies.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb92b1ba2b372d8cba9be817aee2e2db48160dc0
--- /dev/null
+++ b/thrust/thrust/detail/execute_with_dependencies.h
@@ -0,0 +1,267 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <tuple>
+#include <type_traits>
+
+namespace thrust
+{
+namespace detail
+{
+
+struct capture_as_dependency_fn
+{
+  template<typename Dependency>
+  auto operator()(Dependency&& dependency) const
+  THRUST_DECLTYPE_RETURNS(capture_as_dependency(THRUST_FWD(dependency)))
+};
+
+// Default implementation: universal forwarding.
+template<typename Dependency>
+auto capture_as_dependency(Dependency&& dependency)
+THRUST_DECLTYPE_RETURNS(THRUST_FWD(dependency))
+
+template<typename... Dependencies>
+auto capture_as_dependency(std::tuple<Dependencies...>& dependencies)
+THRUST_DECLTYPE_RETURNS(
+  tuple_for_each(THRUST_FWD(dependencies), capture_as_dependency_fn{})
+)
+
+template<template<typename> class BaseSystem, typename... Dependencies>
+struct execute_with_dependencies
+    : BaseSystem<execute_with_dependencies<BaseSystem, Dependencies...>>
+{
+private:
+    using super_t = BaseSystem<execute_with_dependencies<BaseSystem, Dependencies...>>;
+
+    std::tuple<remove_cvref_t<Dependencies>...> dependencies;
+
+public:
+    __host__
+    execute_with_dependencies(super_t const &super, Dependencies && ...dependencies)
+        : super_t(super), dependencies(std::forward<Dependencies>(dependencies)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(super_t const &super, UDependencies && ...deps)
+        : super_t(super), dependencies(THRUST_FWD(deps)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(UDependencies && ...deps)
+        : dependencies(THRUST_FWD(deps)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(super_t const &super, std::tuple<UDependencies...>&& deps)
+        : super_t(super), dependencies(std::move(deps))
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(std::tuple<UDependencies...>&& deps)
+        : dependencies(std::move(deps))
+    {
+    }
+
+    std::tuple<remove_cvref_t<Dependencies>...>
+    __host__
+    extract_dependencies() 
+    {
+        return std::move(dependencies);
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(UDependencies&& ...udependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(udependencies))... };
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>& udependencies) const
+    {
+        return { capture_as_dependency(udependencies) };
+    }
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>&& udependencies) const
+    {
+        return { capture_as_dependency(std::move(udependencies)) };
+    }
+};
+
+template<
+    typename Allocator,
+    template<typename> class BaseSystem,
+    typename... Dependencies
+>
+struct execute_with_allocator_and_dependencies
+    : BaseSystem<
+        execute_with_allocator_and_dependencies<
+            Allocator,
+            BaseSystem,
+            Dependencies...
+        >
+    >
+{
+private:
+    using super_t = BaseSystem<
+        execute_with_allocator_and_dependencies<
+            Allocator,
+            BaseSystem,
+            Dependencies...
+        >
+    >;
+
+    std::tuple<remove_cvref_t<Dependencies>...> dependencies;
+    Allocator alloc;
+
+public:
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(super_t const &super, Allocator a, UDependencies && ...deps)
+        : super_t(super), dependencies(THRUST_FWD(deps)...), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(Allocator a, UDependencies && ...deps)
+        : dependencies(THRUST_FWD(deps)...), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(super_t const &super, Allocator a, std::tuple<UDependencies...>&& deps)
+        : super_t(super), dependencies(std::move(deps)), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(Allocator a, std::tuple<UDependencies...>&& deps)
+        : dependencies(std::move(deps)), alloc(a)
+    {
+    }
+
+    std::tuple<remove_cvref_t<Dependencies>...>
+    __host__
+    extract_dependencies() 
+    {
+        return std::move(dependencies);
+    }
+
+    __host__
+    typename std::add_lvalue_reference<Allocator>::type
+    get_allocator()
+    {
+        return alloc;
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(UDependencies&& ...udependencies) const
+    {
+        return { alloc, capture_as_dependency(THRUST_FWD(udependencies))... };
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>& udependencies) const
+    {
+        return { alloc, capture_as_dependency(udependencies) };
+    }
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>&& udependencies) const
+    {
+        return { alloc, capture_as_dependency(std::move(udependencies)) };
+    }
+};
+
+template<template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_dependencies<BaseSystem, Dependencies...>&& system)
+{
+    return std::move(system).extract_dependencies();
+}
+template<template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_dependencies<BaseSystem, Dependencies...>& system)
+{
+    return std::move(system).extract_dependencies();
+}
+
+template<typename Allocator, template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>&& system)
+{
+    return std::move(system).extract_dependencies();
+}
+template<typename Allocator, template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system)
+{
+    return std::move(system).extract_dependencies();
+}
+
+template<typename System>
+__host__
+std::tuple<>
+extract_dependencies(System &&)
+{
+    return std::tuple<>{};
+}
+
+} // end detail
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/detail/execution_policy.h b/thrust/thrust/detail/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec554b689016f0482ccfeccf9c6c81bcc528db8d
--- /dev/null
+++ b/thrust/thrust/detail/execution_policy.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+struct execution_policy_marker {};
+
+// execution_policy_base serves as a guard against
+// inifinite recursion in thrust entry points:
+//
+// template<typename DerivedPolicy>
+// void foo(const thrust::detail::execution_policy_base<DerivedPolicy> &s)
+// {
+//   using thrust::system::detail::generic::foo;
+//
+//   foo(thrust::detail::derived_cast(thrust::detail::strip_const(s));
+// }
+//
+// foo is not recursive when
+// 1. DerivedPolicy is derived from thrust::execution_policy below
+// 2. generic::foo takes thrust::execution_policy as a parameter
+template<typename DerivedPolicy>
+struct execution_policy_base : execution_policy_marker {};
+
+
+template<typename DerivedPolicy>
+THRUST_CONSTEXPR __host__ __device__
+execution_policy_base<DerivedPolicy> &strip_const(const execution_policy_base<DerivedPolicy> &x)
+{
+  return const_cast<execution_policy_base<DerivedPolicy>&>(x);
+}
+
+
+template<typename DerivedPolicy>
+THRUST_CONSTEXPR __host__ __device__
+DerivedPolicy &derived_cast(execution_policy_base<DerivedPolicy> &x)
+{
+  return static_cast<DerivedPolicy&>(x);
+}
+
+
+template<typename DerivedPolicy>
+THRUST_CONSTEXPR __host__ __device__
+const DerivedPolicy &derived_cast(const execution_policy_base<DerivedPolicy> &x)
+{
+  return static_cast<const DerivedPolicy&>(x);
+}
+
+} // end detail
+
+template<typename DerivedPolicy>
+  struct execution_policy
+    : thrust::detail::execution_policy_base<DerivedPolicy>
+{};
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/extrema.inl b/thrust/thrust/detail/extrema.inl
new file mode 100644
index 0000000000000000000000000000000000000000..3f60743e649253150fefbce1c5390ec3efea87aa
--- /dev/null
+++ b/thrust/thrust/detail/extrema.inl
@@ -0,0 +1,172 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/detail/config.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/extrema.h>
+#include <thrust/system/detail/adl/extrema.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::min_element;
+  return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end min_element()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::min_element;
+  return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end min_element()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::max_element;
+  return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end max_element()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::max_element;
+  return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end max_element()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::minmax_element;
+  return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end minmax_element()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::minmax_element;
+  return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end minmax_element()
+
+
+template <typename ForwardIterator>
+ForwardIterator min_element(ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::min_element(select_system(system), first, last);
+} // end min_element()
+
+
+template <typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::min_element(select_system(system), first, last, comp);
+} // end min_element()
+
+
+template <typename ForwardIterator>
+ForwardIterator max_element(ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::max_element(select_system(system), first, last);
+} // end max_element()
+
+
+template <typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::max_element(select_system(system), first, last, comp);
+} // end max_element()
+
+
+template <typename ForwardIterator>
+thrust::pair<ForwardIterator,ForwardIterator> 
+minmax_element(ForwardIterator first, ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::minmax_element(select_system(system), first, last);
+} // end minmax_element()
+
+
+template <typename ForwardIterator, typename BinaryPredicate>
+thrust::pair<ForwardIterator,ForwardIterator> 
+minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::minmax_element(select_system(system), first, last, comp);
+} // end minmax_element()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/fill.inl b/thrust/thrust/detail/fill.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6e957ca1f9c8f728e96911a8b1346978fa1a2152
--- /dev/null
+++ b/thrust/thrust/detail/fill.inl
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file fill.inl
+ *  \brief Inline file for fill.h.
+ */
+
+#include <thrust/fill.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/fill.h>
+#include <thrust/system/detail/adl/fill.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void fill(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T &value)
+{
+  using thrust::system::detail::generic::fill;
+  return fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+} // end fill()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename OutputIterator, typename Size, typename T>
+__host__ __device__
+  OutputIterator fill_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        OutputIterator first,
+                        Size n,
+                        const T &value)
+{
+  using thrust::system::detail::generic::fill_n;
+  return fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, value);
+} // end fill_n()
+
+
+template<typename ForwardIterator, typename T>
+__host__ __device__
+  void fill(ForwardIterator first,
+            ForwardIterator last,
+            const T &value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  thrust::fill(select_system(system), first, last, value);
+} // end fill()
+
+
+template<typename OutputIterator, typename Size, typename T>
+__host__ __device__
+  OutputIterator fill_n(OutputIterator first,
+                        Size n,
+                        const T &value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<OutputIterator>::type System;
+
+  System system;
+
+  return thrust::fill_n(select_system(system), first, n, value);
+} // end fill()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/find.inl b/thrust/thrust/detail/find.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f42ff46506b04feb165fb03c2c035cdabb9c80bd
--- /dev/null
+++ b/thrust/thrust/detail/find.inl
@@ -0,0 +1,115 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file find.inl
+ *  \brief Inline file for find.h
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/find.h>
+#include <thrust/system/detail/adl/find.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+InputIterator find(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   const T& value)
+{
+  using thrust::system::detail::generic::find;
+  return find(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+} // end find()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  using thrust::system::detail::generic::find_if;
+  return find_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end find_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if_not(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::find_if_not;
+  return find_if_not(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end find_if_not()
+
+
+template <typename InputIterator, typename T>
+InputIterator find(InputIterator first,
+                   InputIterator last,
+                   const T& value)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+  
+  System system;
+  
+  return thrust::find(select_system(system), first, last, value);
+}
+
+template <typename InputIterator, typename Predicate>
+InputIterator find_if(InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+  
+  System system;
+  
+  return thrust::find_if(select_system(system), first, last, pred);
+}
+
+template <typename InputIterator, typename Predicate>
+InputIterator find_if_not(InputIterator first,
+                          InputIterator last,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+  
+  System system;
+  
+  return thrust::find_if_not(select_system(system), first, last, pred);
+}
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/for_each.inl b/thrust/thrust/detail/for_each.inl
new file mode 100644
index 0000000000000000000000000000000000000000..3365ce2e0f6f19c3516185b8fe255e1f516e6b07
--- /dev/null
+++ b/thrust/thrust/detail/for_each.inl
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file for_each.inl
+ *  \brief Inline file for for_each.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/for_each.h>
+#include <thrust/system/detail/adl/for_each.h>
+
+namespace thrust
+{
+
+__thrust_exec_check_disable__ 
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename UnaryFunction>
+__host__ __device__
+  InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         UnaryFunction f)
+{
+  using thrust::system::detail::generic::for_each;
+
+  return for_each(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, f);
+}
+
+
+template<typename InputIterator,
+         typename UnaryFunction>
+InputIterator for_each(InputIterator first,
+                       InputIterator last,
+                       UnaryFunction f)
+{
+  using thrust::system::detail::generic::select_system;
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+  return thrust::for_each(select_system(system), first, last, f);
+} // end for_each()
+
+__thrust_exec_check_disable__ 
+template<typename DerivedPolicy, typename InputIterator, typename Size, typename UnaryFunction>
+__host__ __device__
+  InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first,
+                           Size n,
+                           UnaryFunction f)
+{
+  using thrust::system::detail::generic::for_each_n;
+
+  return for_each_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, f);
+} // end for_each_n()
+
+
+template<typename InputIterator,
+         typename Size,
+         typename UnaryFunction>
+InputIterator for_each_n(InputIterator first,
+                         Size n,
+                         UnaryFunction f)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+  return thrust::for_each_n(select_system(system), first, n, f);
+} // end for_each_n()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/function.h b/thrust/thrust/detail/function.h
new file mode 100644
index 0000000000000000000000000000000000000000..a251c298a26142c741b8ba2cb522cdcf473abd1f
--- /dev/null
+++ b/thrust/thrust/detail/function.h
@@ -0,0 +1,160 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_reference_cast.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template <typename Function, typename Result>
+struct wrapped_function
+{
+  // mutable because Function::operator() might be const
+  mutable Function m_f;
+
+  inline __host__ __device__
+  wrapped_function()
+      : m_f()
+  {}
+
+  inline __host__ __device__
+  wrapped_function(const Function& f)
+      : m_f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  inline __host__ __device__
+  Result operator()(Argument& x) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  inline __host__ __device__
+  Result operator()(const Argument& x) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  Result operator()(Argument1& x, Argument2& y) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  Result operator()(const Argument1& x, Argument2& y) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  Result operator()(const Argument1& x, const Argument2& y) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  Result operator()(Argument1& x, const Argument2& y) const
+  {
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
+  }
+}; // end wrapped_function
+
+// Specialize for void return types:
+template <typename Function>
+struct wrapped_function<Function, void>
+{
+  // mutable because Function::operator() might be const
+  mutable Function m_f;
+  inline __host__ __device__
+  wrapped_function()
+    : m_f()
+  {}
+
+  inline __host__ __device__
+  wrapped_function(const Function& f)
+    : m_f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  inline __host__ __device__
+  void operator()(Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  inline __host__ __device__
+  void operator()(const Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  void operator()(Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  void operator()(const Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  void operator()(const Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  inline __host__ __device__
+  void operator()(Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+}; // end wrapped_function
+
+} // namespace detail
+} // namespace thrust
diff --git a/thrust/thrust/detail/functional.inl b/thrust/thrust/detail/functional.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ea13227978fd276115fccd5cb53c8e9b875d5e88
--- /dev/null
+++ b/thrust/thrust/detail/functional.inl
@@ -0,0 +1,124 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/functional.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename Operation>
+  struct unary_traits_imp;
+
+template<typename Operation>
+  struct unary_traits_imp<Operation*>
+{
+  typedef Operation                         function_type;
+  typedef const function_type &             param_type;
+  typedef typename Operation::result_type   result_type;
+  typedef typename Operation::argument_type argument_type;
+}; // end unary_traits_imp
+
+template<typename Result, typename Argument>
+  struct unary_traits_imp<Result(*)(Argument)>
+{
+  typedef Result   (*function_type)(Argument);
+  typedef Result   (*param_type)(Argument);
+  typedef Result   result_type;
+  typedef Argument argument_type;
+}; // end unary_traits_imp
+
+template<typename Operation>
+  struct binary_traits_imp;
+
+template<typename Operation>
+  struct binary_traits_imp<Operation*>
+{
+  typedef Operation                                function_type;
+  typedef const function_type &                    param_type;
+  typedef typename Operation::result_type          result_type;
+  typedef typename Operation::first_argument_type  first_argument_type;
+  typedef typename Operation::second_argument_type second_argument_type;
+}; // end binary_traits_imp
+
+template<typename Result, typename Argument1, typename Argument2>
+  struct binary_traits_imp<Result(*)(Argument1, Argument2)>
+{
+  typedef Result (*function_type)(Argument1, Argument2);
+  typedef Result (*param_type)(Argument1, Argument2);
+  typedef Result result_type;
+  typedef Argument1 first_argument_type;
+  typedef Argument2 second_argument_type;
+}; // end binary_traits_imp
+
+} // end detail
+
+template<typename Operation>
+  struct unary_traits
+{
+  typedef typename detail::unary_traits_imp<Operation*>::function_type function_type;
+  typedef typename detail::unary_traits_imp<Operation*>::param_type    param_type;
+  typedef typename detail::unary_traits_imp<Operation*>::result_type   result_type;
+  typedef typename detail::unary_traits_imp<Operation*>::argument_type argument_type;
+}; // end unary_traits
+
+template<typename Result, typename Argument>
+  struct unary_traits<Result(*)(Argument)>
+{
+  typedef Result   (*function_type)(Argument);
+  typedef Result   (*param_type)(Argument);
+  typedef Result   result_type;
+  typedef Argument argument_type;
+}; // end unary_traits
+
+template<typename Operation>
+  struct binary_traits
+{
+  typedef typename detail::binary_traits_imp<Operation*>::function_type        function_type;
+  typedef typename detail::binary_traits_imp<Operation*>::param_type           param_type;
+  typedef typename detail::binary_traits_imp<Operation*>::result_type          result_type;
+  typedef typename detail::binary_traits_imp<Operation*>::first_argument_type  first_argument_type;
+  typedef typename detail::binary_traits_imp<Operation*>::second_argument_type second_argument_type;
+}; // end binary_traits
+
+template<typename Result, typename Argument1, typename Argument2>
+  struct binary_traits<Result(*)(Argument1, Argument2)>
+{
+  typedef Result (*function_type)(Argument1, Argument2);
+  typedef Result (*param_type)(Argument1, Argument2);
+  typedef Result result_type;
+  typedef Argument1 first_argument_type;
+  typedef Argument2 second_argument_type;
+}; // end binary_traits
+
+template<typename Predicate>
+  __host__ __device__
+  unary_negate<Predicate> not1(const Predicate &pred)
+{
+  return unary_negate<Predicate>(pred);
+} // end not1()
+
+template<typename BinaryPredicate>
+  __host__ __device__
+  binary_negate<BinaryPredicate> not2(const BinaryPredicate &pred)
+{
+  return binary_negate<BinaryPredicate>(pred);
+} // end not2()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/actor.h b/thrust/thrust/detail/functional/actor.h
new file mode 100644
index 0000000000000000000000000000000000000000..01e8d5cd358cc2e81aca079dde1c9c8639ad12ca
--- /dev/null
+++ b/thrust/thrust/detail/functional/actor.h
@@ -0,0 +1,156 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// Portions of this code are derived from
+//
+// Manjunath Kudlur's Carbon library
+//
+// and
+//
+// Based on Boost.Phoenix v1.2
+// Copyright (c) 2001-2002 Joel de Guzman
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/tuple.h>
+#include <thrust/detail/functional/value.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/assignment_operator.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+// eval_ref<T> is
+// - T when T is a subclass of thrust::reference
+// - T& otherwise
+// This is used to let thrust::references pass through actor evaluations.
+template <typename T>
+using eval_ref = typename std::conditional<
+  thrust::detail::is_wrapped_reference<T>::value, T, T&>::type;
+
+template<typename Action, typename Env>
+  struct apply_actor
+{
+  typedef typename Action::template result<Env>::type type;
+};
+
+template<typename Eval>
+  struct actor
+    : Eval
+{
+  typedef Eval eval_type;
+
+  __host__ __device__
+  THRUST_CONSTEXPR actor();
+
+  __host__ __device__
+  actor(const Eval &base);
+
+  __host__ __device__
+  typename apply_actor<eval_type, thrust::null_type >::type
+  operator()(void) const;
+
+  template <typename... Ts>
+  __host__ __device__
+  typename apply_actor<eval_type, thrust::tuple<eval_ref<Ts>...>>::type
+  operator()(Ts&&... ts) const;
+
+  template<typename T>
+  __host__ __device__
+  typename assign_result<Eval,T>::type
+  operator=(const T &_1) const;
+}; // end actor
+
+// in general, as_actor should turn things into values
+template<typename T>
+  struct as_actor
+{
+  typedef value<T> type;
+
+  static inline __host__ __device__ type convert(const T &x)
+  {
+    return val(x);
+  } // end convert()
+}; // end as_actor
+
+// specialization for things which are already actors
+template<typename Eval>
+  struct as_actor<actor<Eval> >
+{
+  typedef actor<Eval> type;
+
+  static inline __host__ __device__ const type &convert(const actor<Eval> &x)
+  {
+    return x;
+  } // end convert()
+}; // end as_actor
+
+template<typename T>
+  typename as_actor<T>::type
+  __host__ __device__
+    make_actor(const T &x)
+{
+  return as_actor<T>::convert(x);
+} // end make_actor()
+
+} // end functional
+
+// provide specializations for result_of for nullary, unary, and binary invocations of actor
+template<typename Eval>
+  struct result_of_adaptable_function<
+    thrust::detail::functional::actor<Eval>()
+  >
+{
+  typedef typename thrust::detail::functional::apply_actor<
+    thrust::detail::functional::actor<Eval>,
+    thrust::null_type
+  >::type type;
+}; // end result_of
+
+template<typename Eval, typename Arg1>
+  struct result_of_adaptable_function<
+    thrust::detail::functional::actor<Eval>(Arg1)
+  >
+{
+  typedef typename thrust::detail::functional::apply_actor<
+    thrust::detail::functional::actor<Eval>,
+    thrust::tuple<Arg1>
+  >::type type;
+}; // end result_of
+
+template<typename Eval, typename Arg1, typename Arg2>
+  struct result_of_adaptable_function<
+    thrust::detail::functional::actor<Eval>(Arg1,Arg2)
+  >
+{
+  typedef typename thrust::detail::functional::apply_actor<
+    thrust::detail::functional::actor<Eval>,
+    thrust::tuple<Arg1,Arg2>
+  >::type type;
+}; // end result_of
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/functional/actor.inl>
+
diff --git a/thrust/thrust/detail/functional/actor.inl b/thrust/thrust/detail/functional/actor.inl
new file mode 100644
index 0000000000000000000000000000000000000000..444d2ff1a578dcdba8a183b7bdd5b17721755e2d
--- /dev/null
+++ b/thrust/thrust/detail/functional/actor.inl
@@ -0,0 +1,113 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// Portions of this code are derived from
+//
+// Manjunath Kudlur's Carbon library
+//
+// and
+//
+// Based on Boost.Phoenix v1.2
+// Copyright (c) 2001-2002 Joel de Guzman
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/assignment_operator.h>
+#include <thrust/functional.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace detail
+{
+namespace functional
+{
+
+template<typename Eval>
+  __host__ __device__
+  THRUST_CONSTEXPR actor<Eval>
+    ::actor()
+      : eval_type()
+{}
+
+template<typename Eval>
+  __host__ __device__
+  actor<Eval>
+    ::actor(const Eval &base)
+      : eval_type(base)
+{}
+
+template<typename Eval>
+  __host__ __device__
+  typename apply_actor<
+    typename actor<Eval>::eval_type,
+    typename thrust::null_type
+  >::type
+    actor<Eval>
+      ::operator()(void) const
+{
+  return eval_type::eval(thrust::null_type());
+} // end basic_environment::operator()
+
+// actor::operator() needs to construct a tuple of references to its
+// arguments. To make this work with thrust::reference<T>, we need to
+// detect thrust proxy references and store them as T rather than T&.
+// This check ensures that the forwarding references passed into
+// actor::operator() are either:
+// - T&& if and only if T is a thrust::reference<U>, or
+// - T& for any other types.
+// This struct provides a nicer diagnostic for when these conditions aren't
+// met.
+template <typename T>
+using actor_check_ref_type =
+  thrust::detail::integral_constant<bool,
+    ( std::is_lvalue_reference<T>::value ||
+      thrust::detail::is_wrapped_reference<T>::value )>;
+
+template <typename... Ts>
+using actor_check_ref_types =
+  thrust::conjunction<actor_check_ref_type<Ts>...>;
+
+template<typename Eval>
+template<typename... Ts>
+__host__ __device__
+typename apply_actor<typename actor<Eval>::eval_type,
+                     thrust::tuple<eval_ref<Ts>...>>::type
+actor<Eval>::operator()(Ts&&... ts) const
+{
+  static_assert(actor_check_ref_types<Ts...>::value,
+                "Actor evaluations only support rvalue references to "
+                "thrust::reference subclasses.");
+  using tuple_type = thrust::tuple<eval_ref<Ts>...>;
+  return eval_type::eval(tuple_type(THRUST_FWD(ts)...));
+} // end actor<Eval>::operator()
+
+template<typename Eval>
+  template<typename T>
+    __host__ __device__
+    typename assign_result<Eval,T>::type
+      actor<Eval>
+        ::operator=(const T& _1) const
+{
+  return do_assign(*this,_1);
+} // end actor::operator=()
+
+} // end functional
+} // end detail
+} // end thrust
diff --git a/thrust/thrust/detail/functional/argument.h b/thrust/thrust/detail/functional/argument.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b7541716e5ddb6e15eb8c3bdb5f950dd7218677
--- /dev/null
+++ b/thrust/thrust/detail/functional/argument.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// Portions of this code are derived from
+//
+// Manjunath Kudlur's Carbon library
+//
+// and
+//
+// Based on Boost.Phoenix v1.2
+// Copyright (c) 2001-2002 Joel de Guzman
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/tuple.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<unsigned int i, typename Env>
+  struct argument_helper
+{
+  typedef typename thrust::tuple_element<i,Env>::type type;
+};
+
+template<unsigned int i>
+  struct argument_helper<i,thrust::null_type>
+{
+  typedef thrust::null_type type;
+};
+
+
+template<unsigned int i>
+  class argument
+{
+  public:
+    template<typename Env>
+      struct result
+        : argument_helper<i,Env>
+    {
+    };
+
+    __host__ __device__
+    THRUST_CONSTEXPR argument(){}
+
+    template<typename Env>
+    __host__ __device__
+    typename result<Env>::type eval(const Env &e) const
+    {
+      return thrust::get<i>(e);
+    } // end eval()
+}; // end argument
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/composite.h b/thrust/thrust/detail/functional/composite.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cf095bf116122a652b6c6d8bc5cb01100977dd7
--- /dev/null
+++ b/thrust/thrust/detail/functional/composite.h
@@ -0,0 +1,163 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// Portions of this code are derived from
+//
+// Manjunath Kudlur's Carbon library
+//
+// and
+//
+// Based on Boost.Phoenix v1.2
+// Copyright (c) 2001-2002 Joel de Guzman
+
+#pragma once
+
+#include <thrust/detail/functional/actor.h>
+#include <thrust/tuple.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+// XXX we should just take a single EvalTuple
+template<typename Eval0,
+         typename Eval1  = thrust::null_type,
+         typename Eval2  = thrust::null_type,
+         typename Eval3  = thrust::null_type,
+         typename Eval4  = thrust::null_type,
+         typename Eval5  = thrust::null_type,
+         typename Eval6  = thrust::null_type,
+         typename Eval7  = thrust::null_type,
+         typename Eval8  = thrust::null_type,
+         typename Eval9  = thrust::null_type,
+         typename Eval10 = thrust::null_type>
+  class composite;
+
+template<typename Eval0, typename Eval1>
+  class composite<
+    Eval0,
+    Eval1,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type
+  >
+{
+  public:
+    template<typename Env>
+      struct result
+    {
+      typedef typename Eval0::template result<
+        thrust::tuple<
+          typename Eval1::template result<Env>::type
+        >
+      >::type type;
+    };
+
+    __host__ __device__
+    composite(const Eval0 &e0, const Eval1 &e1)
+      : m_eval0(e0),
+        m_eval1(e1)
+    {}
+
+    template<typename Env>
+    __host__ __device__
+    typename result<Env>::type
+    eval(const Env &x) const
+    {
+      typename Eval1::template result<Env>::type result1 = m_eval1.eval(x);
+      return m_eval0.eval(thrust::tie(result1));
+    }
+
+  private:
+    Eval0 m_eval0;
+    Eval1 m_eval1;
+}; // end composite<Eval0,Eval1>
+
+template<typename Eval0, typename Eval1, typename Eval2>
+  class composite<
+    Eval0,
+    Eval1,
+    Eval2,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type,
+    thrust::null_type
+  >
+{
+  public:
+    template<typename Env>
+      struct result
+    {
+      typedef typename Eval0::template result<
+        thrust::tuple<
+          typename Eval1::template result<Env>::type,
+          typename Eval2::template result<Env>::type
+        >
+      >::type type;
+    };
+
+    __host__ __device__
+    composite(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2)
+      : m_eval0(e0),
+        m_eval1(e1),
+        m_eval2(e2)
+    {}
+
+    template<typename Env>
+    __host__ __device__
+    typename result<Env>::type
+    eval(const Env &x) const
+    {
+      typename Eval1::template result<Env>::type result1 = m_eval1.eval(x);
+      typename Eval2::template result<Env>::type result2 = m_eval2.eval(x);
+      return m_eval0.eval(thrust::tie(result1,result2));
+    }
+
+  private:
+    Eval0 m_eval0;
+    Eval1 m_eval1;
+    Eval2 m_eval2;
+}; // end composite<Eval0,Eval1,Eval2>
+
+template<typename Eval0, typename Eval1>
+__host__ __device__
+  actor<composite<Eval0,Eval1> > compose(const Eval0 &e0, const Eval1 &e1)
+{
+  return actor<composite<Eval0,Eval1> >(composite<Eval0,Eval1>(e0,e1));
+}
+
+template<typename Eval0, typename Eval1, typename Eval2>
+__host__ __device__
+  actor<composite<Eval0,Eval1,Eval2> > compose(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2)
+{
+  return actor<composite<Eval0,Eval1,Eval2> >(composite<Eval0,Eval1,Eval2>(e0,e1,e2));
+}
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators.h b/thrust/thrust/detail/functional/operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..f86ea20521811911e53812320e134a1e5c68079c
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/operators/arithmetic_operators.h>
+#include <thrust/detail/functional/operators/relational_operators.h>
+#include <thrust/detail/functional/operators/logical_operators.h>
+#include <thrust/detail/functional/operators/bitwise_operators.h>
+#include <thrust/detail/functional/operators/compound_assignment_operators.h>
+
diff --git a/thrust/thrust/detail/functional/operators/arithmetic_operators.h b/thrust/thrust/detail/functional/operators/arithmetic_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd5b707e3ba163d7308b3d893a4f4b773af1933f
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/arithmetic_operators.h
@@ -0,0 +1,432 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<thrust::negate<>>,
+    actor<Eval>
+  >
+>
+__host__ __device__
+operator-(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<thrust::negate<>>(), _1);
+} // end operator-()
+
+// there's no standard unary_plus functional, so roll an ad hoc one here
+struct unary_plus
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(+THRUST_FWD(t1))) -> decltype(+THRUST_FWD(t1))
+  {
+    return +THRUST_FWD(t1);
+  }
+};
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<unary_plus>,
+    actor<Eval>
+  >
+>
+operator+(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<unary_plus>(), _1);
+} // end operator+()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::plus<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator+(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::plus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator+()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::plus<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator+(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::plus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator+()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::plus<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator+(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::plus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator+()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::minus<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator-(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::minus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator-()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::minus<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator-(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::minus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator-()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::minus<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator-(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::minus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator-()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::multiplies<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator*(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator*()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::multiplies<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator*(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator*()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::multiplies<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator*(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator*()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::divides<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator/(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::divides<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator/()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::divides<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator/(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::divides<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator/()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::divides<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator/(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::divides<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator/()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::modulus<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator%(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator%()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::modulus<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator%(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::modulus<void>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator%()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::modulus<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator%(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator%()
+
+// there's no standard prefix_increment functional, so roll an ad hoc one here
+struct prefix_increment
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(++THRUST_FWD(t1))) -> decltype(++THRUST_FWD(t1))
+  {
+    return ++THRUST_FWD(t1);
+  }
+}; // end prefix_increment
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<prefix_increment>,
+    actor<Eval>
+  >
+>
+operator++(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<prefix_increment>(), _1);
+} // end operator++()
+
+
+// there's no standard postfix_increment functional, so roll an ad hoc one here
+struct postfix_increment
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)++)) -> decltype(THRUST_FWD(t1)++)
+  {
+    return THRUST_FWD(t1)++;
+  }
+}; // end postfix_increment
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<postfix_increment>,
+    actor<Eval>
+  >
+>
+operator++(const actor<Eval> &_1, int)
+{
+  return compose(transparent_unary_operator<postfix_increment>(), _1);
+} // end operator++()
+
+
+// there's no standard prefix_decrement functional, so roll an ad hoc one here
+struct prefix_decrement
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(--THRUST_FWD(t1))) -> decltype(--THRUST_FWD(t1))
+  {
+    return --THRUST_FWD(t1);
+  }
+}; // end prefix_decrement
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<prefix_decrement>,
+    actor<Eval>
+  >
+>
+operator--(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<prefix_decrement>(), _1);
+} // end operator--()
+
+
+// there's no standard postfix_decrement functional, so roll an ad hoc one here
+struct postfix_decrement
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)--)) -> decltype(THRUST_FWD(t1)--)
+  {
+    return THRUST_FWD(t1)--;
+  }
+}; // end prefix_increment
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<postfix_decrement>,
+    actor<Eval>
+  >
+>
+operator--(const actor<Eval> &_1, int)
+{
+  return compose(transparent_unary_operator<postfix_decrement>(), _1);
+} // end operator--()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/assignment_operator.h b/thrust/thrust/detail/functional/operators/assignment_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2f18339bc3956871e63f81b697cfd87d065ad62
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/assignment_operator.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+
+// XXX WAR circular inclusion with this forward declaration
+template<typename,typename,typename> struct binary_function;
+
+namespace detail
+{
+namespace functional
+{
+
+// XXX WAR circular inclusion with this forward declaration
+template<typename> struct as_actor;
+
+// there's no standard assign functional, so roll an ad hoc one here
+struct assign
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) = THRUST_FWD(t2)))
+      -> decltype(THRUST_FWD(t1) = THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) = THRUST_FWD(t2);
+  }
+};
+
+template<typename Eval, typename T>
+  struct assign_result
+{
+  typedef actor<
+    composite<
+      transparent_binary_operator<assign>,
+      actor<Eval>,
+      typename as_actor<T>::type
+    >
+  > type;
+}; // end assign_result
+
+template<typename Eval, typename T>
+  __host__ __device__
+    typename assign_result<Eval,T>::type
+      do_assign(const actor<Eval> &_1, const T &_2)
+{
+  return compose(transparent_binary_operator<assign>(),
+                 _1,
+                 as_actor<T>::convert(_2));
+} // end do_assign()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/bitwise_operators.h b/thrust/thrust/detail/functional/operators/bitwise_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6461f9d493132f6f7c331dedb619cc2fa79f8a9
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/bitwise_operators.h
@@ -0,0 +1,338 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_and<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator&(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_and<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator&(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_and<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator&(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_or<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator|(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_or<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator|(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_or<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator|(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_xor<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator^(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_xor<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator^()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_xor<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator^(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_xor<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator^()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_xor<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator^(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_xor<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator^()
+
+
+// there's no standard bit_not functional, so roll an ad hoc one here
+struct bit_not
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(~THRUST_FWD(t1))) -> decltype(~THRUST_FWD(t1))
+  {
+    return ~THRUST_FWD(t1);
+  }
+}; // end prefix_increment
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<bit_not>,
+    actor<Eval>
+  >
+>
+__host__ __device__
+operator~(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<bit_not>(), _1);
+} // end operator~()
+
+// there's no standard bit_lshift functional, so roll an ad hoc one here
+struct bit_lshift
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2)))
+      -> decltype(THRUST_FWD(t1) << THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) << THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_lshift>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator<<(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_lshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<<()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_lshift>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator<<(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_lshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<<()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_lshift>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator<<(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_lshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<<()
+
+// there's no standard bit_rshift functional, so roll an ad hoc one here
+struct bit_rshift
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) >> THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >> THRUST_FWD(t2);
+  }
+};
+
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_rshift>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator>>(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_rshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>>()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_rshift>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator>>(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_rshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>>()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_rshift>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator>>(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_rshift>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>>()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/compound_assignment_operators.h b/thrust/thrust/detail/functional/operators/compound_assignment_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..737d6abd098e0acc666ec9678e3219d8c9586cca
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/compound_assignment_operators.h
@@ -0,0 +1,513 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+// there's no standard plus_equal functional, so roll an ad hoc one here
+struct plus_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2)))
+      -> decltype(THRUST_FWD(t1) += THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) += THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<plus_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator+=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<plus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator+=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<plus_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator+=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<plus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator+=()
+
+// there's no standard minus_equal functional, so roll an ad hoc one here
+struct minus_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) -= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) -= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<minus_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator-=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<minus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator-=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<minus_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator-=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<minus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator-=()
+
+// there's no standard multiplies_equal functional, so roll an ad hoc one here
+struct multiplies_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) *= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) *= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<multiplies_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator*=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<multiplies_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator*=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<multiplies_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator*=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<multiplies_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator*=()
+
+// there's no standard divides_equal functional, so roll an ad hoc one here
+struct divides_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) /= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) /= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<divides_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator/=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<divides_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator/=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<divides_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator/=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<divides_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator/=()
+
+// there's no standard modulus_equal functional, so roll an ad hoc one here
+struct modulus_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) %= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) %= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<modulus_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator%=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<modulus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator%=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<modulus_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator%=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<modulus_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator%=()
+
+// there's no standard bit_and_equal functional, so roll an ad hoc one here
+struct bit_and_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) &= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) &= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_and_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator&=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_and_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_and_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator&=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_and_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&=()
+
+// there's no standard bit_or_equal functional, so roll an ad hoc one here
+struct bit_or_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) |= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) |= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_or_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator|=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_or_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_or_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator|=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_or_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|=()
+
+// there's no standard bit_xor_equal functional, so roll an ad hoc one here
+struct bit_xor_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) ^= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_xor_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator^=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_xor_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_xor_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator^=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_xor_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator|=()
+
+// there's no standard bit_lshift_equal functional, so roll an ad hoc one here
+struct bit_lshift_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) <<= THRUST_FWD(t2);
+  }
+};
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_lshift_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator<<=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<<=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_lshift_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator<<=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<<=()
+
+// there's no standard bit_rshift_equal functional, so roll an ad hoc one here
+struct bit_rshift_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2)))
+  -> decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t1) >>= THRUST_FWD(t2);
+  }
+};
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_rshift_equal>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator>>=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>>=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<bit_rshift_equal>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator>>=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>>=()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/logical_operators.h b/thrust/thrust/detail/functional/operators/logical_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a2e5e0402bf26f2205e1992af6d451990d6c19
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/logical_operators.h
@@ -0,0 +1,144 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_and<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator&&(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_and<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator&&(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_and<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator&&(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_or<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator||(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_or<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator||(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::logical_or<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator||(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator&&()
+
+template<typename Eval>
+__host__ __device__
+actor<
+  composite<
+    transparent_unary_operator<thrust::logical_not<>>,
+    actor<Eval>
+  >
+>
+operator!(const actor<Eval> &_1)
+{
+  return compose(transparent_unary_operator<thrust::logical_not<>>(), _1);
+} // end operator!()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/operator_adaptors.h b/thrust/thrust/detail/functional/operators/operator_adaptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..67a1f6e37d6180c0ec35f19d11a134f89c518925
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/operator_adaptors.h
@@ -0,0 +1,137 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/argument.h>
+#include <thrust/detail/type_deduction.h>
+#include <thrust/tuple.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/void_t.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>)
+// into the Eval interface.
+template <typename UnaryFunctor>
+struct transparent_unary_operator
+{
+  template <typename>
+  using operator_type = UnaryFunctor;
+
+  template <typename Env>
+  using argument =
+  typename thrust::detail::eval_if<
+    thrust::tuple_size<Env>::value != 1,
+    thrust::detail::identity_<thrust::null_type>,
+    thrust::detail::functional::argument_helper<0, Env>
+  >::type;
+
+  template <typename Env>
+  struct result_type_impl
+  {
+    using type = decltype(
+      std::declval<UnaryFunctor>()(std::declval<argument<Env>>()));
+  };
+
+  template <typename Env>
+  using result_type =
+  typename thrust::detail::eval_if<
+    std::is_same<thrust::null_type, argument<Env>>::value,
+    thrust::detail::identity_<thrust::null_type>,
+    result_type_impl<Env>
+  >::type;
+
+  template <typename Env>
+  struct result
+  {
+    using op_type = UnaryFunctor;
+    using type = result_type<Env>;
+  };
+
+  template <typename Env>
+  __host__ __device__
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(UnaryFunctor{}(thrust::get<0>(THRUST_FWD(e))))
+};
+
+
+// Adapts a transparent binary functor from functional.h (e.g. thrust::less<>)
+// into the Eval interface.
+template <typename BinaryFunctor>
+struct transparent_binary_operator
+{
+  template <typename>
+  using operator_type = BinaryFunctor;
+
+  template <typename Env>
+  using first_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<0, Env>
+    >::type;
+
+  template <typename Env>
+  using second_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<1, Env>
+    >::type;
+
+  template <typename Env>
+  struct result_type_impl
+  {
+    using type = decltype(
+      std::declval<BinaryFunctor>()(std::declval<first_argument<Env>>(),
+                                    std::declval<second_argument<Env>>()));
+  };
+
+  template <typename Env>
+  using result_type =
+    typename thrust::detail::eval_if<
+      (std::is_same<thrust::null_type, first_argument<Env>>::value ||
+       std::is_same<thrust::null_type, second_argument<Env>>::value),
+      thrust::detail::identity_<thrust::null_type>,
+      result_type_impl<Env>
+    >::type;
+
+  template <typename Env>
+  struct result
+  {
+    using op_type = BinaryFunctor;
+    using type = result_type<Env>;
+  };
+
+  template <typename Env>
+  __host__ __device__
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(BinaryFunctor{}(thrust::get<0>(e), thrust::get<1>(e)))
+};
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/operators/relational_operators.h b/thrust/thrust/detail/functional/operators/relational_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..51fd4640a2928021d9ef017c0dd96182d816b856
--- /dev/null
+++ b/thrust/thrust/detail/functional/operators/relational_operators.h
@@ -0,0 +1,323 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/composite.h>
+#include <thrust/detail/functional/operators/operator_adaptors.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::equal_to<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator==(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator==()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::equal_to<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator==(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator==()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::equal_to<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator==(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator==()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::not_equal_to<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator!=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator!=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::not_equal_to<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator!=(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator!=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::not_equal_to<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator!=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator!=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator>(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator>(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator>(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator<(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::less<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator<(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::less<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator<(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::less<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater_equal<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator>=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater_equal<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator>=(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::greater_equal<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator>=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator>=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less_equal<>>,
+    actor<T1>,
+    typename as_actor<T2>::type
+  >
+>
+operator<=(const actor<T1> &_1, const T2 &_2)
+{
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less_equal<>>,
+    typename as_actor<T1>::type,
+    actor<T2>
+  >
+>
+operator<=(const T1 &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<=()
+
+template<typename T1, typename T2>
+__host__ __device__
+actor<
+  composite<
+    transparent_binary_operator<thrust::less_equal<>>,
+    actor<T1>,
+    actor<T2>
+  >
+>
+operator<=(const actor<T1> &_1, const actor<T2> &_2)
+{
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
+                 make_actor(_1),
+                 make_actor(_2));
+} // end operator<=()
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/placeholder.h b/thrust/thrust/detail/functional/placeholder.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0832cfecb1c70dd28d78c44349f0ee5ad78c0fa
--- /dev/null
+++ b/thrust/thrust/detail/functional/placeholder.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+#include <thrust/detail/functional/argument.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+template<unsigned int i>
+  struct placeholder
+{
+  typedef actor<argument<i> > type;
+};
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/functional/value.h b/thrust/thrust/detail/functional/value.h
new file mode 100644
index 0000000000000000000000000000000000000000..27a584676fe0f9d6c2f87a345a3e185ba0ac5bde
--- /dev/null
+++ b/thrust/thrust/detail/functional/value.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// Portions of this code are derived from
+//
+// Manjunath Kudlur's Carbon library
+//
+// and
+//
+// Based on Boost.Phoenix v1.2
+// Copyright (c) 2001-2002 Joel de Guzman
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/functional/actor.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace functional
+{
+
+
+template<typename Eval> struct actor;
+
+
+template<typename T>
+  class value
+{
+  public:
+
+    template<typename Env>
+      struct result
+    {
+      typedef T type;
+    };
+
+    __host__ __device__
+    value(const T &arg)
+      : m_val(arg)
+    {}
+
+    template<typename Env>
+    __host__ __device__
+      T eval(const Env &) const
+    {
+      return m_val;
+    }
+
+  private:
+    T m_val;
+}; // end value
+
+template<typename T>
+__host__ __device__
+actor<value<T> > val(const T &x)
+{
+  return value<T>(x);
+} // end val()
+
+
+} // end functional
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/gather.inl b/thrust/thrust/detail/gather.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4550742c5f4eae7941120de3853b8b63ce5f8196
--- /dev/null
+++ b/thrust/thrust/detail/gather.inl
@@ -0,0 +1,166 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file gather.inl
+ *  \brief Inline file for gather.h.
+ */
+
+#include <thrust/gather.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/gather.h>
+#include <thrust/system/detail/adl/gather.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator                                               map_first,
+                        InputIterator                                               map_last,
+                        RandomAccessIterator                                        input_first,
+                        OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::gather;
+  return gather(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, input_first, result);
+} // end gather()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              map_first,
+                           InputIterator1                                              map_last,
+                           InputIterator2                                              stencil,
+                           RandomAccessIterator                                        input_first,
+                           OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::gather_if;
+  return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result);
+} // end gather_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator gather_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              map_first,
+                           InputIterator1                                              map_last,
+                           InputIterator2                                              stencil,
+                           RandomAccessIterator                                        input_first,
+                           OutputIterator                                              result,
+                           Predicate                                                   pred)
+{
+  using thrust::system::detail::generic::gather_if;
+  return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result, pred);
+} // end gather_if()
+
+
+template<typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+  OutputIterator gather(InputIterator        map_first,
+                        InputIterator        map_last,
+                        RandomAccessIterator input_first,
+                        OutputIterator       result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type        System1; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System2; 
+  typedef typename thrust::iterator_system<OutputIterator>::type       System3; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::gather(select_system(system1,system2,system3), map_first, map_last, input_first, result);
+} // end gather()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+  OutputIterator gather_if(InputIterator1       map_first,
+                           InputIterator1       map_last,
+                           InputIterator2       stencil,
+                           RandomAccessIterator input_first,
+                           OutputIterator       result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result);
+} // end gather_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator gather_if(InputIterator1       map_first,
+                           InputIterator1       map_last,
+                           InputIterator2       stencil,
+                           RandomAccessIterator input_first,
+                           OutputIterator       result,
+                           Predicate            pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result, pred);
+} // end gather_if()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/generate.inl b/thrust/thrust/detail/generate.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2ac936e537be7f160aae31890b07b993a4e85
--- /dev/null
+++ b/thrust/thrust/detail/generate.inl
@@ -0,0 +1,98 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file generate.inl
+ *  \author Jared Hoberock
+ *  \brief Inline file for generate.h.
+ */
+
+#include <thrust/generate.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/generate.h>
+#include <thrust/system/detail/adl/generate.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Generator>
+__host__ __device__
+  void generate(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                Generator gen)
+{
+  using thrust::system::detail::generic::generate;
+  return generate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, gen);
+} // end generate()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename OutputIterator,
+         typename Size,
+         typename Generator>
+__host__ __device__
+  OutputIterator generate_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            OutputIterator first,
+                            Size n,
+                            Generator gen)
+{
+  using thrust::system::detail::generic::generate_n;
+  return generate_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, gen);
+} // end generate_n()
+
+
+template<typename ForwardIterator,
+         typename Generator>
+  void generate(ForwardIterator first,
+                ForwardIterator last,
+                Generator gen)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::generate(select_system(system), first, last, gen);
+} // end generate()
+
+
+template<typename OutputIterator,
+         typename Size,
+         typename Generator>
+  OutputIterator generate_n(OutputIterator first,
+                            Size n,
+                            Generator gen)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<OutputIterator>::type System;
+
+  System system;
+
+  return thrust::generate_n(select_system(system), first, n, gen);
+} // end generate_n()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/get_iterator_value.h b/thrust/thrust/detail/get_iterator_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7bd1b9d9d7e513ad31ea91c134126d4ee5c239f
--- /dev/null
+++ b/thrust/thrust/detail/get_iterator_value.h
@@ -0,0 +1,53 @@
+#pragma once
+/*
+ *  Copyright 2008-2016 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/execution_policy.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/system/detail/generic/memory.h> // for get_value()
+
+namespace thrust {
+namespace detail {
+
+// get_iterator_value specialization on iterators
+// --------------------------------------------------
+// it is okay to dereference iterator in the usual way
+template<typename DerivedPolicy, typename Iterator>
+__host__ __device__
+typename thrust::iterator_traits<Iterator>::value_type
+get_iterator_value(thrust::execution_policy<DerivedPolicy> &, Iterator it)
+{
+  return *it;
+} // get_iterator_value(exec,Iterator);
+
+// get_iterator_value specialization on pointer
+// ----------------------------------------------
+// we can't just dereference a pointer in the usual way, because
+// it may point to a location in the device memory. 
+// we use get_value(exec,pointer*) function
+// to perform a dereferencing consistent with the execution policy
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+typename thrust::detail::pointer_traits<Pointer*>::element_type 
+get_iterator_value(thrust::execution_policy<DerivedPolicy> &exec, Pointer* ptr)
+{
+  return get_value(derived_cast(exec),ptr);
+} // get_iterator_value(exec,Pointer*)
+
+} // namespace detail
+} // namespace thrust
diff --git a/thrust/thrust/detail/host_vector.inl b/thrust/thrust/detail/host_vector.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e424dd1e1e176a5a6875851c16144dc6239196e2
--- /dev/null
+++ b/thrust/thrust/detail/host_vector.inl
@@ -0,0 +1,38 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file host_vector.inl
+ *  \brief Inline file for host_vector.h.
+ */
+
+#include <thrust/host_vector.h>
+
+namespace thrust
+{
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector<T,Alloc>
+      ::host_vector(const device_vector<OtherT,OtherAlloc> &v)
+        :Parent(v)
+{
+  ;
+} // end host_vector::host_vector()
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/inner_product.inl b/thrust/thrust/detail/inner_product.inl
new file mode 100644
index 0000000000000000000000000000000000000000..37247e68ee46db1f25d261fb39c7980562f50720
--- /dev/null
+++ b/thrust/thrust/detail/inner_product.inl
@@ -0,0 +1,108 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file inner_product.inl
+ *  \brief Inline file for inner_product.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/inner_product.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/inner_product.h>
+#include <thrust/system/detail/adl/inner_product.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputType>
+__host__ __device__
+OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init)
+{
+  using thrust::system::detail::generic::inner_product;
+  return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init);
+} // end inner_product()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputType,
+         typename BinaryFunction1,
+         typename BinaryFunction2>
+__host__ __device__
+OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init, 
+                         BinaryFunction1 binary_op1,
+                         BinaryFunction2 binary_op2)
+{
+  using thrust::system::detail::generic::inner_product;
+  return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init, binary_op1, binary_op2);
+} // end inner_product()
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputType>
+OutputType 
+inner_product(InputIterator1 first1, InputIterator1 last1,
+              InputIterator2 first2, OutputType init)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init);
+} // end inner_product()
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputType,
+         typename BinaryFunction1, typename BinaryFunction2>
+OutputType
+inner_product(InputIterator1 first1, InputIterator1 last1,
+              InputIterator2 first2, OutputType init, 
+              BinaryFunction1 binary_op1, BinaryFunction2 binary_op2)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init, binary_op1, binary_op2);
+} // end inner_product()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2495c0b249b65a9be4138e21f1bb11aea8cd0b0
--- /dev/null
+++ b/thrust/thrust/detail/integer_math.h
@@ -0,0 +1,155 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <limits>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/type_deduction.h>
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+Integer clz(Integer x)
+{
+  Integer result;
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      result = ::__clz(x);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      int num_bits = 8 * sizeof(Integer);
+      int num_bits_minus_one = num_bits - 1;
+      result = num_bits;
+      for (int i = num_bits_minus_one; i >= 0; --i)
+      {
+        if ((Integer(1) << i) & x)
+        {
+          result = num_bits_minus_one - i;
+          break;
+        }
+      }
+    #endif
+  }
+  return result;
+}
+
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+bool is_power_of_2(Integer x)
+{
+  return 0 == (x & (x - 1));
+}
+
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+bool is_odd(Integer x)
+{
+  return 1 & x;
+}
+
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+Integer log2(Integer x)
+{
+  Integer num_bits = 8 * sizeof(Integer);
+  Integer num_bits_minus_one = num_bits - 1;
+
+  return num_bits_minus_one - clz(x);
+}
+
+
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+Integer log2_ri(Integer x)
+{
+  Integer result = log2(x);
+
+  // This is where we round up to the nearest log.
+  if (!is_power_of_2(x))
+    ++result;
+
+  return result;
+}
+
+// x/y rounding towards +infinity for integers
+// Used to determine # of blocks/warps etc.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+// FIXME: Should use common_type.
+auto divide_ri(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS((x + (y - 1)) / y)
+#else
+// FIXME: Should use common_type.
+Integer0 divide_ri(Integer0 const x, Integer1 const y)
+{
+  return (x + (y - 1)) / y;
+}
+#endif
+
+// x/y rounding towards zero for integers.
+// Used to determine # of blocks/warps etc.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+auto divide_rz(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(x / y)
+#else
+// FIXME: Should use common_type.
+Integer0 divide_rz(Integer0 const x, Integer1 const y)
+{
+  return x / y;
+}
+#endif
+
+// Round x towards infinity to the next multiple of y.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+auto round_i(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(y * divide_ri(x, y))
+#else
+Integer0 round_i(Integer0 const x, Integer1 const y)
+{
+  return y * divide_ri(x, y);
+}
+#endif
+
+// Round x towards 0 to the next multiple of y.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+auto round_z(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(y * divide_rz(x, y))
+#else
+Integer0 round_z(Integer0 const x, Integer1 const y)
+{
+  return y * divide_rz(x, y);
+}
+#endif
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/integer_traits.h b/thrust/thrust/detail/integer_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..97ab4f94da2272829be05545121e1ec2b186cf46
--- /dev/null
+++ b/thrust/thrust/detail/integer_traits.h
@@ -0,0 +1,132 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <limits>
+#include <limits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename T>
+  class integer_traits
+{
+  public:
+    static const bool is_integral = false;
+};
+
+template<typename T, T min_val, T max_val>
+  class integer_traits_base
+{
+  public:
+    static const bool is_integral = true;
+    static const T const_min = min_val;
+    static const T const_max = max_val;
+};
+
+
+template<>
+  class integer_traits<bool>
+    : public std::numeric_limits<bool>,
+      public integer_traits_base<bool, false, true>
+{};
+
+
+template<>
+  class integer_traits<char>
+    : public std::numeric_limits<char>,
+      public integer_traits_base<char, CHAR_MIN, CHAR_MAX>
+{};
+
+
+template<>
+  class integer_traits<signed char>
+    : public std::numeric_limits<signed char>,
+      public integer_traits_base<signed char, SCHAR_MIN, SCHAR_MAX>
+{};
+
+
+template<>
+  class integer_traits<unsigned char>
+    : public std::numeric_limits<unsigned char>,
+      public integer_traits_base<unsigned char, 0, UCHAR_MAX>
+{};
+
+
+template<>
+  class integer_traits<short>
+    : public std::numeric_limits<short>,
+      public integer_traits_base<short, SHRT_MIN, SHRT_MAX>
+{};
+
+
+template<>
+  class integer_traits<unsigned short>
+    : public std::numeric_limits<unsigned short>,
+      public integer_traits_base<unsigned short, 0, USHRT_MAX>
+{};
+
+
+template<>
+  class integer_traits<int>
+    : public std::numeric_limits<int>,
+      public integer_traits_base<int, INT_MIN, INT_MAX>
+{};
+
+
+template<>
+  class integer_traits<unsigned int>
+    : public std::numeric_limits<unsigned int>,
+      public integer_traits_base<unsigned int, 0, UINT_MAX>
+{};
+
+
+template<>
+  class integer_traits<long>
+    : public std::numeric_limits<long>,
+      public integer_traits_base<long, LONG_MIN, LONG_MAX>
+{};
+
+
+template<>
+  class integer_traits<unsigned long>
+    : public std::numeric_limits<unsigned long>,
+      public integer_traits_base<unsigned long, 0, ULONG_MAX>
+{};
+
+
+template<>
+  class integer_traits<long long>
+    : public std::numeric_limits<long long>,
+      public integer_traits_base<long long, LLONG_MIN, LLONG_MAX>
+{};
+
+
+template<>
+  class integer_traits<unsigned long long>
+    : public std::numeric_limits<unsigned long long>,
+      public integer_traits_base<unsigned long long, 0, ULLONG_MAX>
+{};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/internal_functional.h b/thrust/thrust/detail/internal_functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ae6634b7ddf81fa7fc56b58ac7338b509526fba
--- /dev/null
+++ b/thrust/thrust/detail/internal_functional.h
@@ -0,0 +1,560 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file internal_functional.inl
+ *  \brief Non-public functionals used to implement algorithm internals.
+ */
+
+#pragma once
+
+#include <thrust/tuple.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/tuple_of_iterator_references.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/memory_wrapper.h> // for ::new
+
+namespace thrust
+{
+namespace detail
+{
+
+// unary_negate does not need to know argument_type
+template<typename Predicate>
+struct unary_negate
+{
+  typedef bool result_type;
+  
+  Predicate pred;
+  
+  __host__ __device__
+  explicit unary_negate(const Predicate& pred) : pred(pred) {}
+  
+  template <typename T>
+  __host__ __device__
+  bool operator()(const T& x)
+  {
+    return !bool(pred(x));
+  }
+};
+
+// binary_negate does not need to know first_argument_type or second_argument_type
+template<typename Predicate>
+struct binary_negate
+{
+  typedef bool result_type;
+  
+  Predicate pred;
+  
+  __host__ __device__
+  explicit binary_negate(const Predicate& pred) : pred(pred) {}
+  
+  template <typename T1, typename T2>
+  __host__ __device__
+  bool operator()(const T1& x, const T2& y)
+  {
+    return !bool(pred(x,y));
+  }
+};
+
+template<typename Predicate>
+__host__ __device__
+thrust::detail::unary_negate<Predicate> not1(const Predicate &pred)
+{
+  return thrust::detail::unary_negate<Predicate>(pred);
+}
+
+template<typename Predicate>
+__host__ __device__
+thrust::detail::binary_negate<Predicate> not2(const Predicate &pred)
+{
+  return thrust::detail::binary_negate<Predicate>(pred);
+}
+
+
+// convert a predicate to a 0 or 1 integral value
+template<typename Predicate, typename IntegralType>
+struct predicate_to_integral
+{
+  Predicate pred;
+  
+  __host__ __device__
+  explicit predicate_to_integral(const Predicate& pred) : pred(pred) {}
+  
+  template <typename T>
+  __host__ __device__
+  IntegralType operator()(const T& x)
+  {
+    return pred(x) ? IntegralType(1) : IntegralType(0);
+  }
+};
+
+
+// note that detail::equal_to does not force conversion from T2 -> T1 as equal_to does
+template<typename T1>
+struct equal_to
+{
+  typedef bool result_type;
+  
+  template <typename T2>
+  __host__ __device__
+  bool operator()(const T1& lhs, const T2& rhs) const
+  {
+    return lhs == rhs;
+  }
+};
+
+// note that equal_to_value does not force conversion from T2 -> T1 as equal_to does
+template<typename T2>
+struct equal_to_value
+{
+  T2 rhs;
+  
+  __host__ __device__
+  equal_to_value(const T2& rhs) : rhs(rhs) {}
+  
+  template <typename T1>
+  __host__ __device__
+  bool operator()(const T1& lhs) const
+  {
+    return lhs == rhs;
+  }
+};
+
+template<typename Predicate>
+struct tuple_binary_predicate
+{
+  typedef bool result_type;
+  
+  __host__ __device__
+  tuple_binary_predicate(const Predicate& p) : pred(p) {}
+  
+  template<typename Tuple>
+  __host__ __device__
+  bool operator()(const Tuple& t) const
+  { 
+    return pred(thrust::get<0>(t), thrust::get<1>(t));
+  }
+  
+  mutable Predicate pred;
+};
+
+template<typename Predicate>
+struct tuple_not_binary_predicate
+{
+  typedef bool result_type;
+  
+  __host__ __device__
+  tuple_not_binary_predicate(const Predicate& p) : pred(p) {}
+  
+  template<typename Tuple>
+  __host__ __device__
+  bool operator()(const Tuple& t) const
+  { 
+    return !pred(thrust::get<0>(t), thrust::get<1>(t));
+  }
+  
+  mutable Predicate pred;
+};
+
+template<typename Generator>
+  struct host_generate_functor
+{
+  typedef void result_type;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  host_generate_functor(Generator g)
+    : gen(g) {}
+
+  // operator() does not take an lvalue reference because some iterators
+  // produce temporary proxy references when dereferenced. for example,
+  // consider the temporary tuple of references produced by zip_iterator.
+  // such temporaries cannot bind to an lvalue reference.
+  //
+  // to WAR this, accept a const reference (which is bindable to a temporary),
+  // and const_cast in the implementation.
+  //
+  // XXX change to an rvalue reference upon c++0x (which either a named variable
+  //     or temporary can bind to)
+  template<typename T>
+  __host__
+  void operator()(const T &x)
+  {
+    // we have to be naughty and const_cast this to get it to work
+    T &lvalue = const_cast<T&>(x);
+
+    // this assigns correctly whether x is a true reference or proxy
+    lvalue = gen();
+  }
+
+  Generator gen;
+};
+
+template<typename Generator>
+  struct device_generate_functor
+{
+  typedef void result_type;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  device_generate_functor(Generator g)
+    : gen(g) {}
+
+  // operator() does not take an lvalue reference because some iterators
+  // produce temporary proxy references when dereferenced. for example,
+  // consider the temporary tuple of references produced by zip_iterator.
+  // such temporaries cannot bind to an lvalue reference.
+  //
+  // to WAR this, accept a const reference (which is bindable to a temporary),
+  // and const_cast in the implementation.
+  //
+  // XXX change to an rvalue reference upon c++0x (which either a named variable
+  //     or temporary can bind to)
+  template<typename T>
+  __host__ __device__
+  void operator()(const T &x)
+  {
+    // we have to be naughty and const_cast this to get it to work
+    T &lvalue = const_cast<T&>(x);
+
+    // this assigns correctly whether x is a true reference or proxy
+    lvalue = gen();
+  }
+
+  Generator gen;
+};
+
+template<typename System, typename Generator>
+  struct generate_functor
+    : thrust::detail::eval_if<
+        thrust::detail::is_convertible<System, thrust::host_system_tag>::value,
+        thrust::detail::identity_<host_generate_functor<Generator> >,
+        thrust::detail::identity_<device_generate_functor<Generator> >
+      >
+{};
+
+
+template<typename ResultType, typename BinaryFunction>
+  struct zipped_binary_op
+{
+  typedef ResultType result_type;
+
+  __host__ __device__
+  zipped_binary_op(BinaryFunction binary_op)
+    : m_binary_op(binary_op) {}
+
+  template<typename Tuple>
+  __host__ __device__
+  inline result_type operator()(Tuple t)
+  {
+    return m_binary_op(thrust::get<0>(t), thrust::get<1>(t));
+  }
+
+  BinaryFunction m_binary_op;
+};
+
+
+template<typename T>
+  struct is_non_const_reference
+    : thrust::detail::and_<
+        thrust::detail::not_<thrust::detail::is_const<T> >,
+        thrust::detail::or_<thrust::detail::is_reference<T>,
+                            thrust::detail::is_proxy_reference<T> >
+      >
+{};
+
+template<typename T> struct is_tuple_of_iterator_references : thrust::detail::false_type {};
+
+template<typename T1, typename T2, typename T3,
+         typename T4, typename T5, typename T6,
+         typename T7, typename T8, typename T9,
+         typename T10>
+  struct is_tuple_of_iterator_references<
+    thrust::detail::tuple_of_iterator_references<
+      T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
+    >
+  >
+    : thrust::detail::true_type
+{};
+
+// use this enable_if to avoid assigning to temporaries in the transform functors below
+// XXX revisit this problem with c++11 perfect forwarding
+template<typename T>
+  struct enable_if_non_const_reference_or_tuple_of_iterator_references
+    : thrust::detail::enable_if<
+        is_non_const_reference<T>::value || is_tuple_of_iterator_references<T>::value
+      >
+{};
+
+
+template<typename UnaryFunction>
+  struct unary_transform_functor
+{
+  typedef void result_type;
+
+  UnaryFunction f;
+
+  __host__ __device__
+  unary_transform_functor(UnaryFunction f)
+    : f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template<typename Tuple>
+  inline __host__ __device__
+  typename enable_if_non_const_reference_or_tuple_of_iterator_references<
+    typename thrust::tuple_element<1,Tuple>::type
+  >::type
+    operator()(Tuple t)
+  {
+    thrust::get<1>(t) = f(thrust::get<0>(t));
+  }
+};
+
+
+template<typename BinaryFunction>
+  struct binary_transform_functor
+{
+  BinaryFunction f;
+
+  __host__ __device__
+  binary_transform_functor(BinaryFunction f)
+    : f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template<typename Tuple>
+  inline __host__ __device__
+  typename enable_if_non_const_reference_or_tuple_of_iterator_references<
+    typename thrust::tuple_element<2,Tuple>::type
+  >::type
+    operator()(Tuple t)
+  {
+    thrust::get<2>(t) = f(thrust::get<0>(t), thrust::get<1>(t));
+  }
+};
+
+
+template<typename UnaryFunction, typename Predicate>
+struct unary_transform_if_functor
+{
+  UnaryFunction unary_op;
+  Predicate pred;
+
+  __host__ __device__
+  unary_transform_if_functor(UnaryFunction unary_op, Predicate pred)
+    : unary_op(unary_op), pred(pred)
+  {}
+
+  __thrust_exec_check_disable__
+  template<typename Tuple>
+  inline __host__ __device__
+  typename enable_if_non_const_reference_or_tuple_of_iterator_references<
+    typename thrust::tuple_element<1,Tuple>::type
+  >::type
+    operator()(Tuple t)
+  {
+    if(pred(thrust::get<0>(t)))
+    {
+      thrust::get<1>(t) = unary_op(thrust::get<0>(t));
+    }
+  }
+}; // end unary_transform_if_functor
+
+
+template<typename UnaryFunction, typename Predicate>
+struct unary_transform_if_with_stencil_functor
+{
+  UnaryFunction unary_op;
+  Predicate pred;
+
+  __host__ __device__
+  unary_transform_if_with_stencil_functor(UnaryFunction unary_op, Predicate pred)
+    : unary_op(unary_op), pred(pred)
+  {}
+
+  __thrust_exec_check_disable__
+  template<typename Tuple>
+  inline __host__ __device__
+  typename enable_if_non_const_reference_or_tuple_of_iterator_references<
+    typename thrust::tuple_element<2,Tuple>::type
+  >::type
+    operator()(Tuple t)
+  {
+    if(pred(thrust::get<1>(t)))
+      thrust::get<2>(t) = unary_op(thrust::get<0>(t));
+  }
+}; // end unary_transform_if_with_stencil_functor
+
+
+template<typename BinaryFunction, typename Predicate>
+struct binary_transform_if_functor
+{
+  BinaryFunction binary_op;
+  Predicate pred;
+
+  __host__ __device__
+  binary_transform_if_functor(BinaryFunction binary_op, Predicate pred)
+    : binary_op(binary_op), pred(pred) {} 
+
+  __thrust_exec_check_disable__
+  template<typename Tuple>
+  inline __host__ __device__
+  typename enable_if_non_const_reference_or_tuple_of_iterator_references<
+    typename thrust::tuple_element<3,Tuple>::type
+  >::type
+    operator()(Tuple t)
+  {
+    if(pred(thrust::get<2>(t)))
+      thrust::get<3>(t) = binary_op(thrust::get<0>(t), thrust::get<1>(t));
+  }
+}; // end binary_transform_if_functor
+
+
+template<typename T>
+  struct host_destroy_functor
+{
+  __host__
+  void operator()(T &x) const
+  {
+    x.~T();
+  } // end operator()()
+}; // end host_destroy_functor
+
+
+template<typename T>
+  struct device_destroy_functor
+{
+  // add __host__ to allow the omp backend to compile with nvcc
+  __host__ __device__
+  void operator()(T &x) const
+  {
+    x.~T();
+  } // end operator()()
+}; // end device_destroy_functor
+
+
+template<typename System, typename T>
+  struct destroy_functor
+    : thrust::detail::eval_if<
+        thrust::detail::is_convertible<System, thrust::host_system_tag>::value,
+        thrust::detail::identity_<host_destroy_functor<T> >,
+        thrust::detail::identity_<device_destroy_functor<T> >
+      >
+{};
+
+
+template <typename T>
+struct fill_functor
+{
+  T exemplar;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  fill_functor(const T& _exemplar) 
+    : exemplar(_exemplar) {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  fill_functor(const fill_functor & other)
+    :exemplar(other.exemplar){}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~fill_functor() {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  T operator()(void) const
+  {
+    return exemplar;
+  }
+};
+
+
+template<typename T>
+  struct uninitialized_fill_functor
+{
+  T exemplar;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  uninitialized_fill_functor(const T & x):exemplar(x){}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  uninitialized_fill_functor(const uninitialized_fill_functor & other)
+    :exemplar(other.exemplar){}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~uninitialized_fill_functor() {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void operator()(T &x)
+  {
+    ::new(static_cast<void*>(&x)) T(exemplar);
+  } // end operator()()
+}; // end uninitialized_fill_functor
+
+
+// this predicate tests two two-element tuples
+// we first use a Compare for the first element
+// if the first elements are equivalent, we use
+// < for the second elements
+template<typename Compare>
+  struct compare_first_less_second
+{
+  compare_first_less_second(Compare c)
+    : comp(c) {}
+
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(T1 lhs, T2 rhs)
+  {
+    return comp(thrust::get<0>(lhs), thrust::get<0>(rhs)) || (!comp(thrust::get<0>(rhs), thrust::get<0>(lhs)) && thrust::get<1>(lhs) < thrust::get<1>(rhs));
+  }
+
+  Compare comp;
+}; // end compare_first_less_second
+
+
+template<typename Compare>
+  struct compare_first
+{
+  Compare comp;
+
+  __host__ __device__
+  compare_first(Compare comp)
+    : comp(comp)
+  {}
+
+  template<typename Tuple1, typename Tuple2>
+  __host__ __device__
+  bool operator()(const Tuple1 &x, const Tuple2 &y)
+  {
+    return comp(thrust::raw_reference_cast(thrust::get<0>(x)), thrust::raw_reference_cast(thrust::get<0>(y)));
+  }
+}; // end compare_first
+
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/logical.inl b/thrust/thrust/detail/logical.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2f428bc5ff86c2d13a00322df4859f42ba28e649
--- /dev/null
+++ b/thrust/thrust/detail/logical.inl
@@ -0,0 +1,102 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file logical.inl
+ *  \brief Inline file for logical.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/logical.h>
+#include <thrust/system/detail/adl/logical.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool all_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::all_of;
+  return all_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end all_of()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool any_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::any_of;
+  return any_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end any_of()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool none_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::none_of;
+  return none_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end none_of()
+
+
+template<typename InputIterator, typename Predicate>
+bool all_of(InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::all_of(select_system(system), first, last, pred);
+}
+
+
+template<typename InputIterator, typename Predicate>
+bool any_of(InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::any_of(select_system(system), first, last, pred);
+}
+
+
+template<typename InputIterator, typename Predicate>
+bool none_of(InputIterator first, InputIterator last, Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::none_of(select_system(system), first, last, pred);
+}
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/malloc_and_free.h b/thrust/thrust/detail/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dc238adb49311d0a0e6187ba65108183d5599a4
--- /dev/null
+++ b/thrust/thrust/detail/malloc_and_free.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/detail/generic/memory.h>
+#include <thrust/system/detail/adl/malloc_and_free.h>
+
+namespace thrust
+{
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy>
+__host__ __device__
+pointer<void,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, std::size_t n)
+{
+  using thrust::system::detail::generic::malloc;
+
+  // XXX should use a hypothetical thrust::static_pointer_cast here
+  void *raw_ptr = static_cast<void*>(thrust::raw_pointer_cast(malloc(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n)));
+
+  return pointer<void,DerivedPolicy>(raw_ptr);
+}
+
+__thrust_exec_check_disable__
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, std::size_t n)
+{
+  using thrust::system::detail::generic::malloc;
+
+  T *raw_ptr = static_cast<T*>(thrust::raw_pointer_cast(malloc<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n)));
+
+  return pointer<T,DerivedPolicy>(raw_ptr);
+}
+
+
+// XXX WAR nvbug 992955
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#if CUDART_VERSION < 5000
+
+// cudafe generates unqualified calls to free(int *volatile)
+// which get confused with thrust::free
+// spoof a thrust::free which simply maps to ::free
+inline __host__ __device__
+void free(int *volatile ptr)
+{
+  ::free(ptr);
+}
+
+#endif // CUDART_VERSION
+#endif // THRUST_DEVICE_COMPILER
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void free(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer ptr)
+{
+  using thrust::system::detail::generic::free;
+
+  free(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), ptr);
+}
+
+// XXX consider another form of free which does not take a system argument and
+// instead infers the system from the pointer
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/memory_algorithms.h b/thrust/thrust/detail/memory_algorithms.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffa25aff8b564218dd43d1c8ac82b8b7d5962e10
--- /dev/null
+++ b/thrust/thrust/detail/memory_algorithms.h
@@ -0,0 +1,210 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+// TODO: These need to be turned into proper Thrust algorithms (dispatch layer,
+// backends, etc).
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/addressof.h>
+
+#include <utility>
+#include <new>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+__host__ __device__
+void destroy_at(T* location)
+{
+  location->~T();
+}
+
+template <typename Allocator, typename T>
+__host__ __device__
+void destroy_at(Allocator const& alloc, T* location)
+{
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  traits::destroy(alloc_T, location);
+}
+
+template <typename ForwardIt>
+__host__ __device__
+ForwardIt destroy(ForwardIt first, ForwardIt last)
+{
+  for (; first != last; ++first)
+    destroy_at(addressof(*first));
+
+  return first;
+}
+
+template <typename Allocator, typename ForwardIt>
+__host__ __device__
+ForwardIt destroy(Allocator const& alloc, ForwardIt first, ForwardIt last)
+{
+  typedef typename iterator_traits<ForwardIt>::value_type T;
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  for (; first != last; ++first)
+    destroy_at(alloc_T, addressof(*first));
+
+  return first;
+}
+
+template <typename ForwardIt, typename Size>
+__host__ __device__
+ForwardIt destroy_n(ForwardIt first, Size n)
+{
+  for (; n > 0; (void) ++first, --n)
+    destroy_at(addressof(*first));
+
+  return first;
+}
+
+template <typename Allocator, typename ForwardIt, typename Size>
+__host__ __device__
+ForwardIt destroy_n(Allocator const& alloc, ForwardIt first, Size n)
+{
+  typedef typename iterator_traits<ForwardIt>::value_type T;
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  for (; n > 0; (void) ++first, --n)
+    destroy_at(alloc_T, addressof(*first));
+
+  return first;
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename ForwardIt, typename... Args>
+__host__ __device__
+void uninitialized_construct(
+  ForwardIt first, ForwardIt last, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  ForwardIt current = first;
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  try {
+  #endif
+    for (; current != last; ++current)
+      ::new (static_cast<void*>(addressof(*current))) T(args...);
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  } catch (...) {
+    destroy(first, current);
+    throw;
+  }
+  #endif
+}
+
+template <typename Allocator, typename ForwardIt, typename... Args>
+void uninitialized_construct_with_allocator(
+  Allocator const& alloc, ForwardIt first, ForwardIt last, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  ForwardIt current = first;
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  try {
+  #endif
+    for (; current != last; ++current)
+      traits::construct(alloc_T, addressof(*current), args...);
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  } catch (...) {
+    destroy(alloc_T, first, current);
+    throw;
+  }
+  #endif
+}
+
+template <typename ForwardIt, typename Size, typename... Args>
+void uninitialized_construct_n(
+  ForwardIt first, Size n, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  ForwardIt current = first;
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  try {
+  #endif
+    for (; n > 0; (void) ++current, --n)
+      ::new (static_cast<void*>(addressof(*current))) T(args...);
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  } catch (...) {
+    destroy(first, current);
+    throw;
+  }
+  #endif
+}
+
+template <typename Allocator, typename ForwardIt, typename Size, typename... Args>
+void uninitialized_construct_n_with_allocator(
+  Allocator const& alloc, ForwardIt first, Size n, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  ForwardIt current = first;
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  try {
+  #endif
+    for (; n > 0; (void) ++current, --n)
+      traits::construct(alloc_T, addressof(*current), args...);
+  #if !__CUDA_ARCH__ // No exceptions in CUDA.
+  } catch (...) {
+    destroy(alloc_T, first, current);
+    throw;
+  }
+  #endif
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/memory_wrapper.h b/thrust/thrust/detail/memory_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfc9056fa15ff6d123659499e5fb9044f937f769
--- /dev/null
+++ b/thrust/thrust/detail/memory_wrapper.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.  (<memory> declares several standard
+// algorithms, including all of the uninitialized_* algorithms.  "_ALGORITHMS_"
+// in the macro name is meant generically, not as a specific reference to
+// the header <algorithms>.)
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <memory>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/thrust/detail/merge.inl b/thrust/thrust/detail/merge.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d42475709785c3c8f475762aa441d777f4959a83
--- /dev/null
+++ b/thrust/thrust/detail/merge.inl
@@ -0,0 +1,225 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file merge.inl
+ *  \brief Inline file for merge.h.
+ */
+
+#include <thrust/merge.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/merge.h>
+#include <thrust/system/detail/adl/merge.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator merge(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result)
+{
+  using thrust::system::detail::generic::merge;
+  return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result);
+} // end merge()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator merge(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result,
+                       StrictWeakCompare comp)
+{
+  using thrust::system::detail::generic::merge;
+  return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp);
+} // end merge()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::merge_by_key;
+  return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end merge_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2, typename Compare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 Compare comp)
+{
+  using thrust::system::detail::generic::merge_by_key;
+  return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end merge_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+  OutputIterator merge(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result,
+                       StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp);
+} // end merge()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator merge(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result);
+} // end merge()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(InputIterator1 keys_first1,
+                 InputIterator1 keys_last1,
+                 InputIterator2 keys_first2,
+                 InputIterator2 keys_last2,
+                 InputIterator3 values_first1,
+                 InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end merge_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(InputIterator1 keys_first1,
+                 InputIterator1 keys_last1,
+                 InputIterator2 keys_first2,
+                 InputIterator2 keys_last2,
+                 InputIterator3 values_first1,
+                 InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end merge_by_key()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/minmax.h b/thrust/thrust/detail/minmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..f59c649629006e606c8b293a2301ab19bff2d7a8
--- /dev/null
+++ b/thrust/thrust/detail/minmax.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+
+template<typename T, typename BinaryPredicate>
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp)
+{
+  return comp(rhs, lhs) ? rhs : lhs;
+} // end min()
+
+template<typename T>
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs)
+{
+  return rhs < lhs ? rhs : lhs;
+} // end min()
+
+template<typename T, typename BinaryPredicate>
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp)
+{
+  return comp(lhs,rhs) ? rhs : lhs;
+} // end max()
+
+template<typename T>
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs)
+{
+  return lhs < rhs ? rhs : lhs;
+} // end max()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/mismatch.inl b/thrust/thrust/detail/mismatch.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6c39aab86b4b82a2113d6bdbe4590ab7ecf9294c
--- /dev/null
+++ b/thrust/thrust/detail/mismatch.inl
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file mismatch.inl
+ *  \brief Inline file for mismatch.h
+ */
+
+
+#include <thrust/detail/config.h>
+#include <thrust/mismatch.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/mismatch.h>
+#include <thrust/system/detail/adl/mismatch.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                                      InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2)
+{
+  using thrust::system::detail::generic::mismatch;
+  return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2);
+} // end mismatch()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                                      InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2,
+                                                      BinaryPredicate pred)
+{
+  using thrust::system::detail::generic::mismatch;
+  return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, pred);
+} // end mismatch()
+
+
+template<typename InputIterator1, typename InputIterator2>
+thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::mismatch(select_system(system1,system2), first1, last1, first2);
+} // end mismatch()
+
+
+template<typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2,
+                                                      BinaryPredicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::mismatch(select_system(system1,system2), first1, last1, first2, pred);
+} // end mismatch()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/modern_gcc_required.h b/thrust/thrust/detail/modern_gcc_required.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8c3d98ba996eec9d6b010dabad65d2261d7e7bc
--- /dev/null
+++ b/thrust/thrust/detail/modern_gcc_required.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#ifndef THRUST_MODERN_GCC_REQUIRED_NO_ERROR
+#  if defined(THRUST_GCC_VERSION) && !defined(THRUST_MODERN_GCC)
+#    error GCC 5 or later is required for this Thrust feature; please upgrade your compiler.
+#  endif
+#endif
+
diff --git a/thrust/thrust/detail/mpl/math.h b/thrust/thrust/detail/mpl/math.h
new file mode 100644
index 0000000000000000000000000000000000000000..5356c9c155159fbdb17967e75bf332739ce8476e
--- /dev/null
+++ b/thrust/thrust/detail/mpl/math.h
@@ -0,0 +1,174 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file math.h
+ *  \brief Math-related metaprogramming functionality.
+ */
+
+
+#pragma once
+
+namespace thrust
+{
+
+namespace detail
+{
+
+namespace mpl
+{
+
+namespace math
+{
+
+namespace detail
+{
+
+// compute the log base-2 of an integer at compile time
+template <unsigned int N, unsigned int Cur>
+struct log2
+{
+    static const unsigned int value = log2<N / 2,Cur+1>::value;
+};
+
+template <unsigned int Cur>
+struct log2<1, Cur>
+{
+    static const unsigned int value = Cur;
+};
+
+template <unsigned int Cur>
+struct log2<0, Cur>
+{
+    // undefined
+};
+
+} // end namespace detail
+
+
+template <unsigned int N>
+struct log2
+{
+    static const unsigned int value = detail::log2<N,0>::value;
+};
+
+
+template <typename T, T lhs, T rhs>
+struct min
+{
+  static const T value = (lhs < rhs) ? lhs : rhs;
+};
+
+
+template <typename T, T lhs, T rhs>
+struct max
+{
+  static const T value = (!(lhs < rhs)) ? lhs : rhs;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct mul
+{
+  static const result_type value = x * y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct mod
+{
+  static const result_type value = x % y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct div
+{
+  static const result_type value = x / y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct geq
+{
+  static const bool value = x >= y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct lt
+{
+  static const bool value = x < y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct gt
+{
+  static const bool value = x > y;
+};
+
+
+template<bool x, bool y>
+  struct or_
+{
+  static const bool value = (x || y);
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct bit_and
+{
+  static const result_type value = x & y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct plus
+{
+  static const result_type value = x + y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct minus
+{
+  static const result_type value = x - y;
+};
+
+
+template<typename result_type, result_type x, result_type y>
+  struct equal
+{
+  static const bool value = x == y;
+};
+
+
+template<typename result_type, result_type x>
+  struct is_odd
+{
+  static const bool value = x & 1;
+};
+
+
+} // end namespace math
+
+} // end namespace mpl
+
+} // end namespace detail
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/numeric_traits.h b/thrust/thrust/detail/numeric_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..168b9ad0f4b63657845915ba1718737773be687a
--- /dev/null
+++ b/thrust/thrust/detail/numeric_traits.h
@@ -0,0 +1,130 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+#include <limits>
+
+//#include <stdint.h> // for intmax_t (not provided on MSVS 2005)
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// XXX good enough for the platforms we care about
+typedef long long intmax_t;
+
+template<typename Number>
+  struct is_signed
+    : integral_constant<bool, std::numeric_limits<Number>::is_signed>
+{}; // end is_signed
+
+
+template<typename T>
+  struct num_digits
+    : eval_if<
+        std::numeric_limits<T>::is_specialized,
+        integral_constant<
+          int,
+          std::numeric_limits<T>::digits
+        >,
+        integral_constant<
+          int,
+          sizeof(T) * std::numeric_limits<unsigned char>::digits - (is_signed<T>::value ? 1 : 0)  
+        >
+      >::type
+{}; // end num_digits
+
+
+template<typename Integer>
+  struct integer_difference
+    //: eval_if<
+    //    sizeof(Integer) >= sizeof(intmax_t),
+    //    eval_if<
+    //      is_signed<Integer>::value,
+    //      identity_<Integer>,
+    //      identity_<intmax_t>
+    //    >,
+    //    eval_if<
+    //      sizeof(Integer) < sizeof(std::ptrdiff_t),
+    //      identity_<std::ptrdiff_t>,
+    //      identity_<intmax_t>
+    //    >
+    //  >
+{
+  private:
+    // XXX workaround a pedantic warning in old versions of g++
+    //     which complains about &&ing with a constant value
+    template<bool x, bool y>
+      struct and_
+    {
+      static const bool value = false;
+    };
+
+    template<bool y>
+      struct and_<true,y>
+    {
+      static const bool value = y;
+    };
+
+  public:
+    typedef typename
+      eval_if<
+        and_<
+          std::numeric_limits<Integer>::is_signed,
+          // digits is the number of no-sign bits
+          (!std::numeric_limits<Integer>::is_bounded || (int(std::numeric_limits<Integer>::digits) + 1 >= num_digits<intmax_t>::value))
+        >::value,
+        identity_<Integer>,
+        eval_if<
+          int(std::numeric_limits<Integer>::digits) + 1 < num_digits<signed int>::value,
+          identity_<signed int>,
+          eval_if<
+            int(std::numeric_limits<Integer>::digits) + 1 < num_digits<signed long>::value,
+            identity_<signed long>,
+            identity_<intmax_t>
+          >
+        >
+      >::type type;
+}; // end integer_difference
+
+
+template<typename Number>
+  struct numeric_difference
+    : eval_if<
+      is_integral<Number>::value,
+      integer_difference<Number>,
+      identity_<Number>
+    >
+{}; // end numeric_difference
+
+
+template<typename Number>
+__host__ __device__
+typename numeric_difference<Number>::type
+numeric_distance(Number x, Number y)
+{
+  typedef typename numeric_difference<Number>::type difference_type;
+  return difference_type(y) - difference_type(x);
+} // end numeric_distance
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/overlapped_copy.h b/thrust/thrust/detail/overlapped_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6bb85a91129bfbca84721ab5c4943b3a2034698
--- /dev/null
+++ b/thrust/thrust/detail/overlapped_copy.h
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/detail/copy.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator sequential_copy(InputIterator first,
+                                 InputIterator last,
+                                 OutputIterator result)
+{
+  for(; first != last; ++first, ++result)
+  {
+    *result = *first;
+  } // end for
+
+  return result;
+} // end sequential_copy()
+
+
+template<typename BidirectionalIterator1,
+         typename BidirectionalIterator2>
+  BidirectionalIterator2 sequential_copy_backward(BidirectionalIterator1 first,
+                                                  BidirectionalIterator1 last,
+                                                  BidirectionalIterator2 result)
+{
+  // yes, we preincrement
+  // the ranges are open on the right, i.e. [first, last)
+  while(first != last)
+  {
+    *--result = *--last;
+  } // end while
+
+  return result;
+} // end sequential_copy_backward()
+
+
+namespace dispatch
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  RandomAccessIterator2 overlapped_copy(thrust::system::cpp::detail::execution_policy<DerivedPolicy> &,
+                                        RandomAccessIterator1 first,
+                                        RandomAccessIterator1 last,
+                                        RandomAccessIterator2 result)
+{
+  if(first < last && first <= result && result < last)
+  {
+    // result lies in [first, last)
+    // it's safe to use std::copy_backward here
+    thrust::detail::sequential_copy_backward(first, last, result + (last - first));
+    result += (last - first);
+  } // end if
+  else
+  {
+    // result + (last - first) lies in [first, last)
+    // it's safe to use sequential_copy here
+    result = thrust::detail::sequential_copy(first, last, result);
+  } // end else
+
+  return result;
+} // end overlapped_copy()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  RandomAccessIterator2 overlapped_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                                        RandomAccessIterator1 first,
+                                        RandomAccessIterator1 last,
+                                        RandomAccessIterator2 result)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
+
+  // make a temporary copy of [first,last), and copy into it first
+  thrust::detail::temporary_array<value_type, DerivedPolicy> temp(exec, first, last);
+  return thrust::copy(exec, temp.begin(), temp.end(), result);
+} // end overlapped_copy()
+
+} // end dispatch
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  RandomAccessIterator2 overlapped_copy(RandomAccessIterator1 first,
+                                        RandomAccessIterator1 last,
+                                        RandomAccessIterator2 result)
+{
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System2;
+
+  typedef typename thrust::detail::minimum_system<System1, System2>::type System;
+
+  // XXX presumes System is default constructible
+  System system;
+
+  return thrust::detail::dispatch::overlapped_copy(system, first, last, result);
+} // end overlapped_copy()
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/pair.inl b/thrust/thrust/detail/pair.inl
new file mode 100644
index 0000000000000000000000000000000000000000..426668b992112d9d4258cab1ded986afbc426987
--- /dev/null
+++ b/thrust/thrust/detail/pair.inl
@@ -0,0 +1,229 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/pair.h>
+#include <thrust/detail/swap.h>
+
+namespace thrust
+{
+
+template <typename T1, typename T2>
+  __host__ __device__
+  pair<T1,T2>
+    ::pair(void)
+      :first(),second()
+{
+  ;
+} // end pair::pair()
+
+
+template <typename T1, typename T2>
+  __host__ __device__
+  pair<T1,T2>
+    ::pair(const T1 &x, const T2 &y)
+      :first(x),second(y)
+{
+  ;
+} // end pair::pair()
+
+
+template <typename T1, typename T2>
+  template <typename U1, typename U2>
+    __host__ __device__
+    pair<T1,T2>
+      ::pair(const pair<U1,U2> &p)
+        :first(p.first),second(p.second)
+{
+  ;
+} // end pair::pair()
+
+
+template <typename T1, typename T2>
+  template <typename U1, typename U2>
+    __host__ __device__
+    pair<T1,T2>
+      ::pair(const std::pair<U1,U2> &p)
+        :first(p.first),second(p.second)
+{
+  ;
+} // end pair::pair()
+
+
+template<typename T1, typename T2>
+  inline __host__ __device__
+    void pair<T1,T2>
+      ::swap(thrust::pair<T1,T2> &p)
+{
+  using thrust::swap;
+
+  swap(first, p.first);
+  swap(second, p.second);
+} // end pair::swap()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator==(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return x.first == y.first && x.second == y.second;
+} // end operator==()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator<(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+} // end operator<()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator!=(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return !(x == y);
+} // end operator==()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator>(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return y < x;
+} // end operator<()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator<=(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return !(y < x);
+} // end operator<=()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator>=(const pair<T1,T2> &x, const pair<T1,T2> &y)
+{
+  return !(x < y);
+} // end operator>=()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    void swap(pair<T1,T2> &x, pair<T1,T2> &y)
+{
+  return x.swap(y);
+} // end swap()
+
+
+template <typename T1, typename T2>
+  inline __host__ __device__
+    pair<T1,T2> make_pair(T1 x, T2 y)
+{
+  return pair<T1,T2>(x,y);
+} // end make_pair()
+
+
+// specializations of tuple_element for pair
+template<typename T1, typename T2>
+  struct tuple_element<0, pair<T1,T2> >
+{
+  typedef T1 type;
+}; // end tuple_element
+
+template<typename T1, typename T2>
+  struct tuple_element<1, pair<T1,T2> >
+{
+  typedef T2 type;
+}; // end tuple_element
+
+
+// specialization of tuple_size for pair
+template<typename T1, typename T2>
+  struct tuple_size< pair<T1,T2 > >
+{
+  static const unsigned int value = 2;
+}; // end tuple_size
+
+
+
+namespace detail
+{
+
+
+template<int N, typename Pair> struct pair_get {};
+
+template<typename Pair>
+  struct pair_get<0, Pair>
+{
+  inline __host__ __device__
+    const typename tuple_element<0, Pair>::type &
+      operator()(const Pair &p) const
+  {
+    return p.first;
+  } // end operator()()
+
+  inline __host__ __device__
+    typename tuple_element<0, Pair>::type &
+      operator()(Pair &p) const
+  {
+    return p.first;
+  } // end operator()()
+}; // end pair_get
+
+
+template<typename Pair>
+  struct pair_get<1, Pair>
+{
+  inline __host__ __device__
+    const typename tuple_element<1, Pair>::type &
+      operator()(const Pair &p) const
+  {
+    return p.second;
+  } // end operator()()
+
+  inline __host__ __device__
+    typename tuple_element<1, Pair>::type &
+      operator()(Pair &p) const
+  {
+    return p.second;
+  } // end operator()()
+}; // end pair_get
+
+} // end detail
+
+
+
+template<unsigned int N, typename T1, typename T2>
+  inline __host__ __device__
+    typename tuple_element<N, pair<T1,T2> >::type &
+      get(pair<T1,T2> &p)
+{
+  return detail::pair_get<N, pair<T1,T2> >()(p);
+} // end get()
+
+template<unsigned int N, typename T1, typename T2>
+  inline __host__ __device__
+    const typename tuple_element<N, pair<T1,T2> >::type &
+      get(const pair<T1,T2> &p)
+{
+  return detail::pair_get<N, pair<T1,T2> >()(p);
+} // end get()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/partition.inl b/thrust/thrust/detail/partition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..a667264c636df9edcc721c09c8834a0a6edd64e4
--- /dev/null
+++ b/thrust/thrust/detail/partition.inl
@@ -0,0 +1,418 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file partition.inl
+ *  \brief Inline file for partition.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/partition.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/partition.h>
+#include <thrust/system/detail/adl/partition.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::partition;
+  return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end partition()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::partition;
+  return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred);
+} // end partition()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  using thrust::system::detail::generic::partition_copy;
+  return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, out_true, out_false, pred);
+} // end partition_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  using thrust::system::detail::generic::partition_copy;
+  return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred);
+} // end partition_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  using thrust::system::detail::generic::stable_partition;
+  return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end stable_partition()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  using thrust::system::detail::generic::stable_partition;
+  return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred);
+} // end stable_partition()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::stable_partition_copy;
+  return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::stable_partition_copy;
+  return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate>
+__host__ __device__
+  ForwardIterator partition_point(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred)
+{
+  using thrust::system::detail::generic::partition_point;
+  return partition_point(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end partition_point()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+  bool is_partitioned(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  using thrust::system::detail::generic::is_partitioned;
+  return is_partitioned(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end is_partitioned()
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator partition(ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::partition(select_system(system), first, last, pred);
+} // end partition()
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator partition(ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+  typedef typename thrust::iterator_system<InputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::partition(select_system(system1,system2), first, last, stencil, pred);
+} // end partition()
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::stable_partition(select_system(system), first, last, pred);
+} // end stable_partition()
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+  typedef typename thrust::iterator_system<InputIterator>::type   System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::stable_partition(select_system(system1,system2), first, last, stencil, pred);
+} // end stable_partition()
+
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type   System1;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred);
+} // end partition_copy()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator1>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred);
+} // end partition_copy()
+
+
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type   System1;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::stable_partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type   System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type   System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type  System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type  System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::stable_partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+template<typename ForwardIterator, typename Predicate>
+  ForwardIterator partition_point(ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::partition_point(select_system(system), first, last, pred);
+} // end partition_point()
+
+
+template<typename InputIterator, typename Predicate>
+  bool is_partitioned(InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::is_partitioned(select_system(system), first, last, pred);
+} // end is_partitioned()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/pointer.h b/thrust/thrust/detail/pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9204978f5d5990476698917842a1d77b779b5ba
--- /dev/null
+++ b/thrust/thrust/detail/pointer.h
@@ -0,0 +1,253 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/reference_forward_declaration.h>
+#include <ostream>
+
+
+namespace thrust
+{
+
+// declare pointer with default values of template parameters
+template<typename Element, typename Tag, typename Reference = use_default, typename Derived = use_default> class pointer;
+
+} // end thrust
+
+
+// specialize thrust::iterator_traits to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type
+// do this before pointer is defined so the specialization is correctly
+// used inside the definition
+namespace thrust
+{
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  struct iterator_traits<thrust::pointer<Element,Tag,Reference,Derived> >
+{
+  private:
+    typedef thrust::pointer<Element,Tag,Reference,Derived> ptr;
+
+  public:
+    typedef typename ptr::iterator_category iterator_category;
+    typedef typename ptr::value_type        value_type;
+    typedef typename ptr::difference_type   difference_type;
+    // XXX implement this type (the result of operator->) later
+    typedef void                             pointer;
+    typedef typename ptr::reference         reference;
+}; // end iterator_traits
+
+} // end thrust
+
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// this metafunction computes the type of iterator_adaptor thrust::pointer should inherit from
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  struct pointer_base
+{
+  // void pointers should have no element type
+  // note that we remove_cv from the Element type to get the value_type
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::identity_<void>,
+    thrust::detail::remove_cv<Element>
+  >::type value_type;
+
+  // if no Derived type is given, just use pointer
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_same<Derived,use_default>::value,
+    thrust::detail::identity_<pointer<Element,Tag,Reference,Derived> >,
+    thrust::detail::identity_<Derived>
+  >::type derived_type;
+
+  // void pointers should have no reference type
+  // if no Reference type is given, just use reference
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::identity_<void>,
+    thrust::detail::eval_if<
+      thrust::detail::is_same<Reference,use_default>::value,
+      thrust::detail::identity_<reference<Element,derived_type> >,
+      thrust::detail::identity_<Reference>
+    >
+  >::type reference_arg;
+
+  typedef thrust::iterator_adaptor<
+    derived_type,                        // pass along the type of our Derived class to iterator_adaptor
+    Element *,                           // we adapt a raw pointer
+    value_type,                          // the value type
+    Tag,                                 // system tag
+    thrust::random_access_traversal_tag, // pointers have random access traversal
+    reference_arg,                       // pass along our Reference type
+    std::ptrdiff_t
+  > type;
+}; // end pointer_base
+
+
+} // end detail
+
+
+// the base type for all of thrust's tagged pointers.
+// for reasonable pointer-like semantics, derived types should reimplement the following:
+// 1. no-argument constructor
+// 2. constructor from OtherElement *
+// 3. constructor from OtherPointer related by convertibility
+// 4. constructor from OtherPointer to void
+// 5. assignment from OtherPointer related by convertibility
+// These should just call the corresponding members of pointer.
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  class pointer
+    : public thrust::detail::pointer_base<Element,Tag,Reference,Derived>::type
+{
+  private:
+    typedef typename thrust::detail::pointer_base<Element,Tag,Reference,Derived>::type         super_t;
+
+    typedef typename thrust::detail::pointer_base<Element,Tag,Reference,Derived>::derived_type derived_type;
+
+    // friend iterator_core_access to give it access to dereference
+    friend class thrust::iterator_core_access;
+
+    __host__ __device__
+    typename super_t::reference dereference() const;
+
+    // don't provide access to this part of super_t's interface
+    using super_t::base;
+    using typename super_t::base_type;
+
+  public:
+    typedef typename super_t::base_type raw_pointer;
+
+    // constructors
+
+    __host__ __device__
+    pointer();
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer(decltype(nullptr));
+    #endif
+
+    // OtherValue shall be convertible to Value
+    // XXX consider making the pointer implementation a template parameter which defaults to Element *
+    template<typename OtherElement>
+    __host__ __device__
+    explicit pointer(OtherElement *ptr);
+
+    // OtherPointer's element_type shall be convertible to Element
+    // OtherPointer's system shall be convertible to Tag
+    template<typename OtherPointer>
+    __host__ __device__
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer<Element,Tag,Reference,Derived>
+            >::type * = 0);
+
+    // OtherPointer's element_type shall be void
+    // OtherPointer's system shall be convertible to Tag
+    template<typename OtherPointer>
+    __host__ __device__
+    explicit
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer<Element,Tag,Reference,Derived>
+            >::type * = 0);
+
+    // assignment
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    derived_type& operator=(decltype(nullptr));
+    #endif
+
+    // OtherPointer's element_type shall be convertible to Element
+    // OtherPointer's system shall be convertible to Tag
+    template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer,
+      derived_type &
+    >::type
+    operator=(const OtherPointer &other);
+
+    // observers
+
+    __host__ __device__
+    Element *get() const;
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    explicit operator bool() const;
+    #endif
+
+    __host__ __device__
+    static derived_type pointer_to(typename thrust::detail::pointer_traits_detail::pointer_to_param<Element>::type r)
+    {
+      return thrust::detail::pointer_traits<derived_type>::pointer_to(r);
+    }
+}; // end pointer
+
+// Output stream operator
+template<typename Element, typename Tag, typename Reference, typename Derived,
+         typename charT, typename traits>
+__host__
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os,
+           const pointer<Element, Tag, Reference, Derived> &p);
+
+#if THRUST_CPP_DIALECT >= 2011
+// NOTE: This is needed so that Thrust smart pointers can be used in
+// `std::unique_ptr`.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p);
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr));
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p);
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr));
+#endif
+
+} // end thrust
+
+#include <thrust/detail/pointer.inl>
+
diff --git a/thrust/thrust/detail/pointer.inl b/thrust/thrust/detail/pointer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..464c3579eda2363cfd41d2fe38a19cb9f03d38c5
--- /dev/null
+++ b/thrust/thrust/detail/pointer.inl
@@ -0,0 +1,273 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/type_traits.h>
+
+
+namespace thrust
+{
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  pointer<Element,Tag,Reference,Derived>
+    ::pointer()
+      : super_t(static_cast<Element*>(
+          #if THRUST_CPP_DIALECT >= 2011
+          nullptr
+          #else
+          0
+          #endif
+        ))
+{} // end pointer::pointer
+
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  pointer<Element,Tag,Reference,Derived>
+    ::pointer(decltype(nullptr))
+      : super_t(static_cast<Element*>(nullptr))
+{} // end pointer::pointer
+#endif
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  template<typename OtherElement>
+    __host__ __device__
+    pointer<Element,Tag,Reference,Derived>
+      ::pointer(OtherElement *other)
+        : super_t(other)
+{} // end pointer::pointer
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  template<typename OtherPointer>
+    __host__ __device__
+    pointer<Element,Tag,Reference,Derived>
+      ::pointer(const OtherPointer &other,
+                typename thrust::detail::enable_if_pointer_is_convertible<
+                  OtherPointer,
+                  pointer<Element,Tag,Reference,Derived>
+                 >::type *)
+        : super_t(thrust::detail::pointer_traits<OtherPointer>::get(other))
+{} // end pointer::pointer
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  template<typename OtherPointer>
+    __host__ __device__
+    pointer<Element,Tag,Reference,Derived>
+      ::pointer(const OtherPointer &other,
+                typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+                  OtherPointer,
+                  pointer<Element,Tag,Reference,Derived>
+                 >::type *)
+        : super_t(static_cast<Element *>(thrust::detail::pointer_traits<OtherPointer>::get(other)))
+{} // end pointer::pointer
+
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  typename pointer<Element,Tag,Reference,Derived>::derived_type &
+    pointer<Element,Tag,Reference,Derived>
+      ::operator=(decltype(nullptr))
+{
+  super_t::base_reference() = nullptr;
+  return static_cast<derived_type&>(*this);
+} // end pointer::operator=
+#endif
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer<Element,Tag,Reference,Derived>,
+      typename pointer<Element,Tag,Reference,Derived>::derived_type &
+    >::type
+      pointer<Element,Tag,Reference,Derived>
+        ::operator=(const OtherPointer &other)
+{
+  super_t::base_reference() = thrust::detail::pointer_traits<OtherPointer>::get(other);
+  return static_cast<derived_type&>(*this);
+} // end pointer::operator=
+
+namespace detail
+{
+
+// Implementation for dereference() when Reference is Element&,
+// e.g. cuda's managed_memory_pointer
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::true_type /* is_cpp_ref */)
+{
+  return *ptr.get();
+}
+
+// Implementation for pointers with proxy references:
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::false_type /* is_cpp_ref */)
+{
+  return Reference(ptr);
+}
+
+} // namespace detail
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  typename pointer<Element,Tag,Reference,Derived>::super_t::reference
+  pointer<Element,Tag,Reference,Derived>
+    ::dereference() const
+{
+  // Need to handle cpp refs and fancy refs differently:
+  typedef typename super_t::reference RefT;
+  typedef typename thrust::detail::is_reference<RefT>::type IsCppRef;
+
+  const derived_type& derivedPtr = static_cast<const derived_type&>(*this);
+
+  return detail::pointer_dereference_impl<RefT>(derivedPtr, IsCppRef());
+} // end pointer::dereference
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  Element *pointer<Element,Tag,Reference,Derived>
+    ::get() const
+{
+  return super_t::base();
+} // end pointer::get
+
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  pointer<Element,Tag,Reference,Derived>
+    ::operator bool() const
+{
+  return bool(get());
+} // end pointer::operator bool
+#endif
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived,
+         typename charT, typename traits>
+__host__
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os,
+           const pointer<Element, Tag, Reference, Derived> &p) {
+  return os << p.get();
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+// NOTE: These are needed so that Thrust smart pointers work with
+// `std::unique_ptr`.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p)
+{
+  return nullptr == p.get();
+}
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr))
+{
+  return nullptr == p.get();
+}
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p)
+{
+  return !(nullptr == p);
+}
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr))
+{
+  return !(nullptr == p);
+}
+#endif
+
+namespace detail
+{
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+// XXX WAR MSVC 2005 problem with correctly implementing
+//     pointer_raw_pointer for pointer by specializing it here
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  struct pointer_raw_pointer< thrust::pointer<Element,Tag,Reference,Derived> >
+{
+  typedef typename pointer<Element,Tag,Reference,Derived>::raw_pointer type;
+}; // end pointer_raw_pointer
+#endif
+
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40200)
+// XXX WAR g++-4.1 problem with correctly implementing
+//     pointer_element for pointer by specializing it here
+template<typename Element, typename Tag>
+  struct pointer_element< thrust::pointer<Element,Tag> >
+{
+  typedef Element type;
+}; // end pointer_element
+
+template<typename Element, typename Tag, typename Reference>
+  struct pointer_element< thrust::pointer<Element,Tag,Reference> >
+    : pointer_element< thrust::pointer<Element,Tag> >
+{}; // end pointer_element
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  struct pointer_element< thrust::pointer<Element,Tag,Reference,Derived> >
+    : pointer_element< thrust::pointer<Element,Tag,Reference> >
+{}; // end pointer_element
+
+
+
+// XXX WAR g++-4.1 problem with correctly implementing
+//     rebind_pointer for pointer by specializing it here
+template<typename Element, typename Tag, typename NewElement>
+  struct rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
+{
+  // XXX note we don't attempt to rebind the pointer's Reference type (or Derived)
+  typedef thrust::pointer<NewElement,Tag> type;
+};
+
+template<typename Element, typename Tag, typename Reference, typename NewElement>
+  struct rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
+    : rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
+{};
+
+template<typename Element, typename Tag, typename Reference, typename Derived, typename NewElement>
+  struct rebind_pointer<thrust::pointer<Element,Tag,Reference,Derived>, NewElement>
+    : rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
+{};
+#endif
+
+} // end namespace detail
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/preprocessor.h b/thrust/thrust/detail/preprocessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e9943b76f84ed5481364aa1fce7d35970d26097
--- /dev/null
+++ b/thrust/thrust/detail/preprocessor.h
@@ -0,0 +1,1182 @@
+// Copyright (c) 2017-2018 NVIDIA Corporation
+// Copyright (c) 2014-2018 Bryce Adelstein Lelbach
+// Copyright (c) 2001-2015 Housemarque Oy (housemarque.com)
+// Copyright (c) 2007-2015 Hartmut Kaiser
+// Copyright (c)      2002 Peter Dimov and Multi Media Ltd
+//                         (`THRUST_CURRENT_FUNCTION`)
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_STRINGIZE(expr)
+/// \brief Stringizes the expression \a expr.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_STRINGIZE(foo) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << "foo" << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_STRINGIZE(expr) THRUST_PP_STRINGIZE_IMPL0(expr)
+#define THRUST_PP_STRINGIZE_IMPL0(expr) #expr
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_CAT2(a, b)
+/// \brief Concatenates the tokens \a a and \b b.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_CAT2(1, THRUST_PP_CAT2(2, 3)) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 123 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_CAT2(a, b) THRUST_PP_CAT2_IMPL0(a, b)
+
+#if    defined(_MSC_VER)                                                      \
+  && (defined(__EDG__) || defined(__EDG_VERSION__))                         \
+  && (defined(__INTELLISENSE__) || __EDG_VERSION__ >= 308)
+  #define THRUST_PP_CAT2_IMPL0(a, b) THRUST_PP_CAT2_IMPL1(~, a ## b)
+  #define THRUST_PP_CAT2_IMPL1(p, res) res
+#else
+  #define THRUST_PP_CAT2_IMPL0(a, b) a ## b
+#endif
+
+#define THRUST_PP_CAT3(a, b, c)                                               \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b, c))                                                     \
+  /**/
+
+#define THRUST_PP_CAT4(a, b, c, d)                                            \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b,                                                         \
+      THRUST_PP_CAT2(c, d)))                                                  \
+  /**/
+
+#define THRUST_PP_CAT5(a, b, c, d, e)                                         \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b,                                                         \
+      THRUST_PP_CAT2(c,                                                       \
+        THRUST_PP_CAT2(d, e))))                                               \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_EXPAND(x)
+/// \brief Performs macro expansion on \a x.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// #define FOO_BAR() "foo_bar"
+/// #define BUZZ()     THRUST_PP_EXPAND(THRUST_PP_CAT2(FOO_, BAR)())
+///
+/// int main()
+/// {
+///   std::cout << BUZZ() << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << "foo_bar" << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_EXPAND(x) THRUST_PP_EXPAND_IMPL0(x)
+#define THRUST_PP_EXPAND_IMPL0(x) x
+
+#define THRUST_PP_EXPAND_ARGS(...) THRUST_PP_EXPAND_ARGS_IMPL0(__VA_ARGS__)
+#define THRUST_PP_EXPAND_ARGS_IMPL0(...) __VA_ARGS__
+
+#define THRUST_PP_HEAD(x, ...) x
+
+#define THRUST_PP_TAIL(x, ...) __VA_ARGS__
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_EMPTY()
+
+#define THRUST_PP_COMMA() ,
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_INC(x) THRUST_PP_INC_IMPL0(x)
+
+#define THRUST_PP_INC_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_INC_IMPL_TAG, x)
+
+#define THRUST_PP_INC_IMPL_TAG0 1
+#define THRUST_PP_INC_IMPL_TAG1 2
+#define THRUST_PP_INC_IMPL_TAG2 3
+#define THRUST_PP_INC_IMPL_TAG3 4
+#define THRUST_PP_INC_IMPL_TAG4 5
+#define THRUST_PP_INC_IMPL_TAG5 6
+#define THRUST_PP_INC_IMPL_TAG6 7
+#define THRUST_PP_INC_IMPL_TAG7 8
+#define THRUST_PP_INC_IMPL_TAG8 9
+#define THRUST_PP_INC_IMPL_TAG9 10
+#define THRUST_PP_INC_IMPL_TAG10 11
+#define THRUST_PP_INC_IMPL_TAG11 12
+#define THRUST_PP_INC_IMPL_TAG12 13
+#define THRUST_PP_INC_IMPL_TAG13 14
+#define THRUST_PP_INC_IMPL_TAG14 15
+#define THRUST_PP_INC_IMPL_TAG15 16
+#define THRUST_PP_INC_IMPL_TAG16 17
+#define THRUST_PP_INC_IMPL_TAG17 18
+#define THRUST_PP_INC_IMPL_TAG18 19
+#define THRUST_PP_INC_IMPL_TAG19 20
+#define THRUST_PP_INC_IMPL_TAG20 21
+#define THRUST_PP_INC_IMPL_TAG21 22
+#define THRUST_PP_INC_IMPL_TAG22 23
+#define THRUST_PP_INC_IMPL_TAG23 24
+#define THRUST_PP_INC_IMPL_TAG24 25
+#define THRUST_PP_INC_IMPL_TAG25 26
+#define THRUST_PP_INC_IMPL_TAG26 27
+#define THRUST_PP_INC_IMPL_TAG27 28
+#define THRUST_PP_INC_IMPL_TAG28 29
+#define THRUST_PP_INC_IMPL_TAG29 30
+#define THRUST_PP_INC_IMPL_TAG30 31
+#define THRUST_PP_INC_IMPL_TAG31 32
+#define THRUST_PP_INC_IMPL_TAG32 33
+#define THRUST_PP_INC_IMPL_TAG33 34
+#define THRUST_PP_INC_IMPL_TAG34 35
+#define THRUST_PP_INC_IMPL_TAG35 36
+#define THRUST_PP_INC_IMPL_TAG36 37
+#define THRUST_PP_INC_IMPL_TAG37 38
+#define THRUST_PP_INC_IMPL_TAG38 39
+#define THRUST_PP_INC_IMPL_TAG39 40
+#define THRUST_PP_INC_IMPL_TAG40 41
+#define THRUST_PP_INC_IMPL_TAG41 42
+#define THRUST_PP_INC_IMPL_TAG42 43
+#define THRUST_PP_INC_IMPL_TAG43 44
+#define THRUST_PP_INC_IMPL_TAG44 45
+#define THRUST_PP_INC_IMPL_TAG45 46
+#define THRUST_PP_INC_IMPL_TAG46 47
+#define THRUST_PP_INC_IMPL_TAG47 48
+#define THRUST_PP_INC_IMPL_TAG48 49
+#define THRUST_PP_INC_IMPL_TAG49 50
+#define THRUST_PP_INC_IMPL_TAG50 51
+#define THRUST_PP_INC_IMPL_TAG51 52
+#define THRUST_PP_INC_IMPL_TAG52 53
+#define THRUST_PP_INC_IMPL_TAG53 54
+#define THRUST_PP_INC_IMPL_TAG54 55
+#define THRUST_PP_INC_IMPL_TAG55 56
+#define THRUST_PP_INC_IMPL_TAG56 57
+#define THRUST_PP_INC_IMPL_TAG57 58
+#define THRUST_PP_INC_IMPL_TAG58 59
+#define THRUST_PP_INC_IMPL_TAG59 60
+#define THRUST_PP_INC_IMPL_TAG60 61
+#define THRUST_PP_INC_IMPL_TAG61 62
+#define THRUST_PP_INC_IMPL_TAG62 63
+#define THRUST_PP_INC_IMPL_TAG63 64
+#define THRUST_PP_INC_IMPL_TAG64 65
+#define THRUST_PP_INC_IMPL_TAG65 66
+#define THRUST_PP_INC_IMPL_TAG66 67
+#define THRUST_PP_INC_IMPL_TAG67 68
+#define THRUST_PP_INC_IMPL_TAG68 69
+#define THRUST_PP_INC_IMPL_TAG69 70
+#define THRUST_PP_INC_IMPL_TAG70 71
+#define THRUST_PP_INC_IMPL_TAG71 72
+#define THRUST_PP_INC_IMPL_TAG72 73
+#define THRUST_PP_INC_IMPL_TAG73 74
+#define THRUST_PP_INC_IMPL_TAG74 75
+#define THRUST_PP_INC_IMPL_TAG75 76
+#define THRUST_PP_INC_IMPL_TAG76 77
+#define THRUST_PP_INC_IMPL_TAG77 78
+#define THRUST_PP_INC_IMPL_TAG78 79
+#define THRUST_PP_INC_IMPL_TAG79 80
+#define THRUST_PP_INC_IMPL_TAG80 81
+#define THRUST_PP_INC_IMPL_TAG81 82
+#define THRUST_PP_INC_IMPL_TAG82 83
+#define THRUST_PP_INC_IMPL_TAG83 84
+#define THRUST_PP_INC_IMPL_TAG84 85
+#define THRUST_PP_INC_IMPL_TAG85 86
+#define THRUST_PP_INC_IMPL_TAG86 87
+#define THRUST_PP_INC_IMPL_TAG87 88
+#define THRUST_PP_INC_IMPL_TAG88 89
+#define THRUST_PP_INC_IMPL_TAG89 90
+#define THRUST_PP_INC_IMPL_TAG90 91
+#define THRUST_PP_INC_IMPL_TAG91 92
+#define THRUST_PP_INC_IMPL_TAG92 93
+#define THRUST_PP_INC_IMPL_TAG93 94
+#define THRUST_PP_INC_IMPL_TAG94 95
+#define THRUST_PP_INC_IMPL_TAG95 96
+#define THRUST_PP_INC_IMPL_TAG96 97
+#define THRUST_PP_INC_IMPL_TAG97 98
+#define THRUST_PP_INC_IMPL_TAG98 99
+#define THRUST_PP_INC_IMPL_TAG99 100
+#define THRUST_PP_INC_IMPL_TAG100 101
+#define THRUST_PP_INC_IMPL_TAG101 102
+#define THRUST_PP_INC_IMPL_TAG102 103
+#define THRUST_PP_INC_IMPL_TAG103 104
+#define THRUST_PP_INC_IMPL_TAG104 105
+#define THRUST_PP_INC_IMPL_TAG105 106
+#define THRUST_PP_INC_IMPL_TAG106 107
+#define THRUST_PP_INC_IMPL_TAG107 108
+#define THRUST_PP_INC_IMPL_TAG108 109
+#define THRUST_PP_INC_IMPL_TAG109 110
+#define THRUST_PP_INC_IMPL_TAG110 111
+#define THRUST_PP_INC_IMPL_TAG111 112
+#define THRUST_PP_INC_IMPL_TAG112 113
+#define THRUST_PP_INC_IMPL_TAG113 114
+#define THRUST_PP_INC_IMPL_TAG114 115
+#define THRUST_PP_INC_IMPL_TAG115 116
+#define THRUST_PP_INC_IMPL_TAG116 117
+#define THRUST_PP_INC_IMPL_TAG117 118
+#define THRUST_PP_INC_IMPL_TAG118 119
+#define THRUST_PP_INC_IMPL_TAG119 120
+#define THRUST_PP_INC_IMPL_TAG120 121
+#define THRUST_PP_INC_IMPL_TAG121 122
+#define THRUST_PP_INC_IMPL_TAG122 123
+#define THRUST_PP_INC_IMPL_TAG123 124
+#define THRUST_PP_INC_IMPL_TAG124 125
+#define THRUST_PP_INC_IMPL_TAG125 126
+#define THRUST_PP_INC_IMPL_TAG126 127
+#define THRUST_PP_INC_IMPL_TAG127 128
+#define THRUST_PP_INC_IMPL_TAG128 129
+#define THRUST_PP_INC_IMPL_TAG129 130
+#define THRUST_PP_INC_IMPL_TAG130 131
+#define THRUST_PP_INC_IMPL_TAG131 132
+#define THRUST_PP_INC_IMPL_TAG132 133
+#define THRUST_PP_INC_IMPL_TAG133 134
+#define THRUST_PP_INC_IMPL_TAG134 135
+#define THRUST_PP_INC_IMPL_TAG135 136
+#define THRUST_PP_INC_IMPL_TAG136 137
+#define THRUST_PP_INC_IMPL_TAG137 138
+#define THRUST_PP_INC_IMPL_TAG138 139
+#define THRUST_PP_INC_IMPL_TAG139 140
+#define THRUST_PP_INC_IMPL_TAG140 141
+#define THRUST_PP_INC_IMPL_TAG141 142
+#define THRUST_PP_INC_IMPL_TAG142 143
+#define THRUST_PP_INC_IMPL_TAG143 144
+#define THRUST_PP_INC_IMPL_TAG144 145
+#define THRUST_PP_INC_IMPL_TAG145 146
+#define THRUST_PP_INC_IMPL_TAG146 147
+#define THRUST_PP_INC_IMPL_TAG147 148
+#define THRUST_PP_INC_IMPL_TAG148 149
+#define THRUST_PP_INC_IMPL_TAG149 150
+#define THRUST_PP_INC_IMPL_TAG150 151
+#define THRUST_PP_INC_IMPL_TAG151 152
+#define THRUST_PP_INC_IMPL_TAG152 153
+#define THRUST_PP_INC_IMPL_TAG153 154
+#define THRUST_PP_INC_IMPL_TAG154 155
+#define THRUST_PP_INC_IMPL_TAG155 156
+#define THRUST_PP_INC_IMPL_TAG156 157
+#define THRUST_PP_INC_IMPL_TAG157 158
+#define THRUST_PP_INC_IMPL_TAG158 159
+#define THRUST_PP_INC_IMPL_TAG159 160
+#define THRUST_PP_INC_IMPL_TAG160 161
+#define THRUST_PP_INC_IMPL_TAG161 162
+#define THRUST_PP_INC_IMPL_TAG162 163
+#define THRUST_PP_INC_IMPL_TAG163 164
+#define THRUST_PP_INC_IMPL_TAG164 165
+#define THRUST_PP_INC_IMPL_TAG165 166
+#define THRUST_PP_INC_IMPL_TAG166 167
+#define THRUST_PP_INC_IMPL_TAG167 168
+#define THRUST_PP_INC_IMPL_TAG168 169
+#define THRUST_PP_INC_IMPL_TAG169 170
+#define THRUST_PP_INC_IMPL_TAG170 171
+#define THRUST_PP_INC_IMPL_TAG171 172
+#define THRUST_PP_INC_IMPL_TAG172 173
+#define THRUST_PP_INC_IMPL_TAG173 174
+#define THRUST_PP_INC_IMPL_TAG174 175
+#define THRUST_PP_INC_IMPL_TAG175 176
+#define THRUST_PP_INC_IMPL_TAG176 177
+#define THRUST_PP_INC_IMPL_TAG177 178
+#define THRUST_PP_INC_IMPL_TAG178 179
+#define THRUST_PP_INC_IMPL_TAG179 180
+#define THRUST_PP_INC_IMPL_TAG180 181
+#define THRUST_PP_INC_IMPL_TAG181 182
+#define THRUST_PP_INC_IMPL_TAG182 183
+#define THRUST_PP_INC_IMPL_TAG183 184
+#define THRUST_PP_INC_IMPL_TAG184 185
+#define THRUST_PP_INC_IMPL_TAG185 186
+#define THRUST_PP_INC_IMPL_TAG186 187
+#define THRUST_PP_INC_IMPL_TAG187 188
+#define THRUST_PP_INC_IMPL_TAG188 189
+#define THRUST_PP_INC_IMPL_TAG189 190
+#define THRUST_PP_INC_IMPL_TAG190 191
+#define THRUST_PP_INC_IMPL_TAG191 192
+#define THRUST_PP_INC_IMPL_TAG192 193
+#define THRUST_PP_INC_IMPL_TAG193 194
+#define THRUST_PP_INC_IMPL_TAG194 195
+#define THRUST_PP_INC_IMPL_TAG195 196
+#define THRUST_PP_INC_IMPL_TAG196 197
+#define THRUST_PP_INC_IMPL_TAG197 198
+#define THRUST_PP_INC_IMPL_TAG198 199
+#define THRUST_PP_INC_IMPL_TAG199 200
+#define THRUST_PP_INC_IMPL_TAG200 201
+#define THRUST_PP_INC_IMPL_TAG201 202
+#define THRUST_PP_INC_IMPL_TAG202 203
+#define THRUST_PP_INC_IMPL_TAG203 204
+#define THRUST_PP_INC_IMPL_TAG204 205
+#define THRUST_PP_INC_IMPL_TAG205 206
+#define THRUST_PP_INC_IMPL_TAG206 207
+#define THRUST_PP_INC_IMPL_TAG207 208
+#define THRUST_PP_INC_IMPL_TAG208 209
+#define THRUST_PP_INC_IMPL_TAG209 210
+#define THRUST_PP_INC_IMPL_TAG210 211
+#define THRUST_PP_INC_IMPL_TAG211 212
+#define THRUST_PP_INC_IMPL_TAG212 213
+#define THRUST_PP_INC_IMPL_TAG213 214
+#define THRUST_PP_INC_IMPL_TAG214 215
+#define THRUST_PP_INC_IMPL_TAG215 216
+#define THRUST_PP_INC_IMPL_TAG216 217
+#define THRUST_PP_INC_IMPL_TAG217 218
+#define THRUST_PP_INC_IMPL_TAG218 219
+#define THRUST_PP_INC_IMPL_TAG219 220
+#define THRUST_PP_INC_IMPL_TAG220 221
+#define THRUST_PP_INC_IMPL_TAG221 222
+#define THRUST_PP_INC_IMPL_TAG222 223
+#define THRUST_PP_INC_IMPL_TAG223 224
+#define THRUST_PP_INC_IMPL_TAG224 225
+#define THRUST_PP_INC_IMPL_TAG225 226
+#define THRUST_PP_INC_IMPL_TAG226 227
+#define THRUST_PP_INC_IMPL_TAG227 228
+#define THRUST_PP_INC_IMPL_TAG228 229
+#define THRUST_PP_INC_IMPL_TAG229 230
+#define THRUST_PP_INC_IMPL_TAG230 231
+#define THRUST_PP_INC_IMPL_TAG231 232
+#define THRUST_PP_INC_IMPL_TAG232 233
+#define THRUST_PP_INC_IMPL_TAG233 234
+#define THRUST_PP_INC_IMPL_TAG234 235
+#define THRUST_PP_INC_IMPL_TAG235 236
+#define THRUST_PP_INC_IMPL_TAG236 237
+#define THRUST_PP_INC_IMPL_TAG237 238
+#define THRUST_PP_INC_IMPL_TAG238 239
+#define THRUST_PP_INC_IMPL_TAG239 240
+#define THRUST_PP_INC_IMPL_TAG240 241
+#define THRUST_PP_INC_IMPL_TAG241 242
+#define THRUST_PP_INC_IMPL_TAG242 243
+#define THRUST_PP_INC_IMPL_TAG243 244
+#define THRUST_PP_INC_IMPL_TAG244 245
+#define THRUST_PP_INC_IMPL_TAG245 246
+#define THRUST_PP_INC_IMPL_TAG246 247
+#define THRUST_PP_INC_IMPL_TAG247 248
+#define THRUST_PP_INC_IMPL_TAG248 249
+#define THRUST_PP_INC_IMPL_TAG249 250
+#define THRUST_PP_INC_IMPL_TAG250 251
+#define THRUST_PP_INC_IMPL_TAG251 252
+#define THRUST_PP_INC_IMPL_TAG252 253
+#define THRUST_PP_INC_IMPL_TAG253 254
+#define THRUST_PP_INC_IMPL_TAG254 255
+#define THRUST_PP_INC_IMPL_TAG255 256
+#define THRUST_PP_INC_IMPL_TAG256 256
+
+#define THRUST_PP_DEC(x) THRUST_PP_DEC_IMPL0(x)
+
+#define THRUST_PP_DEC_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_DEC_IMPL_TAG, x)
+
+#define THRUST_PP_DEC_IMPL_TAG0 0
+#define THRUST_PP_DEC_IMPL_TAG1 0
+#define THRUST_PP_DEC_IMPL_TAG2 1
+#define THRUST_PP_DEC_IMPL_TAG3 2
+#define THRUST_PP_DEC_IMPL_TAG4 3
+#define THRUST_PP_DEC_IMPL_TAG5 4
+#define THRUST_PP_DEC_IMPL_TAG6 5
+#define THRUST_PP_DEC_IMPL_TAG7 6
+#define THRUST_PP_DEC_IMPL_TAG8 7
+#define THRUST_PP_DEC_IMPL_TAG9 8
+#define THRUST_PP_DEC_IMPL_TAG10 9
+#define THRUST_PP_DEC_IMPL_TAG11 10
+#define THRUST_PP_DEC_IMPL_TAG12 11
+#define THRUST_PP_DEC_IMPL_TAG13 12
+#define THRUST_PP_DEC_IMPL_TAG14 13
+#define THRUST_PP_DEC_IMPL_TAG15 14
+#define THRUST_PP_DEC_IMPL_TAG16 15
+#define THRUST_PP_DEC_IMPL_TAG17 16
+#define THRUST_PP_DEC_IMPL_TAG18 17
+#define THRUST_PP_DEC_IMPL_TAG19 18
+#define THRUST_PP_DEC_IMPL_TAG20 19
+#define THRUST_PP_DEC_IMPL_TAG21 20
+#define THRUST_PP_DEC_IMPL_TAG22 21
+#define THRUST_PP_DEC_IMPL_TAG23 22
+#define THRUST_PP_DEC_IMPL_TAG24 23
+#define THRUST_PP_DEC_IMPL_TAG25 24
+#define THRUST_PP_DEC_IMPL_TAG26 25
+#define THRUST_PP_DEC_IMPL_TAG27 26
+#define THRUST_PP_DEC_IMPL_TAG28 27
+#define THRUST_PP_DEC_IMPL_TAG29 28
+#define THRUST_PP_DEC_IMPL_TAG30 29
+#define THRUST_PP_DEC_IMPL_TAG31 30
+#define THRUST_PP_DEC_IMPL_TAG32 31
+#define THRUST_PP_DEC_IMPL_TAG33 32
+#define THRUST_PP_DEC_IMPL_TAG34 33
+#define THRUST_PP_DEC_IMPL_TAG35 34
+#define THRUST_PP_DEC_IMPL_TAG36 35
+#define THRUST_PP_DEC_IMPL_TAG37 36
+#define THRUST_PP_DEC_IMPL_TAG38 37
+#define THRUST_PP_DEC_IMPL_TAG39 38
+#define THRUST_PP_DEC_IMPL_TAG40 39
+#define THRUST_PP_DEC_IMPL_TAG41 40
+#define THRUST_PP_DEC_IMPL_TAG42 41
+#define THRUST_PP_DEC_IMPL_TAG43 42
+#define THRUST_PP_DEC_IMPL_TAG44 43
+#define THRUST_PP_DEC_IMPL_TAG45 44
+#define THRUST_PP_DEC_IMPL_TAG46 45
+#define THRUST_PP_DEC_IMPL_TAG47 46
+#define THRUST_PP_DEC_IMPL_TAG48 47
+#define THRUST_PP_DEC_IMPL_TAG49 48
+#define THRUST_PP_DEC_IMPL_TAG50 49
+#define THRUST_PP_DEC_IMPL_TAG51 50
+#define THRUST_PP_DEC_IMPL_TAG52 51
+#define THRUST_PP_DEC_IMPL_TAG53 52
+#define THRUST_PP_DEC_IMPL_TAG54 53
+#define THRUST_PP_DEC_IMPL_TAG55 54
+#define THRUST_PP_DEC_IMPL_TAG56 55
+#define THRUST_PP_DEC_IMPL_TAG57 56
+#define THRUST_PP_DEC_IMPL_TAG58 57
+#define THRUST_PP_DEC_IMPL_TAG59 58
+#define THRUST_PP_DEC_IMPL_TAG60 59
+#define THRUST_PP_DEC_IMPL_TAG61 60
+#define THRUST_PP_DEC_IMPL_TAG62 61
+#define THRUST_PP_DEC_IMPL_TAG63 62
+#define THRUST_PP_DEC_IMPL_TAG64 63
+#define THRUST_PP_DEC_IMPL_TAG65 64
+#define THRUST_PP_DEC_IMPL_TAG66 65
+#define THRUST_PP_DEC_IMPL_TAG67 66
+#define THRUST_PP_DEC_IMPL_TAG68 67
+#define THRUST_PP_DEC_IMPL_TAG69 68
+#define THRUST_PP_DEC_IMPL_TAG70 69
+#define THRUST_PP_DEC_IMPL_TAG71 70
+#define THRUST_PP_DEC_IMPL_TAG72 71
+#define THRUST_PP_DEC_IMPL_TAG73 72
+#define THRUST_PP_DEC_IMPL_TAG74 73
+#define THRUST_PP_DEC_IMPL_TAG75 74
+#define THRUST_PP_DEC_IMPL_TAG76 75
+#define THRUST_PP_DEC_IMPL_TAG77 76
+#define THRUST_PP_DEC_IMPL_TAG78 77
+#define THRUST_PP_DEC_IMPL_TAG79 78
+#define THRUST_PP_DEC_IMPL_TAG80 79
+#define THRUST_PP_DEC_IMPL_TAG81 80
+#define THRUST_PP_DEC_IMPL_TAG82 81
+#define THRUST_PP_DEC_IMPL_TAG83 82
+#define THRUST_PP_DEC_IMPL_TAG84 83
+#define THRUST_PP_DEC_IMPL_TAG85 84
+#define THRUST_PP_DEC_IMPL_TAG86 85
+#define THRUST_PP_DEC_IMPL_TAG87 86
+#define THRUST_PP_DEC_IMPL_TAG88 87
+#define THRUST_PP_DEC_IMPL_TAG89 88
+#define THRUST_PP_DEC_IMPL_TAG90 89
+#define THRUST_PP_DEC_IMPL_TAG91 90
+#define THRUST_PP_DEC_IMPL_TAG92 91
+#define THRUST_PP_DEC_IMPL_TAG93 92
+#define THRUST_PP_DEC_IMPL_TAG94 93
+#define THRUST_PP_DEC_IMPL_TAG95 94
+#define THRUST_PP_DEC_IMPL_TAG96 95
+#define THRUST_PP_DEC_IMPL_TAG97 96
+#define THRUST_PP_DEC_IMPL_TAG98 97
+#define THRUST_PP_DEC_IMPL_TAG99 98
+#define THRUST_PP_DEC_IMPL_TAG100 99
+#define THRUST_PP_DEC_IMPL_TAG101 100
+#define THRUST_PP_DEC_IMPL_TAG102 101
+#define THRUST_PP_DEC_IMPL_TAG103 102
+#define THRUST_PP_DEC_IMPL_TAG104 103
+#define THRUST_PP_DEC_IMPL_TAG105 104
+#define THRUST_PP_DEC_IMPL_TAG106 105
+#define THRUST_PP_DEC_IMPL_TAG107 106
+#define THRUST_PP_DEC_IMPL_TAG108 107
+#define THRUST_PP_DEC_IMPL_TAG109 108
+#define THRUST_PP_DEC_IMPL_TAG110 109
+#define THRUST_PP_DEC_IMPL_TAG111 110
+#define THRUST_PP_DEC_IMPL_TAG112 111
+#define THRUST_PP_DEC_IMPL_TAG113 112
+#define THRUST_PP_DEC_IMPL_TAG114 113
+#define THRUST_PP_DEC_IMPL_TAG115 114
+#define THRUST_PP_DEC_IMPL_TAG116 115
+#define THRUST_PP_DEC_IMPL_TAG117 116
+#define THRUST_PP_DEC_IMPL_TAG118 117
+#define THRUST_PP_DEC_IMPL_TAG119 118
+#define THRUST_PP_DEC_IMPL_TAG120 119
+#define THRUST_PP_DEC_IMPL_TAG121 120
+#define THRUST_PP_DEC_IMPL_TAG122 121
+#define THRUST_PP_DEC_IMPL_TAG123 122
+#define THRUST_PP_DEC_IMPL_TAG124 123
+#define THRUST_PP_DEC_IMPL_TAG125 124
+#define THRUST_PP_DEC_IMPL_TAG126 125
+#define THRUST_PP_DEC_IMPL_TAG127 126
+#define THRUST_PP_DEC_IMPL_TAG128 127
+#define THRUST_PP_DEC_IMPL_TAG129 128
+#define THRUST_PP_DEC_IMPL_TAG130 129
+#define THRUST_PP_DEC_IMPL_TAG131 130
+#define THRUST_PP_DEC_IMPL_TAG132 131
+#define THRUST_PP_DEC_IMPL_TAG133 132
+#define THRUST_PP_DEC_IMPL_TAG134 133
+#define THRUST_PP_DEC_IMPL_TAG135 134
+#define THRUST_PP_DEC_IMPL_TAG136 135
+#define THRUST_PP_DEC_IMPL_TAG137 136
+#define THRUST_PP_DEC_IMPL_TAG138 137
+#define THRUST_PP_DEC_IMPL_TAG139 138
+#define THRUST_PP_DEC_IMPL_TAG140 139
+#define THRUST_PP_DEC_IMPL_TAG141 140
+#define THRUST_PP_DEC_IMPL_TAG142 141
+#define THRUST_PP_DEC_IMPL_TAG143 142
+#define THRUST_PP_DEC_IMPL_TAG144 143
+#define THRUST_PP_DEC_IMPL_TAG145 144
+#define THRUST_PP_DEC_IMPL_TAG146 145
+#define THRUST_PP_DEC_IMPL_TAG147 146
+#define THRUST_PP_DEC_IMPL_TAG148 147
+#define THRUST_PP_DEC_IMPL_TAG149 148
+#define THRUST_PP_DEC_IMPL_TAG150 149
+#define THRUST_PP_DEC_IMPL_TAG151 150
+#define THRUST_PP_DEC_IMPL_TAG152 151
+#define THRUST_PP_DEC_IMPL_TAG153 152
+#define THRUST_PP_DEC_IMPL_TAG154 153
+#define THRUST_PP_DEC_IMPL_TAG155 154
+#define THRUST_PP_DEC_IMPL_TAG156 155
+#define THRUST_PP_DEC_IMPL_TAG157 156
+#define THRUST_PP_DEC_IMPL_TAG158 157
+#define THRUST_PP_DEC_IMPL_TAG159 158
+#define THRUST_PP_DEC_IMPL_TAG160 159
+#define THRUST_PP_DEC_IMPL_TAG161 160
+#define THRUST_PP_DEC_IMPL_TAG162 161
+#define THRUST_PP_DEC_IMPL_TAG163 162
+#define THRUST_PP_DEC_IMPL_TAG164 163
+#define THRUST_PP_DEC_IMPL_TAG165 164
+#define THRUST_PP_DEC_IMPL_TAG166 165
+#define THRUST_PP_DEC_IMPL_TAG167 166
+#define THRUST_PP_DEC_IMPL_TAG168 167
+#define THRUST_PP_DEC_IMPL_TAG169 168
+#define THRUST_PP_DEC_IMPL_TAG170 169
+#define THRUST_PP_DEC_IMPL_TAG171 170
+#define THRUST_PP_DEC_IMPL_TAG172 171
+#define THRUST_PP_DEC_IMPL_TAG173 172
+#define THRUST_PP_DEC_IMPL_TAG174 173
+#define THRUST_PP_DEC_IMPL_TAG175 174
+#define THRUST_PP_DEC_IMPL_TAG176 175
+#define THRUST_PP_DEC_IMPL_TAG177 176
+#define THRUST_PP_DEC_IMPL_TAG178 177
+#define THRUST_PP_DEC_IMPL_TAG179 178
+#define THRUST_PP_DEC_IMPL_TAG180 179
+#define THRUST_PP_DEC_IMPL_TAG181 180
+#define THRUST_PP_DEC_IMPL_TAG182 181
+#define THRUST_PP_DEC_IMPL_TAG183 182
+#define THRUST_PP_DEC_IMPL_TAG184 183
+#define THRUST_PP_DEC_IMPL_TAG185 184
+#define THRUST_PP_DEC_IMPL_TAG186 185
+#define THRUST_PP_DEC_IMPL_TAG187 186
+#define THRUST_PP_DEC_IMPL_TAG188 187
+#define THRUST_PP_DEC_IMPL_TAG189 188
+#define THRUST_PP_DEC_IMPL_TAG190 189
+#define THRUST_PP_DEC_IMPL_TAG191 190
+#define THRUST_PP_DEC_IMPL_TAG192 191
+#define THRUST_PP_DEC_IMPL_TAG193 192
+#define THRUST_PP_DEC_IMPL_TAG194 193
+#define THRUST_PP_DEC_IMPL_TAG195 194
+#define THRUST_PP_DEC_IMPL_TAG196 195
+#define THRUST_PP_DEC_IMPL_TAG197 196
+#define THRUST_PP_DEC_IMPL_TAG198 197
+#define THRUST_PP_DEC_IMPL_TAG199 198
+#define THRUST_PP_DEC_IMPL_TAG200 199
+#define THRUST_PP_DEC_IMPL_TAG201 200
+#define THRUST_PP_DEC_IMPL_TAG202 201
+#define THRUST_PP_DEC_IMPL_TAG203 202
+#define THRUST_PP_DEC_IMPL_TAG204 203
+#define THRUST_PP_DEC_IMPL_TAG205 204
+#define THRUST_PP_DEC_IMPL_TAG206 205
+#define THRUST_PP_DEC_IMPL_TAG207 206
+#define THRUST_PP_DEC_IMPL_TAG208 207
+#define THRUST_PP_DEC_IMPL_TAG209 208
+#define THRUST_PP_DEC_IMPL_TAG210 209
+#define THRUST_PP_DEC_IMPL_TAG211 210
+#define THRUST_PP_DEC_IMPL_TAG212 211
+#define THRUST_PP_DEC_IMPL_TAG213 212
+#define THRUST_PP_DEC_IMPL_TAG214 213
+#define THRUST_PP_DEC_IMPL_TAG215 214
+#define THRUST_PP_DEC_IMPL_TAG216 215
+#define THRUST_PP_DEC_IMPL_TAG217 216
+#define THRUST_PP_DEC_IMPL_TAG218 217
+#define THRUST_PP_DEC_IMPL_TAG219 218
+#define THRUST_PP_DEC_IMPL_TAG220 219
+#define THRUST_PP_DEC_IMPL_TAG221 220
+#define THRUST_PP_DEC_IMPL_TAG222 221
+#define THRUST_PP_DEC_IMPL_TAG223 222
+#define THRUST_PP_DEC_IMPL_TAG224 223
+#define THRUST_PP_DEC_IMPL_TAG225 224
+#define THRUST_PP_DEC_IMPL_TAG226 225
+#define THRUST_PP_DEC_IMPL_TAG227 226
+#define THRUST_PP_DEC_IMPL_TAG228 227
+#define THRUST_PP_DEC_IMPL_TAG229 228
+#define THRUST_PP_DEC_IMPL_TAG230 229
+#define THRUST_PP_DEC_IMPL_TAG231 230
+#define THRUST_PP_DEC_IMPL_TAG232 231
+#define THRUST_PP_DEC_IMPL_TAG233 232
+#define THRUST_PP_DEC_IMPL_TAG234 233
+#define THRUST_PP_DEC_IMPL_TAG235 234
+#define THRUST_PP_DEC_IMPL_TAG236 235
+#define THRUST_PP_DEC_IMPL_TAG237 236
+#define THRUST_PP_DEC_IMPL_TAG238 237
+#define THRUST_PP_DEC_IMPL_TAG239 238
+#define THRUST_PP_DEC_IMPL_TAG240 239
+#define THRUST_PP_DEC_IMPL_TAG241 240
+#define THRUST_PP_DEC_IMPL_TAG242 241
+#define THRUST_PP_DEC_IMPL_TAG243 242
+#define THRUST_PP_DEC_IMPL_TAG244 243
+#define THRUST_PP_DEC_IMPL_TAG245 244
+#define THRUST_PP_DEC_IMPL_TAG246 245
+#define THRUST_PP_DEC_IMPL_TAG247 246
+#define THRUST_PP_DEC_IMPL_TAG248 247
+#define THRUST_PP_DEC_IMPL_TAG249 248
+#define THRUST_PP_DEC_IMPL_TAG250 249
+#define THRUST_PP_DEC_IMPL_TAG251 250
+#define THRUST_PP_DEC_IMPL_TAG252 251
+#define THRUST_PP_DEC_IMPL_TAG253 252
+#define THRUST_PP_DEC_IMPL_TAG254 253
+#define THRUST_PP_DEC_IMPL_TAG255 254
+#define THRUST_PP_DEC_IMPL_TAG256 255
+#define THRUST_PP_DEC_IMPL_TAG257 256
+
+#define THRUST_PP_BOOL(x) THRUST_PP_BOOL_IMPL0(x)
+
+#define THRUST_PP_BOOL_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_BOOL_IMPL_TAG, x)
+
+#define THRUST_PP_BOOL_IMPL_TAG0 0
+#define THRUST_PP_BOOL_IMPL_TAG1 1
+#define THRUST_PP_BOOL_IMPL_TAG2 1
+#define THRUST_PP_BOOL_IMPL_TAG3 1
+#define THRUST_PP_BOOL_IMPL_TAG4 1
+#define THRUST_PP_BOOL_IMPL_TAG5 1
+#define THRUST_PP_BOOL_IMPL_TAG6 1
+#define THRUST_PP_BOOL_IMPL_TAG7 1
+#define THRUST_PP_BOOL_IMPL_TAG8 1
+#define THRUST_PP_BOOL_IMPL_TAG9 1
+#define THRUST_PP_BOOL_IMPL_TAG10 1
+#define THRUST_PP_BOOL_IMPL_TAG11 1
+#define THRUST_PP_BOOL_IMPL_TAG12 1
+#define THRUST_PP_BOOL_IMPL_TAG13 1
+#define THRUST_PP_BOOL_IMPL_TAG14 1
+#define THRUST_PP_BOOL_IMPL_TAG15 1
+#define THRUST_PP_BOOL_IMPL_TAG16 1
+#define THRUST_PP_BOOL_IMPL_TAG17 1
+#define THRUST_PP_BOOL_IMPL_TAG18 1
+#define THRUST_PP_BOOL_IMPL_TAG19 1
+#define THRUST_PP_BOOL_IMPL_TAG20 1
+#define THRUST_PP_BOOL_IMPL_TAG21 1
+#define THRUST_PP_BOOL_IMPL_TAG22 1
+#define THRUST_PP_BOOL_IMPL_TAG23 1
+#define THRUST_PP_BOOL_IMPL_TAG24 1
+#define THRUST_PP_BOOL_IMPL_TAG25 1
+#define THRUST_PP_BOOL_IMPL_TAG26 1
+#define THRUST_PP_BOOL_IMPL_TAG27 1
+#define THRUST_PP_BOOL_IMPL_TAG28 1
+#define THRUST_PP_BOOL_IMPL_TAG29 1
+#define THRUST_PP_BOOL_IMPL_TAG30 1
+#define THRUST_PP_BOOL_IMPL_TAG31 1
+#define THRUST_PP_BOOL_IMPL_TAG32 1
+#define THRUST_PP_BOOL_IMPL_TAG33 1
+#define THRUST_PP_BOOL_IMPL_TAG34 1
+#define THRUST_PP_BOOL_IMPL_TAG35 1
+#define THRUST_PP_BOOL_IMPL_TAG36 1
+#define THRUST_PP_BOOL_IMPL_TAG37 1
+#define THRUST_PP_BOOL_IMPL_TAG38 1
+#define THRUST_PP_BOOL_IMPL_TAG39 1
+#define THRUST_PP_BOOL_IMPL_TAG40 1
+#define THRUST_PP_BOOL_IMPL_TAG41 1
+#define THRUST_PP_BOOL_IMPL_TAG42 1
+#define THRUST_PP_BOOL_IMPL_TAG43 1
+#define THRUST_PP_BOOL_IMPL_TAG44 1
+#define THRUST_PP_BOOL_IMPL_TAG45 1
+#define THRUST_PP_BOOL_IMPL_TAG46 1
+#define THRUST_PP_BOOL_IMPL_TAG47 1
+#define THRUST_PP_BOOL_IMPL_TAG48 1
+#define THRUST_PP_BOOL_IMPL_TAG49 1
+#define THRUST_PP_BOOL_IMPL_TAG50 1
+#define THRUST_PP_BOOL_IMPL_TAG51 1
+#define THRUST_PP_BOOL_IMPL_TAG52 1
+#define THRUST_PP_BOOL_IMPL_TAG53 1
+#define THRUST_PP_BOOL_IMPL_TAG54 1
+#define THRUST_PP_BOOL_IMPL_TAG55 1
+#define THRUST_PP_BOOL_IMPL_TAG56 1
+#define THRUST_PP_BOOL_IMPL_TAG57 1
+#define THRUST_PP_BOOL_IMPL_TAG58 1
+#define THRUST_PP_BOOL_IMPL_TAG59 1
+#define THRUST_PP_BOOL_IMPL_TAG60 1
+#define THRUST_PP_BOOL_IMPL_TAG61 1
+#define THRUST_PP_BOOL_IMPL_TAG62 1
+#define THRUST_PP_BOOL_IMPL_TAG63 1
+#define THRUST_PP_BOOL_IMPL_TAG64 1
+#define THRUST_PP_BOOL_IMPL_TAG65 1
+#define THRUST_PP_BOOL_IMPL_TAG66 1
+#define THRUST_PP_BOOL_IMPL_TAG67 1
+#define THRUST_PP_BOOL_IMPL_TAG68 1
+#define THRUST_PP_BOOL_IMPL_TAG69 1
+#define THRUST_PP_BOOL_IMPL_TAG70 1
+#define THRUST_PP_BOOL_IMPL_TAG71 1
+#define THRUST_PP_BOOL_IMPL_TAG72 1
+#define THRUST_PP_BOOL_IMPL_TAG73 1
+#define THRUST_PP_BOOL_IMPL_TAG74 1
+#define THRUST_PP_BOOL_IMPL_TAG75 1
+#define THRUST_PP_BOOL_IMPL_TAG76 1
+#define THRUST_PP_BOOL_IMPL_TAG77 1
+#define THRUST_PP_BOOL_IMPL_TAG78 1
+#define THRUST_PP_BOOL_IMPL_TAG79 1
+#define THRUST_PP_BOOL_IMPL_TAG80 1
+#define THRUST_PP_BOOL_IMPL_TAG81 1
+#define THRUST_PP_BOOL_IMPL_TAG82 1
+#define THRUST_PP_BOOL_IMPL_TAG83 1
+#define THRUST_PP_BOOL_IMPL_TAG84 1
+#define THRUST_PP_BOOL_IMPL_TAG85 1
+#define THRUST_PP_BOOL_IMPL_TAG86 1
+#define THRUST_PP_BOOL_IMPL_TAG87 1
+#define THRUST_PP_BOOL_IMPL_TAG88 1
+#define THRUST_PP_BOOL_IMPL_TAG89 1
+#define THRUST_PP_BOOL_IMPL_TAG90 1
+#define THRUST_PP_BOOL_IMPL_TAG91 1
+#define THRUST_PP_BOOL_IMPL_TAG92 1
+#define THRUST_PP_BOOL_IMPL_TAG93 1
+#define THRUST_PP_BOOL_IMPL_TAG94 1
+#define THRUST_PP_BOOL_IMPL_TAG95 1
+#define THRUST_PP_BOOL_IMPL_TAG96 1
+#define THRUST_PP_BOOL_IMPL_TAG97 1
+#define THRUST_PP_BOOL_IMPL_TAG98 1
+#define THRUST_PP_BOOL_IMPL_TAG99 1
+#define THRUST_PP_BOOL_IMPL_TAG100 1
+#define THRUST_PP_BOOL_IMPL_TAG101 1
+#define THRUST_PP_BOOL_IMPL_TAG102 1
+#define THRUST_PP_BOOL_IMPL_TAG103 1
+#define THRUST_PP_BOOL_IMPL_TAG104 1
+#define THRUST_PP_BOOL_IMPL_TAG105 1
+#define THRUST_PP_BOOL_IMPL_TAG106 1
+#define THRUST_PP_BOOL_IMPL_TAG107 1
+#define THRUST_PP_BOOL_IMPL_TAG108 1
+#define THRUST_PP_BOOL_IMPL_TAG109 1
+#define THRUST_PP_BOOL_IMPL_TAG110 1
+#define THRUST_PP_BOOL_IMPL_TAG111 1
+#define THRUST_PP_BOOL_IMPL_TAG112 1
+#define THRUST_PP_BOOL_IMPL_TAG113 1
+#define THRUST_PP_BOOL_IMPL_TAG114 1
+#define THRUST_PP_BOOL_IMPL_TAG115 1
+#define THRUST_PP_BOOL_IMPL_TAG116 1
+#define THRUST_PP_BOOL_IMPL_TAG117 1
+#define THRUST_PP_BOOL_IMPL_TAG118 1
+#define THRUST_PP_BOOL_IMPL_TAG119 1
+#define THRUST_PP_BOOL_IMPL_TAG120 1
+#define THRUST_PP_BOOL_IMPL_TAG121 1
+#define THRUST_PP_BOOL_IMPL_TAG122 1
+#define THRUST_PP_BOOL_IMPL_TAG123 1
+#define THRUST_PP_BOOL_IMPL_TAG124 1
+#define THRUST_PP_BOOL_IMPL_TAG125 1
+#define THRUST_PP_BOOL_IMPL_TAG126 1
+#define THRUST_PP_BOOL_IMPL_TAG127 1
+#define THRUST_PP_BOOL_IMPL_TAG128 1
+#define THRUST_PP_BOOL_IMPL_TAG129 1
+#define THRUST_PP_BOOL_IMPL_TAG130 1
+#define THRUST_PP_BOOL_IMPL_TAG131 1
+#define THRUST_PP_BOOL_IMPL_TAG132 1
+#define THRUST_PP_BOOL_IMPL_TAG133 1
+#define THRUST_PP_BOOL_IMPL_TAG134 1
+#define THRUST_PP_BOOL_IMPL_TAG135 1
+#define THRUST_PP_BOOL_IMPL_TAG136 1
+#define THRUST_PP_BOOL_IMPL_TAG137 1
+#define THRUST_PP_BOOL_IMPL_TAG138 1
+#define THRUST_PP_BOOL_IMPL_TAG139 1
+#define THRUST_PP_BOOL_IMPL_TAG140 1
+#define THRUST_PP_BOOL_IMPL_TAG141 1
+#define THRUST_PP_BOOL_IMPL_TAG142 1
+#define THRUST_PP_BOOL_IMPL_TAG143 1
+#define THRUST_PP_BOOL_IMPL_TAG144 1
+#define THRUST_PP_BOOL_IMPL_TAG145 1
+#define THRUST_PP_BOOL_IMPL_TAG146 1
+#define THRUST_PP_BOOL_IMPL_TAG147 1
+#define THRUST_PP_BOOL_IMPL_TAG148 1
+#define THRUST_PP_BOOL_IMPL_TAG149 1
+#define THRUST_PP_BOOL_IMPL_TAG150 1
+#define THRUST_PP_BOOL_IMPL_TAG151 1
+#define THRUST_PP_BOOL_IMPL_TAG152 1
+#define THRUST_PP_BOOL_IMPL_TAG153 1
+#define THRUST_PP_BOOL_IMPL_TAG154 1
+#define THRUST_PP_BOOL_IMPL_TAG155 1
+#define THRUST_PP_BOOL_IMPL_TAG156 1
+#define THRUST_PP_BOOL_IMPL_TAG157 1
+#define THRUST_PP_BOOL_IMPL_TAG158 1
+#define THRUST_PP_BOOL_IMPL_TAG159 1
+#define THRUST_PP_BOOL_IMPL_TAG160 1
+#define THRUST_PP_BOOL_IMPL_TAG161 1
+#define THRUST_PP_BOOL_IMPL_TAG162 1
+#define THRUST_PP_BOOL_IMPL_TAG163 1
+#define THRUST_PP_BOOL_IMPL_TAG164 1
+#define THRUST_PP_BOOL_IMPL_TAG165 1
+#define THRUST_PP_BOOL_IMPL_TAG166 1
+#define THRUST_PP_BOOL_IMPL_TAG167 1
+#define THRUST_PP_BOOL_IMPL_TAG168 1
+#define THRUST_PP_BOOL_IMPL_TAG169 1
+#define THRUST_PP_BOOL_IMPL_TAG170 1
+#define THRUST_PP_BOOL_IMPL_TAG171 1
+#define THRUST_PP_BOOL_IMPL_TAG172 1
+#define THRUST_PP_BOOL_IMPL_TAG173 1
+#define THRUST_PP_BOOL_IMPL_TAG174 1
+#define THRUST_PP_BOOL_IMPL_TAG175 1
+#define THRUST_PP_BOOL_IMPL_TAG176 1
+#define THRUST_PP_BOOL_IMPL_TAG177 1
+#define THRUST_PP_BOOL_IMPL_TAG178 1
+#define THRUST_PP_BOOL_IMPL_TAG179 1
+#define THRUST_PP_BOOL_IMPL_TAG180 1
+#define THRUST_PP_BOOL_IMPL_TAG181 1
+#define THRUST_PP_BOOL_IMPL_TAG182 1
+#define THRUST_PP_BOOL_IMPL_TAG183 1
+#define THRUST_PP_BOOL_IMPL_TAG184 1
+#define THRUST_PP_BOOL_IMPL_TAG185 1
+#define THRUST_PP_BOOL_IMPL_TAG186 1
+#define THRUST_PP_BOOL_IMPL_TAG187 1
+#define THRUST_PP_BOOL_IMPL_TAG188 1
+#define THRUST_PP_BOOL_IMPL_TAG189 1
+#define THRUST_PP_BOOL_IMPL_TAG190 1
+#define THRUST_PP_BOOL_IMPL_TAG191 1
+#define THRUST_PP_BOOL_IMPL_TAG192 1
+#define THRUST_PP_BOOL_IMPL_TAG193 1
+#define THRUST_PP_BOOL_IMPL_TAG194 1
+#define THRUST_PP_BOOL_IMPL_TAG195 1
+#define THRUST_PP_BOOL_IMPL_TAG196 1
+#define THRUST_PP_BOOL_IMPL_TAG197 1
+#define THRUST_PP_BOOL_IMPL_TAG198 1
+#define THRUST_PP_BOOL_IMPL_TAG199 1
+#define THRUST_PP_BOOL_IMPL_TAG200 1
+#define THRUST_PP_BOOL_IMPL_TAG201 1
+#define THRUST_PP_BOOL_IMPL_TAG202 1
+#define THRUST_PP_BOOL_IMPL_TAG203 1
+#define THRUST_PP_BOOL_IMPL_TAG204 1
+#define THRUST_PP_BOOL_IMPL_TAG205 1
+#define THRUST_PP_BOOL_IMPL_TAG206 1
+#define THRUST_PP_BOOL_IMPL_TAG207 1
+#define THRUST_PP_BOOL_IMPL_TAG208 1
+#define THRUST_PP_BOOL_IMPL_TAG209 1
+#define THRUST_PP_BOOL_IMPL_TAG210 1
+#define THRUST_PP_BOOL_IMPL_TAG211 1
+#define THRUST_PP_BOOL_IMPL_TAG212 1
+#define THRUST_PP_BOOL_IMPL_TAG213 1
+#define THRUST_PP_BOOL_IMPL_TAG214 1
+#define THRUST_PP_BOOL_IMPL_TAG215 1
+#define THRUST_PP_BOOL_IMPL_TAG216 1
+#define THRUST_PP_BOOL_IMPL_TAG217 1
+#define THRUST_PP_BOOL_IMPL_TAG218 1
+#define THRUST_PP_BOOL_IMPL_TAG219 1
+#define THRUST_PP_BOOL_IMPL_TAG220 1
+#define THRUST_PP_BOOL_IMPL_TAG221 1
+#define THRUST_PP_BOOL_IMPL_TAG222 1
+#define THRUST_PP_BOOL_IMPL_TAG223 1
+#define THRUST_PP_BOOL_IMPL_TAG224 1
+#define THRUST_PP_BOOL_IMPL_TAG225 1
+#define THRUST_PP_BOOL_IMPL_TAG226 1
+#define THRUST_PP_BOOL_IMPL_TAG227 1
+#define THRUST_PP_BOOL_IMPL_TAG228 1
+#define THRUST_PP_BOOL_IMPL_TAG229 1
+#define THRUST_PP_BOOL_IMPL_TAG230 1
+#define THRUST_PP_BOOL_IMPL_TAG231 1
+#define THRUST_PP_BOOL_IMPL_TAG232 1
+#define THRUST_PP_BOOL_IMPL_TAG233 1
+#define THRUST_PP_BOOL_IMPL_TAG234 1
+#define THRUST_PP_BOOL_IMPL_TAG235 1
+#define THRUST_PP_BOOL_IMPL_TAG236 1
+#define THRUST_PP_BOOL_IMPL_TAG237 1
+#define THRUST_PP_BOOL_IMPL_TAG238 1
+#define THRUST_PP_BOOL_IMPL_TAG239 1
+#define THRUST_PP_BOOL_IMPL_TAG240 1
+#define THRUST_PP_BOOL_IMPL_TAG241 1
+#define THRUST_PP_BOOL_IMPL_TAG242 1
+#define THRUST_PP_BOOL_IMPL_TAG243 1
+#define THRUST_PP_BOOL_IMPL_TAG244 1
+#define THRUST_PP_BOOL_IMPL_TAG245 1
+#define THRUST_PP_BOOL_IMPL_TAG246 1
+#define THRUST_PP_BOOL_IMPL_TAG247 1
+#define THRUST_PP_BOOL_IMPL_TAG248 1
+#define THRUST_PP_BOOL_IMPL_TAG249 1
+#define THRUST_PP_BOOL_IMPL_TAG250 1
+#define THRUST_PP_BOOL_IMPL_TAG251 1
+#define THRUST_PP_BOOL_IMPL_TAG252 1
+#define THRUST_PP_BOOL_IMPL_TAG253 1
+#define THRUST_PP_BOOL_IMPL_TAG254 1
+#define THRUST_PP_BOOL_IMPL_TAG255 1
+#define THRUST_PP_BOOL_IMPL_TAG256 1
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_IIF(bit, t, f) THRUST_PP_IIF_IMPL0(bit, t, f)
+
+#if defined(_MSC_VER)
+  #define THRUST_PP_IIF_IMPL0(bit, t, f)                                      \
+    THRUST_PP_IIF_IMPL1(THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f)))    \
+    /**/
+  #define THRUST_PP_IIF_IMPL1(id) id
+#else
+  #define THRUST_PP_IIF_IMPL0(bit, t, f)                                      \
+    THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f))                         
+    /**/
+#endif
+
+#define THRUST_PP_IIF_IMPL_TAG0(t, f) f
+#define THRUST_PP_IIF_IMPL_TAG1(t, f) t
+
+#if defined(__EDG__)
+  #define THRUST_PP_IF(cond, t, f) THRUST_PP_IF_IMPL0(cond, t, f)
+  #define THRUST_PP_IF_IMPL0(cond, t, f)                                      \
+    THRUST_PP_IIF(THRUST_PP_BOOL(cond), t, f)                                 \
+    /**/
+#else
+  #define THRUST_PP_IF(cond, t, f) THRUST_PP_IIF(THRUST_PP_BOOL(cond), t, f)
+#endif
+
+/// \def THRUST_COMMA_IF(cond)
+/// \brief If \a cond is true, expands to a comma. Otherwise, expands to nothing.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_STRINGIZE(THRUST_COMMA_IF(0)) << "\n"
+///             << THRUST_PP_STRINGIZE(THRUST_COMMA_IF(1)) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << ""  << "\n"
+///             << "," << "\n";
+/// }
+/// \endcode
+///
+#if defined(__EDG__)
+  #define THRUST_PP_COMMA_IF(cond) THRUST_PP_COMMA_IF_IMPL0(cond)
+  #define THRUST_PP_COMMA_IF_IMPL0(cond)                                      \
+    THRUST_PP_IF(cond, THRUST_PP_COMMA, THRUST_PP_EMPTY)()                    \
+    /**/
+#else
+  #define THRUST_PP_COMMA_IF(cond)                                            \
+    THRUST_PP_IF(cond, THRUST_PP_COMMA, THRUST_PP_EMPTY)()                    \
+    /**/
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+// http://gustedt.wordpress.com/2010/06/08/detect-empty-macro-arguments
+
+#define THRUST_PP_64TH_ARG(                                                   \
+     _1, _2, _3, _4, _5, _6, _7, _8, _9,_10,_11,_12,_13,_14,_15,_16           \
+  , _17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32           \
+  , _33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48           \
+  , _49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,  N           \
+  , ...                                                                       \
+  ) N                                                                         \
+  /**/
+
+#define THRUST_PP_HAS_COMMA(...)                                              \
+  THRUST_PP_EXPAND(THRUST_PP_64TH_ARG(                                        \
+    __VA_ARGS__                                                               \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0                                             \
+  ))                                                                          \
+  /**/
+
+#define THRUST_PP_TRIGGER_PAREN(...) ,
+
+#define THRUST_PP_IS_VARIADIC_NULLARY(...)                                    \
+  THRUST_PP_IS_VARIADIC_NULLARY_IMPL0(                                        \
+    /* Test if there is just one argument, eventually an empty one. */        \
+    THRUST_PP_HAS_COMMA(__VA_ARGS__),                                         \
+    /* Test if THRUST_PP_TRIGGER_PAREN together with the argument adds a */   \
+    /* comma. */                                                              \
+    THRUST_PP_HAS_COMMA(THRUST_PP_TRIGGER_PAREN __VA_ARGS__),                 \
+    /* Test if the argument together with a parenthesis adds a comma. */      \
+    THRUST_PP_HAS_COMMA(__VA_ARGS__ (/*empty*/)),                             \
+    /* Test if placing it between THRUST_PP_TRIGGER_PAREN and the */          \
+    /* parenthesis adds a comma. */                                           \
+    THRUST_PP_HAS_COMMA(THRUST_PP_TRIGGER_PAREN __VA_ARGS__ (/*empty*/))      \
+  )                                                                           \
+  /**/
+
+#define THRUST_PP_IS_VARIADIC_NULLARY_IMPL0(_0, _1, _2, _3)                   \
+  THRUST_PP_HAS_COMMA(                                                        \
+    THRUST_PP_CAT5(THRUST_PP_IS_VARIADIC_NULLARY_IMPL_TAG, _0, _1, _2, _3)    \
+  )                                                                           \
+
+#define THRUST_PP_IS_VARIADIC_NULLARY_IMPL_TAG0001 ,
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_ARITY(...)
+/// \brief Returns the number of arguments that it was called with. Must be
+///        called with less than 64 arguments.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_ARITY()        << "\n"
+///             << THRUST_PP_ARITY(x)       << "\n"
+///             << THRUST_PP_ARITY(x, y)    << "\n"
+///             << THRUST_PP_ARITY(x, y, z) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 0 << "\n"
+///             << 1 << "\n"
+///             << 2 << "\n"
+///             << 3 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_ARITY(...)                                                  \
+  THRUST_PP_EXPAND(                                                           \
+    THRUST_PP_IF(                                                             \
+      THRUST_PP_IS_VARIADIC_NULLARY(__VA_ARGS__)                              \
+    , 0                                                                       \
+    , THRUST_PP_64TH_ARG(                                                     \
+        __VA_ARGS__                                                           \
+      , 63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48                       \
+      , 47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32                       \
+      , 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16                       \
+      , 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0                       \
+      )                                                                       \
+    )                                                                         \
+  )                                                                           \
+  /**/
+
+/// \def THRUST_PP_DISPATCH(basename, ...)
+/// \brief Expands to <code>basenameN(...)</code>, where <code>N</code> is the
+///        number of variadic arguments that \a THRUST_PP_DISPATCH was called 
+///        with. This macro can be used to implement "macro overloading".
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// #define PLUS(...) THRUST_PP_DISPATCH(PLUS, __VA_ARGS__)
+/// #define PLUS0()        0
+/// #define PLUS1(x)       x
+/// #define PLUS2(x, y)    x + y
+/// #define PLUS3(x, y, z) x + y + z
+///
+/// int main()
+/// {
+///   std::cout << PLUS()        << "\n"
+///             << PLUS(1)       << "\n"
+///             << PLUS(1, 2)    << "\n"
+///             << PLUS(1, 2, 3) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 0         << "\n"
+///             << 1         << "\n"
+///             << 1 + 2     << "\n"
+///             << 1 + 2 + 3 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_DISPATCH(basename, ...)                                     \
+  THRUST_PP_EXPAND(                                                           \
+    THRUST_PP_CAT2(                                                           \
+      basename,                                                               \
+      THRUST_PP_ARITY(__VA_ARGS__)                                            \
+    )(__VA_ARGS__)                                                            \
+  )                                                                           \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_CURRENT_FUNCTION
+/// \brief The name of the current function as a string.
+///
+#if    defined(__GNUC__)                                                      \
+    || (defined(__MWERKS__) && (__MWERKS__ >= 0x3000))                        \
+    || (defined(__ICC) && (__ICC >= 600)) || defined(__ghs__)
+  #define THRUST_CURRENT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(__DMC__) && (__DMC__ >= 0x810)
+  #define THRUST_CURRENT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(__FUNCSIG__)
+  #define THRUST_CURRENT_FUNCTION __FUNCSIG__
+#elif    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600))             \
+      || (defined(__IBMCTHRUST_PP__) && (__IBMCTHRUST_PP__ >= 500))
+  #define THRUST_CURRENT_FUNCTION __FUNCTION__
+#elif defined(__BORLANDC__) && (__BORLANDC__ >= 0x550)
+  #define THRUST_CURRENT_FUNCTION __FUNC__
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)
+  #define THRUST_CURRENT_FUNCTION __func__
+#elif defined(__cplusplus) && (__cplusplus >= 201103)
+  #define THRUST_CURRENT_FUNCTION __func__
+#else
+  #define THRUST_CURRENT_FUNCTION "(unknown)"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/thrust/thrust/detail/range/head_flags.h b/thrust/thrust/detail/range/head_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..b193651cfdafdee3bae159fd59ccfd7ff8763f63
--- /dev/null
+++ b/thrust/thrust/detail/range/head_flags.h
@@ -0,0 +1,230 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+#include <thrust/functional.h>
+
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename RandomAccessIterator,
+         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
+         typename ValueType = bool,
+         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
+  class head_flags_with_init
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type init_type;
+
+  // XXX WAR cudafe issue
+  //private:
+  public:
+    struct head_flag_functor
+    {
+      BinaryPredicate binary_pred; // this must be the first member for performance reasons
+      init_type init;
+      IndexType n;
+
+      typedef ValueType result_type;
+
+      __host__ __device__
+      head_flag_functor(init_type init, IndexType n)
+        : binary_pred(), init(init), n(n)
+      {}
+
+      __host__ __device__
+      head_flag_functor(init_type init, IndexType n, BinaryPredicate binary_pred)
+        : binary_pred(binary_pred), init(init), n(n)
+      {}
+
+      template<typename Tuple>
+      __host__ __device__ __thrust_forceinline__
+      result_type operator()(const Tuple &t)
+      {
+        const IndexType i = thrust::get<0>(t);
+
+        if(i == 0)
+        {
+          return !binary_pred(init, thrust::get<1>(t));
+        }
+
+        return !binary_pred(thrust::get<1>(t), thrust::get<2>(t));
+      }
+    };
+
+    typedef thrust::counting_iterator<IndexType> counting_iterator;
+
+  public:
+    typedef thrust::transform_iterator<
+      head_flag_functor,
+      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
+    > iterator;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init)
+      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
+                                                head_flag_functor(init, last - first))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init, BinaryPredicate binary_pred)
+      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
+                                                head_flag_functor(init, last - first, binary_pred))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __host__ __device__
+    iterator begin() const
+    {
+      return m_begin;
+    }
+
+    __host__ __device__
+    iterator end() const
+    {
+      return m_end;
+    }
+
+    template<typename OtherIndex>
+    __host__ __device__
+    typename iterator::reference operator[](OtherIndex i)
+    {
+      return *(begin() + i);
+    }
+
+  private:
+    iterator m_begin, m_end;
+};
+
+
+
+template<typename RandomAccessIterator,
+         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
+         typename ValueType = bool,
+         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
+  class head_flags
+{
+  // XXX WAR cudafe issue
+  //private:
+  public:
+    struct head_flag_functor
+    {
+      BinaryPredicate binary_pred; // this must be the first member for performance reasons
+      IndexType n;
+
+      typedef ValueType result_type;
+
+      __host__ __device__
+      head_flag_functor(IndexType n)
+        : binary_pred(), n(n)
+      {}
+
+      __host__ __device__
+      head_flag_functor(IndexType n, BinaryPredicate binary_pred)
+        : binary_pred(binary_pred), n(n)
+      {}
+
+      template<typename Tuple>
+      __host__ __device__ __thrust_forceinline__
+      result_type operator()(const Tuple &t)
+      {
+        const IndexType i = thrust::get<0>(t);
+
+        // note that we do not dereference the tuple's 2nd element when i <= 0
+        // and therefore do not dereference a bad location at the boundary
+        return (i == 0 || !binary_pred(thrust::get<1>(t), thrust::get<2>(t)));
+      }
+    };
+
+    typedef thrust::counting_iterator<IndexType> counting_iterator;
+
+  public:
+    typedef thrust::transform_iterator<
+      head_flag_functor,
+      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
+    > iterator;
+
+    __host__ __device__
+    head_flags(RandomAccessIterator first, RandomAccessIterator last)
+      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
+                                                head_flag_functor(last - first))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __host__ __device__
+    head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
+      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
+                                                head_flag_functor(last - first, binary_pred))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __host__ __device__
+    iterator begin() const
+    {
+      return m_begin;
+    }
+
+    __host__ __device__
+    iterator end() const
+    {
+      return m_end;
+    }
+
+    template<typename OtherIndex>
+    __host__ __device__
+    typename iterator::reference operator[](OtherIndex i)
+    {
+      return *(begin() + i);
+    }
+
+  private:
+    iterator m_begin, m_end;
+};
+
+
+template<typename RandomAccessIterator, typename BinaryPredicate>
+__host__ __device__
+head_flags<RandomAccessIterator, BinaryPredicate>
+  make_head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
+{
+  return head_flags<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
+}
+
+
+template<typename RandomAccessIterator>
+__host__ __device__
+head_flags<RandomAccessIterator>
+  make_head_flags(RandomAccessIterator first, RandomAccessIterator last)
+{
+  return head_flags<RandomAccessIterator>(first, last);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/range/tail_flags.h b/thrust/thrust/detail/range/tail_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..32ccb53c6a36c2ce1ce75a3a9475729f652e1d75
--- /dev/null
+++ b/thrust/thrust/detail/range/tail_flags.h
@@ -0,0 +1,134 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename RandomAccessIterator,
+         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
+         typename ValueType = bool,
+         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
+  class tail_flags
+{
+  // XXX WAR cudafe bug
+  //private:
+  public:
+    struct tail_flag_functor
+    {
+      BinaryPredicate binary_pred; // this must be the first member for performance reasons
+      RandomAccessIterator iter;
+      IndexType n;
+
+      typedef ValueType result_type;
+
+      __host__ __device__
+      tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last)
+        : binary_pred(), iter(first), n(last - first)
+      {}
+
+      __host__ __device__
+      tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
+        : binary_pred(binary_pred), iter(first), n(last - first)
+      {}
+
+      __host__ __device__ __thrust_forceinline__
+      result_type operator()(const IndexType &i)
+      {
+        return (i == (n - 1) || !binary_pred(iter[i], iter[i+1]));
+      }
+    };
+
+    typedef thrust::counting_iterator<IndexType> counting_iterator;
+
+  public:
+    typedef thrust::transform_iterator<
+      tail_flag_functor,
+      counting_iterator
+    > iterator;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    tail_flags(RandomAccessIterator first, RandomAccessIterator last)
+      : m_begin(thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0),
+                                                tail_flag_functor(first, last))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
+      : m_begin(thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0),
+                                                tail_flag_functor(first, last, binary_pred))),
+        m_end(m_begin + (last - first))
+    {}
+
+    __host__ __device__
+    iterator begin() const
+    {
+      return m_begin;
+    }
+
+    __host__ __device__
+    iterator end() const
+    {
+      return m_end;
+    }
+
+    template<typename OtherIndex>
+    __host__ __device__
+    typename iterator::reference operator[](OtherIndex i)
+    {
+      return *(begin() + i);
+    }
+
+  private:
+    iterator m_begin, m_end;
+};
+
+
+template<typename RandomAccessIterator, typename BinaryPredicate>
+__host__ __device__
+tail_flags<RandomAccessIterator, BinaryPredicate>
+  make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
+{
+  return tail_flags<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
+}
+
+
+template<typename RandomAccessIterator>
+__host__ __device__
+tail_flags<RandomAccessIterator>
+  make_tail_flags(RandomAccessIterator first, RandomAccessIterator last)
+{
+  return tail_flags<RandomAccessIterator>(first, last);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/raw_pointer_cast.h b/thrust/thrust/detail/raw_pointer_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..33f87849d79d7ba48127b17bfa8e2d038127f18a
--- /dev/null
+++ b/thrust/thrust/detail/raw_pointer_cast.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+namespace thrust
+{
+
+template<typename Pointer>
+__host__ __device__
+typename thrust::detail::pointer_traits<Pointer>::raw_pointer
+raw_pointer_cast(Pointer ptr)
+{
+  return thrust::detail::pointer_traits<Pointer>::get(ptr);
+}
+
+template <typename ToPointer, typename FromPointer>
+__host__ __device__
+ToPointer
+reinterpret_pointer_cast(FromPointer ptr)
+{
+  typedef typename thrust::detail::pointer_element<ToPointer>::type to_element;
+  return ToPointer(reinterpret_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
+}
+
+template <typename ToPointer, typename FromPointer>
+__host__ __device__
+ToPointer
+static_pointer_cast(FromPointer ptr)
+{
+  typedef typename thrust::detail::pointer_element<ToPointer>::type to_element;
+  return ToPointer(static_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
+}
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/raw_reference_cast.h b/thrust/thrust/detail/raw_reference_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..a678144e2256b43baab945f54bdf82871241e0ad
--- /dev/null
+++ b/thrust/thrust/detail/raw_reference_cast.h
@@ -0,0 +1,398 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/tuple_transform.h>
+#include <thrust/iterator/detail/tuple_of_iterator_references.h>
+
+
+// the order of declarations and definitions in this file is totally goofy
+// this header defines raw_reference_cast, which has a few overloads towards the bottom of the file
+// raw_reference_cast depends on metafunctions such as is_unwrappable and raw_reference
+// we need to be sure that these metafunctions are completely defined (including specializations) before they are instantiated by raw_reference_cast
+
+namespace thrust
+{
+namespace detail
+{
+
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(is_wrapped_reference, wrapped_reference_hint)
+
+
+// wrapped reference-like things which aren't strictly wrapped references
+// (e.g. tuples of wrapped references) are considered unwrappable
+template<typename T>
+  struct is_unwrappable
+    : is_wrapped_reference<T>
+{};
+
+
+// specialize is_unwrappable
+// a tuple is_unwrappable if any of its elements is_unwrappable
+template<
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct is_unwrappable<
+    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+    : or_<
+        is_unwrappable<T0>,
+        is_unwrappable<T1>,
+        is_unwrappable<T2>,
+        is_unwrappable<T3>,
+        is_unwrappable<T4>,
+        is_unwrappable<T5>,
+        is_unwrappable<T6>,
+        is_unwrappable<T7>,
+        is_unwrappable<T8>,
+        is_unwrappable<T9>
+      >
+{};
+
+
+// specialize is_unwrappable
+// a tuple_of_iterator_references is_unwrappable if any of its elements is_unwrappable
+template<
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct is_unwrappable<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+    : or_<
+        is_unwrappable<T0>,
+        is_unwrappable<T1>,
+        is_unwrappable<T2>,
+        is_unwrappable<T3>,
+        is_unwrappable<T4>,
+        is_unwrappable<T5>,
+        is_unwrappable<T6>,
+        is_unwrappable<T7>,
+        is_unwrappable<T8>,
+        is_unwrappable<T9>
+      >
+{};
+
+
+template<typename T, typename Result = void>
+  struct enable_if_unwrappable
+    : enable_if<
+        is_unwrappable<T>::value,
+        Result
+      >
+{};
+
+
+namespace raw_reference_detail
+{
+
+
+template<typename T, typename Enable = void>
+  struct raw_reference_impl
+    : add_reference<T>
+{};
+
+
+template<typename T>
+  struct raw_reference_impl<
+    T,
+    typename thrust::detail::enable_if<
+      is_wrapped_reference<
+        typename remove_cv<T>::type
+      >::value
+    >::type
+  >
+{
+  typedef typename add_reference<
+    typename pointer_element<typename T::pointer>::type
+  >::type type;
+};
+
+
+} // end raw_reference_detail
+
+
+template<typename T>
+  struct raw_reference : 
+    raw_reference_detail::raw_reference_impl<T>
+{};
+
+
+namespace raw_reference_detail
+{
+
+// unlike raw_reference,
+// raw_reference_tuple_helper needs to return a value
+// when it encounters one, rather than a reference
+// upon encountering tuple, recurse
+//
+// we want the following behavior:
+//  1. T                                -> T
+//  2. T&                               -> T&
+//  3. null_type                        -> null_type
+//  4. reference<T>                     -> T&
+//  5. tuple_of_iterator_references<T>  -> tuple_of_iterator_references<raw_reference_tuple_helper<T>::type>
+
+
+// wrapped references are unwrapped using raw_reference, otherwise, return T
+template<typename T>
+  struct raw_reference_tuple_helper
+    : eval_if<
+        is_unwrappable<
+          typename remove_cv<T>::type
+        >::value,
+        raw_reference<T>,
+        identity_<T>
+      >
+{};
+
+
+// recurse on tuples
+template <
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct raw_reference_tuple_helper<
+    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+{
+  typedef thrust::tuple<
+    typename raw_reference_tuple_helper<T0>::type,
+    typename raw_reference_tuple_helper<T1>::type,
+    typename raw_reference_tuple_helper<T2>::type,
+    typename raw_reference_tuple_helper<T3>::type,
+    typename raw_reference_tuple_helper<T4>::type,
+    typename raw_reference_tuple_helper<T5>::type,
+    typename raw_reference_tuple_helper<T6>::type,
+    typename raw_reference_tuple_helper<T7>::type,
+    typename raw_reference_tuple_helper<T8>::type,
+    typename raw_reference_tuple_helper<T9>::type
+  > type;
+};
+
+
+template <
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct raw_reference_tuple_helper<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+{
+  typedef thrust::detail::tuple_of_iterator_references<
+    typename raw_reference_tuple_helper<T0>::type,
+    typename raw_reference_tuple_helper<T1>::type,
+    typename raw_reference_tuple_helper<T2>::type,
+    typename raw_reference_tuple_helper<T3>::type,
+    typename raw_reference_tuple_helper<T4>::type,
+    typename raw_reference_tuple_helper<T5>::type,
+    typename raw_reference_tuple_helper<T6>::type,
+    typename raw_reference_tuple_helper<T7>::type,
+    typename raw_reference_tuple_helper<T8>::type,
+    typename raw_reference_tuple_helper<T9>::type
+  > type;
+};
+
+
+} // end raw_reference_detail
+
+
+// a couple of specializations of raw_reference for tuples follow
+
+
+// if a tuple "tuple_type" is_unwrappable,
+//   then the raw_reference of tuple_type is a tuple of its members' raw_references
+//   else the raw_reference of tuple_type is tuple_type &
+template <
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct raw_reference<
+    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+{
+  private:
+    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+
+  public:
+    typedef typename eval_if<
+      is_unwrappable<tuple_type>::value,
+      raw_reference_detail::raw_reference_tuple_helper<tuple_type>,
+      add_reference<tuple_type>
+    >::type type;
+};
+
+
+template <
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  struct raw_reference<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >
+{
+  private:
+    typedef detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+
+  public:
+    typedef typename raw_reference_detail::raw_reference_tuple_helper<tuple_type>::type type;
+
+    // XXX figure out why is_unwrappable seems to be broken for tuple_of_iterator_references
+    //typedef typename eval_if<
+    //  is_unwrappable<tuple_type>::value,
+    //  raw_reference_detail::raw_reference_tuple_helper<tuple_type>,
+    //  add_reference<tuple_type>
+    //>::type type;
+};
+
+
+} // end detail
+
+
+// provide declarations of raw_reference_cast's overloads for raw_reference_caster below
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<T>::type
+  raw_reference_cast(T &ref);
+
+
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<const T>::type
+  raw_reference_cast(const T &ref);
+
+
+template<
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+__host__ __device__
+typename detail::enable_if_unwrappable<
+  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  typename detail::raw_reference<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >::type
+>::type
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t);
+
+
+namespace detail
+{
+
+
+struct raw_reference_caster
+{
+  template<typename T>
+  __host__ __device__
+  typename detail::raw_reference<T>::type operator()(T &ref)
+  {
+    return thrust::raw_reference_cast(ref);
+  }
+
+  template<typename T>
+  __host__ __device__
+  typename detail::raw_reference<const T>::type operator()(const T &ref)
+  {
+    return thrust::raw_reference_cast(ref);
+  }
+
+  template<
+    typename T0, typename T1, typename T2,
+    typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8,
+    typename T9
+  >
+  __host__ __device__
+  typename detail::raw_reference<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >::type
+  operator()(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t,
+             typename enable_if<
+               is_unwrappable<thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> >::value
+             >::type * = 0)
+  {
+    return thrust::raw_reference_cast(t);
+  }
+}; // end raw_reference_caster
+
+
+} // end detail
+
+
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<T>::type
+  raw_reference_cast(T &ref)
+{
+  return *thrust::raw_pointer_cast(&ref);
+} // end raw_reference_cast
+
+
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<const T>::type
+  raw_reference_cast(const T &ref)
+{
+  return *thrust::raw_pointer_cast(&ref);
+} // end raw_reference_cast
+
+
+template<
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+__host__ __device__
+typename detail::enable_if_unwrappable<
+  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  typename detail::raw_reference<
+    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+  >::type
+>::type
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t)
+{
+  thrust::detail::raw_reference_caster f;
+
+  // note that we pass raw_reference_tuple_helper, not raw_reference as the unary metafunction
+  // the different way that raw_reference_tuple_helper unwraps tuples is important
+  return thrust::detail::tuple_host_device_transform<detail::raw_reference_detail::raw_reference_tuple_helper>(t, f);
+} // end raw_reference_cast
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/reduce.inl b/thrust/thrust/detail/reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2ecedc7a2c7e33aed4933c9b950d41b0f6f88d2c
--- /dev/null
+++ b/thrust/thrust/detail/reduce.inl
@@ -0,0 +1,278 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.inl
+ *  \brief Inline file for reduce.h.
+ */
+
+#include <thrust/reduce.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/reduce.h>
+#include <thrust/system/detail/generic/reduce_by_key.h>
+#include <thrust/system/detail/adl/reduce.h>
+#include <thrust/system/detail/adl/reduce_by_key.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::value_type
+    reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last)
+{
+  using thrust::system::detail::generic::reduce;
+  return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end reduce()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+  T reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+           InputIterator first,
+           InputIterator last,
+           T init)
+{
+  using thrust::system::detail::generic::reduce;
+  return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init);
+} // end reduce()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename T,
+         typename BinaryFunction>
+__host__ __device__
+  T reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+           InputIterator first,
+           InputIterator last,
+           T init,
+           BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::reduce;
+  return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, binary_op);
+} // end reduce()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output)
+{
+  using thrust::system::detail::generic::reduce_by_key;
+  return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output);
+} // end reduce_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::reduce_by_key;
+  return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
+} // end reduce_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred,
+                BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::reduce_by_key;
+  return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op);
+} // end reduce_by_key()
+
+
+template<typename InputIterator>
+typename thrust::iterator_traits<InputIterator>::value_type
+  reduce(InputIterator first,
+         InputIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::reduce(select_system(system), first, last);
+}
+
+
+template<typename InputIterator,
+         typename T>
+   T reduce(InputIterator first,
+            InputIterator last,
+            T init)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::reduce(select_system(system), first, last, init);
+}
+
+
+template<typename InputIterator,
+         typename T,
+         typename BinaryFunction>
+   T reduce(InputIterator first,
+            InputIterator last,
+            T init,
+            BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::reduce(select_system(system), first, last, init, binary_op);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred,
+                BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op);
+}
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/reference.h b/thrust/thrust/detail/reference.h
new file mode 100644
index 0000000000000000000000000000000000000000..89bcf63ca7a5d9ba91d242ddaec318a02a832c65
--- /dev/null
+++ b/thrust/thrust/detail/reference.h
@@ -0,0 +1,178 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/use_default.h>
+#include <thrust/detail/reference_forward_declaration.h>
+#include <ostream>
+
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename> struct is_wrapped_reference;
+
+}
+
+// the base type for all of thrust's system-annotated references.
+// for reasonable reference-like semantics, derived types must reimplement the following:
+// 1. constructor from pointer
+// 2. copy constructor
+// 3. templated copy constructor from other reference
+// 4. templated assignment from other reference
+// 5. assignment from value_type
+template<typename Element, typename Pointer, typename Derived>
+  class reference
+{
+  private:
+    typedef typename thrust::detail::eval_if<
+      thrust::detail::is_same<Derived,use_default>::value,
+      thrust::detail::identity_<reference>,
+      thrust::detail::identity_<Derived>
+    >::type derived_type;
+
+    // hint for is_wrapped_reference lets it know that this type (or a derived type)
+    // is a wrapped reference
+    struct wrapped_reference_hint {};
+    template<typename> friend struct thrust::detail::is_wrapped_reference;
+
+  public:
+    typedef Pointer                                              pointer;
+    typedef typename thrust::detail::remove_const<Element>::type value_type;
+
+    __host__ __device__
+    explicit reference(const pointer &ptr);
+
+#if THRUST_CPP_DIALECT >= 2011
+    reference(const reference &) = default;
+#endif
+
+    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
+              typename thrust::detail::enable_if_convertible<
+                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
+                pointer
+              >::type * = 0);
+
+    __host__ __device__
+    derived_type &operator=(const reference &other);
+
+    // XXX this may need an enable_if
+    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
+
+    __host__ __device__
+    derived_type &operator=(const value_type &x);
+
+    __host__ __device__
+    pointer operator&() const;
+
+    __host__ __device__
+    operator value_type () const;
+
+    __host__ __device__
+    void swap(derived_type &other);
+
+    derived_type &operator++();
+
+    value_type operator++(int);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator+=(const value_type &rhs);
+
+    derived_type &operator--();
+
+    value_type operator--(int);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator-=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator*=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator/=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator%=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator<<=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator>>=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator&=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator|=(const value_type &rhs);
+
+    // XXX parameterize the type of rhs
+    derived_type &operator^=(const value_type &rhs);
+
+  private:
+    const pointer m_ptr;
+
+    // allow access to m_ptr for other references
+    template <typename OtherElement, typename OtherPointer, typename OtherDerived> friend class reference;
+
+    template<typename System>
+    __host__ __device__
+    inline value_type strip_const_get_value(const System &system) const;
+
+    template<typename OtherPointer>
+    __host__ __device__
+    inline void assign_from(OtherPointer src);
+
+    // XXX this helper exists only to avoid warnings about null references from the other assign_from
+    template<typename System1, typename System2, typename OtherPointer>
+    inline __host__ __device__
+    void assign_from(System1 *system1, System2 *system2, OtherPointer src);
+
+    template<typename System, typename OtherPointer>
+    __host__ __device__
+    inline void strip_const_assign_value(const System &system, OtherPointer src);
+
+    // XXX this helper exists only to avoid warnings about null references from the other swap
+    template<typename System>
+    inline __host__ __device__
+    void swap(System *system, derived_type &other);
+
+    // XXX this helper exists only to avoid warnings about null references from operator value_type ()
+    template<typename System>
+    inline __host__ __device__
+    value_type convert_to_value_type(System *system) const;
+}; // end reference
+
+// Output stream operator
+template<typename Element, typename Pointer, typename Derived,
+         typename charT, typename traits>
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os,
+           const reference<Element, Pointer, Derived> &y);
+
+} // end thrust
+
+#include <thrust/detail/reference.inl>
+
diff --git a/thrust/thrust/detail/reference.inl b/thrust/thrust/detail/reference.inl
new file mode 100644
index 0000000000000000000000000000000000000000..91f2b9736887df35b912acd7cc7398048b68c26e
--- /dev/null
+++ b/thrust/thrust/detail/reference.inl
@@ -0,0 +1,382 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/reference.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/memory.h>
+#include <thrust/system/detail/adl/get_value.h>
+#include <thrust/system/detail/adl/assign_value.h>
+#include <thrust/system/detail/adl/iter_swap.h>
+
+
+namespace thrust
+{
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    reference<Element,Pointer,Derived>
+      ::reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
+                  typename thrust::detail::enable_if_convertible<
+                    typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
+                    pointer
+                  >::type *)
+        : m_ptr(other.m_ptr)
+{}
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  reference<Element,Pointer,Derived>
+    ::reference(const pointer &ptr)
+      : m_ptr(ptr)
+{}
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  typename reference<Element,Pointer,Derived>::pointer
+    reference<Element,Pointer,Derived>
+      ::operator&() const
+{
+  return m_ptr;
+} // end reference::operator&()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator=(const value_type &v)
+{
+  assign_from(&v);
+  return static_cast<derived_type&>(*this);
+} // end reference::operator=()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator=(const reference &other)
+{
+  assign_from(&other); 
+  return static_cast<derived_type&>(*this);
+} // end reference::operator=()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    typename reference<Element,Pointer,Derived>::derived_type &
+      reference<Element,Pointer,Derived>
+        ::operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other)
+{
+  assign_from(&other);
+  return static_cast<derived_type&>(*this);
+} // end reference::operator=()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename System>
+    __host__ __device__
+    typename reference<Element,Pointer,Derived>::value_type
+      reference<Element,Pointer,Derived>
+        ::convert_to_value_type(System *system) const
+{
+  using thrust::system::detail::generic::select_system;
+  return strip_const_get_value(select_system(*system));
+} // end convert_to_value_type()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  reference<Element,Pointer,Derived>
+    ::operator typename reference<Element,Pointer,Derived>::value_type () const
+{
+  typedef typename thrust::iterator_system<pointer>::type System;
+
+  // XXX avoid default-constructing a system
+  // XXX use null a reference for dispatching
+  // XXX this assumes that the eventual invocation of
+  // XXX get_value will not access system state
+  System *system = 0;
+
+  return convert_to_value_type(system);
+} // end reference::operator value_type ()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename System>
+    __host__ __device__
+    typename reference<Element,Pointer,Derived>::value_type
+      reference<Element,Pointer,Derived>
+        ::strip_const_get_value(const System &system) const
+{
+  System &non_const_system = const_cast<System&>(system);
+
+  using thrust::system::detail::generic::get_value;
+
+  return get_value(thrust::detail::derived_cast(non_const_system), m_ptr);
+} // end reference::strip_const_get_value()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename System1, typename System2, typename OtherPointer>
+    __host__ __device__
+    void reference<Element,Pointer,Derived>
+      ::assign_from(System1 *system1, System2 *system2, OtherPointer src)
+{
+  using thrust::system::detail::generic::select_system;
+
+  strip_const_assign_value(select_system(*system1, *system2), src);
+} // end assign_from()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename OtherPointer>
+    __host__ __device__
+    void reference<Element,Pointer,Derived>
+      ::assign_from(OtherPointer src)
+{
+  typedef typename thrust::iterator_system<pointer>::type      System1;
+  typedef typename thrust::iterator_system<OtherPointer>::type System2;
+
+  // XXX avoid default-constructing a system
+  // XXX use null references for dispatching
+  // XXX this assumes that the eventual invocation of
+  // XXX assign_value will not access system state
+  System1 *system1 = 0;
+  System2 *system2 = 0;
+
+  assign_from(system1, system2, src);
+} // end assign_from()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename System, typename OtherPointer>
+    __host__ __device__
+    void reference<Element,Pointer,Derived>
+      ::strip_const_assign_value(const System &system, OtherPointer src)
+{
+  System &non_const_system = const_cast<System&>(system);
+
+  using thrust::system::detail::generic::assign_value;
+
+  assign_value(thrust::detail::derived_cast(non_const_system), m_ptr, src);
+} // end strip_const_assign_value()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  template<typename System>
+    __host__ __device__
+    void reference<Element,Pointer,Derived>
+      ::swap(System *system, derived_type &other)
+{
+  using thrust::system::detail::generic::select_system;
+  using thrust::system::detail::generic::iter_swap;
+
+  iter_swap(select_system(*system, *system), m_ptr, other.m_ptr);
+} // end reference::swap()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  __host__ __device__
+  void reference<Element,Pointer,Derived>
+    ::swap(derived_type &other)
+{
+  typedef typename thrust::iterator_system<pointer>::type System;
+
+  // XXX avoid default-constructing a system
+  // XXX use null references for dispatching
+  // XXX this assumes that the eventual invocation
+  // XXX of iter_swap will not access system state
+  System *system = 0;
+
+  swap(system, other);
+} // end reference::swap()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator++(void)
+{
+  value_type temp = *this;
+  ++temp;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator++()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::value_type
+    reference<Element,Pointer,Derived>
+      ::operator++(int)
+{
+  value_type temp = *this;
+  value_type result = temp++;
+  *this = temp;
+  return result;
+} // end reference::operator++()
+
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator+=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp += rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator+=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator--(void)
+{
+  value_type temp = *this;
+  --temp;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator--()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::value_type
+    reference<Element,Pointer,Derived>
+      ::operator--(int)
+{
+  value_type temp = *this;
+  value_type result = temp--;
+  *this = temp;
+  return result;
+} // end reference::operator--()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator-=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp -= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator-=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator*=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp *= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator*=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator/=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp /= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator/=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator%=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp %= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator%=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator<<=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp <<= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator<<=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator>>=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp >>= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator>>=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator&=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp &= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator&=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator|=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp |= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator|=()
+
+template<typename Element, typename Pointer, typename Derived>
+  typename reference<Element,Pointer,Derived>::derived_type &
+    reference<Element,Pointer,Derived>
+      ::operator^=(const value_type &rhs)
+{
+  value_type temp = *this;
+  temp ^= rhs;
+  *this = temp;
+  return static_cast<derived_type&>(*this);
+} // end reference::operator^=()
+
+template<typename Element, typename Pointer, typename Derived,
+         typename charT, typename traits>
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os,
+           const reference<Element, Pointer, Derived> &y) {
+  typedef typename reference<Element, Pointer, Derived>::value_type value_type;
+  return os << static_cast<value_type>(y);
+} // end operator<<()
+
+} // end thrust
diff --git a/thrust/thrust/detail/reference_forward_declaration.h b/thrust/thrust/detail/reference_forward_declaration.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8912ca43a4ad51bc9461a72416b99b06e53a3c2
--- /dev/null
+++ b/thrust/thrust/detail/reference_forward_declaration.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/use_default.h>
+
+namespace thrust
+{
+
+template<typename Element, typename Pointer, typename Derived = use_default> class reference;
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/remove.inl b/thrust/thrust/detail/remove.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f5951fa91dde208a1484b5d046d079dfd0722bc8
--- /dev/null
+++ b/thrust/thrust/detail/remove.inl
@@ -0,0 +1,250 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file remove.inl
+ *  \brief Inline file for remove.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/remove.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/remove.h>
+#include <thrust/system/detail/adl/remove.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  ForwardIterator remove(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value)
+{
+  using thrust::system::detail::generic::remove;
+  return remove(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
+} // end remove()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator remove_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value)
+{
+  using thrust::system::detail::generic::remove_copy;
+  return remove_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, value);
+} // end remove_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::remove_if;
+  return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred);
+} // end remove_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  using thrust::system::detail::generic::remove_copy_if;
+  return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred);
+} // end remove_copy_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::remove_if;
+  return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred);
+} // end remove_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  using thrust::system::detail::generic::remove_copy_if;
+  return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred);
+} // end remove_copy_if()
+
+
+template<typename ForwardIterator,
+         typename T>
+  ForwardIterator remove(ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::remove(select_system(system), first, last, value);
+} // end remove()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+  OutputIterator remove_copy(InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::remove_copy(select_system(system1,system2), first, last, result, value);
+} // end remove_copy()
+
+
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::remove_if(select_system(system), first, last, pred);
+} // end remove_if()
+
+
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+  typedef typename thrust::iterator_system<InputIterator>::type   System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::remove_if(select_system(system1,system2), first, last, stencil, pred);
+} // end remove_if()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::remove_copy_if(select_system(system1,system2), first, last, result, pred);
+} // end remove_copy_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::remove_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred);
+} // end remove_copy_if()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/replace.inl b/thrust/thrust/detail/replace.inl
new file mode 100644
index 0000000000000000000000000000000000000000..de5bff4d5cfc97a942d951a386ad7fab0af26df2
--- /dev/null
+++ b/thrust/thrust/detail/replace.inl
@@ -0,0 +1,222 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file replace.inl
+ *  \brief Inline file for replace.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/replace.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/replace.h>
+#include <thrust/system/detail/adl/replace.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void replace(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               ForwardIterator first, ForwardIterator last,
+               const T &old_value,
+               const T &new_value)
+{
+  using thrust::system::detail::generic::replace;
+  return replace(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, old_value, new_value);
+} // end replace()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  ForwardIterator first, ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value)
+{
+  using thrust::system::detail::generic::replace_if;
+  return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred, new_value);
+} // end replace_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  ForwardIterator first, ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value)
+{
+  using thrust::system::detail::generic::replace_if;
+  return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred, new_value);
+} // end replace_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename T>
+__host__ __device__
+  OutputIterator replace_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator first, InputIterator last,
+                              OutputIterator result,
+                              const T &old_value,
+                              const T &new_value)
+{
+  using thrust::system::detail::generic::replace_copy;
+  return replace_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, old_value, new_value);
+} // end replace_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                 InputIterator first, InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  using thrust::system::detail::generic::replace_copy_if;
+  return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred, new_value);
+} // end replace_copy_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                 InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  using thrust::system::detail::generic::replace_copy_if;
+  return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred, new_value);
+} // end replace_copy_if()
+
+
+template<typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(InputIterator first, InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::replace_copy_if(select_system(system1,system2), first, last, result, pred, new_value);
+} // end replace_copy_if()
+
+
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::replace_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred, new_value);
+} // end replace_copy_if()
+
+
+template<typename InputIterator, typename OutputIterator, typename T>
+  OutputIterator replace_copy(InputIterator first, InputIterator last,
+                              OutputIterator result,
+                              const T &old_value,
+                              const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::replace_copy(select_system(system1,system2), first, last, result, old_value, new_value);
+} // end replace_copy()
+
+
+template<typename ForwardIterator, typename Predicate, typename T>
+  void replace_if(ForwardIterator first, ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::replace_if(select_system(system), first, last, pred, new_value);
+} // end replace_if()
+
+
+template<typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+  void replace_if(ForwardIterator first, ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System1;
+  typedef typename thrust::iterator_system<InputIterator>::type   System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::replace_if(select_system(system1,system2), first, last, stencil, pred, new_value);
+} // end replace_if()
+
+
+template<typename ForwardIterator, typename T>
+  void replace(ForwardIterator first, ForwardIterator last,
+               const T &old_value,
+               const T &new_value)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::replace(select_system(system), first, last, old_value, new_value);
+} // end replace()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/reverse.inl b/thrust/thrust/detail/reverse.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e8a018cd68b2c9b820e6fc58398f0f4671cc2a94
--- /dev/null
+++ b/thrust/thrust/detail/reverse.inl
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reverse.inl
+ *  \brief Inline file for reverse.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/reverse.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/reverse.h>
+#include <thrust/system/detail/adl/reverse.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename BidirectionalIterator>
+__host__ __device__
+  void reverse(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               BidirectionalIterator first,
+               BidirectionalIterator last)
+{
+  using thrust::system::detail::generic::reverse;
+  return reverse(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end reverse()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename BidirectionalIterator, typename OutputIterator>
+__host__ __device__
+  OutputIterator reverse_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result)
+{
+  using thrust::system::detail::generic::reverse_copy;
+  return reverse_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end reverse_copy()
+
+
+template<typename BidirectionalIterator>
+  void reverse(BidirectionalIterator first,
+               BidirectionalIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<BidirectionalIterator>::type System;
+
+  System system;
+
+  return thrust::reverse(select_system(system), first, last);
+} // end reverse()
+
+
+template<typename BidirectionalIterator,
+         typename OutputIterator>
+  OutputIterator reverse_copy(BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<BidirectionalIterator>::type System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type        System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::reverse_copy(select_system(system1,system2), first, last, result);
+} // end reverse_copy()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/scan.inl b/thrust/thrust/detail/scan.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5329d111846c22f6b1441be01faca3a4f795eab3
--- /dev/null
+++ b/thrust/thrust/detail/scan.inl
@@ -0,0 +1,526 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan.inl
+ *  \brief Inline file for scan.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/scan.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/scan.h>
+#include <thrust/system/detail/generic/scan_by_key.h>
+#include <thrust/system/detail/adl/scan.h>
+#include <thrust/system/detail/adl/scan_by_key.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  using thrust::system::detail::generic::inclusive_scan;
+  return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end inclusive_scan() 
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::inclusive_scan;
+  return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op);
+} // end inclusive_scan()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  using thrust::system::detail::generic::exclusive_scan;
+  return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end exclusive_scan()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init)
+{
+  using thrust::system::detail::generic::exclusive_scan;
+  return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init);
+} // end exclusive_scan()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::exclusive_scan;
+  return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, binary_op);
+} // end exclusive_scan()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  using thrust::system::detail::generic::inclusive_scan_by_key;
+  return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result);
+} // end inclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::inclusive_scan_by_key;
+  return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred);
+} // end inclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::inclusive_scan_by_key;
+  return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred, binary_op);
+} // end inclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  using thrust::system::detail::generic::exclusive_scan_by_key;
+  return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result);
+} // end exclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init)
+{
+  using thrust::system::detail::generic::exclusive_scan_by_key;
+  return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init);
+} // end exclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::exclusive_scan_by_key;
+  return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred);
+} // end exclusive_scan_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::exclusive_scan_by_key;
+  return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred, binary_op);
+} // end exclusive_scan_by_key()
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator inclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::inclusive_scan(select_system(system1,system2), first, last, result);
+} // end inclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator inclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::inclusive_scan(select_system(system1,system2), first, last, result, binary_op);
+} // end inclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::exclusive_scan(select_system(system1,system2), first, last, result);
+} // end exclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init);
+} // end exclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename BinaryFunction>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init, binary_op);
+} // end exclusive_scan()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred, binary_op);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred);
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred, binary_op);
+}
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/scatter.inl b/thrust/thrust/detail/scatter.inl
new file mode 100644
index 0000000000000000000000000000000000000000..50ca8f3aa2a3224ceeeaa94b6d0917d1a35d6e27
--- /dev/null
+++ b/thrust/thrust/detail/scatter.inl
@@ -0,0 +1,166 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scatter.inl
+ *  \brief Inline file for scatter.h.
+ */
+
+#include <thrust/scatter.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/scatter.h>
+#include <thrust/system/detail/adl/scatter.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator output)
+{
+  using thrust::system::detail::generic::scatter;
+  return scatter(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, output);
+} // end scatter()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output)
+{
+  using thrust::system::detail::generic::scatter_if;
+  return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output);
+} // end scatter_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+__host__ __device__
+  void scatter_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred)
+{
+  using thrust::system::detail::generic::scatter_if;
+  return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output, pred);
+} // end scatter_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+  void scatter(InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator output)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::scatter(select_system(system1,system2,system3), first, last, map, output);
+} // end scatter()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+  void scatter_if(InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output);
+} // end scatter_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+  void scatter_if(InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output, pred);
+} // end scatter_if()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/select_system.h b/thrust/thrust/detail/select_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..b22ceb0e96967b258caa01d054f8f45521ada23c
--- /dev/null
+++ b/thrust/thrust/detail/select_system.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/generic/select_system.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// We need a way to compute the return type of `select_system`, which is found
+// by using `thrust::system::detail::generic::select_system` and then making an
+// ADL call. We have no trait that defines the return type. With the
+// limitations of C++11 return type deduction, we need to be able to stick all
+// of that into `decltype`. So, we put the using statement into a detail
+// namespace, and then implement the generic dispatch function in that
+// namespace.
+
+namespace select_system_detail
+{
+
+using thrust::system::detail::generic::select_system;
+
+struct select_system_fn final
+{
+  __thrust_exec_check_disable__
+  template <typename DerivedPolicy0>
+  __host__ __device__
+  auto operator()(
+    thrust::detail::execution_policy_base<DerivedPolicy0> const& exec0
+  ) const
+  THRUST_DECLTYPE_RETURNS(
+    select_system(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec0))
+    )
+  )
+
+  __thrust_exec_check_disable__
+  template <typename DerivedPolicy0, typename DerivedPolicy1>
+  __host__ __device__
+  auto operator()(
+    thrust::detail::execution_policy_base<DerivedPolicy0> const& exec0
+  , thrust::detail::execution_policy_base<DerivedPolicy1> const& exec1
+  ) const
+  THRUST_DECLTYPE_RETURNS(
+    select_system(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec0))
+    , thrust::detail::derived_cast(thrust::detail::strip_const(exec1))
+    )
+  )
+};
+
+} // namespace select_system_detail
+
+THRUST_INLINE_CONSTANT select_system_detail::select_system_fn select_system{};
+
+} // detail
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/detail/seq.h b/thrust/thrust/detail/seq.h
new file mode 100644
index 0000000000000000000000000000000000000000..b548652d2d9d24c5cd143e39a5184182175453a8
--- /dev/null
+++ b/thrust/thrust/detail/seq.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::detail::sequential::execution_policy>
+{
+  __host__ __device__
+  THRUST_CONSTEXPR seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
+
+  // allow any execution_policy to convert to seq_t
+  template<typename DerivedPolicy>
+  __host__ __device__
+  seq_t(const thrust::execution_policy<DerivedPolicy> &)
+    : thrust::system::detail::sequential::execution_policy<seq_t>()
+  {}
+};
+
+
+} // end detail
+
+
+THRUST_INLINE_CONSTANT detail::seq_t seq;
+
+
+} // end thrust
+
+
diff --git a/thrust/thrust/detail/sequence.inl b/thrust/thrust/detail/sequence.inl
new file mode 100644
index 0000000000000000000000000000000000000000..fff7cbb63327ee43f2a6624c989f44e53ca5fffb
--- /dev/null
+++ b/thrust/thrust/detail/sequence.inl
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file sequence.inl
+ *  \brief Inline file for sequence.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/sequence.h>
+#include <thrust/system/detail/adl/sequence.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last)
+{
+  using thrust::system::detail::generic::sequence;
+  return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end sequence()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init)
+{
+  using thrust::system::detail::generic::sequence;
+  return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init);
+} // end sequence()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step)
+{
+  using thrust::system::detail::generic::sequence;
+  return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, step);
+} // end sequence()
+
+
+template<typename ForwardIterator>
+  void sequence(ForwardIterator first,
+                ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::sequence(select_system(system), first, last);
+} // end sequence()
+
+
+template<typename ForwardIterator, typename T>
+  void sequence(ForwardIterator first,
+                ForwardIterator last,
+                T init)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::sequence(select_system(system), first, last, init);
+} // end sequence()
+
+
+template<typename ForwardIterator, typename T>
+  void sequence(ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::sequence(select_system(system), first, last, init, step);
+} // end sequence()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/set_operations.inl b/thrust/thrust/detail/set_operations.inl
new file mode 100644
index 0000000000000000000000000000000000000000..42cf5ed359948fbdce7bf5e87fe580de1d9b155d
--- /dev/null
+++ b/thrust/thrust/detail/set_operations.inl
@@ -0,0 +1,868 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file set_operations.inl
+ *  \brief Inline file for set_operations.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/set_operations.h>
+#include <thrust/system/detail/adl/set_operations.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator1                                              first1,
+                              InputIterator1                                              last1,
+                              InputIterator2                                              first2,
+                              InputIterator2                                              last2,
+                              OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::set_difference;
+  return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result);
+} // end set_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+OutputIterator set_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator1                                              first1,
+                              InputIterator1                                              last1,
+                              InputIterator2                                              first2,
+                              InputIterator2                                              last2,
+                              OutputIterator                                              result,
+                              StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_difference;
+  return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp);
+} // end set_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator1                                              keys_first1,
+                        InputIterator1                                              keys_last1,
+                        InputIterator2                                              keys_first2,
+                        InputIterator2                                              keys_last2,
+                        InputIterator3                                              values_first1,
+                        InputIterator4                                              values_first2,
+                        OutputIterator1                                             keys_result,
+                        OutputIterator2                                             values_result)
+{
+  using thrust::system::detail::generic::set_difference_by_key;
+  return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_difference_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator1                                              keys_first1,
+                        InputIterator1                                              keys_last1,
+                        InputIterator2                                              keys_first2,
+                        InputIterator2                                              keys_last2,
+                        InputIterator3                                              values_first1,
+                        InputIterator4                                              values_first2,
+                        OutputIterator1                                             keys_result,
+                        OutputIterator2                                             values_result,
+                        StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_difference_by_key;
+  return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_difference_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_intersection(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1                                              first1,
+                                InputIterator1                                              last1,
+                                InputIterator2                                              first2,
+                                InputIterator2                                              last2,
+                                OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::set_intersection;
+  return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result);
+} // end set_intersection()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+OutputIterator set_intersection(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1                                              first1,
+                                InputIterator1                                              last1,
+                                InputIterator2                                              first2,
+                                InputIterator2                                              last2,
+                                OutputIterator                                              result,
+                                StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_intersection;
+  return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp);
+} // end set_intersection()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1                                              keys_first1,
+                          InputIterator1                                              keys_last1,
+                          InputIterator2                                              keys_first2,
+                          InputIterator2                                              keys_last2,
+                          InputIterator3                                              values_first1,
+                          OutputIterator1                                             keys_result,
+                          OutputIterator2                                             values_result)
+{
+  using thrust::system::detail::generic::set_intersection_by_key;
+  return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result);
+} // end set_intersection_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1                                              keys_first1,
+                          InputIterator1                                              keys_last1,
+                          InputIterator2                                              keys_first2,
+                          InputIterator2                                              keys_last2,
+                          InputIterator3                                              values_first1,
+                          OutputIterator1                                             keys_result,
+                          OutputIterator2                                             values_result,
+                          StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_intersection_by_key;
+  return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp);
+} // end set_intersection_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                        InputIterator1                                              first1,
+                                        InputIterator1                                              last1,
+                                        InputIterator2                                              first2,
+                                        InputIterator2                                              last2,
+                                        OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::set_symmetric_difference;
+  return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result);
+} // end set_symmetric_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                        InputIterator1                                              first1,
+                                        InputIterator1                                              last1,
+                                        InputIterator2                                              first2,
+                                        InputIterator2                                              last2,
+                                        OutputIterator                                              result,
+                                        StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_symmetric_difference;
+  return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp);
+} // end set_symmetric_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  InputIterator1                                              keys_first1,
+                                  InputIterator1                                              keys_last1,
+                                  InputIterator2                                              keys_first2,
+                                  InputIterator2                                              keys_last2,
+                                  InputIterator3                                              values_first1,
+                                  InputIterator4                                              values_first2,
+                                  OutputIterator1                                             keys_result,
+                                  OutputIterator2                                             values_result)
+{
+  using thrust::system::detail::generic::set_symmetric_difference_by_key;
+  return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_symmetric_difference_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  InputIterator1                                              keys_first1,
+                                  InputIterator1                                              keys_last1,
+                                  InputIterator2                                              keys_first2,
+                                  InputIterator2                                              keys_last2,
+                                  InputIterator3                                              values_first1,
+                                  InputIterator4                                              values_first2,
+                                  OutputIterator1                                             keys_result,
+                                  OutputIterator2                                             values_result,
+                                  StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_symmetric_difference_by_key;
+  return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_symmetric_difference_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1                                              first1,
+                         InputIterator1                                              last1,
+                         InputIterator2                                              first2,
+                         InputIterator2                                              last2,
+                         OutputIterator                                              result)
+{
+  using thrust::system::detail::generic::set_union;
+  return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result);
+} // end set_union()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1                                              first1,
+                         InputIterator1                                              last1,
+                         InputIterator2                                              first2,
+                         InputIterator2                                              last2,
+                         OutputIterator                                              result,
+                         StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_union;
+  return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp);
+} // end set_union()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator1                                              keys_first1,
+                   InputIterator1                                              keys_last1,
+                   InputIterator2                                              keys_first2,
+                   InputIterator2                                              keys_last2,
+                   InputIterator3                                              values_first1,
+                   InputIterator4                                              values_first2,
+                   OutputIterator1                                             keys_result,
+                   OutputIterator2                                             values_result)
+{
+  using thrust::system::detail::generic::set_union_by_key;
+  return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_union_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator1                                              keys_first1,
+                   InputIterator1                                              keys_last1,
+                   InputIterator2                                              keys_first2,
+                   InputIterator2                                              keys_last2,
+                   InputIterator3                                              values_first1,
+                   InputIterator4                                              values_first2,
+                   OutputIterator1                                             keys_result,
+                   OutputIterator2                                             values_result,
+                   StrictWeakCompare                                           comp)
+{
+  using thrust::system::detail::generic::set_union_by_key;
+  return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_union_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+  OutputIterator set_difference(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result,
+                                StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp);
+} // end set_difference()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_difference(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result);
+} // end set_difference()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(InputIterator1 keys_first1,
+                          InputIterator1 keys_last1,
+                          InputIterator2 keys_first2,
+                          InputIterator2 keys_last2,
+                          InputIterator3 values_first1,
+                          InputIterator4 values_first2,
+                          OutputIterator1 keys_result,
+                          OutputIterator2 values_result,
+                          StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_difference_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(InputIterator1 keys_first1,
+                          InputIterator1 keys_last1,
+                          InputIterator2 keys_first2,
+                          InputIterator2 keys_last2,
+                          InputIterator3 values_first1,
+                          InputIterator4 values_first2,
+                          OutputIterator1 keys_result,
+                          OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_difference_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+  OutputIterator set_intersection(InputIterator1 first1,
+                                  InputIterator1 last1,
+                                  InputIterator2 first2,
+                                  InputIterator2 last2,
+                                  OutputIterator result,
+                                  StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp);
+} // end set_intersection()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_intersection(InputIterator1 first1,
+                                  InputIterator1 last1,
+                                  InputIterator2 first2,
+                                  InputIterator2 last2,
+                                  OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result);
+} // end set_intersection()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(InputIterator1 keys_first1,
+                            InputIterator1 keys_last1,
+                            InputIterator2 keys_first2,
+                            InputIterator2 keys_last2,
+                            InputIterator3 values_first1,
+                            OutputIterator1 keys_result,
+                            OutputIterator2 values_result,
+                            StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System4;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System5;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+
+  return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp);
+} // end set_intersection_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(InputIterator1 keys_first1,
+                            InputIterator1 keys_last1,
+                            InputIterator2 keys_first2,
+                            InputIterator2 keys_last2,
+                            InputIterator3 values_first1,
+                            OutputIterator1 keys_result,
+                            OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System4;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System5;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+
+  return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result);
+} // end set_intersection_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+  OutputIterator set_symmetric_difference(InputIterator1 first1,
+                                          InputIterator1 last1,
+                                          InputIterator2 first2,
+                                          InputIterator2 last2,
+                                          OutputIterator result,
+                                          StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp);
+} // end set_symmetric_difference()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_symmetric_difference(InputIterator1 first1,
+                                          InputIterator1 last1,
+                                          InputIterator2 first2,
+                                          InputIterator2 last2,
+                                          OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result);
+} // end set_symmetric_difference()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(InputIterator1 keys_first1,
+                                    InputIterator1 keys_last1,
+                                    InputIterator2 keys_first2,
+                                    InputIterator2 keys_last2,
+                                    InputIterator3 values_first1,
+                                    InputIterator4 values_first2,
+                                    OutputIterator1 keys_result,
+                                    OutputIterator2 values_result,
+                                    StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_symmetric_difference_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(InputIterator1 keys_first1,
+                                    InputIterator1 keys_last1,
+                                    InputIterator2 keys_first2,
+                                    InputIterator2 keys_last2,
+                                    InputIterator3 values_first1,
+                                    InputIterator4 values_first2,
+                                    OutputIterator1 keys_result,
+                                    OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_symmetric_difference_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+  OutputIterator set_union(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result,
+                           StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp);
+} // end set_union()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_union(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result);
+} // end set_union()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result,
+                     StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp);
+} // end set_union_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<InputIterator4>::type  System4;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System5;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System6;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+  System5 system5;
+  System6 system6;
+
+  return thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
+} // end set_union_by_key()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/shuffle.inl b/thrust/thrust/detail/shuffle.inl
new file mode 100644
index 0000000000000000000000000000000000000000..edccc878731ef45efba53dc3b0e89deccf9d745a
--- /dev/null
+++ b/thrust/thrust/detail/shuffle.inl
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.inl
+ *  \brief Inline file for shuffle.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/shuffle.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+namespace thrust {
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g) {
+  using thrust::system::detail::generic::shuffle;
+  return shuffle(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, g);
+}
+
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System;
+  System system;
+
+  return thrust::shuffle(select_system(system), first, last, g);
+}
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result,
+    URBG&& g) {
+  using thrust::system::detail::generic::shuffle_copy;
+  return shuffle_copy(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, result, g);
+}
+
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::shuffle_copy(select_system(system1, system2), first, last,
+                              result, g);
+}
+
+}  // namespace thrust
+
+#endif
diff --git a/thrust/thrust/detail/sort.inl b/thrust/thrust/detail/sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d4a7901e6ca7b932ab20f25f06e761352686d5fa
--- /dev/null
+++ b/thrust/thrust/detail/sort.inl
@@ -0,0 +1,408 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file sort.inl
+ *  \brief Inline file for sort.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/sort.h>
+#include <thrust/system/detail/adl/sort.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename RandomAccessIterator>
+__host__ __device__
+  void sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last)
+{
+  using thrust::system::detail::generic::sort;
+  return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end sort()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::sort;
+  return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end sort()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename RandomAccessIterator>
+__host__ __device__
+  void stable_sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last)
+{
+  using thrust::system::detail::generic::stable_sort;
+  return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end stable_sort()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::stable_sort;
+  return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end stable_sort()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first)
+{
+  using thrust::system::detail::generic::sort_by_key;
+  return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first);
+} // end sort_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::sort_by_key;
+  return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp);
+} // end sort_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void stable_sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first)
+{
+  using thrust::system::detail::generic::stable_sort_by_key;
+  return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first);
+} // end stable_sort_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::stable_sort_by_key;
+  return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp);
+} // end stable_sort_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  bool is_sorted(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::is_sorted;
+  return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end is_sorted()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename Compare>
+__host__ __device__
+  bool is_sorted(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp)
+{
+  using thrust::system::detail::generic::is_sorted;
+  return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end is_sorted()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last)
+{
+  using thrust::system::detail::generic::is_sorted_until;
+  return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end is_sorted_until()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename Compare>
+__host__ __device__
+  ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp)
+{
+  using thrust::system::detail::generic::is_sorted_until;
+  return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp);
+} // end is_sorted_until()
+
+
+///////////////
+// Key Sorts //
+///////////////
+
+template<typename RandomAccessIterator>
+  void sort(RandomAccessIterator first,
+            RandomAccessIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System;
+
+  System system;
+
+  return thrust::sort(select_system(system), first, last);
+} // end sort()
+
+
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  __host__ __device__
+  void sort(RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System;
+
+  System system;
+
+  return thrust::sort(select_system(system), first, last, comp);
+} // end sort()
+
+
+template<typename RandomAccessIterator>
+  void stable_sort(RandomAccessIterator first,
+                   RandomAccessIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System;
+
+  System system;
+
+  return thrust::stable_sort(select_system(system), first, last);
+} // end stable_sort() 
+
+
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  void stable_sort(RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System;
+
+  System system;
+
+  return thrust::stable_sort(select_system(system), first, last, comp);
+} // end stable_sort()
+
+
+
+/////////////////////
+// Key-Value Sorts //
+/////////////////////
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  void sort_by_key(RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator1>::type System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first);
+} // end sort_by_key()
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void sort_by_key(RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator1>::type System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp);
+} // end sort_by_key()
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  void stable_sort_by_key(RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator1>::type System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first);
+} // end stable_sort_by_key()
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void stable_sort_by_key(RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator1>::type System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp);
+} // end stable_sort_by_key()
+
+
+template<typename ForwardIterator>
+  bool is_sorted(ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::is_sorted(select_system(system), first, last);
+} // end is_sorted()
+
+
+template<typename ForwardIterator,
+         typename Compare>
+  bool is_sorted(ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::is_sorted(select_system(system), first, last, comp);
+} // end is_sorted()
+
+
+template<typename ForwardIterator>
+  ForwardIterator is_sorted_until(ForwardIterator first,
+                                  ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::is_sorted_until(select_system(system), first, last);
+} // end is_sorted_until()
+
+
+template<typename ForwardIterator,
+         typename Compare>
+  ForwardIterator is_sorted_until(ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp)
+{
+  using thrust::system::detail::generic::select_system;
+  
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::is_sorted_until(select_system(system), first, last, comp);
+} // end is_sorted_until()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/static_assert.h b/thrust/thrust/detail/static_assert.h
new file mode 100644
index 0000000000000000000000000000000000000000..52674dcaf18ef6459b6ef826a524623162ce0f23
--- /dev/null
+++ b/thrust/thrust/detail/static_assert.h
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * (C) Copyright John Maddock 2000.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/preprocessor.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename, bool x>
+struct depend_on_instantiation
+{
+  THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT bool value = x;
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#  if THRUST_CPP_DIALECT >= 2017
+#    define THRUST_STATIC_ASSERT(B)        static_assert(B)
+#  else
+#    define THRUST_STATIC_ASSERT(B)        static_assert(B, "static assertion failed")
+#  endif
+#  define THRUST_STATIC_ASSERT_MSG(B, msg) static_assert(B, msg)
+
+#else // Older than C++11.
+
+// HP aCC cannot deal with missing names for template value parameters.
+template <bool x> struct STATIC_ASSERTION_FAILURE;
+
+template <> struct STATIC_ASSERTION_FAILURE<true> {};
+
+// HP aCC cannot deal with missing names for template value parameters.
+template <int x> struct static_assert_test {};
+
+#if    (  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                  \
+       && (THRUST_GCC_VERSION >= 40800))                                      \
+    || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG)
+  // Clang and GCC 4.8+ will complain about this typedef being unused unless we
+  // annotate it as such.
+#  define THRUST_STATIC_ASSERT(B)                                             \
+    typedef ::thrust::detail::static_assert_test<                             \
+      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)           \
+    >                                                                         \
+      THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
+      __attribute__((unused))                                                 \
+    /**/      
+#else
+#  define THRUST_STATIC_ASSERT(B)                                             \
+    typedef ::thrust::detail::static_assert_test<                             \
+      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)           \
+    >                                                                         \
+      THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
+    /**/      
+#endif
+
+#define THRUST_STATIC_ASSERT_MSG(B, msg) THRUST_STATIC_ASSERT(B)
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
+} // namespace detail
+
+} // end namespace thrust
+
+
diff --git a/thrust/thrust/detail/static_map.h b/thrust/thrust/detail/static_map.h
new file mode 100644
index 0000000000000000000000000000000000000000..872a73aefd347d65519663bdcb8105ee83f86baf
--- /dev/null
+++ b/thrust/thrust/detail/static_map.h
@@ -0,0 +1,170 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+
+#include <thrust/detail/config.h>
+
+
+namespace thrust
+{
+namespace detail
+{
+namespace static_map_detail
+{
+
+
+template<unsigned int k, unsigned int v>
+struct key_value
+{
+  static const unsigned int key = k;
+  static const unsigned int value = v;
+};
+
+
+template<typename Head, typename Tail = void>
+struct cons
+{
+  template<unsigned int key, unsigned int default_value>
+  struct static_get
+  {
+    static const unsigned int value = (key == Head::key) ? (Head::value) : Tail::template static_get<key,default_value>::value;
+  };
+
+
+  template<unsigned int default_value>
+  __host__ __device__
+  static unsigned int get(unsigned int key)
+  {
+    return (key == Head::key) ? (Head::value) : Tail::template get<default_value>(key);
+  }
+};
+
+
+template<typename Head>
+struct cons<Head,void>
+{
+  template<unsigned int key, unsigned int default_value>
+  struct static_get
+  {
+    static const unsigned int value = (key == Head::key) ? (Head::value) : default_value;
+  };
+
+  template<unsigned int default_value>
+  __host__ __device__
+  static unsigned int get(unsigned int key)
+  {
+    return (key == Head::key) ? (Head::value) : default_value;
+  }
+};
+
+
+template<unsigned int default_value,
+         unsigned int key0 = 0, unsigned int value0 = default_value,
+         unsigned int key1 = 0, unsigned int value1 = default_value,
+         unsigned int key2 = 0, unsigned int value2 = default_value,
+         unsigned int key3 = 0, unsigned int value3 = default_value,
+         unsigned int key4 = 0, unsigned int value4 = default_value,
+         unsigned int key5 = 0, unsigned int value5 = default_value,
+         unsigned int key6 = 0, unsigned int value6 = default_value,
+         unsigned int key7 = 0, unsigned int value7 = default_value>
+struct static_map
+{
+  typedef cons<
+    key_value<key0,value0>,
+    cons<
+      key_value<key1,value1>,
+      cons<
+        key_value<key2,value2>,
+        cons<
+          key_value<key3,value3>,
+          cons<
+            key_value<key4,value4>,
+            cons<
+              key_value<key5,value5>,
+              cons<
+                key_value<key6,value6>,
+                cons<
+                  key_value<key7,value7>
+                >
+              >
+            >
+          >
+        >
+      >
+    >
+  > impl;
+
+  template<unsigned int key>
+  struct static_get
+  {
+    static const unsigned int value = impl::template static_get<key,default_value>::value;
+  };
+
+  __host__ __device__
+  static unsigned int get(unsigned int key)
+  {
+    return impl::template get<default_value>(key);
+  }
+};
+
+
+} // end namespace static_map_detail
+
+
+template<unsigned int default_value,
+         unsigned int key0 = 0, unsigned int value0 = default_value,
+         unsigned int key1 = 0, unsigned int value1 = default_value,
+         unsigned int key2 = 0, unsigned int value2 = default_value,
+         unsigned int key3 = 0, unsigned int value3 = default_value,
+         unsigned int key4 = 0, unsigned int value4 = default_value,
+         unsigned int key5 = 0, unsigned int value5 = default_value,
+         unsigned int key6 = 0, unsigned int value6 = default_value,
+         unsigned int key7 = 0, unsigned int value7 = default_value>
+struct static_map
+  : static_map_detail::static_map<
+      default_value,
+      key0, value0,
+      key1, value1,
+      key2, value2,
+      key3, value3,
+      key4, value4,
+      key5, value5,
+      key6, value6,
+      key7, value7
+    >
+{};
+
+
+template<unsigned int key, typename StaticMap>
+struct static_lookup
+{
+  static const unsigned int value = StaticMap::template static_get<key>::value;
+};
+
+
+template<typename StaticMap>
+__host__ __device__
+unsigned int lookup(unsigned int key)
+{
+  return StaticMap::get(key);
+}
+
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/swap.h b/thrust/thrust/detail/swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..96783c762bd9c2ebae3ca5318fe04f15457c545f
--- /dev/null
+++ b/thrust/thrust/detail/swap.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+__thrust_exec_check_disable__
+template<typename Assignable1, typename Assignable2>
+__host__ __device__
+inline void swap(Assignable1 &a, Assignable2 &b)
+{
+  Assignable1 temp = a;
+  a = b;
+  b = temp;
+} // end swap()
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/swap.inl b/thrust/thrust/detail/swap.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9364ef8ad4fb150a59391857f538222abfb38f6c
--- /dev/null
+++ b/thrust/thrust/detail/swap.inl
@@ -0,0 +1,21 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/swap.h>
+#include <thrust/detail/swap.h>
+#include <thrust/detail/swap_ranges.inl>
+
diff --git a/thrust/thrust/detail/swap_ranges.inl b/thrust/thrust/detail/swap_ranges.inl
new file mode 100644
index 0000000000000000000000000000000000000000..8ed97cc740a4770a8bd764bb82dc47618c09f828
--- /dev/null
+++ b/thrust/thrust/detail/swap_ranges.inl
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file swap_ranges.inl
+ *  \brief Inline file for swap_ranges.h.
+ */
+
+#include <thrust/swap.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/swap_ranges.h>
+#include <thrust/system/detail/adl/swap_ranges.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2)
+{
+  using thrust::system::detail::generic::swap_ranges;
+  return swap_ranges(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2);
+} // end swap_ranges()
+
+
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+  ForwardIterator2 swap_ranges(ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator1>::type System1;
+  typedef typename thrust::iterator_system<ForwardIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::swap_ranges(select_system(system1,system2), first1, last1, first2);
+} // end swap_ranges()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/tabulate.inl b/thrust/thrust/detail/tabulate.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f6385234e8965403cecc8b08d387923daead27a4
--- /dev/null
+++ b/thrust/thrust/detail/tabulate.inl
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/tabulate.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/tabulate.h>
+#include <thrust/system/detail/adl/tabulate.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename UnaryOperation>
+__host__ __device__
+  void tabulate(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op)
+{
+  using thrust::system::detail::generic::tabulate;
+  return tabulate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op);
+} // end tabulate()
+
+
+template<typename ForwardIterator, typename UnaryOperation>
+  void tabulate(ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::tabulate(select_system(system), first, last, unary_op);
+} // end tabulate()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/temporary_array.h b/thrust/thrust/detail/temporary_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f412008396ab85c29b313c9bbcf80a02678d3b9
--- /dev/null
+++ b/thrust/thrust/detail/temporary_array.h
@@ -0,0 +1,181 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file temporary_array.h
+ *  \brief Container-like class temporary storage inside algorithms.
+ */
+
+#pragma once
+
+namespace thrust
+{
+namespace detail
+{
+
+// Forward declare temporary_array, as it's used by the CUDA copy backend, which
+// is included in contiguous_storage's definition.
+template<typename T, typename System>
+  class temporary_array;
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/tagged_iterator.h>
+#include <thrust/detail/contiguous_storage.h>
+#include <thrust/detail/allocator/temporary_allocator.h>
+#include <thrust/detail/allocator/no_throw_allocator.h>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename T, typename System>
+  class temporary_array
+    : public contiguous_storage<
+               T,
+               no_throw_allocator<
+                 temporary_allocator<T,System>
+               >
+             >
+{
+  private:
+    typedef contiguous_storage<
+      T,
+      no_throw_allocator<
+        temporary_allocator<T,System>
+      >
+    > super_t;
+
+    // to help out the constructor
+    typedef no_throw_allocator<temporary_allocator<T,System> > alloc_type;
+
+  public:
+    typedef typename super_t::size_type size_type;
+
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system);
+
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system, size_type n);
+
+    // provide a kill-switch to explicitly avoid initialization
+    __host__ __device__
+    temporary_array(int uninit, thrust::execution_policy<System> &system, size_type n);
+
+    template<typename InputIterator>
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system,
+                    InputIterator first,
+                    size_type n);
+
+    template<typename InputIterator, typename InputSystem>
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system,
+                    thrust::execution_policy<InputSystem> &input_system,
+                    InputIterator first,
+                    size_type n);
+
+    template<typename InputIterator>
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system,
+                    InputIterator first,
+                    InputIterator last);
+
+    template<typename InputSystem, typename InputIterator>
+    __host__ __device__
+    temporary_array(thrust::execution_policy<System> &system,
+                    thrust::execution_policy<InputSystem> &input_system,
+                    InputIterator first,
+                    InputIterator last);
+
+    __host__ __device__
+    ~temporary_array();
+}; // end temporary_array
+
+
+// XXX eliminate this when we do ranges for real
+template<typename Iterator, typename System>
+  class tagged_iterator_range
+{
+  public:
+    typedef thrust::detail::tagged_iterator<Iterator,System> iterator;
+
+    template<typename Ignored1, typename Ignored2>
+    tagged_iterator_range(const Ignored1 &, const Ignored2 &, Iterator first, Iterator last)
+      : m_begin(first),
+        m_end(last)
+    {}
+
+    iterator begin(void) const { return m_begin; }
+    iterator end(void) const { return m_end; }
+
+  private:
+    iterator m_begin, m_end;
+};
+
+
+// if FromSystem is convertible to ToSystem, then just make a shallow
+// copy of the range. else, use a temporary_array
+// note that the resulting iterator is explicitly tagged with ToSystem either way
+template<typename Iterator, typename FromSystem, typename ToSystem>
+  struct move_to_system_base
+    : public eval_if<
+        is_convertible<
+          FromSystem,
+          ToSystem
+        >::value,
+        identity_<
+          tagged_iterator_range<Iterator,ToSystem>
+        >,
+        identity_<
+          temporary_array<
+            typename thrust::iterator_value<Iterator>::type,
+            ToSystem
+          >
+        >
+      >
+{};
+
+
+template<typename Iterator, typename FromSystem, typename ToSystem>
+  class move_to_system
+    : public move_to_system_base<
+        Iterator,
+        FromSystem,
+        ToSystem
+      >::type
+{
+  typedef typename move_to_system_base<Iterator,FromSystem,ToSystem>::type super_t;
+
+  public:
+    move_to_system(thrust::execution_policy<FromSystem> &from_system,
+                   thrust::execution_policy<ToSystem> &to_system,
+                   Iterator first,
+                   Iterator last)
+      : super_t(to_system, from_system, first, last) {}
+};
+
+
+} // end detail
+} // end thrust
+
+#include <thrust/detail/temporary_array.inl>
+
diff --git a/thrust/thrust/detail/temporary_array.inl b/thrust/thrust/detail/temporary_array.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e730966c032f1343e8a2622cc1a6ceb1e0f7dd40
--- /dev/null
+++ b/thrust/thrust/detail/temporary_array.inl
@@ -0,0 +1,166 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/detail/type_traits.h>
+
+
+namespace thrust
+{
+
+namespace detail
+{
+namespace temporary_array_detail
+{
+
+
+template<typename T> struct avoid_initialization : thrust::detail::has_trivial_copy_constructor<T> {};
+
+
+template<typename T, typename TemporaryArray, typename Size>
+__host__ __device__
+typename thrust::detail::enable_if<
+  avoid_initialization<T>::value
+>::type
+  construct_values(TemporaryArray &,
+                   Size)
+{
+  // avoid the overhead of initialization
+} // end construct_values()
+
+
+template<typename T, typename TemporaryArray, typename Size>
+__host__ __device__
+typename thrust::detail::disable_if<
+  avoid_initialization<T>::value
+>::type
+  construct_values(TemporaryArray &a,
+                   Size n)
+{
+  a.default_construct_n(a.begin(), n);
+} // end construct_values()
+
+
+} // end temporary_array_detail
+
+
+template<typename T, typename System>
+__host__ __device__
+  temporary_array<T,System>
+    ::temporary_array(thrust::execution_policy<System> &system)
+      :super_t(alloc_type(temporary_allocator<T,System>(system)))
+{
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+__host__ __device__
+  temporary_array<T,System>
+    ::temporary_array(thrust::execution_policy<System> &system, size_type n)
+      :super_t(n, alloc_type(temporary_allocator<T,System>(system)))
+{
+  temporary_array_detail::construct_values<T>(*this, n);
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+__host__ __device__
+  temporary_array<T,System>
+    ::temporary_array(int, thrust::execution_policy<System> &system, size_type n)
+      :super_t(n, alloc_type(temporary_allocator<T,System>(system)))
+{
+  // avoid initialization
+  ;
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+  template<typename InputIterator>
+  __host__ __device__
+    temporary_array<T,System>
+      ::temporary_array(thrust::execution_policy<System> &system,
+                        InputIterator first,
+                        size_type n)
+        : super_t(alloc_type(temporary_allocator<T,System>(system)))
+{
+  super_t::allocate(n);
+
+  super_t::uninitialized_copy_n(system, first, n, super_t::begin());
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+  template<typename InputIterator, typename InputSystem>
+  __host__ __device__
+    temporary_array<T,System>
+      ::temporary_array(thrust::execution_policy<System> &system,
+                        thrust::execution_policy<InputSystem> &input_system,
+                        InputIterator first,
+                        size_type n)
+        : super_t(alloc_type(temporary_allocator<T,System>(system)))
+{
+  super_t::allocate(n);
+
+  super_t::uninitialized_copy_n(input_system, first, n, super_t::begin());
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+  template<typename InputIterator>
+  __host__ __device__
+    temporary_array<T,System>
+      ::temporary_array(thrust::execution_policy<System> &system,
+                        InputIterator first,
+                        InputIterator last)
+        : super_t(alloc_type(temporary_allocator<T,System>(system)))
+{
+  super_t::allocate(thrust::distance(first,last));
+
+  super_t::uninitialized_copy(system, first, last, super_t::begin());
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+  template<typename InputSystem, typename InputIterator>
+  __host__ __device__
+    temporary_array<T,System>
+      ::temporary_array(thrust::execution_policy<System> &system,
+                        thrust::execution_policy<InputSystem> &input_system,
+                        InputIterator first,
+                        InputIterator last)
+        : super_t(alloc_type(temporary_allocator<T,System>(system)))
+{
+  super_t::allocate(thrust::distance(first,last));
+
+  super_t::uninitialized_copy(input_system, first, last, super_t::begin());
+} // end temporary_array::temporary_array()
+
+
+template<typename T, typename System>
+__host__ __device__
+  temporary_array<T,System>
+    ::~temporary_array()
+{
+  // note that super_t::destroy will ignore trivial destructors automatically
+  super_t::destroy(super_t::begin(), super_t::end());
+} // end temporary_array::~temporary_array()
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/temporary_buffer.h b/thrust/thrust/detail/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4dca3be3b9b0525aa01bcaa339a13782ac38272f
--- /dev/null
+++ b/thrust/thrust/detail/temporary_buffer.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/execute_with_allocator.h>
+#include <thrust/system/detail/generic/temporary_buffer.h>
+#include <thrust/system/detail/adl/temporary_buffer.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename T, typename DerivedPolicy, typename Pair>
+__host__ __device__
+  thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
+    down_cast_pair(Pair p)
+{
+  // XXX should use a hypothetical thrust::static_pointer_cast here
+  thrust::pointer<T,DerivedPolicy> ptr = thrust::pointer<T,DerivedPolicy>(static_cast<T*>(thrust::raw_pointer_cast(p.first)));
+
+  typedef thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type> result_type;
+  return result_type(ptr, p.second);
+} // end down_cast_pair()
+
+
+} // end detail
+
+
+__thrust_exec_check_disable__
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+  thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
+    get_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n)
+{
+  using thrust::detail::get_temporary_buffer; // execute_with_allocator
+  using thrust::system::detail::generic::get_temporary_buffer;
+
+  return thrust::detail::down_cast_pair<T,DerivedPolicy>(get_temporary_buffer<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n));
+} // end get_temporary_buffer()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n)
+{
+  using thrust::detail::return_temporary_buffer; // execute_with_allocator
+  using thrust::system::detail::generic::return_temporary_buffer;
+
+  return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p, n);
+} // end return_temporary_buffer()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/transform.inl b/thrust/thrust/detail/transform.inl
new file mode 100644
index 0000000000000000000000000000000000000000..c27e4de27cf47b735bc18279a8715f02d5794211
--- /dev/null
+++ b/thrust/thrust/detail/transform.inl
@@ -0,0 +1,249 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file transform.inl
+ *  \brief Inline file for transform.h.
+ */
+
+#include <thrust/transform.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/transform.h>
+#include <thrust/system/detail/adl/transform.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+__host__ __device__
+  OutputIterator transform(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first, InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op)
+{
+  using thrust::system::detail::generic::transform;
+  return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op);
+} // end transform()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1 first1, InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op)
+{
+  using thrust::system::detail::generic::transform;
+  return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, op);
+} // end transform()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator first, InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::transform_if;
+  return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op, pred);
+} // end transform_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator1 first, InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::transform_if;
+  return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, op, pred);
+} // end transform_if()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator1 first1, InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::transform_if;
+  return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, stencil, result, binary_op, pred);
+} // end transform_if()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+  OutputIterator transform(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::transform(select_system(system1,system2), first, last, result, op);
+} // end transform()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator transform(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::transform(select_system(system1,system2,system3), first1, last1, first2, result, op);
+} // end transform()
+
+
+template<typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator first,
+                               InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type   System1;
+  typedef typename thrust::iterator_system<ForwardIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::transform_if(select_system(system1,system2), first, last, result, unary_op, pred);
+} // end transform_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator1 first,
+                               InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<ForwardIterator>::type System3;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+
+  return thrust::transform_if(select_system(system1,system2,system3), first, last, stencil, result, unary_op, pred);
+} // end transform_if()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator1 first1,
+                               InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type  System3;
+  typedef typename thrust::iterator_system<ForwardIterator>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::transform_if(select_system(system1,system2,system3,system4), first1, last1, first2, stencil, result, binary_op, pred);
+} // end transform_if()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/transform_reduce.inl b/thrust/thrust/detail/transform_reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..571b0e79b191bf0b030feddfbee0e2d60b2de1a5
--- /dev/null
+++ b/thrust/thrust/detail/transform_reduce.inl
@@ -0,0 +1,72 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file transform_reduce.inl
+ *  \brief Inline file for transform_reduce.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/transform_reduce.h>
+#include <thrust/system/detail/adl/transform_reduce.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType transform_reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::transform_reduce;
+  return transform_reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op, init, binary_op);
+} // end transform_reduce()
+
+
+template<typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType transform_reduce(InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type System;
+
+  System system;
+
+  return thrust::transform_reduce(select_system(system), first, last, unary_op, init, binary_op);
+} // end transform_reduce()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/transform_scan.inl b/thrust/thrust/detail/transform_scan.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d6a488b0a0e2669c7b43edd849063e18fd750b68
--- /dev/null
+++ b/thrust/thrust/detail/transform_scan.inl
@@ -0,0 +1,119 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file transform_scan.inl
+ *  \brief Inline file for transform_scan.h.
+ */
+
+#include <thrust/scan.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/transform_scan.h>
+#include <thrust/system/detail/adl/transform_scan.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::transform_inclusive_scan;
+  return transform_inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, binary_op);
+} // end transform_inclusive_scan()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          T init,
+                                          AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::transform_exclusive_scan;
+  return transform_exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, init, binary_op);
+} // end transform_exclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename BinaryFunction>
+  OutputIterator transform_inclusive_scan(InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          BinaryFunction binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::transform_inclusive_scan(select_system(system1,system2), first, last, result, unary_op, binary_op);
+} // end transform_inclusive_scan()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+  OutputIterator transform_exclusive_scan(InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          T init,
+                                          AssociativeOperator binary_op)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::transform_exclusive_scan(select_system(system1,system2), first, last, result, unary_op, init, binary_op);
+} // end transform_exclusive_scan()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/trivial_sequence.h b/thrust/thrust/detail/trivial_sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6c3ed9ebb3cd9022368644edb77ca101ec133e3
--- /dev/null
+++ b/thrust/thrust/detail/trivial_sequence.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file trivial_sequence.h
+ *  \brief Container-like class for wrapping sequences.  The wrapped
+ *         sequence always has trivial iterators, even when the input
+ *         sequence does not.
+ */
+
+
+#pragma once
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// never instantiated
+template<typename Iterator, typename DerivedPolicy, typename is_trivial> struct _trivial_sequence { };
+
+// trivial case
+template<typename Iterator, typename DerivedPolicy>
+struct _trivial_sequence<Iterator, DerivedPolicy, thrust::detail::true_type>
+{
+    typedef Iterator iterator_type;
+    Iterator first, last;
+
+    __host__ __device__
+    _trivial_sequence(thrust::execution_policy<DerivedPolicy> &, Iterator _first, Iterator _last) : first(_first), last(_last)
+    {
+    }
+
+    __host__ __device__
+    iterator_type begin() { return first; }
+
+    __host__ __device__
+    iterator_type end()   { return last; }
+};
+
+// non-trivial case
+template<typename Iterator, typename DerivedPolicy>
+struct _trivial_sequence<Iterator, DerivedPolicy, thrust::detail::false_type>
+{
+    typedef typename thrust::iterator_value<Iterator>::type iterator_value;
+    typedef typename thrust::detail::temporary_array<iterator_value, DerivedPolicy>::iterator iterator_type;
+    
+    thrust::detail::temporary_array<iterator_value, DerivedPolicy> buffer;
+
+    __host__ __device__
+    _trivial_sequence(thrust::execution_policy<DerivedPolicy> &exec, Iterator first, Iterator last)
+      : buffer(exec, first, last)
+    {
+    }
+
+    __host__ __device__
+    iterator_type begin() { return buffer.begin(); }
+
+    __host__ __device__
+    iterator_type end()   { return buffer.end(); }
+};
+
+template <typename Iterator, typename DerivedPolicy>
+struct trivial_sequence
+  : detail::_trivial_sequence<Iterator, DerivedPolicy, typename thrust::is_contiguous_iterator<Iterator>::type>
+{
+    typedef _trivial_sequence<Iterator, DerivedPolicy, typename thrust::is_contiguous_iterator<Iterator>::type> super_t;
+
+    __host__ __device__
+    trivial_sequence(thrust::execution_policy<DerivedPolicy> &exec, Iterator first, Iterator last) : super_t(exec, first, last) { }
+};
+
+} // end namespace detail
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/tuple.inl b/thrust/thrust/detail/tuple.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7d9841fd2cdefda6f5e9fefb9dd4649641d56b9e
--- /dev/null
+++ b/thrust/thrust/detail/tuple.inl
@@ -0,0 +1,960 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/swap.h>
+
+namespace thrust
+{
+
+// define null_type
+struct null_type {};
+
+// null_type comparisons
+__host__ __device__ inline
+bool operator==(const null_type&, const null_type&) { return true; }
+
+__host__ __device__ inline
+bool operator>=(const null_type&, const null_type&) { return true; }
+
+__host__ __device__ inline
+bool operator<=(const null_type&, const null_type&) { return true; }
+
+__host__ __device__ inline
+bool operator!=(const null_type&, const null_type&) { return false; }
+
+__host__ __device__ inline
+bool operator<(const null_type&, const null_type&) { return false; }
+
+__host__ __device__ inline
+bool operator>(const null_type&, const null_type&) { return false; }
+
+// forward declaration for tuple
+template <
+  class T0 = null_type, class T1 = null_type, class T2 = null_type,
+  class T3 = null_type, class T4 = null_type, class T5 = null_type,
+  class T6 = null_type, class T7 = null_type, class T8 = null_type,
+  class T9 = null_type>
+class tuple;
+
+// forward declaration of tuple_element
+template<int i, typename T> struct tuple_element;
+
+// specializations for tuple_element
+template<class T>
+  struct tuple_element<0,T>
+{
+  typedef typename T::head_type type;
+}; // end tuple_element<0,T>
+
+template<int N, class T>
+  struct tuple_element<N, const T>
+{
+  private:
+    typedef typename T::tail_type Next;
+    typedef typename tuple_element<N-1, Next>::type unqualified_type;
+
+  public:
+    typedef typename thrust::detail::add_const<unqualified_type>::type type;
+}; // end tuple_element<N, const T>
+
+template<class T>
+  struct tuple_element<0,const T>
+{
+  typedef typename thrust::detail::add_const<typename T::head_type>::type type;
+}; // end tuple_element<0,const T>
+
+
+
+// forward declaration of tuple_size
+template<class T> struct tuple_size;
+
+// specializations for tuple_size
+template<>
+  struct tuple_size< tuple<> >
+{
+  static const int value = 0;
+}; // end tuple_size< tuple<> >
+
+template<>
+  struct tuple_size<null_type>
+{
+  static const int value = 0;
+}; // end tuple_size<null_type>
+
+
+
+// forward declaration of detail::cons
+namespace detail
+{
+
+template <class HT, class TT> struct cons;
+
+} // end detail
+
+
+// -- some traits classes for get functions
+template <class T> struct access_traits
+{
+  typedef const T& const_type;
+  typedef T& non_const_type;
+
+  typedef const typename thrust::detail::remove_cv<T>::type& parameter_type;
+
+// used as the tuple constructors parameter types
+// Rationale: non-reference tuple element types can be cv-qualified.
+// It should be possible to initialize such types with temporaries,
+// and when binding temporaries to references, the reference must
+// be non-volatile and const. 8.5.3. (5)
+}; // end access_traits
+
+template <class T> struct access_traits<T&>
+{
+  typedef T& const_type;
+  typedef T& non_const_type;
+
+  typedef T& parameter_type;
+}; // end access_traits<T&>
+
+// forward declarations of get()
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::non_const_type
+// XXX we probably don't need to do this for any compiler we care about -jph
+//get(cons<HT, TT>& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N));
+get(detail::cons<HT, TT>& c);
+
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::const_type
+// XXX we probably don't need to do this for any compiler we care about -jph
+//get(const cons<HT, TT>& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N));
+get(const detail::cons<HT, TT>& c);
+
+namespace detail
+{
+
+// -- generate error template, referencing to non-existing members of this
+// template is used to produce compilation errors intentionally
+template<class T>
+class generate_error;
+
+// - cons getters --------------------------------------------------------
+// called: get_class<N>::get<RETURN_TYPE>(aTuple)
+
+template< int N >
+struct get_class
+{
+  template<class RET, class HT, class TT >
+  __host__ __device__
+  inline static RET get(const cons<HT, TT>& t)
+  {
+    // XXX we may not need to deal with this for any compiler we care about -jph
+    //return get_class<N-1>::BOOST_NESTED_TEMPLATE get<RET>(t.tail);
+    return get_class<N-1>::template get<RET>(t.tail);
+    
+    // gcc 4.3 couldn't compile this:
+    //return get_class<N-1>::get<RET>(t.tail);
+  }
+
+  template<class RET, class HT, class TT >
+  __host__ __device__
+  inline static RET get(cons<HT, TT>& t)
+  {
+    // XXX we may not need to deal with this for any compiler we care about -jph
+    //return get_class<N-1>::BOOST_NESTED_TEMPLATE get<RET>(t.tail);
+    return get_class<N-1>::template get<RET>(t.tail);
+
+    // gcc 4.3 couldn't compile this:
+    //return get_class<N-1>::get<RET>(t.tail);
+  }
+}; // end get_class
+
+template<>
+struct get_class<0>
+{
+  template<class RET, class HT, class TT>
+  __host__ __device__
+  inline static RET get(const cons<HT, TT>& t)
+  {
+    return t.head;
+  }
+
+  template<class RET, class HT, class TT>
+  __host__ __device__
+  inline static RET get(cons<HT, TT>& t)
+  {
+    return t.head;
+  }
+}; // get get_class<0>
+
+
+template <bool If, class Then, class Else> struct IF
+{
+  typedef Then RET;
+};
+
+template <class Then, class Else> struct IF<false, Then, Else>
+{
+  typedef Else RET;
+};
+
+//  These helper templates wrap void types and plain function types.
+//  The rationale is to allow one to write tuple types with those types
+//  as elements, even though it is not possible to instantiate such object.
+//  E.g: typedef tuple<void> some_type; // ok
+//  but: some_type x; // fails
+
+template <class T> class non_storeable_type
+{
+  __host__ __device__
+  non_storeable_type();
+};
+
+template <class T> struct wrap_non_storeable_type
+{
+  // XXX is_function looks complicated; punt for now -jph
+  //typedef typename IF<
+  //  ::thrust::detail::is_function<T>::value, non_storeable_type<T>, T
+  //>::RET type;
+
+  typedef T type;
+};
+
+template <> struct wrap_non_storeable_type<void>
+{
+  typedef non_storeable_type<void> type;
+};
+
+
+template <class HT, class TT>
+  struct cons
+{
+  typedef HT head_type;
+  typedef TT tail_type;
+
+  typedef typename
+    wrap_non_storeable_type<head_type>::type stored_head_type;
+
+  stored_head_type head;
+  tail_type tail;
+
+  inline __host__ __device__
+  typename access_traits<stored_head_type>::non_const_type
+  get_head() { return head; }
+
+  inline __host__ __device__
+  typename access_traits<tail_type>::non_const_type
+  get_tail() { return tail; }
+
+  inline __host__ __device__
+  typename access_traits<stored_head_type>::const_type
+  get_head() const { return head; }
+
+  inline __host__ __device__
+  typename access_traits<tail_type>::const_type
+  get_tail() const { return tail; }
+
+  inline __host__ __device__
+  cons(void) : head(), tail() {}
+  //  cons() : head(detail::default_arg<HT>::f()), tail() {}
+
+  // the argument for head is not strictly needed, but it prevents
+  // array type elements. This is good, since array type elements
+  // cannot be supported properly in any case (no assignment,
+  // copy works only if the tails are exactly the same type, ...)
+
+  inline __host__ __device__
+  cons(typename access_traits<stored_head_type>::parameter_type h,
+       const tail_type& t)
+    : head (h), tail(t) {}
+
+  template <class T1, class T2, class T3, class T4, class T5,
+            class T6, class T7, class T8, class T9, class T10>
+  inline __host__ __device__
+  cons( T1& t1, T2& t2, T3& t3, T4& t4, T5& t5,
+        T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 )
+    : head (t1),
+      tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast<const null_type&>(null_type()))
+      {}
+
+  template <class T2, class T3, class T4, class T5,
+            class T6, class T7, class T8, class T9, class T10>
+  inline __host__ __device__
+  cons( const null_type& /*t1*/, T2& t2, T3& t3, T4& t4, T5& t5,
+        T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 )
+    : head (),
+      tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast<const null_type&>(null_type()))
+      {}
+
+
+  template <class HT2, class TT2>
+  inline __host__ __device__
+  cons( const cons<HT2, TT2>& u ) : head(u.head), tail(u.tail) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
+  __thrust_exec_check_disable__
+  template <class HT2, class TT2>
+  inline __host__ __device__
+  cons& operator=( const cons<HT2, TT2>& u ) {
+    head=u.head; tail=u.tail; return *this;
+  }
+
+  // must define assignment operator explicitly, implicit version is
+  // illformed if HT is a reference (12.8. (12))
+  __thrust_exec_check_disable__
+  inline __host__ __device__
+  cons& operator=(const cons& u) {
+    head = u.head; tail = u.tail;  return *this;
+  }
+
+  // XXX enable when we support std::pair -jph
+  //template <class T1, class T2>
+  //__host__ __device__
+  //cons& operator=( const std::pair<T1, T2>& u ) {
+  //  //BOOST_STATIC_ASSERT(length<cons>::value == 2); // check length = 2
+  //  head = u.first; tail.head = u.second; return *this;
+  //}
+
+  // get member functions (non-const and const)
+  template <int N>
+  __host__ __device__
+  typename access_traits<
+             typename tuple_element<N, cons<HT, TT> >::type
+           >::non_const_type
+  get() {
+    return thrust::get<N>(*this); // delegate to non-member get
+  }
+
+  template <int N>
+  __host__ __device__
+  typename access_traits<
+             typename tuple_element<N, cons<HT, TT> >::type
+           >::const_type
+  get() const {
+    return thrust::get<N>(*this); // delegate to non-member get
+  }
+
+  inline __host__ __device__
+  void swap(cons &c)
+  {
+    using thrust::swap;
+
+    swap(head, c.head);
+    tail.swap(c.tail);
+  }
+};
+
+template <class HT>
+  struct cons<HT, null_type>
+{
+  typedef HT head_type;
+  typedef null_type tail_type;
+  typedef cons<HT, null_type> self_type;
+
+  typedef typename
+    wrap_non_storeable_type<head_type>::type stored_head_type;
+  stored_head_type head;
+
+  typename access_traits<stored_head_type>::non_const_type
+  inline __host__ __device__
+  get_head() { return head; }
+
+  inline __host__ __device__
+  null_type get_tail() { return null_type(); }
+
+  inline __host__ __device__
+  typename access_traits<stored_head_type>::const_type
+  get_head() const { return head; }
+
+  inline __host__ __device__
+  null_type get_tail() const { return null_type(); }
+
+  inline __host__ __device__
+  cons() : head() {}
+
+  inline __host__ __device__
+  cons(typename access_traits<stored_head_type>::parameter_type h,
+       const null_type& = null_type())
+    : head (h) {}
+
+  template<class T1>
+  inline __host__ __device__
+  cons(T1& t1, const null_type&, const null_type&, const null_type&,
+       const null_type&, const null_type&, const null_type&,
+       const null_type&, const null_type&, const null_type&)
+  : head (t1) {}
+
+  inline __host__ __device__
+  cons(const null_type&,
+       const null_type&, const null_type&, const null_type&,
+       const null_type&, const null_type&, const null_type&,
+       const null_type&, const null_type&, const null_type&)
+  : head () {}
+
+  template <class HT2>
+  inline __host__ __device__
+  cons( const cons<HT2, null_type>& u ) : head(u.head) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
+  __thrust_exec_check_disable__
+  template <class HT2>
+  inline __host__ __device__
+  cons& operator=(const cons<HT2, null_type>& u )
+  {
+    head = u.head;
+    return *this;
+  }
+
+  // must define assignment operator explicitly, implicit version
+  // is illformed if HT is a reference
+  inline __host__ __device__
+  cons& operator=(const cons& u) { head = u.head; return *this; }
+
+  template <int N>
+  inline __host__ __device__
+  typename access_traits<
+             typename tuple_element<N, self_type>::type
+            >::non_const_type
+  // XXX we probably don't need this for the compilers we care about -jph
+  //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N))
+  get(void)
+  {
+    return thrust::get<N>(*this);
+  }
+
+  template <int N>
+  inline __host__ __device__
+  typename access_traits<
+             typename tuple_element<N, self_type>::type
+           >::const_type
+  // XXX we probably don't need this for the compilers we care about -jph
+  //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N)) const
+  get(void) const
+  {
+    return thrust::get<N>(*this);
+  }
+
+  inline __host__ __device__
+  void swap(cons &c)
+  {
+    using thrust::swap;
+
+    swap(head, c.head);
+  }
+}; // end cons
+
+template <class T0, class T1, class T2, class T3, class T4,
+          class T5, class T6, class T7, class T8, class T9>
+  struct map_tuple_to_cons
+{
+  typedef cons<T0,
+               typename map_tuple_to_cons<T1, T2, T3, T4, T5,
+                                          T6, T7, T8, T9, null_type>::type
+              > type;
+}; // end map_tuple_to_cons
+
+// The empty tuple is a null_type
+template <>
+  struct map_tuple_to_cons<null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type>
+{
+  typedef null_type type;
+}; // end map_tuple_to_cons<...>
+
+
+
+// ---------------------------------------------------------------------------
+// The call_traits for make_tuple
+
+// Must be instantiated with plain or const plain types (not with references)
+
+// from template<class T> foo(const T& t) : make_tuple_traits<const T>::type
+// from template<class T> foo(T& t) : make_tuple_traits<T>::type
+
+// Conversions:
+// T -> T,
+// references -> compile_time_error
+// array -> const ref array
+
+
+template<class T>
+struct make_tuple_traits {
+  typedef T type;
+
+  // commented away, see below  (JJ)
+  //  typedef typename IF<
+  //  boost::is_function<T>::value,
+  //  T&,
+  //  T>::RET type;
+
+};
+
+// The is_function test was there originally for plain function types,
+// which can't be stored as such (we must either store them as references or
+// pointers). Such a type could be formed if make_tuple was called with a
+// reference to a function.
+// But this would mean that a const qualified function type was formed in
+// the make_tuple function and hence make_tuple can't take a function
+// reference as a parameter, and thus T can't be a function type.
+// So is_function test was removed.
+// (14.8.3. says that type deduction fails if a cv-qualified function type
+// is created. (It only applies for the case of explicitly specifying template
+// args, though?)) (JJ)
+
+template<class T>
+struct make_tuple_traits<T&> {
+  typedef typename
+     detail::generate_error<T&>::
+       do_not_use_with_reference_type error;
+};
+
+// Arrays can't be stored as plain types; convert them to references.
+// All arrays are converted to const. This is because make_tuple takes its
+// parameters as const T& and thus the knowledge of the potential
+// non-constness of actual argument is lost.
+template<class T, int n>  struct make_tuple_traits <T[n]> {
+  typedef const T (&type)[n];
+};
+
+template<class T, int n>
+struct make_tuple_traits<const T[n]> {
+  typedef const T (&type)[n];
+};
+
+template<class T, int n>  struct make_tuple_traits<volatile T[n]> {
+  typedef const volatile T (&type)[n];
+};
+
+template<class T, int n>
+struct make_tuple_traits<const volatile T[n]> {
+  typedef const volatile T (&type)[n];
+};
+
+// XXX enable these if we ever care about reference_wrapper -jph
+//template<class T>
+//struct make_tuple_traits<reference_wrapper<T> >{
+//  typedef T& type;
+//};
+//
+//template<class T>
+//struct make_tuple_traits<const reference_wrapper<T> >{
+//  typedef T& type;
+//};
+
+
+// a helper traits to make the make_tuple functions shorter (Vesa Karvonen's
+// suggestion)
+template <
+  class T0 = null_type, class T1 = null_type, class T2 = null_type,
+  class T3 = null_type, class T4 = null_type, class T5 = null_type,
+  class T6 = null_type, class T7 = null_type, class T8 = null_type,
+  class T9 = null_type
+>
+struct make_tuple_mapper {
+  typedef
+    tuple<typename make_tuple_traits<T0>::type,
+          typename make_tuple_traits<T1>::type,
+          typename make_tuple_traits<T2>::type,
+          typename make_tuple_traits<T3>::type,
+          typename make_tuple_traits<T4>::type,
+          typename make_tuple_traits<T5>::type,
+          typename make_tuple_traits<T6>::type,
+          typename make_tuple_traits<T7>::type,
+          typename make_tuple_traits<T8>::type,
+          typename make_tuple_traits<T9>::type> type;
+};
+
+} // end detail
+
+
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::non_const_type
+get(detail::cons<HT, TT>& c)
+{
+  //return detail::get_class<N>::BOOST_NESTED_TEMPLATE
+  
+  // gcc 4.3 couldn't compile this:
+  //return detail::get_class<N>::
+
+  return detail::get_class<N>::template
+         get<
+           typename access_traits<
+             typename tuple_element<N, detail::cons<HT, TT> >::type
+           >::non_const_type,
+           HT,TT
+         >(c);
+}
+
+
+// get function for const cons-lists, returns a const reference to
+// the element. If the element is a reference, returns the reference
+// as such (that is, can return a non-const reference)
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::const_type
+get(const detail::cons<HT, TT>& c)
+{
+  //return detail::get_class<N>::BOOST_NESTED_TEMPLATE
+
+  // gcc 4.3 couldn't compile this:
+  //return detail::get_class<N>::
+
+  return detail::get_class<N>::template
+         get<
+           typename access_traits<
+             typename tuple_element<N, detail::cons<HT, TT> >::type
+           >::const_type,
+           HT,TT
+         >(c);
+}
+
+
+template<class T0>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0>::type
+    make_tuple(const T0& t0)
+{
+  typedef typename detail::make_tuple_mapper<T0>::type t;
+  return t(t0);
+} // end make_tuple()
+
+template<class T0, class T1>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1>::type
+    make_tuple(const T0& t0, const T1& t1)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1>::type t;
+  return t(t0,t1);
+} // end make_tuple()
+
+template<class T0, class T1, class T2>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2>::type t;
+  return t(t0,t1,t2);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3>::type t;
+  return t(t0,t1,t2,t3);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4>::type t;
+  return t(t0,t1,t2,t3,t4);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4, class T5>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4,T5>::type t;
+  return t(t0,t1,t2,t3,t4,t5);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4,T5,T6>::type t;
+  return t(t0,t1,t2,t3,t4,t5,t6);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4,T5,T6,T7>::type t;
+  return t(t0,t1,t2,t3,t4,t5,t6,t7);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7, T8>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4,T5,T6,T7,T8>::type t;
+  return t(t0,t1,t2,t3,t4,t5,t6,t7,t8);
+} // end make_tuple()
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9)
+{
+  typedef typename detail::make_tuple_mapper<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>::type t;
+  return t(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9);
+} // end make_tuple()
+
+
+template<typename T0>
+__host__ __device__ inline
+tuple<T0&> tie(T0 &t0)
+{
+  return tuple<T0&>(t0);
+}
+
+template<typename T0,typename T1>
+__host__ __device__ inline
+tuple<T0&,T1&> tie(T0 &t0, T1 &t1)
+{
+  return tuple<T0&,T1&>(t0,t1);
+}
+
+template<typename T0,typename T1, typename T2>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&> tie(T0 &t0, T1 &t1, T2 &t2)
+{
+  return tuple<T0&,T1&,T2&>(t0,t1,t2);
+}
+
+template<typename T0,typename T1, typename T2, typename T3>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3)
+{
+  return tuple<T0&,T1&,T2&,T3&>(t0,t1,t2,t3);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&>(t0,t1,t2,t3,t4);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4, typename T5>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&,T5&>(t0,t1,t2,t3,t4,t5);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&>(t0,t1,t2,t3,t4,t5,t6);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&>(t0,t1,t2,t3,t4,t5,t6,t7);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&>(t0,t1,t2,t3,t4,t5,t6,t7,t8);
+}
+
+template<typename T0,typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9)
+{
+  return tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&>(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9);
+}
+
+template<
+  typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9,
+  typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9
+>
+__host__ __device__ inline
+void swap(thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> &x,
+          thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> &y)
+{
+  return x.swap(y);
+}
+
+
+
+namespace detail
+{
+
+template<class T1, class T2>
+__host__ __device__
+inline bool eq(const T1& lhs, const T2& rhs) {
+  return lhs.get_head() == rhs.get_head() &&
+         eq(lhs.get_tail(), rhs.get_tail());
+}
+template<>
+__host__ __device__
+inline bool eq<null_type,null_type>(const null_type&, const null_type&) { return true; }
+
+template<class T1, class T2>
+__host__ __device__
+inline bool neq(const T1& lhs, const T2& rhs) {
+  return lhs.get_head() != rhs.get_head()  ||
+         neq(lhs.get_tail(), rhs.get_tail());
+}
+template<>
+__host__ __device__
+inline bool neq<null_type,null_type>(const null_type&, const null_type&) { return false; }
+
+template<class T1, class T2>
+__host__ __device__
+inline bool lt(const T1& lhs, const T2& rhs) {
+  return (lhs.get_head() < rhs.get_head())  ||
+            (!(rhs.get_head() < lhs.get_head()) &&
+             lt(lhs.get_tail(), rhs.get_tail()));
+}
+template<>
+__host__ __device__
+inline bool lt<null_type,null_type>(const null_type&, const null_type&) { return false; }
+
+template<class T1, class T2>
+__host__ __device__
+inline bool gt(const T1& lhs, const T2& rhs) {
+  return (lhs.get_head() > rhs.get_head())  ||
+            (!(rhs.get_head() > lhs.get_head()) &&
+             gt(lhs.get_tail(), rhs.get_tail()));
+}
+template<>
+__host__ __device__
+inline bool gt<null_type,null_type>(const null_type&, const null_type&) { return false; }
+
+template<class T1, class T2>
+__host__ __device__
+inline bool lte(const T1& lhs, const T2& rhs) {
+  return lhs.get_head() <= rhs.get_head()  &&
+          ( !(rhs.get_head() <= lhs.get_head()) ||
+            lte(lhs.get_tail(), rhs.get_tail()));
+}
+template<>
+__host__ __device__
+inline bool lte<null_type,null_type>(const null_type&, const null_type&) { return true; }
+
+template<class T1, class T2>
+__host__ __device__
+inline bool gte(const T1& lhs, const T2& rhs) {
+  return lhs.get_head() >= rhs.get_head()  &&
+          ( !(rhs.get_head() >= lhs.get_head()) ||
+            gte(lhs.get_tail(), rhs.get_tail()));
+}
+template<>
+__host__ __device__
+inline bool gte<null_type,null_type>(const null_type&, const null_type&) { return true; }
+
+} // end detail
+
+
+
+// equal ----
+
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator==(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return  detail::eq(lhs, rhs);
+} // end operator==()
+
+// not equal -----
+
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator!=(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return detail::neq(lhs, rhs);
+} // end operator!=()
+
+// <
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator<(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return detail::lt(lhs, rhs);
+} // end operator<()
+
+// >
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator>(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return detail::gt(lhs, rhs);
+} // end operator>()
+
+// <=
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator<=(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return detail::lte(lhs, rhs);
+} // end operator<=()
+
+// >=
+template<class T1, class T2, class S1, class S2>
+__host__ __device__
+inline bool operator>=(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S2>& rhs)
+{
+  // XXX support this eventually -jph
+  //// check that tuple lengths are equal
+  //BOOST_STATIC_ASSERT(tuple_size<T2>::value == tuple_size<S2>::value);
+
+  return detail::gte(lhs, rhs);
+} // end operator>=()
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/tuple_algorithms.h b/thrust/thrust/detail/tuple_algorithms.h
new file mode 100644
index 0000000000000000000000000000000000000000..530de4b3f1205c4882d7c636c695e556b68bc004
--- /dev/null
+++ b/thrust/thrust/detail/tuple_algorithms.h
@@ -0,0 +1,111 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/integer_sequence.h>
+
+#include <tuple>
+
+namespace thrust
+{
+
+template <typename Tuple, std::size_t... Is>
+auto tuple_subset(Tuple&& t, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(std::make_tuple(std::get<Is>(THRUST_FWD(t))...));
+
+namespace detail
+{
+
+template <typename Tuple, typename F, std::size_t... Is>
+void tuple_for_each_impl(Tuple&& t, F&& f, index_sequence<Is...>)
+{
+  auto l = { (f(std::get<Is>(t)), 0)... };
+  THRUST_UNUSED_VAR(l);
+}
+
+template <typename Tuple, typename F, std::size_t... Is>
+auto tuple_transform_impl(Tuple&& t, F&& f, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(std::make_tuple(f(std::get<Is>(t))...));
+
+} // namespace detail
+
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...>& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...> const& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...>&& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    std::move(t)
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...>& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...> const& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...>&& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    std::move(t)
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/detail/tuple_meta_transform.h b/thrust/thrust/detail/tuple_meta_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..4aca1a91bb6a6932e357670475ef9c2d7149ffe5
--- /dev/null
+++ b/thrust/thrust/detail/tuple_meta_transform.h
@@ -0,0 +1,177 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/tuple.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         unsigned int sz = thrust::tuple_size<Tuple>::value>
+  struct tuple_meta_transform;
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,0>
+{
+  typedef null_type type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,1>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,2>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,3>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,4>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,5>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,6>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,7>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,8>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,9>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type
+  > type;
+};
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction>
+  struct tuple_meta_transform<Tuple,UnaryMetaFunction,10>
+{
+  typedef thrust::tuple<
+    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type,
+    typename UnaryMetaFunction<typename thrust::tuple_element<9,Tuple>::type>::type
+  > type;
+};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/tuple_transform.h b/thrust/thrust/detail/tuple_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..166fab3cb4b76000a9cf6454d743d2c6f30c4b67
--- /dev/null
+++ b/thrust/thrust/detail/tuple_transform.h
@@ -0,0 +1,418 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/tuple.h>
+#include <thrust/detail/tuple_meta_transform.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction,
+         unsigned int sz = thrust::tuple_size<Tuple>::value>
+  struct tuple_transform_functor;
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,0>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &, UnaryFunction)
+  {
+    return thrust::null_type();
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &, UnaryFunction)
+  {
+    return thrust::null_type();
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,1>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,2>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,3>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,4>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,5>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,6>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,7>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,8>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,9>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)),
+                     f(thrust::get<8>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)),
+                     f(thrust::get<8>(t)));
+  }
+};
+
+
+template<typename Tuple,
+         template<typename> class UnaryMetaFunction,
+         typename UnaryFunction>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,10>
+{
+  static __host__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)),
+                     f(thrust::get<8>(t)),
+                     f(thrust::get<9>(t)));
+  }
+
+  static __host__ __device__
+  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
+  {
+    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
+
+    return XfrmTuple(f(thrust::get<0>(t)),
+                     f(thrust::get<1>(t)),
+                     f(thrust::get<2>(t)),
+                     f(thrust::get<3>(t)),
+                     f(thrust::get<4>(t)),
+                     f(thrust::get<5>(t)),
+                     f(thrust::get<6>(t)),
+                     f(thrust::get<7>(t)),
+                     f(thrust::get<8>(t)),
+                     f(thrust::get<9>(t)));
+  }
+};
+
+
+template<template<typename> class UnaryMetaFunction,
+         typename Tuple,
+         typename UnaryFunction>
+typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+tuple_host_transform(const Tuple &t, UnaryFunction f)
+{
+  return tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction>::do_it_on_the_host(t,f);
+}
+
+template<template<typename> class UnaryMetaFunction,
+         typename Tuple,
+         typename UnaryFunction>
+typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
+__host__ __device__
+tuple_host_device_transform(const Tuple &t, UnaryFunction f)
+{
+  return tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction>::do_it_on_the_host_or_device(t,f);
+}
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_deduction.h b/thrust/thrust/detail/type_deduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..735b31d68297d5779eb32a2070fbb9e8fefdd5d4
--- /dev/null
+++ b/thrust/thrust/detail/type_deduction.h
@@ -0,0 +1,74 @@
+// Copyright (c)      2018 NVIDIA Corporation
+//                         (Bryce Adelstein Lelbach <brycelelbach@gmail.com>)
+// Copyright (c) 2013-2018 Eric Niebler (`THRUST_RETURNS`, etc)
+// Copyright (c) 2016-2018 Casey Carter (`THRUST_RETURNS`, etc)
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/preprocessor.h>
+
+#include <utility>
+#include <type_traits>
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_FWD(x)
+/// \brief Performs universal forwarding of a universal reference.
+///
+#define THRUST_FWD(x) ::std::forward<decltype(x)>(x)
+
+/// \def THRUST_MVCAP(x)
+/// \brief Capture `x` into a lambda by moving.
+///
+#define THRUST_MVCAP(x) x = ::std::move(x)
+
+/// \def THRUST_RETOF(invocable, ...)
+/// \brief Expands to the type returned by invoking an instance of the invocable
+///        type \a invocable with parameters of type \c __VA_ARGS__. Must
+///        be called with 1 or fewer parameters to the invocable.
+///
+#define THRUST_RETOF(...)   THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__)
+#define THRUST_RETOF1(C)    decltype(::std::declval<C>()())
+#define THRUST_RETOF2(C, V) decltype(::std::declval<C>()(::std::declval<V>()))
+
+/// \def THRUST_RETURNS(...)
+/// \brief Expands to a function definition that returns the expression
+///        \c __VA_ARGS__.
+///
+#define THRUST_RETURNS(...)                                                   \
+  noexcept(noexcept(__VA_ARGS__))                                             \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+
+/// \def THRUST_DECLTYPE_RETURNS(...)
+/// \brief Expands to a function definition, including a trailing returning
+///        type, that returns the expression \c __VA_ARGS__.
+///
+#define THRUST_DECLTYPE_RETURNS(...)                                          \
+  noexcept(noexcept(__VA_ARGS__))                                             \
+  -> decltype(__VA_ARGS__)                                                    \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+
+/// \def THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)
+/// \brief Expands to a function definition, including a trailing returning
+///        type, that returns the expression \c __VA_ARGS__. It shall only 
+///        participate in overload resolution if \c condition is \c true.
+///
+#define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)         \
+  noexcept(noexcept(__VA_ARGS__))                                             \
+  -> typename std::enable_if<condition, decltype(__VA_ARGS__)>::type          \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/detail/type_traits.h b/thrust/thrust/detail/type_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bfe60d31c686ffa5b628ef27748ce7ecbce7932
--- /dev/null
+++ b/thrust/thrust/detail/type_traits.h
@@ -0,0 +1,714 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file type_traits.h
+ *  \brief Temporarily define some type traits
+ *         until nvcc can compile tr1::type_traits.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  include <type_traits>
+#endif
+
+namespace thrust
+{
+
+// forward declaration of device_reference
+template<typename T> class device_reference;
+
+namespace detail
+{
+ /// helper classes [4.3].
+ template<typename T, T v>
+   struct integral_constant
+   {
+     THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT T value = v;
+
+     typedef T                       value_type;
+     typedef integral_constant<T, v> type;
+
+     // We don't want to switch to std::integral_constant, because we want access
+     // to the C++14 operator(), but we'd like standard traits to interoperate
+     // with our version when tag dispatching.
+     #if THRUST_CPP_DIALECT >= 2011
+     integral_constant() = default;
+
+     integral_constant(integral_constant const&) = default;
+
+     integral_constant& operator=(integral_constant const&) = default;
+
+     constexpr __host__ __device__
+     integral_constant(std::integral_constant<T, v>) noexcept {}
+     #endif
+
+     THRUST_CONSTEXPR __host__ __device__ operator value_type() const THRUST_NOEXCEPT { return value; }
+     THRUST_CONSTEXPR __host__ __device__ value_type operator()() const THRUST_NOEXCEPT { return value; }
+   };
+ 
+ /// typedef for true_type
+ typedef integral_constant<bool, true>  true_type;
+
+ /// typedef for true_type
+ typedef integral_constant<bool, false> false_type;
+
+//template<typename T> struct is_integral : public std::tr1::is_integral<T> {};
+template<typename T> struct is_integral                           : public false_type {};
+template<>           struct is_integral<bool>                     : public true_type {};
+template<>           struct is_integral<char>                     : public true_type {};
+template<>           struct is_integral<signed char>              : public true_type {};
+template<>           struct is_integral<unsigned char>            : public true_type {};
+template<>           struct is_integral<short>                    : public true_type {};
+template<>           struct is_integral<unsigned short>           : public true_type {};
+template<>           struct is_integral<int>                      : public true_type {};
+template<>           struct is_integral<unsigned int>             : public true_type {};
+template<>           struct is_integral<long>                     : public true_type {};
+template<>           struct is_integral<unsigned long>            : public true_type {};
+template<>           struct is_integral<long long>                : public true_type {};
+template<>           struct is_integral<unsigned long long>       : public true_type {};
+template<>           struct is_integral<const bool>               : public true_type {};
+template<>           struct is_integral<const char>               : public true_type {};
+template<>           struct is_integral<const unsigned char>      : public true_type {};
+template<>           struct is_integral<const short>              : public true_type {};
+template<>           struct is_integral<const unsigned short>     : public true_type {};
+template<>           struct is_integral<const int>                : public true_type {};
+template<>           struct is_integral<const unsigned int>       : public true_type {};
+template<>           struct is_integral<const long>               : public true_type {};
+template<>           struct is_integral<const unsigned long>      : public true_type {};
+template<>           struct is_integral<const long long>          : public true_type {};
+template<>           struct is_integral<const unsigned long long> : public true_type {};
+
+template<typename T> struct is_floating_point              : public false_type {};
+template<>           struct is_floating_point<float>       : public true_type {};
+template<>           struct is_floating_point<double>      : public true_type {};
+template<>           struct is_floating_point<long double> : public true_type {};
+
+template<typename T> struct is_arithmetic               : public is_integral<T> {};
+template<>           struct is_arithmetic<float>        : public true_type {};
+template<>           struct is_arithmetic<double>       : public true_type {};
+template<>           struct is_arithmetic<const float>  : public true_type {};
+template<>           struct is_arithmetic<const double> : public true_type {};
+
+template<typename T> struct is_pointer      : public false_type {};
+template<typename T> struct is_pointer<T *> : public true_type  {};
+
+template<typename T> struct is_device_ptr  : public false_type {};
+
+template<typename T> struct is_void             : public false_type {};
+template<>           struct is_void<void>       : public true_type {};
+template<>           struct is_void<const void> : public true_type {};
+
+template<typename T> struct is_non_bool_integral       : public is_integral<T> {};
+template<>           struct is_non_bool_integral<bool> : public false_type {};
+
+template<typename T> struct is_non_bool_arithmetic       : public is_arithmetic<T> {};
+template<>           struct is_non_bool_arithmetic<bool> : public false_type {};
+
+template<typename T> struct is_pod
+   : public integral_constant<
+       bool,
+       is_void<T>::value || is_pointer<T>::value || is_arithmetic<T>::value
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+// use intrinsic type traits
+       || __is_pod(T)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+// only use the intrinsic for >= 4.3
+#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+       || __is_pod(T)
+#endif // GCC VERSION
+#endif // THRUST_HOST_COMPILER
+     >
+ {};
+
+
+template<typename T> struct has_trivial_constructor
+  : public integral_constant<
+      bool,
+      is_pod<T>::value
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+      || __has_trivial_constructor(T)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+// only use the intrinsic for >= 4.3
+#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+      || __has_trivial_constructor(T)
+#endif // GCC VERSION
+#endif // THRUST_HOST_COMPILER
+      >
+{};
+
+template<typename T> struct has_trivial_copy_constructor
+  : public integral_constant<
+      bool,
+      is_pod<T>::value
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+      || __has_trivial_copy(T)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+// only use the intrinsic for >= 4.3
+#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+      || __has_trivial_copy(T)
+#endif // GCC VERSION
+#endif // THRUST_HOST_COMPILER
+    >
+{};
+
+template<typename T> struct has_trivial_destructor : public is_pod<T> {};
+
+template<typename T> struct is_const          : public false_type {};
+template<typename T> struct is_const<const T> : public true_type {};
+
+template<typename T> struct is_volatile             : public false_type {};
+template<typename T> struct is_volatile<volatile T> : public true_type {};
+
+template<typename T>
+  struct add_const
+{
+  typedef T const type;
+}; // end add_const
+
+template<typename T>
+  struct remove_const
+{
+  typedef T type;
+}; // end remove_const
+
+template<typename T>
+  struct remove_const<const T>
+{
+  typedef T type;
+}; // end remove_const
+
+template<typename T>
+  struct add_volatile
+{
+  typedef volatile T type;
+}; // end add_volatile
+
+template<typename T>
+  struct remove_volatile
+{
+  typedef T type;
+}; // end remove_volatile
+
+template<typename T>
+  struct remove_volatile<volatile T>
+{
+  typedef T type;
+}; // end remove_volatile
+
+template<typename T>
+  struct add_cv
+{
+  typedef const volatile T type;
+}; // end add_cv
+
+template<typename T>
+  struct remove_cv
+{
+  typedef typename remove_const<typename remove_volatile<T>::type>::type type;
+}; // end remove_cv
+
+
+template<typename T> struct is_reference     : public false_type {};
+template<typename T> struct is_reference<T&> : public true_type {};
+
+template<typename T> struct is_proxy_reference  : public false_type {};
+
+template<typename T> struct is_device_reference                                : public false_type {};
+template<typename T> struct is_device_reference< thrust::device_reference<T> > : public true_type {};
+
+
+// NB: Careful with reference to void.
+template<typename _Tp, bool = (is_void<_Tp>::value || is_reference<_Tp>::value)>
+  struct __add_reference_helper
+  { typedef _Tp&    type; };
+
+template<typename _Tp>
+  struct __add_reference_helper<_Tp, true>
+  { typedef _Tp     type; };
+
+template<typename _Tp>
+  struct add_reference
+    : public __add_reference_helper<_Tp>{};
+
+template<typename T>
+  struct remove_reference
+{
+  typedef T type;
+}; // end remove_reference
+
+template<typename T>
+  struct remove_reference<T&>
+{
+  typedef T type;
+}; // end remove_reference
+
+template<typename T1, typename T2>
+  struct is_same
+    : public false_type
+{
+}; // end is_same
+
+template<typename T>
+  struct is_same<T,T>
+    : public true_type
+{
+}; // end is_same
+
+template<typename T1, typename T2>
+  struct lazy_is_same
+    : is_same<typename T1::type, typename T2::type>
+{
+}; // end lazy_is_same
+
+template<typename T1, typename T2>
+  struct is_different
+    : public true_type
+{
+}; // end is_different
+
+template<typename T>
+  struct is_different<T,T>
+    : public false_type
+{
+}; // end is_different
+
+template<typename T1, typename T2>
+  struct lazy_is_different
+    : is_different<typename T1::type, typename T2::type>
+{
+}; // end lazy_is_different
+
+#if THRUST_CPP_DIALECT >= 2011
+
+using std::is_convertible;
+
+#else
+
+namespace tt_detail
+{
+
+template<typename T>
+  struct is_int_or_cref
+{
+  typedef typename remove_reference<T>::type type_sans_ref;
+  static const bool value = (is_integral<T>::value
+                             || (is_integral<type_sans_ref>::value
+                                 && is_const<type_sans_ref>::value
+                                 && !is_volatile<type_sans_ref>::value));
+}; // end is_int_or_cref
+
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN
+
+template<typename From, typename To>
+  struct is_convertible_sfinae
+{
+  private:
+    typedef char                          yes;
+    typedef struct { char two_chars[2]; } no;
+
+    static inline yes   test(To) { return yes(); }
+    static inline no    test(...) { return no(); } 
+    static inline typename remove_reference<From>::type& from() { typename remove_reference<From>::type* ptr = 0; return *ptr; }
+
+  public:
+    static const bool value = sizeof(test(from())) == sizeof(yes);
+}; // end is_convertible_sfinae
+
+
+THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+
+template<typename From, typename To>
+  struct is_convertible_needs_simple_test
+{
+  static const bool from_is_void      = is_void<From>::value;
+  static const bool to_is_void        = is_void<To>::value;
+  static const bool from_is_float     = is_floating_point<typename remove_reference<From>::type>::value;
+  static const bool to_is_int_or_cref = is_int_or_cref<To>::value;
+
+  static const bool value = (from_is_void || to_is_void || (from_is_float && to_is_int_or_cref));
+}; // end is_convertible_needs_simple_test
+
+
+template<typename From, typename To,
+         bool = is_convertible_needs_simple_test<From,To>::value>
+  struct is_convertible
+{
+  static const bool value = (is_void<To>::value
+                             || (is_int_or_cref<To>::value
+                                 && !is_void<From>::value));
+}; // end is_convertible
+
+
+template<typename From, typename To>
+  struct is_convertible<From, To, false>
+{
+  static const bool value = (is_convertible_sfinae<typename
+                             add_reference<From>::type, To>::value);
+}; // end is_convertible
+
+
+} // end tt_detail
+
+template<typename From, typename To>
+  struct is_convertible
+    : public integral_constant<bool, tt_detail::is_convertible<From, To>::value>
+{
+}; // end is_convertible
+
+#endif
+
+template<typename T1, typename T2>
+  struct is_one_convertible_to_the_other
+    : public integral_constant<
+        bool,
+        is_convertible<T1,T2>::value || is_convertible<T2,T1>::value
+      >
+{};
+
+
+// mpl stuff
+
+template <typename Condition1,               typename Condition2,              typename Condition3 = false_type,
+          typename Condition4  = false_type, typename Condition5 = false_type, typename Condition6 = false_type,
+          typename Condition7  = false_type, typename Condition8 = false_type, typename Condition9 = false_type,
+          typename Condition10 = false_type>
+  struct or_
+    : public integral_constant<
+        bool,
+        Condition1::value || Condition2::value || Condition3::value || Condition4::value || Condition5::value || Condition6::value || Condition7::value || Condition8::value || Condition9::value || Condition10::value
+      >
+{
+}; // end or_
+
+template <typename Condition1, typename Condition2, typename Condition3 = true_type>
+  struct and_
+    : public integral_constant<bool, Condition1::value && Condition2::value && Condition3::value>
+{
+}; // end and_
+
+template <typename Boolean>
+  struct not_
+    : public integral_constant<bool, !Boolean::value>
+{
+}; // end not_
+
+template<bool B, class T, class F>
+struct conditional { typedef T type; };
+ 
+template<class T, class F>
+struct conditional<false, T, F> { typedef F type; };
+
+template <bool, typename Then, typename Else>
+  struct eval_if
+{
+}; // end eval_if
+
+template<typename Then, typename Else>
+  struct eval_if<true, Then, Else>
+{
+  typedef typename Then::type type;
+}; // end eval_if
+
+template<typename Then, typename Else>
+  struct eval_if<false, Then, Else>
+{
+  typedef typename Else::type type;
+}; // end eval_if
+
+template<typename T>
+//  struct identity
+//  XXX WAR nvcc's confusion with thrust::identity
+  struct identity_
+{
+  typedef T type;
+}; // end identity
+
+template<bool, typename T = void> struct enable_if {};
+template<typename T>              struct enable_if<true, T> {typedef T type;};
+
+template<bool, typename T> struct lazy_enable_if {};
+template<typename T>       struct lazy_enable_if<true, T> {typedef typename T::type type;};
+
+template<bool condition, typename T = void> struct disable_if : enable_if<!condition, T> {};
+template<bool condition, typename T>        struct lazy_disable_if : lazy_enable_if<!condition, T> {};
+
+
+template<typename T1, typename T2, typename T = void>
+  struct enable_if_convertible
+    : enable_if< is_convertible<T1,T2>::value, T >
+{};
+
+
+template<typename T1, typename T2, typename T = void>
+  struct disable_if_convertible
+    : disable_if< is_convertible<T1,T2>::value, T >
+{};
+
+
+template<typename T1, typename T2, typename Result = void>
+  struct enable_if_different
+    : enable_if<is_different<T1,T2>::value, Result>
+{};
+
+
+template<typename T>
+  struct is_numeric
+    : and_<
+        is_convertible<int,T>,
+        is_convertible<T,int>
+      >
+{
+}; // end is_numeric
+
+
+template<typename> struct is_reference_to_const             : false_type {};
+template<typename T> struct is_reference_to_const<const T&> : true_type {};
+
+
+// make_unsigned follows
+
+namespace tt_detail
+{
+
+template<typename T> struct make_unsigned_simple;
+
+template<> struct make_unsigned_simple<char>                   { typedef unsigned char          type; };
+template<> struct make_unsigned_simple<signed char>            { typedef unsigned char          type; };
+template<> struct make_unsigned_simple<unsigned char>          { typedef unsigned char          type; };
+template<> struct make_unsigned_simple<short>                  { typedef unsigned short         type; };
+template<> struct make_unsigned_simple<unsigned short>         { typedef unsigned short         type; };
+template<> struct make_unsigned_simple<int>                    { typedef unsigned int           type; };
+template<> struct make_unsigned_simple<unsigned int>           { typedef unsigned int           type; };
+template<> struct make_unsigned_simple<long int>               { typedef unsigned long int      type; };
+template<> struct make_unsigned_simple<unsigned long int>      { typedef unsigned long int      type; };
+template<> struct make_unsigned_simple<long long int>          { typedef unsigned long long int type; };
+template<> struct make_unsigned_simple<unsigned long long int> { typedef unsigned long long int type; };
+
+template<typename T>
+  struct make_unsigned_base
+{
+  // remove cv
+  typedef typename remove_cv<T>::type remove_cv_t;
+
+  // get the simple unsigned type
+  typedef typename make_unsigned_simple<remove_cv_t>::type unsigned_remove_cv_t;
+
+  // add back const, volatile, both, or neither to the simple result
+  typedef typename eval_if<
+    is_const<T>::value && is_volatile<T>::value,
+    // add cv back
+    add_cv<unsigned_remove_cv_t>,
+    // check const & volatile individually
+    eval_if<
+      is_const<T>::value,
+      // add c back
+      add_const<unsigned_remove_cv_t>,
+      eval_if<
+        is_volatile<T>::value,
+        // add v back
+        add_volatile<unsigned_remove_cv_t>,
+        // original type was neither cv, return the simple unsigned result
+        identity_<unsigned_remove_cv_t>
+      >
+    >
+  >::type type;
+};
+
+} // end tt_detail
+
+template<typename T>
+  struct make_unsigned
+    : tt_detail::make_unsigned_base<T>
+{};
+
+struct largest_available_float
+{
+#if defined(__CUDA_ARCH__)
+#  if (__CUDA_ARCH__ < 130)
+  typedef float type;
+#  else
+  typedef double type;
+#  endif
+#else
+  typedef double type;
+#endif
+};
+
+// T1 wins if they are both the same size
+template<typename T1, typename T2>
+  struct larger_type
+    : thrust::detail::eval_if<
+        (sizeof(T2) > sizeof(T1)),
+        thrust::detail::identity_<T2>,
+        thrust::detail::identity_<T1>
+      >
+{};
+
+#if THRUST_CPP_DIALECT >= 2011
+
+using std::is_base_of;
+
+#else
+
+namespace is_base_of_ns
+{
+
+typedef char                          yes;
+typedef struct { char two_chars[2]; } no;
+
+template<typename Base, typename Derived>
+  struct host
+{
+  operator Base*() const;
+  operator Derived*();
+}; // end host
+
+template<typename Base, typename Derived>
+  struct impl
+{
+  template<typename T> static yes check(Derived *, T);
+  static no check(Base*, int);
+
+  static const bool value = sizeof(check(host<Base,Derived>(), int())) == sizeof(yes);
+}; // end impl
+
+} // end is_base_of_ns
+
+
+template<typename Base, typename Derived>
+  struct is_base_of
+    : integral_constant<
+        bool,
+        is_base_of_ns::impl<Base,Derived>::value
+      >
+{};
+
+#endif
+
+template<typename Base, typename Derived, typename Result = void>
+  struct enable_if_base_of
+    : enable_if<
+        is_base_of<Base,Derived>::value,
+        Result
+      >
+{};
+
+
+namespace is_assignable_ns
+{
+
+template<typename T1, typename T2>
+  class is_assignable
+{
+  typedef char                      yes_type;
+  typedef struct { char array[2]; } no_type;
+
+  template<typename T> static typename add_reference<T>::type declval();
+  
+  template<unsigned int> struct helper { typedef void * type; };
+
+  template<typename U1, typename U2> static yes_type test(typename helper<sizeof(declval<U1>() = declval<U2>())>::type);
+
+  template<typename,typename> static no_type test(...);
+
+  public:
+    static const bool value = sizeof(test<T1,T2>(0)) == 1;
+}; // end is_assignable
+
+} // end is_assignable_ns
+
+
+template<typename T1, typename T2>
+  struct is_assignable
+    : integral_constant<
+        bool,
+        is_assignable_ns::is_assignable<T1,T2>::value
+      >
+{};
+
+
+template<typename T>
+  struct is_copy_assignable
+    : is_assignable<
+        typename add_reference<T>::type,
+        typename add_reference<typename add_const<T>::type>::type
+      >
+{};
+
+
+template<typename T1, typename T2, typename Enable = void> struct promoted_numerical_type;
+
+template<typename T1, typename T2> 
+  struct promoted_numerical_type<T1,T2,typename enable_if<and_
+  <typename is_floating_point<T1>::type,typename is_floating_point<T2>::type>
+  ::value>::type>
+  {
+  typedef typename larger_type<T1,T2>::type type;
+  };
+
+template<typename T1, typename T2> 
+  struct promoted_numerical_type<T1,T2,typename enable_if<and_
+  <typename is_integral<T1>::type,typename is_floating_point<T2>::type>
+  ::value>::type>
+  {
+  typedef T2 type;
+  };
+
+template<typename T1, typename T2>
+  struct promoted_numerical_type<T1,T2,typename enable_if<and_
+  <typename is_floating_point<T1>::type, typename is_integral<T2>::type>
+  ::value>::type>
+  {
+  typedef T1 type;
+  };
+
+template<typename T>
+  struct is_empty_helper : public T
+  {
+  };
+
+struct is_empty_helper_base
+{
+};
+
+template<typename T>
+  struct is_empty : integral_constant<bool,
+    sizeof(is_empty_helper_base) == sizeof(is_empty_helper<T>)
+  >
+  {
+  };
+
+} // end detail
+
+using detail::integral_constant;
+using detail::true_type;
+using detail::false_type;
+
+} // end thrust
+
+#include <thrust/detail/type_traits/has_trivial_assign.h>
+
diff --git a/thrust/thrust/detail/type_traits/function_traits.h b/thrust/thrust/detail/type_traits/function_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c7775c0dc092545cf1a2b9a0a7012aacbd23687
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/function_traits.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+
+namespace thrust
+{
+
+// forward definitions for is_commutative
+template <typename T> struct plus;
+template <typename T> struct multiplies;
+template <typename T> struct minimum;
+template <typename T> struct maximum;
+template <typename T> struct logical_or;
+template <typename T> struct logical_and;
+template <typename T> struct bit_or;
+template <typename T> struct bit_and;
+template <typename T> struct bit_xor;
+
+namespace detail
+{
+
+
+// some metafunctions which check for the nested types of the adaptable functions
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_result_type, result_type)
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_argument_type, argument_type)
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_first_argument_type, first_argument_type)
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_second_argument_type, second_argument_type)
+
+
+template<typename AdaptableBinaryFunction>
+  struct result_type
+{
+  typedef typename AdaptableBinaryFunction::result_type type;
+};
+
+
+template<typename T>
+  struct is_adaptable_unary_function
+    : thrust::detail::and_<
+        has_result_type<T>,
+        has_argument_type<T>
+      >
+{};
+
+
+template<typename T>
+  struct is_adaptable_binary_function
+    : thrust::detail::and_<
+        has_result_type<T>,
+        thrust::detail::and_<
+          has_first_argument_type<T>,
+          has_second_argument_type<T>
+        >
+      >
+{};
+
+
+template<typename BinaryFunction>
+  struct is_commutative
+    : public thrust::detail::false_type
+{};
+
+template<typename T> struct is_commutative< typename thrust::plus<T>        > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::multiplies<T>  > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::minimum<T>     > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::maximum<T>     > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::logical_or<T>  > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::logical_and<T> > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::bit_or<T>      > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::bit_and<T>     > : public thrust::detail::is_arithmetic<T> {};
+template<typename T> struct is_commutative< typename thrust::bit_xor<T>     > : public thrust::detail::is_arithmetic<T> {};
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/type_traits/has_member_function.h b/thrust/thrust/detail/type_traits/has_member_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..03ed61b6db5ad47106ed3746f3725e3676c69d33
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/has_member_function.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+
+#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name)                                \
+template<typename T, typename Signature> class trait_name;                                                   \
+                                                                                                             \
+template<typename T, typename Result>                                                                        \
+class trait_name<T, Result(void)>                                                                            \
+{                                                                                                            \
+   class yes { char m; };                                                                                    \
+   class no { yes m[2]; };                                                                                   \
+   struct base_mixin                                                                                         \
+   {                                                                                                         \
+     Result member_function_name();                                                                          \
+   };                                                                                                        \
+   struct base : public T, public base_mixin {};                                                             \
+   template <typename U, U t>  class helper{};                                                               \
+   template <typename U>                                                                                     \
+   static no deduce(U*, helper<Result (base_mixin::*)(), &U::member_function_name>* = 0);                    \
+   static yes deduce(...);                                                                                   \
+public:                                                                                                      \
+   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
+   typedef thrust::detail::integral_constant<bool,value> type;                                               \
+};                                                                                                           \
+                                                                                                             \
+template<typename T, typename Result, typename Arg>                                                          \
+class trait_name<T, Result(Arg)>                                                                             \
+{                                                                                                            \
+   class yes { char m; };                                                                                    \
+   class no { yes m[2]; };                                                                                   \
+   struct base_mixin                                                                                         \
+   {                                                                                                         \
+     Result member_function_name(Arg);                                                                       \
+   };                                                                                                        \
+   struct base : public T, public base_mixin {};                                                             \
+   template <typename U, U t>  class helper{};                                                               \
+   template <typename U>                                                                                     \
+   static no deduce(U*, helper<Result (base_mixin::*)(Arg), &U::member_function_name>* = 0);                 \
+   static yes deduce(...);                                                                                   \
+public:                                                                                                      \
+   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
+   typedef thrust::detail::integral_constant<bool,value> type;                                               \
+};                                                                                                           \
+                                                                                                             \
+template<typename T, typename Result, typename Arg1, typename Arg2>                                          \
+class trait_name<T, Result(Arg1,Arg2)>                                                                       \
+{                                                                                                            \
+   class yes { char m; };                                                                                    \
+   class no { yes m[2]; };                                                                                   \
+   struct base_mixin                                                                                         \
+   {                                                                                                         \
+     Result member_function_name(Arg1,Arg2);                                                                 \
+   };                                                                                                        \
+   struct base : public T, public base_mixin {};                                                             \
+   template <typename U, U t>  class helper{};                                                               \
+   template <typename U>                                                                                     \
+   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2), &U::member_function_name>* = 0);           \
+   static yes deduce(...);                                                                                   \
+public:                                                                                                      \
+   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
+   typedef thrust::detail::integral_constant<bool,value> type;                                               \
+};                                                                                                           \
+                                                                                                             \
+template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3>                           \
+class trait_name<T, Result(Arg1,Arg2,Arg3)>                                                                  \
+{                                                                                                            \
+   class yes { char m; };                                                                                    \
+   class no { yes m[2]; };                                                                                   \
+   struct base_mixin                                                                                         \
+   {                                                                                                         \
+     Result member_function_name(Arg1,Arg2,Arg3);                                                            \
+   };                                                                                                        \
+   struct base : public T, public base_mixin {};                                                             \
+   template <typename U, U t>  class helper{};                                                               \
+   template <typename U>                                                                                     \
+   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3), &U::member_function_name>* = 0);      \
+   static yes deduce(...);                                                                                   \
+public:                                                                                                      \
+   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
+   typedef thrust::detail::integral_constant<bool,value> type;                                               \
+};                                                                                                           \
+                                                                                                             \
+template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3, typename Arg4>            \
+class trait_name<T, Result(Arg1,Arg2,Arg3,Arg4)>                                                             \
+{                                                                                                            \
+   class yes { char m; };                                                                                    \
+   class no { yes m[2]; };                                                                                   \
+   struct base_mixin                                                                                         \
+   {                                                                                                         \
+     Result member_function_name(Arg1,Arg2,Arg3,Arg4);                                                       \
+   };                                                                                                        \
+   struct base : public T, public base_mixin {};                                                             \
+   template <typename U, U t>  class helper{};                                                               \
+   template <typename U>                                                                                     \
+   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3,Arg4), &U::member_function_name>* = 0); \
+   static yes deduce(...);                                                                                   \
+public:                                                                                                      \
+   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
+   typedef thrust::detail::integral_constant<bool,value> type;                                               \
+};                                                                                                           
+
diff --git a/thrust/thrust/detail/type_traits/has_nested_type.h b/thrust/thrust/detail/type_traits/has_nested_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..78bb4b7f57a2f62d70ed6cf04bf58913b4100b69
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/has_nested_type.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+
+#define __THRUST_DEFINE_HAS_NESTED_TYPE(trait_name, nested_type_name) \
+template<typename T> \
+  struct trait_name  \
+{                    \
+  typedef char yes_type; \
+  typedef int  no_type;  \
+  template<typename S> static yes_type test(typename S::nested_type_name *); \
+  template<typename S> static no_type  test(...); \
+  static bool const value = sizeof(test<T>(0)) == sizeof(yes_type);\
+  typedef thrust::detail::integral_constant<bool, value> type;\
+};
+
diff --git a/thrust/thrust/detail/type_traits/has_trivial_assign.h b/thrust/thrust/detail/type_traits/has_trivial_assign.h
new file mode 100644
index 0000000000000000000000000000000000000000..01f26c7ef59cce18b5406af0a8a50cfc8df4a982
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/has_trivial_assign.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file type_traits.h
+ *  \brief Temporarily define some type traits
+ *         until nvcc can compile tr1::type_traits.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename T> struct has_trivial_assign
+  : public integral_constant<
+      bool,
+      (is_pod<T>::value && !is_const<T>::value)
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+      || __has_trivial_assign(T)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+// only use the intrinsic for >= 4.3
+#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+      || __has_trivial_assign(T)
+#endif // GCC VERSION
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+      || __has_trivial_assign(T)
+#endif // THRUST_HOST_COMPILER
+    >
+{};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_traits/is_call_possible.h b/thrust/thrust/detail/type_traits/is_call_possible.h
new file mode 100644
index 0000000000000000000000000000000000000000..bff0493772ba7b1dfe8f3846ab3270a6c2e8a368
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/is_call_possible.h
@@ -0,0 +1,161 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/has_member_function.h>
+
+// inspired by Roman Perepelitsa's presentation from comp.lang.c++.moderated
+// based on the implementation here: http://www.rsdn.ru/forum/cpp/2759773.1.aspx
+
+namespace thrust
+{
+namespace detail
+{
+namespace is_call_possible_detail
+{
+
+template<typename T> class void_exp_result {}; 
+
+template<typename T, typename U> 
+U const& operator,(U const&, void_exp_result<T>); 
+
+template<typename T, typename U> 
+U& operator,(U&, void_exp_result<T>); 
+
+template<typename src_type, typename dest_type> 
+struct clone_constness 
+{
+  typedef dest_type type; 
+}; 
+
+template<typename src_type, typename dest_type> 
+struct clone_constness<const src_type, dest_type> 
+{ 
+  typedef const dest_type type; 
+};
+
+} // end is_call_possible_detail
+} // end detail
+} // end thrust
+
+#define __THRUST_DEFINE_IS_CALL_POSSIBLE(trait_name, member_function_name)                                                                \
+__THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name##_has_member, member_function_name)                                                        \
+                                                                                                                                          \
+template <typename T, typename Signature>                                                                                                 \
+struct trait_name                                                                                                                         \
+{                                                                                                                                         \
+  private:                                                                                                                                \
+    struct yes {};                                                                                                                        \
+    struct no { yes m[2]; };                                                                                                              \
+    struct derived : public T                                                                                                             \
+    {                                                                                                                                     \
+      using T::member_function_name;                                                                                                      \
+      no member_function_name(...) const;                                                                                                 \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    typedef typename thrust::detail::is_call_possible_detail::clone_constness<T, derived>::type derived_type;                             \
+                                                                                                                                          \
+    template<typename U, typename Result>                                                                                                 \
+    struct return_value_check                                                                                                             \
+    {                                                                                                                                     \
+      static yes deduce(Result);                                                                                                          \
+      static no deduce(...);                                                                                                              \
+      static no deduce(no);                                                                                                               \
+      static no deduce(thrust::detail::is_call_possible_detail::void_exp_result<T>);                                                      \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<typename U>                                                                                                                  \
+    struct return_value_check<U, void>                                                                                                    \
+    {                                                                                                                                     \
+      static yes deduce(...);                                                                                                             \
+      static no deduce(no);                                                                                                               \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<bool has_the_member_of_interest, typename F>                                                                                 \
+    struct impl                                                                                                                           \
+    {                                                                                                                                     \
+      static const bool value = false;                                                                                                    \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<typename Result, typename Arg>                                                                                               \
+    struct impl<true, Result(Arg)>                                                                                                        \
+    {                                                                                                                                     \
+      static typename add_reference<derived_type>::type test_me;                                                                          \
+      static typename add_reference<Arg>::type          arg;                                                                              \
+                                                                                                                                          \
+      static const bool value =                                                                                                           \
+        sizeof(                                                                                                                           \
+                return_value_check<T, Result>::deduce(                                                                                    \
+                  (test_me.member_function_name(arg), thrust::detail::is_call_possible_detail::void_exp_result<T>())                      \
+                )                                                                                                                         \
+              ) == sizeof(yes);                                                                                                           \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<typename Result, typename Arg1, typename Arg2>                                                                               \
+    struct impl<true, Result(Arg1,Arg2)>                                                                                                  \
+    {                                                                                                                                     \
+      static typename add_reference<derived_type>::type test_me;                                                                          \
+      static typename add_reference<Arg1>::type         arg1;                                                                             \
+      static typename add_reference<Arg2>::type         arg2;                                                                             \
+                                                                                                                                          \
+      static const bool value =                                                                                                           \
+        sizeof(                                                                                                                           \
+                return_value_check<T, Result>::deduce(                                                                                    \
+                  (test_me.member_function_name(arg1,arg2), thrust::detail::is_call_possible_detail::void_exp_result<T>())                \
+                )                                                                                                                         \
+              ) == sizeof(yes);                                                                                                           \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<typename Result, typename Arg1, typename Arg2, typename Arg3>                                                                \
+    struct impl<true, Result(Arg1,Arg2,Arg3)>                                                                                             \
+    {                                                                                                                                     \
+      static typename add_reference<derived_type>::type test_me;                                                                          \
+      static typename add_reference<Arg1>::type         arg1;                                                                             \
+      static typename add_reference<Arg2>::type         arg2;                                                                             \
+      static typename add_reference<Arg3>::type         arg3;                                                                             \
+                                                                                                                                          \
+      static const bool value =                                                                                                           \
+        sizeof(                                                                                                                           \
+                return_value_check<T, Result>::deduce(                                                                                    \
+                  (test_me.member_function_name(arg1,arg2,arg3), thrust::detail::is_call_possible_detail::void_exp_result<T>())           \
+                )                                                                                                                         \
+              ) == sizeof(yes);                                                                                                           \
+    };                                                                                                                                    \
+                                                                                                                                          \
+    template<typename Result, typename Arg1, typename Arg2, typename Arg3, typename Arg4>                                                 \
+    struct impl<true, Result(Arg1,Arg2,Arg3,Arg4)>                                                                                        \
+    {                                                                                                                                     \
+      static typename add_reference<derived_type>::type test_me;                                                                          \
+      static typename add_reference<Arg1>::type         arg1;                                                                             \
+      static typename add_reference<Arg2>::type         arg2;                                                                             \
+      static typename add_reference<Arg3>::type         arg3;                                                                             \
+      static typename add_reference<Arg4>::type         arg4;                                                                             \
+                                                                                                                                          \
+      static const bool value =                                                                                                           \
+        sizeof(                                                                                                                           \
+                return_value_check<T, Result>::deduce(                                                                                    \
+                  (test_me.member_function_name(arg1,arg2,arg3,arg4), thrust::detail::is_call_possible_detail::void_exp_result<T>())      \
+                )                                                                                                                         \
+              ) == sizeof(yes);                                                                                                           \
+    };                                                                                                                                    \
+                                                                                                                                          \
+  public:                                                                                                                                 \
+    static const bool value = impl<trait_name##_has_member<T,Signature>::value, Signature>::value;                                        \
+    typedef thrust::detail::integral_constant<bool,value> type;                                                                           \
+}; 
+
diff --git a/thrust/thrust/detail/type_traits/is_metafunction_defined.h b/thrust/thrust/detail/type_traits/is_metafunction_defined.h
new file mode 100644
index 0000000000000000000000000000000000000000..c278e5bdb23121f7271fe339a1d9678ab415e9a9
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/is_metafunction_defined.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+__THRUST_DEFINE_HAS_NESTED_TYPE(is_metafunction_defined, type)
+
+template<typename Metafunction>
+  struct enable_if_defined
+    : thrust::detail::lazy_enable_if<
+        is_metafunction_defined<Metafunction>::value,
+        Metafunction
+      >
+{};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_traits/iterator/is_discard_iterator.h b/thrust/thrust/detail/type_traits/iterator/is_discard_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a5900de2b4e8b62cc6ff8b9ca57f11ee419602d
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/iterator/is_discard_iterator.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/discard_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template <typename Iterator>
+struct is_discard_iterator
+  : public thrust::detail::false_type
+{};
+
+template <typename System>
+struct is_discard_iterator< thrust::discard_iterator<System> >
+ : public thrust::detail::true_type
+{};
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/type_traits/iterator/is_output_iterator.h b/thrust/thrust/detail/type_traits/iterator/is_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6801305be01b903d7a3b9a8bd45101f709543f4
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/iterator/is_output_iterator.h
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/is_metafunction_defined.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/any_assign.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+
+template<typename T>
+  struct is_void_like
+    : thrust::detail::or_<
+        thrust::detail::is_void<T>,
+        thrust::detail::is_same<T,thrust::detail::any_assign>
+      >
+{}; // end is_void_like
+
+
+template<typename T>
+  struct lazy_is_void_like
+    : is_void_like<typename T::type>
+{}; // end lazy_is_void_like
+
+
+// XXX this meta function should first check that T is actually an iterator
+//
+//     if thrust::iterator_value<T> is defined and thrust::iterator_value<T>::type == void
+//       return false
+//     else
+//       return true
+template<typename T>
+  struct is_output_iterator
+    : eval_if<
+        is_metafunction_defined<thrust::iterator_value<T> >::value,
+        lazy_is_void_like<thrust::iterator_value<T> >,
+        thrust::detail::true_type
+      >::type
+{
+}; // end is_output_iterator
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_traits/minimum_type.h b/thrust/thrust/detail/type_traits/minimum_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e34f4f8a533403afa945716a18418583e55d0cc
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/minimum_type.h
@@ -0,0 +1,162 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{ 
+
+namespace minimum_type_detail
+{
+
+//
+// Returns the minimum type or is empty
+// if T1 and T2 are unrelated.
+//
+template <typename T1, typename T2, bool GreaterEqual, bool LessEqual> struct minimum_type_impl {};
+  
+template <typename T1, typename T2>
+struct minimum_type_impl<T1,T2,true,false>
+{
+  typedef T2 type;
+}; // end minimum_type_impl
+
+template <typename T1, typename T2>
+struct minimum_type_impl<T1,T2,false,true>
+{
+  typedef T1 type;
+}; // end minimum_type_impl
+
+template <typename T1, typename T2>
+struct minimum_type_impl<T1,T2,true,true>
+{
+  typedef T1 type;
+}; // end minimum_type_impl
+
+template <typename T1, typename T2>
+struct primitive_minimum_type
+  : minimum_type_detail::minimum_type_impl<
+      T1,
+      T2,
+      ::thrust::detail::is_convertible<T1,T2>::value,
+      ::thrust::detail::is_convertible<T2,T1>::value
+    >
+{
+}; // end primitive_minimum_type
+
+// because some types are not convertible (even to themselves)
+// specialize primitive_minimum_type for when both types are identical
+template <typename T>
+struct primitive_minimum_type<T,T>
+{
+  typedef T type;
+}; // end primitive_minimum_type
+
+// XXX this belongs somewhere more general
+struct any_conversion
+{
+  template<typename T> operator T (void);
+};
+
+} // end minimum_type_detail
+
+template<typename T1,
+         typename T2  = minimum_type_detail::any_conversion,
+         typename T3  = minimum_type_detail::any_conversion,
+         typename T4  = minimum_type_detail::any_conversion,
+         typename T5  = minimum_type_detail::any_conversion,
+         typename T6  = minimum_type_detail::any_conversion,
+         typename T7  = minimum_type_detail::any_conversion,
+         typename T8  = minimum_type_detail::any_conversion,
+         typename T9  = minimum_type_detail::any_conversion,
+         typename T10 = minimum_type_detail::any_conversion,
+         typename T11 = minimum_type_detail::any_conversion,
+         typename T12 = minimum_type_detail::any_conversion,
+         typename T13 = minimum_type_detail::any_conversion,
+         typename T14 = minimum_type_detail::any_conversion,
+         typename T15 = minimum_type_detail::any_conversion,
+         typename T16 = minimum_type_detail::any_conversion>
+  struct minimum_type;
+
+// base case
+template<typename T1, typename T2>
+  struct minimum_type<T1,T2>
+    : minimum_type_detail::primitive_minimum_type<T1,T2>
+{};
+
+template<typename T1, typename T2>
+  struct lazy_minimum_type
+    : minimum_type<
+        typename T1::type,
+        typename T2::type
+      >
+{};
+
+// carefully avoid referring to a nested ::type which may not exist
+template<typename T1,  typename T2,  typename T3,  typename T4,
+         typename T5,  typename T6,  typename T7,  typename T8,
+         typename T9,  typename T10, typename T11, typename T12,
+         typename T13, typename T14, typename T15, typename T16>
+  struct minimum_type
+    : lazy_minimum_type<
+        lazy_minimum_type<
+          lazy_minimum_type<
+            minimum_type<
+              T1,T2
+            >,
+            minimum_type<
+              T3,T4
+            >
+          >,
+          lazy_minimum_type<
+            minimum_type<
+              T5,T6
+            >,
+            minimum_type<
+              T7,T8
+            >
+          >
+        >,
+        lazy_minimum_type<
+          lazy_minimum_type<
+            minimum_type<
+              T9,T10
+            >,
+            minimum_type<
+              T11,T12
+            >
+          >,
+          lazy_minimum_type<
+            minimum_type<
+              T13,T14
+            >,
+            minimum_type<
+              T15,T16
+            >
+          >
+        >
+      >
+{};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_traits/pointer_traits.h b/thrust/thrust/detail/type_traits/pointer_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..48ac7d6dc4a5391504dd768702448d16e88cb6ad
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/pointer_traits.h
@@ -0,0 +1,371 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/is_metafunction_defined.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <cstddef>
+
+namespace thrust
+{
+namespace detail
+{
+
+template<typename Ptr> struct pointer_element;
+
+template<template<typename> class Ptr, typename Arg>
+  struct pointer_element<Ptr<Arg> >
+{
+  typedef Arg type;
+};
+
+template<template<typename,typename> class Ptr, typename Arg1, typename Arg2>
+  struct pointer_element<Ptr<Arg1,Arg2> >
+{
+  typedef Arg1 type;
+};
+
+template<template<typename,typename,typename> class Ptr, typename Arg1, typename Arg2, typename Arg3>
+  struct pointer_element<Ptr<Arg1,Arg2,Arg3> >
+{
+  typedef Arg1 type;
+};
+
+template<template<typename,typename,typename,typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
+  struct pointer_element<Ptr<Arg1,Arg2,Arg3,Arg4> >
+{
+  typedef Arg1 type;
+};
+
+template<template<typename,typename,typename,typename,typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
+  struct pointer_element<Ptr<Arg1,Arg2,Arg3,Arg4,Arg5> >
+{
+  typedef Arg1 type;
+};
+
+template<typename T>
+  struct pointer_element<T*>
+{
+  typedef T type;
+};
+
+template<typename Ptr>
+  struct pointer_difference
+{
+  typedef typename Ptr::difference_type type;
+};
+
+template<typename T>
+  struct pointer_difference<T*>
+{
+  typedef std::ptrdiff_t type;
+};
+
+template<typename Ptr, typename T> struct rebind_pointer;
+
+template<typename T, typename U>
+  struct rebind_pointer<T*,U>
+{
+  typedef U* type;
+};
+
+template<template<typename> class Ptr, typename Arg, typename T>
+  struct rebind_pointer<Ptr<Arg>,T>
+{
+  typedef Ptr<T> type;
+};
+
+template<template<typename, typename> class Ptr, typename Arg1, typename Arg2, typename T>
+  struct rebind_pointer<Ptr<Arg1,Arg2>,T>
+{
+  typedef Ptr<T,Arg2> type;
+};
+
+template<template<typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename T>
+  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3>,T>
+{
+  typedef Ptr<T,Arg2,Arg3> type;
+};
+
+template<template<typename, typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename T>
+  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3,Arg4>,T>
+{
+  typedef Ptr<T,Arg2,Arg3,Arg4> type;
+};
+
+// XXX this should probably be renamed native_type or similar
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_raw_pointer, raw_pointer)
+
+namespace pointer_traits_detail
+{
+
+template<typename Ptr, typename Enable = void> struct pointer_raw_pointer_impl {};
+
+template<typename T>
+  struct pointer_raw_pointer_impl<T*>
+{
+  typedef T* type;
+};
+
+template<typename Ptr>
+  struct pointer_raw_pointer_impl<Ptr, typename enable_if<has_raw_pointer<Ptr>::value>::type>
+{
+  typedef typename Ptr::raw_pointer type;
+};
+
+} // end pointer_traits_detail
+
+template<typename T>
+  struct pointer_raw_pointer
+    : pointer_traits_detail::pointer_raw_pointer_impl<T>
+{};
+
+namespace pointer_traits_detail
+{
+
+template<typename Void>
+  struct capture_address
+{
+  template<typename T>
+  __host__ __device__
+  capture_address(T &r)
+    : m_addr(&r)
+  {}
+
+  inline __host__ __device__
+  Void *operator&() const
+  {
+    return m_addr;
+  }
+
+  Void *m_addr;
+};
+
+// metafunction to compute the type of pointer_to's parameter below
+template<typename T>
+  struct pointer_to_param
+    : thrust::detail::eval_if<
+        thrust::detail::is_void<T>::value,
+        thrust::detail::identity_<capture_address<T> >,
+        thrust::detail::add_reference<T>
+      >
+{};
+
+}
+
+template<typename Ptr>
+  struct pointer_traits
+{
+  typedef Ptr                                    pointer;
+  typedef typename Ptr::reference                reference;
+  typedef typename pointer_element<Ptr>::type    element_type;
+  typedef typename pointer_difference<Ptr>::type difference_type;
+
+  template<typename U>
+    struct rebind 
+  {
+    typedef typename rebind_pointer<Ptr,U>::type other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    // XXX this is supposed to be pointer::pointer_to(&r); (i.e., call a static member function of pointer called pointer_to)
+    //     assume that pointer has a constructor from raw pointer instead
+    
+    return pointer(&r);
+  }
+
+  // thrust additions follow
+  typedef typename pointer_raw_pointer<Ptr>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr.get();
+  }
+};
+
+template<typename T>
+  struct pointer_traits<T*>
+{
+  typedef T*                                    pointer;
+  typedef T&                                    reference;
+  typedef T                                     element_type;
+  typedef typename pointer_difference<T*>::type difference_type;
+
+  template<typename U>
+    struct rebind
+  {
+    typedef U* other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    return &r;
+  }
+
+  // thrust additions follow
+  typedef typename pointer_raw_pointer<T*>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr;
+  }
+};
+
+template<>
+  struct pointer_traits<void*>
+{
+  typedef void*                                    pointer;
+  typedef void                                     reference;
+  typedef void                                     element_type;
+  typedef pointer_difference<void*>::type          difference_type;
+
+  template<typename U>
+    struct rebind
+  {
+    typedef U* other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    return &r;
+  }
+
+  // thrust additions follow
+  typedef pointer_raw_pointer<void*>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr;
+  }
+};
+
+template<>
+  struct pointer_traits<const void*>
+{
+  typedef const void*                           pointer;
+  typedef const void                            reference;
+  typedef const void                            element_type;
+  typedef pointer_difference<const void*>::type difference_type;
+
+  template<typename U>
+    struct rebind
+  {
+    typedef U* other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    return &r;
+  }
+
+  // thrust additions follow
+  typedef pointer_raw_pointer<const void*>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr;
+  }
+};
+
+template<typename FromPtr, typename ToPtr>
+  struct is_pointer_system_convertible
+    : thrust::detail::is_convertible<
+        typename iterator_system<FromPtr>::type,
+        typename iterator_system<ToPtr>::type
+      >
+{};
+
+template<typename FromPtr, typename ToPtr>
+  struct is_pointer_convertible
+    : thrust::detail::and_<
+        thrust::detail::is_convertible<
+          typename pointer_element<FromPtr>::type *,
+          typename pointer_element<ToPtr>::type *
+        >,
+        is_pointer_system_convertible<FromPtr, ToPtr>
+      >
+{};
+
+template<typename FromPtr, typename ToPtr>
+  struct is_void_pointer_system_convertible
+    : thrust::detail::and_<
+        thrust::detail::is_same<
+          typename pointer_element<FromPtr>::type,
+          void
+        >,
+        is_pointer_system_convertible<FromPtr, ToPtr>
+      >
+{};
+
+// this could be a lot better, but for our purposes, it's probably
+// sufficient just to check if pointer_raw_pointer<T> has meaning
+template<typename T>
+  struct is_thrust_pointer
+    : is_metafunction_defined<pointer_raw_pointer<T> >
+{};
+
+// avoid inspecting traits of the arguments if they aren't known to be pointers
+template<typename FromPtr, typename ToPtr>
+  struct lazy_is_pointer_convertible
+    : thrust::detail::eval_if<
+        is_thrust_pointer<FromPtr>::value && is_thrust_pointer<ToPtr>::value,
+        is_pointer_convertible<FromPtr,ToPtr>,
+        thrust::detail::identity_<thrust::detail::false_type>
+      >
+{};
+
+template<typename FromPtr, typename ToPtr>
+  struct lazy_is_void_pointer_system_convertible
+    : thrust::detail::eval_if<
+        is_thrust_pointer<FromPtr>::value && is_thrust_pointer<ToPtr>::value,
+        is_void_pointer_system_convertible<FromPtr,ToPtr>,
+        thrust::detail::identity_<thrust::detail::false_type>
+      >
+{};
+
+template<typename FromPtr, typename ToPtr, typename T = void>
+  struct enable_if_pointer_is_convertible
+    : thrust::detail::enable_if<
+        lazy_is_pointer_convertible<FromPtr,ToPtr>::type::value,
+        T
+      >
+{};
+
+template<typename FromPtr, typename ToPtr, typename T = void>
+  struct enable_if_void_pointer_is_system_convertible
+    : thrust::detail::enable_if<
+        lazy_is_void_pointer_system_convertible<FromPtr,ToPtr>::type::value,
+        T
+      >
+{};
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/type_traits/result_of_adaptable_function.h b/thrust/thrust/detail/type_traits/result_of_adaptable_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f91ff0b21546291abb474adac095f7974b19273
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/result_of_adaptable_function.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/function_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011 || defined(__cpp_lib_result_of_sfinae)
+// necessary for std::result_of
+#include <type_traits>
+#endif
+
+namespace thrust
+{
+namespace detail
+{
+
+// In the C++11 mode, by default, result_of_adaptable function inheritfrom std::result_of
+#if THRUST_CPP_DIALECT >= 2011 || defined(__cpp_lib_result_of_sfinae)
+template <typename Signature, typename Enable = void>
+struct result_of_adaptable_function : std::result_of<Signature> {};
+#else  /* cxx11 */
+template<typename Signature, typename Enable = void> 
+struct result_of_adaptable_function;
+#endif  /* cxx11 */
+
+// specialization for unary invocations of things which have result_type
+template<typename Functor, typename Arg1>
+  struct result_of_adaptable_function<
+    Functor(Arg1),
+    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
+  >
+{
+  typedef typename Functor::result_type type;
+}; // end result_of
+
+// specialization for binary invocations of things which have result_type
+template<typename Functor, typename Arg1, typename Arg2>
+  struct result_of_adaptable_function<
+    Functor(Arg1,Arg2),
+    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
+  >
+{
+  typedef typename Functor::result_type type;
+};
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/detail/uninitialized_copy.inl b/thrust/thrust/detail/uninitialized_copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..660df76d5983d80e6ea9ec80370618c92e7826a9
--- /dev/null
+++ b/thrust/thrust/detail/uninitialized_copy.inl
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uninitialized_copy.inl
+ *  \brief Inline file for uninitialized_copy.h.
+ */
+
+#include <thrust/uninitialized_copy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/uninitialized_copy.h>
+#include <thrust/system/detail/adl/uninitialized_copy.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result)
+{
+  using thrust::system::detail::generic::uninitialized_copy;
+  return uninitialized_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
+} // end uninitialized_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator, typename Size, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result)
+{
+  using thrust::system::detail::generic::uninitialized_copy_n;
+  return uninitialized_copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result);
+} // end uninitialized_copy_n()
+
+
+template<typename InputIterator,
+         typename ForwardIterator>
+  ForwardIterator uninitialized_copy(InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type   System1;
+  typedef typename thrust::iterator_system<ForwardIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::uninitialized_copy(select_system(system1,system2), first, last, result);
+} // end uninitialized_copy()
+
+
+template<typename InputIterator,
+         typename Size,
+         typename ForwardIterator>
+  ForwardIterator uninitialized_copy_n(InputIterator first,
+                                       Size n,
+                                       ForwardIterator result)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type   System1;
+  typedef typename thrust::iterator_system<ForwardIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::uninitialized_copy_n(select_system(system1,system2), first, n, result);
+} // end uninitialized_copy_n()
+
+
+} // end thrust
+
+
diff --git a/thrust/thrust/detail/uninitialized_fill.inl b/thrust/thrust/detail/uninitialized_fill.inl
new file mode 100644
index 0000000000000000000000000000000000000000..30eab23a219208db8174964405224d50a84a8192
--- /dev/null
+++ b/thrust/thrust/detail/uninitialized_fill.inl
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uninitialized_fill.inl
+ *  \brief Inline file for uninitialized_fill.h.
+ */
+
+#include <thrust/uninitialized_fill.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/uninitialized_fill.h>
+#include <thrust/system/detail/adl/uninitialized_fill.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void uninitialized_fill(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x)
+{
+  using thrust::system::detail::generic::uninitialized_fill;
+  return uninitialized_fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, x);
+} // end uninitialized_fill()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename ForwardIterator, typename Size, typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x)
+{
+  using thrust::system::detail::generic::uninitialized_fill_n;
+  return uninitialized_fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, x);
+} // end uninitialized_fill_n()
+
+
+template<typename ForwardIterator,
+         typename T>
+  void uninitialized_fill(ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  thrust::uninitialized_fill(select_system(system), first, last, x);
+} // end uninitialized_fill()
+
+
+template<typename ForwardIterator,
+         typename Size,
+         typename T>
+  ForwardIterator uninitialized_fill_n(ForwardIterator first,
+                                       Size n,
+                                       const T &x)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::uninitialized_fill_n(select_system(system), first, n, x);
+} // end uninitialized_fill_n()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/unique.inl b/thrust/thrust/detail/unique.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b6fa9304db1be34d7dba34a46202a962eff65954
--- /dev/null
+++ b/thrust/thrust/detail/unique.inl
@@ -0,0 +1,336 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file unique.inl
+ *  \brief Inline file for unique.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/unique.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/unique.h>
+#include <thrust/system/detail/generic/unique_by_key.h>
+#include <thrust/system/detail/adl/unique.h>
+#include <thrust/system/detail/adl/unique_by_key.h>
+
+namespace thrust
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last)
+{
+  using thrust::system::detail::generic::unique;
+  return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end unique()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last,
+                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique;
+  return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, binary_pred);
+} // end unique()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator output)
+{
+  using thrust::system::detail::generic::unique_copy;
+  return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output);
+} // end unique_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator output,
+                           BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique_copy;
+  return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output, binary_pred);
+} // end unique_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+  unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_last,
+                ForwardIterator2 values_first)
+{
+  using thrust::system::detail::generic::unique_by_key;
+  return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first);
+} // end unique_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+  unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_last,
+                ForwardIterator2 values_first,
+                BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique_by_key;
+  return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, binary_pred);
+} // end unique_by_key()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1 keys_first, 
+                     InputIterator1 keys_last,
+                     InputIterator2 values_first,
+                     OutputIterator1 keys_output,
+                     OutputIterator2 values_output)
+{
+  using thrust::system::detail::generic::unique_by_key_copy;
+  return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output);
+} // end unique_by_key_copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1 keys_first, 
+                     InputIterator1 keys_last,
+                     InputIterator2 values_first,
+                     OutputIterator1 keys_output,
+                     OutputIterator2 values_output,
+                     BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique_by_key_copy;
+  return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
+} // end unique_by_key_copy()
+
+
+template<typename ForwardIterator>
+  ForwardIterator unique(ForwardIterator first,
+                         ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique(select_system(system), first, last);
+} // end unique()
+
+
+template<typename ForwardIterator,
+         typename BinaryPredicate>
+  ForwardIterator unique(ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique(select_system(system), first, last, binary_pred);
+} // end unique()
+
+
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator unique_copy(InputIterator first,
+                             InputIterator last,
+                             OutputIterator output)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::unique_copy(select_system(system1,system2), first, last, output);
+} // end unique_copy()
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator unique_copy(InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator>::type  System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::unique_copy(select_system(system1,system2), first, last, output, binary_pred);
+} // end unique_copy()
+
+
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator1>::type System1;
+  typedef typename thrust::iterator_system<ForwardIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first);
+} // end unique_by_key()
+
+
+template<typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator1>::type System1;
+  typedef typename thrust::iterator_system<ForwardIterator2>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first, binary_pred);
+} // end unique_by_key()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output);
+} // end unique_by_key_copy()
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<InputIterator1>::type  System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type  System2;
+  typedef typename thrust::iterator_system<OutputIterator1>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator2>::type System4;
+
+  System1 system1;
+  System2 system2;
+  System3 system3;
+  System4 system4;
+
+  return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
+} // end unique_by_key_copy()
+
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/use_default.h b/thrust/thrust/detail/use_default.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba2c27bc58bb4abe62945587eb94238f7988b341
--- /dev/null
+++ b/thrust/thrust/detail/use_default.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+struct use_default {};
+
+} // end thrust
+
diff --git a/thrust/thrust/detail/util/align.h b/thrust/thrust/detail/util/align.h
new file mode 100644
index 0000000000000000000000000000000000000000..af97cd44a7cb86f195b38439dd1a10c111044c85
--- /dev/null
+++ b/thrust/thrust/detail/util/align.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/cstdint.h>
+
+// functions to handle memory alignment
+
+namespace thrust
+{
+namespace detail
+{
+namespace util
+{
+
+
+template<typename T>
+__host__ __device__
+T *align_up(T * ptr, detail::uintptr_t bytes)
+{
+  return (T *) ( bytes * (((detail::uintptr_t) ptr + (bytes - 1)) / bytes) );
+}
+
+
+template<typename T>
+__host__ __device__
+T *align_down(T * ptr, detail::uintptr_t bytes)
+{
+  return (T *) ( bytes * (detail::uintptr_t(ptr) / bytes) );
+}
+
+
+template<typename T>
+__host__ __device__
+bool is_aligned(T * ptr, detail::uintptr_t bytes = sizeof(T))
+{
+  return detail::uintptr_t(ptr) % bytes == 0;
+}
+
+
+} // end namespace util
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/detail/vector_base.h b/thrust/thrust/detail/vector_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..eecedfc14d88cfdf8355eadba0a3e188c04ceb81
--- /dev/null
+++ b/thrust/thrust/detail/vector_base.h
@@ -0,0 +1,588 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file vector_base.h
+ *  \brief Defines the interface to a base class for
+ *         host_vector & device_vector.
+ */
+
+#pragma once
+
+#include <thrust/iterator/detail/normal_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/config.h>
+#include <thrust/detail/contiguous_storage.h>
+#include <vector>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename T, typename Alloc>
+  class vector_base
+{
+  private:
+    typedef thrust::detail::contiguous_storage<T,Alloc> storage_type;
+
+  public:
+    // typedefs
+    typedef typename storage_type::value_type      value_type;
+    typedef typename storage_type::pointer         pointer;
+    typedef typename storage_type::const_pointer   const_pointer;
+    typedef typename storage_type::reference       reference;
+    typedef typename storage_type::const_reference const_reference;
+    typedef typename storage_type::size_type       size_type;
+    typedef typename storage_type::difference_type difference_type;
+    typedef typename storage_type::allocator_type  allocator_type;
+
+    typedef typename storage_type::iterator        iterator;
+    typedef typename storage_type::const_iterator  const_iterator;
+
+    typedef thrust::reverse_iterator<iterator>       reverse_iterator;
+    typedef thrust::reverse_iterator<const_iterator> const_reverse_iterator;
+
+    /*! This constructor creates an empty vector_base.
+     */
+    vector_base(void);
+
+    /*! This constructor creates an empty vector_base.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(const Alloc &alloc);
+
+    /*! This constructor creates a vector_base with default-constructed
+     *  elements.
+     *  \param n The number of elements to create.
+     */
+    explicit vector_base(size_type n);
+
+    /*! This constructor creates a vector_base with default-constructed
+     *  elements.
+     *  \param n The number of elements to create.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(size_type n, const Alloc &alloc);
+
+    /*! This constructor creates a vector_base with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     */
+    explicit vector_base(size_type n, const value_type &value);
+
+    /*! This constructor creates a vector_base with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(size_type n, const value_type &value, const Alloc &alloc);
+
+    /*! Copy constructor copies from an exemplar vector_base.
+     *  \param v The vector_base to copy.
+     */
+    vector_base(const vector_base &v);
+
+    /*! Copy constructor copies from an exemplar vector_base.
+     *  \param v The vector_base to copy.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    vector_base(const vector_base &v, const Alloc &alloc);
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another vector_base.
+     *  \param v The vector_base to move.
+     */
+    vector_base(vector_base &&v);
+
+    // FIXME: the internal Thrust machinery in range_init doesn't work with move
+    // iterators, which is necessary for the following constructor to be implemented
+    // correctly
+    // vector_base(vector_base &&v, const Alloc &alloc);
+  #endif
+
+    /*! Copy assign operator copies from another vector_base.
+     *  \param v The vector_base to copy.
+     */
+    vector_base &operator=(const vector_base &v);
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move assign operator moves from another vector_base.
+     *  \param v The vector_base to move.
+     */
+    vector_base &operator=(vector_base &&v);
+  #endif
+
+    /*! Copy constructor copies from an exemplar vector_base with different
+     *  type.
+     *  \param v The vector_base to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    vector_base(const vector_base<OtherT, OtherAlloc> &v);
+
+    /*! assign operator makes a copy of an exemplar vector_base with different
+     *  type.
+     *  \param v The vector_base to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    vector_base &operator=(const vector_base<OtherT,OtherAlloc> &v);
+
+    /*! Copy constructor copies from an exemplar std::vector.
+     *  \param v The std::vector to copy.
+     *  XXX TODO: Make this method redundant with a properly templatized constructor.
+     *            We would like to copy from a vector whose element type is anything
+     *            assignable to value_type.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    vector_base(const std::vector<OtherT, OtherAlloc> &v);
+
+    /*! assign operator makes a copy of an exemplar std::vector.
+     *  \param v The vector to copy.
+     *  XXX TODO: Templatize this assign on the type of the vector to copy from.
+     *            We would like to copy from a vector whose element type is anything
+     *            assignable to value_type.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    vector_base &operator=(const std::vector<OtherT,OtherAlloc> &v);
+
+    /*! This constructor builds a vector_base from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     */
+    template<typename InputIterator>
+    vector_base(InputIterator first, InputIterator last);
+
+    /*! This constructor builds a vector_base from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    template<typename InputIterator>
+    vector_base(InputIterator first, InputIterator last, const Alloc &alloc);
+
+    /*! The destructor erases the elements.
+     */
+    ~vector_base(void);
+
+    /*! \brief Resizes this vector_base to the specified number of elements.
+     *  \param new_size Number of elements this vector_base should contain.
+     *  \throw std::length_error If n exceeds max_size9).
+     *
+     *  This method will resize this vector_base to the specified number of
+     *  elements. If the number is smaller than this vector_base's current
+     *  size this vector_base is truncated, otherwise this vector_base is
+     *  extended and new elements are default constructed.
+     */
+    void resize(size_type new_size);
+
+    /*! \brief Resizes this vector_base to the specified number of elements.
+     *  \param new_size Number of elements this vector_base should contain.
+     *  \param x Data with which new elements should be populated.
+     *  \throw std::length_error If n exceeds max_size().
+     *
+     *  This method will resize this vector_base to the specified number of
+     *  elements.  If the number is smaller than this vector_base's current
+     *  size this vector_base is truncated, otherwise this vector_base is
+     *  extended and new elements are populated with given data.
+     */
+    void resize(size_type new_size, const value_type &x);
+
+    /*! Returns the number of elements in this vector_base.
+     */
+    size_type size(void) const;
+
+    /*! Returns the size() of the largest possible vector_base.
+     *  \return The largest possible return value of size().
+     */
+    size_type max_size(void) const;
+
+    /*! \brief If n is less than or equal to capacity(), this call has no effect.
+     *         Otherwise, this method is a request for allocation of additional memory. If
+     *         the request is successful, then capacity() is greater than or equal to
+     *         n; otherwise, capacity() is unchanged. In either case, size() is unchanged.
+     *  \throw std::length_error If n exceeds max_size().
+     */
+    void reserve(size_type n);
+
+    /*! Returns the number of elements which have been reserved in this
+     *  vector_base.
+     */
+    size_type capacity(void) const;
+
+    /*! This method shrinks the capacity of this vector_base to exactly
+     *  fit its elements.
+     */
+    void shrink_to_fit(void);
+
+    /*! \brief Subscript access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read/write reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    reference operator[](size_type n);
+
+    /*! \brief Subscript read access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    const_reference operator[](size_type n) const;
+
+    /*! This method returns an iterator pointing to the beginning of
+     *  this vector_base.
+     *  \return mStart
+     */
+    iterator begin(void);
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector_base.
+     *  \return mStart
+     */
+    const_iterator begin(void) const;
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector_base.
+     *  \return mStart
+     */
+    const_iterator cbegin(void) const;
+
+    /*! This method returns a reverse_iterator pointing to the beginning of
+     *  this vector_base's reversed sequence.
+     *  \return A reverse_iterator pointing to the beginning of this
+     *          vector_base's reversed sequence.
+     */
+    reverse_iterator rbegin(void);
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector_base's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector_base's reversed sequence.
+     */
+    const_reverse_iterator rbegin(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector_base's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector_base's reversed sequence.
+     */
+    const_reverse_iterator crbegin(void) const;
+
+    /*! This method returns an iterator pointing to one element past the
+     *  last of this vector_base.
+     *  \return begin() + size().
+     */
+    iterator end(void);
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector_base.
+     *  \return begin() + size().
+     */
+    const_iterator end(void) const;
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector_base.
+     *  \return begin() + size().
+     */
+    const_iterator cend(void) const;
+
+    /*! This method returns a reverse_iterator pointing to one element past the
+     *  last of this vector_base's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    reverse_iterator rend(void);
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector_base's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator rend(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector_base's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator crend(void) const;
+
+    /*! This method returns a const_reference referring to the first element of this
+     *  vector_base.
+     *  \return The first element of this vector_base.
+     */
+    const_reference front(void) const;
+
+    /*! This method returns a reference pointing to the first element of this
+     *  vector_base.
+     *  \return The first element of this vector_base.
+     */
+    reference front(void);
+
+    /*! This method returns a const reference pointing to the last element of
+     *  this vector_base.
+     *  \return The last element of this vector_base.
+     */
+    const_reference back(void) const;
+
+    /*! This method returns a reference referring to the last element of
+     *  this vector_dev.
+     *  \return The last element of this vector_base.
+     */
+    reference back(void);
+
+    /*! This method returns a pointer to this vector_base's first element.
+     *  \return A pointer to the first element of this vector_base.
+     */
+    pointer data(void);
+
+    /*! This method returns a const_pointer to this vector_base's first element.
+     *  \return a const_pointer to the first element of this vector_base.
+     */
+    const_pointer data(void) const;
+
+    /*! This method resizes this vector_base to 0.
+     */
+    void clear(void);
+
+    /*! This method returns true iff size() == 0.
+     *  \return true if size() == 0; false, otherwise.
+     */
+    bool empty(void) const;
+
+    /*! This method appends the given element to the end of this vector_base.
+     *  \param x The element to append.
+     */
+    void push_back(const value_type &x);
+
+    /*! This method erases the last element of this vector_base, invalidating
+     *  all iterators and references to it.
+     */
+    void pop_back(void);
+
+    /*! This method swaps the contents of this vector_base with another vector_base.
+     *  \param v The vector_base with which to swap.
+     */
+    void swap(vector_base &v);
+
+    /*! This method removes the element at position pos.
+     *  \param pos The position of the element of interest.
+     *  \return An iterator pointing to the new location of the element that followed the element
+     *          at position pos.
+     */
+    iterator erase(iterator pos);
+
+    /*! This method removes the range of elements [first,last) from this vector_base.
+     *  \param first The beginning of the range of elements to remove.
+     *  \param last The end of the range of elements to remove.
+     *  \return An iterator pointing to the new location of the element that followed the last
+     *          element in the sequence [first,last).
+     */
+    iterator erase(iterator first, iterator last);
+
+    /*! This method inserts a single copy of a given exemplar value at the
+     *  specified position in this vector_base.
+     *  \param position The insertion position.
+     *  \param x The exemplar element to copy & insert.
+     *  \return An iterator pointing to the newly inserted element.
+     */
+    iterator insert(iterator position, const T &x); 
+
+    /*! This method inserts a copy of an exemplar value to a range at the
+     *  specified position in this vector_base.
+     *  \param position The insertion position
+     *  \param n The number of insertions to perform.
+     *  \param x The value to replicate and insert.
+     */
+    void insert(iterator position, size_type n, const T &x);
+
+    /*! This method inserts a copy of an input range at the specified position
+     *  in this vector_base.
+     *  \param position The insertion position.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     */
+    template<typename InputIterator>
+    void insert(iterator position, InputIterator first, InputIterator last);
+
+    /*! This version of \p assign replicates a given exemplar
+     *  \p n times into this vector_base.
+     *  \param n The number of times to copy \p x.
+     *  \param x The exemplar element to replicate.
+     */
+    void assign(size_type n, const T &x);
+
+    /*! This version of \p assign makes this vector_base a copy of a given input range.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     */
+    template<typename InputIterator>
+    void assign(InputIterator first, InputIterator last);
+
+    /*! This method returns a copy of this vector's allocator.
+     *  \return A copy of the alloctor used by this vector.
+     */
+    allocator_type get_allocator(void) const;
+
+  protected:
+    // Our storage
+    storage_type m_storage;
+
+    // The size of this vector_base, in number of elements.
+    size_type m_size;
+
+  private:
+    // these methods resolve the ambiguity of the constructor template of form (Iterator, Iterator)
+    template<typename IteratorOrIntegralType>
+      void init_dispatch(IteratorOrIntegralType begin, IteratorOrIntegralType end, false_type); 
+
+    template<typename IteratorOrIntegralType>
+      void init_dispatch(IteratorOrIntegralType n, IteratorOrIntegralType value, true_type); 
+
+    template<typename InputIterator>
+      void range_init(InputIterator first, InputIterator last);
+
+    template<typename InputIterator>
+      void range_init(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag);
+
+    template<typename ForwardIterator>
+      void range_init(ForwardIterator first, ForwardIterator last, thrust::random_access_traversal_tag);
+
+    void default_init(size_type n);
+
+    void fill_init(size_type n, const T &x);
+
+    // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator)
+    template<typename InputIteratorOrIntegralType>
+      void insert_dispatch(iterator position, InputIteratorOrIntegralType first, InputIteratorOrIntegralType last, false_type);
+
+    // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator)
+    template<typename InputIteratorOrIntegralType>
+      void insert_dispatch(iterator position, InputIteratorOrIntegralType n, InputIteratorOrIntegralType x, true_type);
+
+    // this method appends n default-constructed elements at the end
+    void append(size_type n);
+
+    // this method performs insertion from a fill value
+    void fill_insert(iterator position, size_type n, const T &x);
+
+    // this method performs insertion from a range
+    template<typename InputIterator>
+      void copy_insert(iterator position, InputIterator first, InputIterator last);
+
+    // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator)
+    template<typename InputIterator>
+      void assign_dispatch(InputIterator first, InputIterator last, false_type);
+
+    // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator)
+    template<typename Integral>
+      void assign_dispatch(Integral n, Integral x, true_type);
+
+    // this method performs assignment from a range
+    template<typename InputIterator>
+      void range_assign(InputIterator first, InputIterator last);
+
+    // this method performs assignment from a range of RandomAccessIterators
+    template<typename RandomAccessIterator>
+      void range_assign(RandomAccessIterator first, RandomAccessIterator last, thrust::random_access_traversal_tag);
+
+    // this method performs assignment from a range of InputIterators
+    template<typename InputIterator>
+      void range_assign(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag);
+
+    // this method performs assignment from a fill value
+    void fill_assign(size_type n, const T &x);
+
+    // this method allocates new storage and construct copies the given range
+    template<typename ForwardIterator>
+    void allocate_and_copy(size_type requested_size,
+                           ForwardIterator first, ForwardIterator last,
+                           storage_type &new_storage);
+}; // end vector_base
+
+} // end detail
+
+/*! This function assigns the contents of vector a to vector b and the
+ *  contents of vector b to vector a.
+ *
+ *  \param a The first vector of interest. After completion, the contents
+ *           of b will be returned here.
+ *  \param b The second vector of interest. After completion, the contents
+ *           of a will be returned here.
+ */
+template<typename T, typename Alloc>
+  void swap(detail::vector_base<T,Alloc> &a,
+            detail::vector_base<T,Alloc> &b);
+
+
+/*! This operator allows comparison between two vectors.
+ *  \param lhs The first \p vector to compare.
+ *  \param rhs The second \p vector to compare.
+ *  \return \c true if and only if each corresponding element in either
+ *          \p vector equals the other; \c false, otherwise.
+ */
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
+                const detail::vector_base<T2,Alloc2>& rhs);
+    
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
+                const std::vector<T2,Alloc2>&         rhs);
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const std::vector<T1,Alloc1>&         lhs,
+                const detail::vector_base<T2,Alloc2>& rhs);
+
+/*! This operator allows comparison between two vectors.
+ *  \param lhs The first \p vector to compare.
+ *  \param rhs The second \p vector to compare.
+ *  \return \c false if and only if each corresponding element in either
+ *          \p vector equals the other; \c true, otherwise.
+ */
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
+                const detail::vector_base<T2,Alloc2>& rhs);
+    
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
+                const std::vector<T2,Alloc2>&         rhs);
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const std::vector<T1,Alloc1>&         lhs,
+                const detail::vector_base<T2,Alloc2>& rhs);
+
+} // end thrust
+
+#include <thrust/detail/vector_base.inl>
+
diff --git a/thrust/thrust/detail/vector_base.inl b/thrust/thrust/detail/vector_base.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2e233177080e54024bbe075a54884e38b685b836
--- /dev/null
+++ b/thrust/thrust/detail/vector_base.inl
@@ -0,0 +1,1290 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file vector_base.inl
+ *  \brief Inline file for vector_base.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/vector_base.h>
+#include <thrust/detail/copy.h>
+#include <thrust/detail/overlapped_copy.h>
+#include <thrust/equal.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/temporary_array.h>
+
+#include <stdexcept>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(void)
+      :m_storage(),
+       m_size(0)
+{
+  ;
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  ;
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n)
+      :m_storage(),
+       m_size(0)
+{
+  default_init(n);
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  default_init(n);
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n, const value_type &value)
+      :m_storage(),
+       m_size(0)
+{
+  fill_init(n,value);
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n, const value_type &value, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  fill_init(n,value);
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(const vector_base &v)
+      :m_storage(copy_allocator_t(), v.m_storage),
+       m_size(0)
+{
+  range_init(v.begin(), v.end());
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(const vector_base &v, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  range_init(v.begin(), v.end());
+} // end vector_base::vector_base()
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc>
+      ::vector_base(vector_base &&v)
+        :m_storage(copy_allocator_t(), v.m_storage),
+         m_size(0)
+  {
+    *this = std::move(v);
+  } //end vector_base::vector_base()
+#endif
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc> &
+    vector_base<T,Alloc>
+      ::operator=(const vector_base &v)
+{
+  if(this != &v)
+  {
+    m_storage.destroy_on_allocator_mismatch(v.m_storage, begin(), end());
+    m_storage.deallocate_on_allocator_mismatch(v.m_storage);
+
+    m_storage.propagate_allocator(v.m_storage);
+
+    assign(v.begin(), v.end());
+  } // end if
+
+  return *this;
+} // end vector_base::operator=()
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+        ::operator=(vector_base &&v)
+  {
+    m_storage.destroy(begin(), end());
+    m_storage = std::move(v.m_storage);
+    m_size = std::move(v.m_size);
+
+    v.m_storage = contiguous_storage<T,Alloc>(copy_allocator_t(), m_storage);
+    v.m_size = 0;
+
+    return *this;
+  } // end vector_base::operator=()
+#endif
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    vector_base<T,Alloc>
+      ::vector_base(const vector_base<OtherT,OtherAlloc> &v)
+        :m_storage(),
+         m_size(0)
+{
+  range_init(v.begin(), v.end());
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+        ::operator=(const vector_base<OtherT,OtherAlloc> &v)
+{
+  assign(v.begin(), v.end());
+
+  return *this;
+} // end vector_base::operator=()
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    vector_base<T,Alloc>
+      ::vector_base(const std::vector<OtherT,OtherAlloc> &v)
+        :m_storage(),
+         m_size(0)
+{
+  range_init(v.begin(), v.end());
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  template<typename OtherT, typename OtherAlloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+        ::operator=(const std::vector<OtherT,OtherAlloc> &v)
+{
+  assign(v.begin(), v.end());
+
+  return *this;
+} // end vector_base::operator=()
+
+template<typename T, typename Alloc>
+  template<typename IteratorOrIntegralType>
+    void vector_base<T,Alloc>
+      ::init_dispatch(IteratorOrIntegralType n,
+                      IteratorOrIntegralType value,
+                      true_type)
+{
+  fill_init(n,value);
+} // end vector_base::init_dispatch()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::default_init(size_type n)
+{
+  if(n > 0)
+  {
+    m_storage.allocate(n);
+    m_size = n;
+
+    m_storage.default_construct_n(begin(), size());
+  } // end if
+} // end vector_base::default_init()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::fill_init(size_type n, const T &x)
+{
+  if(n > 0)
+  {
+    m_storage.allocate(n);
+    m_size = n;
+
+    m_storage.uninitialized_fill_n(begin(), size(), x);
+  } // end if
+} // end vector_base::fill_init()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::init_dispatch(InputIterator first,
+                      InputIterator last,
+                      false_type)
+{
+  range_init(first, last);
+} // end vector_base::init_dispatch()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::range_init(InputIterator first,
+                   InputIterator last)
+{
+  range_init(first, last,
+    typename thrust::iterator_traversal<InputIterator>::type());
+} // end vector_base::range_init()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::range_init(InputIterator first,
+                   InputIterator last,
+                   thrust::incrementable_traversal_tag)
+{
+  for(; first != last; ++first)
+    push_back(*first);
+} // end vector_base::range_init()
+
+template<typename T, typename Alloc>
+  template<typename ForwardIterator>
+    void vector_base<T,Alloc>
+      ::range_init(ForwardIterator first,
+                   ForwardIterator last,
+                   thrust::random_access_traversal_tag)
+{
+  size_type new_size = thrust::distance(first, last);
+
+  allocate_and_copy(new_size, first, last, m_storage);
+  m_size    = new_size;
+} // end vector_base::range_init()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    vector_base<T,Alloc>
+      ::vector_base(InputIterator first,
+                    InputIterator last)
+        :m_storage(),
+         m_size(0)
+{
+  // check the type of InputIterator: if it's an integral type,
+  // we need to interpret this call as (size_type, value_type)
+  typedef thrust::detail::is_integral<InputIterator> Integer;
+
+  init_dispatch(first, last, Integer());
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    vector_base<T,Alloc>
+      ::vector_base(InputIterator first,
+                    InputIterator last,
+                    const Alloc &alloc)
+        :m_storage(alloc),
+         m_size(0)
+{
+  // check the type of InputIterator: if it's an integral type,
+  // we need to interpret this call as (size_type, value_type)
+  typedef thrust::detail::is_integral<InputIterator> Integer;
+
+  init_dispatch(first, last, Integer());
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::resize(size_type new_size)
+{
+  if(new_size < size())
+  {
+    iterator new_end = begin();
+    thrust::advance(new_end, new_size);
+    erase(new_end, end());
+  } // end if
+  else
+  {
+    append(new_size - size());
+  } // end else
+} // end vector_base::resize()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::resize(size_type new_size, const value_type &x)
+{
+  if(new_size < size())
+  {
+    iterator new_end = begin();
+    thrust::advance(new_end, new_size);
+    erase(new_end, end());
+  } // end if
+  else
+  {
+    insert(end(), new_size - size(), x);
+  } // end else
+} // end vector_base::resize()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::size_type
+    vector_base<T,Alloc>
+      ::size(void) const
+{
+  return m_size;
+} // end vector_base::size()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::size_type
+    vector_base<T,Alloc>
+      ::max_size(void) const
+{
+  return m_storage.max_size();
+} // end vector_base::max_size()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::reserve(size_type n)
+{
+  if(n > capacity())
+  {
+    allocate_and_copy(n, begin(), end(), m_storage);
+  } // end if
+} // end vector_base::reserve()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::size_type
+    vector_base<T,Alloc>
+      ::capacity(void) const
+{
+  return m_storage.size();
+} // end vector_base::capacity()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::shrink_to_fit(void)
+{
+  // use the swap trick
+  vector_base(*this).swap(*this);
+} // end vector_base::shrink_to_fit()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::reference
+    vector_base<T,Alloc>
+      ::operator[](const size_type n)
+{
+  return m_storage[n];
+} // end vector_base::operator[]
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reference
+    vector_base<T,Alloc>
+      ::operator[](const size_type n) const
+{
+  return m_storage[n];
+} // end vector_base::operator[]
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::iterator
+    vector_base<T,Alloc>
+      ::begin(void)
+{
+  return m_storage.begin();
+} // end vector_base::begin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_iterator
+    vector_base<T,Alloc>
+      ::begin(void) const
+{
+  return m_storage.begin();
+} // end vector_base::begin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_iterator
+    vector_base<T,Alloc>
+      ::cbegin(void) const
+{
+  return begin();
+} // end vector_base::cbegin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::reverse_iterator
+    vector_base<T,Alloc>
+      ::rbegin(void)
+{
+  return reverse_iterator(end());
+} // end vector_base::rbegin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reverse_iterator
+    vector_base<T,Alloc>
+      ::rbegin(void) const
+{
+  return const_reverse_iterator(end());
+} // end vector_base::rbegin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reverse_iterator
+    vector_base<T,Alloc>
+      ::crbegin(void) const
+{
+  return rbegin();
+} // end vector_base::crbegin()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::iterator
+    vector_base<T,Alloc>
+      ::end(void)
+{
+  iterator result = begin();
+  thrust::advance(result, size());
+  return result;
+} // end vector_base::end()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_iterator
+    vector_base<T,Alloc>
+      ::end(void) const
+{
+  const_iterator result = begin();
+  thrust::advance(result, size());
+  return result;
+} // end vector_base::end()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_iterator
+    vector_base<T,Alloc>
+      ::cend(void) const
+{
+  return end();
+} // end vector_base::cend()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::reverse_iterator
+    vector_base<T,Alloc>
+      ::rend(void)
+{
+  return reverse_iterator(begin());
+} // end vector_base::rend()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reverse_iterator
+    vector_base<T,Alloc>
+      ::rend(void) const
+{
+  return const_reverse_iterator(begin());
+} // end vector_base::rend()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reverse_iterator
+    vector_base<T,Alloc>
+      ::crend(void) const
+{
+  return rend();
+} // end vector_base::crend()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reference
+    vector_base<T,Alloc>
+      ::front(void) const
+{
+  return *begin();
+} // end vector_base::front()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::reference
+    vector_base<T,Alloc>
+      ::front(void)
+{
+  return *begin();
+} // end vector_base::front()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_reference
+    vector_base<T,Alloc>
+      ::back(void) const
+{
+  const_iterator ptr_to_back = end();
+  --ptr_to_back;
+  return *ptr_to_back;
+} // end vector_base::vector_base
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::reference
+    vector_base<T,Alloc>
+      ::back(void)
+{
+  iterator ptr_to_back = end();
+  --ptr_to_back;
+  return *ptr_to_back;
+} // end vector_base::vector_base
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::pointer
+    vector_base<T,Alloc>
+      ::data(void)
+{
+  return pointer(&front());
+} // end vector_base::data()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::const_pointer
+    vector_base<T,Alloc>
+      ::data(void) const
+{
+  return const_pointer(&front());
+} // end vector_base::data()
+
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::~vector_base(void)
+{
+  // destroy every living thing
+  if (!empty())
+    m_storage.destroy(begin(),end());
+} // end vector_base::~vector_base()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::clear(void)
+{
+  erase(begin(), end());
+} // end vector_base::~vector_dev()
+
+template<typename T, typename Alloc>
+  bool vector_base<T,Alloc>
+    ::empty(void) const
+{
+  return size() == 0;
+} // end vector_base::empty();
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::push_back(const value_type &x)
+{
+  insert(end(), x);
+} // end vector_base::push_back()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::pop_back(void)
+{
+  iterator e = end();
+  iterator ptr_to_back = e;
+  --ptr_to_back;
+  m_storage.destroy(ptr_to_back, e);
+  --m_size;
+} // end vector_base::pop_back()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::iterator vector_base<T,Alloc>
+    ::erase(iterator pos)
+{
+  iterator end = pos;
+  ++end;
+  return erase(pos,end);
+} // end vector_base::erase()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::iterator vector_base<T,Alloc>
+    ::erase(iterator first, iterator last)
+{
+  // overlap copy the range [last,end()) to first
+  // XXX this copy only potentially overlaps
+  iterator i = thrust::detail::overlapped_copy(last, end(), first);
+
+  // destroy everything after i
+  m_storage.destroy(i, end());
+
+  // modify our size
+  m_size -= (last - first);
+
+  // return an iterator pointing to the position of the first element
+  // following the erased range
+  return first;
+} // end vector_base::erase()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::swap(vector_base &v)
+{
+  thrust::swap(m_storage,  v.m_storage);
+  thrust::swap(m_size,     v.m_size);
+} // end vector_base::swap()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::assign(size_type n, const T &x)
+{
+  fill_assign(n, x);
+} // end vector_base::assign()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::assign(InputIterator first, InputIterator last)
+{
+  // we could have received assign(n, x), so disambiguate on the
+  // type of InputIterator
+  typedef typename thrust::detail::is_integral<InputIterator> integral;
+
+  assign_dispatch(first, last, integral());
+} // end vector_base::assign()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::allocator_type
+    vector_base<T,Alloc>
+      ::get_allocator(void) const
+{
+  return m_storage.get_allocator();
+} // end vector_base::get_allocator()
+
+template<typename T, typename Alloc>
+  typename vector_base<T,Alloc>::iterator
+    vector_base<T,Alloc>
+      ::insert(iterator position, const T &x)
+{
+  // find the index of the insertion
+  size_type index = thrust::distance(begin(), position);
+
+  // make the insertion
+  insert(position, 1, x);
+
+  // return an iterator pointing back to position
+  iterator result = begin();
+  thrust::advance(result, index);
+  return result;
+} // end vector_base::insert()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::insert(iterator position, size_type n, const T &x)
+{
+  fill_insert(position, n, x);
+} // end vector_base::insert()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::insert(iterator position, InputIterator first, InputIterator last)
+{
+  // we could have received insert(position, n, x), so disambiguate on the
+  // type of InputIterator
+  typedef typename thrust::detail::is_integral<InputIterator> integral;
+
+  insert_dispatch(position, first, last, integral());
+} // end vector_base::insert()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::assign_dispatch(InputIterator first, InputIterator last, false_type)
+{
+  range_assign(first, last);
+} // end vector_base::assign_dispatch()
+
+template<typename T, typename Alloc>
+  template<typename Integral>
+    void vector_base<T,Alloc>
+      ::assign_dispatch(Integral n, Integral x, true_type)
+{
+  fill_assign(n, x);
+} // end vector_base::assign_dispatch()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::insert_dispatch(iterator position, InputIterator first, InputIterator last, false_type)
+{
+  copy_insert(position, first, last);
+} // end vector_base::insert_dispatch()
+
+template<typename T, typename Alloc>
+  template<typename Integral>
+    void vector_base<T,Alloc>
+      ::insert_dispatch(iterator position, Integral n, Integral x, true_type)
+{
+  fill_insert(position, n, x);
+} // end vector_base::insert_dispatch()
+
+template<typename T, typename Alloc>
+  template<typename ForwardIterator>
+    void vector_base<T,Alloc>
+      ::copy_insert(iterator position,
+                    ForwardIterator first,
+                    ForwardIterator last)
+{
+  if(first != last)
+  {
+    // how many new elements will we create?
+    const size_type num_new_elements = thrust::distance(first, last);
+    if(capacity() - size() >= num_new_elements)
+    {
+      // we've got room for all of them
+      // how many existing elements will we displace?
+      const size_type num_displaced_elements = end() - position;
+      iterator old_end = end();
+
+      if(num_displaced_elements > num_new_elements)
+      {
+        // construct copy n displaced elements to new elements
+        // following the insertion
+        m_storage.uninitialized_copy(end() - num_new_elements, end(), end());
+
+        // extend the size
+        m_size += num_new_elements;
+
+        // copy num_displaced_elements - num_new_elements elements to existing elements
+        // this copy overlaps
+        const size_type copy_length = (old_end - num_new_elements) - position;
+        thrust::detail::overlapped_copy(position, old_end - num_new_elements, old_end - copy_length);
+
+        // finally, copy the range to the insertion point
+        thrust::copy(first, last, position);
+      } // end if
+      else
+      {
+        ForwardIterator mid = first;
+        thrust::advance(mid, num_displaced_elements);
+
+        // construct copy new elements at the end of the vector
+        m_storage.uninitialized_copy(mid, last, end());
+
+        // extend the size
+        m_size += num_new_elements - num_displaced_elements;
+
+        // construct copy the displaced elements
+        m_storage.uninitialized_copy(position, old_end, end());
+
+        // extend the size
+        m_size += num_displaced_elements;
+
+        // copy to elements which already existed
+        thrust::copy(first, mid, position);
+      } // end else
+    } // end if
+    else
+    {
+      const size_type old_size = size();
+
+      // compute the new capacity after the allocation
+      size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, num_new_elements);
+
+      // allocate exponentially larger new storage
+      new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, 2 * capacity());
+
+      // do not exceed maximum storage
+      new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
+
+      if(new_capacity > max_size())
+      {
+        throw std::length_error("insert(): insertion exceeds max_size().");
+      } // end if
+
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
+
+      // record how many constructors we invoke in the try block below
+      iterator new_end = new_storage.begin();
+
+      try
+      {
+        // construct copy elements before the insertion to the beginning of the newly
+        // allocated storage
+        new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin());
+
+        // construct copy elements to insert
+        new_end = m_storage.uninitialized_copy(first, last, new_end);
+
+        // construct copy displaced elements from the old storage to the new storage
+        // remember [position, end()) refers to the old storage
+        new_end = m_storage.uninitialized_copy(position, end(), new_end);
+      } // end try
+      catch(...)
+      {
+        // something went wrong, so destroy & deallocate the new storage
+        m_storage.destroy(new_storage.begin(), new_end);
+        new_storage.deallocate();
+
+        // rethrow
+        throw;
+      } // end catch
+
+      // call destructors on the elements in the old storage
+      m_storage.destroy(begin(), end());
+
+      // record the vector's new state
+      m_storage.swap(new_storage);
+      m_size = old_size + num_new_elements;
+    } // end else
+  } // end if
+} // end vector_base::copy_insert()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::append(size_type n)
+{
+  if(n != 0)
+  {
+    if(capacity() - size() >= n)
+    {
+      // we've got room for all of them
+
+      // default construct new elements at the end of the vector
+      m_storage.default_construct_n(end(), n);
+
+      // extend the size
+      m_size += n;
+    } // end if
+    else
+    {
+      const size_type old_size = size();
+
+      // compute the new capacity after the allocation
+      size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n);
+
+      // allocate exponentially larger new storage
+      new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, 2 * capacity());
+
+      // do not exceed maximum storage
+      new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
+
+      // create new storage
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
+
+      // record how many constructors we invoke in the try block below
+      iterator new_end = new_storage.begin();
+
+      try
+      {
+        // construct copy all elements into the newly allocated storage
+        new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin());
+
+        // construct new elements to insert
+        m_storage.default_construct_n(new_end, n);
+        new_end += n;
+      } // end try
+      catch(...)
+      {
+        // something went wrong, so destroy & deallocate the new storage
+        m_storage.destroy(new_storage.begin(), new_end);
+        new_storage.deallocate();
+
+        // rethrow
+        throw;
+      } // end catch
+
+      // call destructors on the elements in the old storage
+      m_storage.destroy(begin(), end());
+
+      // record the vector's new state
+      m_storage.swap(new_storage);
+      m_size    = old_size + n;
+    } // end else
+  } // end if
+} // end vector_base::append()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::fill_insert(iterator position, size_type n, const T &x)
+{
+  if(n != 0)
+  {
+    if(capacity() - size() >= n)
+    {
+      // we've got room for all of them
+      // how many existing elements will we displace?
+      const size_type num_displaced_elements = end() - position;
+      iterator old_end = end();
+
+      if(num_displaced_elements > n)
+      {
+        // construct copy n displaced elements to new elements
+        // following the insertion
+        m_storage.uninitialized_copy(end() - n, end(), end());
+
+        // extend the size
+        m_size += n;
+
+        // copy num_displaced_elements - n elements to existing elements
+        // this copy overlaps
+        const size_type copy_length = (old_end - n) - position;
+        thrust::detail::overlapped_copy(position, old_end - n, old_end - copy_length);
+
+        // finally, fill the range to the insertion point
+        thrust::fill_n(position, n, x);
+      } // end if
+      else
+      {
+        // construct new elements at the end of the vector
+        m_storage.uninitialized_fill_n(end(), n - num_displaced_elements, x);
+
+        // extend the size
+        m_size += n - num_displaced_elements;
+
+        // construct copy the displaced elements
+        m_storage.uninitialized_copy(position, old_end, end());
+
+        // extend the size
+        m_size += num_displaced_elements;
+
+        // fill to elements which already existed
+        thrust::fill(position, old_end, x);
+      } // end else
+    } // end if
+    else
+    {
+      const size_type old_size = size();
+
+      // compute the new capacity after the allocation
+      size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n);
+
+      // allocate exponentially larger new storage
+      new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, 2 * capacity());
+
+      // do not exceed maximum storage
+      new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
+
+      if(new_capacity > max_size())
+      {
+        throw std::length_error("insert(): insertion exceeds max_size().");
+      } // end if
+
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
+
+      // record how many constructors we invoke in the try block below
+      iterator new_end = new_storage.begin();
+
+      try
+      {
+        // construct copy elements before the insertion to the beginning of the newly
+        // allocated storage
+        new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin());
+
+        // construct new elements to insert
+        m_storage.uninitialized_fill_n(new_end, n, x);
+        new_end += n;
+
+        // construct copy displaced elements from the old storage to the new storage
+        // remember [position, end()) refers to the old storage
+        new_end = m_storage.uninitialized_copy(position, end(), new_end);
+      } // end try
+      catch(...)
+      {
+        // something went wrong, so destroy & deallocate the new storage
+        m_storage.destroy(new_storage.begin(), new_end);
+        new_storage.deallocate();
+
+        // rethrow
+        throw;
+      } // end catch
+
+      // call destructors on the elements in the old storage
+      m_storage.destroy(begin(), end());
+
+      // record the vector's new state
+      m_storage.swap(new_storage);
+      m_size    = old_size + n;
+    } // end else
+  } // end if
+} // end vector_base::fill_insert()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::range_assign(InputIterator first,
+                     InputIterator last)
+{
+  // dispatch on traversal
+  range_assign(first, last,
+    typename thrust::iterator_traversal<InputIterator>::type());
+} // end range_assign()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    void vector_base<T,Alloc>
+      ::range_assign(InputIterator first,
+                     InputIterator last,
+                     thrust::incrementable_traversal_tag)
+{
+  iterator current(begin());
+
+  // assign to elements which already exist
+  for(; first != last && current != end(); ++current, ++first)
+  {
+    *current = *first;
+  } // end for
+
+  // either just the input was exhausted or both
+  // the input and vector elements were exhausted
+  if(first == last)
+  {
+    // if we exhausted the input, erase leftover elements
+    erase(current, end());
+  } // end if
+  else
+  {
+    // insert the rest of the input at the end of the vector
+    insert(end(), first, last);
+  } // end else
+} // end vector_base::range_assign()
+
+template<typename T, typename Alloc>
+  template<typename RandomAccessIterator>
+    void vector_base<T,Alloc>
+      ::range_assign(RandomAccessIterator first,
+                     RandomAccessIterator last,
+                     thrust::random_access_traversal_tag)
+{
+  const size_type n = thrust::distance(first, last);
+
+  if(n > capacity())
+  {
+    storage_type new_storage(copy_allocator_t(), m_storage);
+    allocate_and_copy(n, first, last, new_storage);
+
+    // call destructors on the elements in the old storage
+    m_storage.destroy(begin(), end());
+
+    // record the vector's new state
+    m_storage.swap(new_storage);
+    m_size = n;
+  } // end if
+  else if(size() >= n)
+  {
+    // we can already accomodate the new range
+    iterator new_end = thrust::copy(first, last, begin());
+
+    // destroy the elements we don't need
+    m_storage.destroy(new_end, end());
+
+    // update size
+    m_size = n;
+  } // end else if
+  else
+  {
+    // range fits inside allocated storage, but some elements
+    // have not been constructed yet
+
+    // XXX TODO we could possibly implement this with one call
+    // to transform rather than copy + uninitialized_copy
+
+    // copy to elements which already exist
+    RandomAccessIterator mid = first;
+    thrust::advance(mid, size());
+    thrust::copy(first, mid, begin());
+
+    // uninitialize_copy to elements which must be constructed
+    m_storage.uninitialized_copy(mid, last, end());
+
+    // update size
+    m_size = n;
+  } // end else
+} // end vector_base::assign()
+
+template<typename T, typename Alloc>
+  void vector_base<T,Alloc>
+    ::fill_assign(size_type n, const T &x)
+{
+  if(n > capacity())
+  {
+    // XXX we should also include a copy of the allocator:
+    // vector_base<T,Alloc> temp(n, x, get_allocator());
+    vector_base<T,Alloc> temp(n, x);
+    temp.swap(*this);
+  } // end if
+  else if(n > size())
+  {
+    // fill to existing elements
+    thrust::fill(begin(), end(), x);
+
+    // construct uninitialized elements
+    m_storage.uninitialized_fill_n(end(), n - size(), x);
+
+    // adjust size
+    m_size += (n - size());
+  } // end else if
+  else
+  {
+    // fill to existing elements
+    iterator new_end = thrust::fill_n(begin(), n, x);
+
+    // erase the elements after the fill
+    erase(new_end, end());
+  } // end else
+} // end vector_base::fill_assign()
+
+template<typename T, typename Alloc>
+  template<typename ForwardIterator>
+    void vector_base<T,Alloc>
+      ::allocate_and_copy(size_type requested_size,
+                          ForwardIterator first, ForwardIterator last,
+                          storage_type &new_storage)
+{
+  if(requested_size == 0)
+  {
+    new_storage.deallocate();
+    return;
+  } // end if
+
+  // allocate exponentially larger new storage
+  size_type allocated_size = thrust::max<size_type>(requested_size, 2 * capacity());
+
+  // do not exceed maximum storage
+  allocated_size = thrust::min<size_type>(allocated_size, max_size());
+
+  if(requested_size > allocated_size)
+  {
+    throw std::length_error("assignment exceeds max_size().");
+  } // end if
+
+  new_storage.allocate(allocated_size);
+
+  try
+  {
+    // construct the range to the newly allocated storage
+    m_storage.uninitialized_copy(first, last, new_storage.begin());
+  } // end try
+  catch(...)
+  {
+    // something went wrong, so destroy & deallocate the new storage
+    // XXX seems like this destroys too many elements -- should just be last - first instead of requested_size
+    iterator new_storage_end = new_storage.begin();
+    thrust::advance(new_storage_end, requested_size);
+    m_storage.destroy(new_storage.begin(), new_storage_end);
+    new_storage.deallocate();
+
+    // rethrow
+    throw;
+  } // end catch
+} // end vector_base::allocate_and_copy()
+
+
+} // end detail
+
+template<typename T, typename Alloc>
+  void swap(detail::vector_base<T,Alloc> &a,
+            detail::vector_base<T,Alloc> &b)
+{
+  a.swap(b);
+} // end swap()
+
+
+
+namespace detail
+{
+
+// iterator tags match
+template <typename InputIterator1, typename InputIterator2>
+bool vector_equal(InputIterator1 first1, InputIterator1 last1,
+                  InputIterator2 first2,
+                  thrust::detail::true_type)
+{
+  return thrust::equal(first1, last1, first2);
+}
+
+// iterator tags differ
+template <typename InputIterator1, typename InputIterator2>
+bool vector_equal(InputIterator1 first1, InputIterator1 last1,
+                  InputIterator2 first2,
+                  thrust::detail::false_type)
+{
+  typename thrust::iterator_difference<InputIterator1>::type n = thrust::distance(first1,last1);
+
+  typedef typename thrust::iterator_system<InputIterator1>::type FromSystem1;
+  typedef typename thrust::iterator_system<InputIterator2>::type FromSystem2;
+
+  // bring both ranges to the host system
+  // note that these copies are no-ops if the range is already convertible to the host system
+  FromSystem1 from_system1;
+  FromSystem2 from_system2;
+  thrust::host_system_tag to_system;
+  thrust::detail::move_to_system<InputIterator1, FromSystem1, thrust::host_system_tag> rng1(from_system1, to_system, first1, last1);
+  thrust::detail::move_to_system<InputIterator2, FromSystem2, thrust::host_system_tag> rng2(from_system2, to_system, first2, first2 + n);
+
+  return thrust::equal(rng1.begin(), rng1.end(), rng2.begin());
+}
+
+template <typename InputIterator1, typename InputIterator2>
+bool vector_equal(InputIterator1 first1, InputIterator1 last1,
+                  InputIterator2 first2)
+{
+  typedef typename thrust::iterator_system<InputIterator1>::type system1;
+  typedef typename thrust::iterator_system<InputIterator2>::type system2;
+
+  // dispatch on the sameness of the two systems
+  return vector_equal(first1, last1, first2,
+    thrust::detail::is_same<system1,system2>());
+}
+
+} // end namespace detail
+
+
+
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
+                const detail::vector_base<T2,Alloc2>& rhs)
+{
+    return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
+                const std::vector<T2,Alloc2>&         rhs)
+{
+    return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator==(const std::vector<T1,Alloc1>&         lhs,
+                const detail::vector_base<T2,Alloc2>& rhs)
+{
+    return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
+                const detail::vector_base<T2,Alloc2>& rhs)
+{
+    return !(lhs == rhs);
+}
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
+                const std::vector<T2,Alloc2>&         rhs)
+{
+    return !(lhs == rhs);
+}
+
+template<typename T1, typename Alloc1,
+         typename T2, typename Alloc2>
+bool operator!=(const std::vector<T1,Alloc1>&         lhs,
+                const detail::vector_base<T2,Alloc2>& rhs)
+{
+    return !(lhs == rhs);
+}
+
+} // end thrust
+
diff --git a/thrust/thrust/device_allocator.h b/thrust/thrust/device_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5ff0d9654c997a8fcccb24db9707cd43cf18f17
--- /dev/null
+++ b/thrust/thrust/device_allocator.h
@@ -0,0 +1,146 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_allocator.h
+ *  \brief An allocator which creates new elements in device memory
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/memory/detail/device_system_resource.h>
+
+#include <limits>
+#include <stdexcept>
+
+namespace thrust
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! Memory resource adaptor that turns any memory resource that returns a fancy
+ *      with the same tag as \p device_ptr, and adapts it to a resource that returns
+ *      a \p device_ptr.
+ */
+template<typename Upstream>
+class device_ptr_memory_resource THRUST_FINAL
+    : public thrust::mr::memory_resource<
+        device_ptr<void>
+    >
+{
+    typedef typename Upstream::pointer upstream_ptr;
+
+public:
+    /*! Initialize the adaptor with the global instance of the upstream resource. Obtains
+     *      the global instance by calling \p get_global_resource.
+     */
+    __host__
+    device_ptr_memory_resource() : m_upstream(mr::get_global_resource<Upstream>())
+    {
+    }
+
+    /*! Initialize the adaptor with an upstream resource.
+     *
+     *  \param upstream the upstream memory resource to adapt.
+     */
+    __host__
+    device_ptr_memory_resource(Upstream * upstream) : m_upstream(upstream)
+    {
+    }
+
+    THRUST_NODISCARD __host__
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        return pointer(m_upstream->do_allocate(bytes, alignment).get());
+    }
+
+    __host__
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    {
+        m_upstream->do_deallocate(upstream_ptr(p.get()), bytes, alignment);
+    }
+
+private:
+    Upstream * m_upstream;
+};
+
+/*! \}
+ */
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+template<typename T>
+class device_allocator
+    : public thrust::mr::stateless_resource_allocator<
+        T,
+        device_ptr_memory_resource<device_memory_resource>
+    >
+{
+    typedef thrust::mr::stateless_resource_allocator<
+        T,
+        device_ptr_memory_resource<device_memory_resource>
+    > base;
+
+public:
+    /*! The \p rebind metafunction provides the type of a \p device_allocator
+     *  instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p device_allocator.
+         */
+        typedef device_allocator<U> other;
+    };
+
+    /*! Default constructor has no effect. */
+    __host__
+    device_allocator() {}
+
+    /*! Copy constructor has no effect. */
+    __host__
+    device_allocator(const device_allocator& other) : base(other) {}
+
+    /*! Constructor from other \p device_allocator has no effect. */
+    template<typename U>
+    __host__
+    device_allocator(const device_allocator<U>& other) : base(other) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    device_allocator & operator=(const device_allocator &) = default;
+#endif
+
+    /*! Destructor has no effect. */
+    __host__
+    ~device_allocator() {}
+};
+
+/*! \}
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/device_delete.h b/thrust/thrust/device_delete.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce822f09dced8851218beea89e3127c7050140c0
--- /dev/null
+++ b/thrust/thrust/device_delete.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_delete.h
+ *  \brief Deletes variables in device memory
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+
+namespace thrust
+{
+
+/*! \addtogroup deallocation_functions Deallocation Functions
+ *  \ingroup memory_management_functions
+ *  \{
+ */
+
+/*! \p device_delete deletes a \p device_ptr allocated with
+ *  \p device_new.
+ *
+ *  \param ptr The \p device_ptr to delete, assumed to have
+ *         been allocated with \p device_new.
+ *  \param n The number of objects to destroy at \p ptr. Defaults to \c 1
+ *         similar to \p device_new.
+ *
+ *  \see device_ptr
+ *  \see device_new
+ */
+template<typename T>
+  inline void device_delete(thrust::device_ptr<T> ptr,
+                            const size_t n = 1);
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_delete.inl>
+
diff --git a/thrust/thrust/device_free.h b/thrust/thrust/device_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..38d4424c7da7ca2ba91377d0f29c8ee24afdcc19
--- /dev/null
+++ b/thrust/thrust/device_free.h
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_free.h
+ *  \brief Deallocates storage allocated by \p device_malloc
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+
+namespace thrust
+{
+
+/*! \addtogroup deallocation_functions Deallocation Functions
+ *  \ingroup memory_management_functions
+ *  \{
+ */
+
+/*! \p device_free deallocates memory allocated by the function \p device_malloc.
+ *
+ *  \param ptr A \p device_ptr pointing to memory to be deallocated.
+ *
+ *  The following code snippet demonstrates how to use \p device_free to
+ *  deallocate memory allocated by \p device_malloc.
+ *
+ *  \code
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_free.h>
+ *  ...
+ *  // allocate some integers with device_malloc
+ *  const int N = 100;
+ *  thrust::device_ptr<int> int_array = thrust::device_malloc<int>(N);
+ *
+ *  // manipulate integers
+ *  ...
+ *
+ *  // deallocate with device_free
+ *  thrust::device_free(int_array);
+ *  \endcode
+ *
+ *  \see device_ptr
+ *  \see device_malloc
+ */
+inline void device_free(thrust::device_ptr<void> ptr);
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_free.inl>
+
diff --git a/thrust/thrust/device_make_unique.h b/thrust/thrust/device_make_unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..939006f27e3dad8cfe09dc32ef0219a4bd419d38
--- /dev/null
+++ b/thrust/thrust/device_make_unique.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_make_unique.h
+ *  \brief A factory function for creating `unique_ptr`s to device objects.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/allocate_unique.h>
+#include <thrust/device_new.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_allocator.h>
+#include <thrust/detail/type_deduction.h>
+
+namespace thrust
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename... Args>
+__host__
+auto device_make_unique(Args&&... args)
+  -> decltype(
+    uninitialized_allocate_unique<T>(device_allocator<T>{})
+  )
+{
+  // FIXME: This is crude - we construct an unnecessary T on the host for 
+  // `device_new`. We need a proper dispatched `construct` algorithm to
+  // do this properly.
+  auto p = uninitialized_allocate_unique<T>(device_allocator<T>{});
+  device_new<T>(p.get(), T(THRUST_FWD(args)...));
+  return p;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/thrust/device_malloc.h b/thrust/thrust/device_malloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..75194491e5ec3bb4aafa700b197feae45d50dac6
--- /dev/null
+++ b/thrust/thrust/device_malloc.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_malloc.h
+ *  \brief Allocates storage in device memory
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+#include <cstddef> // for std::size_t
+
+namespace thrust
+{
+
+/*! \addtogroup allocation_functions Allocation Functions
+ *  \ingroup memory_management_functions
+ *  \{
+ */
+
+/*! This version of \p device_malloc allocates sequential device storage
+ *  for bytes.
+ *
+ *  \param n The number of bytes to allocate sequentially
+ *           in device memory.
+ *  \return A \p device_ptr to the newly allocated memory.
+ *
+ *  The following code snippet demonstrates how to use \p device_malloc to
+ *  allocate a range of device memory.
+ *
+ *  \code
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_free.h>
+ *  ...
+ *  // allocate some memory with device_malloc
+ *  const int N = 100;
+ *  thrust::device_ptr<void> void_ptr = thrust::device_malloc(N);
+ *
+ *  // manipulate memory
+ *  ...
+ *
+ *  // deallocate with device_free
+ *  thrust::device_free(void_ptr);
+ *  \endcode
+ *
+ *  \see device_ptr
+ *  \see device_free
+ */
+inline thrust::device_ptr<void> device_malloc(const std::size_t n);
+
+/*! This version of \p device_malloc allocates sequential device storage for
+ *  new objects of the given type.
+ *
+ *  \param n The number of objects of type T to allocate
+ *           sequentially in device memory.
+ *  \return A \p device_ptr to the newly allocated memory.
+ *
+ *  The following code snippet demonstrates how to use \p device_malloc to
+ *  allocate a range of device memory.
+ *
+ *  \code
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_free.h>
+ *  ...
+ *  // allocate some integers with device_malloc
+ *  const int N = 100;
+ *  thrust::device_ptr<int> int_array = thrust::device_malloc<int>(N);
+ *
+ *  // manipulate integers
+ *  ...
+ *
+ *  // deallocate with device_free
+ *  thrust::device_free(int_array);
+ *  \endcode
+ *
+ *  \see device_ptr
+ *  \see device_free
+ */
+template<typename T>
+  inline thrust::device_ptr<T> device_malloc(const std::size_t n);
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_malloc.inl>
+
diff --git a/thrust/thrust/device_malloc_allocator.h b/thrust/thrust/device_malloc_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..e40c362e08dfd6111ebb0932530c4df10438249f
--- /dev/null
+++ b/thrust/thrust/device_malloc_allocator.h
@@ -0,0 +1,185 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_malloc_allocator.h
+ *  \brief An allocator which allocates storage with \p device_malloc
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_reference.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <limits>
+#include <stdexcept>
+
+namespace thrust
+{
+
+// forward declarations to WAR circular #includes
+template<typename> class device_ptr;
+template<typename T> device_ptr<T> device_malloc(const std::size_t n);
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p device_malloc_allocator is a device memory allocator that employs the
+ *  \p device_malloc function for allocation.
+ *
+ *  \p device_malloc_allocator is deprecated in favor of <tt>thrust::mr</tt>
+ *      memory resource-based allocators.
+ *
+ *  \see device_malloc
+ *  \see device_ptr
+ *  \see device_allocator
+ *  \see http://www.sgi.com/tech/stl/Allocators.html
+ */
+template<typename T>
+  class device_malloc_allocator
+{
+  public:
+    /*! Type of element allocated, \c T. */
+    typedef T                                 value_type;
+
+    /*! Pointer to allocation, \c device_ptr<T>. */
+    typedef device_ptr<T>                     pointer;
+
+    /*! \c const pointer to allocation, \c device_ptr<const T>. */
+    typedef device_ptr<const T>               const_pointer;
+
+    /*! Reference to allocated element, \c device_reference<T>. */
+    typedef device_reference<T>               reference;
+
+    /*! \c const reference to allocated element, \c device_reference<const T>. */
+    typedef device_reference<const T>         const_reference;
+
+    /*! Type of allocation size, \c std::size_t. */
+    typedef std::size_t                       size_type;
+
+    /*! Type of allocation difference, \c pointer::difference_type. */
+    typedef typename pointer::difference_type difference_type;
+
+    /*! The \p rebind metafunction provides the type of a \p device_malloc_allocator
+     *  instantiated with another type.
+     *
+     *  \tparam U The other type to use for instantiation.
+     */
+    template<typename U>
+      struct rebind
+    {
+      /*! The typedef \p other gives the type of the rebound \p device_malloc_allocator.
+       */
+      typedef device_malloc_allocator<U> other;
+    }; // end rebind
+
+    /*! No-argument constructor has no effect. */
+    __host__ __device__
+    inline device_malloc_allocator() {}
+
+    /*! No-argument destructor has no effect. */
+    __host__ __device__
+    inline ~device_malloc_allocator() {}
+
+    /*! Copy constructor has no effect. */
+    __host__ __device__
+    inline device_malloc_allocator(device_malloc_allocator const&) {}
+
+    /*! Constructor from other \p device_malloc_allocator has no effect. */
+    template<typename U>
+    __host__ __device__
+    inline device_malloc_allocator(device_malloc_allocator<U> const&) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    device_malloc_allocator & operator=(const device_malloc_allocator &) = default;
+#endif
+
+    /*! Returns the address of an allocated object.
+     *  \return <tt>&r</tt>.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+
+    /*! Returns the address an allocated object.
+     *  \return <tt>&r</tt>.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! Allocates storage for \p cnt objects.
+     *  \param cnt The number of objects to allocate.
+     *  \return A \p pointer to uninitialized storage for \p cnt objects.
+     *  \note Memory allocated by this function must be deallocated with \p deallocate.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = const_pointer(static_cast<T*>(0)))
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      } // end if
+
+      return pointer(device_malloc<T>(cnt));
+    } // end allocate()
+
+    /*! Deallocates storage for objects allocated with \p allocate.
+     *  \param p A \p pointer to the storage to deallocate.
+     *  \param cnt The size of the previous allocation.
+     *  \note Memory deallocated by this function must previously have been
+     *        allocated with \p allocate.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type cnt)
+    {
+      // silence unused parameter warning while still leaving the parameter name for Doxygen
+      (void)(cnt);
+
+      device_free(p);
+    } // end deallocate()
+
+    /*! Returns the largest value \c n for which <tt>allocate(n)</tt> might succeed.
+     *  \return The largest value \c n for which <tt>allocate(n)</tt> might succeed.
+     */
+    inline size_type max_size() const
+    {
+      return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    } // end max_size()
+
+    /*! Compares against another \p device_malloc_allocator for equality.
+     *  \return \c true
+     */
+    __host__ __device__
+    inline bool operator==(device_malloc_allocator const&) const { return true; }
+
+    /*! Compares against another \p device_malloc_allocator for inequality.
+     *  \return \c false
+     */
+    __host__ __device__
+    inline bool operator!=(device_malloc_allocator const &a) const {return !operator==(a); }
+}; // end device_malloc_allocator
+
+/*! \}
+ */
+
+} // end thrust
+
+
diff --git a/thrust/thrust/device_new.h b/thrust/thrust/device_new.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ae4ce5a40d03b88073dd029d9a7049dcdab6783
--- /dev/null
+++ b/thrust/thrust/device_new.h
@@ -0,0 +1,88 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_new.h
+ *  \brief Constructs new elements in device memory
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include this for size_t
+#include <cstddef>
+#include <thrust/device_ptr.h>
+
+namespace thrust
+{
+
+/*!
+ *  \addtogroup allocation_functions Allocation Functions
+ *  \{
+ */
+
+/*! \p device_new implements the placement \c new operator for types
+ *  resident in device memory. \p device_new calls <tt>T</tt>'s null
+ *  constructor on a array of objects in device memory.
+ *  No memory is allocated by this function.
+ *
+ *  \param  p A \p device_ptr to a region of device memory into which
+ *          to construct one or many <tt>T</tt>s.
+ *  \param  n The number of objects to construct at \p p.
+ *  \return p, casted to <tt>T</tt>'s type.
+ *
+ *  \see device_ptr
+ */
+template <typename T>
+  device_ptr<T> device_new(device_ptr<void> p,
+                           const size_t n = 1);
+
+/*! \p device_new implements the placement new operator for types
+ *  resident in device memory. \p device_new calls <tt>T</tt>'s copy
+ *  constructor on a array of objects in device memory. No memory is
+ *  allocated by this function.
+ *
+ *  \param  p A \p device_ptr to a region of device memory into which to
+ *          construct one or many <tt>T</tt>s.
+ *  \param exemplar The value from which to copy.
+ *  \param  n The number of objects to construct at \p p.
+ *  \return p, casted to <tt>T</tt>'s type.
+ *
+ *  \see device_ptr
+ *  \see fill
+ */
+template <typename T>
+  device_ptr<T> device_new(device_ptr<void> p,
+                           const T &exemplar,
+                           const size_t n = 1);
+
+/*! \p device_new implements the new operator for types resident in device memory.
+ *  It allocates device memory large enough to hold \p n new objects of type \c T.
+ *
+ *  \param n The number of objects to allocate. Defaults to \c 1.
+ *  \return A \p device_ptr to the newly allocated region of device memory.
+ */
+template <typename T>
+  device_ptr<T> device_new(const size_t n = 1);
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_new.inl>
+
diff --git a/thrust/thrust/device_new_allocator.h b/thrust/thrust/device_new_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d7133ba711254d9284200173a453b2155f410c5
--- /dev/null
+++ b/thrust/thrust/device_new_allocator.h
@@ -0,0 +1,172 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_new_allocator.h
+ *  \brief An allocator which allocates storage with \p device_new
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_reference.h>
+#include <thrust/device_new.h>
+#include <thrust/device_delete.h>
+#include <limits>
+#include <stdexcept>
+
+namespace thrust
+{
+
+/*! \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p device_new_allocator is a device memory allocator that employs the
+ *  \p device_new function for allocation.
+ *
+ *  \see device_new
+ *  \see device_ptr
+ *  \see http://www.sgi.com/tech/stl/Allocators.html
+ */
+template<typename T>
+  class device_new_allocator
+{
+  public:
+    /*! Type of element allocated, \c T. */
+    typedef T                                 value_type;
+
+    /*! Pointer to allocation, \c device_ptr<T>. */
+    typedef device_ptr<T>                     pointer;
+
+    /*! \c const pointer to allocation, \c device_ptr<const T>. */
+    typedef device_ptr<const T>               const_pointer;
+
+    /*! Reference to allocated element, \c device_reference<T>. */
+    typedef device_reference<T>               reference;
+
+    /*! \c const reference to allocated element, \c device_reference<const T>. */
+    typedef device_reference<const T>         const_reference;
+
+    /*! Type of allocation size, \c std::size_t. */
+    typedef std::size_t                       size_type;
+
+    /*! Type of allocation difference, \c pointer::difference_type. */
+    typedef typename pointer::difference_type difference_type;
+
+    /*! The \p rebind metafunction provides the type of a \p device_new_allocator
+     *  instantiated with another type.
+     *
+     *  \tparam U The other type to use for instantiation.
+     */
+    template<typename U>
+      struct rebind
+    {
+      /*! The typedef \p other gives the type of the rebound \p device_new_allocator.
+       */
+      typedef device_new_allocator<U> other;
+    }; // end rebind
+
+    /*! No-argument constructor has no effect. */
+    __host__ __device__
+    inline device_new_allocator() {}
+
+    /*! No-argument destructor has no effect. */
+    __host__ __device__
+    inline ~device_new_allocator() {}
+
+    /*! Copy constructor has no effect. */
+    __host__ __device__
+    inline device_new_allocator(device_new_allocator const&) {}
+
+    /*! Constructor from other \p device_malloc_allocator has no effect. */
+    template<typename U>
+    __host__ __device__
+    inline device_new_allocator(device_new_allocator<U> const&) {}
+
+    /*! Returns the address of an allocated object.
+     *  \return <tt>&r</tt>.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+    
+    /*! Returns the address an allocated object.
+     *  \return <tt>&r</tt>.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! Allocates storage for \p cnt objects.
+     *  \param cnt The number of objects to allocate.
+     *  \return A \p pointer to uninitialized storage for \p cnt objects.
+     *  \note Memory allocated by this function must be deallocated with \p deallocate.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = const_pointer(static_cast<T*>(0)))
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      } // end if
+
+      // use "::operator new" rather than keyword new
+      return pointer(device_new<T>(cnt));
+    } // end allocate()
+
+    /*! Deallocates storage for objects allocated with \p allocate.
+     *  \param p A \p pointer to the storage to deallocate.
+     *  \param cnt The size of the previous allocation.
+     *  \note Memory deallocated by this function must previously have been
+     *        allocated with \p allocate.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type cnt)
+    {
+      // use "::operator delete" rather than keyword delete
+      (void)cnt;
+      device_delete(p);
+    } // end deallocate()
+
+    /*! Returns the largest value \c n for which <tt>allocate(n)</tt> might succeed.
+     *  \return The largest value \c n for which <tt>allocate(n)</tt> might succeed.
+     */
+    __host__ __device__
+    inline size_type max_size() const
+    {
+      return std::numeric_limits<size_type>::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T);
+    } // end max_size()
+
+    /*! Compares against another \p device_malloc_allocator for equality.
+     *  \return \c true
+     */
+    __host__ __device__
+    inline bool operator==(device_new_allocator const&) { return true; }
+
+    /*! Compares against another \p device_malloc_allocator for inequality.
+     *  \return \c false
+     */
+    __host__ __device__
+    inline bool operator!=(device_new_allocator const &a) {return !operator==(a); }
+}; // end device_new_allocator
+
+/*! \}
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/device_ptr.h b/thrust/thrust/device_ptr.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb3ad1ee021ab35d2ad1ef63744e9832d08a8b12
--- /dev/null
+++ b/thrust/thrust/device_ptr.h
@@ -0,0 +1,192 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_ptr.h
+ *  \brief A pointer to a variable which resides in the "device" system's memory space
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/memory.h>
+
+namespace thrust
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+// forward declarations
+template<typename T> class device_reference;
+
+/*! \p device_ptr stores a pointer to an object allocated in device memory. This type
+ *  provides type safety when dispatching standard algorithms on ranges resident in
+ *  device memory.
+ *
+ *  \p device_ptr has pointer semantics: it may be dereferenced safely from the host and
+ *  may be manipulated with pointer arithmetic.
+ *
+ *  \p device_ptr can be created with the functions device_malloc, device_new, or
+ *  device_pointer_cast, or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p device_ptr may be obtained by either its <tt>get</tt>
+ *  method or the \p raw_pointer_cast free function.
+ *
+ *  \note \p device_ptr is not a smart pointer; it is the programmer's responsibility to
+ *  deallocate memory pointed to by \p device_ptr.
+ *
+ *  \see device_malloc
+ *  \see device_new
+ *  \see device_pointer_cast
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+  class device_ptr
+    : public thrust::pointer<
+               T,
+               thrust::device_system_tag,
+               thrust::device_reference<T>,
+               thrust::device_ptr<T>
+             >
+{
+  private:
+    typedef thrust::pointer<
+      T,
+      thrust::device_system_tag,
+      thrust::device_reference<T>,
+      thrust::device_ptr<T>
+    > super_t;
+
+  public:
+    /*! \p device_ptr's null constructor initializes its raw pointer to \c 0.
+     */
+    __host__ __device__
+    device_ptr() : super_t() {}
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    device_ptr(decltype(nullptr)) : super_t(nullptr) {}
+    #endif
+
+    /*! \p device_ptr's copy constructor is templated to allow copying to a
+     *  <tt>device_ptr<const T></tt> from a <tt>T *</tt>.
+     *  
+     *  \param ptr A raw pointer to copy from, presumed to point to a location in
+     *         device memory.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    explicit device_ptr(OtherT *ptr) : super_t(ptr) {}
+
+    /*! \p device_ptr's copy constructor allows copying from another device_ptr with related type.
+     *  \param other The \p device_ptr to copy from.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    device_ptr(const device_ptr<OtherT> &other) : super_t(other) {}
+
+    /*! \p device_ptr's assignment operator allows assigning from another \p device_ptr with related type.
+     *  \param other The other \p device_ptr to copy from.
+     *  \return <tt>*this</tt>
+     */
+    template<typename OtherT>
+    __host__ __device__
+    device_ptr &operator=(const device_ptr<OtherT> &other)
+    {
+      super_t::operator=(other);
+      return *this;
+    }
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    device_ptr& operator=(decltype(nullptr))
+    {
+      super_t::operator=(nullptr);
+      return *this;
+    }
+    #endif
+
+// declare these members for the purpose of Doxygenating them
+// they actually exist in a derived-from class
+#if 0
+    /*! This method returns this \p device_ptr's raw pointer.
+     *  \return This \p device_ptr's raw pointer.
+     */
+    __host__ __device__
+    T *get(void) const;
+#endif // end doxygen-only members
+}; // end device_ptr
+
+// declare these methods for the purpose of Doxygenating them
+// they actually are defined for a derived-from class
+#if 0
+/*! Writes to an output stream the value of a \p device_ptr's raw pointer.
+ *
+ *  \param os The output stream.
+ *  \param p The \p device_ptr to output.
+ *  \return os.
+ */
+template<typename T, typename charT, typename traits>
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os, const device_ptr<T> &p);
+#endif
+
+/*! \}
+ */
+
+
+/*!
+ *  \addtogroup memory_management_functions Memory Management Functions
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p device_pointer_cast creates a device_ptr from a raw pointer which is presumed to point
+ *  to a location in device memory.
+ *
+ *  \param ptr A raw pointer, presumed to point to a location in device memory.
+ *  \return A device_ptr wrapping ptr.
+ */
+template<typename T>
+__host__ __device__
+inline device_ptr<T> device_pointer_cast(T *ptr);
+
+/*! This version of \p device_pointer_cast creates a copy of a device_ptr from another device_ptr.
+ *  This version is included for symmetry with \p raw_pointer_cast.
+ *
+ *  \param ptr A device_ptr.
+ *  \return A copy of \p ptr.
+ */
+template<typename T>
+__host__ __device__
+inline device_ptr<T> device_pointer_cast(const device_ptr<T> &ptr);
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_ptr.inl>
+#include <thrust/detail/raw_pointer_cast.h>
+
diff --git a/thrust/thrust/device_reference.h b/thrust/thrust/device_reference.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d8538b2fbfe4149f0dc56650eb9eb3c49ff0b91
--- /dev/null
+++ b/thrust/thrust/device_reference.h
@@ -0,0 +1,983 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_reference.h
+ *  \brief A reference to a variable which resides in the "device" system's memory space
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/device_ptr.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/reference.h>
+
+namespace thrust
+{
+
+/*! \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p device_reference acts as a reference-like object to an object stored in device memory.
+ *  \p device_reference is not intended to be used directly; rather, this type
+ *  is the result of deferencing a \p device_ptr. Similarly, taking the address of
+ *  a \p device_reference yields a \p device_ptr.
+ *  
+ *  \p device_reference may often be used from host code in place of operations defined on
+ *  its associated \c value_type. For example, when \p device_reference refers to an
+ *  arithmetic type, arithmetic operations on it are legal:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<int> vec(1, 13);
+ *
+ *    thrust::device_reference<int> ref_to_thirteen = vec[0];
+ *
+ *    int x = ref_to_thirteen + 1;
+ *
+ *    // x is 14
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  Similarly, we can print the value of \c ref_to_thirteen in the above code by using an
+ *  \c iostream:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <iostream>
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<int> vec(1, 13);
+ *
+ *    thrust::device_reference<int> ref_to_thirteen = vec[0];
+ *
+ *    std::cout << ref_to_thirteen << std::endl;
+ *
+ *    // 13 is printed
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  Of course, we needn't explicitly create a \p device_reference in the previous
+ *  example, because one is returned by \p device_vector's bracket operator. A more natural
+ *  way to print the value of a \p device_vector element might be:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <iostream>
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<int> vec(1, 13);
+ *
+ *    std::cout << vec[0] << std::endl;
+ *
+ *    // 13 is printed
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  These kinds of operations should be used sparingly in performance-critical code, because
+ *  they imply a potentially expensive copy between host and device space.
+ *
+ *  Some operations which are possible with regular objects are impossible with their
+ *  corresponding \p device_reference objects due to the requirements of the C++ language. For
+ *  example, because the member access operator cannot be overloaded, member variables and functions
+ *  of a referent object cannot be directly accessed through its \p device_reference.
+ *
+ *  The following code, which generates a compiler error, illustrates:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct foo
+ *  {
+ *    int x;
+ *  };
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<foo> foo_vec(1);
+ *
+ *    thrust::device_reference<foo> foo_ref = foo_vec[0];
+ *
+ *    foo_ref.x = 13; // ERROR: x cannot be accessed through foo_ref
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  Instead, a host space copy must be created to access \c foo's \c x member:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct foo
+ *  {
+ *    int x;
+ *  };
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<foo> foo_vec(1);
+ *
+ *    // create a local host-side foo object
+ *    foo host_foo;
+ *    host_foo.x = 13;
+ *
+ *    thrust::device_reference<foo> foo_ref = foo_vec[0];
+ *
+ *    foo_ref = host_foo;
+ *
+ *    // foo_ref's x member is 13
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *  
+ *  Another common case where a \p device_reference cannot directly be used in place of
+ *  its referent object occurs when passing them as parameters to functions like \c printf
+ *  which have varargs parameters. Because varargs parameters must be Plain Old Data, a
+ *  \p device_reference to a POD type requires a cast when passed to \c printf:
+ *
+ *  \code
+ *  #include <stdio.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main(void)
+ *  {
+ *    thrust::device_vector<int> vec(1,13);
+ *
+ *    // vec[0] must be cast to int when passing to printf
+ *    printf("%d\n", (int) vec[0]);
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see device_ptr
+ *  \see device_vector
+ */
+template<typename T>
+  class device_reference
+    : public thrust::reference<
+               T,
+               thrust::device_ptr<T>,
+               thrust::device_reference<T>
+             >
+{
+  private:
+    typedef thrust::reference<
+      T,
+      thrust::device_ptr<T>,
+      thrust::device_reference<T>
+    > super_t;
+
+  public:
+    /*! The type of the value referenced by this type of \p device_reference.
+     */
+    typedef typename super_t::value_type value_type;
+
+    /*! The type of the expression <tt>&ref</tt>, where <tt>ref</tt> is a \p device_reference.
+     */
+    typedef typename super_t::pointer    pointer;
+
+    /*! This copy constructor accepts a const reference to another
+     *  \p device_reference. After this \p device_reference is constructed,
+     *  it shall refer to the same object as \p other.
+     *  
+     *  \param other A \p device_reference to copy from.
+     *
+     *  The following code snippet demonstrates the semantics of this
+     *  copy constructor.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_reference<int> ref = v[0];
+     *
+     *  // ref equals the object at v[0]
+     *  assert(ref == v[0]);
+     *
+     *  // the address of ref equals the address of v[0]
+     *  assert(&ref == &v[0]);
+     *
+     *  // modifying v[0] modifies ref
+     *  v[0] = 13;
+     *  assert(ref == 13);
+     *  \endcode
+     *
+     *  \note This constructor is templated primarily to allow initialization of 
+     *  <tt>device_reference<const T></tt> from <tt>device_reference<T></tt>.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    device_reference(const device_reference<OtherT> &other,
+                     typename thrust::detail::enable_if_convertible<
+                       typename device_reference<OtherT>::pointer,
+                       pointer
+                     >::type * = 0)
+      : super_t(other)
+    {}
+
+    /*! This copy constructor initializes this \p device_reference
+     *  to refer to an object pointed to by the given \p device_ptr. After
+     *  this \p device_reference is constructed, it shall refer to the
+     *  object pointed to by \p ptr.
+     *
+     *  \param ptr A \p device_ptr to copy from.
+     *
+     *  The following code snippet demonstrates the semantic of this
+     *  copy constructor.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals the object pointed to by ptr
+     *  assert(ref == *ptr);
+     *
+     *  // the address of ref equals ptr
+     *  assert(&ref == ptr);
+     *
+     *  // modifying *ptr modifies ref
+     *  *ptr = 13;
+     *  assert(ref == 13);
+     *  \endcode
+     */
+    __host__ __device__
+    explicit device_reference(const pointer &ptr)
+      : super_t(ptr)
+    {}
+
+    /*! This assignment operator assigns the value of the object referenced by
+     *  the given \p device_reference to the object referenced by this
+     *  \p device_reference.
+     *
+     *  \param other The \p device_reference to assign from.
+     *  \return <tt>*this</tt>
+     */
+    template<typename OtherT>
+    __host__ __device__
+    device_reference &operator=(const device_reference<OtherT> &other);
+
+    /*! Assignment operator assigns the value of the given value to the
+     *  value referenced by this \p device_reference.
+     *  
+     *  \param x The value to assign from.
+     *  \return <tt>*this</tt>
+     */
+    __host__ __device__
+    device_reference &operator=(const value_type &x);
+
+// declare these members for the purpose of Doxygenating them
+// they actually exist in a derived-from class
+#if 0
+    /*! Address-of operator returns a \p device_ptr pointing to the object
+     *  referenced by this \p device_reference. It does not return the
+     *  address of this \p device_reference.
+     *
+     *  \return A \p device_ptr pointing to the object this
+     *  \p device_reference references.
+     */
+    __host__ __device__
+    pointer operator&(void) const;
+
+    /*! Conversion operator converts this \p device_reference to T
+     *  by returning a copy of the object referenced by this
+     *  \p device_reference.
+     *
+     *  \return A copy of the object referenced by this \p device_reference.
+     */
+    __host__ __device__
+    operator value_type (void) const;
+
+    /*! swaps the value this \p device_reference references with another.
+     *  \p other The other \p device_reference with which to swap.
+     */
+    __host__ __device__
+    void swap(device_reference &other);
+
+    /*! Prefix increment operator increments the object referenced by this
+     *  \p device_reference.
+     *
+     *  \return <tt>*this</tt>
+     *  
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's prefix increment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *
+     *  // increment ref
+     *  ++ref;
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *  \endcode
+     *
+     *  \note The increment executes as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator++(void);
+
+    /*! Postfix increment operator copies the object referenced by this
+     *  \p device_reference, increments the object referenced by this
+     *  \p device_reference, and returns the copy.
+     *
+     *  \return A copy of the object referenced by this \p device_reference
+     *          before being incremented.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's postfix increment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // increment ref
+     *  int x = ref++;
+     *
+     *  // x equals 0
+     *  assert(x == 0)
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *  \endcode
+     *
+     *  \note The increment executes as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    value_type operator++(int);
+
+    /*! Addition assignment operator add-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the add-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's addition assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // add-assign ref
+     *  ref += 5;
+     *
+     *  // ref equals 5
+     *  assert(ref == 5);
+     *
+     *  // the object pointed to by ptr equals 5
+     *  assert(*ptr == 5);
+     *
+     *  // v[0] equals 5
+     *  assert(v[0] == 5);
+     *  \endcode
+     *
+     *  \note The add-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator+=(const T &rhs);
+
+    /*! Prefix decrement operator decrements the object referenced by this
+     *  \p device_reference.
+     *
+     *  \return <tt>*this</tt>
+     *  
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's prefix decrement operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // decrement ref
+     *  --ref;
+     *
+     *  // ref equals -1
+     *  assert(ref == -1);
+     *
+     *  // the object pointed to by ptr equals -1
+     *  assert(*ptr == -1);
+     *
+     *  // v[0] equals -1
+     *  assert(v[0] == -1);
+     *  \endcode
+     *
+     *  \note The decrement executes as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator--(void);
+
+    /*! Postfix decrement operator copies the object referenced by this
+     *  \p device_reference, decrements the object referenced by this
+     *  \p device_reference, and returns the copy.
+     *
+     *  \return A copy of the object referenced by this \p device_reference
+     *          before being decremented.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's postfix decrement operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // decrement ref
+     *  int x = ref--;
+     *
+     *  // x equals 0
+     *  assert(x == 0)
+     *
+     *  // ref equals -1
+     *  assert(ref == -1);
+     *
+     *  // the object pointed to by ptr equals -1
+     *  assert(*ptr == -1);
+     *
+     *  // v[0] equals -1
+     *  assert(v[0] == -1);
+     *  \endcode
+     *
+     *  \note The decrement executes as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    value_type operator--(int);
+
+    /*! Subtraction assignment operator subtract-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the subtraction-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's addition assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // subtract-assign ref
+     *  ref -= 5;
+     *
+     *  // ref equals -5
+     *  assert(ref == -5);
+     *
+     *  // the object pointed to by ptr equals -5
+     *  assert(*ptr == -5);
+     *
+     *  // v[0] equals -5
+     *  assert(v[0] == -5);
+     *  \endcode
+     *
+     *  \note The subtract-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator-=(const T &rhs);
+
+    /*! Multiplication assignment operator multiply-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the multiply-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's multiply assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,1);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *
+     *  // multiply-assign ref
+     *  ref *= 5;
+     *
+     *  // ref equals 5
+     *  assert(ref == 5);
+     *
+     *  // the object pointed to by ptr equals 5
+     *  assert(*ptr == 5);
+     *
+     *  // v[0] equals 5
+     *  assert(v[0] == 5);
+     *  \endcode
+     *
+     *  \note The multiply-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator*=(const T &rhs);
+
+    /*! Division assignment operator divide-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the divide-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's divide assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,5);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 5
+     *  assert(ref == 5);
+     *
+     *  // the object pointed to by ptr equals 5
+     *  assert(*ptr == 5);
+     *
+     *  // v[0] equals 5
+     *  assert(v[0] == 5);
+     *
+     *  // divide-assign ref
+     *  ref /= 5;
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *  \endcode
+     *
+     *  \note The divide-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator/=(const T &rhs);
+
+    /*! Modulation assignment operator modulus-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the divide-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's divide assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,5);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 5
+     *  assert(ref == 5);
+     *
+     *  // the object pointed to by ptr equals 5
+     *  assert(*ptr == 5);
+     *
+     *  // v[0] equals 5
+     *  assert(v[0] == 5);
+     *
+     *  // modulus-assign ref
+     *  ref %= 5;
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *  \endcode
+     *
+     *  \note The modulus-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator%=(const T &rhs);
+
+    /*! Bitwise left shift assignment operator left shift-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the left shift-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's left shift assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,1);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *
+     *  // left shift-assign ref
+     *  ref <<= 1;
+     *
+     *  // ref equals 2
+     *  assert(ref == 2);
+     *
+     *  // the object pointed to by ptr equals 2
+     *  assert(*ptr == 2);
+     *
+     *  // v[0] equals 2
+     *  assert(v[0] == 2);
+     *  \endcode
+     *
+     *  \note The left shift-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator<<=(const T &rhs);
+
+    /*! Bitwise right shift assignment operator right shift-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the right shift-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's right shift assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,2);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 2
+     *  assert(ref == 2);
+     *
+     *  // the object pointed to by ptr equals 2
+     *  assert(*ptr == 2);
+     *
+     *  // v[0] equals 2
+     *  assert(v[0] == 2);
+     *
+     *  // right shift-assign ref
+     *  ref >>= 1;
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *  \endcode
+     *
+     *  \note The right shift-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator>>=(const T &rhs);
+
+    /*! Bitwise AND assignment operator AND-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the AND-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's AND assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,1);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *
+     *  // right AND-assign ref
+     *  ref &= 0;
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *  \endcode
+     *
+     *  \note The AND-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator&=(const T &rhs);
+
+    /*! Bitwise OR assignment operator OR-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the OR-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's OR assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,0);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *
+     *  // right OR-assign ref
+     *  ref |= 1;
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *  \endcode
+     *
+     *  \note The OR-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator|=(const T &rhs);
+
+    /*! Bitwise XOR assignment operator XOR-assigns the object referenced by this
+     *  \p device_reference and returns this \p device_reference.
+     *
+     *  \param rhs The right hand side of the XOR-assignment.
+     *  \return <tt>*this</tt>.
+     *
+     *  The following code snippet demonstrates the semantics of
+     *  \p device_reference's XOR assignment operator.
+     *
+     *  \code
+     *  #include <thrust/device_vector.h>
+     *  #include <assert.h>
+     *  ...
+     *  thrust::device_vector<int> v(1,1);
+     *  thrust::device_ptr<int> ptr = &v[0];
+     *  thrust::device_reference<int> ref(ptr);
+     *
+     *  // ref equals 1
+     *  assert(ref == 1);
+     *
+     *  // the object pointed to by ptr equals 1
+     *  assert(*ptr == 1);
+     *
+     *  // v[0] equals 1
+     *  assert(v[0] == 1);
+     *
+     *  // right XOR-assign ref
+     *  ref ^= 1;
+     *
+     *  // ref equals 0
+     *  assert(ref == 0);
+     *
+     *  // the object pointed to by ptr equals 0
+     *  assert(*ptr == 0);
+     *
+     *  // v[0] equals 0
+     *  assert(v[0] == 0);
+     *  \endcode
+     *
+     *  \note The XOR-assignment executes as as if it were executed on the host.
+     *  This may change in a later version.
+     */
+    device_reference &operator^=(const T &rhs);
+#endif // end doxygen-only members
+}; // end device_reference
+
+/*! swaps the value of one \p device_reference with another.
+ *  \p x The first \p device_reference of interest.
+ *  \p y The second \p device_reference of interest.
+ */
+template<typename T>
+__host__ __device__
+void swap(device_reference<T> x, device_reference<T> y);
+
+// declare these methods for the purpose of Doxygenating them
+// they actually are defined for a derived-from class
+#if 0
+/*! Writes to an output stream the value of a \p device_reference.
+ *
+ *  \param os The output stream.
+ *  \param y The \p device_reference to output.
+ *  \return os.
+ */
+template<typename T, typename charT, typename traits>
+std::basic_ostream<charT, traits> &
+operator<<(std::basic_ostream<charT, traits> &os, const device_reference<T> &y);
+#endif
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_reference.inl>
+
diff --git a/thrust/thrust/device_vector.h b/thrust/thrust/device_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa52ec66236c315c27c36fd88f12292cf65112a1
--- /dev/null
+++ b/thrust/thrust/device_vector.h
@@ -0,0 +1,496 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_vector.h
+ *  \brief A dynamically-sizable array of elements which reside in the "device" memory space
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/vector_base.h>
+#include <thrust/device_allocator.h>
+
+#include <vector>
+#include <utility>
+
+namespace thrust
+{
+
+// forward declaration of host_vector
+template<typename T, typename Alloc> class host_vector;
+
+/*! \addtogroup container_classes Container Classes
+ *  \addtogroup device_containers Device Containers
+ *  \ingroup container_classes
+ *  \{
+ */
+
+/*! A \p device_vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p device_vector may vary dynamically; memory management is
+ *  automatic. The memory associated with a \p device_vector resides in the memory
+ *  space of a parallel device.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see device_allocator
+ *  \see host_vector
+ */
+template<typename T, typename Alloc = thrust::device_allocator<T> >
+  class device_vector
+    : public detail::vector_base<T,Alloc>
+{
+  private:
+    typedef detail::vector_base<T,Alloc> Parent;
+
+  public:
+    /*! \cond
+     */
+    typedef typename Parent::size_type  size_type;
+    typedef typename Parent::value_type value_type;
+    /*! \endcond
+     */
+
+    /*! This constructor creates an empty \p device_vector.
+     */
+    device_vector(void)
+      :Parent() {}
+
+    /*! This constructor creates an empty \p device_vector.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(const Alloc &alloc)
+      :Parent(alloc) {}
+
+    /*! The destructor erases the elements.
+     */
+    //  Define an empty destructor to explicitly specify
+    //  its execution space qualifier, as a workaround for nvcc warning
+    ~device_vector(void) {}
+
+    /*! This constructor creates a \p device_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     */
+    explicit device_vector(size_type n)
+      :Parent(n) {}
+
+    /*! This constructor creates a \p device_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    explicit device_vector(size_type n, const Alloc &alloc)
+      :Parent(n,alloc) {}
+
+    /*! This constructor creates a \p device_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     */
+    explicit device_vector(size_type n, const value_type &value)
+      :Parent(n,value) {}
+
+    /*! This constructor creates a \p device_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    explicit device_vector(size_type n, const value_type &value, const Alloc &alloc)
+      :Parent(n,value,alloc) {}
+
+    /*! Copy constructor copies from an exemplar \p device_vector.
+     *  \param v The \p device_vector to copy.
+     */
+    device_vector(const device_vector &v)
+      :Parent(v) {}
+
+    /*! Copy constructor copies from an exemplar \p device_vector.
+     *  \param v The \p device_vector to copy.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(const device_vector &v, const Alloc &alloc)
+      :Parent(v,alloc) {}
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     */
+    device_vector(device_vector &&v)
+      :Parent(std::move(v)) {}
+
+    /*! Move constructor moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(device_vector &&v, const Alloc &alloc)
+      :Parent(std::move(v), alloc) {}
+  #endif // THRUST_CPP_DIALECT >= 2011
+
+    /*! Copy assign operator copies another \p device_vector with the same type.
+     *  \param v The \p device_vector to copy.
+     */
+    device_vector &operator=(const device_vector &v)
+    { Parent::operator=(v); return *this; }
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move assign operator moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     */
+     device_vector &operator=(device_vector &&v)
+     { Parent::operator=(std::move(v)); return *this; }
+  #endif // THRUST_CPP_DIALECT >= 2011
+
+    /*! Copy constructor copies from an exemplar \p device_vector with different type.
+     *  \param v The \p device_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    explicit device_vector(const device_vector<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
+
+    /*! Assign operator copies from an exemplar \p device_vector with different type.
+     *  \param v The \p device_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    device_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this; }
+
+    /*! Copy constructor copies from an exemplar \c std::vector.
+     *  \param v The <tt>std::vector</tt> to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    device_vector(const std::vector<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
+
+    /*! Assign operator copies from an exemplar <tt>std::vector</tt>.
+     *  \param v The <tt>std::vector</tt> to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    device_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this;}
+
+    /*! Copy constructor copies from an exemplar \p host_vector with possibly different type.
+     *  \param v The \p host_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    device_vector(const host_vector<OtherT,OtherAlloc> &v);
+
+    /*! Assign operator copies from an examplar \p host_vector.
+     *  \param v The \p host_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    device_vector &operator=(const host_vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this; }
+
+    /*! This constructor builds a \p device_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     */
+    template<typename InputIterator>
+    device_vector(InputIterator first, InputIterator last)
+      :Parent(first,last) {}
+
+    /*! This constructor builds a \p device_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    template<typename InputIterator>
+    device_vector(InputIterator first, InputIterator last, const Alloc &alloc)
+      :Parent(first,last,alloc) {}
+
+// declare these members for the purpose of Doxygenating them
+// they actually exist in a derived-from class
+#if 0
+    /*! \brief Resizes this vector to the specified number of elements.
+     *  \param new_size Number of elements this vector should contain.
+     *  \param x Data with which new elements should be populated.
+     *  \throw std::length_error If n exceeds max_size().
+     *
+     *  This method will resize this vector to the specified number of
+     *  elements.  If the number is smaller than this vector's current
+     *  size this vector is truncated, otherwise this vector is
+     *  extended and new elements are populated with given data.
+     */
+    void resize(size_type new_size, const value_type &x = value_type());
+
+    /*! Returns the number of elements in this vector.
+     */
+    size_type size(void) const;
+
+    /*! Returns the size() of the largest possible vector.
+     *  \return The largest possible return value of size().
+     */
+    size_type max_size(void) const;
+
+    /*! \brief If n is less than or equal to capacity(), this call has no effect.
+     *         Otherwise, this method is a request for allocation of additional memory. If
+     *         the request is successful, then capacity() is greater than or equal to
+     *         n; otherwise, capacity() is unchanged. In either case, size() is unchanged.
+     *  \throw std::length_error If n exceeds max_size().
+     */
+    void reserve(size_type n);
+
+    /*! Returns the number of elements which have been reserved in this
+     *  vector.
+     */
+    size_type capacity(void) const;
+
+    /*! This method shrinks the capacity of this vector to exactly
+     *  fit its elements.
+     */
+    void shrink_to_fit(void);
+
+    /*! \brief Subscript access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read/write reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    reference operator[](size_type n);
+
+    /*! \brief Subscript read access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    const_reference operator[](size_type n) const;
+
+    /*! This method returns an iterator pointing to the beginning of
+     *  this vector.
+     *  \return mStart
+     */
+    iterator begin(void);
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector.
+     *  \return mStart
+     */
+    const_iterator begin(void) const;
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector.
+     *  \return mStart
+     */
+    const_iterator cbegin(void) const;
+
+    /*! This method returns a reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    reverse_iterator rbegin(void);
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    const_reverse_iterator rbegin(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    const_reverse_iterator crbegin(void) const;
+
+    /*! This method returns an iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    iterator end(void);
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    const_iterator end(void) const;
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    const_iterator cend(void) const;
+
+    /*! This method returns a reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    reverse_iterator rend(void);
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator rend(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator crend(void) const;
+
+    /*! This method returns a const_reference referring to the first element of this
+     *  vector.
+     *  \return The first element of this vector.
+     */
+    const_reference front(void) const;
+
+    /*! This method returns a reference pointing to the first element of this
+     *  vector.
+     *  \return The first element of this vector.
+     */
+    reference front(void);
+
+    /*! This method returns a const reference pointing to the last element of
+     *  this vector.
+     *  \return The last element of this vector.
+     */
+    const_reference back(void) const;
+
+    /*! This method returns a reference referring to the last element of
+     *  this vector_dev.
+     *  \return The last element of this vector.
+     */
+    reference back(void);
+
+    /*! This method returns a pointer to this vector's first element.
+     *  \return A pointer to the first element of this vector.
+     */
+    pointer data(void);
+
+    /*! This method returns a const_pointer to this vector's first element.
+     *  \return a const_pointer to the first element of this vector.
+     */
+    const_pointer data(void) const;
+
+    /*! This method resizes this vector to 0.
+     */
+    void clear(void);
+
+    /*! This method returns true iff size() == 0.
+     *  \return true if size() == 0; false, otherwise.
+     */
+    bool empty(void) const;
+
+    /*! This method appends the given element to the end of this vector.
+     *  \param x The element to append.
+     */
+    void push_back(const value_type &x);
+
+    /*! This method erases the last element of this vector, invalidating
+     *  all iterators and references to it.
+     */
+    void pop_back(void);
+
+    /*! This method swaps the contents of this device_vector with another vector.
+     *  \param v The vector with which to swap.
+     */
+    void swap(device_vector &v);
+
+    /*! This method removes the element at position pos.
+     *  \param pos The position of the element of interest.
+     *  \return An iterator pointing to the new location of the element that followed the element
+     *          at position pos.
+     */
+    iterator erase(iterator pos);
+
+    /*! This method removes the range of elements [first,last) from this vector.
+     *  \param first The beginning of the range of elements to remove.
+     *  \param last The end of the range of elements to remove.
+     *  \return An iterator pointing to the new location of the element that followed the last
+     *          element in the sequence [first,last).
+     */
+    iterator erase(iterator first, iterator last);
+
+    /*! This method inserts a single copy of a given exemplar value at the
+     *  specified position in this vector.
+     *  \param position The insertion position.
+     *  \param x The exemplar element to copy & insert.
+     *  \return An iterator pointing to the newly inserted element.
+     */
+    iterator insert(iterator position, const T &x); 
+
+    /*! This method inserts a copy of an exemplar value to a range at the
+     *  specified position in this vector.
+     *  \param position The insertion position
+     *  \param n The number of insertions to perform.
+     *  \param x The value to replicate and insert.
+     */
+    void insert(iterator position, size_type n, const T &x);
+
+    /*! This method inserts a copy of an input range at the specified position
+     *  in this vector.
+     *  \param position The insertion position.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     */
+    template<typename InputIterator>
+    void insert(iterator position, InputIterator first, InputIterator last);
+
+    /*! This version of \p assign replicates a given exemplar
+     *  \p n times into this vector.
+     *  \param n The number of times to copy \p x.
+     *  \param x The exemplar element to replicate.
+     */
+    void assign(size_type n, const T &x);
+
+    /*! This version of \p assign makes this vector a copy of a given input range.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     */
+    template<typename InputIterator>
+    void assign(InputIterator first, InputIterator last);
+
+    /*! This method returns a copy of this vector's allocator.
+     *  \return A copy of the alloctor used by this vector.
+     */
+    allocator_type get_allocator(void) const;
+#endif // end doxygen-only members
+}; // end device_vector
+
+/*! Exchanges the values of two vectors.
+ *  \p x The first \p device_vector of interest.
+ *  \p y The second \p device_vector of interest.
+ */
+template<typename T, typename Alloc>
+  void swap(device_vector<T,Alloc> &a, device_vector<T,Alloc> &b)
+{
+  a.swap(b);
+} // end swap()
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/device_vector.inl>
+
+
diff --git a/thrust/thrust/distance.h b/thrust/thrust/distance.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dd4800be7a8975061fb58777d603f13fb0c82b6
--- /dev/null
+++ b/thrust/thrust/distance.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file distance.h
+ *  \brief Computes the size of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \p distance finds the distance between \p first and \p last, i.e. the
+ *  number of times that \p first must be incremented until it is equal to
+ *  \p last.
+ *
+ *  \param first The beginning of an input range of interest.
+ *  \param last The end of an input range of interest.
+ *  \return The distance between the beginning and end of the input range.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *
+ *  \pre If \c InputIterator meets the requirements of random access iterator, \p last shall be reachable from \p first or
+ *       \p first shall be reachable from \p last; otherwise, \p last shall be reachable from \p first.
+ *
+ *  The following code snippet demonstrates how to use \p distance to compute
+ *  the distance to one iterator from another.
+ *
+ *  \code
+ *  #include <thrust/distance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator iter1 = vec.begin();
+ *  thrust::device_vector<int>::iterator iter2 = iter1 + 7;
+ *
+ *  int d = thrust::distance(iter1, iter2);
+ *
+ *  // d is 7
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/distance.html
+ */
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last);
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
+#include <thrust/detail/distance.inl>
+
diff --git a/thrust/thrust/equal.h b/thrust/thrust/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc6db501573534cf5c78f51d9dd3becffb7e2180
--- /dev/null
+++ b/thrust/thrust/equal.h
@@ -0,0 +1,238 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file equal.h
+ *  \brief Equality between ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup comparisons
+ *  \ingroup reductions
+ *  \{
+ */
+
+
+/*! \p equal returns \c true if the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> are identical when
+ *  compared element-by-element, and otherwise returns \c false.
+ *
+ *  This version of \p equal returns \c true if and only if for every
+ *  iterator \c i in <tt>[first1, last1)</tt>, <tt>*i == *(first2 + (i - first1))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \return \c true, if the sequences are equal; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p equal to test
+ *  two ranges for equality using the \p thrust::host execution policy:
+ *
+ *  \code
+ *  #include <thrust/equal.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[7] = {3, 1, 4, 1, 5, 9, 3};
+ *  int A2[7] = {3, 1, 4, 2, 8, 5, 7};
+ *  ...
+ *  bool result = thrust::equal(thrust::host, A1, A1 + 7, A2);
+ *
+ *  // result == false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal.html
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);
+
+
+/*! \p equal returns \c true if the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> are identical when
+ *  compared element-by-element, and otherwise returns \c false.
+ *
+ *  This version of \p equal returns \c true if and only if for every
+ *  iterator \c i in <tt>[first1, last1)</tt>, <tt>*i == *(first2 + (i - first1))</tt>.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \return \c true, if the sequences are equal; \c false, otherwise.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p equal to test
+ *  two ranges for equality.
+ *
+ *  \code
+ *  #include <thrust/equal.h>
+ *  ...
+ *  int A1[7] = {3, 1, 4, 1, 5, 9, 3};
+ *  int A2[7] = {3, 1, 4, 2, 8, 5, 7};
+ *  ...
+ *  bool result = thrust::equal(A1, A1 + 7, A2);
+ *
+ *  // result == false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal.html
+ */
+template <typename InputIterator1, typename InputIterator2>
+bool equal(InputIterator1 first1, InputIterator1 last1,
+           InputIterator2 first2);
+
+
+/*! \p equal returns \c true if the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> are identical when
+ *  compared element-by-element, and otherwise returns \c false.
+ *
+ *  This version of \p equal returns \c true if and only if for every
+ *  iterator \c i in <tt>[first1, last1)</tt>,
+ *  <tt>binary_pred(*i, *(first2 + (i - first1)))</tt> is \c true.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param binary_pred Binary predicate used to test element equality.
+ *  \return \c true, if the sequences are equal; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p equal to compare the
+ *  elements in two ranges modulo 2 using the \p thrust::host execution policy.
+ *
+ *  \code
+ *  #include <thrust/equal.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  struct compare_modulo_two
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x, int y) const
+ *    {
+ *      return (x % 2) == (y % 2);
+ *    }
+ *  };
+ *  ...
+ *  int x[6] = {0, 2, 4, 6, 8, 10};
+ *  int y[6] = {1, 3, 5, 7, 9, 11};
+ *
+ *  bool result = thrust::equal(x, x + 6, y, compare_modulo_two());
+ *
+ *  // result is false
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal.html
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred);
+
+
+/*! \p equal returns \c true if the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> are identical when
+ *  compared element-by-element, and otherwise returns \c false.
+ *
+ *  This version of \p equal returns \c true if and only if for every
+ *  iterator \c i in <tt>[first1, last1)</tt>,
+ *  <tt>binary_pred(*i, *(first2 + (i - first1)))</tt> is \c true.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param binary_pred Binary predicate used to test element equality.
+ *  \return \c true, if the sequences are equal; \c false, otherwise.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p equal to compare the
+ *  elements in two ranges modulo 2.
+ *
+ *  \code
+ *  #include <thrust/equal.h>
+ *  
+ *  struct compare_modulo_two
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x, int y) const
+ *    {
+ *      return (x % 2) == (y % 2);
+ *    }
+ *  };
+ *  ...
+ *  int x[6] = {0, 2, 4, 6, 8, 10};
+ *  int y[6] = {1, 3, 5, 7, 9, 11};
+ *
+ *  bool result = thrust::equal(x, x + 5, y, compare_modulo_two());
+ *
+ *  // result is true
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/equal.html
+ */
+template <typename InputIterator1, typename InputIterator2, 
+          typename BinaryPredicate>
+bool equal(InputIterator1 first1, InputIterator1 last1,
+           InputIterator2 first2, BinaryPredicate binary_pred);
+
+
+/*! \} // end comparisons
+ *  \} // end reductions
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/equal.inl>
+
diff --git a/thrust/thrust/event.h b/thrust/thrust/event.h
new file mode 100644
index 0000000000000000000000000000000000000000..75578d96443c2e20c0b80b4d4efe28a6a5a5b04f
--- /dev/null
+++ b/thrust/thrust/event.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/event.h
+ *  \brief `thrust::event`, an asynchronous handle type.
+ */
+
+#pragma once
+
+#include <thrust/future.h>
+
+// TODO: Actually separate `<thrust/future.h>` into two headers.
+
diff --git a/thrust/thrust/execution_policy.h b/thrust/thrust/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a4caba0f3bdb5215a5642c82ef1efc668dfda3
--- /dev/null
+++ b/thrust/thrust/execution_policy.h
@@ -0,0 +1,396 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/execution_policy.h
+ *  \brief Thrust execution policies.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/execute_with_allocator.h>
+#include <thrust/detail/seq.h>
+
+//! \cond
+
+// #include the host system's execution_policy header
+#define __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_HOST_SYSTEM_ROOT/execution_policy.h>
+#include __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER
+#undef __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER
+
+// #include the device system's execution_policy.h header
+#define __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/execution_policy.h>
+#include __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER
+
+//! \endcond
+
+namespace thrust
+{
+
+
+/*! \cond
+ */
+
+
+namespace detail
+{
+
+
+typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::detail::par_t host_t;
+
+
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::detail::par_t device_t;
+
+
+} // end detail
+
+
+/*! \endcond
+ */
+
+
+/*! \addtogroup execution_policies Parallel Execution Policies
+ *  \{
+ */
+
+
+// define execution_policy for the purpose of Doxygenating it
+// it is actually defined elsewhere
+#if 0
+/*! \p execution_policy is the base class for all Thrust parallel execution policies
+ *  like \p thrust::host, \p thrust::device, and each backend system's tag type.
+ *
+ *  Custom user-defined backends should derive a policy from this type in order to
+ *  interoperate with Thrust algorithm dispatch.
+ *
+ *  The following code snippet demonstrates how to derive a standalone custom execution policy
+ *  from \p thrust::execution_policy to implement a backend which only implements \p for_each:
+ *
+ *  \code
+ *  #include <thrust/execution_policy.h>
+ *  #include <iostream>
+ *
+ *  // define a type derived from thrust::execution_policy to distinguish our custom execution policy:
+ *  struct my_policy : thrust::execution_policy<my_policy> {};
+ *
+ *  // overload for_each on my_policy
+ *  template<typename Iterator, typename Function>
+ *  Iterator for_each(my_policy, Iterator first, Iterator last, Function f)
+ *  {
+ *    std::cout << "Hello, world from for_each(my_policy)!" << std::endl;
+ *
+ *    for(; first < last; ++first)
+ *    {
+ *      f(*first);
+ *    }
+ *
+ *    return first;
+ *  }
+ *
+ *  struct ignore_argument
+ *  {
+ *    void operator()(int) {}
+ *  };
+ *
+ *  int main()
+ *  {
+ *    int data[4];
+ *
+ *    // dispatch thrust::for_each using our custom policy:
+ *    my_policy exec;
+ *    thrust::for_each(exec, data, data + 4, ignore_argument());
+ *
+ *    // can't dispatch thrust::transform because no overload exists for my_policy:
+ *    //thrust::transform(exec, data, data, + 4, data, thrust::identity<int>()); // error!
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see host_execution_policy
+ *  \see device_execution_policy
+ */
+template<typename DerivedPolicy>
+struct execution_policy : thrust::detail::execution_policy_base<DerivedPolicy>
+{};
+#endif
+
+
+/*! \p host_execution_policy is the base class for all Thrust parallel execution policies
+ *  which are derived from Thrust's default host backend system configured with the \p THRUST_HOST_SYSTEM
+ *  macro.
+ *
+ *  Custom user-defined backends which wish to inherit the functionality of Thrust's host backend system
+ *  should derive a policy from this type in order to interoperate with Thrust algorithm dispatch.
+ *
+ *  The following code snippet demonstrates how to derive a standalone custom execution policy from
+ *  \p thrust::host_execution_policy to implement a backend which specializes \p for_each while inheriting
+ *  the behavior of every other algorithm from the host system:
+ *
+ *  \code
+ *  #include <thrust/execution_policy.h>
+ *  #include <iostream>
+ *
+ *  // define a type derived from thrust::host_execution_policy to distinguish our custom execution policy:
+ *  struct my_policy : thrust::host_execution_policy<my_policy> {};
+ *
+ *  // overload for_each on my_policy
+ *  template<typename Iterator, typename Function>
+ *  Iterator for_each(my_policy, Iterator first, Iterator last, Function f)
+ *  {
+ *    std::cout << "Hello, world from for_each(my_policy)!" << std::endl;
+ *
+ *    for(; first < last; ++first)
+ *    {
+ *      f(*first);
+ *    }
+ *
+ *    return first;
+ *  }
+ *
+ *  struct ignore_argument
+ *  {
+ *    void operator()(int) {}
+ *  };
+ *
+ *  int main()
+ *  {
+ *    int data[4];
+ *
+ *    // dispatch thrust::for_each using our custom policy:
+ *    my_policy exec;
+ *    thrust::for_each(exec, data, data + 4, ignore_argument());
+ *
+ *    // dispatch thrust::transform whose behavior our policy inherits
+ *    thrust::transform(exec, data, data, + 4, data, thrust::identity<int>());
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see execution_policy
+ *  \see device_execution_policy
+ */
+template<typename DerivedPolicy>
+  struct host_execution_policy
+    : thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::execution_policy<DerivedPolicy>
+{};
+
+
+/*! \p device_execution_policy is the base class for all Thrust parallel execution policies
+ *  which are derived from Thrust's default device backend system configured with the \p THRUST_DEVICE_SYSTEM
+ *  macro.
+ *
+ *  Custom user-defined backends which wish to inherit the functionality of Thrust's device backend system
+ *  should derive a policy from this type in order to interoperate with Thrust algorithm dispatch.
+ *
+ *  The following code snippet demonstrates how to derive a standalone custom execution policy from
+ *  \p thrust::device_execution_policy to implement a backend which specializes \p for_each while inheriting
+ *  the behavior of every other algorithm from the device system:
+ *
+ *  \code
+ *  #include <thrust/execution_policy.h>
+ *  #include <iostream>
+ *
+ *  // define a type derived from thrust::device_execution_policy to distinguish our custom execution policy:
+ *  struct my_policy : thrust::device_execution_policy<my_policy> {};
+ *
+ *  // overload for_each on my_policy
+ *  template<typename Iterator, typename Function>
+ *  Iterator for_each(my_policy, Iterator first, Iterator last, Function f)
+ *  {
+ *    std::cout << "Hello, world from for_each(my_policy)!" << std::endl;
+ *
+ *    for(; first < last; ++first)
+ *    {
+ *      f(*first);
+ *    }
+ *
+ *    return first;
+ *  }
+ *
+ *  struct ignore_argument
+ *  {
+ *    void operator()(int) {}
+ *  };
+ *
+ *  int main()
+ *  {
+ *    int data[4];
+ *
+ *    // dispatch thrust::for_each using our custom policy:
+ *    my_policy exec;
+ *    thrust::for_each(exec, data, data + 4, ignore_argument());
+ *
+ *    // dispatch thrust::transform whose behavior our policy inherits
+ *    thrust::transform(exec, data, data, + 4, data, thrust::identity<int>());
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see execution_policy
+ *  \see host_execution_policy
+ */
+template<typename DerivedPolicy>
+  struct device_execution_policy
+    : thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::execution_policy<DerivedPolicy>
+{};
+
+
+/*! \p thrust::host is the default parallel execution policy associated with Thrust's host backend system
+ *  configured by the \p THRUST_HOST_SYSTEM macro.
+ *
+ *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target
+ *  algorithm dispatch at Thrust's host system by providing \p thrust::host as an algorithm parameter.
+ *
+ *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as
+ *  \p thrust::host_vector.
+ *
+ *  Note that even though \p thrust::host targets the host CPU, it is a parallel execution policy. That is,
+ *  the order that an algorithm invokes functors or dereferences iterators is not defined.
+ *
+ *  The type of \p thrust::host is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::host to explicitly dispatch an invocation
+ *  of \p thrust::for_each to the host backend system:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  int vec(3);
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::host, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see host_execution_policy
+ *  \see thrust::device
+ */
+static const detail::host_t host;
+
+
+/*! \p thrust::device is the default parallel execution policy associated with Thrust's device backend system
+ *  configured by the \p THRUST_DEVICE_SYSTEM macro.
+ *
+ *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target
+ *  algorithm dispatch at Thrust's device system by providing \p thrust::device as an algorithm parameter.
+ *
+ *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as
+ *  \p thrust::device_vector or to avoid wrapping e.g. raw pointers allocated by the CUDA API with types
+ *  such as \p thrust::device_ptr.
+ *
+ *  The user must take care to guarantee that the iterators provided to an algorithm are compatible with
+ *  the device backend system. For example, raw pointers allocated by <tt>std::malloc</tt> typically
+ *  cannot be dereferenced by a GPU. For this reason, raw pointers allocated by host APIs should not be mixed
+ *  with a \p thrust::device algorithm invocation when the device backend is CUDA.
+ *
+ *  The type of \p thrust::device is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::device to explicitly dispatch an invocation
+ *  of \p thrust::for_each to the device backend system:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  thrust::device_vector<int> vec(3);
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::device, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see host_execution_policy
+ *  \see thrust::device
+ */
+THRUST_INLINE_CONSTANT detail::device_t device;
+
+
+// define seq for the purpose of Doxygenating it
+// it is actually defined elsewhere
+#if 0
+/*! \p thrust::seq is an execution policy which requires an algorithm invocation to execute sequentially
+ *  in the current thread. It can not be configured by a compile-time macro.
+ *
+ *  The type of \p thrust::seq is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::seq to explicitly execute an invocation
+ *  of \p thrust::for_each sequentially:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <vector>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  std::vector<int> vec(3);
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::seq, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in sequential order
+ *  \endcode
+ *
+ *  \see thrust::host
+ *  \see thrust::device
+ */
+static const detail::seq_t seq;
+#endif
+
+
+/*! \}
+ */
+
+
+} // end thrust
+
diff --git a/thrust/thrust/extrema.h b/thrust/thrust/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9fd016ccc36196dc071eceff7b64c545f11f096
--- /dev/null
+++ b/thrust/thrust/extrema.h
@@ -0,0 +1,804 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file extrema.h
+ *  \brief Functions for computing computing extremal values
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! This version of \p min returns the smaller of two values, given a comparison operation.
+ *  \param lhs The first value to compare.
+ *  \param rhs The second value to compare.
+ *  \param comp A comparison operation.
+ *  \return The smaller element.
+ *
+ *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p min to compute the smaller of two
+ *  key-value objects.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value a = {13, 0};
+ *  key_value b = { 7, 1);
+ *
+ *  key_value smaller = thrust::min(a, b, compare_key_value());
+ *
+ *  // smaller is {7, 1}
+ *  \endcode
+ *
+ *  \note Returns the first argument when the arguments are equivalent.
+ *  \see max
+ */
+template<typename T, typename BinaryPredicate>
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp);
+
+
+/*! This version of \p min returns the smaller of two values.
+ *  \param lhs The first value to compare.
+ *  \param rhs The second value to compare.
+ *  \return The smaller element.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p min to compute the smaller of two
+ *  integers.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  int a = 13;
+ *  int b = 7;
+ *
+ *  int smaller = thrust::min(a, b);
+ *
+ *  // smaller is 7
+ *  \endcode
+ *
+ *  \note Returns the first argument when the arguments are equivalent.
+ *  \see max
+ */
+template<typename T>
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs);
+
+
+/*! This version of \p max returns the larger of two values, given a comparison operation.
+ *  \param lhs The first value to compare.
+ *  \param rhs The second value to compare.
+ *  \param comp A comparison operation.
+ *  \return The larger element.
+ *
+ *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p max to compute the larger of two
+ *  key-value objects.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value a = {13, 0};
+ *  key_value b = { 7, 1);
+ *
+ *  key_value larger = thrust::max(a, b, compare_key_value());
+ *
+ *  // larger is {13, 0}
+ *  \endcode
+ *
+ *  \note Returns the first argument when the arguments are equivalent.
+ *  \see min
+ */
+template<typename T, typename BinaryPredicate>
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp);
+
+
+/*! This version of \p max returns the larger of two values.
+ *  \param lhs The first value to compare.
+ *  \param rhs The second value to compare.
+ *  \return The larger element.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p max to compute the larger of two
+ *  integers.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  int a = 13;
+ *  int b = 7;
+ *
+ *  int larger = thrust::min(a, b);
+ *
+ *  // larger is 13
+ *  \endcode
+ *
+ *  \note Returns the first argument when the arguments are equivalent.
+ *  \see min
+ */
+template<typename T>
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs);
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup extrema
+ *  \ingroup reductions
+ *  \{
+ */
+
+/*! \p min_element finds the smallest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value smaller
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p min_element differ in how they define whether one element is
+ *  less than another. This version compares objects using \c operator<. Specifically,
+ *  this version of \p min_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>*j < *i</tt> is
+ *  \c false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int *result = thrust::min_element(thrust::host, data, data + 6);
+ *
+ *  // result is data + 1
+ *  // *result is 0
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/min_element.html 
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last);
+
+
+/*! \p min_element finds the smallest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value smaller
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p min_element differ in how they define whether one element is
+ *  less than another. This version compares objects using \c operator<. Specifically,
+ *  this version of \p min_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>*j < *i</tt> is
+ *  \c false.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int *result = thrust::min_element(data, data + 6);
+ *
+ *  // result is data + 1
+ *  // *result is 0
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/min_element.html 
+ */
+template <typename ForwardIterator>
+ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
+
+
+/*! \p min_element finds the smallest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value smaller
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p min_element differ in how they define whether one element is
+ *  less than another. This version compares objects using a function object \p comp.
+ *  Specifically, this version of \p min_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>comp(*j, *i)</tt> is
+ *  \c false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p min_element to find the smallest element
+ *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  key_value *smallest = thrust::min_element(thrust::host, data, data + 4, compare_key_value());
+ *
+ *  // smallest == data + 1
+ *  // *smallest == {0,7}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/min_element.html 
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp);
+
+
+/*! \p min_element finds the smallest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value smaller
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p min_element differ in how they define whether one element is
+ *  less than another. This version compares objects using a function object \p comp.
+ *  Specifically, this version of \p min_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>comp(*j, *i)</tt> is
+ *  \c false.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p min_element to find the smallest element
+ *  of a collection of key-value pairs.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  key_value *smallest = thrust::min_element(data, data + 4, compare_key_value());
+ *
+ *  // smallest == data + 1
+ *  // *smallest == {0,7}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/min_element.html 
+ */
+template <typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
+                            BinaryPredicate comp);
+
+
+/*! \p max_element finds the largest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value larger
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p max_element differ in how they define whether one element is
+ *  greater than another. This version compares objects using \c operator<. Specifically,
+ *  this version of \p max_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>*i < *j</tt> is
+ *  \c false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam A Thrust backend system.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int *result = thrust::max_element(thrust::host, data, data + 6);
+ *
+ *  // *result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/max_element.html 
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last);
+
+
+/*! \p max_element finds the largest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value larger
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p max_element differ in how they define whether one element is
+ *  greater than another. This version compares objects using \c operator<. Specifically,
+ *  this version of \p max_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>*i < *j</tt> is
+ *  \c false.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int *result = thrust::max_element(data, data + 6);
+ *
+ *  // *result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/max_element.html 
+ */
+template <typename ForwardIterator>
+ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
+
+
+/*! \p max_element finds the largest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value larger
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p max_element differ in how they define whether one element is
+ *  less than another. This version compares objects using a function object \p comp.
+ *  Specifically, this version of \p max_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>comp(*i, *j)</tt> is
+ *  \c false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p max_element to find the largest element
+ *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  key_value *largest = thrust::max_element(thrust::host, data, data + 4, compare_key_value());
+ *
+ *  // largest == data + 3
+ *  // *largest == {6,1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/max_element.html 
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp);
+
+
+/*! \p max_element finds the largest element in the range <tt>[first, last)</tt>.
+ *  It returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that no other iterator in <tt>[first, last)</tt> points to a value larger
+ *  than \c *i. The return value is \p last if and only if <tt>[first, last)</tt> is an
+ *  empty range.
+ *
+ *  The two versions of \p max_element differ in how they define whether one element is
+ *  less than another. This version compares objects using a function object \p comp.
+ *  Specifically, this version of \p max_element returns the first iterator \c i in <tt>[first, last)</tt>
+ *  such that, for every iterator \c j in <tt>[first, last)</tt>, <tt>comp(*i, *j)</tt> is
+ *  \c false.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p max_element to find the largest element
+ *  of a collection of key-value pairs.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  key_value *largest = thrust::max_element(data, data + 4, compare_key_value());
+ *
+ *  // largest == data + 3
+ *  // *largest == {6,1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/max_element.html 
+ */
+template <typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
+                            BinaryPredicate comp);
+
+
+/*! \p minmax_element finds the smallest and largest elements in the range <tt>[first, last)</tt>.
+ *  It returns a pair of iterators <tt>(imin, imax)</tt> where \c imin is the same iterator
+ *  returned by \p min_element and \c imax is the same iterator returned by \p max_element.
+ *  This function is potentially more efficient than separate calls to \p min_element and \p max_element.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  thrust::pair<int *, int *> result = thrust::minmax_element(thrust::host, data, data + 6);
+ *
+ *  // result.first is data + 1
+ *  // result.second is data + 5
+ *  // *result.first is 0
+ *  // *result.second is 3
+ *  \endcode
+ *
+ *  \see min_element
+ *  \see max_element
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last);
+
+
+/*! \p minmax_element finds the smallest and largest elements in the range <tt>[first, last)</tt>.
+ *  It returns a pair of iterators <tt>(imin, imax)</tt> where \c imin is the same iterator
+ *  returned by \p min_element and \c imax is the same iterator returned by \p max_element.
+ *  This function is potentially more efficient than separate calls to \p min_element and \p max_element.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \c ForwardIterator's \c value_type is a model of
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  thrust::pair<int *, int *> result = thrust::minmax_element(data, data + 6);
+ *
+ *  // result.first is data + 1
+ *  // result.second is data + 5
+ *  // *result.first is 0
+ *  // *result.second is 3
+ *  \endcode
+ *
+ *  \see min_element
+ *  \see max_element
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf
+ */
+template <typename ForwardIterator>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator first, 
+                                                             ForwardIterator last);
+
+
+/*! \p minmax_element finds the smallest and largest elements in the range <tt>[first, last)</tt>.
+ *  It returns a pair of iterators <tt>(imin, imax)</tt> where \c imin is the same iterator
+ *  returned by \p min_element and \c imax is the same iterator returned by \p max_element.
+ *  This function is potentially more efficient than separate calls to \p min_element and \p max_element.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
+ *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/pair.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  thrust::pair<key_value*,key_value*> extrema = thrust::minmax_element(thrust::host, data, data + 4, compare_key_value());
+ *
+ *  // extrema.first   == data + 1
+ *  // *extrema.first  == {0,7}
+ *  // extrema.second  == data + 3
+ *  // *extrema.second == {6,1}
+ *  \endcode
+ *
+ *  \see min_element
+ *  \see max_element
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp);
+
+
+/*! \p minmax_element finds the smallest and largest elements in the range <tt>[first, last)</tt>.
+ *  It returns a pair of iterators <tt>(imin, imax)</tt> where \c imin is the same iterator
+ *  returned by \p min_element and \c imax is the same iterator returned by \p max_element.
+ *  This function is potentially more efficient than separate calls to \p min_element and \p max_element.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp A binary predicate used for comparison.
+ *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
+ *          if it is not an empty range; \p last, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
+ *  of a collection of key-value pairs.
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/pair.h>
+ *
+ *  struct key_value
+ *  {
+ *    int key;
+ *    int value;
+ *  };
+ *
+ *  struct compare_key_value
+ *  {
+ *    __host__ __device__
+ *    bool operator()(key_value lhs, key_value rhs)
+ *    {
+ *      return lhs.key < rhs.key;
+ *    }
+ *  };
+ *
+ *  ...
+ *  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };
+ *
+ *  thrust::pair<key_value*,key_value*> extrema = thrust::minmax_element(data, data + 4, compare_key_value());
+ *
+ *  // extrema.first   == data + 1
+ *  // *extrema.first  == {0,7}
+ *  // extrema.second  == data + 3
+ *  // *extrema.second == {6,1}
+ *  \endcode
+ *
+ *  \see min_element
+ *  \see max_element
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf
+ */
+template <typename ForwardIterator, typename BinaryPredicate>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp);
+
+/*! \} // end extrema
+ *  \} // end reductions
+ */
+
+} // end thrust
+
+#include <thrust/detail/extrema.inl>
+#include <thrust/detail/minmax.h>
+
diff --git a/thrust/thrust/fill.h b/thrust/thrust/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..8503138023fcd87a3d45f34f7d9f476badfc00cb
--- /dev/null
+++ b/thrust/thrust/fill.h
@@ -0,0 +1,209 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file fill.h
+ *  \brief Fills a range with a constant value
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ *  \addtogroup filling
+ *  \ingroup transformations
+ *  \{
+ */
+
+
+/*! \p fill assigns the value \p value to every element in
+ *  the range <tt>[first, last)</tt>. That is, for every
+ *  iterator \c i in <tt>[first, last)</tt>, it performs
+ *  the assignment <tt>*i = value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param value The value to be copied.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
+ *  elements to a given value using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/fill.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> v(4);
+ *  thrust::fill(thrust::device, v.begin(), v.end(), 137);
+ *
+ *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see \c fill_n
+ *  \see \c uninitialized_fill
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void fill(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T &value);
+
+
+/*! \p fill assigns the value \p value to every element in
+ *  the range <tt>[first, last)</tt>. That is, for every
+ *  iterator \c i in <tt>[first, last)</tt>, it performs
+ *  the assignment <tt>*i = value</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param value The value to be copied.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
+ *  elements to a given value.
+ *
+ *  \code
+ *  #include <thrust/fill.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> v(4);
+ *  thrust::fill(v.begin(), v.end(), 137);
+ *
+ *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see \c fill_n
+ *  \see \c uninitialized_fill
+ */
+template<typename ForwardIterator, typename T>
+__host__ __device__
+  void fill(ForwardIterator first,
+            ForwardIterator last,
+            const T &value);
+
+
+/*! \p fill_n assigns the value \p value to every element in
+ *  the range <tt>[first, first+n)</tt>. That is, for every
+ *  iterator \c i in <tt>[first, first+n)</tt>, it performs
+ *  the assignment <tt>*i = value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param n The size of the sequence.
+ *  \param value The value to be copied.
+ *  \return <tt>first + n</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
+ *  elements to a given value using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/fill.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> v(4);
+ *  thrust::fill_n(thrust::device, v.begin(), v.size(), 137);
+ *
+ *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see \c fill
+ *  \see \c uninitialized_fill_n
+ */
+template<typename DerivedPolicy, typename OutputIterator, typename Size, typename T>
+__host__ __device__
+  OutputIterator fill_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        OutputIterator first,
+                        Size n,
+                        const T &value);
+
+
+/*! \p fill_n assigns the value \p value to every element in
+ *  the range <tt>[first, first+n)</tt>. That is, for every
+ *  iterator \c i in <tt>[first, first+n)</tt>, it performs
+ *  the assignment <tt>*i = value</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param n The size of the sequence.
+ *  \param value The value to be copied.
+ *  \return <tt>first + n</tt>
+ *
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
+ *  elements to a given value.
+ *
+ *  \code
+ *  #include <thrust/fill.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> v(4);
+ *  thrust::fill_n(v.begin(), v.size(), 137);
+ *
+ *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see \c fill
+ *  \see \c uninitialized_fill_n
+ */
+template<typename OutputIterator, typename Size, typename T>
+__host__ __device__
+  OutputIterator fill_n(OutputIterator first,
+                        Size n,
+                        const T &value);
+
+
+/*! \} // end filling
+ *  \} // transformations
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/fill.inl>
+
diff --git a/thrust/thrust/find.h b/thrust/thrust/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e992499e9fa56af445d69b58941a1278a72ef67
--- /dev/null
+++ b/thrust/thrust/find.h
@@ -0,0 +1,385 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file find.h
+ *  \brief Locating values in (unsorted) ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup searching
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p find returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>*i == value</tt>
+ *  or \c last if no such iterator exists.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param value The value to find.
+ *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \p InputIterator's \c value_type is equality comparable to type \c T.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find(thrust::device, input.begin(), input.end(), 3); // returns input.first() + 2
+ *  iter = thrust::find(thrust::device, input.begin(), input.end(), 5); // returns input.first() + 1
+ *  iter = thrust::find(thrust::device, input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see find_if
+ *  \see mismatch
+ */
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+InputIterator find(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   const T& value);
+
+
+/*! \p find returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>*i == value</tt>
+ *  or \c last if no such iterator exists.
+ *
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param value The value to find.
+ *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \p InputIterator's \c value_type is equality comparable to type \c T.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find(input.begin(), input.end(), 3); // returns input.first() + 2
+ *  iter = thrust::find(input.begin(), input.end(), 5); // returns input.first() + 1
+ *  iter = thrust::find(input.begin(), input.end(), 9); // returns input.end()
+ *  \endcode
+ *
+ *  \see find_if
+ *  \see mismatch
+ */
+template <typename InputIterator, typename T>
+InputIterator find(InputIterator first,
+                   InputIterator last,
+                   const T& value);
+
+
+/*! \p find_if returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c true
+ *  or \c last if no such iterator exists.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param pred A predicate used to test range elements.
+ *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  struct greater_than_four
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 4;
+ *    }
+ *  };
+ *
+ *  struct greater_than_ten
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 10;
+ *    }
+ *  };
+ *
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first() + 1
+ *
+ *  iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_ten());  // returns input.end()
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if_not
+ *  \see mismatch
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+/*! \p find_if returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c true
+ *  or \c last if no such iterator exists.
+ *
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param pred A predicate used to test range elements.
+ *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct greater_than_four
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 4;
+ *    }
+ *  };
+ *
+ *  struct greater_than_ten
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 10;
+ *    }
+ *  };
+ *
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find_if(input.begin(), input.end(), greater_than_four()); // returns input.first() + 1
+ *
+ *  iter = thrust::find_if(input.begin(), input.end(), greater_than_ten());  // returns input.end()
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if_not
+ *  \see mismatch
+ */
+template <typename InputIterator, typename Predicate>
+InputIterator find_if(InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+/*! \p find_if_not returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c false
+ *  or \c last if no such iterator exists.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param pred A predicate used to test range elements.
+ *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  struct greater_than_four
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 4;
+ *    }
+ *  };
+ *
+ *  struct greater_than_ten
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 10;
+ *    }
+ *  };
+ *
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first()
+ *
+ *  iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_ten());  // returns input.first()
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ *  \see mismatch
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if_not(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          Predicate pred);
+
+
+/*! \p find_if_not returns the first iterator \c i in the range 
+ *  <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c false
+ *  or \c last if no such iterator exists.
+ *
+ *  \param first Beginning of the sequence to search.
+ *  \param last End of the sequence to search.
+ *  \param pred A predicate used to test range elements.
+ *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/find.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct greater_than_four
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 4;
+ *    }
+ *  };
+ *
+ *  struct greater_than_ten
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x > 10;
+ *    }
+ *  };
+ *
+ *  ...
+ *  thrust::device_vector<int> input(4);
+ *
+ *  input[0] = 0;
+ *  input[1] = 5;
+ *  input[2] = 3;
+ *  input[3] = 7;
+ *
+ *  thrust::device_vector<int>::iterator iter;
+ *
+ *  iter = thrust::find_if_not(input.begin(), input.end(), greater_than_four()); // returns input.first()
+ *
+ *  iter = thrust::find_if_not(input.begin(), input.end(), greater_than_ten());  // returns input.first()
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ *  \see mismatch
+ */
+template <typename InputIterator, typename Predicate>
+InputIterator find_if_not(InputIterator first,
+                          InputIterator last,
+                          Predicate pred);
+
+/*! \} // end searching
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/find.inl>
+
diff --git a/thrust/thrust/for_each.h b/thrust/thrust/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcc87f399445ce776aabee256f97cd0d4570bc99
--- /dev/null
+++ b/thrust/thrust/for_each.h
@@ -0,0 +1,280 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/for_each.h
+ *  \brief Applies a function to each element in a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup modifying
+ *  \ingroup transformations
+ *  \{
+ */
+
+
+/*! \p for_each applies the function object \p f to each element
+ *  in the range <tt>[first, last)</tt>; \p f's return value, if any,
+ *  is ignored. Unlike the C++ Standard Template Library function
+ *  <tt>std::for_each</tt>, this version offers no guarantee on
+ *  order of execution. For this reason, this version of \p for_each
+ *  does not return a copy of the function object.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param f The function object to apply to the range <tt>[first, last)</tt>.
+ *  \return last
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *          and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ *  The following code snippet demonstrates how to use \p for_each to print the elements
+ *  of a \p std::device_vector using the \p thrust::device parallelization policy:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdio>
+ *  ...
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      // note that using printf in a __device__ function requires
+ *      // code compiled for a GPU with compute capability 2.0 or
+ *      // higher (nvcc --arch=sm_20)
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  thrust::device_vector<int> d_vec(3);
+ *  d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::device, d_vec.begin(), d_vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see for_each_n
+ *  \see http://www.sgi.com/tech/stl/for_each.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator first,
+                       InputIterator last,
+                       UnaryFunction f);
+
+
+/*! \p for_each_n applies the function object \p f to each element
+ *  in the range <tt>[first, first + n)</tt>; \p f's return value, if any,
+ *  is ignored. Unlike the C++ Standard Template Library function
+ *  <tt>std::for_each</tt>, this version offers no guarantee on
+ *  order of execution.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param n The size of the input sequence.
+ *  \param f The function object to apply to the range <tt>[first, first + n)</tt>.
+ *  \return <tt>first + n</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam Size is an integral type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *          and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ *  The following code snippet demonstrates how to use \p for_each_n to print the elements
+ *  of a \p device_vector using the \p thrust::device parallelization policy.
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      // note that using printf in a __device__ function requires
+ *      // code compiled for a GPU with compute capability 2.0 or
+ *      // higher (nvcc --arch=sm_20)
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  thrust::device_vector<int> d_vec(3);
+ *  d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ *  thrust::for_each_n(thrust::device, d_vec.begin(), d_vec.size(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see for_each
+ *  \see http://www.sgi.com/tech/stl/for_each.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         Size n,
+                         UnaryFunction f);
+
+/*! \p for_each applies the function object \p f to each element
+ *  in the range <tt>[first, last)</tt>; \p f's return value, if any,
+ *  is ignored. Unlike the C++ Standard Template Library function
+ *  <tt>std::for_each</tt>, this version offers no guarantee on
+ *  order of execution. For this reason, this version of \p for_each
+ *  does not return a copy of the function object.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param f The function object to apply to the range <tt>[first, last)</tt>.
+ *  \return last
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *          and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ *  The following code snippet demonstrates how to use \p for_each to print the elements
+ *  of a \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <stdio.h>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      // note that using printf in a __device__ function requires
+ *      // code compiled for a GPU with compute capability 2.0 or
+ *      // higher (nvcc --arch=sm_20)
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  thrust::device_vector<int> d_vec(3);
+ *  d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ *  thrust::for_each(d_vec.begin(), d_vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see for_each_n
+ *  \see http://www.sgi.com/tech/stl/for_each.html
+ */
+template<typename InputIterator,
+         typename UnaryFunction>
+InputIterator for_each(InputIterator first,
+                       InputIterator last,
+                       UnaryFunction f);
+
+
+/*! \p for_each_n applies the function object \p f to each element
+ *  in the range <tt>[first, first + n)</tt>; \p f's return value, if any,
+ *  is ignored. Unlike the C++ Standard Template Library function
+ *  <tt>std::for_each</tt>, this version offers no guarantee on
+ *  order of execution.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param n The size of the input sequence.
+ *  \param f The function object to apply to the range <tt>[first, first + n)</tt>.
+ *  \return <tt>first + n</tt>
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam Size is an integral type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *          and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ *  The following code snippet demonstrates how to use \p for_each_n to print the elements
+ *  of a \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <stdio.h>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      // note that using printf in a __device__ function requires
+ *      // code compiled for a GPU with compute capability 2.0 or
+ *      // higher (nvcc --arch=sm_20)
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  thrust::device_vector<int> d_vec(3);
+ *  d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ *  thrust::for_each_n(d_vec.begin(), d_vec.size(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ *
+ *  \see for_each
+ *  \see http://www.sgi.com/tech/stl/for_each.html
+ */
+template<typename InputIterator,
+         typename Size,
+         typename UnaryFunction>
+InputIterator for_each_n(InputIterator first,
+                         Size n,
+                         UnaryFunction f);
+
+/*! \} // end modifying
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/for_each.inl>
+
diff --git a/thrust/thrust/functional.h b/thrust/thrust/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a62539d2c60a32939e3092e2e2e179169b832e6
--- /dev/null
+++ b/thrust/thrust/functional.h
@@ -0,0 +1,1719 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file functional.h
+ *  \brief Function objects and tools for manipulating them
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <functional>
+#include <thrust/detail/functional/placeholder.h>
+
+namespace thrust
+{
+
+/*! \addtogroup function_objects Function Objects
+ */
+
+template<typename Operation> struct unary_traits;
+
+template<typename Operation> struct binary_traits;
+
+/*! \addtogroup function_object_adaptors Function Object Adaptors
+ *  \ingroup function_objects
+ *  \{
+ */
+
+/*! \p unary_function is an empty base class: it contains no member functions
+ *  or member variables, but only type information. The only reason it exists
+ *  is to make it more convenient to define types that are models of the
+ *  concept Adaptable Unary Function. Specifically, any model of Adaptable
+ *  Unary Function must define nested \c typedefs. Those \c typedefs are
+ *  provided by the base class \p unary_function.
+ *
+ *  The following code snippet demonstrates how to construct an 
+ *  Adaptable Unary Function using \p unary_function.
+ *
+ *  \code
+ *  struct sine : public thrust::unary_function<float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) { return sinf(x); }
+ *  };
+ *  \endcode
+ *
+ *  \note Because C++11 language support makes the functionality of
+ *        \c unary_function obsolete, its use is optional if C++11 language
+ *        features are enabled.
+ *
+ *  \see http://www.sgi.com/tech/stl/unary_function.html
+ *  \see binary_function
+ */
+template<typename Argument,
+         typename Result>
+struct unary_function
+{
+  /*! \typedef argument_type
+   *  \brief The type of the function object's argument.
+   */
+  typedef Argument argument_type;
+
+  /*! \typedef result_type;
+   *  \brief The type of the function object's result.
+   */
+  typedef Result   result_type;
+}; // end unary_function
+
+/*! \p binary_function is an empty base class: it contains no member functions
+ *  or member variables, but only type information. The only reason it exists
+ *  is to make it more convenient to define types that are models of the
+ *  concept Adaptable Binary Function. Specifically, any model of Adaptable
+ *  Binary Function must define nested \c typedefs. Those \c typedefs are
+ *  provided by the base class \p binary_function.
+ *
+ *  The following code snippet demonstrates how to construct an 
+ *  Adaptable Binary Function using \p binary_function.
+ *
+ *  \code
+ *  struct exponentiate : public thrust::binary_function<float,float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x, float y) { return powf(x,y); }
+ *  };
+ *  \endcode
+ *
+ *  \note Because C++11 language support makes the functionality of
+ *        \c binary_function obsolete, its use is optional if C++11 language
+ *        features are enabled.
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_function.html
+ *  \see unary_function
+ */
+template<typename Argument1,
+         typename Argument2,
+         typename Result>
+struct binary_function
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef Argument1 first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef Argument2 second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef Result    result_type;
+}; // end binary_function
+
+/*! \}
+ */
+
+
+/*! \addtogroup predefined_function_objects Predefined Function Objects
+ *  \ingroup function_objects
+ */
+
+/*! \addtogroup arithmetic_operations Arithmetic Operations
+ *  \ingroup predefined_function_objects
+ *  \{
+ */
+
+#define THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                   \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T>                                                      \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T&& x) const                                     \
+      noexcept(noexcept(impl)) -> decltype(impl)                               \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                  \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T1, typename T2>                                        \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T1&& t1, T2&& t2) const                          \
+      noexcept(noexcept(impl)) -> decltype(impl)                               \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(func, op)                 \
+  THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(                                   \
+    func, THRUST_FWD(t1) op THRUST_FWD(t2))
+
+
+/*! \p plus is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>plus<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x+y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x+y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>plus</tt> to sum two
+ *  device_vectors of \c floats.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *  thrust::device_vector<float> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 75);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::plus<float>());
+ *  // V3 is now {76, 77, 78, ..., 1075}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/plus.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct plus
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs + rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs + rhs;
+  }
+}; // end plus
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(plus, +);
+
+/*! \p minus is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>minus<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x-y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x-y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>minus</tt> to subtract
+ *  a device_vector of \c floats from another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *  thrust::device_vector<float> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 75);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::minus<float>());
+ *  // V3 is now {-74, -73, -72, ..., 925}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/minus.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct minus
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs - rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs - rhs;
+  }
+}; // end minus
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(minus, -);
+
+/*! \p multiplies is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>multiplies<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x*y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x*y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>multiplies</tt> to multiply
+ *  two device_vectors of \c floats.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *  thrust::device_vector<float> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 75);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::multiplies<float>());
+ *  // V3 is now {75, 150, 225, ..., 75000}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/multiplies.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct multiplies
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs * rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs * rhs;
+  }
+}; // end multiplies
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(multiplies, *);
+
+/*! \p divides is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>divides<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x/y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x/y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>divides</tt> to divide
+ *  one device_vectors of \c floats by another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *  thrust::device_vector<float> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 75);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::divides<float>());
+ *  // V3 is now {1/75, 2/75, 3/75, ..., 1000/75}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/divides.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct divides
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs / rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs / rhs;
+  }
+}; // end divides
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(divides, /);
+
+/*! \p modulus is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>modulus<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x \% y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x \% y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>modulus</tt> to take
+ *  the modulus of one device_vectors of \c floats by another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *  thrust::device_vector<float> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 75);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::modulus<int>());
+ *  // V3 is now {1%75, 2%75, 3%75, ..., 1000%75}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/modulus.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct modulus
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs % rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs % rhs;
+  }
+}; // end modulus
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(modulus, %);
+
+/*! \p negate is a function object. Specifically, it is an Adaptable Unary Function.
+ *  If \c f is an object of class <tt>negate<T></tt>, and \c x is an object
+ *  of class \c T, then <tt>f(x)</tt> returns <tt>-x</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x is an object of type \p T, then <tt>-x</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>negate</tt> to negate
+ *  the elements of a device_vector of \c floats.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(),
+ *                    thrust::negate<float>());
+ *  // V2 is now {-1, -2, -3, ..., -1000}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/negate.html
+ *  \see unary_function
+ */
+template<typename T = void>
+struct negate
+{
+  /*! \typedef argument_type
+   *  \brief The type of the function object's argument.
+   */
+  typedef T argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>-x</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return -x;
+  }
+}; // end negate
+
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(negate, -THRUST_FWD(x));
+
+/*! \p square is a function object. Specifically, it is an Adaptable Unary Function.
+ *  If \c f is an object of class <tt>square<T></tt>, and \c x is an object
+ *  of class \c T, then <tt>f(x)</tt> returns <tt>x*x</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x is an object of type \p T, then <tt>x*x</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>square</tt> to square
+ *  the elements of a device_vector of \c floats.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(),
+ *                    thrust::square<float>());
+ *  // V2 is now {1, 4, 9, ..., 1000000}
+ *  \endcode
+ *
+ *  \see unary_function
+ */
+template<typename T = void>
+struct square
+{
+  /*! \typedef argument_type
+   *  \brief The type of the function object's argument.
+   */
+  typedef T argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>x*x</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return x*x;
+  }
+}; // end square
+
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(square, x*x);
+
+/*! \}
+ */
+
+/*! \addtogroup comparison_operations Comparison Operations
+ *  \ingroup predefined_function_objects
+ *  \{
+ */
+
+/*! \p equal_to is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>equal_to<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x == y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/equal_to.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct equal_to
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs == rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs == rhs;
+  }
+}; // end equal_to
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(equal_to, ==);
+
+/*! \p not_equal_to is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>not_equal_to<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x != y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/not_equal_to.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct not_equal_to
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs != rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs != rhs;
+  }
+}; // end not_equal_to
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(not_equal_to, !=);
+
+/*! \p greater is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>greater<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x > y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/greater.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct greater
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs > rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs > rhs;
+  }
+}; // end greater
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater, >);
+
+/*! \p less is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>less<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x < y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/less.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct less
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs < rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs;
+  }
+}; // end less
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less, <);
+
+/*! \p greater_equal is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>greater_equal<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x >= y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/greater_equal.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct greater_equal
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs >= rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs >= rhs;
+  }
+}; // end greater_equal
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater_equal, >=);
+
+/*! \p less_equal is a function object. Specifically, it is an Adaptable Binary
+ *  Predicate, which means it is a function object that tests the truth or falsehood
+ *  of some condition. If \c f is an object of class <tt>less_equal<T></tt> and \c x
+ *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
+ *  <tt>x <= y</tt> and \c false otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  \see http://www.sgi.com/tech/stl/less_equal.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct less_equal
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs <= rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs <= rhs;
+  }
+}; // end less_equal
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less_equal, <=);
+
+/*! \}
+ */
+
+
+/*! \addtogroup logical_operations Logical Operations
+ *  \ingroup predefined_function_objects
+ *  \{
+ */
+
+/*! \p logical_and is a function object. Specifically, it is an Adaptable Binary Predicate,
+ *  which means it is a function object that tests the truth or falsehood of some condition.
+ *  If \c f is an object of class <tt>logical_and<T></tt> and \c x and \c y are objects of
+ *  class \c T (where \c T is convertible to \c bool) then <tt>f(x,y)</tt> returns \c true
+ *  if and only if both \c x and \c y are \c true.
+ *
+ *  \tparam T must be convertible to \c bool.
+ *
+ *  \see http://www.sgi.com/tech/stl/logical_and.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct logical_and
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs && rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs && rhs;
+  }
+}; // end logical_and
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_and, &&);
+
+/*! \p logical_or is a function object. Specifically, it is an Adaptable Binary Predicate,
+ *  which means it is a function object that tests the truth or falsehood of some condition.
+ *  If \c f is an object of class <tt>logical_or<T></tt> and \c x and \c y are objects of
+ *  class \c T (where \c T is convertible to \c bool) then <tt>f(x,y)</tt> returns \c true
+ *  if and only if either \c x or \c y are \c true.
+ *
+ *  \tparam T must be convertible to \c bool.
+ *
+ *  \see http://www.sgi.com/tech/stl/logical_or.html
+ *  \see binary_function
+ */
+template<typename T = void>
+struct logical_or
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>lhs || rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs || rhs;
+  }
+}; // end logical_or
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_or, ||);
+
+/*! \p logical_not is a function object. Specifically, it is an Adaptable Predicate,
+ *  which means it is a function object that tests the truth or falsehood of some condition.
+ *  If \c f is an object of class <tt>logical_not<T></tt> and \c x is an object of
+ *  class \c T (where \c T is convertible to \c bool) then <tt>f(x)</tt> returns \c true
+ *  if and only if \c x is \c false.
+ *
+ *  \tparam T must be convertible to \c bool.
+ *
+ *  The following code snippet demonstrates how to use \p logical_not to transform
+ *  a device_vector of \c bools into its logical complement.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  thrust::device_vector<bool> V;
+ *  ...
+ *  thrust::transform(V.begin(), V.end(), V.begin(), thrust::logical_not<bool>());
+ *  // The elements of V are now the logical complement of what they were prior
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/logical_not.html
+ *  \see unary_function
+ */
+template<typename T = void>
+struct logical_not
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef bool result_type;
+
+  /*! Function call operator. The return value is <tt>!x</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &x) const
+  {
+    return !x;
+  }
+}; // end logical_not
+
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(logical_not, !THRUST_FWD(x));
+
+/*! \}
+ */
+
+/*! \addtogroup bitwise_operations Bitwise Operations
+ *  \ingroup predefined_function_objects
+ *  \{
+ */
+
+/*! \p bit_and is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x&y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x&y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>bit_and</tt> to take
+ *  the bitwise AND of one device_vector of \c ints by another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<int> V1(N);
+ *  thrust::device_vector<int> V2(N);
+ *  thrust::device_vector<int> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 13);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::bit_and<int>());
+ *  // V3 is now {1&13, 2&13, 3&13, ..., 1000%13}
+ *  \endcode
+ *
+ *  \see binary_function
+ */
+template<typename T = void>
+struct bit_and
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs & rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs & rhs;
+  }
+}; // end bit_and
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_and, &);
+
+/*! \p bit_or is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x|y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x|y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>bit_or</tt> to take
+ *  the bitwise OR of one device_vector of \c ints by another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<int> V1(N);
+ *  thrust::device_vector<int> V2(N);
+ *  thrust::device_vector<int> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 13);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::bit_or<int>());
+ *  // V3 is now {1|13, 2|13, 3|13, ..., 1000|13}
+ *  \endcode
+ *
+ *  \see binary_function
+ */
+template<typename T = void>
+struct bit_or
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs | rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs | rhs;
+  }
+}; // end bit_or
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_or, |);
+
+/*! \p bit_xor is a function object. Specifically, it is an Adaptable Binary Function.
+ *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
+ *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x^y</tt>.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x and \c y are objects of type \p T, then <tt>x^y</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>bit_xor</tt> to take
+ *  the bitwise XOR of one device_vector of \c ints by another.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/fill.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<int> V1(N);
+ *  thrust::device_vector<int> V2(N);
+ *  thrust::device_vector<int> V3(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *  thrust::fill(V2.begin(), V2.end(), 13);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
+ *                    thrust::bit_xor<int>());
+ *  // V3 is now {1^13, 2^13, 3^13, ..., 1000^13}
+ *  \endcode
+ *
+ *  \see binary_function
+ */
+template<typename T = void>
+struct bit_xor
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs ^ rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs ^ rhs;
+  }
+}; // end bit_xor
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_xor, ^);
+
+/*! \}
+ */
+
+/*! \addtogroup generalized_identity_operations Generalized Identity Operations
+ *  \ingroup predefined_function_objects
+ *  \{
+ */
+
+/*! \p identity is a Unary Function that represents the identity function: it takes
+ *  a single argument \c x, and returns \c x.
+ *
+ *  \tparam T No requirements on \p T.
+ *
+ *  The following code snippet demonstrates that \p identity returns its
+ *  argument.
+ *
+ *  \code
+ *  #include <thrust/functional.h>
+ *  #include <assert.h>
+ *  ...
+ *  int x = 137;
+ *  thrust::identity<int> id;
+ *  assert(x == id(x));
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/identity.html
+ *  \see unary_function
+ */
+template<typename T = void>
+struct identity
+{
+  /*! \typedef argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>x</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator()(const T &x) const
+  {
+    return x;
+  }
+}; // end identity
+
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(identity, THRUST_FWD(x));
+
+/*! \p maximum is a function object that takes two arguments and returns the greater
+ *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
+ *  object of class <tt>maximum<T></tt> and \c x and \c y are objects of class \c T
+ *  <tt>f(x,y)</tt> returns \c x if <tt>x > y</tt> and \c y, otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates that \p maximum returns its
+ *  greater argument.
+ *
+ *  \code
+ *  #include <thrust/functional.h>
+ *  #include <assert.h>
+ *  ...
+ *  int x =  137;
+ *  int y = -137;
+ *  thrust::maximum<int> mx;
+ *  assert(x == mx(x,y));
+ *  \endcode
+ *
+ *  \see minimum
+ *  \see min
+ *  \see binary_function
+ */
+template<typename T = void>
+struct maximum
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>rhs < lhs ? lhs : rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? rhs : lhs;
+  }
+}; // end maximum
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(maximum,
+                                          t1 < t2 ? THRUST_FWD(t2)
+                                                  : THRUST_FWD(t1));
+
+/*! \p minimum is a function object that takes two arguments and returns the lesser
+ *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
+ *  object of class <tt>minimum<T></tt> and \c x and \c y are objects of class \c T
+ *  <tt>f(x,y)</tt> returns \c x if <tt>x < y</tt> and \c y, otherwise.
+ *
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates that \p minimum returns its
+ *  lesser argument.
+ *
+ *  \code
+ *  #include <thrust/functional.h>
+ *  #include <assert.h>
+ *  ...
+ *  int x =  137;
+ *  int y = -137;
+ *  thrust::minimum<int> mn;
+ *  assert(y == mn(x,y));
+ *  \endcode
+ *
+ *  \see maximum
+ *  \see max
+ *  \see binary_function
+ */
+template<typename T = void>
+struct minimum
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>lhs < rhs ? lhs : rhs</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? lhs : rhs;
+  }
+}; // end minimum
+
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(minimum,
+                                          t1 < t2 ? THRUST_FWD(t1)
+                                                  : THRUST_FWD(t2));
+
+/*! \p project1st is a function object that takes two arguments and returns
+ *  its first argument; the second argument is unused. It is essentially a
+ *  generalization of identity to the case of a Binary Function.
+ *
+ *  \code
+ *  #include <thrust/functional.h>
+ *  #include <assert.h>
+ *  ...
+ *  int x =  137;
+ *  int y = -137;
+ *  thrust::project1st<int> pj1;
+ *  assert(x == pj1(x,y));
+ *  \endcode
+ *
+ *  \see identity
+ *  \see project2nd
+ *  \see binary_function
+ */
+template<typename T1 = void, typename T2 = void>
+struct project1st
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T1 first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T2 second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T1 result_type;
+
+  /*! Function call operator. The return value is <tt>lhs</tt>.
+   */
+  __host__ __device__
+  constexpr const T1 &operator()(const T1 &lhs, const T2 & /*rhs*/) const
+  {
+    return lhs;
+  }
+}; // end project1st
+
+template <>
+struct project1st<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&&) const
+    noexcept(noexcept(THRUST_FWD(t1))) -> decltype(THRUST_FWD(t1))
+  {
+    return THRUST_FWD(t1);
+  }
+};
+
+/*! \p project2nd is a function object that takes two arguments and returns
+ *  its second argument; the first argument is unused. It is essentially a
+ *  generalization of identity to the case of a Binary Function.
+ *
+ *  \code
+ *  #include <thrust/functional.h>
+ *  #include <assert.h>
+ *  ...
+ *  int x =  137;
+ *  int y = -137;
+ *  thrust::project2nd<int> pj2;
+ *  assert(y == pj2(x,y));
+ *  \endcode
+ *
+ *  \see identity
+ *  \see project1st
+ *  \see binary_function
+ */
+template<typename T1 = void, typename T2 = void>
+struct project2nd
+{
+  /*! \typedef first_argument_type
+   *  \brief The type of the function object's first argument.
+   */
+  typedef T1 first_argument_type;
+
+  /*! \typedef second_argument_type
+   *  \brief The type of the function object's second argument.
+   */
+  typedef T2 second_argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T2 result_type;
+
+  /*! Function call operator. The return value is <tt>rhs</tt>.
+   */
+  __host__ __device__
+  constexpr const T2 &operator()(const T1 &/*lhs*/, const T2 &rhs) const
+  {
+    return rhs;
+  }
+}; // end project2nd
+
+template <>
+struct project2nd<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&&, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t2))) -> decltype(THRUST_FWD(t2))
+  {
+    return THRUST_FWD(t2);
+  }
+};
+
+/*! \}
+ */
+
+// odds and ends
+
+/*! \addtogroup function_object_adaptors
+ *  \{
+ */
+
+/*! \p unary_negate is a function object adaptor: it is an Adaptable Predicate
+ *  that represents the logical negation of some other Adaptable Predicate.
+ *  That is: if \c f is an object of class <tt>unary_negate<AdaptablePredicate></tt>,
+ *  then there exists an object \c pred of class \c AdaptablePredicate such
+ *  that <tt>f(x)</tt> always returns the same value as <tt>!pred(x)</tt>.
+ *  There is rarely any reason to construct a <tt>unary_negate</tt> directly;
+ *  it is almost always easier to use the helper function not1.
+ *
+ *  \see http://www.sgi.com/tech/stl/unary_negate.html
+ *  \see not1
+ */
+template<typename Predicate>
+struct unary_negate 
+    : public thrust::unary_function<typename Predicate::argument_type, bool>
+{
+  /*! Constructor takes a \p Predicate object to negate.
+   *  \param p The \p Predicate object to negate.
+   */
+  __host__ __device__
+  explicit unary_negate(Predicate p) : pred(p){}
+
+  /*! Function call operator. The return value is <tt>!pred(x)</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  bool operator()(const typename Predicate::argument_type& x) { return !pred(x); }
+
+  /*! \cond
+   */
+  Predicate pred;
+  /*! \endcond
+   */
+}; // end unary_negate
+
+/*! \p not1 is a helper function to simplify the creation of Adaptable Predicates:
+ *  it takes an Adaptable Predicate \p pred as an argument and returns a new Adaptable
+ *  Predicate that represents the negation of \p pred. That is: if \c pred is an object
+ *  of a type which models Adaptable Predicate, then the the type of the result
+ *  \c npred of <tt>not1(pred)</tt> is also a model of Adaptable Predicate and
+ *  <tt>npred(x)</tt> always returns the same value as <tt>!pred(x)</tt>.
+ *
+ *  \param pred The Adaptable Predicate to negate.
+ *  \return A new object, <tt>npred</tt> such that <tt>npred(x)</tt> always returns
+ *          the same value as <tt>!pred(x)</tt>.
+ *
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptablePredicate.html">Adaptable Predicate</a>.
+ *
+ *  \see unary_negate
+ *  \see not2
+ */
+template<typename Predicate>
+  __host__ __device__
+  unary_negate<Predicate> not1(const Predicate &pred);
+
+/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary 
+ *  Predicate that represents the logical negation of some other Adaptable
+ *  Binary Predicate. That is: if \c f is an object of class <tt>binary_negate<AdaptablePredicate></tt>,
+ *  then there exists an object \c pred of class \c AdaptableBinaryPredicate
+ *  such that <tt>f(x,y)</tt> always returns the same value as <tt>!pred(x,y)</tt>.
+ *  There is rarely any reason to construct a <tt>binary_negate</tt> directly;
+ *  it is almost always easier to use the helper function not2.
+ *
+ *  \see http://www.sgi.com/tech/stl/binary_negate.html
+ */
+template<typename Predicate>
+struct binary_negate
+    : public thrust::binary_function<typename Predicate::first_argument_type,
+                                     typename Predicate::second_argument_type,
+                                     bool>
+{
+  /*! Constructor takes a \p Predicate object to negate.
+   *  \param p The \p Predicate object to negate.
+   */
+  __host__ __device__
+  explicit binary_negate(Predicate p) : pred(p){}
+
+  /*! Function call operator. The return value is <tt>!pred(x,y)</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  bool operator()(const typename Predicate::first_argument_type& x, const typename Predicate::second_argument_type& y)
+  { 
+      return !pred(x,y); 
+  }
+
+  /*! \cond
+   */
+  Predicate pred;
+  /*! \endcond
+   */
+}; // end binary_negate
+
+/*! \p not2 is a helper function to simplify the creation of Adaptable Binary Predicates:
+ *  it takes an Adaptable Binary Predicate \p pred as an argument and returns a new Adaptable
+ *  Binary Predicate that represents the negation of \p pred. That is: if \c pred is an object
+ *  of a type which models Adaptable Binary Predicate, then the the type of the result
+ *  \c npred of <tt>not2(pred)</tt> is also a model of Adaptable Binary Predicate and
+ *  <tt>npred(x,y)</tt> always returns the same value as <tt>!pred(x,y)</tt>.
+ *
+ *  \param pred The Adaptable Binary Predicate to negate.
+ *  \return A new object, <tt>npred</tt> such that <tt>npred(x,y)</tt> always returns
+ *          the same value as <tt>!pred(x,y)</tt>.
+ *
+ *  \tparam Binary Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptableBinaryPredicate.html">Adaptable Binary Predicate</a>.
+ *
+ *  \see binary_negate
+ *  \see not1
+ */
+template<typename BinaryPredicate>
+  __host__ __device__
+  binary_negate<BinaryPredicate> not2(const BinaryPredicate &pred);
+
+/*! \}
+ */
+
+
+/*! \addtogroup placeholder_objects Placeholder Objects
+ *  \ingroup function_objects
+ *  \{
+ */
+
+
+/*! \namespace thrust::placeholders
+ *  \brief Facilities for constructing simple functions inline.
+ *
+ *  Objects in the \p thrust::placeholders namespace may be used to create simple arithmetic functions inline
+ *  in an algorithm invocation. Combining placeholders such as \p _1 and \p _2 with arithmetic operations such as \c +
+ *  creates an unnamed function object which applies the operation to their arguments.
+ *
+ *  The type of placeholder objects is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use the placeholders \p _1 and \p _2 with \p thrust::transform
+ *  to implement the SAXPY computation:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<float> x(4), y(4);
+ *    x[0] = 1;
+ *    x[1] = 2;
+ *    x[2] = 3;
+ *    x[3] = 4;
+ *
+ *    y[0] = 1;
+ *    y[1] = 1;
+ *    y[2] = 1;
+ *    y[3] = 1;
+ *
+ *    float a = 2.0f;
+ *
+ *    using namespace thrust::placeholders;
+ *
+ *    thrust::transform(x.begin(), x.end(), y.begin(), y.begin(),
+ *      a * _1 + _2
+ *    );
+ *
+ *    // y is now {3, 5, 7, 9}
+ *  }
+ *  \endcode
+ */
+namespace placeholders
+{
+
+
+/*! \p thrust::placeholders::_1 is the placeholder for the first function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<0>::type _1;
+
+
+/*! \p thrust::placeholders::_2 is the placeholder for the second function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<1>::type _2;
+
+
+/*! \p thrust::placeholders::_3 is the placeholder for the third function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<2>::type _3;
+
+
+/*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<3>::type _4;
+
+
+/*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<4>::type _5;
+
+
+/*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<5>::type _6;
+
+
+/*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<6>::type _7;
+
+
+/*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<7>::type _8;
+
+
+/*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<8>::type _9;
+
+
+/*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter.
+ */
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<9>::type _10;
+
+
+} // end placeholders
+
+
+/*! \} // placeholder_objects
+ */
+
+#undef THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP
+
+} // end thrust
+
+#include <thrust/detail/functional.inl>
+#include <thrust/detail/functional/operators.h>
+
diff --git a/thrust/thrust/future.h b/thrust/thrust/future.h
new file mode 100644
index 0000000000000000000000000000000000000000..12bebf8c6e041484b43d5a97759cccd730fc82f3
--- /dev/null
+++ b/thrust/thrust/future.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/future.h
+ *  \brief `thrust::future`, an asynchronous value type.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/execution_policy.h>
+#include <thrust/detail/static_assert.h>
+
+#include <utility>
+
+/*
+// #include the host system's pointer.h header.
+#define __THRUST_HOST_SYSTEM_POINTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/pointer.h>
+  #include __THRUST_HOST_SYSTEM_POINTER_HEADER
+#undef __THRUST_HOST_SYSTEM_POINTER_HEADER
+*/
+
+// #include the device system's pointer.h header.
+#define __THRUST_DEVICE_SYSTEM_POINTER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/pointer.h>
+  #include __THRUST_DEVICE_SYSTEM_POINTER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_POINTER_HEADER
+
+/*
+// #include the host system's future.h header.
+#define __THRUST_HOST_SYSTEM_FUTURE_HEADER <__THRUST_HOST_SYSTEM_ROOT/future.h>
+  #include __THRUST_HOST_SYSTEM_FUTURE_HEADER
+#undef __THRUST_HOST_SYSTEM_FUTURE_HEADER
+*/
+
+// #include the device system's future.h header.
+#define __THRUST_DEVICE_SYSTEM_FUTURE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/future.h>
+  #include __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
+
+namespace thrust
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+// `select_unique_(future|event)_type` is a hook for choosing the
+// `unique_eager_event`/`unique_eager_future` type for a system. `decltype` is
+// used to determine the return type of an ADL call to
+// `select_unique_eager_(future|event)_type(system)`; that return type should
+// be the correct event/future type for `system`. Overloads should only be
+// declared, not defined.
+
+namespace unimplemented
+{
+
+struct no_unique_eager_event_type_found {};
+
+inline __host__ 
+no_unique_eager_event_type_found
+unique_eager_event_type(...) noexcept;
+
+struct no_unique_eager_future_type_found {};
+
+template <typename T>
+__host__ 
+no_unique_eager_future_type_found
+unique_eager_future_type(...) noexcept;
+
+} // namespace unimplemented
+
+namespace unique_eager_event_type_detail
+{
+
+using unimplemented::unique_eager_event_type;
+
+template <typename System>
+using select = decltype(
+  unique_eager_event_type(std::declval<System>())
+);
+
+} // namespace unique_eager_event_type_detail
+
+namespace unique_eager_future_type_detail
+{
+
+using unimplemented::unique_eager_future_type;
+
+template <typename System, typename T>
+using select = decltype(
+  unique_eager_future_type<T>(std::declval<System>())
+);
+
+} // namespace unique_eager_future_type_detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename System>
+using unique_eager_event = unique_eager_event_type_detail::select<System>;
+
+template <typename System>
+using event = unique_eager_event<System>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename System, typename T>
+using unique_eager_future = unique_eager_future_type_detail::select<System, T>;
+
+template <typename System, typename T>
+using future = unique_eager_future<System, T>;
+
+/*
+///////////////////////////////////////////////////////////////////////////////
+
+using host_unique_eager_event = unique_eager_event_type_detail::select<
+  thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag
+>;
+using host_event = host_unique_eager_event;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+using host_unique_eager_future = unique_eager_future_type_detail::select<
+  thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag, T
+>;
+template <typename T>
+using host_future = host_unique_eager_future<T>;
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+
+using device_unique_eager_event = unique_eager_event_type_detail::select<
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag
+>;
+
+using device_event = device_unique_eager_event;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+using device_unique_eager_future = unique_eager_future_type_detail::select<
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag, T
+>;
+
+template <typename T>
+using device_future = device_unique_eager_future<T>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct new_stream_t final {};
+
+THRUST_INLINE_CONSTANT new_stream_t new_stream{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::when_all;
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace thrust
+
+#endif
+
diff --git a/thrust/thrust/gather.h b/thrust/thrust/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..276650a6cf77511d99aede80bd668cee818bd495
--- /dev/null
+++ b/thrust/thrust/gather.h
@@ -0,0 +1,441 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file gather.h
+ *  \brief Irregular copying from a source range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup gathering
+ *  \ingroup copying
+ *  \{
+ */
+
+
+/*! \p gather copies elements from a source array into a destination range according 
+ *  to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>, the
+ *  value <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather is the inverse of thrust::scatter.
+ *
+ *  The following code snippet demonstrates how to use \p gather to reorder
+ *  a range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  // mark even indices with a 1; odd indices with a 0
+ *  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // gather all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10);
+ *  thrust::gather(thrust::device,
+ *                 d_map.begin(), d_map.end(),
+ *                 d_values.begin(),
+ *                 d_output.begin());
+ *  // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  \endcode
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                        InputIterator                                               map_first,
+                        InputIterator                                               map_last,
+                        RandomAccessIterator                                        input_first,
+                        OutputIterator                                              result);
+
+
+/*! \p gather copies elements from a source array into a destination range according 
+ *  to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>, the
+ *  value <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *
+ *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather is the inverse of thrust::scatter.
+ *
+ *  The following code snippet demonstrates how to use \p gather to reorder
+ *  a range.
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  // mark even indices with a 1; odd indices with a 0
+ *  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // gather all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10);
+ *  thrust::gather(d_map.begin(), d_map.end(),
+ *                 d_values.begin(),
+ *                 d_output.begin());
+ *  // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  \endcode
+ */
+template<typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+  OutputIterator gather(InputIterator        map_first,
+                        InputIterator        map_last,
+                        RandomAccessIterator input_first,
+                        OutputIterator       result);
+
+
+/*! \p gather_if conditionally copies elements from a source array into a destination 
+ *  range according to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>,
+ *  such that the value of <tt>\*(stencil + (i - map_first))</tt> is \c true, the value
+ *  <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param stencil Beginning of the range of predicate values.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather_if is the inverse of \p scatter_if.
+ *
+ *  The following code snippet demonstrates how to use \p gather_if to gather selected values from
+ *  an input range using the \p thrust::device execution policy:
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *
+ *  int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // select elements at even-indexed locations
+ *  int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_stencil(stencil, stencil + 10);
+ *
+ *  // map all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10, 7);
+ *  thrust::gather_if(thrust::device,
+ *                    d_map.begin(), d_map.end(),
+ *                    d_stencil.begin(),
+ *                    d_values.begin(),
+ *                    d_output.begin());
+ *  // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
+ *  \endcode
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              map_first,
+                           InputIterator1                                              map_last,
+                           InputIterator2                                              stencil,
+                           RandomAccessIterator                                        input_first,
+                           OutputIterator                                              result);
+
+
+/*! \p gather_if conditionally copies elements from a source array into a destination 
+ *  range according to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>,
+ *  such that the value of <tt>\*(stencil + (i - map_first))</tt> is \c true, the value
+ *  <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param stencil Beginning of the range of predicate values.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather_if is the inverse of \p scatter_if.
+ *
+ *  The following code snippet demonstrates how to use \p gather_if to gather selected values from
+ *  an input range.
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *
+ *  int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // select elements at even-indexed locations
+ *  int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_stencil(stencil, stencil + 10);
+ *
+ *  // map all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10, 7);
+ *  thrust::gather_if(d_map.begin(), d_map.end(),
+ *                    d_stencil.begin(),
+ *                    d_values.begin(),
+ *                    d_output.begin());
+ *  // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
+ *  \endcode
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+  OutputIterator gather_if(InputIterator1       map_first,
+                           InputIterator1       map_last,
+                           InputIterator2       stencil,
+                           RandomAccessIterator input_first,
+                           OutputIterator       result);
+
+
+/*! \p gather_if conditionally copies elements from a source array into a destination 
+ *  range according to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>
+ *  such that the value of <tt>pred(\*(stencil + (i - map_first)))</tt> is \c true,
+ *  the value <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param stencil Beginning of the range of predicate values.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *  \param pred Predicate to apply to the stencil values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather_if is the inverse of \p scatter_if.
+ *
+ *  The following code snippet demonstrates how to use \p gather_if to gather selected values from
+ *  an input range based on an arbitrary selection function using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *
+ *  int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // we will select an element when our stencil is even
+ *  int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9};
+ *  thrust::device_vector<int> d_stencil(stencil, stencil + 10);
+ *
+ *  // map all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10, 7);
+ *  thrust::gather_if(thrust::device,
+ *                    d_map.begin(), d_map.end(),
+ *                    d_stencil.begin(),
+ *                    d_values.begin(),
+ *                    d_output.begin(),
+ *                    is_even());
+ *  // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
+ *  \endcode
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator gather_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              map_first,
+                           InputIterator1                                              map_last,
+                           InputIterator2                                              stencil,
+                           RandomAccessIterator                                        input_first,
+                           OutputIterator                                              result,
+                           Predicate                                                   pred);
+
+
+/*! \p gather_if conditionally copies elements from a source array into a destination 
+ *  range according to a map. For each input iterator \c i in the range <tt>[map_first, map_last)</tt>
+ *  such that the value of <tt>pred(\*(stencil + (i - map_first)))</tt> is \c true,
+ *  the value <tt>input_first[\*i]</tt> is assigned to <tt>*(result + (i - map_first))</tt>.
+ *  \p RandomAccessIterator must permit random access.
+ *
+ *  \param map_first Beginning of the range of gather locations.
+ *  \param map_last End of the range of gather locations.
+ *  \param stencil Beginning of the range of predicate values.
+ *  \param input_first Beginning of the source range.
+ *  \param result Beginning of the destination range.
+ *  \param pred Predicate to apply to the stencil values.
+ *
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *
+ *  \remark \p gather_if is the inverse of \p scatter_if.
+ *
+ *  The following code snippet demonstrates how to use \p gather_if to gather selected values from
+ *  an input range based on an arbitrary selection function.
+ *
+ *  \code
+ *  #include <thrust/gather.h>
+ *  #include <thrust/device_vector.h>
+ *  
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *
+ *  int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // we will select an element when our stencil is even
+ *  int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9};
+ *  thrust::device_vector<int> d_stencil(stencil, stencil + 10);
+ *
+ *  // map all even indices into the first half of the range
+ *  // and odd indices to the last half of the range
+ *  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10, 7);
+ *  thrust::gather_if(d_map.begin(), d_map.end(),
+ *                    d_stencil.begin(),
+ *                    d_values.begin(),
+ *                    d_output.begin(),
+ *                    is_even());
+ *  // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
+ *  \endcode
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator gather_if(InputIterator1       map_first,
+                           InputIterator1       map_last,
+                           InputIterator2       stencil,
+                           RandomAccessIterator input_first,
+                           OutputIterator       result,
+                           Predicate            pred);
+
+/*! \} // gathering
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/gather.inl>
+
diff --git a/thrust/thrust/generate.h b/thrust/thrust/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..a651dd0dccee089f4b31df03000e724fdab13648
--- /dev/null
+++ b/thrust/thrust/generate.h
@@ -0,0 +1,213 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file generate.h
+ *  \brief Fills a range with values "generated" from a function of no arguments
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ *  \{
+ */
+
+
+/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments,
+ *  to each element in the range <tt>[first,last)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element in the range of interest.
+ *  \param last The last element in the range of interest.
+ *  \param gen A function argument, taking no parameters, used to generate values to assign to
+ *             elements in the range <tt>[first,last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
+ *  using the standard C library function \c rand using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/generate.h>
+ *  #include <thrust/host_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdlib>
+ *  ...
+ *  thrust::host_vector<int> v(10);
+ *  srand(13);
+ *  thrust::generate(thrust::host, v.begin(), v.end(), rand);
+ *
+ *  // the elements of v are now pseudo-random numbers
+ *  \endcode
+ *
+ *  \see generate_n
+ *  \see http://www.sgi.com/tech/stl/generate.html
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Generator>
+__host__ __device__
+  void generate(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                Generator gen);
+
+
+/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments,
+ *  to each element in the range <tt>[first,last)</tt>.
+ *
+ *  \param first The first element in the range of interest.
+ *  \param last The last element in the range of interest.
+ *  \param gen A function argument, taking no parameters, used to generate values to assign to
+ *             elements in the range <tt>[first,last)</tt>.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
+ *  using the standard C library function \c rand.
+ *
+ *  \code
+ *  #include <thrust/generate.h>
+ *  #include <thrust/host_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdlib>
+ *  ...
+ *  thrust::host_vector<int> v(10);
+ *  srand(13);
+ *  thrust::generate(v.begin(), v.end(), rand);
+ *
+ *  // the elements of v are now pseudo-random numbers
+ *  \endcode
+ *
+ *  \see generate_n
+ *  \see http://www.sgi.com/tech/stl/generate.html
+ */
+template<typename ForwardIterator,
+         typename Generator>
+  void generate(ForwardIterator first,
+                ForwardIterator last,
+                Generator gen);
+
+
+/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments,
+ *  to each element in the range <tt>[first,first + n)</tt>. The return value is <tt>first + n</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element in the range of interest.
+ *  \param n The size of the range of interest.
+ *  \param gen A function argument, taking no parameters, used to generate values to assign to
+ *             elements in the range <tt>[first,first + n)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Size is an integral type (either signed or unsigned).
+ *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
+ *  using the standard C library function \c rand using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/generate.h>
+ *  #include <thrust/host_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cstdlib>
+ *  ...
+ *  thrust::host_vector<int> v(10);
+ *  srand(13);
+ *  thrust::generate_n(thrust::host, v.begin(), 10, rand);
+ *
+ *  // the elements of v are now pseudo-random numbers
+ *  \endcode
+ *
+ *  \see generate
+ *  \see http://www.sgi.com/tech/stl/generate.html
+ */
+template<typename DerivedPolicy,
+         typename OutputIterator,
+         typename Size,
+         typename Generator>
+__host__ __device__
+  OutputIterator generate_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            OutputIterator first,
+                            Size n,
+                            Generator gen);
+
+
+/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments,
+ *  to each element in the range <tt>[first,first + n)</tt>. The return value is <tt>first + n</tt>.
+ *
+ *  \param first The first element in the range of interest.
+ *  \param n The size of the range of interest.
+ *  \param gen A function argument, taking no parameters, used to generate values to assign to
+ *             elements in the range <tt>[first,first + n)</tt>.
+ *
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Size is an integral type (either signed or unsigned).
+ *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
+ *  using the standard C library function \c rand.
+ *
+ *  \code
+ *  #include <thrust/generate.h>
+ *  #include <thrust/host_vector.h>
+ *  #include <stdlib.h>
+ *  ...
+ *  thrust::host_vector<int> v(10);
+ *  srand(13);
+ *  thrust::generate_n(v.begin(), 10, rand);
+ *
+ *  // the elements of v are now pseudo-random numbers
+ *  \endcode
+ *
+ *  \see generate
+ *  \see http://www.sgi.com/tech/stl/generate.html
+ */
+template<typename OutputIterator,
+         typename Size,
+         typename Generator>
+  OutputIterator generate_n(OutputIterator first,
+                            Size n,
+                            Generator gen);
+
+
+/*! \} // end transformations
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/generate.inl>
+
diff --git a/thrust/thrust/host_vector.h b/thrust/thrust/host_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebe64216e284b8ba1e19cfa6950df3ab1d58f331
--- /dev/null
+++ b/thrust/thrust/host_vector.h
@@ -0,0 +1,514 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file host_vector.h
+ *  \brief A dynamically-sizable array of elements which reside in the "host" memory space
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/memory_wrapper.h>
+#include <thrust/detail/vector_base.h>
+#include <vector>
+#include <utility>
+
+namespace thrust
+{
+
+// forward declaration of device_vector
+template<typename T, typename Alloc> class device_vector;
+
+/*! \addtogroup container_classes Container Classes
+ *  \addtogroup host_containers Host Containers
+ *  \ingroup container_classes
+ *  \{
+ */
+
+/*! A \p host_vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p host_vector may vary dynamically; memory management is
+ *  automatic. The memory associated with a \p host_vector resides in the memory
+ *  space of the host associated with a parallel device.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see device_vector
+ */
+template<typename T, typename Alloc = std::allocator<T> >
+  class host_vector
+    : public detail::vector_base<T,Alloc>
+{
+  private:
+    typedef detail::vector_base<T,Alloc> Parent;
+
+  public:
+    /*! \cond
+     */
+    typedef typename Parent::size_type  size_type;
+    typedef typename Parent::value_type value_type;
+    /*! \endcond
+     */
+
+    /*! This constructor creates an empty \p host_vector.
+     */
+    __host__
+    host_vector(void)
+      :Parent() {}
+
+    /*! This constructor creates an empty \p host_vector.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    host_vector(const Alloc &alloc)
+      :Parent(alloc) {}
+
+    /*! The destructor erases the elements.
+     */
+    //  Define an empty destructor to explicitly specify
+    //  its execution space qualifier, as a workaround for nvcc warning
+    __host__
+    ~host_vector(void) {}
+
+    /*! This constructor creates a \p host_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     */
+    __host__
+    explicit host_vector(size_type n)
+      :Parent(n) {}
+
+    /*! This constructor creates a \p host_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    explicit host_vector(size_type n, const Alloc &alloc)
+      :Parent(n,alloc) {}
+
+    /*! This constructor creates a \p host_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     */
+    __host__
+    explicit host_vector(size_type n, const value_type &value)
+      :Parent(n,value) {}
+
+    /*! This constructor creates a \p host_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    explicit host_vector(size_type n, const value_type &value, const Alloc &alloc)
+      :Parent(n,value,alloc) {}
+
+    /*! Copy constructor copies from an exemplar \p host_vector.
+     *  \param v The \p host_vector to copy.
+     */
+    __host__
+    host_vector(const host_vector &v)
+      :Parent(v) {}
+
+    /*! Copy constructor copies from an exemplar \p host_vector.
+     *  \param v The \p host_vector to copy.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    host_vector(const host_vector &v, const Alloc &alloc)
+      :Parent(v,alloc) {}
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another host_vector.
+     *  \param v The host_vector to move.
+     */
+     __host__
+    host_vector(host_vector &&v)
+      :Parent(std::move(v)) {}
+
+    /*! Move constructor moves from another host_vector.
+     *  \param v The host_vector to move.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+     __host__
+    host_vector(host_vector &&v, const Alloc &alloc)
+      :Parent(std::move(v),alloc) {}
+  #endif
+
+  /*! Assign operator copies from an exemplar \p host_vector.
+   *  \param v The \p host_vector to copy.
+   */
+  __host__
+  host_vector &operator=(const host_vector &v)
+  { Parent::operator=(v); return *this; }
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move assign operator moves from another host_vector.
+     *  \param v The host_vector to move.
+     */
+     __host__
+     host_vector &operator=(host_vector &&v)
+     { Parent::operator=(std::move(v)); return *this; }
+  #endif
+
+    /*! Copy constructor copies from an exemplar \p host_vector with different type.
+     *  \param v The \p host_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector(const host_vector<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
+
+    /*! Assign operator copies from an exemplar \p host_vector with different type.
+     *  \param v The \p host_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector &operator=(const host_vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this; }
+
+    /*! Copy constructor copies from an exemplar <tt>std::vector</tt>.
+     *  \param v The <tt>std::vector</tt> to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector(const std::vector<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
+
+    /*! Assign operator copies from an exemplar <tt>std::vector</tt>.
+     *  \param v The <tt>std::vector</tt> to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this;}
+
+    /*! Copy constructor copies from an exemplar \p device_vector with possibly different type.
+     *  \param v The \p device_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector(const device_vector<OtherT,OtherAlloc> &v);
+
+    /*! Assign operator copies from an exemplar \p device_vector.
+     *  \param v The \p device_vector to copy.
+     */
+    template<typename OtherT, typename OtherAlloc>
+    __host__
+    host_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
+    { Parent::operator=(v); return *this; }
+
+    /*! This constructor builds a \p host_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     */
+    template<typename InputIterator>
+    __host__
+    host_vector(InputIterator first, InputIterator last)
+      :Parent(first, last) {}
+
+    /*! This constructor builds a \p host_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    template<typename InputIterator>
+    __host__
+    host_vector(InputIterator first, InputIterator last, const Alloc &alloc)
+      :Parent(first, last, alloc) {}
+
+// declare these members for the purpose of Doxygenating them
+// they actually exist in a derived-from class
+#if 0
+    /*! \brief Resizes this vector to the specified number of elements.
+     *  \param new_size Number of elements this vector should contain.
+     *  \param x Data with which new elements should be populated.
+     *  \throw std::length_error If n exceeds max_size().
+     *
+     *  This method will resize this vector to the specified number of
+     *  elements.  If the number is smaller than this vector's current
+     *  size this vector is truncated, otherwise this vector is
+     *  extended and new elements are populated with given data.
+     */
+    void resize(size_type new_size, const value_type &x = value_type());
+
+    /*! Returns the number of elements in this vector.
+     */
+    size_type size(void) const;
+
+    /*! Returns the size() of the largest possible vector.
+     *  \return The largest possible return value of size().
+     */
+    size_type max_size(void) const;
+
+    /*! \brief If n is less than or equal to capacity(), this call has no effect.
+     *         Otherwise, this method is a request for allocation of additional memory. If
+     *         the request is successful, then capacity() is greater than or equal to
+     *         n; otherwise, capacity() is unchanged. In either case, size() is unchanged.
+     *  \throw std::length_error If n exceeds max_size().
+     */
+    void reserve(size_type n);
+
+    /*! Returns the number of elements which have been reserved in this
+     *  vector.
+     */
+    size_type capacity(void) const;
+
+    /*! This method shrinks the capacity of this vector to exactly
+     *  fit its elements.
+     */
+    void shrink_to_fit(void);
+
+    /*! \brief Subscript access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read/write reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    reference operator[](size_type n);
+
+    /*! \brief Subscript read access to the data contained in this vector_dev.
+     *  \param n The index of the element for which data should be accessed.
+     *  \return Read reference to data.
+     *
+     *  This operator allows for easy, array-style, data access.
+     *  Note that data access with this operator is unchecked and
+     *  out_of_range lookups are not defined.
+     */
+    const_reference operator[](size_type n) const;
+
+    /*! This method returns an iterator pointing to the beginning of
+     *  this vector.
+     *  \return mStart
+     */
+    iterator begin(void);
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector.
+     *  \return mStart
+     */
+    const_iterator begin(void) const;
+
+    /*! This method returns a const_iterator pointing to the beginning
+     *  of this vector.
+     *  \return mStart
+     */
+    const_iterator cbegin(void) const;
+
+    /*! This method returns a reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    reverse_iterator rbegin(void);
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    const_reverse_iterator rbegin(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to the beginning of
+     *  this vector's reversed sequence.
+     *  \return A const_reverse_iterator pointing to the beginning of this
+     *          vector's reversed sequence.
+     */
+    const_reverse_iterator crbegin(void) const;
+
+    /*! This method returns an iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    iterator end(void);
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    const_iterator end(void) const;
+
+    /*! This method returns a const_iterator pointing to one element past the
+     *  last of this vector.
+     *  \return begin() + size().
+     */
+    const_iterator cend(void) const;
+
+    /*! This method returns a reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    reverse_iterator rend(void);
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator rend(void) const;
+
+    /*! This method returns a const_reverse_iterator pointing to one element past the
+     *  last of this vector's reversed sequence.
+     *  \return rbegin() + size().
+     */
+    const_reverse_iterator crend(void) const;
+
+    /*! This method returns a const_reference referring to the first element of this
+     *  vector.
+     *  \return The first element of this vector.
+     */
+    const_reference front(void) const;
+
+    /*! This method returns a reference pointing to the first element of this
+     *  vector.
+     *  \return The first element of this vector.
+     */
+    reference front(void);
+
+    /*! This method returns a const reference pointing to the last element of
+     *  this vector.
+     *  \return The last element of this vector.
+     */
+    const_reference back(void) const;
+
+    /*! This method returns a reference referring to the last element of
+     *  this vector_dev.
+     *  \return The last element of this vector.
+     */
+    reference back(void);
+
+    /*! This method returns a pointer to this vector's first element.
+     *  \return A pointer to the first element of this vector.
+     */
+    pointer data(void);
+
+    /*! This method returns a const_pointer to this vector's first element.
+     *  \return a const_pointer to the first element of this vector.
+     */
+    const_pointer data(void) const;
+
+    /*! This method resizes this vector to 0.
+     */
+    void clear(void);
+
+    /*! This method returns true iff size() == 0.
+     *  \return true if size() == 0; false, otherwise.
+     */
+    bool empty(void) const;
+
+    /*! This method appends the given element to the end of this vector.
+     *  \param x The element to append.
+     */
+    void push_back(const value_type &x);
+
+    /*! This method erases the last element of this vector, invalidating
+     *  all iterators and references to it.
+     */
+    void pop_back(void);
+
+    /*! This method swaps the contents of this host_vector with another vector.
+     *  \param v The vector with which to swap.
+     */
+    void swap(host_vector &v);
+
+    /*! This method removes the element at position pos.
+     *  \param pos The position of the element of interest.
+     *  \return An iterator pointing to the new location of the element that followed the element
+     *          at position pos.
+     */
+    iterator erase(iterator pos);
+
+    /*! This method removes the range of elements [first,last) from this vector.
+     *  \param first The beginning of the range of elements to remove.
+     *  \param last The end of the range of elements to remove.
+     *  \return An iterator pointing to the new location of the element that followed the last
+     *          element in the sequence [first,last).
+     */
+    iterator erase(iterator first, iterator last);
+
+    /*! This method inserts a single copy of a given exemplar value at the
+     *  specified position in this vector.
+     *  \param position The insertion position.
+     *  \param x The exemplar element to copy & insert.
+     *  \return An iterator pointing to the newly inserted element.
+     */
+    iterator insert(iterator position, const T &x); 
+
+    /*! This method inserts a copy of an exemplar value to a range at the
+     *  specified position in this vector.
+     *  \param position The insertion position
+     *  \param n The number of insertions to perform.
+     *  \param x The value to replicate and insert.
+     */
+    void insert(iterator position, size_type n, const T &x);
+
+    /*! This method inserts a copy of an input range at the specified position
+     *  in this vector.
+     *  \param position The insertion position.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     */
+    template<typename InputIterator>
+    void insert(iterator position, InputIterator first, InputIterator last);
+
+    /*! This version of \p assign replicates a given exemplar
+     *  \p n times into this vector.
+     *  \param n The number of times to copy \p x.
+     *  \param x The exemplar element to replicate.
+     */
+    void assign(size_type n, const T &x);
+
+    /*! This version of \p assign makes this vector a copy of a given input range.
+     *  \param first The beginning of the range to copy.
+     *  \param last  The end of the range to copy.
+     *
+     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     */
+    template<typename InputIterator>
+    void assign(InputIterator first, InputIterator last);
+
+    /*! This method returns a copy of this vector's allocator.
+     *  \return A copy of the alloctor used by this vector.
+     */
+    allocator_type get_allocator(void) const;
+#endif // end doxygen-only members
+}; // end host_vector
+
+/*! Exchanges the values of two vectors.
+ *  \p x The first \p host_vector of interest.
+ *  \p y The second \p host_vector of interest.
+ */
+template<typename T, typename Alloc>
+  void swap(host_vector<T,Alloc> &a, host_vector<T,Alloc> &b)
+{
+  a.swap(b);
+} // end swap()
+
+/*! \}
+ */
+
+} // end thrust
+
+#include <thrust/detail/host_vector.inl>
+
diff --git a/thrust/thrust/inner_product.h b/thrust/thrust/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..0206eff38a4800282e3c585162fbeea1c6a350ca
--- /dev/null
+++ b/thrust/thrust/inner_product.h
@@ -0,0 +1,264 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file inner_product.h
+ *  \brief Mathematical inner product between ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup transformed_reductions Transformed Reductions
+ *  \ingroup reductions
+ *  \{
+ */
+
+
+/*! \p inner_product calculates an inner product of the ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, first2 + (last1 - first1))</tt>.
+ *
+ *  Specifically, this version of \p inner_product computes the sum
+ *  <tt>init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... </tt>
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1 The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param init Initial value of the result.
+ *  \return The inner product of sequences <tt>[first1, last1)</tt>
+ *          and <tt>[first2, last2)</tt> plus \p init.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
+ *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
+ *          and is convertible to \p OutputType.
+ *
+ *  The following code demonstrates how to use \p inner_product to
+ *  compute the dot product of two vectors using the \p thrust::host execution policy for parallelization.
+ *
+ *  \code
+ *  #include <thrust/inner_product.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  float vec1[3] = {1.0f, 2.0f, 5.0f};
+ *  float vec2[3] = {4.0f, 1.0f, 5.0f};
+ *
+ *  float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, 0.0f);
+ *
+ *  // result == 31.0f
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/inner_product.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputType>
+__host__ __device__
+OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init);
+
+
+/*! \p inner_product calculates an inner product of the ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, first2 + (last1 - first1))</tt>.
+ *
+ *  Specifically, this version of \p inner_product computes the sum
+ *  <tt>init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... </tt>
+ *
+ *  Unlike the C++ Standard Template Library function <tt>std::inner_product</tt>,
+ *  this version offers no guarantee on order of execution.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1 The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param init Initial value of the result.
+ *  \return The inner product of sequences <tt>[first1, last1)</tt>
+ *          and <tt>[first2, last2)</tt> plus \p init.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
+ *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
+ *          and is convertible to \p OutputType.
+ *
+ *  The following code demonstrates how to use \p inner_product to
+ *  compute the dot product of two vectors.
+ *
+ *  \code
+ *  #include <thrust/inner_product.h>
+ *  ...
+ *  float vec1[3] = {1.0f, 2.0f, 5.0f};
+ *  float vec2[3] = {4.0f, 1.0f, 5.0f};
+ *
+ *  float result = thrust::inner_product(vec1, vec1 + 3, vec2, 0.0f);
+ *
+ *  // result == 31.0f
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/inner_product.html
+ */
+template<typename InputIterator1, typename InputIterator2, typename OutputType>
+OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
+                         InputIterator2 first2, OutputType init);
+
+
+/*! \p inner_product calculates an inner product of the ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, first2 + (last1 - first1))</tt>.
+ *
+ *  This version of \p inner_product is identical to the first, except that is uses
+ *  two user-supplied function objects instead of \c operator+ and \c operator*.
+ *
+ *  Specifically, this version of \p inner_product computes the sum
+ *  <tt>binary_op1( init, binary_op2(*first1, *first2) ), ... </tt>
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1 The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param init Initial value of the result.
+ *  \param binary_op1 Generalized addition operation.
+ *  \param binary_op2 Generalized multiplication operation.
+ *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
+ *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
+ *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
+ * 
+ *  \code
+ *  #include <thrust/inner_product.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  float vec1[3] = {1.0f, 2.0f, 5.0f};
+ *  float vec2[3] = {4.0f, 1.0f, 5.0f};
+ *
+ *  float init = 0.0f;
+ *  thrust::plus<float>       binary_op1;
+ *  thrust::multiplies<float> binary_op2;
+ *
+ *  float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, init, binary_op1, binary_op2);
+ *
+ *  // result == 31.0f
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/inner_product.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputType,
+         typename BinaryFunction1,
+         typename BinaryFunction2>
+__host__ __device__
+OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init, 
+                         BinaryFunction1 binary_op1,
+                         BinaryFunction2 binary_op2);
+
+
+/*! \p inner_product calculates an inner product of the ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, first2 + (last1 - first1))</tt>.
+ *
+ *  This version of \p inner_product is identical to the first, except that is uses
+ *  two user-supplied function objects instead of \c operator+ and \c operator*.
+ *
+ *  Specifically, this version of \p inner_product computes the sum
+ *  <tt>binary_op1( init, binary_op2(*first1, *first2) ), ... </tt>
+ *
+ *  Unlike the C++ Standard Template Library function <tt>std::inner_product</tt>,
+ *  this version offers no guarantee on order of execution.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1 The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param init Initial value of the result.
+ *  \param binary_op1 Generalized addition operation.
+ *  \param binary_op2 Generalized multiplication operation.
+ *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
+ *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
+ *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
+ * 
+ *  \code
+ *  #include <thrust/inner_product.h>
+ *  ...
+ *  float vec1[3] = {1.0f, 2.0f, 5.0f};
+ *  float vec2[3] = {4.0f, 1.0f, 5.0f};
+ *
+ *  float init = 0.0f;
+ *  thrust::plus<float>       binary_op1;
+ *  thrust::multiplies<float> binary_op2;
+ *
+ *  float result = thrust::inner_product(vec1, vec1 + 3, vec2, init, binary_op1, binary_op2);
+ *
+ *  // result == 31.0f
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/inner_product.html
+ */
+template<typename InputIterator1, typename InputIterator2, typename OutputType,
+         typename BinaryFunction1, typename BinaryFunction2>
+OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
+                         InputIterator2 first2, OutputType init, 
+                         BinaryFunction1 binary_op1, BinaryFunction2 binary_op2);
+
+
+/*! \} // end transformed_reductions
+ *  \} // end reductions
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/inner_product.inl>
+
diff --git a/thrust/thrust/iterator/constant_iterator.h b/thrust/thrust/iterator/constant_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..cda85291855d2461da2fcd958fb05746d94101d0
--- /dev/null
+++ b/thrust/thrust/iterator/constant_iterator.h
@@ -0,0 +1,251 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/constant_iterator.h
+ *  \brief An iterator which returns a constant value when
+ *         dereferenced
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/constant_iterator_base.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p constant_iterator is an iterator which represents a pointer into a range
+ *  of constant values. This iterator is useful for creating a range filled with the same
+ *  value without explicitly storing it in memory. Using \p constant_iterator saves both
+ *  memory capacity and bandwidth.
+ *
+ *  The following code snippet demonstrates how to create a \p constant_iterator whose
+ *  \c value_type is \c int and whose value is \c 10.
+ *
+ *  \code
+ *  #include <thrust/iterator/constant_iterator.h>
+ *
+ *  thrust::constant_iterator<int> iter(10);
+ *
+ *  *iter;    // returns 10
+ *  iter[0];  // returns 10
+ *  iter[1];  // returns 10
+ *  iter[13]; // returns 10
+ *
+ *  // and so on...
+ *  \endcode
+ *
+ *  This next example demonstrates how to use a \p constant_iterator with the
+ *  \p thrust::transform function to increment all elements of a sequence by the
+ *  same value. We will create a temporary \p constant_iterator with the function
+ *  \p make_constant_iterator function in order to avoid explicitly specifying
+ *  its type:
+ *
+ *  \code
+ *  #include <thrust/iterator/constant_iterator.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<int> data(4);
+ *    data[0] = 3;
+ *    data[1] = 7;
+ *    data[2] = 2;
+ *    data[3] = 5;
+ *    
+ *    // add 10 to all values in data
+ *    thrust::transform(data.begin(), data.end(),
+ *                      thrust::make_constant_iterator(10),
+ *                      data.begin(),
+ *                      thrust::plus<int>());
+ *    
+ *    // data is now [13, 17, 12, 15]
+ *    
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_constant_iterator
+ */
+template<typename Value,
+         typename Incrementable = use_default,
+         typename System = use_default>
+  class constant_iterator
+    : public detail::constant_iterator_base<Value, Incrementable, System>::type
+{
+    /*! \cond
+     */
+    friend class thrust::iterator_core_access;
+    typedef typename detail::constant_iterator_base<Value, Incrementable, System>::type          super_t;
+    typedef typename detail::constant_iterator_base<Value, Incrementable, System>::incrementable incrementable;
+    typedef typename detail::constant_iterator_base<Value, Incrementable, System>::base_iterator base_iterator;
+
+  public:
+    typedef typename super_t::reference  reference;
+    typedef typename super_t::value_type value_type;
+
+    /*! \endcond
+     */
+
+    /*! Null constructor initializes this \p constant_iterator's constant using its
+     *  null constructor.
+     */
+    __host__ __device__
+    constant_iterator()
+      : super_t(), m_value() {}
+
+    /*! Copy constructor copies the value of another \p constant_iterator into this
+     *  \p constant_iterator.
+     *
+     *  \p rhs The constant_iterator to copy.
+     */
+    __host__ __device__
+    constant_iterator(constant_iterator const &rhs)
+      : super_t(rhs.base()), m_value(rhs.m_value) {}
+
+    /*! Copy constructor copies the value of another \p constant_iterator with related
+     *  System type.
+     *
+     *  \param rhs The \p constant_iterator to copy.
+     */
+    template<typename OtherSystem>
+    __host__ __device__
+    constant_iterator(constant_iterator<Value,Incrementable,OtherSystem> const &rhs,
+                      typename thrust::detail::enable_if_convertible<
+                        typename thrust::iterator_system<constant_iterator<Value,Incrementable,OtherSystem> >::type,
+                        typename thrust::iterator_system<super_t>::type
+                      >::type * = 0)
+      : super_t(rhs.base()), m_value(rhs.value()) {}
+
+    /*! This constructor receives a value to use as the constant value of this
+     *  \p constant_iterator and an index specifying the location of this
+     *  \p constant_iterator in a sequence.
+     *  
+     *  \p v The value of this \p constant_iterator's constant value.
+     *  \p i The index of this \p constant_iterator in a sequence. Defaults to the
+     *       value returned by \c Incrementable's null constructor. For example,
+     *       when <tt>Incrementable == int</tt>, \c 0.
+     */
+    __host__ __device__
+    constant_iterator(value_type const& v, incrementable const &i = incrementable())
+      : super_t(base_iterator(i)), m_value(v) {}
+
+    /*! This constructor is templated to allow construction from a value type and
+     *  incrementable type related this this \p constant_iterator's respective types.
+     *
+     *  \p v The value of this \p constant_iterator's constant value.
+     *  \p i The index of this \p constant_iterator in a sequence. Defaults to the
+     *       value returned by \c Incrementable's null constructor. For example,
+     *       when <tt>Incrementable == int</tt>, \c 0.
+     */
+    template<typename OtherValue, typename OtherIncrementable>
+    __host__ __device__
+    constant_iterator(OtherValue const& v, OtherIncrementable const& i = incrementable())
+      : super_t(base_iterator(i)), m_value(v) {}
+
+    /*! This method returns the value of this \p constant_iterator's constant value.
+     *  \return A \c const reference to this \p constant_iterator's constant value.
+     */
+    __host__ __device__
+    Value const& value() const
+    { return m_value; }
+
+    /*! \cond
+     */
+
+  protected:
+    __host__ __device__
+    Value const& value_reference() const
+    { return m_value; }
+
+    __host__ __device__
+    Value & value_reference()
+    { return m_value; }
+  
+  private: // Core iterator interface
+    __host__ __device__
+    reference dereference() const
+    {
+      return m_value;
+    }
+
+  private:
+    Value m_value;
+
+    /*! \endcond
+     */
+}; // end constant_iterator
+
+
+/*! This version of \p make_constant_iterator creates a \p constant_iterator
+ *  from values given for both value and index. The type of \p constant_iterator
+ *  may be inferred by the compiler from the types of its parameters.
+ *
+ *  \param x The value of the returned \p constant_iterator's constant value.
+ *  \param i The index of the returned \p constant_iterator within a sequence.
+ *           The type of this parameter defaults to \c int. In the default case,
+ *           the value of this parameter is \c 0.
+ *
+ *  \return A new \p constant_iterator with constant value & index as given
+ *          by \p x & \p i.
+ *
+ *  \see constant_iterator
+ */
+template<typename V, typename I>
+inline __host__ __device__
+constant_iterator<V,I> make_constant_iterator(V x, I i = int())
+{
+  return constant_iterator<V,I>(x, i);
+} // end make_constant_iterator()
+
+
+/*! This version of \p make_constant_iterator creates a \p constant_iterator
+ *  using only a parameter for the desired constant value. The value of the
+ *  returned \p constant_iterator's index is set to \c 0.
+ *
+ *  \param x The value of the returned \p constant_iterator's constant value.
+ *  \return A new \p constant_iterator with constant value equal to \p x and
+ *          index equal to \c 0.
+ *  \see constant_iterator
+ */
+template<typename V>
+inline __host__ __device__
+constant_iterator<V> make_constant_iterator(V x)
+{
+  return constant_iterator<V>(x, 0);
+} // end make_constant_iterator()
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/iterator/counting_iterator.h b/thrust/thrust/iterator/counting_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..25d495db05ee3d18467ef7975147a839086bfe4a
--- /dev/null
+++ b/thrust/thrust/iterator/counting_iterator.h
@@ -0,0 +1,247 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/counting_iterator.h
+ *  \brief An iterator which returns an increasing incrementable value
+ *         when dereferenced
+ */
+
+/*
+ * Copyright David Abrahams 2003.
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/iterator/iterator_categories.h>
+
+// #include the details first
+#include <thrust/iterator/detail/counting_iterator.inl>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p counting_iterator is an iterator which represents a pointer into a range
+ *  of sequentially changing values. This iterator is useful for creating a range
+ *  filled with a sequence without explicitly storing it in memory. Using
+ *  \p counting_iterator saves memory capacity and bandwidth.
+ *
+ *  The following code snippet demonstrates how to create a \p counting_iterator whose
+ *  \c value_type is \c int and which sequentially increments by \c 1.
+ *
+ *  \code
+ *  #include <thrust/iterator/counting_iterator.h>
+ *  ...
+ *  // create iterators
+ *  thrust::counting_iterator<int> first(10);
+ *  thrust::counting_iterator<int> last = first + 3;
+ *
+ *  first[0]   // returns 10
+ *  first[1]   // returns 11
+ *  first[100] // returns 110
+ *
+ *  // sum of [first, last)
+ *  thrust::reduce(first, last);   // returns 33 (i.e. 10 + 11 + 12)
+ *
+ *  // initialize vector to [0,1,2,..]
+ *  thrust::counting_iterator<int> iter(0);
+ *  thrust::device_vector<int> vec(500);
+ *  thrust::copy(iter, iter + vec.size(), vec.begin());
+ *  \endcode
+ *
+ *  This next example demonstrates how to use a \p counting_iterator with the
+ *  \p thrust::copy_if function to compute the indices of the non-zero elements
+ *  of a \p device_vector. In this example, we use the \p make_counting_iterator
+ *  function to avoid specifying the type of the \p counting_iterator.
+ *
+ *  \code
+ *  #include <thrust/iterator/counting_iterator.h>
+ *  #include <thrust/copy.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *   // this example computes indices for all the nonzero values in a sequence
+ *
+ *   // sequence of zero and nonzero values
+ *   thrust::device_vector<int> stencil(8);
+ *   stencil[0] = 0;
+ *   stencil[1] = 1;
+ *   stencil[2] = 1;
+ *   stencil[3] = 0;
+ *   stencil[4] = 0;
+ *   stencil[5] = 1;
+ *   stencil[6] = 0;
+ *   stencil[7] = 1;
+ *
+ *   // storage for the nonzero indices
+ *   thrust::device_vector<int> indices(8);
+ *
+ *   // compute indices of nonzero elements
+ *   typedef thrust::device_vector<int>::iterator IndexIterator;
+ *
+ *   // use make_counting_iterator to define the sequence [0, 8)
+ *   IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0),
+ *                                               thrust::make_counting_iterator(8),
+ *                                               stencil.begin(),
+ *                                               indices.begin(),
+ *                                               thrust::identity<int>());
+ *   // indices now contains [1,2,5,7]
+ *
+ *   return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_counting_iterator
+ */
+template<typename Incrementable,
+         typename System = use_default,
+         typename Traversal = use_default,
+         typename Difference = use_default>
+  class counting_iterator
+    : public detail::counting_iterator_base<Incrementable, System, Traversal, Difference>::type
+{
+    /*! \cond
+     */
+    typedef typename detail::counting_iterator_base<Incrementable, System, Traversal, Difference>::type super_t;
+
+    friend class thrust::iterator_core_access;
+
+  public:
+    typedef typename super_t::reference       reference;
+    typedef typename super_t::difference_type difference_type;
+
+    /*! \endcond
+     */
+
+    /*! Null constructor initializes this \p counting_iterator's \c Incrementable
+     *  counter using its null constructor.
+     */
+    __host__ __device__
+    counting_iterator() {}
+
+    /*! Copy constructor copies the value of another \p counting_iterator into a
+     *  new \p counting_iterator.
+     *
+     *  \p rhs The \p counting_iterator to copy.
+     */
+    __host__ __device__
+    counting_iterator(counting_iterator const &rhs):super_t(rhs.base()){}
+
+    /*! Copy constructor copies the value of another counting_iterator
+     *  with related System type.
+     *
+     *  \param rhs The \p counting_iterator to copy.
+     */
+    template<typename OtherSystem>
+    __host__ __device__
+    counting_iterator(counting_iterator<Incrementable, OtherSystem, Traversal, Difference> const &rhs,
+                      typename thrust::detail::enable_if_convertible<
+                        typename thrust::iterator_system<counting_iterator<Incrementable,OtherSystem,Traversal,Difference> >::type,
+                        typename thrust::iterator_system<super_t>::type
+                      >::type * = 0)
+      : super_t(rhs.base()){}
+
+    /*! This \c explicit constructor copies the value of an \c Incrementable
+     *  into a new \p counting_iterator's \c Incrementable counter.
+     *
+     *  \param x The initial value of the new \p counting_iterator's \c Incrementable
+     *         counter.
+     */
+    __host__ __device__
+    explicit counting_iterator(Incrementable x):super_t(x){}
+
+#if THRUST_CPP_DIALECT >= 2011
+    counting_iterator & operator=(const counting_iterator &) = default;
+#endif
+
+    /*! \cond
+     */
+  private:
+    __host__ __device__
+    reference dereference() const
+    {
+      return this->base_reference();
+    }
+
+    // note that we implement equal specially for floating point counting_iterator
+    template <typename OtherIncrementable, typename OtherSystem, typename OtherTraversal, typename OtherDifference>
+    __host__ __device__
+    bool equal(counting_iterator<OtherIncrementable, OtherSystem, OtherTraversal, OtherDifference> const& y) const
+    {
+      typedef thrust::detail::counting_iterator_equal<difference_type,Incrementable,OtherIncrementable> e;
+      return e::equal(this->base(), y.base());
+    }
+
+    template <class OtherIncrementable>
+    __host__ __device__
+    difference_type
+    distance_to(counting_iterator<OtherIncrementable, System, Traversal, Difference> const& y) const
+    {
+      typedef typename
+      thrust::detail::eval_if<
+        thrust::detail::is_numeric<Incrementable>::value,
+        thrust::detail::identity_<thrust::detail::number_distance<difference_type, Incrementable, OtherIncrementable> >,
+        thrust::detail::identity_<thrust::detail::iterator_distance<difference_type, Incrementable, OtherIncrementable> >
+      >::type d;
+
+      return d::distance(this->base(), y.base());
+    }
+
+    /*! \endcond
+     */
+}; // end counting_iterator
+
+
+/*! \p make_counting_iterator creates a \p counting_iterator
+ *  using an initial value for its \c Incrementable counter.
+ *
+ *  \param x The initial value of the new \p counting_iterator's counter.
+ *  \return A new \p counting_iterator whose counter has been initialized to \p x.
+ */
+template <typename Incrementable>
+inline __host__ __device__
+counting_iterator<Incrementable> make_counting_iterator(Incrementable x)
+{
+  return counting_iterator<Incrementable>(x);
+}
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/any_assign.h b/thrust/thrust/iterator/detail/any_assign.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e7f2cf20bedd44001611b62ce498ea9687dd7db
--- /dev/null
+++ b/thrust/thrust/iterator/detail/any_assign.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// a type which may be assigned any other type
+struct any_assign
+{
+  inline __host__ __device__ any_assign()
+  {}
+
+  template<typename T>
+  inline __host__ __device__ any_assign(T)
+  {}
+
+  template<typename T>
+  inline __host__ __device__
+  any_assign &operator=(T)
+  {
+    if(0)
+    {
+      // trick the compiler into silencing "warning: this expression has no effect"
+      int *x = 0;
+      *x = 13;
+    } // end if
+
+    return *this;
+  }
+};
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/any_system_tag.h b/thrust/thrust/iterator/detail/any_system_tag.h
new file mode 100644
index 0000000000000000000000000000000000000000..27640b5e0dd83881cbd16d19229c409307bc7da8
--- /dev/null
+++ b/thrust/thrust/iterator/detail/any_system_tag.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+struct any_system_tag
+  : thrust::execution_policy<any_system_tag>
+{
+  // allow any_system_tag to convert to any type at all
+  // XXX make this safer using enable_if<is_tag<T>> upon c++11
+  template<typename T> operator T () const {return T();}
+};
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/constant_iterator_base.h b/thrust/thrust/iterator/detail/constant_iterator_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..56b1cc4f4c244b3f54793b05312f7419fc696cef
--- /dev/null
+++ b/thrust/thrust/iterator/detail/constant_iterator_base.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+
+namespace thrust
+{
+
+// forward declaration of constant_iterator
+template<typename,typename,typename> class constant_iterator;
+
+namespace detail
+{
+
+template<typename Value,
+         typename Incrementable,
+         typename System>
+  struct constant_iterator_base
+{
+  typedef Value              value_type;
+
+  // the reference type is the same as the value_type.
+  // we wish to avoid returning a reference to the internal state
+  // of the constant_iterator, which is prone to subtle bugs.
+  // consider the temporary iterator created in the expression
+  // *(iter + i)
+  typedef value_type         reference;
+
+  // the incrementable type is int unless otherwise specified
+  typedef typename thrust::detail::ia_dflt_help<
+    Incrementable,
+    thrust::detail::identity_<thrust::detail::intmax_t>
+  >::type incrementable;
+
+  typedef typename thrust::counting_iterator<
+    incrementable,
+    System,
+    thrust::random_access_traversal_tag
+  > base_iterator;
+
+  typedef typename thrust::iterator_adaptor<
+    constant_iterator<Value, Incrementable, System>,
+    base_iterator,
+    value_type, // XXX we may need to pass const value_type here as boost counting_iterator does
+    typename thrust::iterator_system<base_iterator>::type,
+    typename thrust::iterator_traversal<base_iterator>::type,
+    reference
+  > type;
+}; // end constant_iterator_base
+
+} // end detail
+  
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/counting_iterator.inl b/thrust/thrust/iterator/detail/counting_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..abcd87989ba1b6c8cfc0936f133ba9558df2fff5
--- /dev/null
+++ b/thrust/thrust/iterator/detail/counting_iterator.inl
@@ -0,0 +1,141 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/numeric_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <cstddef>
+
+namespace thrust
+{
+
+// forward declaration of counting_iterator
+template <typename Incrementable, typename System, typename Traversal, typename Difference>
+  class counting_iterator;
+
+namespace detail
+{
+
+template <typename Incrementable, typename System, typename Traversal, typename Difference>
+  struct counting_iterator_base
+{
+  typedef typename thrust::detail::eval_if<
+    // use any_system_tag if we are given use_default
+    thrust::detail::is_same<System,use_default>::value,
+    thrust::detail::identity_<thrust::any_system_tag>,
+    thrust::detail::identity_<System>
+  >::type system;
+
+  typedef typename thrust::detail::ia_dflt_help<
+      Traversal,
+      thrust::detail::eval_if<
+          thrust::detail::is_numeric<Incrementable>::value,
+          thrust::detail::identity_<random_access_traversal_tag>,
+          thrust::iterator_traversal<Incrementable>
+      >
+  >::type traversal;
+
+  // unlike Boost, we explicitly use std::ptrdiff_t as the difference type
+  // for floating point counting_iterators
+  typedef typename thrust::detail::ia_dflt_help<
+    Difference,
+    thrust::detail::eval_if<
+      thrust::detail::is_numeric<Incrementable>::value,
+        thrust::detail::eval_if<
+          thrust::detail::is_integral<Incrementable>::value,
+          thrust::detail::numeric_difference<Incrementable>,
+          thrust::detail::identity_<std::ptrdiff_t>
+        >,
+      thrust::iterator_difference<Incrementable>
+    >
+  >::type difference;
+
+  // our implementation departs from Boost's in that counting_iterator::dereference
+  // returns a copy of its counter, rather than a reference to it. returning a reference
+  // to the internal state of an iterator causes subtle bugs (consider the temporary
+  // iterator created in the expression *(iter + i)) and has no compelling use case
+  typedef thrust::iterator_adaptor<
+    counting_iterator<Incrementable, System, Traversal, Difference>, // self
+    Incrementable,                                                  // Base
+    Incrementable,                                                  // XXX we may need to pass const here as Boost does
+    system,
+    traversal,
+    Incrementable,
+    difference
+  > type;
+}; // end counting_iterator_base
+
+
+template<typename Difference, typename Incrementable1, typename Incrementable2>
+  struct iterator_distance
+{
+  __host__ __device__
+  static Difference distance(Incrementable1 x, Incrementable2 y)
+  {
+    return y - x;
+  }
+};
+
+
+template<typename Difference, typename Incrementable1, typename Incrementable2>
+  struct number_distance
+{
+  __host__ __device__
+  static Difference distance(Incrementable1 x, Incrementable2 y)
+  {
+      return static_cast<Difference>(numeric_distance(x,y));
+  }
+};
+
+
+template<typename Difference, typename Incrementable1, typename Incrementable2, typename Enable = void>
+  struct counting_iterator_equal
+{
+  __host__ __device__
+  static bool equal(Incrementable1 x, Incrementable2 y)
+  {
+    return x == y;
+  }
+};
+
+
+// specialization for floating point equality
+template<typename Difference, typename Incrementable1, typename Incrementable2>
+  struct counting_iterator_equal<
+    Difference,
+    Incrementable1,
+    Incrementable2,
+    typename thrust::detail::enable_if<
+      thrust::detail::is_floating_point<Incrementable1>::value ||
+      thrust::detail::is_floating_point<Incrementable2>::value
+    >::type
+  >
+{
+  __host__ __device__
+  static bool equal(Incrementable1 x, Incrementable2 y)
+  {
+    typedef number_distance<Difference,Incrementable1,Incrementable2> d;
+    return d::distance(x,y) == 0;
+  }
+};
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/device_system_tag.h b/thrust/thrust/iterator/detail/device_system_tag.h
new file mode 100644
index 0000000000000000000000000000000000000000..394b991cde7e23a76a32b6b92476b34881352b87
--- /dev/null
+++ b/thrust/thrust/iterator/detail/device_system_tag.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the device system's execution_policy header
+#define __THRUST_DEVICE_SYSTEM_TAG_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/execution_policy.h>
+#include __THRUST_DEVICE_SYSTEM_TAG_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TAG_HEADER
+
+namespace thrust
+{
+
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag device_system_tag;
+
+} // end thrust
+
+// TODO remove this in 1.8.0
+namespace thrust
+{
+
+typedef THRUST_DEPRECATED device_system_tag device_space_tag;
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/discard_iterator_base.h b/thrust/thrust/iterator/detail/discard_iterator_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4a8c312b158807a7daef7d4bad79c4367a2cd33
--- /dev/null
+++ b/thrust/thrust/iterator/detail/discard_iterator_base.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/detail/any_assign.h>
+#include <cstddef> // for std::ptrdiff_t
+
+namespace thrust
+{
+
+// forward declaration of discard_iterator
+template<typename> class discard_iterator;
+
+namespace detail
+{
+
+
+template<typename System>
+  struct discard_iterator_base
+{
+  // XXX value_type should actually be void
+  //     but this interferes with zip_iterator<discard_iterator>
+  typedef any_assign         value_type;
+  typedef any_assign&        reference;
+  typedef std::ptrdiff_t     incrementable;
+
+  typedef typename thrust::counting_iterator<
+    incrementable,
+    System,
+    thrust::random_access_traversal_tag
+  > base_iterator;
+
+  typedef typename thrust::iterator_adaptor<
+    discard_iterator<System>,
+    base_iterator,
+    value_type,
+    typename thrust::iterator_system<base_iterator>::type,
+    typename thrust::iterator_traversal<base_iterator>::type,
+    reference
+  > type;
+}; // end discard_iterator_base
+
+
+} // end detail
+  
+} // end thrust
+
+
diff --git a/thrust/thrust/iterator/detail/distance_from_result.h b/thrust/thrust/iterator/detail/distance_from_result.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b7e0d60e5d31816ce9695444129ff8b5eed52d7
--- /dev/null
+++ b/thrust/thrust/iterator/detail/distance_from_result.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// since both arguments are known to be specializations of iterator_facade,
+// it's legal to access IteratorFacade2::difference_type
+template<typename IteratorFacade1, typename IteratorFacade2>
+  struct distance_from_result
+    : eval_if<
+        is_convertible<IteratorFacade2,IteratorFacade1>::value,
+        identity_<typename IteratorFacade1::difference_type>,
+        identity_<typename IteratorFacade2::difference_type>
+      >
+{};
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/host_system_tag.h b/thrust/thrust/iterator/detail/host_system_tag.h
new file mode 100644
index 0000000000000000000000000000000000000000..a487e6ac5f21614233d518c95f240d2191deeab6
--- /dev/null
+++ b/thrust/thrust/iterator/detail/host_system_tag.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the host system's execution_policy header
+#define __THRUST_HOST_SYSTEM_TAG_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/execution_policy.h>
+#include __THRUST_HOST_SYSTEM_TAG_HEADER
+#undef __THRUST_HOST_SYSTEM_TAG_HEADER
+
+namespace thrust
+{
+
+typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag host_system_tag;
+
+} // end thrust
+
+// TODO remove this in 1.8.0
+namespace thrust
+{
+
+typedef THRUST_DEPRECATED host_system_tag host_space_tag;
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/is_iterator_category.h b/thrust/thrust/iterator/detail/is_iterator_category.h
new file mode 100644
index 0000000000000000000000000000000000000000..b538358be33bf6bfcda040bfada6fa74cf8e18b8
--- /dev/null
+++ b/thrust/thrust/iterator/detail/is_iterator_category.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename T>
+  struct is_host_iterator_category
+    : thrust::detail::or_<
+        thrust::detail::is_convertible<T, thrust::input_host_iterator_tag>,
+        thrust::detail::is_convertible<T, thrust::output_host_iterator_tag>
+      >
+{
+}; // end is_host_iterator_category
+
+template <typename T>
+  struct is_device_iterator_category
+    : thrust::detail::or_<
+        thrust::detail::is_convertible<T, thrust::input_device_iterator_tag>,
+        thrust::detail::is_convertible<T, thrust::output_device_iterator_tag>
+      >
+{
+}; // end is_device_iterator_category
+
+
+template <typename T>
+  struct is_iterator_category
+    : thrust::detail::or_<
+        is_host_iterator_category<T>,
+        is_device_iterator_category<T>
+      >
+{
+}; // end is_iterator_category
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_adaptor_base.h b/thrust/thrust/iterator/detail/iterator_adaptor_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9dbfaae6b82a06c3b54763bb518d78b830bb952
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_adaptor_base.h
@@ -0,0 +1,111 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/use_default.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace thrust
+{
+
+
+// forward declaration of iterator_adaptor for iterator_adaptor_base below
+template<typename Derived,
+         typename Base,
+         typename Value,
+         typename System,
+         typename Traversal,
+         typename Reference,
+         typename Difference
+>
+class iterator_adaptor;
+
+
+namespace detail
+{
+
+// If T is use_default, return the result of invoking
+// DefaultNullaryFn, otherwise return T.
+// XXX rename to dflt_help
+template <class T, class DefaultNullaryFn>
+struct ia_dflt_help
+  : thrust::detail::eval_if<
+        thrust::detail::is_same<T, thrust::use_default>::value
+      , DefaultNullaryFn
+      , thrust::detail::identity_<T>
+    >
+{
+}; // end ia_dflt_help
+
+
+// A metafunction which computes an iterator_adaptor's base class,
+// a specialization of iterator_facade.
+template<typename Derived,
+         typename Base,
+         typename Value,
+         typename System,
+         typename Traversal,
+         typename Reference,
+         typename Difference
+>
+  struct iterator_adaptor_base
+{
+  typedef typename ia_dflt_help<
+    Value,
+    iterator_value<Base>
+  >::type value;
+
+  typedef typename ia_dflt_help<
+    System,
+    thrust::iterator_system<Base>
+  >::type system;
+
+  typedef typename ia_dflt_help<
+    Traversal,
+    thrust::iterator_traversal<Base>
+  >::type traversal;
+
+  typedef typename ia_dflt_help<
+    Reference,
+    thrust::detail::eval_if<
+      thrust::detail::is_same<Value,use_default>::value,
+      thrust::iterator_reference<Base>,
+      thrust::detail::add_reference<Value>
+    >
+  >::type reference;
+
+  typedef typename ia_dflt_help<
+    Difference,
+    iterator_difference<Base>
+  >::type difference;
+
+  typedef thrust::iterator_facade<
+    Derived,
+    value,
+    system,
+    traversal,
+    reference,
+    difference
+  > type;
+}; // end iterator_adaptor_base
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_category_to_system.h b/thrust/thrust/iterator/detail/iterator_category_to_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd378fae7314fac33f4fadf5cb1ae348dbeaa0e7
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_category_to_system.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/iterator/detail/host_system_tag.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// forward declaration
+template <typename> struct is_iterator_system;
+
+template <typename> struct device_iterator_category_to_backend_system;
+
+// XXX this should work entirely differently
+// we should just specialize this metafunction for iterator_category_with_system_and_traversal
+template<typename Category>
+  struct iterator_category_to_system
+    // convertible to host iterator?
+    : eval_if<
+        or_<
+          is_convertible<Category, thrust::input_host_iterator_tag>,
+          is_convertible<Category, thrust::output_host_iterator_tag>
+        >::value,
+
+        detail::identity_<thrust::host_system_tag>,
+        
+        // convertible to device iterator?
+        eval_if<
+          or_<
+            is_convertible<Category, thrust::input_device_iterator_tag>,
+            is_convertible<Category, thrust::output_device_iterator_tag>
+          >::value,
+
+          detail::identity_<thrust::device_system_tag>,
+
+          // unknown system
+          detail::identity_<void>
+        > // if device
+      > // if host
+{
+}; // end iterator_category_to_system
+
+
+template<typename CategoryOrTraversal>
+  struct iterator_category_or_traversal_to_system
+    : eval_if<
+        is_iterator_system<CategoryOrTraversal>::value,
+        detail::identity_<CategoryOrTraversal>,
+        iterator_category_to_system<CategoryOrTraversal>
+      >
+{
+}; // end iterator_category_or_traversal_to_system
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_category_to_traversal.h b/thrust/thrust/iterator/detail/iterator_category_to_traversal.h
new file mode 100644
index 0000000000000000000000000000000000000000..7596682e2ecaa42f0128b7d2c4b70707199b9b1a
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_category_to_traversal.h
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/iterator/detail/iterator_category_to_system.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+// forward declarations
+template <typename> struct is_iterator_system;
+template <typename> struct is_iterator_traversal;
+
+template <typename Category>
+  struct host_system_category_to_traversal
+    : eval_if<
+        is_convertible<Category, random_access_host_iterator_tag>::value,
+        detail::identity_<random_access_traversal_tag>,
+        eval_if<
+          is_convertible<Category, bidirectional_host_iterator_tag>::value,
+          detail::identity_<bidirectional_traversal_tag>,
+          eval_if<
+            is_convertible<Category, forward_host_iterator_tag>::value,
+            detail::identity_<forward_traversal_tag>,
+            eval_if<
+              is_convertible<Category, input_host_iterator_tag>::value,
+              detail::identity_<single_pass_traversal_tag>,
+              eval_if<
+                is_convertible<Category, output_host_iterator_tag>::value,
+                detail::identity_<incrementable_traversal_tag>,
+                void
+              >
+            >
+          >
+        >
+      >
+{
+}; // end host_system_category_to_traversal
+
+
+
+template <typename Category>
+  struct device_system_category_to_traversal
+    : eval_if<
+        is_convertible<Category, random_access_device_iterator_tag>::value,
+        detail::identity_<random_access_traversal_tag>,
+        eval_if<
+          is_convertible<Category, bidirectional_device_iterator_tag>::value,
+          detail::identity_<bidirectional_traversal_tag>,
+          eval_if<
+            is_convertible<Category, forward_device_iterator_tag>::value,
+            detail::identity_<forward_traversal_tag>,
+            eval_if<
+              is_convertible<Category, input_device_iterator_tag>::value,
+              detail::identity_<single_pass_traversal_tag>,
+              eval_if<
+                is_convertible<Category, output_device_iterator_tag>::value,
+                detail::identity_<incrementable_traversal_tag>,
+                void
+              >
+            >
+          >
+        >
+      >
+{
+}; // end device_system_category_to_traversal
+
+
+template<typename Category>
+  struct category_to_traversal
+      // check for host system
+    : eval_if<
+        or_<
+          is_convertible<Category, thrust::input_host_iterator_tag>,
+          is_convertible<Category, thrust::output_host_iterator_tag>
+        >::value,
+
+        host_system_category_to_traversal<Category>,
+
+        // check for device system
+        eval_if<
+          or_<
+            is_convertible<Category, thrust::input_device_iterator_tag>,
+            is_convertible<Category, thrust::output_device_iterator_tag>
+          >::value,
+
+          device_system_category_to_traversal<Category>,
+
+          // unknown category
+          void
+        >
+      >
+{};
+
+
+template <typename CategoryOrTraversal>
+  struct iterator_category_to_traversal
+    : eval_if<
+        is_iterator_traversal<CategoryOrTraversal>::value,
+        detail::identity_<CategoryOrTraversal>,
+        category_to_traversal<CategoryOrTraversal>
+      >
+{
+}; // end iterator_category_to_traversal
+
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_category_with_system_and_traversal.h b/thrust/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f5374b165d06a582ded7fe7cffebc70822dcf2f
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename Category, typename System, typename Traversal>
+  struct iterator_category_with_system_and_traversal
+    : Category
+{
+}; // end iterator_category_with_system_and_traversal
+
+
+// specialize iterator_category_to_system for iterator_category_with_system_and_traversal
+template<typename Category> struct iterator_category_to_system;
+
+template<typename Category, typename System, typename Traversal>
+  struct iterator_category_to_system<iterator_category_with_system_and_traversal<Category,System,Traversal> >
+{
+  typedef System type;
+}; // end iterator_category_to_system
+
+
+// specialize iterator_category_to_traversal for iterator_category_with_system_and_traversal
+template<typename Category> struct iterator_category_to_traversal;
+
+template<typename Category, typename System, typename Traversal>
+  struct iterator_category_to_traversal<iterator_category_with_system_and_traversal<Category,System,Traversal> >
+{
+  typedef Traversal type;
+}; // end iterator_category_to_traversal
+
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_facade_category.h b/thrust/thrust/iterator/detail/iterator_facade_category.h
new file mode 100644
index 0000000000000000000000000000000000000000..e00d3ef054bd740b801e47cc1344e38621d8c055
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_facade_category.h
@@ -0,0 +1,253 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/host_system_tag.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/iterator/detail/is_iterator_category.h>
+#include <thrust/iterator/detail/iterator_category_with_system_and_traversal.h>
+#include <thrust/iterator/detail/iterator_category_to_traversal.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+
+// adapted from http://www.boost.org/doc/libs/1_37_0/libs/iterator/doc/iterator_facade.html#iterator-category
+//
+// in our implementation, R need not be a reference type to result in a category
+// derived from forward_XXX_iterator_tag
+//
+// iterator-category(T,V,R) :=
+//   if(T is convertible to input_host_iterator_tag
+//      || T is convertible to output_host_iterator_tag
+//      || T is convertible to input_device_iterator_tag
+//      || T is convertible to output_device_iterator_tag
+//   )
+//     return T
+//
+//   else if (T is not convertible to incrementable_traversal_tag)
+//     the program is ill-formed
+//
+//   else return a type X satisfying the following two constraints:
+//
+//     1. X is convertible to X1, and not to any more-derived
+//        type, where X1 is defined by:
+//
+//        if (T is convertible to forward_traversal_tag)
+//        {
+//          if (T is convertible to random_access_traversal_tag)
+//            X1 = random_access_host_iterator_tag
+//          else if (T is convertible to bidirectional_traversal_tag)
+//            X1 = bidirectional_host_iterator_tag
+//          else
+//            X1 = forward_host_iterator_tag
+//        }
+//        else
+//        {
+//          if (T is convertible to single_pass_traversal_tag
+//              && R is convertible to V)
+//            X1 = input_host_iterator_tag
+//          else
+//            X1 = T
+//        }
+//
+//     2. category-to-traversal(X) is convertible to the most
+//        derived traversal tag type to which X is also convertible,
+//        and not to any more-derived traversal tag type.
+
+
+template<typename System, typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category;
+
+
+// Thrust's implementation of iterator_facade_default_category is slightly
+// different from Boost's equivalent.
+// Thrust does not check is_convertible<Reference, ValueParam> because Reference
+// may not be a complete type at this point, and implementations of is_convertible
+// typically require that both types be complete.
+// Instead, it simply assumes that if is_convertible<Traversal, single_pass_traversal_tag>,
+// then the category is input_iterator_tag
+
+
+// this is the function for standard system iterators
+template<typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category_std :
+    thrust::detail::eval_if<
+      thrust::detail::is_convertible<Traversal, thrust::forward_traversal_tag>::value,
+      thrust::detail::eval_if<
+        thrust::detail::is_convertible<Traversal, thrust::random_access_traversal_tag>::value,
+        thrust::detail::identity_<std::random_access_iterator_tag>,
+        thrust::detail::eval_if<
+          thrust::detail::is_convertible<Traversal, thrust::bidirectional_traversal_tag>::value,
+          thrust::detail::identity_<std::bidirectional_iterator_tag>,
+          thrust::detail::identity_<std::forward_iterator_tag>
+        >
+      >,
+      thrust::detail::eval_if< // XXX note we differ from Boost here
+        thrust::detail::is_convertible<Traversal, thrust::single_pass_traversal_tag>::value,
+        thrust::detail::identity_<std::input_iterator_tag>,
+        thrust::detail::identity_<Traversal>
+      >
+    >
+{
+}; // end iterator_facade_default_category_std
+
+
+// this is the function for host system iterators
+template<typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category_host :
+    thrust::detail::eval_if<
+      thrust::detail::is_convertible<Traversal, thrust::forward_traversal_tag>::value,
+      thrust::detail::eval_if<
+        thrust::detail::is_convertible<Traversal, thrust::random_access_traversal_tag>::value,
+        thrust::detail::identity_<thrust::random_access_host_iterator_tag>,
+        thrust::detail::eval_if<
+          thrust::detail::is_convertible<Traversal, thrust::bidirectional_traversal_tag>::value,
+          thrust::detail::identity_<thrust::bidirectional_host_iterator_tag>,
+          thrust::detail::identity_<thrust::forward_host_iterator_tag>
+        >
+      >,
+      thrust::detail::eval_if< // XXX note we differ from Boost here
+        thrust::detail::is_convertible<Traversal, thrust::single_pass_traversal_tag>::value,
+        thrust::detail::identity_<thrust::input_host_iterator_tag>,
+        thrust::detail::identity_<Traversal>
+      >
+    >
+{
+}; // end iterator_facade_default_category_host
+
+
+// this is the function for device system iterators
+template<typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category_device :
+    thrust::detail::eval_if<
+      thrust::detail::is_convertible<Traversal, thrust::forward_traversal_tag>::value,
+      thrust::detail::eval_if<
+        thrust::detail::is_convertible<Traversal, thrust::random_access_traversal_tag>::value,
+        thrust::detail::identity_<thrust::random_access_device_iterator_tag>,
+        thrust::detail::eval_if<
+          thrust::detail::is_convertible<Traversal, thrust::bidirectional_traversal_tag>::value,
+          thrust::detail::identity_<thrust::bidirectional_device_iterator_tag>,
+          thrust::detail::identity_<thrust::forward_device_iterator_tag>
+        >
+      >,
+      thrust::detail::eval_if<
+        thrust::detail::is_convertible<Traversal, thrust::single_pass_traversal_tag>::value, // XXX note we differ from Boost here
+        thrust::detail::identity_<thrust::input_device_iterator_tag>,
+        thrust::detail::identity_<Traversal>
+      >
+    >
+{
+}; // end iterator_facade_default_category_device
+
+
+// this is the function for any system iterators
+template<typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category_any
+{
+  typedef thrust::detail::iterator_category_with_system_and_traversal<
+    typename iterator_facade_default_category_std<Traversal, ValueParam, Reference>::type,
+    thrust::any_system_tag,
+    Traversal
+  > type;
+}; // end iterator_facade_default_category_any
+
+
+template<typename System, typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_default_category
+      // check for any system
+    : thrust::detail::eval_if<
+        thrust::detail::is_convertible<System, thrust::any_system_tag>::value,
+        iterator_facade_default_category_any<Traversal, ValueParam, Reference>,
+
+        // check for host system
+        thrust::detail::eval_if<
+          thrust::detail::is_convertible<System, thrust::host_system_tag>::value,
+          iterator_facade_default_category_host<Traversal, ValueParam, Reference>,
+
+          // check for device system
+          thrust::detail::eval_if<
+            thrust::detail::is_convertible<System, thrust::device_system_tag>::value,
+            iterator_facade_default_category_device<Traversal, ValueParam, Reference>,
+
+            // if we don't recognize the system, get a standard iterator category
+            // and combine it with System & Traversal
+            thrust::detail::identity_<
+              thrust::detail::iterator_category_with_system_and_traversal<
+                typename iterator_facade_default_category_std<Traversal, ValueParam, Reference>::type,
+                System,
+                Traversal
+              >
+            >
+          >
+        >
+      >
+{};
+
+
+template<typename System, typename Traversal, typename ValueParam, typename Reference>
+  struct iterator_facade_category_impl
+{
+  typedef typename iterator_facade_default_category<
+    System,Traversal,ValueParam,Reference
+  >::type category;
+
+  // we must be able to deduce both Traversal & System from category
+  // otherwise, munge them all together
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::and_<
+      thrust::detail::is_same<
+        Traversal,
+        typename thrust::detail::iterator_category_to_traversal<category>::type
+      >,
+      thrust::detail::is_same<
+        System,
+        typename thrust::detail::iterator_category_to_system<category>::type
+      >
+    >::value,
+    thrust::detail::identity_<category>,
+    thrust::detail::identity_<thrust::detail::iterator_category_with_system_and_traversal<category,System,Traversal> >
+  >::type type;
+}; // end iterator_facade_category_impl
+
+
+template<typename CategoryOrSystem,
+         typename CategoryOrTraversal,
+         typename ValueParam,
+         typename Reference>
+  struct iterator_facade_category
+{
+  typedef typename
+  thrust::detail::eval_if<
+    thrust::detail::is_iterator_category<CategoryOrTraversal>::value,
+    thrust::detail::identity_<CategoryOrTraversal>, // categories are fine as-is
+    iterator_facade_category_impl<CategoryOrSystem, CategoryOrTraversal, ValueParam, Reference>
+  >::type type;
+}; // end iterator_facade_category
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_traits.inl b/thrust/thrust/iterator/detail/iterator_traits.inl
new file mode 100644
index 0000000000000000000000000000000000000000..8a9cc4ffb0a781afe692298283670514cd08c21c
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_traits.inl
@@ -0,0 +1,127 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file iterator_traits.inl
+ *  \brief Inline file for iterator_traits.h.
+ */
+
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/iterator/detail/iterator_category_to_traversal.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/void_t.h>
+
+namespace thrust
+{
+
+template<typename Iterator>
+  struct iterator_value
+{
+  typedef typename thrust::iterator_traits<Iterator>::value_type type;
+}; // end iterator_value
+
+
+template<typename Iterator>
+  struct iterator_pointer
+{
+  typedef typename thrust::iterator_traits<Iterator>::pointer type;
+}; // end iterator_pointer
+
+
+template<typename Iterator>
+  struct iterator_reference
+{
+  typedef typename iterator_traits<Iterator>::reference type;
+}; // end iterator_reference
+
+
+template<typename Iterator>
+  struct iterator_difference
+{
+  typedef typename thrust::iterator_traits<Iterator>::difference_type type;
+}; // end iterator_difference
+
+namespace detail
+{
+
+template <typename Iterator, typename = void>
+struct iterator_system_impl {};
+
+template <typename Iterator>
+struct iterator_system_impl<
+  Iterator
+, typename voider<
+    typename iterator_traits<Iterator>::iterator_category
+  >::type
+>
+  : detail::iterator_category_to_system<
+      typename iterator_traits<Iterator>::iterator_category
+    >
+{}; 
+
+} // namespace detail
+
+template <typename Iterator>
+struct iterator_system : detail::iterator_system_impl<Iterator> {};
+
+// specialize iterator_system for void *, which has no category
+template<>
+  struct iterator_system<void *>
+{
+  typedef thrust::iterator_system<int*>::type type;
+}; // end iterator_system<void*>
+
+template<>
+  struct iterator_system<const void *>
+{
+  typedef thrust::iterator_system<const int*>::type type;
+}; // end iterator_system<void*>
+
+
+template <typename Iterator>
+  struct iterator_traversal
+    : detail::iterator_category_to_traversal<
+        typename thrust::iterator_traits<Iterator>::iterator_category
+      >
+{
+}; // end iterator_traversal
+
+namespace detail
+{
+
+template <typename T>
+  struct is_iterator_traversal
+    : thrust::detail::is_convertible<T, incrementable_traversal_tag>
+{
+}; // end is_iterator_traversal
+
+
+template<typename T>
+  struct is_iterator_system
+    : detail::or_<
+        detail::is_convertible<T, any_system_tag>,
+        detail::or_<
+          detail::is_convertible<T, host_system_tag>,
+          detail::is_convertible<T, device_system_tag>
+        >
+      >
+{
+}; // end is_iterator_system
+
+
+} // end namespace detail
+} // end namespace thrust
+
diff --git a/thrust/thrust/iterator/detail/iterator_traversal_tags.h b/thrust/thrust/iterator/detail/iterator_traversal_tags.h
new file mode 100644
index 0000000000000000000000000000000000000000..73cd1f76af298ab1e88aad2c91c9266be77d793f
--- /dev/null
+++ b/thrust/thrust/iterator/detail/iterator_traversal_tags.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+// define Boost's traversal tags
+struct no_traversal_tag {};
+
+struct incrementable_traversal_tag
+  : no_traversal_tag {};
+
+struct single_pass_traversal_tag
+  : incrementable_traversal_tag {};
+
+struct forward_traversal_tag
+  : single_pass_traversal_tag {};
+
+struct bidirectional_traversal_tag
+  : forward_traversal_tag {};
+
+struct random_access_traversal_tag
+  : bidirectional_traversal_tag {};
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/join_iterator.h b/thrust/thrust/iterator/detail/join_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ab99ce47fb5d0dcb2bf62aebd94cf39d659e0b0
--- /dev/null
+++ b/thrust/thrust/iterator/detail/join_iterator.h
@@ -0,0 +1,134 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/detail/type_traits.h>
+
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Difference,
+         typename Reference>
+class join_iterator;
+
+
+namespace join_iterator_detail
+{
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Difference,
+         typename Reference>
+struct join_iterator_base
+{
+  typedef typename thrust::detail::remove_reference<Reference>::type value_type;
+
+  typedef typename thrust::iterator_system<RandomAccessIterator1>::type  system1;
+  typedef typename thrust::iterator_system<RandomAccessIterator2>::type  system2;
+  typedef typename thrust::detail::minimum_system<system1,system2>::type system;
+
+  typedef thrust::iterator_adaptor<
+    join_iterator<RandomAccessIterator1,RandomAccessIterator2,Difference,Reference>,
+    thrust::counting_iterator<Difference>,
+    value_type,
+    system,
+    thrust::random_access_traversal_tag,
+    Reference,
+    Difference
+  > type;
+}; // end join_iterator_base
+
+
+} // end join_iterator_detail
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Difference = typename thrust::iterator_difference<RandomAccessIterator1>::type,
+         typename Reference  = typename thrust::iterator_value<RandomAccessIterator1>::type>
+class join_iterator
+  : public join_iterator_detail::join_iterator_base<RandomAccessIterator1, RandomAccessIterator2, Difference, Reference>::type
+{
+  private:
+    typedef typename join_iterator_detail::join_iterator_base<RandomAccessIterator1, RandomAccessIterator2, Difference, Reference>::type super_t;
+    typedef typename super_t::difference_type size_type;
+
+  public:
+    inline __host__ __device__
+    join_iterator(RandomAccessIterator1 first1, size_type n, RandomAccessIterator2 first2)
+      : super_t(thrust::counting_iterator<size_type>(0)),
+        m_n1(n),
+        m_iter1(first1),
+        m_iter2(first2 - m_n1)
+    {}
+
+
+    inline __host__ __device__
+    join_iterator(const join_iterator &other)
+      : super_t(other),
+        m_n1(other.m_n1),
+        m_iter1(other.m_iter1),
+        m_iter2(other.m_iter2)
+    {}
+
+
+  private:
+    friend class thrust::iterator_core_access;
+
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      size_type i = *super_t::base();
+      return (i < m_n1) ? m_iter1[i] : static_cast<typename super_t::reference>(m_iter2[i]);
+    } // end dereference()
+
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
+
+    size_type m_n1;
+    RandomAccessIterator1 m_iter1;
+    RandomAccessIterator2 m_iter2;
+}; // end join_iterator
+
+
+template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
+__host__ __device__
+join_iterator<RandomAccessIterator1,RandomAccessIterator2,Size> make_join_iterator(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2)
+{
+  return join_iterator<RandomAccessIterator1,RandomAccessIterator2,Size>(first1, n1, first2);
+} // end make_join_iterator()
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/minimum_category.h b/thrust/thrust/iterator/detail/minimum_category.h
new file mode 100644
index 0000000000000000000000000000000000000000..abb80d8c1048353490ab6c4ddc238af1bea76b9f
--- /dev/null
+++ b/thrust/thrust/iterator/detail/minimum_category.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits/minimum_type.h>
+
+namespace thrust
+{
+
+namespace detail
+{ 
+
+template<typename T1,
+         typename T2  = minimum_type_detail::any_conversion,
+         typename T3  = minimum_type_detail::any_conversion,
+         typename T4  = minimum_type_detail::any_conversion,
+         typename T5  = minimum_type_detail::any_conversion,
+         typename T6  = minimum_type_detail::any_conversion,
+         typename T7  = minimum_type_detail::any_conversion,
+         typename T8  = minimum_type_detail::any_conversion,
+         typename T9  = minimum_type_detail::any_conversion,
+         typename T10 = minimum_type_detail::any_conversion,
+         typename T11 = minimum_type_detail::any_conversion,
+         typename T12 = minimum_type_detail::any_conversion,
+         typename T13 = minimum_type_detail::any_conversion,
+         typename T14 = minimum_type_detail::any_conversion,
+         typename T15 = minimum_type_detail::any_conversion,
+         typename T16 = minimum_type_detail::any_conversion>
+  struct minimum_category
+    : minimum_type<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16>
+{
+}; // end minimum_category
+
+} // end detail
+
+} // end thrust
+
+
diff --git a/thrust/thrust/iterator/detail/minimum_system.h b/thrust/thrust/iterator/detail/minimum_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..45b5a592fc5796892c143bf677ca988d788ec20d
--- /dev/null
+++ b/thrust/thrust/iterator/detail/minimum_system.h
@@ -0,0 +1,82 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/is_metafunction_defined.h>
+#include <thrust/detail/type_traits/minimum_type.h>
+
+namespace thrust
+{
+namespace detail
+{ 
+
+
+template<typename T1,
+         typename T2  = void,
+         typename T3  = void,
+         typename T4  = void,
+         typename T5  = void,
+         typename T6  = void,
+         typename T7  = void,
+         typename T8  = void,
+         typename T9  = void,
+         typename T10 = void,
+         typename T11 = void,
+         typename T12 = void,
+         typename T13 = void,
+         typename T14 = void,
+         typename T15 = void,
+         typename T16 = void>
+  struct unrelated_systems {};
+
+
+// if a minimum system exists for these arguments, return it
+// otherwise, collect the arguments and report them as unrelated
+template<typename T1,
+         typename T2  = minimum_type_detail::any_conversion,
+         typename T3  = minimum_type_detail::any_conversion,
+         typename T4  = minimum_type_detail::any_conversion,
+         typename T5  = minimum_type_detail::any_conversion,
+         typename T6  = minimum_type_detail::any_conversion,
+         typename T7  = minimum_type_detail::any_conversion,
+         typename T8  = minimum_type_detail::any_conversion,
+         typename T9  = minimum_type_detail::any_conversion,
+         typename T10 = minimum_type_detail::any_conversion,
+         typename T11 = minimum_type_detail::any_conversion,
+         typename T12 = minimum_type_detail::any_conversion,
+         typename T13 = minimum_type_detail::any_conversion,
+         typename T14 = minimum_type_detail::any_conversion,
+         typename T15 = minimum_type_detail::any_conversion,
+         typename T16 = minimum_type_detail::any_conversion>
+  struct minimum_system
+    : thrust::detail::eval_if<
+        is_metafunction_defined<
+          minimum_type<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16>
+        >::value,
+        minimum_type<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16>,
+        thrust::detail::identity_<
+          unrelated_systems<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16>
+        >
+      >
+{}; // end minimum_system
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/normal_iterator.h b/thrust/thrust/iterator/detail/normal_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f6e1660e8f4692b08bca7af2a971c3e7cf554e1
--- /dev/null
+++ b/thrust/thrust/iterator/detail/normal_iterator.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file normal_iterator.h
+ *  \brief Defines the interface to an iterator class
+ *         which adapts a pointer type.
+ */
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+template<typename Pointer>
+  class normal_iterator
+    : public iterator_adaptor<
+        normal_iterator<Pointer>,
+        Pointer
+      >
+{
+  typedef iterator_adaptor<normal_iterator<Pointer>, Pointer> super_t;
+
+  public:
+    __host__ __device__
+    normal_iterator() {}
+
+    __host__ __device__
+    normal_iterator(Pointer p)
+      : super_t(p) {}
+    
+    template<typename OtherPointer>
+    __host__ __device__
+    normal_iterator(const normal_iterator<OtherPointer> &other,
+                    typename thrust::detail::enable_if_convertible<
+                      OtherPointer,
+                      Pointer
+                    >::type * = 0)
+      : super_t(other.base()) {}
+
+}; // end normal_iterator
+
+
+template<typename Pointer>
+  inline __host__ __device__ normal_iterator<Pointer> make_normal_iterator(Pointer ptr)
+{
+  return normal_iterator<Pointer>(ptr);
+}
+
+} // end detail
+
+template <typename T>
+struct proclaim_contiguous_iterator<
+  thrust::detail::normal_iterator<T>
+> : true_type {};
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/permutation_iterator_base.h b/thrust/thrust/iterator/detail/permutation_iterator_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..2610cfdfaffdeb50ad085f90d4ff9b85920ede4f
--- /dev/null
+++ b/thrust/thrust/iterator/detail/permutation_iterator_base.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+
+namespace thrust
+{
+
+template<typename,typename> class permutation_iterator;
+
+
+namespace detail
+{
+
+template<typename ElementIterator,
+         typename IndexIterator>
+  struct permutation_iterator_base
+{
+  typedef typename thrust::iterator_system<ElementIterator>::type System1;
+  typedef typename thrust::iterator_system<IndexIterator>::type System2;
+
+  typedef thrust::iterator_adaptor<
+    permutation_iterator<ElementIterator,IndexIterator>,
+    IndexIterator,
+    typename thrust::iterator_value<ElementIterator>::type,
+    typename detail::minimum_system<System1,System2>::type,
+    thrust::use_default,
+    typename thrust::iterator_reference<ElementIterator>::type
+  > type;
+}; // end permutation_iterator_base
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/retag.h b/thrust/thrust/iterator/detail/retag.h
new file mode 100644
index 0000000000000000000000000000000000000000..a512d3640d6213d9d446bda7ce5cd7be24dc6608
--- /dev/null
+++ b/thrust/thrust/iterator/detail/retag.h
@@ -0,0 +1,148 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/tagged_iterator.h>
+#include <thrust/detail/pointer.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// we can retag an iterator if FromTag converts to ToTag
+// or vice versa
+template<typename FromTag, typename ToTag>
+  struct is_retaggable
+    : integral_constant<
+        bool,
+        (is_convertible<FromTag,ToTag>::value || is_convertible<ToTag,FromTag>::value)
+      >
+{};
+
+
+template<typename FromTag, typename ToTag, typename Result>
+  struct enable_if_retaggable
+    : enable_if<
+        is_retaggable<FromTag,ToTag>::value,
+        Result
+      >
+{}; // end enable_if_retaggable
+
+
+} // end detail
+
+
+template<typename Tag, typename Iterator>
+__host__ __device__
+  thrust::detail::tagged_iterator<Iterator,Tag>
+    reinterpret_tag(Iterator iter)
+{
+  return thrust::detail::tagged_iterator<Iterator,Tag>(iter);
+} // end reinterpret_tag()
+
+
+// specialization for raw pointer
+template<typename Tag, typename T>
+__host__ __device__
+  thrust::pointer<T,Tag>
+    reinterpret_tag(T *ptr)
+{
+  return thrust::pointer<T,Tag>(ptr);
+} // end reinterpret_tag()
+
+
+// specialization for thrust::pointer
+template<typename Tag, typename T, typename OtherTag, typename Reference, typename Derived>
+__host__ __device__
+  thrust::pointer<T,Tag>
+    reinterpret_tag(thrust::pointer<T,OtherTag,Reference,Derived> ptr)
+{
+  return reinterpret_tag<Tag>(ptr.get());
+} // end reinterpret_tag()
+
+
+// avoid deeply-nested tagged_iterator
+template<typename Tag, typename BaseIterator, typename OtherTag>
+__host__ __device__
+  thrust::detail::tagged_iterator<BaseIterator,Tag>
+    reinterpret_tag(thrust::detail::tagged_iterator<BaseIterator,OtherTag> iter)
+{
+  return reinterpret_tag<Tag>(iter.base());
+} // end reinterpret_tag()
+
+
+template<typename Tag, typename Iterator>
+__host__ __device__
+  typename thrust::detail::enable_if_retaggable<
+    typename thrust::iterator_system<Iterator>::type,
+    Tag,
+    thrust::detail::tagged_iterator<Iterator,Tag>
+  >::type
+    retag(Iterator iter)
+{
+  return reinterpret_tag<Tag>(iter);
+} // end retag()
+
+
+// specialization for raw pointer
+template<typename Tag, typename T>
+__host__ __device__
+  typename thrust::detail::enable_if_retaggable<
+    typename thrust::iterator_system<T*>::type,
+    Tag,
+    thrust::pointer<T,Tag>
+  >::type
+    retag(T *ptr)
+{
+  return reinterpret_tag<Tag>(ptr);
+} // end retag()
+
+
+// specialization for thrust::pointer
+template<typename Tag, typename T, typename OtherTag>
+__host__ __device__
+  typename thrust::detail::enable_if_retaggable<
+    OtherTag,
+    Tag,
+    thrust::pointer<T,Tag>
+  >::type
+    retag(thrust::pointer<T,OtherTag> ptr)
+{
+  return reinterpret_tag<Tag>(ptr);
+} // end retag()
+
+
+// avoid deeply-nested tagged_iterator
+template<typename Tag, typename BaseIterator, typename OtherTag>
+__host__ __device__
+  typename thrust::detail::enable_if_retaggable<
+    OtherTag,
+    Tag,
+    thrust::detail::tagged_iterator<BaseIterator,Tag>
+  >::type
+    retag(thrust::detail::tagged_iterator<BaseIterator,OtherTag> iter)
+{
+  return reinterpret_tag<Tag>(iter);
+} // end retag()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/reverse_iterator.inl b/thrust/thrust/iterator/detail/reverse_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..bb96c497fc5937d9e94c55d42077b814fc049114
--- /dev/null
+++ b/thrust/thrust/iterator/detail/reverse_iterator.inl
@@ -0,0 +1,115 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+__thrust_exec_check_disable__
+template<typename Iterator>
+  __host__ __device__
+  Iterator prior(Iterator x)
+{
+  return --x;
+} // end prior()
+
+} // end detail
+
+template<typename BidirectionalIterator>
+  __host__ __device__
+  reverse_iterator<BidirectionalIterator>
+    ::reverse_iterator(BidirectionalIterator x)
+      :super_t(x)
+{
+} // end reverse_iterator::reverse_iterator()
+
+template<typename BidirectionalIterator>
+  template<typename OtherBidirectionalIterator>
+    __host__ __device__
+    reverse_iterator<BidirectionalIterator>
+      ::reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
+// XXX msvc screws this up
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                     , typename thrust::detail::enable_if<
+                         thrust::detail::is_convertible<
+                           OtherBidirectionalIterator,
+                           BidirectionalIterator
+                         >::value
+                       >::type *
+#endif // MSVC
+                     )
+        :super_t(r.base())
+{
+} // end reverse_iterator::reverse_iterator()
+
+template<typename BidirectionalIterator>
+  __host__ __device__
+  typename reverse_iterator<BidirectionalIterator>::super_t::reference
+    reverse_iterator<BidirectionalIterator>
+      ::dereference() const
+{
+  return *thrust::detail::prior(this->base());
+} // end reverse_iterator::increment()
+
+template<typename BidirectionalIterator>
+  __host__ __device__
+  void reverse_iterator<BidirectionalIterator>
+    ::increment()
+{
+  --this->base_reference();
+} // end reverse_iterator::increment()
+
+template<typename BidirectionalIterator>
+  __host__ __device__
+  void reverse_iterator<BidirectionalIterator>
+    ::decrement()
+{
+  ++this->base_reference();
+} // end reverse_iterator::decrement()
+
+template<typename BidirectionalIterator>
+  __host__ __device__
+  void reverse_iterator<BidirectionalIterator>
+    ::advance(typename super_t::difference_type n)
+{
+  this->base_reference() += -n;
+} // end reverse_iterator::advance()
+
+template<typename BidirectionalIterator>
+  template<typename OtherBidirectionalIterator>
+    __host__ __device__
+    typename reverse_iterator<BidirectionalIterator>::super_t::difference_type
+      reverse_iterator<BidirectionalIterator>
+        ::distance_to(reverse_iterator<OtherBidirectionalIterator> const &y) const
+{
+  return this->base_reference() - y.base();
+} // end reverse_iterator::distance_to()
+
+template<typename BidirectionalIterator>
+__host__ __device__
+reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalIterator x)
+{
+  return reverse_iterator<BidirectionalIterator>(x);
+} // end make_reverse_iterator()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/reverse_iterator_base.h b/thrust/thrust/iterator/detail/reverse_iterator_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..68fa1f2f818a456bc53f7cb81aaa425a63e475ff
--- /dev/null
+++ b/thrust/thrust/iterator/detail/reverse_iterator_base.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+template <typename> class reverse_iterator;
+
+namespace detail
+{
+
+template<typename BidirectionalIterator>
+  struct reverse_iterator_base
+{
+  typedef thrust::iterator_adaptor<
+    thrust::reverse_iterator<BidirectionalIterator>,
+    BidirectionalIterator
+  > type;
+}; // end reverse_iterator_base
+
+} // end detail
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/tagged_iterator.h b/thrust/thrust/iterator/detail/tagged_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..125a4675e89894fd9f919da7ad410406dc1328d5
--- /dev/null
+++ b/thrust/thrust/iterator/detail/tagged_iterator.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/use_default.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+template <typename,typename> class tagged_iterator;
+
+template<typename Iterator, typename Tag>
+  struct tagged_iterator_base
+{
+  typedef thrust::iterator_adaptor<
+    tagged_iterator<Iterator,Tag>,
+    Iterator,
+    typename thrust::iterator_value<Iterator>::type,
+    Tag,
+    typename thrust::iterator_traversal<Iterator>::type,
+    typename thrust::iterator_reference<Iterator>::type,
+    typename thrust::iterator_difference<Iterator>::type
+  > type;
+}; // end tagged_iterator_base
+
+template<typename Iterator, typename Tag>
+  class tagged_iterator
+    : public tagged_iterator_base<Iterator,Tag>::type
+{
+  private:
+    typedef typename tagged_iterator_base<Iterator,Tag>::type super_t;
+
+  public:
+    __host__ __device__
+    tagged_iterator() {}
+
+    __host__ __device__
+    explicit tagged_iterator(Iterator x)
+      : super_t(x) {}
+}; // end tagged_iterator
+
+} // end detail
+
+// tagged_iterator is trivial if its base iterator is.
+template <typename BaseIterator, typename Tag>
+struct proclaim_contiguous_iterator<
+  detail::tagged_iterator<BaseIterator, Tag>
+> : is_contiguous_iterator<BaseIterator> {};
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/transform_input_output_iterator.inl b/thrust/thrust/iterator/detail/transform_input_output_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b3c9e1bc51e306a26d09ac4868e7fbeb571de312
--- /dev/null
+++ b/thrust/thrust/iterator/detail/transform_input_output_iterator.inl
@@ -0,0 +1,114 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/iterator_adaptor.h>
+
+namespace thrust
+{
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator;
+
+namespace detail 
+{
+
+// Proxy reference that invokes InputFunction when reading from and
+// OutputFunction when writing to the dereferenced iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator_proxy
+{
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+  // std::result_of is deprecated in 2017, replace with std::invoke_result
+#if THRUST_CPP_DIALECT < 2017
+  using Value = typename std::result_of<InputFunction(iterator_value_type)>::type;
+#else
+  using Value = std::invoke_result_t<InputFunction, iterator_value_type>;
+#endif
+
+  public:
+    __host__ __device__
+    transform_input_output_iterator_proxy(const Iterator& io, InputFunction input_function, OutputFunction output_function)
+      : io(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    transform_input_output_iterator_proxy(const transform_input_output_iterator_proxy&) = default;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    operator Value const() const
+    {
+      return input_function(*io);
+    }
+
+    __thrust_exec_check_disable__
+    template <typename T>
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const T& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const transform_input_output_iterator_proxy& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+  private:
+    Iterator io;
+    InputFunction input_function;
+    OutputFunction output_function;
+};
+
+// Compute the iterator_adaptor instantiation to be used for transform_input_output_iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct transform_input_output_iterator_base
+{
+private:
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+public:
+    typedef thrust::iterator_adaptor
+    <
+        transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+      , Iterator
+    // std::result_of is deprecated in 2017, replace with std::invoke_result
+#if THRUST_CPP_DIALECT < 2017
+      , typename std::result_of<InputFunction(iterator_value_type)>::type
+#else
+      , std::invoke_result_t<InputFunction, iterator_value_type>
+#endif
+      , thrust::use_default
+      , thrust::use_default
+      , transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator>
+    > type;
+};
+
+// Register transform_input_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct is_proxy_reference<
+    transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator> >
+    : public thrust::detail::true_type {};
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/transform_iterator.inl b/thrust/thrust/iterator/detail/transform_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..65eee868737267a35f1d73b6ae722ff7f28aad69
--- /dev/null
+++ b/thrust/thrust/iterator/detail/transform_iterator.inl
@@ -0,0 +1,72 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+
+namespace thrust
+{
+
+template <class UnaryFunction, class Iterator, class Reference, class Value>
+  class transform_iterator;
+  
+namespace detail 
+{
+
+// Compute the iterator_adaptor instantiation to be used for transform_iterator
+template <class UnaryFunc, class Iterator, class Reference, class Value>
+struct transform_iterator_base
+{
+ private:
+    // By default, dereferencing the iterator yields the same as the function.
+    typedef typename thrust::detail::ia_dflt_help<
+      Reference,
+      thrust::detail::result_of_adaptable_function<UnaryFunc(typename thrust::iterator_value<Iterator>::type)>
+    >::type reference;
+
+    // To get the default for Value: remove any reference on the
+    // result type, but retain any constness to signal
+    // non-writability.  Note that if we adopt Thomas' suggestion
+    // to key non-writability *only* on the Reference argument,
+    // we'd need to strip constness here as well.
+    typedef typename thrust::detail::ia_dflt_help<
+      Value,
+      thrust::detail::remove_reference<reference>
+    >::type cv_value_type;
+
+ public:
+    typedef thrust::iterator_adaptor
+    <
+        transform_iterator<UnaryFunc, Iterator, Reference, Value>
+      , Iterator
+      , cv_value_type
+      , thrust::use_default   // Leave the system alone
+        //, thrust::use_default   // Leave the traversal alone
+        // use the Iterator's category to let any system iterators remain random access even though
+        // transform_iterator's reference type may not be a reference
+        // XXX figure out why only iterators whose reference types are true references are random access
+        , typename thrust::iterator_traits<Iterator>::iterator_category
+      , reference
+    > type;
+};
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/transform_output_iterator.inl b/thrust/thrust/iterator/detail/transform_output_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..91f657ca7207462e66814934f21c2c2b5278b9a1
--- /dev/null
+++ b/thrust/thrust/iterator/detail/transform_output_iterator.inl
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2008-2016 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+
+namespace thrust
+{
+
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator;
+
+namespace detail 
+{
+
+// Proxy reference that uses Unary Function to transform the rhs of assigment
+// operator before writing the result to OutputIterator
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator_proxy
+{
+  public:
+    __host__ __device__
+    transform_output_iterator_proxy(const OutputIterator& out, UnaryFunction fun) : out(out), fun(fun)
+    {
+    }
+
+    __thrust_exec_check_disable__
+    template <typename T>
+    __host__ __device__
+    transform_output_iterator_proxy operator=(const T& x)
+    {
+      *out = fun(x);
+      return *this;
+    }
+
+  private:
+    OutputIterator out;
+    UnaryFunction fun;
+};
+
+// Compute the iterator_adaptor instantiation to be used for transform_output_iterator
+template <typename UnaryFunction, typename OutputIterator>
+struct transform_output_iterator_base
+{
+    typedef thrust::iterator_adaptor
+    <
+        transform_output_iterator<UnaryFunction, OutputIterator>
+      , OutputIterator
+      , thrust::use_default
+      , thrust::use_default
+      , thrust::use_default
+      , transform_output_iterator_proxy<UnaryFunction, OutputIterator>
+    > type;
+};
+
+// Register transform_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class UnaryFunction, class OutputIterator>
+struct is_proxy_reference<
+    transform_output_iterator_proxy<UnaryFunction, OutputIterator> >
+    : public thrust::detail::true_type {};
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/tuple_of_iterator_references.h b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h
new file mode 100644
index 0000000000000000000000000000000000000000..93d7e05e4f0edb5752b33382186437485ec95fdd
--- /dev/null
+++ b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h
@@ -0,0 +1,263 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/tuple.h>
+#include <thrust/pair.h>
+#include <thrust/detail/reference_forward_declaration.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+  
+template<
+  typename T0, typename T1, typename T2,
+  typename T3, typename T4, typename T5,
+  typename T6, typename T7, typename T8,
+  typename T9
+>
+  class tuple_of_iterator_references
+    : public thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+{
+  private:
+    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> super_t;
+
+  public:
+    // allow implicit construction from tuple<refs>
+    inline __host__ __device__
+    tuple_of_iterator_references(const super_t &other)
+      : super_t(other)
+    {}
+
+    // allow assignment from tuples
+    // XXX might be worthwhile to guard this with an enable_if is_assignable
+    __thrust_exec_check_disable__
+    template<typename U1, typename U2>
+    inline __host__ __device__
+    tuple_of_iterator_references &operator=(const detail::cons<U1,U2> &other)
+    {
+      super_t::operator=(other);
+      return *this;
+    }
+
+    // allow assignment from pairs
+    // XXX might be worthwhile to guard this with an enable_if is_assignable
+    __thrust_exec_check_disable__
+    template<typename U1, typename U2>
+    inline __host__ __device__
+    tuple_of_iterator_references &operator=(const thrust::pair<U1,U2> &other)
+    {
+      super_t::operator=(other);
+      return *this;
+    }
+
+    // allow assignment from reference<tuple>
+    // XXX perhaps we should generalize to reference<T>
+    //     we could captures reference<pair> this way
+    __thrust_exec_check_disable__
+    template<typename U0, typename U1, typename U2,
+             typename U3, typename U4, typename U5,
+             typename U6, typename U7, typename U8,
+             typename U9,
+             typename Pointer, typename Derived>
+    inline __host__ __device__
+// XXX gcc-4.2 crashes on is_assignable
+//    typename thrust::detail::enable_if<
+//      thrust::detail::is_assignable<
+//        super_t,
+//        const thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>
+//      >::value,
+//      tuple_of_iterator_references &
+//    >::type
+    tuple_of_iterator_references &
+    operator=(const thrust::reference<thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>, Pointer, Derived> &other)
+    {
+      typedef thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> tuple_type;
+
+      // XXX perhaps this could be accelerated
+      tuple_type other_tuple = other;
+      super_t::operator=(other_tuple);
+      return *this;
+    }
+
+
+    // duplicate thrust::tuple's constructors
+    inline __host__ __device__
+    tuple_of_iterator_references() {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0)
+      : super_t(t0,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1)
+      : super_t(t0, t1,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2)
+      : super_t(t0, t1, t2,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3)
+      : super_t(t0, t1, t2, t3,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4)
+      : super_t(t0, t1, t2, t3, t4,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4,
+                                 typename access_traits<T5>::parameter_type t5)
+      : super_t(t0, t1, t2, t3, t4, t5,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4,
+                                 typename access_traits<T5>::parameter_type t5,
+                                 typename access_traits<T6>::parameter_type t6)
+      : super_t(t0, t1, t2, t3, t4, t5, t6,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4,
+                                 typename access_traits<T5>::parameter_type t5,
+                                 typename access_traits<T6>::parameter_type t6,
+                                 typename access_traits<T7>::parameter_type t7)
+      : super_t(t0, t1, t2, t3, t4, t5, t6, t7,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4,
+                                 typename access_traits<T5>::parameter_type t5,
+                                 typename access_traits<T6>::parameter_type t6,
+                                 typename access_traits<T7>::parameter_type t7,
+                                 typename access_traits<T8>::parameter_type t8)
+      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8,
+                static_cast<const null_type&>(null_type()))
+    {}
+
+    inline __host__ __device__ 
+    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
+                                 typename access_traits<T1>::parameter_type t1,
+                                 typename access_traits<T2>::parameter_type t2,
+                                 typename access_traits<T3>::parameter_type t3,
+                                 typename access_traits<T4>::parameter_type t4,
+                                 typename access_traits<T5>::parameter_type t5,
+                                 typename access_traits<T6>::parameter_type t6,
+                                 typename access_traits<T7>::parameter_type t7,
+                                 typename access_traits<T8>::parameter_type t8,
+                                 typename access_traits<T9>::parameter_type t9)
+      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+    {}
+};
+
+
+// this overload of swap() permits swapping tuple_of_iterator_references returned as temporaries from
+// iterator dereferences
+template<
+  typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9,
+  typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9
+>
+inline __host__ __device__
+void swap(tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> x,
+          tuple_of_iterator_references<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> y)
+{
+  x.swap(y);
+}
+
+
+} // end detail
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/universal_categories.h b/thrust/thrust/iterator/detail/universal_categories.h
new file mode 100644
index 0000000000000000000000000000000000000000..2389796b190ae04772882edd2a83c7642cdf8103
--- /dev/null
+++ b/thrust/thrust/iterator/detail/universal_categories.h
@@ -0,0 +1,87 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_categories.h>
+
+// XXX eliminate this file
+
+namespace thrust
+{
+
+// define these types without inheritance to avoid ambiguous conversion to base classes
+
+struct input_universal_iterator_tag
+{
+  operator input_host_iterator_tag () {return input_host_iterator_tag();}
+
+  operator input_device_iterator_tag () {return input_device_iterator_tag();}
+};
+
+struct output_universal_iterator_tag
+{
+  operator output_host_iterator_tag () {return output_host_iterator_tag();}
+
+  operator output_device_iterator_tag () {return output_device_iterator_tag();}
+};
+
+struct forward_universal_iterator_tag
+  : input_universal_iterator_tag
+{
+  operator forward_host_iterator_tag () {return forward_host_iterator_tag();};
+
+  operator forward_device_iterator_tag () {return forward_device_iterator_tag();};
+};
+
+struct bidirectional_universal_iterator_tag
+  : forward_universal_iterator_tag
+{
+  operator bidirectional_host_iterator_tag () {return bidirectional_host_iterator_tag();};
+
+  operator bidirectional_device_iterator_tag () {return bidirectional_device_iterator_tag();};
+};
+
+
+namespace detail
+{
+
+// create this struct to control conversion precedence in random_access_universal_iterator_tag
+template<typename T>
+struct one_degree_of_separation
+  : T
+{
+};
+
+} // end detail
+
+
+struct random_access_universal_iterator_tag
+{
+  // these conversions are all P0
+  operator random_access_host_iterator_tag () {return random_access_host_iterator_tag();};
+
+  operator random_access_device_iterator_tag () {return random_access_device_iterator_tag();};
+
+  // bidirectional_universal_iterator_tag is P1
+  operator detail::one_degree_of_separation<bidirectional_universal_iterator_tag> () {return detail::one_degree_of_separation<bidirectional_universal_iterator_tag>();}
+
+};
+
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/zip_iterator.inl b/thrust/thrust/iterator/detail/zip_iterator.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7eb35b091b9f69209ea198a3cc08abb61ed58af4
--- /dev/null
+++ b/thrust/thrust/iterator/detail/zip_iterator.inl
@@ -0,0 +1,143 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/tuple_transform.h>
+
+namespace thrust
+{
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  zip_iterator<IteratorTuple>
+    ::zip_iterator()
+{
+} // end zip_iterator::zip_iterator()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  zip_iterator<IteratorTuple>
+    ::zip_iterator(IteratorTuple iterator_tuple)
+      :m_iterator_tuple(iterator_tuple)
+{
+} // end zip_iterator::zip_iterator()
+
+
+template<typename IteratorTuple>
+  template<typename OtherIteratorTuple>
+  __host__ __device__
+    zip_iterator<IteratorTuple>
+      ::zip_iterator(const zip_iterator<OtherIteratorTuple> &other,
+                     typename thrust::detail::enable_if_convertible<
+                       OtherIteratorTuple,
+                       IteratorTuple
+                     >::type *)
+        :m_iterator_tuple(other.get_iterator_tuple())
+{
+} // end zip_iterator::zip_iterator()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+const IteratorTuple &zip_iterator<IteratorTuple>
+  ::get_iterator_tuple() const
+{
+  return m_iterator_tuple;
+} // end zip_iterator::get_iterator_tuple()
+
+
+template<typename IteratorTuple>
+  typename zip_iterator<IteratorTuple>::super_t::reference
+  __host__ __device__
+    zip_iterator<IteratorTuple>
+      ::dereference() const
+{
+  using namespace detail::tuple_impl_specific;
+
+  return thrust::detail::tuple_host_device_transform<
+    detail::dereference_iterator::template apply
+  >(get_iterator_tuple(), detail::dereference_iterator());
+} // end zip_iterator::dereference()
+
+
+__thrust_exec_check_disable__
+template<typename IteratorTuple>
+  template<typename OtherIteratorTuple>
+  __host__ __device__
+    bool zip_iterator<IteratorTuple>
+      ::equal(const zip_iterator<OtherIteratorTuple> &other) const
+{
+  return get<0>(get_iterator_tuple()) == get<0>(other.get_iterator_tuple());
+} // end zip_iterator::equal()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  void zip_iterator<IteratorTuple>
+    ::advance(typename super_t::difference_type n)
+{
+  using namespace detail::tuple_impl_specific;
+  tuple_for_each(m_iterator_tuple,
+                 detail::advance_iterator<typename super_t::difference_type>(n));
+} // end zip_iterator::advance()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  void zip_iterator<IteratorTuple>
+    ::increment()
+{
+  using namespace detail::tuple_impl_specific;
+  tuple_for_each(m_iterator_tuple, detail::increment_iterator());
+} // end zip_iterator::increment()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  void zip_iterator<IteratorTuple>
+    ::decrement()
+{
+  using namespace detail::tuple_impl_specific;
+  tuple_for_each(m_iterator_tuple, detail::decrement_iterator());
+} // end zip_iterator::decrement()
+
+
+__thrust_exec_check_disable__
+template<typename IteratorTuple>
+  template <typename OtherIteratorTuple>
+  __host__ __device__
+    typename zip_iterator<IteratorTuple>::super_t::difference_type
+      zip_iterator<IteratorTuple>
+        ::distance_to(const zip_iterator<OtherIteratorTuple> &other) const
+{
+  return get<0>(other.get_iterator_tuple()) - get<0>(get_iterator_tuple());
+} // end zip_iterator::distance_to()
+
+
+template<typename IteratorTuple>
+__host__ __device__
+  zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t)
+{
+  return zip_iterator<IteratorTuple>(t);
+} // end make_zip_iterator()
+
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/detail/zip_iterator_base.h b/thrust/thrust/iterator/detail/zip_iterator_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1603aed4d209bbe3d8b8f211857c250b0bb7c3e
--- /dev/null
+++ b/thrust/thrust/iterator/detail/zip_iterator_base.h
@@ -0,0 +1,405 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/advance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/iterator/iterator_categories.h>
+#include <thrust/iterator/detail/minimum_category.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/tuple.h>
+#include <thrust/detail/tuple_meta_transform.h>
+#include <thrust/detail/tuple_transform.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/tuple_of_iterator_references.h>
+
+namespace thrust
+{
+
+// forward declare zip_iterator for zip_iterator_base
+template<typename IteratorTuple> class zip_iterator;
+
+namespace detail
+{
+
+
+// Functors to be used with tuple algorithms
+//
+template<typename DiffType>
+class advance_iterator
+{
+public:
+  inline __host__ __device__
+  advance_iterator(DiffType step) : m_step(step) {}
+
+  __thrust_exec_check_disable__
+  template<typename Iterator>
+  inline __host__ __device__
+  void operator()(Iterator& it) const
+  { thrust::advance(it, m_step); }
+
+private:
+  DiffType m_step;
+}; // end advance_iterator
+
+
+struct increment_iterator
+{
+  __thrust_exec_check_disable__
+  template<typename Iterator>
+  inline __host__ __device__
+  void operator()(Iterator& it)
+  { ++it; }
+}; // end increment_iterator
+
+
+struct decrement_iterator
+{
+  __thrust_exec_check_disable__
+  template<typename Iterator>
+  inline __host__ __device__
+  void operator()(Iterator& it)
+  { --it; }
+}; // end decrement_iterator
+
+
+struct dereference_iterator
+{
+  template<typename Iterator>
+  struct apply
+  { 
+    typedef typename
+      iterator_traits<Iterator>::reference
+    type;
+  }; // end apply
+
+  // XXX silence warnings of the form "calling a __host__ function from a __host__ __device__ function is not allowed
+  __thrust_exec_check_disable__
+  template<typename Iterator>
+  __host__ __device__
+    typename apply<Iterator>::type operator()(Iterator const& it)
+  {
+    return *it;
+  }
+}; // end dereference_iterator
+
+
+// The namespace tuple_impl_specific provides two meta-
+// algorithms and two algorithms for tuples.
+namespace tuple_impl_specific
+{
+
+// define apply1 for tuple_meta_transform_impl
+template<typename UnaryMetaFunctionClass, class Arg>
+  struct apply1
+    : UnaryMetaFunctionClass::template apply<Arg>
+{
+}; // end apply1
+
+
+// define apply2 for tuple_meta_accumulate_impl
+template<typename UnaryMetaFunctionClass, class Arg1, class Arg2>
+  struct apply2
+    : UnaryMetaFunctionClass::template apply<Arg1,Arg2>
+{
+}; // end apply2
+
+
+// Meta-accumulate algorithm for tuples. Note: The template 
+// parameter StartType corresponds to the initial value in 
+// ordinary accumulation.
+//
+template<class Tuple, class BinaryMetaFun, class StartType>
+  struct tuple_meta_accumulate;
+
+template<
+    typename Tuple
+  , class BinaryMetaFun
+  , typename StartType
+>
+  struct tuple_meta_accumulate_impl
+{
+   typedef typename apply2<
+       BinaryMetaFun
+     , typename Tuple::head_type
+     , typename tuple_meta_accumulate<
+           typename Tuple::tail_type
+         , BinaryMetaFun
+         , StartType 
+       >::type
+   >::type type;
+};
+
+
+template<
+    typename Tuple
+  , class BinaryMetaFun
+  , typename StartType
+>
+struct tuple_meta_accumulate
+  : thrust::detail::eval_if<
+        thrust::detail::is_same<Tuple, thrust::null_type>::value
+      , thrust::detail::identity_<StartType>
+      , tuple_meta_accumulate_impl<
+            Tuple
+          , BinaryMetaFun
+          , StartType
+        >
+    > // end eval_if
+{
+}; // end tuple_meta_accumulate
+
+
+// transform algorithm for tuples. The template parameter Fun
+// must be a unary functor which is also a unary metafunction
+// class that computes its return type based on its argument
+// type. For example:
+//
+// struct to_ptr
+// {
+//     template <class Arg>
+//     struct apply
+//     {
+//          typedef Arg* type;
+//     }
+//
+//     template <class Arg>
+//     Arg* operator()(Arg x);
+// };
+
+
+
+// for_each algorithm for tuples.
+template<typename Fun>
+inline __host__ __device__
+Fun tuple_for_each(thrust::null_type, Fun f)
+{
+  return f;
+} // end tuple_for_each()
+
+
+template<typename Tuple, typename Fun>
+inline __host__ __device__
+Fun tuple_for_each(Tuple& t, Fun f)
+{ 
+  f( t.get_head() );
+  return tuple_for_each(t.get_tail(), f);
+} // end tuple_for_each()
+
+
+// Equality of tuples. NOTE: "==" for tuples currently (7/2003)
+// has problems under some compilers, so I just do my own.
+// No point in bringing in a bunch of #ifdefs here. This is
+// going to go away with the next tuple implementation anyway.
+//
+__host__ __device__
+inline bool tuple_equal(thrust::null_type, thrust::null_type)
+{ return true; }
+
+
+template<typename Tuple1, typename Tuple2>
+__host__ __device__
+bool tuple_equal(Tuple1 const& t1, Tuple2 const& t2)
+{ 
+  return t1.get_head() == t2.get_head() && 
+  tuple_equal(t1.get_tail(), t2.get_tail());
+} // end tuple_equal()
+
+} // end end tuple_impl_specific
+
+
+// Metafunction to obtain the type of the tuple whose element types
+// are the value_types of an iterator tupel.
+//
+template<typename IteratorTuple>
+  struct tuple_of_value_types
+    : tuple_meta_transform<
+          IteratorTuple,
+          iterator_value
+        >
+{
+}; // end tuple_of_value_types
+
+
+struct minimum_category_lambda
+{
+  template<typename T1, typename T2>
+    struct apply : minimum_category<T1,T2>
+  {};
+};
+
+
+
+// Metafunction to obtain the minimal traversal tag in a tuple
+// of iterators.
+//
+template<typename IteratorTuple>
+struct minimum_traversal_category_in_iterator_tuple
+{
+  typedef typename tuple_meta_transform<
+      IteratorTuple
+    , thrust::iterator_traversal
+  >::type tuple_of_traversal_tags;
+      
+  typedef typename tuple_impl_specific::tuple_meta_accumulate<
+      tuple_of_traversal_tags
+    , minimum_category_lambda
+    , thrust::random_access_traversal_tag
+  >::type type;
+};
+
+
+struct minimum_system_lambda
+{
+  template<typename T1, typename T2>
+    struct apply : minimum_system<T1,T2>
+  {};
+};
+
+
+
+// Metafunction to obtain the minimal system tag in a tuple
+// of iterators.
+template<typename IteratorTuple>
+struct minimum_system_in_iterator_tuple
+{
+  typedef typename thrust::detail::tuple_meta_transform<
+    IteratorTuple,
+    thrust::iterator_system
+  >::type tuple_of_system_tags;
+
+  typedef typename tuple_impl_specific::tuple_meta_accumulate<
+    tuple_of_system_tags,
+    minimum_system_lambda,
+    thrust::any_system_tag
+  >::type type;
+};
+
+namespace zip_iterator_base_ns
+{
+
+
+template<int i, typename Tuple>
+  struct tuple_elements_helper
+    : eval_if<
+        (i < tuple_size<Tuple>::value),
+        tuple_element<i,Tuple>,
+        identity_<thrust::null_type>
+      >
+{};
+
+
+template<typename Tuple>
+  struct tuple_elements
+{
+  typedef typename tuple_elements_helper<0,Tuple>::type T0;
+  typedef typename tuple_elements_helper<1,Tuple>::type T1;
+  typedef typename tuple_elements_helper<2,Tuple>::type T2;
+  typedef typename tuple_elements_helper<3,Tuple>::type T3;
+  typedef typename tuple_elements_helper<4,Tuple>::type T4;
+  typedef typename tuple_elements_helper<5,Tuple>::type T5;
+  typedef typename tuple_elements_helper<6,Tuple>::type T6;
+  typedef typename tuple_elements_helper<7,Tuple>::type T7;
+  typedef typename tuple_elements_helper<8,Tuple>::type T8;
+  typedef typename tuple_elements_helper<9,Tuple>::type T9;
+};
+
+
+template<typename IteratorTuple>
+  struct tuple_of_iterator_references
+{
+  // get a thrust::tuple of the iterators' references
+  typedef typename tuple_meta_transform<
+    IteratorTuple,
+    iterator_reference
+  >::type tuple_of_references;
+
+  // get at the individual tuple element types by name
+  typedef tuple_elements<tuple_of_references> elements;
+
+  // map thrust::tuple<T...> to tuple_of_iterator_references<T...>
+  typedef thrust::detail::tuple_of_iterator_references<
+    typename elements::T0,
+    typename elements::T1,
+    typename elements::T2,
+    typename elements::T3,
+    typename elements::T4,
+    typename elements::T5,
+    typename elements::T6,
+    typename elements::T7,
+    typename elements::T8,
+    typename elements::T9
+  > type;
+};
+
+
+} // end zip_iterator_base_ns
+
+///////////////////////////////////////////////////////////////////
+//
+// Class zip_iterator_base
+//
+// Builds and exposes the iterator facade type from which the zip 
+// iterator will be derived.
+//
+template<typename IteratorTuple>
+  struct zip_iterator_base
+{
+ //private:
+    // reference type is the type of the tuple obtained from the
+    // iterators' reference types.
+    typedef typename zip_iterator_base_ns::tuple_of_iterator_references<IteratorTuple>::type reference;
+
+    // Boost's Value type is the same as reference type.
+    //typedef reference value_type;
+    typedef typename tuple_of_value_types<IteratorTuple>::type value_type;
+
+    // Difference type is the first iterator's difference type
+    typedef typename thrust::iterator_traits<
+      typename thrust::tuple_element<0, IteratorTuple>::type
+    >::difference_type difference_type;
+
+    // Iterator system is the minimum system tag in the
+    // iterator tuple
+    typedef typename
+    minimum_system_in_iterator_tuple<IteratorTuple>::type system;
+
+    // Traversal category is the minimum traversal category in the
+    // iterator tuple
+    typedef typename
+    minimum_traversal_category_in_iterator_tuple<IteratorTuple>::type traversal_category;
+  
+ public:
+  
+    // The iterator facade type from which the zip iterator will
+    // be derived.
+    typedef thrust::iterator_facade<
+        zip_iterator<IteratorTuple>,
+        value_type,  
+        system,
+        traversal_category,
+        reference,
+        difference_type
+    > type;
+}; // end zip_iterator_base
+
+} // end detail
+
+} // end thrust
+
+
diff --git a/thrust/thrust/iterator/discard_iterator.h b/thrust/thrust/iterator/discard_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1613694d85e83732652510785e09940642746fc
--- /dev/null
+++ b/thrust/thrust/iterator/discard_iterator.h
@@ -0,0 +1,175 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/discard_iterator.h
+ *  \brief An iterator which "discards" (ignores) values assigned to it upon dereference
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/discard_iterator_base.h>
+#include <thrust/iterator/iterator_facade.h>
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p discard_iterator is an iterator which represents a special kind of pointer that
+ *  ignores values written to it upon dereference. This iterator is useful for ignoring
+ *  the output of certain algorithms without wasting memory capacity or bandwidth.
+ *  \p discard_iterator may also be used to count the size of an algorithm's output which
+ *  may not be known a priori.
+ *
+ *  The following code snippet demonstrates how to use \p discard_iterator to ignore
+ *  ignore one of the output ranges of reduce_by_key
+ *
+ *  \code
+ *  #include <thrust/iterator/discard_iterator.h>
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<int> keys(7), values(7);
+ *
+ *    keys[0] = 1;
+ *    keys[1] = 3;
+ *    keys[2] = 3;
+ *    keys[3] = 3;
+ *    keys[4] = 2;
+ *    keys[5] = 2;
+ *    keys[6] = 1;
+ *
+ *    values[0] = 9;
+ *    values[1] = 8;
+ *    values[2] = 7;
+ *    values[3] = 6;
+ *    values[4] = 5;
+ *    values[5] = 4;
+ *    values[6] = 3;
+ *
+ *    thrust::device_vector<int> result(4);
+ *
+ *    // we are only interested in the reduced values
+ *    // use discard_iterator to ignore the output keys
+ *    thrust::reduce_by_key(keys.begin(), keys.end(),
+ *                          values.begin(),
+ *                          thrust::make_discard_iterator(),
+ *                          result.begin());
+ *
+ *    // result is now [9, 21, 9, 3]
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_discard_iterator
+ */
+template<typename System = use_default>
+  class discard_iterator
+    : public detail::discard_iterator_base<System>::type
+{
+    /*! \cond
+     */
+    friend class thrust::iterator_core_access;
+    typedef typename detail::discard_iterator_base<System>::type          super_t;
+    typedef typename detail::discard_iterator_base<System>::incrementable incrementable;
+    typedef typename detail::discard_iterator_base<System>::base_iterator base_iterator;
+
+  public:
+    typedef typename super_t::reference  reference;
+    typedef typename super_t::value_type value_type;
+
+    /*! \endcond
+     */
+
+    /*! Copy constructor copies from a source discard_iterator.
+     *
+     *  \p rhs The discard_iterator to copy.
+     */
+    __host__ __device__
+    discard_iterator(discard_iterator const &rhs)
+      : super_t(rhs.base()) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    discard_iterator & operator=(const discard_iterator &) = default;
+#endif
+
+    /*! This constructor receives an optional index specifying the position of this
+     *  \p discard_iterator in a range.
+     *
+     *  \p i The index of this \p discard_iterator in a range. Defaults to the
+     *       value returned by \c Incrementable's null constructor. For example,
+     *       when <tt>Incrementable == int</tt>, \c 0.
+     */
+    __host__ __device__
+    discard_iterator(incrementable const &i = incrementable())
+      : super_t(base_iterator(i)) {}
+
+    /*! \cond
+     */
+
+  private: // Core iterator interface
+    __host__ __device__
+    reference dereference() const
+    {
+      return m_element;
+    }
+
+    mutable value_type m_element;
+
+    /*! \endcond
+     */
+}; // end constant_iterator
+
+
+/*! \p make_discard_iterator creates a \p discard_iterator from an optional index parameter.
+ *
+ *  \param i The index of the returned \p discard_iterator within a range.
+ *           In the default case, the value of this parameter is \c 0.
+ *
+ *  \return A new \p discard_iterator with index as given by \p i.
+ *
+ *  \see constant_iterator
+ */
+inline __host__ __device__
+discard_iterator<> make_discard_iterator(discard_iterator<>::difference_type i = discard_iterator<>::difference_type(0))
+{
+  return discard_iterator<>(i);
+} // end make_discard_iterator()
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end namespace thrust
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
diff --git a/thrust/thrust/iterator/iterator_adaptor.h b/thrust/thrust/iterator/iterator_adaptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3c9b86554adb3dc1c7b628339e16405da6bf7a6
--- /dev/null
+++ b/thrust/thrust/iterator/iterator_adaptor.h
@@ -0,0 +1,240 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/iterator_adaptor.h
+ *  \brief An iterator which adapts a base iterator
+ */
+
+/*
+ * (C) Copyright David Abrahams 2002.
+ * (C) Copyright Jeremy Siek    2002.
+ * (C) Copyright Thomas Witt    2002.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/detail/use_default.h>
+#include <thrust/iterator/detail/iterator_adaptor_base.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p iterator_adaptor is an iterator which adapts an existing type of iterator to create a new type of
+ *  iterator. Most of Thrust's fancy iterators are defined via inheritance from \p iterator_adaptor.
+ *  While composition of these existing Thrust iterators is often sufficient for expressing the desired
+ *  functionality, it is occasionally more straightforward to derive from \p iterator_adaptor directly.
+ *
+ *  To see how to use \p iterator_adaptor to create a novel iterator type, let's examine how to use it to
+ *  define \p repeat_iterator, a fancy iterator which repeats elements from another range a given number of time:
+ *
+ *  \code
+ *  #include <thrust/iterator/iterator_adaptor.h>
+ *
+ *  // derive repeat_iterator from iterator_adaptor
+ *  template<typename Iterator>
+ *    class repeat_iterator
+ *      : public thrust::iterator_adaptor<
+ *          repeat_iterator<Iterator>, // the first template parameter is the name of the iterator we're creating
+ *          Iterator                   // the second template parameter is the name of the iterator we're adapting
+ *                                     // we can use the default for the additional template parameters
+ *        >
+ *  {
+ *    public:
+ *      // shorthand for the name of the iterator_adaptor we're deriving from
+ *      typedef thrust::iterator_adaptor<
+ *        repeat_iterator<Iterator>,
+ *        Iterator
+ *      > super_t;
+ *
+ *      __host__ __device__
+ *      repeat_iterator(const Iterator &x, int n) : super_t(x), begin(x), n(n) {}
+ *
+ *      // befriend thrust::iterator_core_access to allow it access to the private interface below
+ *      friend class thrust::iterator_core_access;
+ *
+ *    private:
+ *      // repeat each element of the adapted range n times
+ *      unsigned int n;
+ *
+ *      // used to keep track of where we began
+ *      const Iterator begin;
+ *
+ *      // it is private because only thrust::iterator_core_access needs access to it
+ *      __host__ __device__
+ *      typename super_t::reference dereference() const
+ *      {
+ *        return *(begin + (this->base() - begin) / n);
+ *      }
+ *  };
+ *  \endcode
+ *
+ *  Except for the first two, \p iterator_adaptor's template parameters are optional. When omitted, or when the
+ *  user specifies \p thrust::use_default in its place, \p iterator_adaptor will use a default type inferred from \p Base.
+ *
+ *  \p iterator_adaptor's functionality is derived from and generally equivalent to \p boost::iterator_adaptor.
+ *  The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust
+ *  to dispatch an algorithm to one of several parallel backend systems.
+ *
+ *  \p iterator_adaptor is a powerful tool for creating custom iterators directly. However, the large set of iterator semantics which must be satisfied
+ *  for algorithm compatibility can make \p iterator_adaptor difficult to use correctly. Unless you require the full expressivity of \p iterator_adaptor,
+ *  consider building a custom iterator through composition of existing higher-level fancy iterators instead. 
+ *
+ *  Interested users may refer to <tt>boost::iterator_adaptor</tt>'s documentation for further usage examples.
+ */
+template<typename Derived,
+         typename Base,
+         typename Value      = use_default,
+         typename System     = use_default,
+         typename Traversal  = use_default,
+         typename Reference  = use_default,
+         typename Difference = use_default>
+  class iterator_adaptor:
+    public detail::iterator_adaptor_base<
+      Derived, Base, Value, System, Traversal, Reference, Difference
+    >::type
+{
+  /*! \cond
+   */
+
+    friend class thrust::iterator_core_access;
+
+  protected:
+    typedef typename detail::iterator_adaptor_base<
+        Derived, Base, Value, System, Traversal, Reference, Difference
+    >::type super_t;
+
+  /*! \endcond
+   */
+  
+  public:
+    /*! \p iterator_adaptor's default constructor does nothing.
+     */
+    __host__ __device__
+    iterator_adaptor(){}
+
+    /*! This constructor copies from a given instance of the \p Base iterator.
+     */
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit iterator_adaptor(Base const& iter)
+      : m_iterator(iter)
+    {}
+
+    /*! The type of iterator this \p iterator_adaptor's \p adapts.
+     */
+    typedef Base       base_type;
+                                                                                              
+    /*! \cond
+     */
+    typedef typename super_t::reference reference;
+                                                                                              
+    typedef typename super_t::difference_type difference_type;
+    /*! \endcond
+     */
+
+    /*! \return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts.
+     */
+    __host__ __device__
+    Base const& base() const
+    { return m_iterator; }
+
+  protected:
+    /*! \return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts.
+     */
+    __host__ __device__
+    Base const& base_reference() const
+    { return m_iterator; }
+
+    /*! \return A mutable reference to the \p Base iterator this \p iterator_adaptor adapts.
+     */
+    __host__ __device__
+    Base& base_reference()
+    { return m_iterator; }
+
+    /*! \cond
+     */
+  private: // Core iterator interface for iterator_facade
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    typename iterator_adaptor::reference dereference() const
+    { return *m_iterator; }
+
+    __thrust_exec_check_disable__
+    template<typename OtherDerived, typename OtherIterator, typename V, typename S, typename T, typename R, typename D>
+    __host__ __device__
+    bool equal(iterator_adaptor<OtherDerived, OtherIterator, V, S, T, R, D> const& x) const
+    { return m_iterator == x.base(); }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    void advance(typename iterator_adaptor::difference_type n)
+    {
+      // XXX statically assert on random_access_traversal_tag
+      m_iterator += n;
+    }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    void increment()
+    { ++m_iterator; }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    void decrement()
+    {
+      // XXX statically assert on bidirectional_traversal_tag
+      --m_iterator;
+    }
+
+    __thrust_exec_check_disable__
+    template<typename OtherDerived, typename OtherIterator, typename V, typename S, typename T, typename R, typename D>
+    __host__ __device__
+    typename iterator_adaptor::difference_type distance_to(iterator_adaptor<OtherDerived, OtherIterator, V, S, T, R, D> const& y) const
+    { return y.base() - m_iterator; }
+
+  private:
+    Base m_iterator;
+
+    /*! \endcond
+     */
+}; // end iterator_adaptor
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/iterator_categories.h b/thrust/thrust/iterator/iterator_categories.h
new file mode 100644
index 0000000000000000000000000000000000000000..02246d446b95e0c6cc57f514b7be1163e76bed05
--- /dev/null
+++ b/thrust/thrust/iterator/iterator_categories.h
@@ -0,0 +1,224 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/iterator_categories.h
+ *  \brief Types for reasoning about the categories of iterators
+ */
+
+/*
+ * (C) Copyright Jeremy Siek 2002.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/iterator_category_with_system_and_traversal.h>
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+
+// #include this for stl's iterator tags
+#include <iterator>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \addtogroup iterator_tags Iterator Tags
+ *  \ingroup iterators
+ *  \addtogroup iterator_tag_classes Iterator Tag Classes
+ *  \ingroup iterator_tags
+ *  \{
+ */
+
+/*! \p input_device_iterator_tag is an empty class: it has no member functions,
+ *  member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Input Device Iterator concept within the C++ type
+ *  system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, iterator_traits,
+ *  output_device_iterator_tag, forward_device_iterator_tag,
+ *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+struct input_device_iterator_tag
+  : thrust::detail::iterator_category_with_system_and_traversal<
+      std::input_iterator_tag,
+      thrust::device_system_tag,
+      thrust::single_pass_traversal_tag
+    >
+{};
+
+/*! \p output_device_iterator_tag is an empty class: it has no member functions,
+ *  member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Output Device Iterator concept within the C++ type
+ *  system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, iterator_traits,
+ *  input_device_iterator_tag, forward_device_iterator_tag,
+ *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+struct output_device_iterator_tag
+  : thrust::detail::iterator_category_with_system_and_traversal<
+      std::output_iterator_tag,
+      thrust::device_system_tag,
+      thrust::single_pass_traversal_tag
+    >
+{};
+
+/*! \p forward_device_iterator_tag is an empty class: it has no member functions,
+ *  member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Forward Device Iterator concept within the C++ type
+ *  system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, iterator_traits,
+ *  input_device_iterator_tag, output_device_iterator_tag,
+ *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+struct forward_device_iterator_tag
+  : thrust::detail::iterator_category_with_system_and_traversal<
+      std::forward_iterator_tag,
+      thrust::device_system_tag,
+      thrust::forward_traversal_tag
+    >
+{};
+
+/*! \p bidirectional_device_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Bidirectional Device Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+struct bidirectional_device_iterator_tag
+  : thrust::detail::iterator_category_with_system_and_traversal<
+      std::bidirectional_iterator_tag,
+      thrust::device_system_tag,
+      thrust::bidirectional_traversal_tag
+    >
+{};
+
+/*! \p random_access_device_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Random Access Device Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+struct random_access_device_iterator_tag
+  : thrust::detail::iterator_category_with_system_and_traversal<
+      std::random_access_iterator_tag,
+      thrust::device_system_tag,
+      thrust::random_access_traversal_tag
+    >
+{};
+
+/*! \p input_host_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Input Host Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  random_access_device_iterator_tag,
+ *  output_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+typedef std::input_iterator_tag input_host_iterator_tag;
+
+/*! \p output_host_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Output Host Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  random_access_device_iterator_tag,
+ *  input_host_iterator_tag, forward_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+typedef std::output_iterator_tag output_host_iterator_tag;
+
+/*! \p forward_host_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Forward Host Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag,
+ *  bidirectional_host_iterator_tag, random_access_host_iterator_tag
+ */
+typedef std::forward_iterator_tag forward_host_iterator_tag;
+
+/*! \p bidirectional_host_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Forward Host Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag,
+ *  forward_host_iterator_tag, random_access_host_iterator_tag
+ */
+typedef std::bidirectional_iterator_tag bidirectional_host_iterator_tag;
+
+/*! \p random_access_host_iterator_tag is an empty class: it has no member
+ *  functions, member variables, or nested types. It is used solely as a "tag": a
+ *  representation of the Forward Host Iterator concept within the C++
+ *  type system.
+ *
+ *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
+ *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
+ *  random_access_device_iterator_tag,
+ *  input_host_iterator_tag, output_host_iterator_tag,
+ *  forward_host_iterator_tag, bidirectional_host_iterator_tag
+ */
+typedef std::random_access_iterator_tag random_access_host_iterator_tag;
+
+/*! \} // end iterator_tag_classes
+ */
+
+} // end namespace thrust
+
+#include <thrust/iterator/detail/universal_categories.h>
+
diff --git a/thrust/thrust/iterator/iterator_facade.h b/thrust/thrust/iterator/iterator_facade.h
new file mode 100644
index 0000000000000000000000000000000000000000..86757d7122ed457720b9cee966b24245d7cbdba7
--- /dev/null
+++ b/thrust/thrust/iterator/iterator_facade.h
@@ -0,0 +1,543 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/iterator_facade.h
+ *  \brief A class which exposes a public interface for iterators
+ */
+
+/*
+ * (C) Copyright David Abrahams 2002.
+ * (C) Copyright Jeremy Siek    2002.
+ * (C) Copyright Thomas Witt    2002.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+#include <thrust/iterator/detail/distance_from_result.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+
+// This forward declaration is required for the friend declaration
+// in iterator_core_access
+template<typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference> class iterator_facade;
+
+
+/*! \p iterator_core_access is the class which user iterator types derived from \p thrust::iterator_adaptor
+ *  or \p thrust::iterator_facade must befriend to allow it to access their private interface.
+ */
+class iterator_core_access
+{
+    /*! \cond
+     */
+
+    // declare our friends
+    template<typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference> friend class iterator_facade;
+
+    // iterator comparisons are our friends
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator ==(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+                iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator !=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+                iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator <(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+               iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator >(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+               iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator <=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+                iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend bool
+    operator >=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+                iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    // iterator difference is our friend
+    template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+              typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+    inline __host__ __device__
+    friend
+      typename thrust::detail::distance_from_result<
+        iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1>,
+        iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2>
+      >::type
+    operator-(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+              iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs);
+
+    template<typename Facade>
+    __host__ __device__
+    static typename Facade::reference dereference(Facade const& f)
+    {
+      return f.dereference();
+    }
+
+    template<typename Facade>
+    __host__ __device__
+    static void increment(Facade& f)
+    {
+      f.increment();
+    }
+
+    template<typename Facade>
+    __host__ __device__
+    static void decrement(Facade& f)
+    {
+      f.decrement();
+    }
+
+    template <class Facade1, class Facade2>
+    __host__ __device__
+    static bool equal(Facade1 const& f1, Facade2 const& f2)
+    {
+      return f1.equal(f2);
+    }
+
+    // XXX TODO: Investigate whether we need both of these cases
+    //template <class Facade1, class Facade2>
+    //__host__ __device__
+    //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::true_)
+    //{
+    //  return f1.equal(f2);
+    //}
+
+    //template <class Facade1, class Facade2>
+    //__host__ __device__
+    //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::false_)
+    //{
+    //  return f2.equal(f1);
+    //}
+
+    template <class Facade>
+    __host__ __device__
+    static void advance(Facade& f, typename Facade::difference_type n)
+    {
+      f.advance(n);
+    }
+
+    // Facade2 is convertible to Facade1,
+    // so return Facade1's difference_type
+    template <class Facade1, class Facade2>
+    __host__ __device__
+    static typename Facade1::difference_type
+      distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::true_type)
+    {
+      return -f1.distance_to(f2);
+    }
+
+    // Facade2 is not convertible to Facade1,
+    // so return Facade2's difference_type
+    template <class Facade1, class Facade2>
+    __host__ __device__
+    static typename Facade2::difference_type
+      distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::false_type)
+    {
+      return f2.distance_to(f1);
+    }
+    
+    template <class Facade1, class Facade2>
+    __host__ __device__
+    static typename thrust::detail::distance_from_result<Facade1,Facade2>::type
+      distance_from(Facade1 const& f1, Facade2 const& f2)
+    {
+      // dispatch the implementation of this method upon whether or not
+      // Facade2 is convertible to Facade1
+      return distance_from(f1, f2,
+        typename thrust::detail::is_convertible<Facade2,Facade1>::type());
+    }
+
+    //
+    // Curiously Recurring Template interface.
+    //
+    template <typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference>
+    __host__ __device__
+    static Derived& derived(iterator_facade<Derived,Value,System,Traversal,Reference,Difference>& facade)
+    {
+      return *static_cast<Derived*>(&facade);
+    }
+
+    template <typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference>
+    __host__ __device__
+    static Derived const& derived(iterator_facade<Derived,Value,System,Traversal,Reference,Difference> const& facade)
+    {
+      return *static_cast<Derived const*>(&facade);
+    }
+
+    /*! \endcond
+     */
+}; // end iterator_core_access
+
+
+/*! \p iterator_facade is a template which allows the programmer to define a novel iterator with a standards-conforming interface
+ *  which Thrust can use to reason about algorithm acceleration opportunities.
+ *
+ *  Because most of a standard iterator's interface is defined in terms of a small set of core primitives, \p iterator_facade
+ *  defines the non-primitive portion mechanically. In principle a novel iterator could explicitly provide the entire interface in
+ *  an ad hoc fashion but doing so might be tedious and prone to subtle errors.
+ *
+ *  Often \p iterator_facade is too primitive a tool to use for defining novel iterators. In these cases, \p iterator_adaptor
+ *  or a specific fancy iterator should be used instead.
+ *
+ *  \p iterator_facade's functionality is derived from and generally equivalent to \p boost::iterator_facade.
+ *  The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust
+ *  to dispatch an algorithm to one of several parallel backend systems. An additional exception is Thrust's omission
+ *  of the \c operator-> member function.
+ *
+ *  Interested users may refer to <tt>boost::iterator_facade</tt>'s documentation for usage examples.
+ *
+ *  \note \p iterator_facade's arithmetic operator free functions exist with the usual meanings but are omitted here for brevity.
+ */
+template<typename Derived,
+         typename Value,
+         typename System,
+         typename Traversal,
+         typename Reference,
+         typename Difference = std::ptrdiff_t>
+  class iterator_facade
+{
+  private:
+    /*! \cond
+     */
+
+    //
+    // Curiously Recurring Template interface.
+    //
+    __host__ __device__
+    Derived& derived()
+    {
+      return *static_cast<Derived*>(this);
+    }
+
+    __host__ __device__
+    Derived const& derived() const
+    {
+      return *static_cast<Derived const*>(this);
+    }
+    /*! \endcond
+     */
+
+  public:
+    /*! The type of element pointed to by \p iterator_facade.
+     */
+    typedef typename thrust::detail::remove_const<Value>::type value_type;
+
+    /*! The return type of \p iterator_facade::operator*().
+     */
+    typedef Reference                                          reference;
+
+    /*! The return type of \p iterator_facade's non-existent \c operator->()
+     *  member function. Unlike \c boost::iterator_facade, \p iterator_facade
+     *  disallows access to the \p value_type's members through expressions of the
+     *  form <tt>iter->member</tt>. \p pointer is defined to \c void to indicate
+     *  that these expressions are not allowed. This limitation may be relaxed in a
+     *  future version of Thrust.
+     */
+    typedef void                                               pointer;
+
+    /*! The type of expressions of the form <tt>x - y</tt> where <tt>x</tt> and <tt>y</tt>
+     *  are of type \p iterator_facade.
+     */
+    typedef Difference                                         difference_type;
+
+    /*! The type of iterator category of \p iterator_facade.
+     */
+    typedef typename thrust::detail::iterator_facade_category<
+      System, Traversal, Value, Reference
+    >::type                                                    iterator_category;
+
+    /*! \p operator*() dereferences this \p iterator_facade.
+     *  \return A reference to the element pointed to by this \p iterator_facade.
+     */
+    __host__ __device__
+    reference operator*() const
+    {
+      return iterator_core_access::dereference(this->derived());
+    }
+
+    // XXX unimplemented for now, consider implementing it later
+    //pointer operator->() const
+    //{
+    //  return;
+    //}
+
+    // XXX investigate whether or not we need to go to the lengths
+    //     boost does to determine the return type
+
+    /*! \p operator[] performs indexed dereference.
+     *  \return A reference to the element \p n distance away from this \p iterator_facade.
+     */
+    __host__ __device__
+    reference operator[](difference_type n) const
+    {
+      return *(this->derived() + n);
+    }
+
+    /*! \p operator++ pre-increments this \p iterator_facade to refer to the element in the next position.
+     *  \return <tt>*this</tt>
+     */
+    __host__ __device__
+    Derived& operator++()
+    {
+      iterator_core_access::increment(this->derived());
+      return this->derived();
+    }
+
+    /*! \p operator++ post-increments this \p iterator_facade and returns a new \p iterator_facade referring to the element in the next position.
+     *  \return A copy of <tt>*this</tt> before increment.
+     */
+    __host__ __device__
+    Derived  operator++(int)
+    {
+      Derived tmp(this->derived());
+      ++*this;
+      return tmp;
+    }
+
+    /*! \p operator-- pre-decrements this \p iterator_facade to refer to the element in the previous position.
+     *  \return <tt>*this</tt>
+     */
+    __host__ __device__
+    Derived& operator--()
+    {
+      iterator_core_access::decrement(this->derived());
+      return this->derived();
+    }
+
+    /*! \p operator-- post-decrements this \p iterator_facade and returns a new \p iterator_facade referring to the element in the previous position.
+     *  \return A copy of <tt>*this</tt> before decrement.
+     */
+    __host__ __device__
+    Derived  operator--(int)
+    {
+      Derived tmp(this->derived());
+      --*this;
+      return tmp;
+    }
+
+    /*! \p operator+= increments this \p iterator_facade to refer to an element a given distance after its current position.
+     *  \param n The quantity to increment.
+     *  \return <tt>*this</tt>
+     */
+    __host__ __device__
+    Derived& operator+=(difference_type n)
+    {
+      iterator_core_access::advance(this->derived(), n);
+      return this->derived();
+    }
+
+    /*! \p operator-= decrements this \p iterator_facade to refer to an element a given distance before its current postition.
+     *  \param n The quantity to decrement.
+     *  \return <tt>*this</tt>
+     */
+    __host__ __device__
+    Derived& operator-=(difference_type n)
+    {
+      iterator_core_access::advance(this->derived(), -n);
+      return this->derived();
+    }
+
+    /*! \p operator- subtracts a given quantity from this \p iterator_facade and returns a new \p iterator_facade referring to the element at the given position before this \p iterator_facade.
+     *  \param n The quantity to decrement
+     *  \return An \p iterator_facade pointing \p n elements before this \p iterator_facade.
+     */
+    __host__ __device__
+    Derived  operator-(difference_type n) const
+    {
+      Derived result(this->derived());
+      return result -= n;
+    }
+}; // end iterator_facade
+
+/*! \cond
+ */
+
+// Comparison operators
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator ==(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+            iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return iterator_core_access
+    ::equal(*static_cast<Derived1 const*>(&lhs),
+            *static_cast<Derived2 const*>(&rhs));
+}
+
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator !=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+            iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return !iterator_core_access
+    ::equal(*static_cast<Derived1 const*>(&lhs),
+            *static_cast<Derived2 const*>(&rhs));
+}
+
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator <(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+           iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return 0 > iterator_core_access
+    ::distance_from(*static_cast<Derived1 const*>(&lhs),
+                    *static_cast<Derived2 const*>(&rhs));
+}
+
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator >(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+           iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return 0 < iterator_core_access
+    ::distance_from(*static_cast<Derived1 const*>(&lhs),
+                    *static_cast<Derived2 const*>(&rhs));
+}
+
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator <=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+            iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return 0 >= iterator_core_access
+    ::distance_from(*static_cast<Derived1 const*>(&lhs),
+                    *static_cast<Derived2 const*>(&rhs));
+}
+
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+// XXX it might be nice to implement this at some point
+//typename enable_if_interoperable<Dr1,Dr2,bool>::type // exposition
+bool
+operator >=(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+            iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return 0 <= iterator_core_access
+    ::distance_from(*static_cast<Derived1 const*>(&lhs),
+                    *static_cast<Derived2 const*>(&rhs));
+}
+
+// Iterator difference
+template <typename Derived1, typename Value1, typename System1, typename Traversal1, typename Reference1, typename Difference1,
+          typename Derived2, typename Value2, typename System2, typename Traversal2, typename Reference2, typename Difference2>
+inline __host__ __device__
+
+// divine the type this operator returns
+typename thrust::detail::distance_from_result<
+  iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1>,
+  iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2>
+>::type
+
+operator-(iterator_facade<Derived1,Value1,System1,Traversal1,Reference1,Difference1> const& lhs,
+          iterator_facade<Derived2,Value2,System2,Traversal2,Reference2,Difference2> const& rhs)
+{
+  return iterator_core_access
+    ::distance_from(*static_cast<Derived1 const*>(&lhs),
+                    *static_cast<Derived2 const*>(&rhs));
+}
+
+// Iterator addition
+template <typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference>
+inline __host__ __device__
+Derived operator+ (iterator_facade<Derived,Value,System,Traversal,Reference,Difference> const& i,
+                   typename Derived::difference_type n)
+{
+  Derived tmp(static_cast<Derived const&>(i));
+  return tmp += n;
+}
+
+template <typename Derived, typename Value, typename System, typename Traversal, typename Reference, typename Difference>
+inline __host__ __device__
+Derived operator+ (typename Derived::difference_type n,
+                   iterator_facade<Derived,Value,System,Traversal,Reference,Difference> const& i)
+{
+  Derived tmp(static_cast<Derived const&>(i));
+  return tmp += n;
+}
+
+/*! \endcond
+ */
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/iterator_traits.h b/thrust/thrust/iterator/iterator_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a33658c22bba60783c0b3c5a422a48c7bb2a2f1
--- /dev/null
+++ b/thrust/thrust/iterator/iterator_traits.h
@@ -0,0 +1,114 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/iterator_traits.h
+ *  \brief Traits and metafunctions for reasoning about the traits of iterators
+ */
+
+/*
+ * (C) Copyright David Abrahams 2003.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/type_traits/void_t.h>
+
+#include <iterator>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename T, typename = void>
+struct iterator_traits_impl {};
+
+template <typename T>
+struct iterator_traits_impl<
+  T
+, typename voider<
+    typename T::difference_type
+  , typename T::value_type
+  , typename T::pointer
+  , typename T::reference
+  , typename T::iterator_category
+  >::type 
+>
+{
+  typedef typename T::difference_type difference_type;
+  typedef typename T::value_type value_type;
+  typedef typename T::pointer pointer;
+  typedef typename T::reference reference;
+  typedef typename T::iterator_category iterator_category;
+};
+
+} // namespace detail
+
+/*! \p iterator_traits is a type trait class that provides a uniform
+ *  interface for querying the properties of iterators at compile-time.
+ */
+template <typename T>
+struct iterator_traits : detail::iterator_traits_impl<T> {};
+
+// traits are specialized for pointer types
+template<typename T>
+  struct iterator_traits<T*>
+{
+  typedef std::ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef T* pointer;
+  typedef T& reference;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+
+template<typename T>
+  struct iterator_traits<const T*>
+{
+  typedef std::ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef const T* pointer;
+  typedef const T& reference;
+  typedef std::random_access_iterator_tag iterator_category;
+}; // end iterator_traits
+
+template<typename Iterator> struct iterator_value;
+
+template<typename Iterator> struct iterator_pointer;
+
+template<typename Iterator> struct iterator_reference;
+
+template<typename Iterator> struct iterator_difference;
+
+template<typename Iterator> struct iterator_traversal;
+
+template<typename Iterator> struct iterator_system;
+
+} // namespace thrust
+
+#include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/iterator/detail/host_system_tag.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/iterator/detail/iterator_traits.inl>
+
diff --git a/thrust/thrust/iterator/permutation_iterator.h b/thrust/thrust/iterator/permutation_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..73827040abd1000ccb616c18a6fdb0d7d8484ccd
--- /dev/null
+++ b/thrust/thrust/iterator/permutation_iterator.h
@@ -0,0 +1,217 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/permutation_iterator.h
+ *  \brief An iterator which performs a gather or scatter operation when dereferenced
+ */
+
+/*
+ * (C) Copyright Toon Knapen    2001.
+ * (C) Copyright David Abrahams 2003.
+ * (C) Copyright Roland Richter 2003.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/permutation_iterator_base.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p permutation_iterator is an iterator which represents a pointer into a
+ *  reordered view of a given range. \p permutation_iterator is an imprecise name;
+ *  the reordered view need not be a strict permutation. This iterator is useful
+ *  for fusing a scatter or gather operation with other algorithms.
+ *
+ *  This iterator takes two arguments:
+ *
+ *    - an iterator to the range \c V on which the "permutation" will be applied
+ *    - the reindexing scheme that defines how the elements of \c V will be permuted.
+ *
+ *  Note that \p permutation_iterator is not limited to strict permutations of the
+ *  given range \c V. The distance between begin and end of the reindexing iterators
+ *  is allowed to be smaller compared to the size of the range \c V, in which case
+ *  the \p permutation_iterator only provides a "permutation" of a subrange of \c V.
+ *  The indices neither need to be unique. In this same context, it must be noted
+ *  that the past-the-end \p permutation_iterator is completely defined by means of
+ *  the past-the-end iterator to the indices.
+ *
+ *  The following code snippet demonstrates how to create a \p permutation_iterator
+ *  which represents a reordering of the contents of a \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/iterator/permutation_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<float> values(4);
+ *  values[0] = 10.0f;
+ *  values[1] = 20.0f;
+ *  values[2] = 30.0f;
+ *  values[3] = 40.0f;
+ *  values[4] = 50.0f;
+ *  values[5] = 60.0f;
+ *  values[6] = 70.0f;
+ *  values[7] = 80.0f;
+ *
+ *  thrust::device_vector<int> indices(4);
+ *  indices[0] = 2;
+ *  indices[1] = 6;
+ *  indices[2] = 1;
+ *  indices[3] = 3;
+ *
+ *  typedef thrust::device_vector<float>::iterator ElementIterator;
+ *  typedef thrust::device_vector<int>::iterator   IndexIterator;
+ *
+ *  thrust::permutation_iterator<ElementIterator,IndexIterator> iter(values.begin(), indices.begin());
+ *
+ *  *iter;   // returns 30.0f;
+ *  iter[0]; // returns 30.0f;
+ *  iter[1]; // returns 70.0f;
+ *  iter[2]; // returns 20.0f;
+ *  iter[3]; // returns 40.0f;
+ *
+ *  // iter[4] is an out-of-bounds error
+ *
+ *  *iter   = -1.0f; // sets values[2] to -1.0f;
+ *  iter[0] = -1.0f; // sets values[2] to -1.0f;
+ *  iter[1] = -1.0f; // sets values[6] to -1.0f;
+ *  iter[2] = -1.0f; // sets values[1] to -1.0f;
+ *  iter[3] = -1.0f; // sets values[3] to -1.0f;
+ *
+ *  // values is now {10, -1, -1, -1, 50, 60, -1, 80}
+ *  \endcode
+ *
+ *  \see make_permutation_iterator
+ */
+template <typename ElementIterator,
+          typename IndexIterator>
+  class permutation_iterator
+    : public thrust::detail::permutation_iterator_base<
+        ElementIterator,
+        IndexIterator
+      >::type
+{
+  /*! \cond
+   */
+  private:
+    typedef typename detail::permutation_iterator_base<ElementIterator,IndexIterator>::type super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  public:
+    /*! Null constructor calls the null constructor of this \p permutation_iterator's
+     *  element iterator.
+     */
+    __host__ __device__
+    permutation_iterator()
+      : m_element_iterator() {}
+
+    /*! Constructor accepts an \c ElementIterator into a range of values and an
+     *  \c IndexIterator into a range of indices defining the indexing scheme on the
+     *  values.
+     *
+     *  \param x An \c ElementIterator pointing this \p permutation_iterator's range of values.
+     *  \param y An \c IndexIterator pointing to an indexing scheme to use on \p x.
+     */
+    __host__ __device__
+    explicit permutation_iterator(ElementIterator x, IndexIterator y)
+      : super_t(y), m_element_iterator(x) {}
+
+    /*! Copy constructor accepts a related \p permutation_iterator.
+     *  \param r A compatible \p permutation_iterator to copy from.
+     */
+    template<typename OtherElementIterator, typename OtherIndexIterator>
+    __host__ __device__
+    permutation_iterator(permutation_iterator<OtherElementIterator,OtherIndexIterator> const &r
+    // XXX remove these guards when we have static_assert
+    , typename detail::enable_if_convertible<OtherElementIterator, ElementIterator>::type* = 0
+    , typename detail::enable_if_convertible<OtherIndexIterator, IndexIterator>::type* = 0
+    )
+      : super_t(r.base()), m_element_iterator(r.m_element_iterator)
+    {}
+
+  /*! \cond
+   */
+  private:
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return *(m_element_iterator + *this->base());
+    }
+
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
+    // make friends for the copy constructor
+    template<typename,typename> friend class permutation_iterator;
+
+    ElementIterator m_element_iterator;
+  /*! \endcond
+   */
+}; // end permutation_iterator
+
+
+/*! \p make_permutation_iterator creates a \p permutation_iterator
+ *  from an \c ElementIterator pointing to a range of elements to "permute"
+ *  and an \c IndexIterator pointing to a range of indices defining an indexing
+ *  scheme on the values.
+ *
+ *  \param e An \c ElementIterator pointing to a range of values.
+ *  \param i An \c IndexIterator pointing to an indexing scheme to use on \p e.
+ *  \return A new \p permutation_iterator which permutes the range \p e by \p i.
+ *  \see permutation_iterator
+ */
+template<typename ElementIterator, typename IndexIterator>
+__host__ __device__
+permutation_iterator<ElementIterator,IndexIterator> make_permutation_iterator(ElementIterator e, IndexIterator i)
+{
+  return permutation_iterator<ElementIterator,IndexIterator>(e,i);
+}
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/retag.h b/thrust/thrust/iterator/retag.h
new file mode 100644
index 0000000000000000000000000000000000000000..6adf5e24444ee8e9fc562c22b5ed28a03adad371
--- /dev/null
+++ b/thrust/thrust/iterator/retag.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/retag.h
+ *  \brief Functionality for altering an iterator's associated system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/retag.h>
+
+namespace thrust
+{
+
+
+/*! \ingroup iterator_tags
+ *  \{
+ */
+
+#if 0
+/*! \p reinterpret_tag returns a copy of an iterator and changes the type of the result's system tag.
+ *  \tparam Tag Any system tag.
+ *  \tparam Iterator Any iterator type.
+ *  \param iter The iterator of interest.
+ *  \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is otherwise
+ *          equivalent to \p iter.
+ *  \note Unlike \p retag, \p reinterpret_tag does not enforce that the converted-to system tag be
+ *        related to the converted-from system tag.
+ *  \see retag
+ */
+template<typename Tag, typename Iterator>
+__host__ __device__
+unspecified_iterator_type reinterpret_tag(Iterator iter);
+
+/*! \p retag returns a copy of an iterator and changes the type of the result's system tag.
+ *  \tparam Tag \p Tag shall be convertible to <tt>thrust::iterator_system<Iterator>::type</tt>,
+ *              or <tt>thrust::iterator_system<Iterator>::type</tt> is a base type of \p Tag.
+ *  \tparam Iterator Any iterator type.
+ *  \param iter The iterator of interest.
+ *  \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is
+ *          otherwise equivalent to \p iter.
+ *  \note Unlike \p reinterpret_tag, \p retag enforces that the converted-to system tag be
+ *        related to the converted-from system tag.
+ *  \see reinterpret_tag
+ */
+template<typename Tag, typename Iterator>
+__host__ __device__
+unspecified_iterator_type retag(Iterator iter);
+#endif
+
+/*! \} // iterator_tags
+ */
+
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/reverse_iterator.h b/thrust/thrust/iterator/reverse_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..365bc34d2eee7598997be4586c00924b5c7f44ea
--- /dev/null
+++ b/thrust/thrust/iterator/reverse_iterator.h
@@ -0,0 +1,238 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/reverse_iterator.h
+ *  \brief An iterator adaptor which adapts another iterator to traverse backwards
+ */
+
+/*
+ * (C) Copyright David Abrahams 2002.
+ * (C) Copyright Jeremy Siek    2002.
+ * (C) Copyright Thomas Witt    2002.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/reverse_iterator_base.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p reverse_iterator is an iterator which represents a pointer into a
+ *  reversed view of a given range. In this way, \p reverse_iterator allows
+ *  backwards iteration through a bidirectional input range.
+ *
+ *  It is important to note that although \p reverse_iterator is constructed
+ *  from a given iterator, it points to the element preceding it. In this way,
+ *  the past-the-end \p reverse_iterator of a given range points to the element
+ *  preceding the first element of the input range. By the same token, the first
+ *  \p reverse_iterator of a given range is constructed from a past-the-end iterator
+ *  of the original range yet points to the last element of the input.
+ *
+ *  The following code snippet demonstrates how to create a \p reverse_iterator
+ *  which represents a reversed view of the contents of a \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/iterator/reverse_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<float> v(4);
+ *  v[0] = 0.0f;
+ *  v[1] = 1.0f;
+ *  v[2] = 2.0f;
+ *  v[3] = 3.0f;
+ *
+ *  typedef thrust::device_vector<float>::iterator Iterator;
+ *
+ *  // note that we point the iterator to the *end* of the device_vector
+ *  thrust::reverse_iterator<Iterator> iter(values.end());
+ *
+ *  *iter;   // returns 3.0f;
+ *  iter[0]; // returns 3.0f;
+ *  iter[1]; // returns 2.0f;
+ *  iter[2]; // returns 1.0f;
+ *  iter[3]; // returns 0.0f;
+ *
+ *  // iter[4] is an out-of-bounds error
+ *  \endcode
+ *
+ *  Since reversing a range is a common operation, containers like \p device_vector
+ *  have nested typedefs for declaration shorthand and methods for constructing
+ *  reverse_iterators. The following code snippet is equivalent to the previous:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<float> v(4);
+ *  v[0] = 0.0f;
+ *  v[1] = 1.0f;
+ *  v[2] = 2.0f;
+ *  v[3] = 3.0f;
+ *
+ *  // we use the nested type reverse_iterator to refer to a reversed view of
+ *  // a device_vector and the method rbegin() to create a reverse_iterator pointing
+ *  // to the beginning of the reversed device_vector
+ *  thrust::device_iterator<float>::reverse_iterator iter = values.rbegin();
+ *
+ *  *iter;   // returns 3.0f;
+ *  iter[0]; // returns 3.0f;
+ *  iter[1]; // returns 2.0f;
+ *  iter[2]; // returns 1.0f;
+ *  iter[3]; // returns 0.0f;
+ *
+ *  // iter[4] is an out-of-bounds error
+ *
+ *  // similarly, rend() points to the end of the reversed sequence:
+ *  assert(values.rend() == (iter + 4));
+ *  \endcode
+ *
+ *  Finally, the following code snippet demonstrates how to use reverse_iterator to
+ *  perform a reversed prefix sum operation on the contents of a device_vector:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/scan.h>
+ *  ...
+ *  thrust::device_vector<int> v(5);
+ *  v[0] = 0;
+ *  v[1] = 1;
+ *  v[2] = 2;
+ *  v[3] = 3;
+ *  v[4] = 4;
+ *
+ *  thrust::device_vector<int> result(5);
+ *
+ *  // exclusive scan v into result in reverse
+ *  thrust::exclusive_scan(v.rbegin(), v.rend(), result.begin());
+ *
+ *  // result is now {0, 4, 7, 9, 10}
+ *  \endcode
+ *
+ *  \see make_reverse_iterator
+ */
+template<typename BidirectionalIterator>
+  class reverse_iterator
+    : public detail::reverse_iterator_base<BidirectionalIterator>::type
+{
+  /*! \cond
+   */
+  private:
+    typedef typename thrust::detail::reverse_iterator_base<
+      BidirectionalIterator
+    >::type super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  public:
+    /*! Default constructor does nothing.
+     */
+    __host__ __device__
+    reverse_iterator() {}
+
+    /*! \p Constructor accepts a \c BidirectionalIterator pointing to a range
+     *  for this \p reverse_iterator to reverse.
+     *
+     *  \param x A \c BidirectionalIterator pointing to a range to reverse.
+     */
+    __host__ __device__
+    explicit reverse_iterator(BidirectionalIterator x);
+
+    /*! \p Copy constructor allows construction from a related compatible
+     *  \p reverse_iterator.
+     *
+     *  \param r A \p reverse_iterator to copy from.
+     */
+    template<typename OtherBidirectionalIterator>
+    __host__ __device__
+    reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
+// XXX msvc screws this up
+// XXX remove these guards when we have static_assert
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                     , typename thrust::detail::enable_if<
+                         thrust::detail::is_convertible<
+                           OtherBidirectionalIterator,
+                           BidirectionalIterator
+                         >::value
+                       >::type * = 0
+#endif // MSVC
+                     );
+
+  /*! \cond
+   */
+  private:
+    __thrust_exec_check_disable__
+    __host__ __device__
+    typename super_t::reference dereference() const;
+
+    __host__ __device__
+    void increment();
+
+    __host__ __device__
+    void decrement();
+
+    __host__ __device__
+    void advance(typename super_t::difference_type n);
+
+    template<typename OtherBidirectionalIterator>
+    __host__ __device__
+    typename super_t::difference_type
+    distance_to(reverse_iterator<OtherBidirectionalIterator> const &y) const;
+  /*! \endcond
+   */
+}; // end reverse_iterator
+
+
+/*! \p make_reverse_iterator creates a \p reverse_iterator
+ *  from a \c BidirectionalIterator pointing to a range of elements to reverse.
+ *  
+ *  \param x A \c BidirectionalIterator pointing to a range to reverse.
+ *  \return A new \p reverse_iterator which reverses the range \p x.
+ */
+template<typename BidirectionalIterator>
+__host__ __device__
+reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalIterator x);
+
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
+#include <thrust/iterator/detail/reverse_iterator.inl>
+
diff --git a/thrust/thrust/iterator/transform_input_output_iterator.h b/thrust/thrust/iterator/transform_input_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..25c10eb58e93cbadb298fc68bbd4d24b3dc5a7cb
--- /dev/null
+++ b/thrust/thrust/iterator/transform_input_output_iterator.h
@@ -0,0 +1,163 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/transform_input_output_iterator.h
+ *  \brief An iterator which adapts another iterator by applying transform
+ *         functions when reading and writing dereferenced values.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/transform_input_output_iterator.inl>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_input_output_iterator is a special kind of iterator which applies
+ * transform functions when reading from or writing to dereferenced values.
+ * This iterator is useful for algorithms that operate on a type that needs to
+ * be serialized/deserialized from values in another iterator, avoiding the
+ * need to materialize intermediate results in memory. This also enables the
+ * transform functions to be fused with the operations that read and write to
+ * the `transform_input_output_iterator`.
+ *
+ * The following code snippet demonstrates how to create a
+ * \p transform_input_output_iterator which performs different transformations when
+ * reading from and writing to the iterator.
+ *
+ * \code
+ * #include <thrust/iterator/transform_input_output_iterator.h>
+ * #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    const size_t size = 4;
+ *    thrust::device_vector<float> v(size);
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to vector
+ *    thrust::sequence(v.begin(), v.end(), 1);
+ *
+ *    // Iterator that returns negated values and writes squared values
+ *    auto iter = thrust::make_transform_input_output_iterator(v.begin(),
+ *        thrust::negate<float>{}, thrust::square<float>{});
+ * 
+ *    // Iterator negates values when reading
+ *    std::cout << iter[0] << " ";  // -1.0f;
+ *    std::cout << iter[1] << " ";  // -2.0f;
+ *    std::cout << iter[2] << " ";  // -3.0f;
+ *    std::cout << iter[3] << "\n"; // -4.0f;
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to iterator
+ *    thrust::sequence(iter, iter + size, 1);
+ *
+ *    // Values were squared before writing to vector
+ *    std::cout << v[0] << " ";  // 1.0f;
+ *    std::cout << v[1] << " ";  // 4.0f;
+ *    std::cout << v[2] << " ";  // 9.0f;
+ *    std::cout << v[3] << "\n"; // 16.0f;
+ *
+ *  }
+ * \endcode
+ *
+ * \see make_transform_input_output_iterator
+ */
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator
+    : public detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+{
+
+  /*! \cond
+   */
+
+  public:
+
+    typedef typename
+    detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  /*! This constructor takes as argument a \c Iterator an \c InputFunction and an
+   * \c OutputFunction and copies them to a new \p transform_input_output_iterator
+   *
+   * \param io An \c Iterator pointing to where the input to \c InputFunction
+   *           will be read from and the result of \c OutputFunction will be written to
+   * \param input_function An \c InputFunction to be executed on values read from the iterator
+   * \param output_function An \c OutputFunction to be executed on values written to the iterator
+   */
+    __host__ __device__
+    transform_input_output_iterator(Iterator const& io, InputFunction input_function, OutputFunction output_function)
+      : super_t(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    /*! \cond
+     */
+  private:
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return detail::transform_input_output_iterator_proxy<
+        InputFunction, OutputFunction, Iterator
+      >(this->base_reference(), input_function, output_function);
+    }
+
+    InputFunction input_function;
+    OutputFunction output_function;
+
+    /*! \endcond
+     */
+}; // end transform_input_output_iterator
+
+/*! \p make_transform_input_output_iterator creates a \p transform_input_output_iterator from
+ *  an \c Iterator a \c InputFunction and a \c OutputFunction
+ *
+ * \param io An \c Iterator pointing to where the input to \c InputFunction
+ *           will be read from and the result of \c OutputFunction will be written to
+ * \param input_function An \c InputFunction to be executed on values read from the iterator
+ * \param output_function An \c OutputFunction to be executed on values written to the iterator
+ *  \see transform_input_output_iterator
+ */
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+__host__ __device__
+make_transform_input_output_iterator(Iterator io, InputFunction input_function, OutputFunction output_function)
+{
+    return transform_input_output_iterator<InputFunction, OutputFunction, Iterator>(io, input_function, output_function);
+} // end make_transform_input_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/transform_iterator.h b/thrust/thrust/iterator/transform_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..fff050e1c05a3b6ed21478aec96cf6394415f3ab
--- /dev/null
+++ b/thrust/thrust/iterator/transform_iterator.h
@@ -0,0 +1,356 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/transform_iterator.h
+ *  \brief An iterator which adapts another iterator by applying a function to the result of its dereference
+ */
+
+/*
+ * (C) Copyright David Abrahams 2002.
+ * (C) Copyright Jeremy Siek    2002.
+ * (C) Copyright Thomas Witt    2002.
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the details first
+#include <thrust/iterator/detail/transform_iterator.inl>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_iterator is an iterator which represents a pointer into a range
+ *  of values after transformation by a function. This iterator is useful for
+ *  creating a range filled with the result of applying an operation to another range
+ *  without either explicitly storing it in memory, or explicitly executing the transformation.
+ *  Using \p transform_iterator facilitates kernel fusion by deferring the execution
+ *  of a transformation until the value is needed while saving both memory capacity
+ *  and bandwidth.
+ *
+ *  The following code snippet demonstrates how to create a \p transform_iterator
+ *  which represents the result of \c sqrtf applied to the contents of a \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/iterator/transform_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  // note: functor inherits from unary_function
+ *  struct square_root : public thrust::unary_function<float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) const
+ *    {
+ *      return sqrtf(x);
+ *    }
+ *  };
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<float> v(4);
+ *    v[0] = 1.0f;
+ *    v[1] = 4.0f;
+ *    v[2] = 9.0f;
+ *    v[3] = 16.0f;
+ *
+ *    typedef thrust::device_vector<float>::iterator FloatIterator;
+ *
+ *    thrust::transform_iterator<square_root, FloatIterator> iter(v.begin(), square_root());
+ *
+ *    *iter;   // returns 1.0f
+ *    iter[0]; // returns 1.0f;
+ *    iter[1]; // returns 2.0f;
+ *    iter[2]; // returns 3.0f;
+ *    iter[3]; // returns 4.0f;
+ *
+ *    // iter[4] is an out-of-bounds error
+ *  }
+ *  \endcode
+ *
+ *  This next example demonstrates how to use a \p transform_iterator with the
+ *  \p thrust::reduce function to compute the sum of squares of a sequence.
+ *  We will create temporary \p transform_iterators with the
+ *  \p make_transform_iterator function in order to avoid explicitly specifying their type:
+ *
+ *  \code
+ *  #include <thrust/iterator/transform_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/reduce.h>
+ *  #include <iostream>
+ *
+ *  // note: functor inherits from unary_function
+ *  struct square : public thrust::unary_function<float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) const
+ *    {
+ *      return x * x;
+ *    }
+ *  };
+ *
+ *  int main()
+ *  {
+ *    // initialize a device array
+ *    thrust::device_vector<float> v(4);
+ *    v[0] = 1.0f;
+ *    v[1] = 2.0f;
+ *    v[2] = 3.0f;
+ *    v[3] = 4.0f;
+ *
+ *    float sum_of_squares =
+ *     thrust::reduce(thrust::make_transform_iterator(v.begin(), square()),
+ *                    thrust::make_transform_iterator(v.end(),   square()));
+ *
+ *    std::cout << "sum of squares: " << sum_of_squares << std::endl;
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  Note that in the previous two examples the transform functor (namely \c square_root
+ *  and \c square) inherits from \c thrust::unary_function.  Inheriting from
+ *  \c thrust::unary_function ensures that a functor is a valid \c AdaptableUnaryFunction
+ *  and provides all the necessary \c typedef declarations.  The \p transform_iterator
+ *  can also be applied to a \c UnaryFunction that does not inherit from
+ *  \c thrust::unary_function using an optional template argument.  The following example
+ *  illustrates how to use the third template argument to specify the \c result_type of
+ *  the function.
+ *
+ *  \code
+ *  #include <thrust/iterator/transform_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  // note: functor *does not* inherit from unary_function
+ *  struct square_root
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) const
+ *    {
+ *      return sqrtf(x);
+ *    }
+ *  };
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<float> v(4);
+ *    v[0] = 1.0f;
+ *    v[1] = 4.0f;
+ *    v[2] = 9.0f;
+ *    v[3] = 16.0f;
+ *
+ *    typedef thrust::device_vector<float>::iterator FloatIterator;
+ *
+ *    // note: float result_type is specified explicitly
+ *    thrust::transform_iterator<square_root, FloatIterator, float> iter(v.begin(), square_root());
+ *
+ *    *iter;   // returns 1.0f
+ *    iter[0]; // returns 1.0f;
+ *    iter[1]; // returns 2.0f;
+ *    iter[2]; // returns 3.0f;
+ *    iter[3]; // returns 4.0f;
+ *
+ *    // iter[4] is an out-of-bounds error
+ *  }
+ *  \endcode
+ *
+ *  \see make_transform_iterator
+ */
+template <class AdaptableUnaryFunction, class Iterator, class Reference = use_default, class Value = use_default>
+  class transform_iterator
+    : public detail::transform_iterator_base<AdaptableUnaryFunction, Iterator, Reference, Value>::type
+{
+  /*! \cond
+   */
+  public:
+    typedef typename
+    detail::transform_iterator_base<AdaptableUnaryFunction, Iterator, Reference, Value>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  public:
+    /*! Null constructor does nothing.
+     */
+    __host__ __device__
+    transform_iterator() {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    transform_iterator(transform_iterator const&) = default;
+#endif
+
+    /*! This constructor takes as arguments an \c Iterator and an \c AdaptableUnaryFunction
+     *  and copies them to a new \p transform_iterator.
+     *
+     *  \param x An \c Iterator pointing to the input to this \p transform_iterator's \c AdaptableUnaryFunction.
+     *  \param f An \c AdaptableUnaryFunction used to transform the objects pointed to by \p x.
+     */
+    __host__ __device__
+    transform_iterator(Iterator const& x, AdaptableUnaryFunction f)
+      : super_t(x), m_f(f) {
+    }
+
+    /*! This explicit constructor copies the value of a given \c Iterator and creates
+     *  this \p transform_iterator's \c AdaptableUnaryFunction using its null constructor.
+     *
+     *  \param x An \c Iterator to copy.
+     */
+    __host__ __device__
+    explicit transform_iterator(Iterator const& x)
+      : super_t(x) { }
+
+    /*! This copy constructor creates a new \p transform_iterator from another
+     *  \p transform_iterator.
+     *
+     *  \param other The \p transform_iterator to copy.
+     */
+    template<typename OtherAdaptableUnaryFunction,
+             typename OtherIterator,
+             typename OtherReference,
+             typename OtherValue>
+    __host__ __device__
+    transform_iterator(const transform_iterator<OtherAdaptableUnaryFunction, OtherIterator, OtherReference, OtherValue> &other,
+                       typename thrust::detail::enable_if_convertible<OtherIterator, Iterator>::type* = 0,
+                       typename thrust::detail::enable_if_convertible<OtherAdaptableUnaryFunction, AdaptableUnaryFunction>::type* = 0)
+      : super_t(other.base()), m_f(other.functor()) {}
+
+    /*! Copy assignment operator copies from another \p transform_iterator.
+     *  \p other The other \p transform_iterator to copy
+     *  \return <tt>*this</tt>
+     *
+     *  \note If the type of this \p transform_iterator's functor is not copy assignable
+     *        (for example, if it is a lambda) it is not an error to call this function.
+     *        In this case, however, the functor will not be modified.
+     *
+     *        In any case, this \p transform_iterator's underlying iterator will be copy assigned.
+     */
+    __host__ __device__
+    transform_iterator &operator=(const transform_iterator &other)
+    {
+      return do_assign(other,
+      // XXX gcc 4.2.1 crashes on is_copy_assignable; just assume the functor is assignable as a WAR
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION <= 40201)
+          thrust::detail::true_type()
+#else
+          typename thrust::detail::is_copy_assignable<AdaptableUnaryFunction>::type()
+#endif // THRUST_HOST_COMPILER
+      );
+    }
+
+    /*! This method returns a copy of this \p transform_iterator's \c AdaptableUnaryFunction.
+     *  \return A copy of this \p transform_iterator's \c AdaptableUnaryFunction.
+     */
+    __host__ __device__
+    AdaptableUnaryFunction functor() const
+      { return m_f; }
+
+    /*! \cond
+     */
+  private:
+    __host__ __device__
+    transform_iterator &do_assign(const transform_iterator &other, thrust::detail::true_type)
+    {
+      super_t::operator=(other);
+
+      // do assign to m_f
+      m_f = other.functor();
+
+      return *this;
+    }
+
+    __host__ __device__
+    transform_iterator &do_assign(const transform_iterator &other, thrust::detail::false_type)
+    {
+      super_t::operator=(other);
+
+      // don't assign to m_f
+
+      return *this;
+    }
+
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      // Create a temporary to allow iterators with wrapped references to
+      // convert to their value type before calling m_f. Note that this
+      // disallows non-constant operations through m_f.
+      typename thrust::iterator_value<Iterator>::type x = *this->base();
+      return m_f(x);
+    }
+
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
+    // tag this as mutable per Dave Abrahams in this thread:
+    // http://lists.boost.org/Archives/boost/2004/05/65332.php
+    mutable AdaptableUnaryFunction m_f;
+
+    /*! \endcond
+     */
+}; // end transform_iterator
+
+
+/*! \p make_transform_iterator creates a \p transform_iterator
+ *  from an \c Iterator and \c AdaptableUnaryFunction.
+ *
+ *  \param it The \c Iterator pointing to the input range of the
+ *            newly created \p transform_iterator.
+ *  \param fun The \c AdaptableUnaryFunction used to transform the range pointed
+ *             to by \p it in the newly created \p transform_iterator.
+ *  \return A new \p transform_iterator which transforms the range at
+ *          \p it by \p fun.
+ *  \see transform_iterator
+ */
+template <class AdaptableUnaryFunction, class Iterator>
+inline __host__ __device__
+transform_iterator<AdaptableUnaryFunction, Iterator>
+make_transform_iterator(Iterator it, AdaptableUnaryFunction fun)
+{
+  return transform_iterator<AdaptableUnaryFunction, Iterator>(it, fun);
+} // end make_transform_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/transform_output_iterator.h b/thrust/thrust/iterator/transform_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c6683ae5c9b441d0c31d50d36fcabed60996b8e
--- /dev/null
+++ b/thrust/thrust/iterator/transform_output_iterator.h
@@ -0,0 +1,163 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/transform_output_iterator.h
+ *  \brief An output iterator which adapts another output iterator by applying a
+ *         function to the result of its dereference before writing it.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/transform_output_iterator.inl>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_output_iterator is a special kind of output iterator which
+ * transforms a value written upon dereference. This iterator is useful
+ * for transforming an output from algorithms without explicitly storing the
+ * intermediate result in the memory and applying subsequent transformation, 
+ * thereby avoiding wasting memory capacity and bandwidth.
+ * Using \p transform_iterator facilitates kernel fusion by deferring execution
+ * of transformation until the value is written while saving both memory
+ * capacity and bandwidth.
+ *
+ * The following code snippet demonstrated how to create a
+ * \p transform_output_iterator which applies \c sqrtf to the assigning value.
+ *
+ * \code
+ * #include <thrust/iterator/transform_output_iterator.h>
+ * #include <thrust/device_vector.h>
+ *
+ * // note: functor inherits form unary function
+ *  // note: functor inherits from unary_function
+ *  struct square_root : public thrust::unary_function<float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) const
+ *    {
+ *      return sqrtf(x);
+ *    }
+ *  };
+ *  
+ *  int main()
+ *  {
+ *    thrust::device_vector<float> v(4);
+ *
+ *    typedef thrust::device_vector<float>::iterator FloatIterator;
+ *    thrust::transform_output_iterator<square_root, FloatIterator> iter(v.begin(), square_root());
+ *
+ *    iter[0] =  1.0f;    // stores sqrtf( 1.0f) 
+ *    iter[1] =  4.0f;    // stores sqrtf( 4.0f)
+ *    iter[2] =  9.0f;    // stores sqrtf( 9.0f)
+ *    iter[3] = 16.0f;    // stores sqrtf(16.0f)
+ *    // iter[4] is an out-of-bounds error
+ *                                                                                           
+ *    v[0]; // returns 1.0f;
+ *    v[1]; // returns 2.0f;
+ *    v[2]; // returns 3.0f;
+ *    v[3]; // returns 4.0f;
+ *                                                                                           
+ *  }
+ *  \endcode
+ *
+ *  \see make_transform_output_iterator
+ */
+
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator
+    : public detail::transform_output_iterator_base<UnaryFunction, OutputIterator>::type
+{
+
+  /*! \cond
+   */
+
+  public:
+
+    typedef typename
+    detail::transform_output_iterator_base<UnaryFunction, OutputIterator>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  /*! This constructor takes as argument an \c OutputIterator and an \c
+   * UnaryFunction and copies them to a new \p transform_output_iterator
+   *
+   * \param out An \c OutputIterator pointing to the output range whereto the result of 
+   *            \p transform_output_iterator's \c UnaryFunction will be written.
+   * \param fun An \c UnaryFunction used to transform the objects assigned to
+   *            this \p transform_output_iterator.
+   */
+    __host__ __device__
+    transform_output_iterator(OutputIterator const& out, UnaryFunction fun) : super_t(out), fun(fun)
+    {
+    }
+
+    /*! \cond
+     */
+  private:
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return detail::transform_output_iterator_proxy<
+        UnaryFunction, OutputIterator
+      >(this->base_reference(), fun);
+    }
+
+    UnaryFunction fun;
+
+    /*! \endcond
+     */
+}; // end transform_output_iterator
+
+/*! \p make_transform_output_iterator creates a \p transform_output_iterator from
+ *  an \c OutputIterator and \c UnaryFunction.
+ *
+ *  \param out The \c OutputIterator pointing to the output range of the newly
+ *            created \p transform_output_iterator
+ *  \param fun The \c UnaryFunction transform the object before assigning it to
+ *            \c out by the newly created \p transform_output_iterator
+ *  \see transform_output_iterator
+ */
+template <typename UnaryFunction, typename OutputIterator>
+transform_output_iterator<UnaryFunction, OutputIterator>
+__host__ __device__
+make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
+{
+    return transform_output_iterator<UnaryFunction, OutputIterator>(out, fun);
+} // end make_transform_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/iterator/zip_iterator.h b/thrust/thrust/iterator/zip_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b86d06d513253c5c89dd1d88ef508bbc2a3684f
--- /dev/null
+++ b/thrust/thrust/iterator/zip_iterator.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/iterator/zip_iterator.h
+ *  \brief An iterator which returns a tuple of the result of dereferencing
+ *         a tuple of iterators when dereferenced
+ */
+
+/*
+ * Copyright David Abrahams and Thomas Becker 2000-2006.
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/zip_iterator_base.h>
+#include <thrust/iterator/iterator_facade.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p zip_iterator is an iterator which represents a pointer into a range
+ *  of \p tuples whose elements are themselves taken from a \p tuple of input
+ *  iterators. This iterator is useful for creating a virtual array of structures
+ *  while achieving the same performance and bandwidth as the structure of arrays
+ *  idiom. \p zip_iterator also facilitates kernel fusion by providing a convenient
+ *  means of amortizing the execution of the same operation over multiple ranges.
+ *
+ *  The following code snippet demonstrates how to create a \p zip_iterator
+ *  which represents the result of "zipping" multiple ranges together.
+ *  
+ *  \code
+ *  #include <thrust/iterator/zip_iterator.h>
+ *  #include <thrust/tuple.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> int_v(3);
+ *  int_v[0] = 0; int_v[1] = 1; int_v[2] = 2;
+ *
+ *  thrust::device_vector<float> float_v(3);
+ *  float_v[0] = 0.0f; float_v[1] = 1.0f; float_v[2] = 2.0f;
+ *
+ *  thrust::device_vector<char> char_v(3);
+ *  char_v[0] = 'a'; char_v[1] = 'b'; char_v[2] = 'c';
+ *
+ *  // typedef these iterators for shorthand
+ *  typedef thrust::device_vector<int>::iterator   IntIterator;
+ *  typedef thrust::device_vector<float>::iterator FloatIterator;
+ *  typedef thrust::device_vector<char>::iterator  CharIterator;
+ *
+ *  // typedef a tuple of these iterators
+ *  typedef thrust::tuple<IntIterator, FloatIterator, CharIterator> IteratorTuple;
+ *
+ *  // typedef the zip_iterator of this tuple
+ *  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+ *
+ *  // finally, create the zip_iterator
+ *  ZipIterator iter(thrust::make_tuple(int_v.begin(), float_v.begin(), char_v.begin()));
+ *
+ *  *iter;   // returns (0, 0.0f, 'a')
+ *  iter[0]; // returns (0, 0.0f, 'a')
+ *  iter[1]; // returns (1, 1.0f, 'b')
+ *  iter[2]; // returns (2, 2.0f, 'c')
+ *
+ *  thrust::get<0>(iter[2]); // returns 2
+ *  thrust::get<1>(iter[0]); // returns 0.0f
+ *  thrust::get<2>(iter[1]); // returns 'b'
+ *
+ *  // iter[3] is an out-of-bounds error
+ *  \endcode
+ *
+ *  Defining the type of a \p zip_iterator can be complex. The next code example demonstrates
+ *  how to use the \p make_zip_iterator function with the \p make_tuple function to avoid
+ *  explicitly specifying the type of the \p zip_iterator. This example shows how to use
+ *  \p zip_iterator to copy multiple ranges with a single call to \p thrust::copy.
+ *
+ *  \code
+ *  #include <thrust/zip_iterator.h>
+ *  #include <thrust/tuple.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    thrust::device_vector<int> int_in(3), int_out(3);
+ *    int_in[0] = 0;
+ *    int_in[1] = 1;
+ *    int_in[2] = 2;
+ *
+ *    thrust::device_vector<float> float_in(3), float_out(3);
+ *    float_in[0] =  0.0f;
+ *    float_in[1] = 10.0f;
+ *    float_in[2] = 20.0f;
+ *
+ *    thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(int_in.begin(), float_in.begin())),
+ *                 thrust::make_zip_iterator(thrust::make_tuple(int_in.end(),   float_in.end())),
+ *                 thrust::make_zip_iterator(thrust::make_tuple(int_out.begin(),float_out.begin())));
+ *
+ *    // int_out is now [0, 1, 2]
+ *    // float_out is now [0.0f, 10.0f, 20.0f]
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_zip_iterator
+ *  \see make_tuple
+ *  \see tuple
+ *  \see get
+ */
+template <typename IteratorTuple>
+  class zip_iterator
+    : public detail::zip_iterator_base<IteratorTuple>::type
+{
+  public:
+    /*! Null constructor does nothing.
+     */
+    inline __host__ __device__
+    zip_iterator();
+
+    /*! This constructor creates a new \p zip_iterator from a
+     *  \p tuple of iterators.
+     *  
+     *  \param iterator_tuple The \p tuple of iterators to copy from.
+     */
+    inline __host__ __device__
+    zip_iterator(IteratorTuple iterator_tuple);
+
+    /*! This copy constructor creates a new \p zip_iterator from another
+     *  \p zip_iterator.
+     *
+     *  \param other The \p zip_iterator to copy.
+     */
+    template<typename OtherIteratorTuple>
+    inline __host__ __device__
+    zip_iterator(const zip_iterator<OtherIteratorTuple> &other,
+                 typename thrust::detail::enable_if_convertible<
+                   OtherIteratorTuple,
+                   IteratorTuple
+                 >::type * = 0);
+
+    /*! This method returns a \c const reference to this \p zip_iterator's
+     *  \p tuple of iterators.
+     *
+     *  \return A \c const reference to this \p zip_iterator's \p tuple
+     *          of iterators.
+     */
+    inline __host__ __device__
+    const IteratorTuple &get_iterator_tuple() const;
+
+    /*! \cond
+     */
+  private:
+    typedef typename
+    detail::zip_iterator_base<IteratorTuple>::type super_t;
+
+    friend class thrust::iterator_core_access;
+
+    // Dereferencing returns a tuple built from the dereferenced
+    // iterators in the iterator tuple.
+    __host__ __device__
+    typename super_t::reference dereference() const;
+
+    // Two zip_iterators are equal if the two first iterators of the
+    // tuple are equal. Note this differs from Boost's implementation, which
+    // considers the entire tuple.
+    template<typename OtherIteratorTuple>
+    inline __host__ __device__
+    bool equal(const zip_iterator<OtherIteratorTuple> &other) const;
+
+    // Advancing a zip_iterator means to advance all iterators in the tuple
+    inline __host__ __device__
+    void advance(typename super_t::difference_type n);
+
+    // Incrementing a zip iterator means to increment all iterators in the tuple
+    inline __host__ __device__
+    void increment();
+
+    // Decrementing a zip iterator means to decrement all iterators in the tuple
+    inline __host__ __device__
+    void decrement();
+
+    // Distance is calculated using the first iterator in the tuple.
+    template<typename OtherIteratorTuple>
+    inline __host__ __device__
+      typename super_t::difference_type
+        distance_to(const zip_iterator<OtherIteratorTuple> &other) const;
+
+    // The iterator tuple.
+    IteratorTuple m_iterator_tuple;
+
+    /*! \endcond
+     */
+}; // end zip_iterator
+
+/*! \p make_zip_iterator creates a \p zip_iterator from a \p tuple
+ *  of iterators.
+ *
+ *  \param t The \p tuple of iterators to copy.
+ *  \return A newly created \p zip_iterator which zips the iterators encapsulated in \p t.
+ *
+ *  \see zip_iterator
+ */
+template<typename IteratorTuple>
+inline __host__ __device__
+zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t);
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
+#include <thrust/iterator/detail/zip_iterator.inl>
+
diff --git a/thrust/thrust/limits.h b/thrust/thrust/limits.h
new file mode 100644
index 0000000000000000000000000000000000000000..f83dde9c370a73dff878a21a0ca919c8d0859f18
--- /dev/null
+++ b/thrust/thrust/limits.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <limits>
+
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+template <typename T>
+struct numeric_limits : std::numeric_limits<T> {};
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/logical.h b/thrust/thrust/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce212721962b31208926da6fd32f80cb54414d32
--- /dev/null
+++ b/thrust/thrust/logical.h
@@ -0,0 +1,279 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file logical.h
+ *  \brief Logical operations on ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup logical
+ *  \ingroup reductions
+ *  \{
+ */
+
+
+/*! \p all_of determines whether all elements in a range satify a predicate.
+ *  Specifically, \p all_of returns \c true if <tt>pred(*i)</tt> is \c true
+ *  for every iterator \c i in the range <tt>[first, last)</tt> and 
+ *  \c false otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::all_of(thrust::host, A, A + 2, thrust::identity<bool>()); // returns true
+ *  thrust::all_of(thrust::host, A, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  // empty range
+ *  thrust::all_of(thrust::host, A, A, thrust::identity<bool>()); // returns false
+ *  
+ *  \endcode
+ *
+ *  \see any_of
+ *  \see none_of
+ *  \see transform_reduce
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool all_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \p all_of determines whether all elements in a range satify a predicate.
+ * Specifically, \p all_of returns \c true if <tt>pred(*i)</tt> is \c true
+ * for every iterator \c i in the range <tt>[first, last)</tt> and 
+ * \c false otherwise.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::all_of(A, A + 2, thrust::identity<bool>()); // returns true
+ *  thrust::all_of(A, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  // empty range
+ *  thrust::all_of(A, A, thrust::identity<bool>()); // returns false
+ *  
+ *  \endcode
+ *
+ *  \see any_of
+ *  \see none_of
+ *  \see transform_reduce
+ */
+template<typename InputIterator, typename Predicate>
+bool all_of(InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \p any_of determines whether any element in a range satifies a predicate.
+ *  Specifically, \p any_of returns \c true if <tt>pred(*i)</tt> is \c true
+ *  for any iterator \c i in the range <tt>[first, last)</tt> and 
+ *  \c false otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::any_of(thrust::host, A, A + 2, thrust::identity<bool>()); // returns true
+ *  thrust::any_of(thrust::host, A, A + 3, thrust::identity<bool>()); // returns true
+ *
+ *  thrust::any_of(thrust::host, A + 2, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  // empty range
+ *  thrust::any_of(thrust::host, A, A, thrust::identity<bool>()); // returns false
+ *  \endcode
+ *
+ *  \see all_of
+ *  \see none_of
+ *  \see transform_reduce
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool any_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred);
+   
+
+/*! \p any_of determines whether any element in a range satifies a predicate.
+ * Specifically, \p any_of returns \c true if <tt>pred(*i)</tt> is \c true
+ * for any iterator \c i in the range <tt>[first, last)</tt> and 
+ * \c false otherwise.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::any_of(A, A + 2, thrust::identity<bool>()); // returns true
+ *  thrust::any_of(A, A + 3, thrust::identity<bool>()); // returns true
+ *
+ *  thrust::any_of(A + 2, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  // empty range
+ *  thrust::any_of(A, A, thrust::identity<bool>()); // returns false
+ *  \endcode
+ *
+ *  \see all_of
+ *  \see none_of
+ *  \see transform_reduce
+ */
+template<typename InputIterator, typename Predicate>
+bool any_of(InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \p none_of determines whether no element in a range satifies a predicate.
+ *  Specifically, \p none_of returns \c true if there is no iterator \c i in 
+ *  the range <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c true,
+ *  and \c false otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::none_of(thrust::host, A, A + 2, thrust::identity<bool>()); // returns false
+ *  thrust::none_of(thrust::host, A, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  thrust::none_of(thrust::host, A + 2, A + 3, thrust::identity<bool>()); // returns true
+ *
+ *  // empty range
+ *  thrust::none_of(thrust::host, A, A, thrust::identity<bool>()); // returns true
+ *  \endcode
+ *
+ *  \see all_of
+ *  \see any_of
+ *  \see transform_reduce
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool none_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \p none_of determines whether no element in a range satifies a predicate.
+ *  Specifically, \p none_of returns \c true if there is no iterator \c i in 
+ *  the range <tt>[first, last)</tt> such that <tt>pred(*i)</tt> is \c true,
+ *  and \c false otherwise.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param pred A predicate used to test range elements.
+ *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \code
+ *  #include <thrust/logical.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  bool A[3] = {true, true, false};
+ *
+ *  thrust::none_of(A, A + 2, thrust::identity<bool>()); // returns false
+ *  thrust::none_of(A, A + 3, thrust::identity<bool>()); // returns false
+ *
+ *  thrust::none_of(A + 2, A + 3, thrust::identity<bool>()); // returns true
+ *
+ *  // empty range
+ *  thrust::none_of(A, A, thrust::identity<bool>()); // returns true
+ *  \endcode
+ *
+ *  \see all_of
+ *  \see any_of
+ *  \see transform_reduce
+ */
+template<typename InputIterator, typename Predicate>
+bool none_of(InputIterator first, InputIterator last, Predicate pred);
+
+
+/*! \} // end logical
+ *  \} // end reductions
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/logical.inl>
+
diff --git a/thrust/thrust/memory.h b/thrust/thrust/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ef8833f57a0c00187732a93df515196a05a1491
--- /dev/null
+++ b/thrust/thrust/memory.h
@@ -0,0 +1,547 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/memory.h
+ *  \brief Abstractions for Thrust's memory model.
+ */
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/malloc_and_free.h>
+#include <thrust/detail/temporary_buffer.h>
+
+namespace thrust
+{
+
+/*! \defgroup memory_management Memory Management
+ *
+ *  All Thrust functionalities related to memory allocation and deallocation.
+ *
+ */
+
+/** \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+// define pointer for the purpose of Doxygenating it
+// it is actually defined elsewhere
+#if 0
+/*! \p pointer stores a pointer to an object allocated in memory. Like \p device_ptr, this
+ *  type ensures type safety when dispatching standard algorithms on ranges resident in memory.
+ *
+ *  \p pointer generalizes \p device_ptr by relaxing the backend system associated with the \p pointer.
+ *  Instead of the backend system specified by \p THRUST_DEFAULT_DEVICE_BACKEND, \p pointer's
+ *  system is given by its second template parameter, \p Tag. For the purpose of Thrust dispatch,
+ *  <tt>device_ptr<Element></tt> and <tt>pointer<Element,device_system_tag></tt> are considered equivalent.
+ *
+ *  The raw pointer encapsulated by a \p pointer may be obtained through its <tt>get</tt> member function
+ *  or the \p raw_pointer_cast free function.
+ *
+ *  \tparam Element specifies the type of the pointed-to object.
+ *
+ *  \tparam Tag specifies the system with which this \p pointer is associated. This may be any Thrust
+ *          backend system, or a user-defined tag.
+ *
+ *  \tparam Reference allows the client to specify the reference type returned upon derereference.
+ *          By default, this type is <tt>reference<Element,pointer></tt>.
+ *
+ *  \tparam Derived allows the client to specify the name of the derived type when \p pointer is used as
+ *          a base class. This is useful to ensure that arithmetic on values of the derived type return
+ *          values of the derived type as a result. By default, this type is <tt>pointer<Element,Tag,Reference></tt>.
+ *
+ *  \note \p pointer is not a smart pointer; it is the client's responsibility to deallocate memory
+ *        pointer to by \p pointer.
+ *
+ *  \see device_ptr
+ *  \see reference
+ *  \see raw_pointer_cast
+ */
+template<typename Element, typename Tag, typename Reference = thrust::use_default, typename Derived = thrust::use_default>
+  class pointer
+{
+  public:
+    /*! The type of the raw pointer
+     */
+    typedef typename super_t::base_type raw_pointer;
+    
+    /*! \p pointer's default constructor initializes its encapsulated pointer to \c 0
+     */
+    __host__ __device__
+    pointer();
+
+    /*! This constructor allows construction of a <tt>pointer<const T, ...></tt> from a <tt>T*</tt>.
+     *
+     *  \param ptr A raw pointer to copy from, presumed to point to a location in \p Tag's memory.
+     *  \tparam OtherElement \p OtherElement shall be convertible to \p Element.
+     */
+    template<typename OtherElement>
+    __host__ __device__
+    explicit pointer(OtherElement *ptr);
+
+    /*! This contructor allows initialization from another pointer-like object.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *
+     *  \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag,
+     *                       and its element type shall be convertible to \p Element.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer<Element,Tag,Reference,Derived>
+            >::type * = 0);
+
+    /*! Assignment operator allows assigning from another pointer-like object with related type.
+     *
+     *  \param other The other pointer-like object to assign from.
+     *  \return <tt>*this</tt>
+     *
+     *  \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag,
+     *                       and its element type shall be convertible to \p Element.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer,
+      derived_type &
+    >::type
+    operator=(const OtherPointer &other);
+
+    /*! \p get returns this \p pointer's encapsulated raw pointer.
+     *  \return This \p pointer's raw pointer.
+     */
+    __host__ __device__
+    Element *get() const;
+};
+#endif
+
+// define pointer for the purpose of Doxygenating it
+// it is actually defined elsewhere
+#if 0
+/*! \p reference is a wrapped reference to an object stored in memory. \p reference generalizes
+ *  \p device_reference by relaxing the type of pointer associated with the object. \p reference
+ *  is the type of the result of dereferencing a tagged pointer-like object such as \p pointer, and
+ *  intermediates operations on objects existing in a remote memory.
+ *
+ *  \tparam Element specifies the type of the referent object.
+ *  \tparam Pointer specifies the type of the result of taking the address of \p reference.
+ *  \tparam Derived allows the client to specify the name of the derived type when \p reference is used as
+ *          a base class. This is useful to ensure that assignment to objects of the derived type return
+ *          values of the derived type as a result. By default, this type is <tt>reference<Element,Pointer></tt>.
+ */
+template<typename Element, typename Pointer, typename Derived = thrust::use_default>
+  class reference
+{
+  public:
+    /*! The type of this \p reference's wrapped pointers.
+     */
+    typedef Pointer                                              pointer;
+
+    /*! The \p value_type of this \p reference.
+     */
+    typedef typename thrust::detail::remove_const<Element>::type value_type;
+
+    /*! This copy constructor initializes this \p reference
+     *  to refer to an object pointed to by the given \p pointer. After
+     *  this \p reference is constructed, it shall refer to the
+     *  object pointed to by \p ptr.
+     *
+     *  \param ptr A \p pointer to copy from.
+     */
+    __host__ __device__
+    explicit reference(const pointer &ptr);
+
+    /*! This copy constructor accepts a const reference to another
+     *  \p reference of related type. After this \p reference is constructed,
+     *  it shall refer to the same object as \p other.
+     *  
+     *  \param other A \p reference to copy from.
+     *  \tparam OtherElement the element type of the other \p reference.
+     *  \tparam OtherPointer the pointer type of the other \p reference.
+     *  \tparam OtherDerived the derived type of the other \p reference.
+     *
+     *  \note This constructor is templated primarily to allow initialization of 
+     *  <tt>reference<const T,...></tt> from <tt>reference<T,...></tt>.
+     */
+    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
+              typename thrust::detail::enable_if_convertible<
+                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
+                pointer
+              >::type * = 0);
+
+    /*! Copy assignment operator copy assigns from another \p reference.
+     *
+     *  \param other The other \p reference to assign from.
+     *  \return <tt>static_cast<derived_type&>(*this)</tt>
+     */
+    __host__ __device__
+    derived_type &operator=(const reference &other);
+
+    /*! Assignment operator copy assigns from another \p reference of related type.
+     *
+     *  \param other The other \p reference to assign from.
+     *  \return <tt>static_cast<derived_type&>(*this)</tt>
+     *
+     *  \tparam OtherElement the element type of the other \p reference.
+     *  \tparam OtherPointer the pointer type of the other \p reference.
+     *  \tparam OtherDerived the derived type of the other \p reference.
+     */
+    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
+    __host__ __device__
+    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
+
+    /*! Assignment operator assigns from a \p value_type.
+     *
+     *  \param x The \p value_type to assign from.
+     *  \return <tt>static_cast<derived_type&>(*this)</tt>.
+     */
+    __host__ __device__
+    derived_type &operator=(const value_type &x);
+
+    /*! Address-of operator returns a \p pointer pointing to the object
+     *  referenced by this \p reference. It does not return the address of this
+     *  \p reference.
+     *
+     *  \return A \p pointer pointing to the referenct object.
+     */
+    __host__ __device__
+    pointer operator&() const;
+
+    /*! Conversion operator converts this \p reference to \p value_type by
+     *  returning a copy of the referent object.
+     *  
+     *  \return A copy of the referent object.
+     */
+    __host__ __device__
+    operator value_type () const;
+
+    /*! Swaps the value of the referent object with another.
+     *
+     *  \param other The other \p reference with which to swap.
+     *  \note The argument is of type \p derived_type rather than \p reference.
+     */
+    __host__ __device__
+    void swap(derived_type &other);
+
+    /*! Prefix increment operator increments the referent object.
+     *
+     *  \return <tt>static_Cast<derived_type&>(*this)</tt>.
+     *
+     *  \note Documentation for other arithmetic operators omitted for brevity.
+     */
+    derived_type &operator++();
+};
+#endif
+
+/*! \}
+ */
+
+/*!
+ *  \addtogroup memory_management_functions Memory Management Functions
+ *  \ingroup memory_management
+ *  \{
+ */
+
+
+/*! \addtogroup allocation_functions
+ *  \{
+ */
+
+
+/*! This version of \p malloc allocates untyped uninitialized storage associated with a given system.
+ *
+ *  \param system The Thrust system with which to associate the storage.
+ *  \param n The number of bytes of storage to allocate.
+ *  \return If allocation succeeds, a pointer to the allocated storage; a null pointer otherwise.
+ *          The pointer must be deallocated with \p thrust::free.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *
+ *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *
+ *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
+ *  associated with Thrust's device system.
+ *
+ *  \code
+ *  #include <thrust/memory.h>
+ *  ...
+ *  // allocate some memory with thrust::malloc
+ *  const int N = 100;
+ *  thrust::device_system_tag device_sys;
+ *  thrust::pointer<void,thrust::device_space_tag> void_ptr = thrust::malloc(device_sys, N);
+ *
+ *  // manipulate memory
+ *  ...
+ *
+ *  // deallocate void_ptr with thrust::free
+ *  thrust::free(device_sys, void_ptr);
+ *  \endcode
+ *
+ *  \see free
+ *  \see device_malloc
+ */
+template<typename DerivedPolicy>
+__host__ __device__
+pointer<void,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<DerivedPolicy> &system, std::size_t n);
+
+
+/*! This version of \p malloc allocates typed uninitialized storage associated with a given system.
+ *
+ *  \param system The Thrust system with which to associate the storage.
+ *  \param n The number of elements of type \c T which the storage should accomodate.
+ *  \return If allocation succeeds, a pointer to an allocation large enough to accomodate \c n
+ *          elements of type \c T; a null pointer otherwise.
+ *          The pointer must be deallocated with \p thrust::free.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *
+ *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *
+ *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
+ *  to accomodate integers associated with Thrust's device system.
+ *
+ *  \code
+ *  #include <thrust/memory.h>
+ *  ...
+ *  // allocate storage for 100 ints with thrust::malloc
+ *  const int N = 100;
+ *  thrust::device_system_tag device_sys;
+ *  thrust::pointer<int,thrust::device_system_tag> ptr = thrust::malloc<int>(device_sys, N);
+ *
+ *  // manipulate memory
+ *  ...
+ *
+ *  // deallocate ptr with thrust::free
+ *  thrust::free(device_sys, ptr);
+ *  \endcode
+ *
+ *  \see free
+ *  \see device_malloc
+ */
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<DerivedPolicy> &system, std::size_t n);
+
+
+/*! \p get_temporary_buffer returns a pointer to storage associated with a given Thrust system sufficient to store up to
+ *  \p n objects of type \c T. If not enough storage is available to accomodate \p n objects, an implementation may return
+ *  a smaller buffer. The number of objects the returned buffer can accomodate is also returned.
+ *
+ *  Thrust uses \p get_temporary_buffer internally when allocating temporary storage required by algorithm implementations.
+ *
+ *  The storage allocated with \p get_temporary_buffer must be returned to the system with \p return_temporary_buffer.
+ *
+ *  \param system The Thrust system with which to associate the storage.
+ *  \param n The requested number of objects of type \c T the storage should accomodate.
+ *  \return A pair \c p such that <tt>p.first</tt> is a pointer to the allocated storage and <tt>p.second</tt> is the number of
+ *          contiguous objects of type \c T that the storage can accomodate. If no storage can be allocated, <tt>p.first</tt> if
+ *          no storage can be obtained. The storage must be returned to the system using \p return_temporary_buffer.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *
+ *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *
+ *  The following code snippet demonstrates how to use \p get_temporary_buffer to allocate a range of memory
+ *  to accomodate integers associated with Thrust's device system.
+ *
+ *  \code
+ *  #include <thrust/memory.h>
+ *  ...
+ *  // allocate storage for 100 ints with thrust::get_temporary_buffer
+ *  const int N = 100;
+ *
+ *  typedef thrust::pair<
+ *    thrust::pointer<int,thrust::device_system_tag>,
+ *    std::ptrdiff_t
+ *  > ptr_and_size_t;
+ *
+ *  thrust::device_system_tag device_sys;
+ *  ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer<int>(device_sys, N);
+ *
+ *  // manipulate up to 100 ints
+ *  for(int i = 0; i < ptr_and_size.second; ++i)
+ *  {
+ *    *ptr_and_size.first = i;
+ *  }
+ *
+ *  // deallocate storage with thrust::return_temporary_buffer
+ *  thrust::return_temporary_buffer(device_sys, ptr_and_size.first);
+ *  \endcode
+ *
+ *  \see malloc
+ *  \see return_temporary_buffer
+ */
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
+get_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
+
+
+/*! \} allocation_functions
+ */
+
+
+/*! \addtogroup deallocation_functions
+ *  \{
+ */
+
+
+/*! \p free deallocates the storage previously allocated by \p thrust::malloc.
+ *
+ *  \param system The Thrust system with which the storage is associated.
+ *  \param ptr A pointer previously returned by \p thrust::malloc. If \p ptr is null, \p free
+ *         does nothing.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *
+ *  \pre \p ptr shall have been returned by a previous call to <tt>thrust::malloc(system, n)</tt> or <tt>thrust::malloc<T>(system, n)</tt> for some type \c T.
+ *
+ *  The following code snippet demonstrates how to use \p free to deallocate a range of memory
+ *  previously allocated with \p thrust::malloc.
+ *
+ *  \code
+ *  #include <thrust/memory.h>
+ *  ...
+ *  // allocate storage for 100 ints with thrust::malloc
+ *  const int N = 100;
+ *  thrust::device_system_tag device_sys;
+ *  thrust::pointer<int,thrust::device_system_tag> ptr = thrust::malloc<int>(device_sys, N);
+ *
+ *  // mainpulate memory
+ *  ...
+ *
+ *  // deallocate ptr with thrust::free
+ *  thrust::free(device_sys, ptr);
+ *  \endcode
+ */
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void free(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer ptr);
+
+
+/*! \p return_temporary_buffer deallocates storage associated with a given Thrust system previously allocated by \p get_temporary_buffer.
+ *
+ *  Thrust uses \p return_temporary_buffer internally when deallocating temporary storage required by algorithm implementations.
+ *
+ *  \param system The Thrust system with which the storage is associated.
+ *  \param p A pointer previously returned by \p thrust::get_temporary_buffer. If \p ptr is null, \p return_temporary_buffer does nothing.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *
+ *  \pre \p p shall have been previously allocated by \p thrust::get_temporary_buffer.
+ *
+ *  The following code snippet demonstrates how to use \p return_temporary_buffer to deallocate a range of memory
+ *  previously allocated by \p get_temporary_buffer.
+ *
+ *  \code
+ *  #include <thrust/memory.h>
+ *  ...
+ *  // allocate storage for 100 ints with thrust::get_temporary_buffer
+ *  const int N = 100;
+ *
+ *  typedef thrust::pair<
+ *    thrust::pointer<int,thrust::device_system_tag>,
+ *    std::ptrdiff_t
+ *  > ptr_and_size_t;
+ *
+ *  thrust::device_system_tag device_sys;
+ *  ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer<int>(device_sys, N);
+ *
+ *  // manipulate up to 100 ints
+ *  for(int i = 0; i < ptr_and_size.second; ++i)
+ *  {
+ *    *ptr_and_size.first = i;
+ *  }
+ *
+ *  // deallocate storage with thrust::return_temporary_buffer
+ *  thrust::return_temporary_buffer(device_sys, ptr_and_size.first);
+ *  \endcode
+ *
+ *  \see free
+ *  \see get_temporary_buffer
+ */
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer p, std::ptrdiff_t n);
+
+
+/*! \} deallocation_functions
+ */
+
+
+/*! \p raw_pointer_cast creates a "raw" pointer from a pointer-like type,
+ *  simply returning the wrapped pointer, should it exist.
+ *
+ *  \param ptr The pointer of interest.
+ *  \return <tt>ptr.get()</tt>, if the expression is well formed; <tt>ptr</tt>, otherwise.
+ *  \see raw_reference_cast
+ */
+template<typename Pointer>
+__host__ __device__
+typename thrust::detail::pointer_traits<Pointer>::raw_pointer
+  raw_pointer_cast(Pointer ptr);
+
+
+/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type,
+ *  simply returning the underlying reference, should it exist.
+ *
+ *  If the argument is not a reference wrapper, the result is a reference to the argument.
+ *
+ *  \param ref The reference of interest.
+ *  \return <tt>*thrust::raw_pointer_cast(&ref)</tt>.
+ *  \note There are two versions of \p raw_reference_cast. One for <tt>const</tt> references,
+ *        and one for non-<tt>const</tt>.
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<T>::type
+  raw_reference_cast(T &ref);
+
+
+/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type,
+ *  simply returning the underlying reference, should it exist.
+ *
+ *  If the argument is not a reference wrapper, the result is a reference to the argument.
+ *
+ *  \param ref The reference of interest.
+ *  \return <tt>*thrust::raw_pointer_cast(&ref)</tt>.
+ *  \note There are two versions of \p raw_reference_cast. One for <tt>const</tt> references,
+ *        and one for non-<tt>const</tt>.
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+__host__ __device__
+typename detail::raw_reference<const T>::type
+  raw_reference_cast(const T &ref);
+
+
+/*! \}
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/memory/detail/device_system_resource.h b/thrust/thrust/memory/detail/device_system_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e94991d6124c42702ce44795c100d38a1016fe1
--- /dev/null
+++ b/thrust/thrust/memory/detail/device_system_resource.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the device system's memory_resource header
+#define __THRUST_DEVICE_SYSTEM_MEMORY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/memory_resource.h>
+#include __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+
+namespace thrust
+{
+
+
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::memory_resource
+    device_memory_resource;
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_memory_resource
+    universal_memory_resource;
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_host_pinned_memory_resource
+    universal_host_pinned_memory_resource;
+
+
+} // end thrust
+
diff --git a/thrust/thrust/memory/detail/host_system_resource.h b/thrust/thrust/memory/detail/host_system_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..ded1c4d0bfac5efed867743b5e1a1ad70e736cb3
--- /dev/null
+++ b/thrust/thrust/memory/detail/host_system_resource.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the host system's memory_resource header
+#define __THRUST_HOST_SYSTEM_MEMORY_HEADER <__THRUST_HOST_SYSTEM_ROOT/memory_resource.h>
+#include __THRUST_HOST_SYSTEM_MEMORY_HEADER
+#undef __THRUST_HOST_SYSTEM_MEMORY_HEADER
+
+namespace thrust
+{
+
+typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::memory_resource
+    host_memory_resource;
+
+} // end thrust
+
diff --git a/thrust/thrust/merge.h b/thrust/thrust/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..184141f6f626b1d667564867e3d1ce045fc65d19
--- /dev/null
+++ b/thrust/thrust/merge.h
@@ -0,0 +1,680 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file merge.h
+ *  \brief Merging sorted ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup merging Merging
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p merge combines two sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>
+ *  into a single sorted range. That is, it copies from <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt> into <tt>[result, result + (last1 - first1) + (last2 - first2))</tt>
+ *  such that the resulting range is in ascending order. \p merge is stable, meaning both that the
+ *  relative order of elements within each input range is preserved, and that for equivalent elements
+ *  in both input ranges the element from the first range precedes the element from the second. The
+ *  return value is <tt>result + (last1 - first1) + (last2 - first2)</tt>.
+ *
+ *  This version of \p merge compares elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the merged output.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge to compute the merger of two sorted sets of integers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {1, 3, 5, 7, 9, 11};
+ *  int A2[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int result[13];
+ *
+ *  int *result_end =
+ *    thrust::merge(thrust::host,
+ *                  A1, A1 + 6,
+ *                  A2, A2 + 7,
+ *                  result);
+ *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see \p set_union
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator merge(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result);
+
+
+/*! \p merge combines two sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>
+ *  into a single sorted range. That is, it copies from <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt> into <tt>[result, result + (last1 - first1) + (last2 - first2))</tt>
+ *  such that the resulting range is in ascending order. \p merge is stable, meaning both that the
+ *  relative order of elements within each input range is preserved, and that for equivalent elements
+ *  in both input ranges the element from the first range precedes the element from the second. The
+ *  return value is <tt>result + (last1 - first1) + (last2 - first2)</tt>.
+ *
+ *  This version of \p merge compares elements using \c operator<.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the merged output.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge to compute the merger of two sorted sets of integers.
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  ...
+ *  int A1[6] = {1, 3, 5, 7, 9, 11};
+ *  int A2[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int result[13];
+ *
+ *  int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result);
+ *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see \p set_union
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator merge(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result);
+
+
+/*! \p merge combines two sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>
+ *  into a single sorted range. That is, it copies from <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt> into <tt>[result, result + (last1 - first1) + (last2 - first2))</tt>
+ *  such that the resulting range is in ascending order. \p merge is stable, meaning both that the
+ *  relative order of elements within each input range is preserved, and that for equivalent elements
+ *  in both input ranges the element from the first range precedes the element from the second. The
+ *  return value is <tt>result + (last1 - first1) + (last2 - first2)</tt>.
+ *
+ *  This version of \p merge compares elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the merged output.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge to compute the merger of two sets of integers sorted in
+ *  descending order using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {11, 9, 7, 5, 3, 1};
+ *  int A2[7] = {13, 8, 5, 3, 2, 1, 1};
+ *
+ *  int result[13];
+ *
+ *  int *result_end = thrust::merge(thrust::host,
+ *                                  A1, A1 + 6,
+ *                                  A2, A2 + 7,
+ *                                  result,
+ *                                  thrust::greater<int>());
+ *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator merge(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result,
+                       StrictWeakCompare comp);
+
+
+/*! \p merge combines two sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>
+ *  into a single sorted range. That is, it copies from <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt> into <tt>[result, result + (last1 - first1) + (last2 - first2))</tt>
+ *  such that the resulting range is in ascending order. \p merge is stable, meaning both that the
+ *  relative order of elements within each input range is preserved, and that for equivalent elements
+ *  in both input ranges the element from the first range precedes the element from the second. The
+ *  return value is <tt>result + (last1 - first1) + (last2 - first2)</tt>.
+ *
+ *  This version of \p merge compares elements using a function object \p comp.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the merged output.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge to compute the merger of two sets of integers sorted in
+ *  descending order.
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A1[6] = {11, 9, 7, 5, 3, 1};
+ *  int A2[7] = {13, 8, 5, 3, 2, 1, 1};
+ *
+ *  int result[13];
+ *
+ *  int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result, thrust::greater<int>());
+ *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+  OutputIterator merge(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result,
+                       StrictWeakCompare comp);
+
+
+/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from
+ *  <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> into a single range,
+ *  <tt>[keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending key order.
+ *
+ *  At the same time, \p merge_by_key copies elements from the two associated ranges <tt>[values_first1 + (keys_last1 - keys_first1))</tt>
+ *  and <tt>[values_first2 + (keys_last2 - keys_first2))</tt> into a single range,
+ *  <tt>[values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending order implied by each input element's associated key.
+ *
+ *  \p merge_by_key is stable, meaning both that the relative order of elements within each input range is
+ *  preserved, and that for equivalent elements in all input key ranges the element from the first range
+ *  precedes the element from the second.
+ *
+ *  The return value is is <tt>(keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>
+ *  and <tt>(values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the merged output range of keys.
+ *  \param values_result The beginning of the merged output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge_by_key to compute the merger of two sets of integers sorted in
+ *  ascending order using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {1, 3, 5, 7, 9, 11};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[7] = {1, 1, 2, 3, 5, 8, 13};
+ *  int B_vals[7] = {1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int keys_result[13];
+ *  int vals_result[13];
+ *
+ *  thrust::pair<int*,int*> end =
+ *    thrust::merge_by_key(thrust::host,
+ *                         A_keys, A_keys + 6,
+ *                         B_keys, B_keys + 7,
+ *                         A_vals, B_vals,
+ *                         keys_result, vals_result);
+ *
+ *  // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
+ *  // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,  0,  1}
+ *  \endcode
+ *
+ *  \see merge
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result);
+
+
+/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from
+ *  <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> into a single range,
+ *  <tt>[keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending key order.
+ *
+ *  At the same time, \p merge_by_key copies elements from the two associated ranges <tt>[values_first1 + (keys_last1 - keys_first1))</tt>
+ *  and <tt>[values_first2 + (keys_last2 - keys_first2))</tt> into a single range,
+ *  <tt>[values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending order implied by each input element's associated key.
+ *
+ *  \p merge_by_key is stable, meaning both that the relative order of elements within each input range is
+ *  preserved, and that for equivalent elements in all input key ranges the element from the first range
+ *  precedes the element from the second.
+ *
+ *  The return value is is <tt>(keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>
+ *  and <tt>(values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the merged output range of keys.
+ *  \param values_result The beginning of the merged output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge_by_key to compute the merger of two sets of integers sorted in
+ *  ascending order.
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {1, 3, 5, 7, 9, 11};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[7] = {1, 1, 2, 3, 5, 8, 13};
+ *  int B_vals[7] = {1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int keys_result[13];
+ *  int vals_result[13];
+ *
+ *  thrust::pair<int*,int*> end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result);
+ *
+ *  // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
+ *  // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,  0,  1}
+ *  \endcode
+ *
+ *  \see merge
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result);
+
+
+/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from
+ *  <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> into a single range,
+ *  <tt>[keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending key order.
+ *
+ *  At the same time, \p merge_by_key copies elements from the two associated ranges <tt>[values_first1 + (keys_last1 - keys_first1))</tt>
+ *  and <tt>[values_first2 + (keys_last2 - keys_first2))</tt> into a single range,
+ *  <tt>[values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending order implied by each input element's associated key.
+ *
+ *  \p merge_by_key is stable, meaning both that the relative order of elements within each input range is
+ *  preserved, and that for equivalent elements in all input key ranges the element from the first range
+ *  precedes the element from the second.
+ *
+ *  The return value is is <tt>(keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>
+ *  and <tt>(values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>.
+ *
+ *  This version of \p merge_by_key compares key elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized using \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the merged output range of keys.
+ *  \param values_result The beginning of the merged output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge_by_key to compute the merger of two sets of integers sorted in
+ *  descending order using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {11, 9, 7, 5, 3, 1};
+ *  int A_vals[6] = { 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
+ *  int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int keys_result[13];
+ *  int vals_result[13];
+ *
+ *  thrust::pair<int*,int*> end =
+ *    thrust::merge_by_key(thrust::host,
+ *                         A_keys, A_keys + 6,
+ *                         B_keys, B_keys + 7,
+ *                         A_vals, B_vals,
+ *                         keys_result, vals_result,
+ *                         thrust::greater<int>());
+ *
+ *  // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
+ *  // vals_result = { 1,  0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1}
+ *  \endcode
+ *
+ *  \see merge
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2, typename Compare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 Compare comp);
+
+
+/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from
+ *  <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> into a single range,
+ *  <tt>[keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending key order.
+ *
+ *  At the same time, \p merge_by_key copies elements from the two associated ranges <tt>[values_first1 + (keys_last1 - keys_first1))</tt>
+ *  and <tt>[values_first2 + (keys_last2 - keys_first2))</tt> into a single range,
+ *  <tt>[values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt> such that
+ *  the resulting range is in ascending order implied by each input element's associated key.
+ *
+ *  \p merge_by_key is stable, meaning both that the relative order of elements within each input range is
+ *  preserved, and that for equivalent elements in all input key ranges the element from the first range
+ *  precedes the element from the second.
+ *
+ *  The return value is is <tt>(keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>
+ *  and <tt>(values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2))</tt>.
+ *
+ *  This version of \p merge_by_key compares key elements using a function object \p comp.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the merged output range of keys.
+ *  \param values_result The beginning of the merged output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use
+ *  \p merge_by_key to compute the merger of two sets of integers sorted in
+ *  descending order.
+ *
+ *  \code
+ *  #include <thrust/merge.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {11, 9, 7, 5, 3, 1};
+ *  int A_vals[6] = { 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
+ *  int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int keys_result[13];
+ *  int vals_result[13];
+ *
+ *  thrust::pair<int*,int*> end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ *
+ *  // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
+ *  // vals_result = { 1,  0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1}
+ *  \endcode
+ *
+ *  \see merge
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2, typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 StrictWeakCompare comp);
+
+
+/*! \} // merging
+ */
+
+} // end thrust
+
+#include <thrust/detail/merge.inl>
+
diff --git a/thrust/thrust/mismatch.h b/thrust/thrust/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..413db84f56361af4d028b756e267f655a591b34c
--- /dev/null
+++ b/thrust/thrust/mismatch.h
@@ -0,0 +1,260 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file mismatch.h
+ *  \brief Search for differences between ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup searching
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p mismatch finds the first position where the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> differ. The two versions of 
+ *  \p mismatch use different tests for whether elements differ.
+ *
+ *  This version of \p mismatch finds the first iterator \c i in <tt>[first1, last1)</tt>
+ *  such that <tt>*i == *(first2 + (i - first1))</tt> is \c false. The return value is a
+ *  \c pair whose first element is \c i and whose second element is <tt>*(first2 + (i - first1))</tt>.
+ *  If no such iterator \c i exists, the return value is a \c pair whose first element
+ *  is \c last1 and whose second element is <tt>*(first2 + (last1 - first1))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \return The first position where the sequences differ.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *
+ *  \code
+ *  #include <thrust/mismatch.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> vec1(4);
+ *  thrust::device_vector<int> vec2(4);
+ *
+ *  vec1[0] = 0;  vec2[0] = 0; 
+ *  vec1[1] = 5;  vec2[1] = 5;
+ *  vec1[2] = 3;  vec2[2] = 8;
+ *  vec1[3] = 7;  vec2[3] = 7;
+ *
+ *  typedef thrust::device_vector<int>::iterator Iterator;
+ *  thrust::pair<Iterator,Iterator> result;
+ *
+ *  result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin());
+ *
+ *  // result.first  is vec1.begin() + 2
+ *  // result.second is vec2.begin() + 2
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                                      InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2);
+
+
+/*! \p mismatch finds the first position where the two ranges <tt>[first1, last1)</tt>
+ * and <tt>[first2, first2 + (last1 - first1))</tt> differ. The two versions of 
+ * \p mismatch use different tests for whether elements differ.
+ *
+ * This version of \p mismatch finds the first iterator \c i in <tt>[first1, last1)</tt>
+ * such that <tt>*i == *(first2 + (i - first1))</tt> is \c false. The return value is a
+ * \c pair whose first element is \c i and whose second element is <tt>*(first2 + (i - first1))</tt>.
+ * If no such iterator \c i exists, the return value is a \c pair whose first element
+ * is \c last1 and whose second element is <tt>*(first2 + (last1 - first1))</tt>.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \return The first position where the sequences differ.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *
+ *  \code
+ *  #include <thrust/mismatch.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec1(4);
+ *  thrust::device_vector<int> vec2(4);
+ *
+ *  vec1[0] = 0;  vec2[0] = 0; 
+ *  vec1[1] = 5;  vec2[1] = 5;
+ *  vec1[2] = 3;  vec2[2] = 8;
+ *  vec1[3] = 7;  vec2[3] = 7;
+ *
+ *  typedef thrust::device_vector<int>::iterator Iterator;
+ *  thrust::pair<Iterator,Iterator> result;
+ *
+ *  result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin());
+ *
+ *  // result.first  is vec1.begin() + 2
+ *  // result.second is vec2.begin() + 2
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ */
+template <typename InputIterator1, typename InputIterator2>
+thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2);
+
+
+/*! \p mismatch finds the first position where the two ranges <tt>[first1, last1)</tt>
+ *  and <tt>[first2, first2 + (last1 - first1))</tt> differ. The two versions of 
+ *  \p mismatch use different tests for whether elements differ.
+ *
+ *  This version of \p mismatch finds the first iterator \c i in <tt>[first1, last1)</tt>
+ *  such that <tt>pred(\*i, \*(first2 + (i - first1))</tt> is \c false. The return value is a
+ *  \c pair whose first element is \c i and whose second element is <tt>*(first2 + (i - first1))</tt>.
+ *  If no such iterator \c i exists, the return value is a \c pair whose first element is
+ *  \c last1 and whose second element is <tt>*(first2 + (last1 - first1))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param pred   The binary predicate to compare elements.
+ *  \return The first position where the sequences differ.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *
+ *  \code
+ *  #include <thrust/mismatch.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> vec1(4);
+ *  thrust::device_vector<int> vec2(4);
+ *
+ *  vec1[0] = 0;  vec2[0] = 0; 
+ *  vec1[1] = 5;  vec2[1] = 5;
+ *  vec1[2] = 3;  vec2[2] = 8;
+ *  vec1[3] = 7;  vec2[3] = 7;
+ *
+ *  typedef thrust::device_vector<int>::iterator Iterator;
+ *  thrust::pair<Iterator,Iterator> result;
+ *
+ *  result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to<int>());
+ *
+ *  // result.first  is vec1.begin() + 2
+ *  // result.second is vec2.begin() + 2
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                                      InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2,
+                                                      BinaryPredicate pred);
+
+
+/*! \p mismatch finds the first position where the two ranges <tt>[first1, last1)</tt>
+ * and <tt>[first2, first2 + (last1 - first1))</tt> differ. The two versions of 
+ * \p mismatch use different tests for whether elements differ.
+ *
+ * This version of \p mismatch finds the first iterator \c i in <tt>[first1, last1)</tt>
+ * such that <tt>pred(\*i, \*(first2 + (i - first1))</tt> is \c false. The return value is a
+ * \c pair whose first element is \c i and whose second element is <tt>*(first2 + (i - first1))</tt>.
+ * If no such iterator \c i exists, the return value is a \c pair whose first element is
+ * \c last1 and whose second element is <tt>*(first2 + (last1 - first1))</tt>.
+ *
+ *  \param first1 The beginning of the first sequence.
+ *  \param last1  The end of the first sequence.
+ *  \param first2 The beginning of the second sequence.
+ *  \param pred   The binary predicate to compare elements.
+ *  \return The first position where the sequences differ.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *
+ *  \code
+ *  #include <thrust/mismatch.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec1(4);
+ *  thrust::device_vector<int> vec2(4);
+ *
+ *  vec1[0] = 0;  vec2[0] = 0; 
+ *  vec1[1] = 5;  vec2[1] = 5;
+ *  vec1[2] = 3;  vec2[2] = 8;
+ *  vec1[3] = 7;  vec2[3] = 7;
+ *
+ *  typedef thrust::device_vector<int>::iterator Iterator;
+ *  thrust::pair<Iterator,Iterator> result;
+ *
+ *  result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to<int>());
+ *
+ *  // result.first  is vec1.begin() + 2
+ *  // result.second is vec2.begin() + 2
+ *  \endcode
+ *
+ *  \see find
+ *  \see find_if
+ */
+template <typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
+                                                      InputIterator1 last1,
+                                                      InputIterator2 first2,
+                                                      BinaryPredicate pred);
+
+/*! \} // end searching
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/mismatch.inl>
+
diff --git a/thrust/thrust/mr/allocator.h b/thrust/thrust/mr/allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c6c3288601fcacce058fc2ae7f654d334a33827
--- /dev/null
+++ b/thrust/thrust/mr/allocator.h
@@ -0,0 +1,250 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file allocator.h
+ *  \brief Allocator types usable with NPA-based memory resources.
+ */
+
+#pragma once
+
+#include <limits>
+
+#include <thrust/detail/config/exec_check_disable.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <thrust/mr/detail/config.h>
+#include <thrust/mr/validator.h>
+#include <thrust/mr/polymorphic_adaptor.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! An \p mr::allocator is a template that fulfills the C++ requirements for Allocators,
+ *  allowing to use the NPA-based memory resources where an Allocator is required. Unlike
+ *  memory resources, but like other allocators, \p mr::allocator is typed and bound to
+ *  allocate object of a specific type, however it can be freely rebound to other types.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam MR the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final (in C++11 and beyond).
+ */
+template<typename T, class MR>
+class allocator : private validator<MR>
+{
+public:
+    /*! The pointer to void type of this allocator. */
+    typedef typename MR::pointer void_pointer;
+
+    /*! The value type allocated by this allocator. Equivalent to \p T. */
+    typedef T value_type;
+    /*! The pointer type allocated by this allocator. Equivaled to the pointer type of \p MR rebound to \p T. */
+    typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<T>::other pointer;
+    /*! The pointer to const type. Equivalent to a pointer type of \p MR reboud to <tt>const T</tt>. */
+    typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<const T>::other const_pointer;
+    /*! The reference to the type allocated by this allocator. Supports smart references. */
+    typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
+    /*! The const reference to the type allocated by this allocator. Supports smart references. */
+    typedef typename thrust::detail::pointer_traits<const_pointer>::reference const_reference;
+    /*! The size type of this allocator. Always \p std::size_t. */
+    typedef std::size_t size_type;
+    /*! The difference type between pointers allocated by this allocator. */
+    typedef typename thrust::detail::pointer_traits<pointer>::difference_type difference_type;
+
+    /*! Specifies that the allocator shall be propagated on container copy assignment. */
+    typedef detail::true_type propagate_on_container_copy_assignment;
+    /*! Specifies that the allocator shall be propagated on container move assignment. */
+    typedef detail::true_type propagate_on_container_move_assignment;
+    /*! Specifies that the allocator shall be propagated on container swap. */
+    typedef detail::true_type propagate_on_container_swap;
+
+    /*! The \p rebind metafunction provides the type of an \p allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p allocator.
+         */
+        typedef allocator<U, MR> other;
+    };
+
+    /*! Calculates the maximum number of elements allocated by this allocator.
+     *
+     *  \returns the maximum value of \p std::size_t, divided by the size of \p T.
+     */
+    __thrust_exec_check_disable__
+    __host__ __device__
+    size_type max_size() const
+    {
+        return std::numeric_limits<size_type>::max() / sizeof(T);
+    }
+
+    /*! Constructor.
+     *
+     *  \param resource the resource to be used to allocate raw memory.
+     */
+    __host__ __device__
+    allocator(MR * resource) : mem_res(resource)
+    {
+    }
+
+    /*! Copy constructor. Copies the resource pointer. */
+    template<typename U>
+    __host__ __device__
+    allocator(const allocator<U, MR> & other) : mem_res(other.resource())
+    {
+    }
+
+    /*! Allocates objects of type \p T.
+     *
+     *  \param n number of elements to allocate
+     *  \returns a pointer to the newly allocated storage.
+     */
+    THRUST_NODISCARD
+    __host__
+    pointer allocate(size_type n)
+    {
+        return static_cast<pointer>(mem_res->do_allocate(n * sizeof(T), THRUST_ALIGNOF(T)));
+    }
+
+    /*! Deallocates objects of type \p T.
+     *
+     *  \param p pointer returned by a previous call to \p allocate
+     *  \param n number of elements, passed as an argument to the \p allocate call that produced \p p
+     */
+    __host__
+    void deallocate(pointer p, size_type n)
+    {
+        return mem_res->do_deallocate(p, n * sizeof(T), THRUST_ALIGNOF(T));
+    }
+
+    /*! Extracts the memory resource used by this allocator.
+     *
+     *  \returns the memory resource used by this allocator.
+     */
+    __host__ __device__
+    MR * resource() const
+    {
+        return mem_res;
+    }
+
+private:
+    MR * mem_res;
+};
+
+/*! Compares the allocators for equality by comparing the underlying memory resources. */
+template<typename T, typename MR>
+__host__ __device__
+bool operator==(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) THRUST_NOEXCEPT
+{
+    return *lhs.resource() == *rhs.resource();
+}
+
+/*! Compares the allocators for inequality by comparing the underlying memory resources. */
+template<typename T, typename MR>
+__host__ __device__
+bool operator!=(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) THRUST_NOEXCEPT
+{
+    return !(lhs == rhs);
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename T, typename Pointer>
+using polymorphic_allocator = allocator<T, polymorphic_adaptor_resource<Pointer> >;
+
+#else // C++11
+
+template<typename T, typename Pointer>
+class polymorphic_allocator : public allocator<T, polymorphic_adaptor_resource<Pointer> >
+{
+    typedef allocator<T, polymorphic_adaptor_resource<Pointer> > base;
+
+public:
+    /*! Initializes the base class with the parameter \p resource.
+     */
+    polymorphic_allocator(polymorphic_adaptor_resource<Pointer>  * resource) : base(resource)
+    {
+    }
+};
+
+#endif // C++11
+
+/*! A helper allocator class that uses global instances of a given upstream memory resource. Requires the memory resource
+ *      to be default constructible.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam Upstream the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final (in C++11 and beyond).
+ */
+template<typename T, typename Upstream>
+class stateless_resource_allocator : public thrust::mr::allocator<T, Upstream>
+{
+    typedef thrust::mr::allocator<T, Upstream> base;
+
+public:
+    /*! The \p rebind metafunction provides the type of an \p stateless_resource_allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p stateless_resource_allocator.
+         */
+        typedef stateless_resource_allocator<U, Upstream> other;
+    };
+
+    /*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
+     *      \p allocator base subobject with that resource.
+     */
+    __host__
+    stateless_resource_allocator() : base(get_global_resource<Upstream>())
+    {
+    }
+
+    /*! Copy constructor. Copies the memory resource pointer. */
+    __host__ __device__
+    stateless_resource_allocator(const stateless_resource_allocator & other)
+        : base(other) {}
+
+    /*! Conversion constructor from an allocator of a different type. Copies the memory resource pointer. */
+    template<typename U>
+    __host__ __device__
+    stateless_resource_allocator(const stateless_resource_allocator<U, Upstream> & other)
+        : base(other) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    stateless_resource_allocator & operator=(const stateless_resource_allocator &) = default;
+#endif
+
+    /*! Destructor. */
+    __host__ __device__
+    ~stateless_resource_allocator() {}
+};
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/detail/config.h b/thrust/thrust/mr/detail/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cfc50d3e4290bbe63ae2a1d028b14b87c6a1665
--- /dev/null
+++ b/thrust/thrust/mr/detail/config.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/config/cpp_compatibility.h>
+
+#define THRUST_MR_DEFAULT_ALIGNMENT THRUST_ALIGNOF(::thrust::detail::max_align_t)
+
+#if THRUST_CPP_DIALECT >= 2017
+#  if __has_include(<memory_resource>)
+#    define THRUST_MR_STD_MR_HEADER <memory_resource>
+#    define THRUST_MR_STD_MR_NS std::pmr
+#  elif __has_include(<experimental/memory_resource>)
+#    define THRUST_MR_STD_MR_HEADER <experimental/memory_resource>
+#    define THRUST_MR_STD_MR_NS std::experimental::pmr
+#  endif
+#endif
+
diff --git a/thrust/thrust/mr/disjoint_pool.h b/thrust/thrust/mr/disjoint_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..898e499c807dc48a35c7dafe3da00d2885b62396
--- /dev/null
+++ b/thrust/thrust/mr/disjoint_pool.h
@@ -0,0 +1,489 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file disjoint_pool.h
+ *  \brief A caching and pooling memory resource adaptor which uses separate upstream resources for memory allocation
+ *      and bookkeeping.
+ */
+
+#pragma once
+
+#include <thrust/detail/algorithm_wrapper.h>
+
+#include <thrust/host_vector.h>
+#include <thrust/binary_search.h>
+#include <thrust/detail/seq.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/pool_options.h>
+
+#include <cassert>
+
+namespace thrust
+{
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! A memory resource adaptor allowing for pooling and caching allocations from \p Upstream, using \p Bookkeeper for
+ *      management of that cached and pooled memory, allowing to cache portions of memory inaccessible from the host.
+ *
+ *  On a typical memory resource, calls to \p allocate and \p deallocate actually allocate and deallocate memory. Pooling
+ *      memory resources only allocate and deallocate memory from an external resource (the upstream memory resource) when
+ *      there's no suitable memory currently cached; otherwise, they use memory they have acquired beforehand, to make
+ *      memory allocation faster and more efficient.
+ *
+ *  The disjoint version of the pool resources uses a separate upstream memory resource, \p Bookkeeper, to allocate memory
+ *      necessary to manage the cached memory. There may be many reasons to do that; the canonical one is that \p Upstream
+ *      allocates memory that is inaccessible to the code of the pool resource, which means that it cannot embed the necessary
+ *      information in memory obtained from \p Upstream; for instance, \p Upstream can be a CUDA non-managed memory
+ *      resource, or a CUDA managed memory resource whose memory we would prefer to not migrate back and forth between
+ *      host and device when executing bookkeeping code.
+ *
+ *  This is not the only case where it makes sense to use a disjoint pool resource, though. In a multi-core environment
+ *      it may be beneficial to avoid stealing cache lines from other cores by writing over bookkeeping information
+ *      embedded in an allocated block of memory. In such a case, one can imagine wanting to use a disjoint pool where
+ *      both the upstream and the bookkeeper are of the same type, to allocate memory consistently, but separately for
+ *      those two purposes.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks to be handed off to the user
+ *  \tparam Bookkeeper the type of memory resources that will be used for allocating bookkeeping memory
+ */
+template<typename Upstream, typename Bookkeeper>
+class disjoint_unsynchronized_pool_resource THRUST_FINAL
+    : public memory_resource<typename Upstream::pointer>,
+        private validator2<Upstream, Bookkeeper>
+{
+public:
+    /*! Get the default options for a disjoint pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        pool_options ret;
+
+        ret.min_blocks_per_chunk = 16;
+        ret.min_bytes_per_chunk = 1024;
+        ret.max_blocks_per_chunk = static_cast<std::size_t>(1) << 20;
+        ret.max_bytes_per_chunk = static_cast<std::size_t>(1) << 30;
+
+        ret.smallest_block_size = THRUST_MR_DEFAULT_ALIGNMENT;
+        ret.largest_block_size = static_cast<std::size_t>(1) << 20;
+
+        ret.alignment = THRUST_MR_DEFAULT_ALIGNMENT;
+
+        ret.cache_oversized = true;
+
+        ret.cached_size_cutoff_factor = 16;
+        ret.cached_alignment_cutoff_factor = 16;
+
+        return ret;
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param bookkeeper the upstream memory resource for bookkeeping
+     *  \param options pool options to use
+     */
+    disjoint_unsynchronized_pool_resource(Upstream * upstream, Bookkeeper * bookkeeper,
+        pool_options options = get_default_options())
+        : m_upstream(upstream),
+        m_bookkeeper(bookkeeper),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(m_bookkeeper),
+        m_allocated(m_bookkeeper),
+        m_cached_oversized(m_bookkeeper),
+        m_oversized(m_bookkeeper)
+    {
+        assert(m_options.validate());
+
+        pointer_vector free(m_bookkeeper);
+        pool p(free);
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    // TODO: C++11: use delegating constructors
+
+    /*! Constructor. Upstream and bookkeeping resources are obtained by calling \p get_global_resource for their types.
+     *
+     *  \param options pool options to use
+     */
+    disjoint_unsynchronized_pool_resource(pool_options options = get_default_options())
+        : m_upstream(get_global_resource<Upstream>()),
+        m_bookkeeper(get_global_resource<Bookkeeper>()),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(m_bookkeeper),
+        m_allocated(m_bookkeeper),
+        m_cached_oversized(m_bookkeeper),
+        m_oversized(m_bookkeeper)
+    {
+        assert(m_options.validate());
+
+        pointer_vector free(m_bookkeeper);
+        pool p(free);
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    /*! Destructor. Releases all held memory to upstream.
+     */
+    ~disjoint_unsynchronized_pool_resource()
+    {
+        release();
+    }
+
+private:
+    typedef typename Upstream::pointer void_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<char>::other char_ptr;
+
+    struct chunk_descriptor
+    {
+        std::size_t size;
+        void_ptr pointer;
+    };
+
+    typedef thrust::host_vector<
+        chunk_descriptor,
+        allocator<chunk_descriptor, Bookkeeper>
+    > chunk_vector;
+
+    struct oversized_block_descriptor
+    {
+        std::size_t size;
+        std::size_t alignment;
+        void_ptr pointer;
+
+        __host__ __device__
+        bool operator==(const oversized_block_descriptor & other) const
+        {
+            return size == other.size && alignment == other.alignment && pointer == other.pointer;
+        }
+
+        __host__ __device__
+        bool operator<(const oversized_block_descriptor & other) const
+        {
+            return size < other.size || (size == other.size && alignment < other.alignment);
+        }
+    };
+
+    struct equal_pointers
+    {
+    public:
+        __host__ __device__
+        equal_pointers(void_ptr p) : p(p)
+        {
+        }
+
+        __host__ __device__
+        bool operator()(const oversized_block_descriptor & desc) const
+        {
+            return desc.pointer == p;
+        }
+
+    private:
+        void_ptr p;
+    };
+
+    struct matching_alignment
+    {
+    public:
+        __host__ __device__
+        matching_alignment(std::size_t requested) : requested(requested)
+        {
+        }
+
+        __host__ __device__
+        bool operator()(const oversized_block_descriptor & desc) const
+        {
+            return desc.alignment >= requested;
+        }
+
+    private:
+        std::size_t requested;
+    };
+
+    typedef thrust::host_vector<
+        oversized_block_descriptor,
+        allocator<oversized_block_descriptor, Bookkeeper>
+    > oversized_block_vector;
+
+    typedef thrust::host_vector<
+        void_ptr,
+        allocator<void_ptr, Bookkeeper>
+    > pointer_vector;
+
+    struct pool
+    {
+        __host__
+        pool(const pointer_vector & free)
+            : free_blocks(free),
+            previous_allocated_count(0)
+        {
+        }
+
+        __host__
+        pool(const pool & other)
+            : free_blocks(other.free_blocks),
+            previous_allocated_count(other.previous_allocated_count)
+        {
+        }
+
+#if THRUST_CPP_DIALECT >= 2011
+        pool & operator=(const pool &) = default;
+#endif
+
+        __host__
+        ~pool() {}
+
+        pointer_vector free_blocks;
+        std::size_t previous_allocated_count;
+    };
+
+    typedef thrust::host_vector<
+        pool,
+        allocator<pool, Bookkeeper>
+    > pool_vector;
+
+    Upstream * m_upstream;
+    Bookkeeper * m_bookkeeper;
+
+    pool_options m_options;
+    std::size_t m_smallest_block_log2;
+
+    // buckets containing free lists for each pooled size
+    pool_vector m_pools;
+    // list of all allocations from upstream for the above
+    chunk_vector m_allocated;
+    // list of all cached oversized/overaligned blocks that have been returned to the pool to cache
+    oversized_block_vector m_cached_oversized;
+    // list of all oversized/overaligned allocations from upstream
+    oversized_block_vector m_oversized;
+
+public:
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        // reset the buckets
+        for (std::size_t i = 0; i < m_pools.size(); ++i)
+        {
+            m_pools[i].free_blocks.clear();
+            m_pools[i].previous_allocated_count = 0;
+        }
+
+        // deallocate memory allocated for the buckets
+        for (std::size_t i = 0; i < m_allocated.size(); ++i)
+        {
+            m_upstream->do_deallocate(
+                m_allocated[i].pointer,
+                m_allocated[i].size,
+                m_options.alignment);
+        }
+
+        // deallocate cached oversized/overaligned memory
+        for (std::size_t i = 0; i < m_oversized.size(); ++i)
+        {
+            m_upstream->do_deallocate(
+                m_oversized[i].pointer,
+                m_oversized[i].size,
+                m_oversized[i].alignment);
+        }
+
+        m_allocated.clear();
+        m_oversized.clear();
+        m_cached_oversized.clear();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        bytes = (std::max)(bytes, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // an oversized and/or overaligned allocation requested; needs to be allocated separately
+        if (bytes > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            oversized_block_descriptor oversized;
+            oversized.size = bytes;
+            oversized.alignment = alignment;
+
+            if (m_options.cache_oversized && !m_cached_oversized.empty())
+            {
+                typename oversized_block_vector::iterator it = thrust::lower_bound(
+                    thrust::seq,
+                    m_cached_oversized.begin(),
+                    m_cached_oversized.end(),
+                    oversized);
+
+                // if the size is bigger than the requested size by a factor
+                // bigger than or equal to the specified cutoff for size,
+                // allocate a new block
+                if (it != m_cached_oversized.end())
+                {
+                    std::size_t size_factor = (*it).size / bytes;
+                    if (size_factor >= m_options.cached_size_cutoff_factor)
+                    {
+                        it = m_cached_oversized.end();
+                    }
+                }
+
+                if (it != m_cached_oversized.end() && (*it).alignment < alignment)
+                {
+                    it = find_if(it + 1, m_cached_oversized.end(), matching_alignment(alignment));
+                }
+
+                // if the alignment is bigger than the requested one by a factor
+                // bigger than or equal to the specified cutoff for alignment,
+                // allocate a new block
+                if (it != m_cached_oversized.end())
+                {
+                    std::size_t alignment_factor = (*it).alignment / alignment;
+                    if (alignment_factor >= m_options.cached_alignment_cutoff_factor)
+                    {
+                        it = m_cached_oversized.end();
+                    }
+                }
+
+                if (it != m_cached_oversized.end())
+                {
+                    oversized.pointer = (*it).pointer;
+                    m_cached_oversized.erase(it);
+                    return oversized.pointer;
+                }
+            }
+
+            // no fitting cached block found; allocate a new one that's just up to the specs
+            oversized.pointer = m_upstream->do_allocate(bytes, alignment);
+            m_oversized.push_back(oversized);
+
+            return oversized.pointer;
+        }
+
+        // the request is NOT for oversized and/or overaligned memory
+        // allocate a block from an appropriate bucket
+        std::size_t bytes_log2 = thrust::detail::log2_ri(bytes);
+        std::size_t bucket_idx = bytes_log2 - m_smallest_block_log2;
+        pool & bucket = m_pools[bucket_idx];
+
+        // if the free list of the bucket has no elements, allocate a new chunk
+        // and split it into blocks pushed to the free list
+        if (bucket.free_blocks.empty())
+        {
+            std::size_t bucket_size = static_cast<std::size_t>(1) << bytes_log2;
+
+            std::size_t n = bucket.previous_allocated_count;
+            if (n == 0)
+            {
+                n = m_options.min_blocks_per_chunk;
+                if (n < (m_options.min_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.min_bytes_per_chunk >> bytes_log2;
+                }
+            }
+            else
+            {
+                n = n * 3 / 2;
+                if (n > (m_options.max_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.max_bytes_per_chunk >> bytes_log2;
+                }
+                if (n > m_options.max_blocks_per_chunk)
+                {
+                    n = m_options.max_blocks_per_chunk;
+                }
+            }
+
+            bytes = n << bytes_log2;
+
+            assert(n >= m_options.min_blocks_per_chunk);
+            assert(n <= m_options.max_blocks_per_chunk);
+            assert(bytes >= m_options.min_bytes_per_chunk);
+            assert(bytes <= m_options.max_bytes_per_chunk);
+
+            chunk_descriptor allocated;
+            allocated.size = bytes;
+            allocated.pointer = m_upstream->do_allocate(bytes, m_options.alignment);
+            m_allocated.push_back(allocated);
+            bucket.previous_allocated_count = n;
+
+            for (std::size_t i = 0; i < n; ++i)
+            {
+                bucket.free_blocks.push_back(
+                    static_cast<void_ptr>(
+                        static_cast<char_ptr>(allocated.pointer) + i * bucket_size
+                    )
+                );
+            }
+        }
+
+        // allocate a block from the front of the bucket's free list
+        void_ptr ret = bucket.free_blocks.back();
+        bucket.free_blocks.pop_back();
+        return ret;
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        n = (std::max)(n, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // verify that the pointer is at least as aligned as claimed
+        assert(reinterpret_cast<detail::intmax_t>(detail::pointer_traits<void_ptr>::get(p)) % alignment == 0);
+
+        // the deallocated block is oversized and/or overaligned
+        if (n > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            typename oversized_block_vector::iterator it = find_if(m_oversized.begin(), m_oversized.end(), equal_pointers(p));
+            assert(it != m_oversized.end());
+
+            oversized_block_descriptor oversized = *it;
+
+            if (m_options.cache_oversized)
+            {
+                typename oversized_block_vector::iterator position = lower_bound(m_cached_oversized.begin(), m_cached_oversized.end(), oversized);
+                m_cached_oversized.insert(position, oversized);
+                return;
+            }
+
+            m_oversized.erase(it);
+
+            m_upstream->do_deallocate(p, oversized.size, oversized.alignment);
+
+            return;
+        }
+
+        // push the block to the front of the appropriate bucket's free list
+        std::size_t n_log2 = thrust::detail::log2_ri(n);
+        std::size_t bucket_idx = n_log2 - m_smallest_block_log2;
+        pool & bucket = m_pools[bucket_idx];
+
+        bucket.free_blocks.push_back(p);
+    }
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/disjoint_sync_pool.h b/thrust/thrust/mr/disjoint_sync_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed6cab7ed67b4358934c360e83351b90db0211a4
--- /dev/null
+++ b/thrust/thrust/mr/disjoint_sync_pool.h
@@ -0,0 +1,119 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file disjoint_sync_pool.h
+ *  \brief A mutex-synchronized version of \p disjoint_unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <mutex>
+
+#include <thrust/mr/disjoint_pool.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! A mutex-synchronized version of \p disjoint_unsynchronized_pool_resource. Uses \p std::mutex, and therefore requires C++11.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks to be handed off to the user
+ *  \tparam Bookkeeper the type of memory resources that will be used for allocating bookkeeping memory
+ */
+template<typename Upstream, typename Bookkeeper>
+struct disjoint_synchronized_pool_resource : public memory_resource<typename Upstream::pointer>
+{
+    typedef disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> unsync_pool;
+    typedef std::lock_guard<std::mutex> lock_t;
+
+    typedef typename Upstream::pointer void_ptr;
+
+public:
+    /*! Get the default options for a disjoint pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        return unsync_pool::get_default_options();
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param bookkeeper the upstream memory resource for bookkeeping
+     *  \param options pool options to use
+     */
+    disjoint_synchronized_pool_resource(Upstream * upstream, Bookkeeper * bookkeeper,
+        pool_options options = get_default_options())
+        : upstream_pool(upstream, bookkeeper, options)
+    {
+    }
+
+    /*! Constructor. Upstream and bookkeeping resources are obtained by calling \p get_global_resource for their types.
+     *
+     *  \param options pool options to use
+     */
+    disjoint_synchronized_pool_resource(pool_options options = get_default_options())
+        : upstream_pool(get_global_resource<Upstream>(), get_global_resource<Bookkeeper>(), options)
+    {
+    }
+
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        lock_t lock(mtx);
+        upstream_pool.release();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        lock_t lock(mtx);
+        return upstream_pool.do_allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        lock_t lock(mtx);
+        upstream_pool.do_deallocate(p, n, alignment);
+    }
+
+private:
+    std::mutex mtx;
+    unsync_pool upstream_pool;
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/mr/disjoint_tls_pool.h b/thrust/thrust/mr/disjoint_tls_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..e50eba76255421812bb1b0c4a355e879eef37492
--- /dev/null
+++ b/thrust/thrust/mr/disjoint_tls_pool.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file disjoint_tls_pool.h
+ *  \brief A function wrapping a thread local instance of a \p disjoint_unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/mr/disjoint_pool.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! Potentially constructs, if not yet created, and then returns the address of a thread-local
+ *      \p disjoint_unsynchronized_pool_resource,
+ *
+ *  \tparam Upstream the first template argument to the pool template
+ *  \tparam Bookkeeper the second template argument to the pool template
+ *  \param upstream the first argument to the constructor, if invoked
+ *  \param bookkeeper the second argument to the constructor, if invoked
+ */
+template<typename Upstream, typename Bookkeeper>
+__host__
+thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> & tls_disjoint_pool(
+    Upstream * upstream = NULL,
+    Bookkeeper * bookkeeper = NULL)
+{
+    static thread_local auto adaptor = [&]{
+        assert(upstream && bookkeeper);
+        return thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper>(upstream, bookkeeper);
+    }();
+
+    return adaptor;
+}
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/mr/fancy_pointer_resource.h b/thrust/thrust/mr/fancy_pointer_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..53ffc7eb76baf00f291e05e22dc9a49c2224e8f8
--- /dev/null
+++ b/thrust/thrust/mr/fancy_pointer_resource.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/validator.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+template<typename Upstream, typename Pointer>
+class fancy_pointer_resource THRUST_FINAL : public memory_resource<Pointer>, private validator<Upstream>
+{
+public:
+    fancy_pointer_resource() : m_upstream(get_global_resource<Upstream>())
+    {
+    }
+
+    fancy_pointer_resource(Upstream * upstream) : m_upstream(upstream)
+    {
+    }
+
+    THRUST_NODISCARD
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        return static_cast<Pointer>(m_upstream->do_allocate(bytes, alignment));
+    }
+
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    {
+        return m_upstream->do_deallocate(
+            static_cast<typename Upstream::pointer>(
+                thrust::detail::pointer_traits<Pointer>::get(p)),
+            bytes, alignment);
+    }
+
+private:
+    Upstream * m_upstream;
+};
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/memory_resource.h b/thrust/thrust/mr/memory_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..048ca2405931bc75fc3716dbbf3da4bc2f3827f1
--- /dev/null
+++ b/thrust/thrust/mr/memory_resource.h
@@ -0,0 +1,217 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file mr/memory_resource.h
+ *  \brief A base class for the memory resource system, similar to std::memory_resource,
+ *      and related utilities.
+ */
+
+#pragma once
+
+#include "detail/config.h"
+#ifdef THRUST_MR_STD_MR_HEADER
+#  include THRUST_MR_STD_MR_HEADER
+#endif
+
+namespace thrust
+{
+/*! \brief \p thrust::mr is the namespace containing system agnostic types and functions for \p memory_resource related functionalities.
+ */
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! \p memory_resource is the base class for all other memory resources.
+ *
+ *  \tparam Pointer the pointer type that is allocated and deallocated by the memory resource
+ *      derived from this base class. If this is <tt>void *</tt>, this class derives from
+ *      <tt>std::pmr::memory_resource</tt>.
+ */
+template<typename Pointer = void *>
+class memory_resource
+{
+public:
+    /*! Alias for the template parameter.
+     */
+    typedef Pointer pointer;
+
+    /*! Virtual destructor, defaulted when possible.
+     */
+    virtual ~memory_resource() THRUST_DEFAULT
+
+    /*! Allocates memory of size at least \p bytes and alignment at least \p alignment.
+     *
+     *  \param bytes size, in bytes, that is requested from this allocation
+     *  \param alignment alignment that is requested from this allocation
+     *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
+     *  \returns A pointer to void to the newly allocated memory.
+     */
+    THRUST_NODISCARD
+    pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        return do_allocate(bytes, alignment);
+    }
+
+    /*! Deallocates memory pointed to by \p p.
+     *
+     *  \param p pointer to be deallocated
+     *  \param bytes the size of the allocation. This must be equivalent to the value of \p bytes that
+     *      was passed to the allocation function that returned \p p.
+     *  \param alignment the alignment of the allocation. This must be equivalent to the value of \p alignment
+     *      that was passed to the allocation function that returned \p p.
+     */
+    void deallocate(pointer p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        do_deallocate(p, bytes, alignment);
+    }
+
+    /*! Compares this resource to the other one. The default implementation uses identity comparison,
+     *      which is often the right thing to do and doesn't require RTTI involvement.
+     *
+     *  \param other the other resource to compare this resource to
+     *  \returns whether the two resources are equivalent.
+     */
+    __host__ __device__
+    bool is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    {
+        return do_is_equal(other);
+    }
+
+    /*! Allocates memory of size at least \p bytes and alignment at least \p alignment.
+     *
+     *  \param bytes size, in bytes, that is requested from this allocation
+     *  \param alignment alignment that is requested from this allocation
+     *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
+     *  \returns A pointer to void to the newly allocated memory.
+     */
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
+
+    /*! Deallocates memory pointed to by \p p.
+     *
+     *  \param p pointer to be deallocated
+     *  \param bytes the size of the allocation. This must be equivalent to the value of \p bytes that
+     *      was passed to the allocation function that returned \p p.
+     *  \param alignment the size of the allocation. This must be equivalent to the value of \p alignment
+     *      that was passed to the allocation function that returned \p p.
+     */
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) = 0;
+
+    /*! Compares this resource to the other one. The default implementation uses identity comparison,
+     *      which is often the right thing to do and doesn't require RTTI involvement.
+     *
+     *  \param other the other resource to compare this resource to
+     *  \returns whether the two resources are equivalent.
+     */
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    {
+        return this == &other;
+    }
+};
+
+template<>
+class memory_resource<void *>
+#ifdef THRUST_STD_MR_NS
+    : THRUST_STD_MR_NS::memory_resource
+#endif
+{
+public:
+    typedef void * pointer;
+
+    virtual ~memory_resource() THRUST_DEFAULT
+
+    THRUST_NODISCARD
+    pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        return do_allocate(bytes, alignment);
+    }
+
+    void deallocate(pointer p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        do_deallocate(p, bytes, alignment);
+    }
+
+    __host__ __device__
+    bool is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    {
+        return do_is_equal(other);
+    }
+
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) = 0;
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    {
+        return this == &other;
+    }
+
+#ifdef THRUST_STD_MR_NS
+    // the above do_is_equal is a different function than the one from the standard memory resource
+    // can't implement this reasonably without RTTI though; it's reasonable to assume false otherwise
+
+    virtual bool do_is_equal(const THRUST_STD_MR_NS::memory_resource & other) const noexcept override
+    {
+#  ifdef THRUST_HAS_DYNAMIC_CAST
+        auto mr_resource = dynamic_cast<memory_resource<> *>(&other);
+        return mr_resource && do_is_equal(*mr_resource);
+#  else
+        return this == &other;
+#  endif
+    }
+#endif
+};
+
+/*! Compares the memory resources for equality, first by identity, then by \p is_equal.
+ */
+template<typename Pointer>
+__host__ __device__
+bool operator==(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) THRUST_NOEXCEPT
+{
+    return &lhs == &rhs || rhs.is_equal(rhs);
+}
+
+/*! Compares the memory resources for inequality, first by identity, then by \p is_equal.
+ */
+template<typename Pointer>
+__host__ __device__
+bool operator!=(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) THRUST_NOEXCEPT
+{
+    return !(lhs == rhs);
+}
+
+/*! Returns a global instance of \p MR, created as a function local static variable.
+ *
+ *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
+ *  \returns a pointer to a global instance of \p MR.
+ */
+template<typename MR>
+__host__
+MR * get_global_resource()
+{
+    static MR resource;
+    return &resource;
+}
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/new.h b/thrust/thrust/mr/new.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8e4fe0212c1ec22f7ee417e6302cb819972c40c
--- /dev/null
+++ b/thrust/thrust/mr/new.h
@@ -0,0 +1,88 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file new.h
+ *  \brief Global operator new-based memory resource.
+ */
+
+#pragma once
+
+#include <thrust/mr/memory_resource.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! A memory resource that uses global operators new and delete to allocate and deallocate memory. Uses alignment-enabled
+ *      overloads when available, otherwise uses regular overloads and implements alignment requirements by itself.
+ */
+class new_delete_resource THRUST_FINAL : public memory_resource<>
+{
+public:
+    void * do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+#if defined(__cpp_aligned_new)
+        return ::operator new(bytes, std::align_val_t(alignment));
+#else
+        // allocate memory for bytes, plus potential alignment correction,
+        // plus store of the correction offset
+        void * p = ::operator new(bytes + alignment + sizeof(std::size_t));
+        std::size_t ptr_int = reinterpret_cast<std::size_t>(p);
+        // calculate the offset, i.e. how many bytes of correction was necessary
+        // to get an aligned pointer
+        std::size_t offset = (ptr_int % alignment) ? (alignment - ptr_int % alignment) : 0;
+        // calculate the return pointer
+        char * ptr = static_cast<char *>(p) + offset;
+        // store the offset right after the actually returned value
+        std::size_t * offset_store = reinterpret_cast<std::size_t *>(ptr + bytes);
+        *offset_store = offset;
+        return static_cast<void *>(ptr);
+#endif
+    }
+
+    void do_deallocate(void * p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+#if defined(__cpp_aligned_new)
+# if defined(__cpp_sized_deallocation)
+        ::operator delete(p, bytes, std::align_val_t(alignment));
+# else
+        (void)bytes;
+        ::operator delete(p, std::align_val_t(alignment));
+# endif
+#else
+        (void)alignment;
+        char * ptr = static_cast<char *>(p);
+        // calculate where the offset is stored
+        std::size_t * offset = reinterpret_cast<std::size_t *>(ptr + bytes);
+        // calculate the original pointer
+        p = static_cast<void *>(ptr - *offset);
+        ::operator delete(p);
+#endif
+    }
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/polymorphic_adaptor.h b/thrust/thrust/mr/polymorphic_adaptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5d98bf8382e9544605a6689e4bc2611b55f960d
--- /dev/null
+++ b/thrust/thrust/mr/polymorphic_adaptor.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2018-2019 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include "memory_resource.h"
+
+namespace thrust
+{
+namespace mr
+{
+
+template<typename Pointer = void *>
+class polymorphic_adaptor_resource THRUST_FINAL : public memory_resource<Pointer>
+{
+public:
+    polymorphic_adaptor_resource(memory_resource<Pointer> * t) : upstream_resource(t)
+    {
+    }
+
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        return upstream_resource->allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    {
+        return upstream_resource->deallocate(p, bytes, alignment);
+    }
+
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource<Pointer> & other) const THRUST_NOEXCEPT THRUST_OVERRIDE
+    {
+        return upstream_resource->is_equal(other);
+    }
+
+private:
+    memory_resource<Pointer> * upstream_resource;
+};
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/pool.h b/thrust/thrust/mr/pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..322e4312f0ec25e3e0d7f4e7db384b55c2de13ef
--- /dev/null
+++ b/thrust/thrust/mr/pool.h
@@ -0,0 +1,505 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file pool.h
+ *  \brief A caching and pooling memory resource adaptor which uses a single upstream resource for memory allocation,
+ *      and embeds bookkeeping information in allocated blocks.
+ */
+
+#pragma once
+
+#include <thrust/detail/algorithm_wrapper.h>
+
+#include <thrust/host_vector.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/pool_options.h>
+
+#include <cassert>
+
+namespace thrust
+{
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! A memory resource adaptor allowing for pooling and caching allocations from \p Upstream, using memory allocated
+ *      from it for both blocks then allocated to the user and for internal bookkeeping of the cached memory.
+ *
+ *  On a typical memory resource, calls to \p allocate and \p deallocate actually allocate and deallocate memory. Pooling
+ *      memory resources only allocate and deallocate memory from an external resource (the upstream memory resource) when
+ *      there's no suitable memory currently cached; otherwise, they use memory they have acquired beforehand, to make
+ *      memory allocation faster and more efficient.
+ *
+ *  The non-disjoint version of the pool resource uses a single upstream memory resource. Every allocation is larger than
+ *      strictly necessary to fulfill the end-user's request, because it needs to account for the memory overhead of tracking
+ *      the memory blocks and chunks inside those same memory regions. Nevertheless, this version should be more memory-efficient
+ *      than the \p disjoint_unsynchronized_pool_resource, because it doesn't need to allocate additional blocks of memory
+ *      from a separate resource, which in turn would necessitate the bookkeeping overhead in the upstream resource.
+ *
+ *  This version requires that memory allocated from Upstream is accessible from device. It supports smart references,
+ *      meaning that the non-managed CUDA resource, returning a device-tagged pointer, will work, but will be much less
+ *      efficient than the disjoint version, which wouldn't need to touch device memory at all, and therefore wouldn't need
+ *      to transfer it back and forth between the host and the device whenever an allocation or a deallocation happens.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks
+ */
+template<typename Upstream>
+class unsynchronized_pool_resource THRUST_FINAL
+    : public memory_resource<typename Upstream::pointer>,
+        private validator<Upstream>
+{
+public:
+    /*! Get the default options for a pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        pool_options ret;
+
+        ret.min_blocks_per_chunk = 16;
+        ret.min_bytes_per_chunk = 1024;
+        ret.max_blocks_per_chunk = static_cast<std::size_t>(1) << 20;
+        ret.max_bytes_per_chunk = static_cast<std::size_t>(1) << 30;
+
+        ret.smallest_block_size = THRUST_MR_DEFAULT_ALIGNMENT;
+        ret.largest_block_size = static_cast<std::size_t>(1) << 20;
+
+        ret.alignment = THRUST_MR_DEFAULT_ALIGNMENT;
+
+        ret.cache_oversized = true;
+
+        ret.cached_size_cutoff_factor = 16;
+        ret.cached_alignment_cutoff_factor = 16;
+
+        return ret;
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param options pool options to use
+     */
+    unsynchronized_pool_resource(Upstream * upstream, pool_options options = get_default_options())
+        : m_upstream(upstream),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(upstream),
+        m_allocated(),
+        m_oversized(),
+        m_cached_oversized()
+    {
+        assert(m_options.validate());
+
+        pool p = { block_descriptor_ptr(), 0 };
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    // TODO: C++11: use delegating constructors
+
+    /*! Constructor. The upstream resource is obtained by calling \p get_global_resource<Upstream>.
+     *
+     *  \param options pool options to use
+     */
+    unsynchronized_pool_resource(pool_options options = get_default_options())
+        : m_upstream(get_global_resource<Upstream>()),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(get_global_resource<Upstream>()),
+        m_allocated(),
+        m_oversized(),
+        m_cached_oversized()
+    {
+        assert(m_options.validate());
+
+        pool p = { block_descriptor_ptr(), 0 };
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    /*! Destructor. Releases all held memory to upstream.
+     */
+    ~unsynchronized_pool_resource()
+    {
+        release();
+    }
+
+private:
+    typedef typename Upstream::pointer void_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<char>::other char_ptr;
+
+    struct block_descriptor;
+    struct chunk_descriptor;
+    struct oversized_block_descriptor;
+
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<block_descriptor>::other block_descriptor_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<chunk_descriptor>::other chunk_descriptor_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<oversized_block_descriptor>::other oversized_block_descriptor_ptr;
+
+    struct block_descriptor
+    {
+        block_descriptor_ptr next;
+    };
+
+    struct chunk_descriptor
+    {
+        std::size_t size;
+        chunk_descriptor_ptr next;
+    };
+
+    // this was originally a forward list, but I made it a doubly linked list
+    // because that way deallocation when not caching is faster and doesn't require
+    // traversal of a linked list (it's still a forward list for the cached list,
+    // because allocation from that list already traverses)
+    //
+    // TODO: investigate whether it's better to have this be a doubly-linked list
+    // with fast do_deallocate when !m_options.cache_oversized, or to have this be
+    // a forward list and require traversal in do_deallocate
+    //
+    // I assume that it is better this way, but the additional pointer could
+    // potentially hurt? these are supposed to be oversized and/or overaligned,
+    // so they are kinda memory intensive already
+    struct oversized_block_descriptor
+    {
+        std::size_t size;
+        std::size_t alignment;
+        oversized_block_descriptor_ptr prev;
+        oversized_block_descriptor_ptr next;
+        oversized_block_descriptor_ptr next_cached;
+    };
+
+    struct pool
+    {
+        block_descriptor_ptr free_list;
+        std::size_t previous_allocated_count;
+    };
+
+    typedef thrust::host_vector<
+        pool,
+        allocator<pool, Upstream>
+    > pool_vector;
+
+    Upstream * m_upstream;
+
+    pool_options m_options;
+    std::size_t m_smallest_block_log2;
+
+    pool_vector m_pools;
+    chunk_descriptor_ptr m_allocated;
+    oversized_block_descriptor_ptr m_oversized;
+    oversized_block_descriptor_ptr m_cached_oversized;
+
+public:
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        // reset the buckets
+        for (std::size_t i = 0; i < m_pools.size(); ++i)
+        {
+            thrust::raw_reference_cast(m_pools[i]).free_list = block_descriptor_ptr();
+            thrust::raw_reference_cast(m_pools[i]).previous_allocated_count = 0;
+        }
+
+        // deallocate memory allocated for the buckets
+        while (detail::pointer_traits<chunk_descriptor_ptr>::get(m_allocated))
+        {
+            chunk_descriptor_ptr alloc = m_allocated;
+            m_allocated = thrust::raw_reference_cast(*m_allocated).next;
+
+            void_ptr p = static_cast<void_ptr>(
+                static_cast<char_ptr>(
+                    static_cast<void_ptr>(alloc)
+                ) - thrust::raw_reference_cast(*alloc).size
+            );
+            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(chunk_descriptor), m_options.alignment);
+        }
+
+        // deallocate cached oversized/overaligned memory
+        while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(m_oversized))
+        {
+            oversized_block_descriptor_ptr alloc = m_oversized;
+            m_oversized = thrust::raw_reference_cast(*m_oversized).next;
+
+            void_ptr p = static_cast<void_ptr>(
+                static_cast<char_ptr>(
+                    static_cast<void_ptr>(alloc)
+                ) - thrust::raw_reference_cast(*alloc).size
+            );
+            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(oversized_block_descriptor), thrust::raw_reference_cast(*alloc).alignment);
+        }
+
+        m_cached_oversized = oversized_block_descriptor_ptr();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        bytes = (std::max)(bytes, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // an oversized and/or overaligned allocation requested; needs to be allocated separately
+        if (bytes > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            if (m_options.cache_oversized)
+            {
+                oversized_block_descriptor_ptr ptr = m_cached_oversized;
+                oversized_block_descriptor_ptr * previous = &m_cached_oversized;
+                while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(ptr))
+                {
+                    oversized_block_descriptor desc = *ptr;
+                    bool is_good = desc.size >= bytes && desc.alignment >= alignment;
+
+                    // if the size is bigger than the requested size by a factor
+                    // bigger than or equal to the specified cutoff for size,
+                    // allocate a new block
+                    if (is_good)
+                    {
+                        std::size_t size_factor = desc.size / bytes;
+                        if (size_factor >= m_options.cached_size_cutoff_factor)
+                        {
+                            is_good = false;
+                        }
+                    }
+
+                    // if the alignment is bigger than the requested one by a factor
+                    // bigger than or equal to the specified cutoff for alignment,
+                    // allocate a new block
+                    if (is_good)
+                    {
+                        std::size_t alignment_factor = desc.alignment / alignment;
+                        if (alignment_factor >= m_options.cached_alignment_cutoff_factor)
+                        {
+                            is_good = false;
+                        }
+                    }
+
+                    if (is_good)
+                    {
+                        if (previous != &m_cached_oversized)
+                        {
+                            oversized_block_descriptor previous_desc = **previous;
+                            previous_desc.next_cached = desc.next_cached;
+                            **previous = previous_desc;
+                        }
+                        else
+                        {
+                            m_cached_oversized = desc.next_cached;
+                        }
+
+                        desc.next_cached = oversized_block_descriptor_ptr();
+                        *ptr = desc;
+
+                        return static_cast<void_ptr>(
+                            static_cast<char_ptr>(
+                                static_cast<void_ptr>(ptr)
+                            ) - desc.size
+                        );
+                    }
+
+                    previous = &thrust::raw_reference_cast(*ptr).next_cached;
+                    ptr = *previous;
+                }
+            }
+
+            // no fitting cached block found; allocate a new one that's just up to the specs
+            void_ptr allocated = m_upstream->do_allocate(bytes + sizeof(oversized_block_descriptor), alignment);
+            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(allocated) + bytes
+                )
+            );
+
+            oversized_block_descriptor desc;
+            desc.size = bytes;
+            desc.alignment = alignment;
+            desc.prev = oversized_block_descriptor_ptr();
+            desc.next = m_oversized;
+            desc.next_cached = oversized_block_descriptor_ptr();
+            *block = desc;
+            m_oversized = block;
+
+            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
+            {
+                oversized_block_descriptor next = *desc.next;
+                next.prev = block;
+                *desc.next = next;
+            }
+
+            return allocated;
+        }
+
+        // the request is NOT for oversized and/or overaligned memory
+        // allocate a block from an appropriate bucket
+        std::size_t bytes_log2 = thrust::detail::log2_ri(bytes);
+        std::size_t bucket_idx = bytes_log2 - m_smallest_block_log2;
+        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);
+
+        bytes = static_cast<std::size_t>(1) << bytes_log2;
+
+        // if the free list of the bucket has no elements, allocate a new chunk
+        // and split it into blocks pushed to the free list
+        if (!detail::pointer_traits<block_descriptor_ptr>::get(bucket.free_list))
+        {
+            std::size_t n = bucket.previous_allocated_count;
+            if (n == 0)
+            {
+                n = m_options.min_blocks_per_chunk;
+                if (n < (m_options.min_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.min_bytes_per_chunk >> bytes_log2;
+                }
+            }
+            else
+            {
+                n = n * 3 / 2;
+                if (n > (m_options.max_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.max_bytes_per_chunk >> bytes_log2;
+                }
+                if (n > m_options.max_blocks_per_chunk)
+                {
+                    n = m_options.max_blocks_per_chunk;
+                }
+            }
+
+            std::size_t descriptor_size = (std::max)(sizeof(block_descriptor), m_options.alignment);
+            std::size_t block_size = bytes + descriptor_size;
+            block_size += m_options.alignment - block_size % m_options.alignment;
+            std::size_t chunk_size = block_size * n;
+
+            void_ptr allocated = m_upstream->do_allocate(chunk_size + sizeof(chunk_descriptor), m_options.alignment);
+            chunk_descriptor_ptr chunk = static_cast<chunk_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(allocated) + chunk_size
+                )
+            );
+
+            chunk_descriptor desc;
+            desc.size = chunk_size;
+            desc.next = m_allocated;
+            *chunk = desc;
+            m_allocated = chunk;
+
+            for (std::size_t i = 0; i < n; ++i)
+            {
+                block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
+                    static_cast<void_ptr>(
+                        static_cast<char_ptr>(allocated) + block_size * i + bytes
+                    )
+                );
+
+                block_descriptor desc;
+                desc.next = bucket.free_list;
+                *block = desc;
+                bucket.free_list = block;
+            }
+        }
+
+        // allocate a block from the front of the bucket's free list
+        block_descriptor_ptr block = bucket.free_list;
+        bucket.free_list = thrust::raw_reference_cast(*block).next;
+        return static_cast<void_ptr>(
+            static_cast<char_ptr>(
+                static_cast<void_ptr>(block)
+            ) - bytes
+        );
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        n = (std::max)(n, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // verify that the pointer is at least as aligned as claimed
+        assert(reinterpret_cast<detail::intmax_t>(detail::pointer_traits<void_ptr>::get(p)) % alignment == 0);
+
+        // the deallocated block is oversized and/or overaligned
+        if (n > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(p) + n
+                )
+            );
+
+            oversized_block_descriptor desc = *block;
+
+            if (m_options.cache_oversized)
+            {
+                desc.next_cached = m_cached_oversized;
+                *block = desc;
+                m_cached_oversized = block;
+
+                return;
+            }
+
+            if (!detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.prev))
+            {
+                assert(m_oversized == block);
+                m_oversized = desc.next;
+            }
+            else
+            {
+                oversized_block_descriptor prev = *desc.prev;
+                assert(prev.next == block);
+                prev.next = desc.next;
+                *desc.prev = prev;
+            }
+
+            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
+            {
+                oversized_block_descriptor next = *desc.next;
+                assert(next.prev == block);
+                next.prev = desc.prev;
+                *desc.next = next;
+            }
+
+            m_upstream->do_deallocate(p, desc.size + sizeof(oversized_block_descriptor), desc.alignment);
+
+            return;
+        }
+
+        // push the block to the front of the appropriate bucket's free list
+        std::size_t n_log2 = thrust::detail::log2_ri(n);
+        std::size_t bucket_idx = n_log2 - m_smallest_block_log2;
+        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);
+
+        n = static_cast<std::size_t>(1) << n_log2;
+
+        block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
+            static_cast<void_ptr>(
+                static_cast<char_ptr>(p) + n
+            )
+        );
+
+        block_descriptor desc;
+        desc.next = bucket.free_list;
+        *block = desc;
+        bucket.free_list = block;
+    }
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/pool_options.h b/thrust/thrust/mr/pool_options.h
new file mode 100644
index 0000000000000000000000000000000000000000..60430b7d272cd95fa0867d0965e03f48314cc9f6
--- /dev/null
+++ b/thrust/thrust/mr/pool_options.h
@@ -0,0 +1,127 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file pool_options.h
+ *  \brief \p pool_options is a type used by the pooling resource adaptors to fine-tune their behavior.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <thrust/detail/integer_math.h>
+
+#include <thrust/mr/detail/config.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management_classes Memory Management Classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A type used for configuring pooling resource adaptors, to fine-tune their behavior and parameters.
+ */
+struct pool_options
+{
+    /*! The minimal number of blocks, i.e. pieces of memory handed off to the user from a pool of a given size, in a single
+     *      chunk allocated from upstream.
+     */
+    std::size_t min_blocks_per_chunk;
+    /*! The minimal number of bytes in a single chunk allocated from upstream.
+     */
+    std::size_t min_bytes_per_chunk;
+    /*! The maximal number of blocks, i.e. pieces of memory handed off to the user from a pool of a given size, in a single
+     *      chunk allocated from upstream.
+     */
+    std::size_t max_blocks_per_chunk;
+    /*! The maximal number of bytes in a single chunk allocated from upstream.
+     */
+    std::size_t max_bytes_per_chunk;
+
+    /*! The size of blocks in the smallest pool covered by the pool resource. All allocation requests below this size will
+     *      be rounded up to this size.
+     */
+    std::size_t smallest_block_size;
+    /*! The size of blocks in the largest pool covered by the pool resource. All allocation requests above this size will
+     *      be considered oversized, allocated directly from upstream (and not from a pool), and cached only of \p cache_oversized
+     *      is true.
+     */
+    std::size_t largest_block_size;
+
+    /*! The alignment of all blocks in internal pools of the pool resource. All allocation requests above this alignment
+     *      will be considered oversized, allocated directly from upstream (and not from a pool), and cached only of
+     *      \p cache_oversized is true.
+     */
+    std::size_t alignment;
+
+    /*! Decides whether oversized and overaligned blocks are cached for later use, or immediately return it to the upstream
+     *      resource.
+     */
+    bool cache_oversized;
+
+    /*! The size factor at which a cached allocation is considered too ridiculously oversized to use to fulfill an allocation
+     *      request. For instance: the user requests an allocation of size 1024 bytes. A block of size 32 * 1024 bytes is
+     *      cached. If \p cached_size_cutoff_factor is 32 or less, this block will be considered too big for that allocation
+     *      request.
+     */
+    std::size_t cached_size_cutoff_factor;
+    /*! The alignment factor at which a cached allocation is considered too ridiculously overaligned to use to fulfill an
+     *      allocation request. For instance: the user requests an allocation aligned to 32 bytes. A block aligned to 1024 bytes
+     *      is cached. If \p cached_size_cutoff_factor is 32 or less, this block will be considered too overaligned for that
+     *      allocation request.
+     */
+    std::size_t cached_alignment_cutoff_factor;
+
+    /*! Checks if the options are self-consistent.
+     *
+     *  /returns true if the options are self-consitent, false otherwise.
+     */
+    bool validate() const
+    {
+        if (!detail::is_power_of_2(smallest_block_size)) return false;
+        if (!detail::is_power_of_2(largest_block_size)) return false;
+        if (!detail::is_power_of_2(alignment)) return false;
+
+        if (max_bytes_per_chunk == 0 || max_blocks_per_chunk == 0) return false;
+        if (smallest_block_size == 0 || largest_block_size == 0) return false;
+
+        if (min_blocks_per_chunk > max_blocks_per_chunk) return false;
+        if (min_bytes_per_chunk > max_bytes_per_chunk) return false;
+
+        if (smallest_block_size > largest_block_size) return false;
+
+        if (min_blocks_per_chunk * smallest_block_size > max_bytes_per_chunk) return false;
+        if (min_blocks_per_chunk * largest_block_size > max_bytes_per_chunk) return false;
+
+        if (max_blocks_per_chunk * largest_block_size < min_bytes_per_chunk) return false;
+        if (max_blocks_per_chunk * smallest_block_size < min_bytes_per_chunk) return false;
+
+        if (alignment > smallest_block_size) return false;
+
+        return true;
+    }
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/mr/sync_pool.h b/thrust/thrust/mr/sync_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cf8640cab158b87bc806976b6f10d1ec0a6e7c0
--- /dev/null
+++ b/thrust/thrust/mr/sync_pool.h
@@ -0,0 +1,116 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file sync_pool.h
+ *  \brief A mutex-synchronized version of \p unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <mutex>
+
+#include <thrust/mr/pool.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_management_classes Memory Management Classes
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! A mutex-synchronized version of \p unsynchronized_pool_resource. Uses \p std::mutex, and therefore requires C++11.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory
+ */
+template<typename Upstream>
+struct synchronized_pool_resource : public memory_resource<typename Upstream::pointer>
+{
+    typedef unsynchronized_pool_resource<Upstream> unsync_pool;
+    typedef std::lock_guard<std::mutex> lock_t;
+
+    typedef typename Upstream::pointer void_ptr;
+
+public:
+    /*! Get the default options for a pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        return unsync_pool::get_default_options();
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param options pool options to use
+     */
+    synchronized_pool_resource(Upstream * upstream, pool_options options = get_default_options())
+        : upstream_pool(upstream, options)
+    {
+    }
+
+    /*! Constructor. The upstream resource is obtained by calling \p get_global_resource<Upstream>.
+     *
+     *  \param options pool options to use
+     */
+    synchronized_pool_resource(pool_options options = get_default_options())
+        : upstream_pool(get_global_resource<Upstream>(), options)
+    {
+    }
+
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        lock_t lock(mtx);
+        upstream_pool.release();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        lock_t lock(mtx);
+        return upstream_pool.do_allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    {
+        lock_t lock(mtx);
+        upstream_pool.do_deallocate(p, n, alignment);
+    }
+
+private:
+    std::mutex mtx;
+    unsync_pool upstream_pool;
+};
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/mr/tls_pool.h b/thrust/thrust/mr/tls_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..c732f022f74c29eb71a9cbe1335c9f0177becdc8
--- /dev/null
+++ b/thrust/thrust/mr/tls_pool.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file tls_pool.h
+ *  \brief A function wrapping a thread local instance of a \p unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/mr/pool.h>
+
+namespace thrust
+{
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! Potentially constructs, if not yet created, and then returns the address of a thread-local \p unsynchronized_pool_resource,
+ *
+ *  \tparam Upstream the template argument to the pool template
+ *  \param upstream the argument to the constructor, if invoked
+ */
+template<typename Upstream, typename Bookkeeper>
+__host__
+thrust::mr::unsynchronized_pool_resource<Upstream> & tls_pool(Upstream * upstream = NULL)
+{
+    static thread_local auto adaptor = [&]{
+        assert(upstream);
+        return thrust::mr::unsynchronized_pool_resource<Upstream>(upstream);
+    }();
+
+    return adaptor;
+}
+
+/*! \}
+ */
+
+} // end mr
+} // end thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/mr/validator.h b/thrust/thrust/mr/validator.h
new file mode 100644
index 0000000000000000000000000000000000000000..9376ae870b5f6017ef9d27084d580d448fe53e75
--- /dev/null
+++ b/thrust/thrust/mr/validator.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/config.h"
+#include "memory_resource.h"
+
+namespace thrust
+{
+namespace mr
+{
+
+template<typename MR>
+struct validator
+{
+#if THRUST_CPP_DIALECT >= 2011
+  static_assert(
+    std::is_base_of<memory_resource<typename MR::pointer>, MR>::value,
+    "a type used as a memory resource must derive from memory_resource"
+  );
+#endif
+};
+
+template<typename T, typename U>
+struct validator2 : private validator<T>, private validator<U>
+{
+};
+
+template<typename T>
+struct validator2<T, T> : private validator<T>
+{
+};
+
+} // end mr
+} // end thrust
+
diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h
new file mode 100644
index 0000000000000000000000000000000000000000..133deab56600d22f831be271888d643786f51011
--- /dev/null
+++ b/thrust/thrust/optional.h
@@ -0,0 +1,2886 @@
+///
+// optional - An implementation of std::optional with extensions
+// Written in 2017 by Simon Brand (@TartanLlama)
+//
+// To the extent possible under law, the author(s) have dedicated all
+// copyright and related and neighboring rights to this software to the
+// public domain worldwide. This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication
+// along with this software. If not, see
+// <http://creativecommons.org/publicdomain/zero/1.0/>.
+///
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/addressof.h>
+#include <thrust/swap.h>
+
+#define THRUST_OPTIONAL_VERSION_MAJOR 0
+#define THRUST_OPTIONAL_VERSION_MINOR 2
+
+#include <exception>
+#include <functional>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER == 1900)
+#define THRUST_OPTIONAL_MSVC2015
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC49
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC54
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC55
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 &&              \
+     !defined(__clang__))
+// GCC < 5 doesn't support overloading on const&& for member functions
+#define THRUST_OPTIONAL_NO_CONSTRR
+
+// GCC < 5 doesn't support some standard C++11 type traits
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
+  std::has_trivial_copy_constructor<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) std::has_trivial_copy_assign<T>::value
+
+// GCC < 5 doesn't provide a way to emulate std::is_trivially_move_*,
+// so don't enable any optimizations that rely on them:
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) false
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) false
+
+// This one will be different for GCC 5.7 if it's ever supported
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
+
+// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks std::vector
+// for non-copyable types
+#elif (defined(__GNUC__) && __GNUC__ < 8 &&                                                \
+     !defined(__clang__))
+#ifndef THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+#define THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+namespace thrust
+{
+  namespace detail {
+      template<class T>
+      struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{};
+#ifdef _GLIBCXX_VECTOR
+      template<class T, class A>
+      struct is_trivially_copy_constructible<std::vector<T,A>>
+          : std::is_trivially_copy_constructible<T>{};
+#endif      
+  }
+} // end namespace thrust
+#endif
+
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
+    thrust::detail::is_trivially_copy_constructible<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T)                                        \
+  std::is_trivially_copy_assignable<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)                                     \
+  std::is_trivially_move_constructible<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)                                        \
+  std::is_trivially_move_assignable<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
+#else
+
+// To support clang + old libstdc++ without type traits, check for equivalent
+// clang built-ins and use them if present. See note above
+// is_trivially_copyable_impl in
+// thrust/type_traits/is_trivially_relocatable.h for more details.
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+  std::is_trivially_copy_constructible<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+  __is_trivially_assignable(T, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+  std::is_trivially_copy_assignable<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  std::is_trivially_move_constructible<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  __is_trivially_assignable(T, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  std::is_trivially_move_assignable<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_destructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  __is_trivially_destructible(T)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  std::is_trivially_destructible<T>::value
+#endif
+
+#endif
+
+#if THRUST_CPP_DIALECT > 2011
+#define THRUST_OPTIONAL_CPP14
+#endif
+
+// constexpr implies const in C++11, not C++14
+#if (THRUST_CPP_DIALECT == 2011 || defined(THRUST_OPTIONAL_MSVC2015) ||                \
+     defined(THRUST_OPTIONAL_GCC49))
+/// \exclude
+#define THRUST_OPTIONAL_CPP11_CONSTEXPR
+#else
+/// \exclude
+#define THRUST_OPTIONAL_CPP11_CONSTEXPR constexpr
+#endif
+
+namespace thrust
+{
+#ifndef THRUST_MONOSTATE_INPLACE_MUTEX
+#define THRUST_MONOSTATE_INPLACE_MUTEX
+/// \brief Used to represent an optional with no data; essentially a bool
+class monostate {};
+
+/// \brief A tag type to tell optional to construct its value in-place
+struct in_place_t {
+  explicit in_place_t() = default;
+};
+/// \brief A tag to tell optional to construct its value in-place
+static constexpr in_place_t in_place{};
+#endif
+
+template <class T> class optional;
+
+/// \exclude
+namespace detail {
+#ifndef THRUST_TRAITS_MUTEX
+#define THRUST_TRAITS_MUTEX
+// C++14-style aliases for brevity
+template <class T> using remove_const_t = typename std::remove_const<T>::type;
+template <class T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <class T> using decay_t = typename std::decay<T>::type;
+template <bool E, class T = void>
+using enable_if_t = typename std::enable_if<E, T>::type;
+template <bool B, class T, class F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+
+// std::conjunction from C++17
+template <class...> struct conjunction : std::true_type {};
+template <class B> struct conjunction<B> : B {};
+template <class B, class... Bs>
+struct conjunction<B, Bs...>
+    : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
+
+#if defined(_LIBCPP_VERSION) && THRUST_CPP_DIALECT == 2011
+#define THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+#endif
+
+// In C++11 mode, there's an issue in libc++'s std::mem_fn
+// which results in a hard-error when using it in a noexcept expression
+// in some cases. This is a check to workaround the common failing case.
+#ifdef THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+template <class T> struct is_pointer_to_non_const_member_func : std::false_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&&> : std::true_type{};        
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&&> : std::true_type{};        
+
+template <class T> struct is_const_or_const_ref : std::false_type{};
+template <class T> struct is_const_or_const_ref<T const&> : std::true_type{};
+template <class T> struct is_const_or_const_ref<T const> : std::true_type{};    
+#endif
+
+// std::invoke from C++17
+// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround
+__thrust_exec_check_disable__
+template <typename Fn, typename... Args,
+#ifdef THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+          typename = enable_if_t<!(is_pointer_to_non_const_member_func<Fn>::value 
+                                 && is_const_or_const_ref<Args...>::value)>, 
+#endif
+          typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>::value>,
+          int = 0>
+__host__ __device__
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+    noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
+    -> decltype(std::mem_fn(f)(std::forward<Args>(args)...)) {
+  return std::mem_fn(f)(std::forward<Args>(args)...);
+}
+
+__thrust_exec_check_disable__
+template <typename Fn, typename... Args,
+          typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>::value>>
+__host__ __device__
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+    noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+    -> decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)) {
+  return std::forward<Fn>(f)(std::forward<Args>(args)...);
+}
+
+// std::invoke_result from C++17
+template <class F, class, class... Us> struct invoke_result_impl;
+
+template <class F, class... Us>
+struct invoke_result_impl<
+    F, decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...), void()),
+    Us...> {
+  using type = decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...));
+};
+
+template <class F, class... Us>
+using invoke_result = invoke_result_impl<F, void, Us...>;
+
+template <class F, class... Us>
+using invoke_result_t = typename invoke_result<F, Us...>::type;
+#endif
+
+// std::void_t from C++17
+template <class...> struct voider { using type = void; };
+template <class... Ts> using void_t = typename voider<Ts...>::type;
+
+// Trait for checking if a type is a thrust::optional
+template <class T> struct is_optional_impl : std::false_type {};
+template <class T> struct is_optional_impl<optional<T>> : std::true_type {};
+template <class T> using is_optional = is_optional_impl<decay_t<T>>;
+
+// Change void to thrust::monostate
+template <class U>
+using fixup_void = conditional_t<std::is_void<U>::value, monostate, U>;
+
+template <class F, class U, class = invoke_result_t<F, U>>
+using get_map_return = optional<fixup_void<invoke_result_t<F, U>>>;
+
+// Check if invoking F for some Us returns void
+template <class F, class = void, class... U> struct returns_void_impl;
+template <class F, class... U>
+struct returns_void_impl<F, void_t<invoke_result_t<F, U...>>, U...>
+    : std::is_void<invoke_result_t<F, U...>> {};
+template <class F, class... U>
+using returns_void = returns_void_impl<F, void, U...>;
+
+template <class T, class... U>
+using enable_if_ret_void = enable_if_t<returns_void<T &&, U...>::value>;
+
+template <class T, class... U>
+using disable_if_ret_void = enable_if_t<!returns_void<T &&, U...>::value>;
+
+template <class T, class U>
+using enable_forward_value =
+    detail::enable_if_t<std::is_constructible<T, U &&>::value &&
+                        !std::is_same<detail::decay_t<U>, in_place_t>::value &&
+                        !std::is_same<optional<T>, detail::decay_t<U>>::value>;
+
+template <class T, class U, class Other>
+using enable_from_other = detail::enable_if_t<
+    std::is_constructible<T, Other>::value &&
+    !std::is_constructible<T, optional<U> &>::value &&
+    !std::is_constructible<T, optional<U> &&>::value &&
+    !std::is_constructible<T, const optional<U> &>::value &&
+    !std::is_constructible<T, const optional<U> &&>::value &&
+    !std::is_convertible<optional<U> &, T>::value &&
+    !std::is_convertible<optional<U> &&, T>::value &&
+    !std::is_convertible<const optional<U> &, T>::value &&
+    !std::is_convertible<const optional<U> &&, T>::value>;
+
+template <class T, class U>
+using enable_assign_forward = detail::enable_if_t<
+    !std::is_same<optional<T>, detail::decay_t<U>>::value &&
+    !detail::conjunction<std::is_scalar<T>,
+                         std::is_same<T, detail::decay_t<U>>>::value &&
+    std::is_constructible<T, U>::value && std::is_assignable<T &, U>::value>;
+
+template <class T, class U, class Other>
+using enable_assign_from_other = detail::enable_if_t<
+    std::is_constructible<T, Other>::value &&
+    std::is_assignable<T &, Other>::value &&
+    !std::is_constructible<T, optional<U> &>::value &&
+    !std::is_constructible<T, optional<U> &&>::value &&
+    !std::is_constructible<T, const optional<U> &>::value &&
+    !std::is_constructible<T, const optional<U> &&>::value &&
+    !std::is_convertible<optional<U> &, T>::value &&
+    !std::is_convertible<optional<U> &&, T>::value &&
+    !std::is_convertible<const optional<U> &, T>::value &&
+    !std::is_convertible<const optional<U> &&, T>::value &&
+    !std::is_assignable<T &, optional<U> &>::value &&
+    !std::is_assignable<T &, optional<U> &&>::value &&
+    !std::is_assignable<T &, const optional<U> &>::value &&
+    !std::is_assignable<T &, const optional<U> &&>::value>;
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// TODO make a version which works with MSVC
+template <class T, class U = T> struct is_swappable : std::true_type {};
+
+template <class T, class U = T> struct is_nothrow_swappable : std::true_type {};
+#else
+// https://stackoverflow.com/questions/26744589/what-is-a-proper-way-to-implement-is-swappable-to-test-for-the-swappable-concept
+namespace swap_adl_tests {
+// if swap ADL finds this then it would call std::swap otherwise (same
+// signature)
+struct tag {};
+
+template <class T> tag swap(T &, T &);
+template <class T, std::size_t N> tag swap(T (&a)[N], T (&b)[N]);
+
+// helper functions to test if an unqualified swap is possible, and if it
+// becomes std::swap
+template <class, class> std::false_type can_swap(...) noexcept(false);
+template <class T, class U,
+          class = decltype(swap(std::declval<T &>(), std::declval<U &>()))>
+std::true_type can_swap(int) noexcept(noexcept(swap(std::declval<T &>(),
+                                                    std::declval<U &>())));
+
+template <class, class> std::false_type uses_std(...);
+template <class T, class U>
+std::is_same<decltype(swap(std::declval<T &>(), std::declval<U &>())), tag>
+uses_std(int);
+
+template <class T>
+struct is_std_swap_noexcept
+    : std::integral_constant<bool,
+                             std::is_nothrow_move_constructible<T>::value &&
+                                 std::is_nothrow_move_assignable<T>::value> {};
+
+template <class T, std::size_t N>
+struct is_std_swap_noexcept<T[N]> : is_std_swap_noexcept<T> {};
+
+template <class T, class U>
+struct is_adl_swap_noexcept
+    : std::integral_constant<bool, noexcept(can_swap<T, U>(0))> {};
+} // namespace swap_adl_tests
+
+template <class T, class U = T>
+struct is_swappable
+    : std::integral_constant<
+          bool,
+          decltype(detail::swap_adl_tests::can_swap<T, U>(0))::value &&
+              (!decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value ||
+               (std::is_move_assignable<T>::value &&
+                std::is_move_constructible<T>::value))> {};
+
+template <class T, std::size_t N>
+struct is_swappable<T[N], T[N]>
+    : std::integral_constant<
+          bool,
+          decltype(detail::swap_adl_tests::can_swap<T[N], T[N]>(0))::value &&
+              (!decltype(
+                   detail::swap_adl_tests::uses_std<T[N], T[N]>(0))::value ||
+               is_swappable<T, T>::value)> {};
+
+template <class T, class U = T>
+struct is_nothrow_swappable
+    : std::integral_constant<
+          bool,
+          is_swappable<T, U>::value &&
+              ((decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value
+                    &&detail::swap_adl_tests::is_std_swap_noexcept<T>::value) ||
+               (!decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value &&
+                    detail::swap_adl_tests::is_adl_swap_noexcept<T,
+                                                                 U>::value))> {
+};
+#endif
+
+// The storage base manages the actual storage, and correctly propagates
+// trivial destruction from T. This case is for when T is not trivially
+// destructible.
+template <class T, bool = ::std::is_trivially_destructible<T>::value>
+struct optional_storage_base {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base() noexcept
+      : m_dummy(), m_has_value(false) {}
+
+  __thrust_exec_check_disable__
+  template <class... U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
+      : m_value(std::forward<U>(u)...), m_has_value(true) {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~optional_storage_base() {
+    if (m_has_value) {
+      m_value.~T();
+      m_has_value = false;
+    }
+  }
+
+  struct dummy {};
+  union {
+    dummy m_dummy;
+    T m_value;
+  };
+
+  bool m_has_value;
+};
+
+// This case is for when T is trivially destructible.
+template <class T> struct optional_storage_base<T, true> {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base() noexcept
+      : m_dummy(), m_has_value(false) {}
+
+  __thrust_exec_check_disable__
+  template <class... U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
+      : m_value(std::forward<U>(u)...), m_has_value(true) {}
+
+  // No destructor, so this class is trivially destructible
+
+  struct dummy {};
+  union {
+    dummy m_dummy;
+    T m_value;
+  };
+
+  bool m_has_value = false;
+};
+
+// This base class provides some handy member functions which can be used in
+// further derived classes
+template <class T> struct optional_operations_base : optional_storage_base<T> {
+  using optional_storage_base<T>::optional_storage_base;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void hard_reset() noexcept {
+    get().~T();
+    this->m_has_value = false;
+  }
+
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  void construct(Args &&... args) noexcept {
+    new (addressof(this->m_value)) T(std::forward<Args>(args)...);
+    this->m_has_value = true;
+  }
+
+  __thrust_exec_check_disable__
+  template <class Opt>
+  __host__ __device__
+  void assign(Opt &&rhs) {
+    if (this->has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = std::forward<Opt>(rhs).get();
+      } else {
+        this->m_value.~T();
+        this->m_has_value = false;
+      }
+    }
+
+    if (rhs.has_value()) {
+      construct(std::forward<Opt>(rhs).get());
+    }
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  bool has_value() const { return this->m_has_value; }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &get() & { return this->m_value; }
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &get() const & { return this->m_value; }
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&get() && { return std::move(this->m_value); }
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &&get() const && { return std::move(this->m_value); }
+#endif
+};
+
+// This class manages conditionally having a trivial copy constructor
+// This specialization is for when T is trivially copy constructible
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)>
+struct optional_copy_base : optional_operations_base<T> {
+  using optional_operations_base<T>::optional_operations_base;
+};
+
+// This specialization is for when T is not trivially copy constructible
+template <class T>
+struct optional_copy_base<T, false> : optional_operations_base<T> {
+  using optional_operations_base<T>::optional_operations_base;
+
+  __thrust_exec_check_disable__
+  optional_copy_base() = default;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_copy_base(const optional_copy_base &rhs) {
+    if (rhs.has_value()) {
+      this->construct(rhs.get());
+    } else {
+      this->m_has_value = false;
+    }
+  }
+
+  __thrust_exec_check_disable__
+  optional_copy_base(optional_copy_base &&rhs) = default;
+  __thrust_exec_check_disable__
+  optional_copy_base &operator=(const optional_copy_base &rhs) = default;
+  __thrust_exec_check_disable__
+  optional_copy_base &operator=(optional_copy_base &&rhs) = default;
+};
+
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)>
+struct optional_move_base : optional_copy_base<T> {
+  using optional_copy_base<T>::optional_copy_base;
+};
+template <class T> struct optional_move_base<T, false> : optional_copy_base<T> {
+  using optional_copy_base<T>::optional_copy_base;
+
+  __thrust_exec_check_disable__
+  optional_move_base() = default;
+  __thrust_exec_check_disable__
+  optional_move_base(const optional_move_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_move_base(optional_move_base &&rhs) noexcept(
+      std::is_nothrow_move_constructible<T>::value) {
+    if (rhs.has_value()) {
+      this->construct(std::move(rhs.get()));
+    } else {
+      this->m_has_value = false;
+    }
+  }
+  __thrust_exec_check_disable__
+  optional_move_base &operator=(const optional_move_base &rhs) = default;
+  __thrust_exec_check_disable__
+  optional_move_base &operator=(optional_move_base &&rhs) = default;
+};
+
+// This class manages conditionally having a trivial copy assignment operator
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) &&
+                          THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) &&
+                          THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T)>
+struct optional_copy_assign_base : optional_move_base<T> {
+  using optional_move_base<T>::optional_move_base;
+};
+
+template <class T>
+struct optional_copy_assign_base<T, false> : optional_move_base<T> {
+  using optional_move_base<T>::optional_move_base;
+
+  __thrust_exec_check_disable__
+  optional_copy_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_copy_assign_base(const optional_copy_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_copy_assign_base(optional_copy_assign_base &&rhs) = default;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_copy_assign_base &operator=(const optional_copy_assign_base &rhs) {
+    this->assign(rhs);
+    return *this;
+  }
+  __thrust_exec_check_disable__
+  optional_copy_assign_base &
+  operator=(optional_copy_assign_base &&rhs) = default;
+};
+
+template <class T,
+          bool = THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)>
+struct optional_move_assign_base : optional_copy_assign_base<T> {
+  using optional_copy_assign_base<T>::optional_copy_assign_base;
+};
+
+template <class T>
+struct optional_move_assign_base<T, false> : optional_copy_assign_base<T> {
+  using optional_copy_assign_base<T>::optional_copy_assign_base;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_move_assign_base(const optional_move_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base(optional_move_assign_base &&rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base &
+  operator=(const optional_move_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_move_assign_base &
+  operator=(optional_move_assign_base &&rhs) noexcept(
+      std::is_nothrow_move_constructible<T>::value
+          &&std::is_nothrow_move_assignable<T>::value) {
+    this->assign(std::move(rhs));
+    return *this;
+  }
+};
+
+// optional_delete_ctor_base will conditionally delete copy and move
+// constructors depending on whether T is copy/move constructible
+template <class T, bool EnableCopy = std::is_copy_constructible<T>::value,
+          bool EnableMove = std::is_move_constructible<T>::value>
+struct optional_delete_ctor_base {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, true, false> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, false, true> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, false, false> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+// optional_delete_assign_base will conditionally delete copy and move
+// constructors depending on whether T is copy/move constructible + assignable
+template <class T,
+          bool EnableCopy = (std::is_copy_constructible<T>::value &&
+                             std::is_copy_assignable<T>::value),
+          bool EnableMove = (std::is_move_constructible<T>::value &&
+                             std::is_move_assignable<T>::value)>
+struct optional_delete_assign_base {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_assign_base<T, true, false> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = delete;
+};
+
+template <class T> struct optional_delete_assign_base<T, false, true> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_assign_base<T, false, false> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = delete;
+};
+
+} // namespace detail
+
+/// \brief A tag type to represent an empty optional
+struct nullopt_t {
+  struct do_not_use {};
+  __host__ __device__
+  constexpr explicit nullopt_t(do_not_use, do_not_use) noexcept {}
+};
+/// \brief Represents an empty optional
+/// \synopsis static constexpr nullopt_t nullopt;
+///
+/// *Examples*:
+/// ```
+/// thrust::optional<int> a = thrust::nullopt;
+/// void foo (thrust::optional<int>);
+/// foo(thrust::nullopt); //pass an empty optional
+/// ```
+static constexpr nullopt_t nullopt{nullopt_t::do_not_use{},
+                                   nullopt_t::do_not_use{}};
+
+class bad_optional_access : public std::exception {
+public:
+  bad_optional_access() = default;
+  __host__
+  const char *what() const noexcept { return "Optional has no value"; }
+};
+
+/// An optional object is an object that contains the storage for another
+/// object and manages the lifetime of this contained object, if any. The
+/// contained object may be initialized after the optional object has been
+/// initialized, and may be destroyed before the optional object has been
+/// destroyed. The initialization state of the contained object is tracked by
+/// the optional object.
+template <class T>
+class optional : private detail::optional_move_assign_base<T>,
+                 private detail::optional_delete_ctor_base<T>,
+                 private detail::optional_delete_assign_base<T> {
+  using base = detail::optional_move_assign_base<T>;
+
+  static_assert(!std::is_same<T, in_place_t>::value,
+                "instantiation of optional with in_place_t is ill-formed");
+  static_assert(!std::is_same<detail::decay_t<T>, nullopt_t>::value,
+                "instantiation of optional with nullopt_t is ill-formed");
+
+public:
+// The different versions for C++14 and 11 are needed because deduced return
+// types are not SFINAE-safe. This provides better support for things like
+// generic lambdas. C.f.
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0.html
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+#endif
+#else
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`.
+  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise the return value of
+  /// `std::invoke(std::forward<F>(f), value())` is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &&> and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &&> and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+#endif
+#endif
+
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#else
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(optional_map_impl(std::declval<optional &>(),
+                                             std::declval<F &&>()))
+  map(F &&f) & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(optional_map_impl(std::declval<optional &&>(),
+                                             std::declval<F &&>()))
+  map(F &&f) && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(optional_map_impl(std::declval<const optional &>(),
+                              std::declval<F &&>()))
+  map(F &&f) const & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(optional_map_impl(std::declval<const optional &&>(),
+                              std::declval<F &&>()))
+  map(F &&f) const && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#endif
+#endif
+
+  /// \brief Calls `f` if the optional is empty
+  /// \requires `std::invoke_result_t<F>` must be void or convertible to
+  /// `optional<T>`.
+  /// \effects If `*this` has a value, returns `*this`.
+  /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)` and returns
+  /// `std::nullopt`. Otherwise, returns `std::forward<F>(f)()`.
+  ///
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) const & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise returns
+  /// `u`.
+  ///
+  /// \details If there is a value stored, then `f` is called with `**this`
+  /// and the value is returned. Otherwise `u` is returned.
+  ///
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise calls
+  /// `u` and returns the result.
+  ///
+  /// \details If there is a value stored, then `f` is
+  /// called with `**this` and the value is returned. Otherwise
+  /// `std::forward<U>(u)()` is returned.
+  ///
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u) &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+#endif
+
+  /// \returns `u` if `*this` has a value, otherwise an empty optional.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr optional<typename std::decay<U>::type> conjunction(U &&u) const {
+    using result = optional<detail::decay_t<U>>;
+    return has_value() ? result{u} : result{nullopt};
+  }
+
+  /// \returns `rhs` if `*this` is empty, otherwise the current value.
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+#endif
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+#endif
+
+  /// Takes the value out of the optional, leaving it empty
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+#endif
+
+  using value_type = T;
+
+  /// Constructs an optional that does not contain a value.
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  constexpr optional() noexcept = default;
+
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional(nullopt_t) noexcept {}
+
+  /// Copy constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(const optional &rhs) = default;
+
+  /// Move constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(optional &&rhs) = default;
+
+  /// Constructs the stored value in-place using the given arguments.
+  /// \group in_place
+  /// \synopsis template <class... Args> constexpr explicit optional(in_place_t, Args&&... args);
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  constexpr explicit optional(
+      detail::enable_if_t<std::is_constructible<T, Args...>::value, in_place_t>,
+      Args &&... args)
+      : base(in_place, std::forward<Args>(args)...) {}
+
+  /// \group in_place
+  /// \synopsis template <class U, class... Args>\nconstexpr explicit optional(in_place_t, std::initializer_list<U>&, Args&&... args);
+  __thrust_exec_check_disable__
+  template <class U, class... Args>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR explicit optional(
+      detail::enable_if_t<std::is_constructible<T, std::initializer_list<U> &,
+                                                Args &&...>::value,
+                          in_place_t>,
+      std::initializer_list<U> il, Args &&... args) {
+    this->construct(il, std::forward<Args>(args)...);
+  }
+
+  /// Constructs the stored value with `u`.
+  /// \synopsis template <class U=T> constexpr optional(U &&u);
+  __thrust_exec_check_disable__
+  template <
+      class U = T,
+      detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr,
+      detail::enable_forward_value<T, U> * = nullptr>
+  __host__ __device__
+  constexpr optional(U &&u) : base(in_place, std::forward<U>(u)) {}
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <
+      class U = T,
+      detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr,
+      detail::enable_forward_value<T, U> * = nullptr>
+  __host__ __device__
+  constexpr explicit optional(U &&u) : base(in_place, std::forward<U>(u)) {}
+
+  /// Converting copy constructor.
+  /// \synopsis template <class U> optional(const optional<U> &rhs);
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, const U &> * = nullptr,
+      detail::enable_if_t<std::is_convertible<const U &, T>::value> * = nullptr>
+  __host__ __device__
+  optional(const optional<U> &rhs) {
+    this->construct(*rhs);
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class U, detail::enable_from_other<T, U, const U &> * = nullptr,
+            detail::enable_if_t<!std::is_convertible<const U &, T>::value> * =
+                nullptr>
+  __host__ __device__
+  explicit optional(const optional<U> &rhs) {
+    this->construct(*rhs);
+  }
+
+  /// Converting move constructor.
+  /// \synopsis template <class U> optional(optional<U> &&rhs);
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, U &&> * = nullptr,
+      detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr>
+  __host__ __device__
+  optional(optional<U> &&rhs) {
+    this->construct(std::move(*rhs));
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, U &&> * = nullptr,
+      detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr>
+  __host__ __device__
+  explicit optional(optional<U> &&rhs) {
+    this->construct(std::move(*rhs));
+  }
+
+  /// Destroys the stored value if there is one.
+  __thrust_exec_check_disable__
+  ~optional() = default;
+
+  /// Assignment to empty.
+  ///
+  /// Destroys the current value if there is one.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional &operator=(nullopt_t) noexcept {
+    if (has_value()) {
+      this->m_value.~T();
+      this->m_has_value = false;
+    }
+
+    return *this;
+  }
+
+  /// Copy assignment.
+  ///
+  /// Copies the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(const optional &rhs) = default;
+
+  /// Move assignment.
+  ///
+  /// Moves the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(optional &&rhs) = default;
+
+  /// Assigns the stored value from `u`, destroying the old value if there was
+  /// one.
+  /// \synopsis optional &operator=(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T, detail::enable_assign_forward<T, U> * = nullptr>
+  __host__ __device__
+  optional &operator=(U &&u) {
+    if (has_value()) {
+      this->m_value = std::forward<U>(u);
+    } else {
+      this->construct(std::forward<U>(u));
+    }
+
+    return *this;
+  }
+
+  /// Converting copy assignment operator.
+  ///
+  /// Copies the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  /// \synopsis optional &operator=(const optional<U> & rhs);
+  __thrust_exec_check_disable__
+  template <class U,
+            detail::enable_assign_from_other<T, U, const U &> * = nullptr>
+  __host__ __device__
+  optional &operator=(const optional<U> &rhs) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = *rhs;
+      } else {
+        this->hard_reset();
+      }
+    }
+
+    if (rhs.has_value()) {
+      this->construct(*rhs);
+    }
+
+    return *this;
+  }
+
+  // TODO check exception guarantee
+  /// Converting move assignment operator.
+  ///
+  /// Moves the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  /// \synopsis optional &operator=(optional<U> && rhs);
+  __thrust_exec_check_disable__
+  template <class U, detail::enable_assign_from_other<T, U, U> * = nullptr>
+  __host__ __device__
+  optional &operator=(optional<U> &&rhs) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = std::move(*rhs);
+      } else {
+        this->hard_reset();
+      }
+    }
+
+    if (rhs.has_value()) {
+      this->construct(std::move(*rhs));
+    }
+
+    return *this;
+  }
+
+  /// Constructs the value in-place, destroying the current one if there is
+  /// one.
+  /// \group emplace
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  T &emplace(Args &&... args) {
+    static_assert(std::is_constructible<T, Args &&...>::value,
+                  "T must be constructible with Args");
+
+    *this = nullopt;
+    this->construct(std::forward<Args>(args)...);
+    return value();
+  }
+
+  /// \group emplace
+  /// \synopsis template <class U, class... Args>\nT& emplace(std::initializer_list<U> il, Args &&... args);
+  __thrust_exec_check_disable__
+  template <class U, class... Args>
+  __host__ __device__
+  detail::enable_if_t<
+      std::is_constructible<T, std::initializer_list<U> &, Args &&...>::value,
+      T &>
+  emplace(std::initializer_list<U> il, Args &&... args) {
+    *this = nullopt;
+    this->construct(il, std::forward<Args>(args)...);
+    return value();    
+  }
+
+  /// Swaps this optional with the other.
+  ///
+  /// If neither optionals have a value, nothing happens.
+  /// If both have a value, the values are swapped.
+  /// If one has a value, it is moved to the other and the movee is left
+  /// valueless.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void
+  swap(optional &rhs) noexcept(std::is_nothrow_move_constructible<T>::value
+                                   &&detail::is_nothrow_swappable<T>::value) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        using thrust::swap;
+        swap(**this, *rhs);
+      } else {
+        new (addressof(rhs.m_value)) T(std::move(this->m_value));
+        this->m_value.T::~T();
+      }
+    } else if (rhs.has_value()) {
+      new (addressof(this->m_value)) T(std::move(rhs.m_value));
+      rhs.m_value.T::~T();
+    }
+  }
+
+  /// \returns a pointer to the stored value
+  /// \requires a value is stored
+  /// \group pointer
+  /// \synopsis constexpr const T *operator->() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T *operator->() const {
+    return addressof(this->m_value);
+  }
+
+  /// \group pointer
+  /// \synopsis constexpr T *operator->();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T *operator->() {
+    return addressof(this->m_value);
+  }
+
+  /// \returns the stored value
+  /// \requires a value is stored
+  /// \group deref
+  /// \synopsis constexpr T &operator*();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &operator*() & { return this->m_value; }
+
+  /// \group deref
+  /// \synopsis constexpr const T &operator*() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator*() const & { return this->m_value; }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&operator*() && {
+    return std::move(this->m_value);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &&operator*() const && { return std::move(this->m_value); }
+#endif
+
+  /// \returns whether or not the optional has a value
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool has_value() const noexcept { return this->m_has_value; }
+
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr explicit operator bool() const noexcept {
+    return this->m_has_value;
+  }
+
+  /// \returns the contained value if there is one, otherwise throws
+  /// [bad_optional_access]
+  /// \group value
+  /// \synopsis constexpr T &value();
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &value() & {
+    if (has_value())
+      return this->m_value;
+    throw bad_optional_access();
+  }
+  /// \group value
+  /// \synopsis constexpr const T &value() const;
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &value() const & {
+    if (has_value())
+      return this->m_value;
+    throw bad_optional_access();
+  }
+  /// \exclude
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&value() && {
+    if (has_value())
+      return std::move(this->m_value);
+    throw bad_optional_access();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &&value() const && {
+    if (has_value())
+      return std::move(this->m_value);
+    throw bad_optional_access();
+  }
+#endif
+
+  /// \returns the stored value if there is one, otherwise returns `u`
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr T value_or(U &&u) const & {
+    static_assert(std::is_copy_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be copy constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T value_or(U &&u) && {
+    static_assert(std::is_move_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be move constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// Destroys the stored value if one exists, making the optional empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void reset() noexcept {
+    if (has_value()) {
+      this->m_value.~T();
+      this->m_has_value = false;
+    }
+  }
+};
+
+/// \group relop
+/// \brief Compares two optional objects
+/// \details If both optionals contain a value, they are compared with `T`s
+/// relational operators. Otherwise `lhs` and `rhs` are equal only if they are
+/// both empty, and `lhs` is less than `rhs` only if `rhs` is empty and `lhs`
+/// is not.
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return lhs.has_value() == rhs.has_value() &&
+         (!lhs.has_value() || *lhs == *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return lhs.has_value() != rhs.has_value() ||
+         (lhs.has_value() && *lhs != *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const optional<T> &lhs,
+                                const optional<U> &rhs) {
+  return rhs.has_value() && (!lhs.has_value() || *lhs < *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const optional<T> &lhs,
+                                const optional<U> &rhs) {
+  return lhs.has_value() && (!rhs.has_value() || *lhs > *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return !lhs.has_value() || (rhs.has_value() && *lhs <= *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return !rhs.has_value() || (lhs.has_value() && *lhs >= *rhs);
+}
+
+/// \group relop_nullopt
+/// \brief Compares an optional to a `nullopt`
+/// \details Equivalent to comparing the optional to an empty optional
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs, nullopt_t) noexcept {
+  return !lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator==(nullopt_t, const optional<T> &rhs) noexcept {
+  return !rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs, nullopt_t) noexcept {
+  return lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator!=(nullopt_t, const optional<T> &rhs) noexcept {
+  return rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator<(const optional<T> &, nullopt_t) noexcept {
+  return false;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator<(nullopt_t, const optional<T> &rhs) noexcept {
+  return rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator<=(const optional<T> &lhs, nullopt_t) noexcept {
+  return !lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator<=(nullopt_t, const optional<T> &) noexcept {
+  return true;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator>(const optional<T> &lhs, nullopt_t) noexcept {
+  return lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator>(nullopt_t, const optional<T> &) noexcept {
+  return false;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator>=(const optional<T> &, nullopt_t) noexcept {
+  return true;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__                                                    
+template <class T>                                                               
+__host__ __device__       
+inline constexpr bool operator>=(nullopt_t, const optional<T> &rhs) noexcept {
+  return !rhs.has_value();
+}
+
+/// \group relop_t
+/// \brief Compares the optional with a value.
+/// \details If the optional has a value, it is compared with the other value
+/// using `T`s relational operators. Otherwise, the optional is considered
+/// less than the value.
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs == rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs == *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs != rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs != *rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs < rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs < *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs <= rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs <= *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs > rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs > *rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs >= rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs >= *rhs : true;
+}
+
+/// \synopsis template <class T>\nvoid swap(optional<T> &lhs, optional<T> &rhs);
+__thrust_exec_check_disable__
+template <class T,
+          detail::enable_if_t<std::is_move_constructible<T>::value> * = nullptr,
+          detail::enable_if_t<detail::is_swappable<T>::value> * = nullptr>
+__host__ __device__
+void swap(optional<T> &lhs,
+          optional<T> &rhs) noexcept(noexcept(lhs.swap(rhs))) {
+  return lhs.swap(rhs);
+}
+
+namespace detail {
+struct i_am_secret {};
+} // namespace detail
+
+__thrust_exec_check_disable__
+template <class T = detail::i_am_secret, class U,
+          class Ret =
+              detail::conditional_t<std::is_same<T, detail::i_am_secret>::value,
+                                    detail::decay_t<U>, T>>
+__host__ __device__
+inline constexpr optional<Ret> make_optional(U &&v) {
+  return optional<Ret>(std::forward<U>(v));
+}
+
+__thrust_exec_check_disable__
+template <class T, class... Args>
+__host__ __device__
+inline constexpr optional<T> make_optional(Args &&... args) {
+  return optional<T>(in_place, std::forward<Args>(args)...);
+}
+__thrust_exec_check_disable__
+template <class T, class U, class... Args>
+__host__ __device__
+inline constexpr optional<T> make_optional(std::initializer_list<U> il,
+                                           Args &&... args) {
+  return optional<T>(in_place, il, std::forward<Args>(args)...);
+}
+
+#if THRUST_CPP_DIALECT >= 2017
+template <class T> optional(T)->optional<T>;
+#endif
+
+/// \exclude
+namespace detail {
+#ifdef THRUST_OPTIONAL_CPP14
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+constexpr auto optional_map_impl(Opt &&opt, F &&f) {
+  return opt.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt))
+             : optional<Ret>(nullopt);
+}
+
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+auto optional_map_impl(Opt &&opt, F &&f) {
+  if (opt.has_value()) {
+    detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt));
+    return make_optional(monostate{});
+  }
+
+  return optional<monostate>(nullopt);
+}
+#else
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+constexpr auto optional_map_impl(Opt &&opt, F &&f) -> optional<Ret> {
+  return opt.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt))
+             : optional<Ret>(nullopt);
+}
+
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+auto optional_map_impl(Opt &&opt, F &&f) -> optional<monostate> {
+  if (opt.has_value()) {
+    detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt));
+    return monostate{};
+  }
+
+  return nullopt;
+}
+#endif
+} // namespace detail
+
+/// Specialization for when `T` is a reference. `optional<T&>` acts similarly
+/// to a `T*`, but provides more operations and shows intent more clearly.
+///
+/// *Examples*:
+///
+/// ```
+/// int i = 42;
+/// thrust::optional<int&> o = i;
+/// *o == 42; //true
+/// i = 12;
+/// *o = 12; //true
+/// &*o == &i; //true
+/// ```
+///
+/// Assignment has rebind semantics rather than assign-through semantics:
+///
+/// ```
+/// int j = 8;
+/// o = j;
+///
+/// &*o == &j; //true
+/// ```
+template <class T> class optional<T &> {
+public:
+// The different versions for C++14 and 11 are needed because deduced return
+// types are not SFINAE-safe. This provides better support for things like
+// generic lambdas. C.f.
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0.html
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+#endif
+#else
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+#endif
+#endif
+
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#else
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(detail::optional_map_impl(std::declval<optional &>(),
+                                                     std::declval<F &&>()))
+  map(F &&f) & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(detail::optional_map_impl(std::declval<optional &&>(),
+                                                     std::declval<F &&>()))
+  map(F &&f) && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(detail::optional_map_impl(std::declval<const optional &>(),
+                                      std::declval<F &&>()))
+  map(F &&f) const & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(detail::optional_map_impl(std::declval<const optional &&>(),
+                                      std::declval<F &&>()))
+  map(F &&f) const && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#endif
+#endif
+
+  /// \brief Calls `f` if the optional is empty
+  /// \requires `std::invoke_result_t<F>` must be void or convertible to
+  /// `optional<T>`. \effects If `*this` has a value, returns `*this`.
+  /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)` and returns
+  /// `std::nullopt`. Otherwise, returns `std::forward<F>(f)()`.
+  ///
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T>
+  THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T>
+  THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) const & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise returns
+  /// `u`.
+  ///
+  /// \details If there is a value stored, then `f` is called with `**this`
+  /// and the value is returned. Otherwise `u` is returned.
+  ///
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise calls
+  /// `u` and returns the result.
+  ///
+  /// \details If there is a value stored, then `f` is
+  /// called with `**this` and the value is returned. Otherwise
+  /// `std::forward<U>(u)()` is returned.
+  ///
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u) &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+#endif
+
+  /// \returns `u` if `*this` has a value, otherwise an empty optional.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr optional<typename std::decay<U>::type> conjunction(U &&u) const {
+    using result = optional<detail::decay_t<U>>;
+    return has_value() ? result{u} : result{nullopt};
+  }
+
+  /// \returns `rhs` if `*this` is empty, otherwise the current value.
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+#endif
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+#endif
+
+  /// Takes the value out of the optional, leaving it empty
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+#endif
+
+  using value_type = T &;
+
+  /// Constructs an optional that does not contain a value.
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional() noexcept : m_value(nullptr) {}
+
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional(nullopt_t) noexcept : m_value(nullptr) {}
+
+  /// Copy constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(const optional &rhs) noexcept = default;
+
+  /// Move constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(optional &&rhs) = default;
+
+  /// Constructs the stored value with `u`.
+  /// \synopsis template <class U=T> constexpr optional(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T,
+            detail::enable_if_t<!detail::is_optional<detail::decay_t<U>>::value>
+                * = nullptr>
+  __host__ __device__
+  constexpr optional(U &&u) : m_value(addressof(u)) {
+    static_assert(std::is_lvalue_reference<U>::value, "U must be an lvalue");
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr explicit optional(const optional<U> &rhs) : optional(*rhs) {}
+
+  /// No-op
+  __thrust_exec_check_disable__
+  ~optional() = default;
+
+  /// Assignment to empty.
+  ///
+  /// Destroys the current value if there is one.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional &operator=(nullopt_t) noexcept {
+    m_value = nullptr;
+    return *this;
+  }
+
+  /// Copy assignment.
+  ///
+  /// Rebinds this optional to the referee of `rhs` if there is one. Otherwise
+  /// resets the stored value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(const optional &rhs) = default;
+
+  /// Rebinds this optional to `u`.
+  ///
+  /// \requires `U` must be an lvalue reference.
+  /// \synopsis optional &operator=(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T,
+            detail::enable_if_t<!detail::is_optional<detail::decay_t<U>>::value>
+                * = nullptr>
+  __host__ __device__
+  optional &operator=(U &&u) {
+    static_assert(std::is_lvalue_reference<U>::value, "U must be an lvalue");
+    m_value = addressof(u);
+    return *this;
+  }
+
+  /// Converting copy assignment operator.
+  ///
+  /// Rebinds this optional to the referee of `rhs` if there is one. Otherwise
+  /// resets the stored value in `*this`.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  optional &operator=(const optional<U> &rhs) {
+    m_value = addressof(rhs.value());
+    return *this;
+  }
+
+  /// Constructs the value in-place, destroying the current one if there is
+  /// one.
+  ///
+  /// \group emplace
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  T &emplace(Args &&... args) noexcept {
+    static_assert(std::is_constructible<T, Args &&...>::value,
+                  "T must be constructible with Args");
+
+    *this = nullopt;
+    this->construct(std::forward<Args>(args)...);
+  }
+
+  /// Swaps this optional with the other.
+  ///
+  /// If neither optionals have a value, nothing happens.
+  /// If both have a value, the values are swapped.
+  /// If one has a value, it is moved to the other and the movee is left
+  /// valueless.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void swap(optional &rhs) noexcept { std::swap(m_value, rhs.m_value); }
+
+  /// \returns a pointer to the stored value
+  /// \requires a value is stored
+  /// \group pointer
+  /// \synopsis constexpr const T *operator->() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T *operator->() const { return m_value; }
+
+  /// \group pointer
+  /// \synopsis constexpr T *operator->();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T *operator->() { return m_value; }
+
+  /// \returns the stored value
+  /// \requires a value is stored
+  /// \group deref
+  /// \synopsis constexpr T &operator*();
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &operator*() { return *m_value; }
+
+  /// \group deref
+  /// \synopsis constexpr const T &operator*() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator*() const { return *m_value; }
+
+  /// \returns whether or not the optional has a value
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool has_value() const noexcept { return m_value != nullptr; }
+
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr explicit operator bool() const noexcept {
+    return m_value != nullptr;
+  }
+
+  /// \returns the contained value if there is one, otherwise throws
+  /// [bad_optional_access]
+  /// \group value
+  /// synopsis constexpr T &value();
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &value() {
+    if (has_value())
+      return *m_value;
+    throw bad_optional_access();
+  }
+  /// \group value
+  /// \synopsis constexpr const T &value() const;
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &value() const {
+    if (has_value())
+      return *m_value;
+    throw bad_optional_access();
+  }
+
+  /// \returns the stored value if there is one, otherwise returns `u`
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr T value_or(U &&u) const & {
+    static_assert(std::is_copy_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be copy constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T value_or(U &&u) && {
+    static_assert(std::is_move_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be move constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// Destroys the stored value if one exists, making the optional empty
+  __thrust_exec_check_disable__
+  void reset() noexcept { m_value = nullptr; }
+
+private:
+  T *m_value;
+};
+
+} // end namespace thrust
+
+namespace std {
+// TODO SFINAE
+template <class T> struct hash<thrust::optional<T>> {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ::std::size_t operator()(const thrust::optional<T> &o) const {
+    if (!o.has_value())
+      return 0;
+
+    return std::hash<thrust::detail::remove_const_t<T>>()(*o);
+  }
+};
+} // namespace std
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h
new file mode 100644
index 0000000000000000000000000000000000000000..48da892c7d937afb66a60c9076c8f7f5e4752b40
--- /dev/null
+++ b/thrust/thrust/pair.h
@@ -0,0 +1,283 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file pair.h
+ *  \brief A type encapsulating a heterogeneous pair of elements
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <utility>
+
+namespace thrust
+{
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup pair
+ *  \{
+ */
+
+/*! \p pair is a generic data structure encapsulating a heterogeneous
+ *  pair of values.
+ *
+ *  \tparam T1 The type of \p pair's first object type.  There are no
+ *          requirements on the type of \p T1. <tt>T1</tt>'s type is
+ *          provided by <tt>pair::first_type</tt>.
+ *
+ *  \tparam T2 The type of \p pair's second object type.  There are no
+ *          requirements on the type of \p T2. <tt>T2</tt>'s type is
+ *          provided by <tt>pair::second_type</tt>.
+ */
+template <typename T1, typename T2>
+  struct pair
+{
+  /*! \p first_type is the type of \p pair's first object type.
+   */
+  typedef T1 first_type;
+
+  /*! \p second_type is the type of \p pair's second object type.
+   */
+  typedef T2 second_type;
+
+  /*! The \p pair's first object.
+   */
+  first_type first;
+
+  /*! The \p pair's second object.
+   */
+  second_type second;
+
+  /*! \p pair's default constructor constructs \p first
+   *  and \p second using \c first_type & \c second_type's
+   *  default constructors, respectively.
+   */
+  __host__ __device__ pair(void);
+
+  /*! This constructor accepts two objects to copy into this \p pair.
+   *
+   *  \param x The object to copy into \p first.
+   *  \param y The object to copy into \p second.
+   */
+  inline __host__ __device__
+  pair(const T1 &x, const T2 &y);
+
+  /*! This copy constructor copies from a \p pair whose types are
+   *  convertible to this \p pair's \c first_type and \c second_type,
+   *  respectively.
+   *
+   *  \param p The \p pair to copy from.
+   *
+   *  \tparam U1 is convertible to \c first_type.
+   *  \tparam U2 is convertible to \c second_type.
+   */
+  template <typename U1, typename U2>
+  inline __host__ __device__
+  pair(const pair<U1,U2> &p);
+
+  /*! This copy constructor copies from a <tt>std::pair</tt> whose types are
+   *  convertible to this \p pair's \c first_type and \c second_type,
+   *  respectively.
+   *
+   *  \param p The <tt>std::pair</tt> to copy from.
+   *
+   *  \tparam U1 is convertible to \c first_type.
+   *  \tparam U2 is convertible to \c second_type.
+   */
+  template <typename U1, typename U2>
+  inline __host__ __device__
+  pair(const std::pair<U1,U2> &p);
+
+  /*! \p swap swaps the elements of two <tt>pair</tt>s.
+   *  
+   *  \param p The other <tt>pair</tt> with which to swap.
+   */
+  inline __host__ __device__
+  void swap(pair &p);
+}; // end pair
+
+
+/*! This operator tests two \p pairs for equality.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>x.first == y.first && x.second == y.second</tt>.
+ *  
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator==(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! This operator tests two pairs for ascending ordering.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>x.first < y.first || (!(y.first < x.first) && x.second < y.second)</tt>.
+ *
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator<(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! This operator tests two pairs for inequality.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>!(x == y)</tt>.
+ *
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator!=(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! This operator tests two pairs for descending ordering.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>y < x</tt>.
+ *
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator>(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! This operator tests two pairs for ascending ordering or equivalence.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>!(y < x)</tt>.
+ *
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator<=(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! This operator tests two pairs for descending ordering or equivalence.
+ *
+ *  \param x The first \p pair to compare.
+ *  \param y The second \p pair to compare.
+ *  \return \c true if and only if <tt>!(x < y)</tt>.
+ *
+ *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    bool operator>=(const pair<T1,T2> &x, const pair<T1,T2> &y);
+
+
+/*! \p swap swaps the contents of two <tt>pair</tt>s.
+ *
+ *  \param x The first \p pair to swap.
+ *  \param y The second \p pair to swap.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    void swap(pair<T1,T2> &x, pair<T1,T2> &y);
+
+
+/*! This convenience function creates a \p pair from two objects.
+ *
+ *  \param x The first object to copy from.
+ *  \param y The second object to copy from.
+ *  \return A newly-constructed \p pair copied from \p a and \p b.
+ *
+ *  \tparam T1 There are no requirements on the type of \p T1.
+ *  \tparam T2 There are no requirements on the type of \p T2.
+ */
+template <typename T1, typename T2>
+  inline __host__ __device__
+    pair<T1,T2> make_pair(T1 x, T2 y);
+
+
+/*! This convenience metafunction is included for compatibility with
+ *  \p tuple. It returns either the type of a \p pair's
+ *  \c first_type or \c second_type in its nested type, \c type.
+ *
+ *  \tparam N This parameter selects the member of interest.
+ *  \tparam T A \c pair type of interest.
+ */
+template<int N, typename T> struct tuple_element;
+
+
+/*! This convenience metafunction is included for compatibility with
+ *  \p tuple. It returns \c 2, the number of elements of a \p pair,
+ *  in its nested data member, \c value.
+ *
+ *  \tparam Pair A \c pair type of interest.
+ */
+template<typename Pair> struct tuple_size;
+
+
+/*! This convenience function returns a reference to either the first or
+ *  second member of a \p pair.
+ *
+ *  \param p The \p pair of interest.
+ *  \return \c p.first or \c p.second, depending on the template
+ *          parameter.
+ *
+ *  \tparam N This parameter selects the member of interest.
+ */
+// XXX comment out these prototypes as a WAR to a problem on MSVC 2005
+//template<unsigned int N, typename T1, typename T2>
+//  inline __host__ __device__
+//    typename tuple_element<N, pair<T1,T2> >::type &
+//      get(pair<T1,T2> &p);
+
+
+/*! This convenience function returns a const reference to either the
+ *  first or second member of a \p pair.
+ *
+ *  \param p The \p pair of interest.
+ *  \return \c p.first or \c p.second, depending on the template
+ *          parameter.
+ *
+ *  \tparam i This parameter selects the member of interest.
+ */
+// XXX comment out these prototypes as a WAR to a problem on MSVC 2005
+//template<int N, typename T1, typename T2>
+//  inline __host__ __device__
+//    const typename tuple_element<N, pair<T1,T2> >::type &
+//      get(const pair<T1,T2> &p);
+
+/*! \} // pair
+ */
+
+/*! \} // utility
+ */
+
+} // end thrust
+
+#include <thrust/detail/pair.inl>
+
diff --git a/thrust/thrust/partition.h b/thrust/thrust/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c493e0881639d75faa9516a34588dcfa2ea0fa2
--- /dev/null
+++ b/thrust/thrust/partition.h
@@ -0,0 +1,1439 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file partition.h
+ *  \brief Reorganizes a range based on a predicate
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reordering
+ *  \ingroup algorithms
+ *
+ *  \addtogroup partitioning
+ *  \ingroup reordering
+ *  \{
+ */
+
+
+/*! \p partition reorders the elements <tt>[first, last)</tt> based on the function
+ *  object \p pred, such that all of the elements that satisfy \p pred precede the
+ *  elements that fail to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*i)</tt> is \c true for every
+ *  iterator \c i in the range <tt>[first,middle)</tt> and \c false for every iterator
+ *  \c i in the range <tt>[middle, last)</tt>. The return value of \p partition is
+ *  \c middle.
+ *
+ *  Note that the relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition, does guarantee to preserve the relative order.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements which do not satisfy \p pred.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p partition to reorder a
+ *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::partition(thrust::host,
+ *                    A, A + N,
+ *                    is_even());
+ *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see \p stable_partition
+ *  \see \p partition_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+/*! \p partition reorders the elements <tt>[first, last)</tt> based on the function
+ *  object \p pred, such that all of the elements that satisfy \p pred precede the
+ *  elements that fail to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*i)</tt> is \c true for every
+ *  iterator \c i in the range <tt>[first,middle)</tt> and \c false for every iterator
+ *  \c i in the range <tt>[middle, last)</tt>. The return value of \p partition is
+ *  \c middle.
+ *
+ *  Note that the relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition, does guarantee to preserve the relative order.
+ *
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements which do not satisfy \p pred.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p partition to reorder a
+ *  sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::partition(A, A + N,
+ *                     is_even());
+ *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see \p stable_partition
+ *  \see \p partition_copy
+ */
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator partition(ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+/*! \p partition reorders the elements <tt>[first, last)</tt> based on the function
+ *  object \p pred applied to a stencil range <tt>[stencil, stencil + (last - first))</tt>,
+ *  such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose
+ *  corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*stencil_i)</tt> is \c true for every iterator
+ *  \c stencil_i in the range <tt>[stencil,stencil + (middle - first))</tt> and \c false for every iterator \c stencil_i
+ *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
+ *  The return value of \p stable_partition is \c middle.
+ *
+ *  Note that the relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition, does guarantee to preserve the relative order.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements whose stencil elements do not satisfy \p pred.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p partition to reorder a
+ *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::partition(thrust::host, A, A + N, S, is_even());
+ *  // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  // S is unmodified
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see \p stable_partition
+ *  \see \p partition_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+/*! \p partition reorders the elements <tt>[first, last)</tt> based on the function
+ *  object \p pred applied to a stencil range <tt>[stencil, stencil + (last - first))</tt>,
+ *  such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose
+ *  corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*stencil_i)</tt> is \c true for every iterator
+ *  \c stencil_i in the range <tt>[stencil,stencil + (middle - first))</tt> and \c false for every iterator \c stencil_i
+ *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
+ *  The return value of \p stable_partition is \c middle.
+ *
+ *  Note that the relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition, does guarantee to preserve the relative order.
+ *
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements whose stencil elements do not satisfy \p pred.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p partition to reorder a
+ *  sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::partition(A, A + N, S, is_even());
+ *  // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  // S is unmodified
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see \p stable_partition
+ *  \see \p partition_copy
+ */
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator partition(ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+/*! \p partition_copy differs from \p partition only in that the reordered
+ *  sequence is written to difference output sequences, rather than in place.
+ *
+ *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred. All of the elements that satisfy \p pred are copied
+ *  to the range beginning at \p out_true and all the elements that fail to satisfy it
+ *  are copied to the range beginning at \p out_false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
+ *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input range shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p partition_copy to separate a
+ *  sequence into two output sequences of even and odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::partition_copy(thrust::host, A, A + N, evens, odds, is_even());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \note The relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p stable_partition_copy
+ *  \see \p partition
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+/*! \p partition_copy differs from \p partition only in that the reordered
+ *  sequence is written to difference output sequences, rather than in place.
+ *
+ *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred. All of the elements that satisfy \p pred are copied
+ *  to the range beginning at \p out_true and all the elements that fail to satisfy it
+ *  are copied to the range beginning at \p out_false.
+ *
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
+ *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input range shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p partition_copy to separate a
+ *  sequence into two output sequences of even and odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::partition_copy(A, A + N, evens, odds, is_even());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \note The relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p stable_partition_copy
+ *  \see \p partition
+ */
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+/*! \p partition_copy differs from \p partition only in that the reordered
+ *  sequence is written to difference output sequences, rather than in place.
+ *
+ *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred which is applied to a range of stencil elements. All of the elements
+ *  whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true
+ *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
+ *  at \p out_false.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p partition_copy to separate a
+ *  sequence into two output sequences of even and odd numbers using the \p thrust::host execution
+ *  policy for parallelization.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity<int>());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0,  1}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \note The relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p stable_partition_copy
+ *  \see \p partition
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+/*! \p partition_copy differs from \p partition only in that the reordered
+ *  sequence is written to difference output sequences, rather than in place.
+ *
+ *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred which is applied to a range of stencil elements. All of the elements
+ *  whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true
+ *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
+ *  at \p out_false.
+ *
+ *  \param first The beginning of the sequence to reorder.
+ *  \param last The end of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p partition_copy to separate a
+ *  sequence into two output sequences of even and odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity<int>());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0,  1}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \note The relative order of elements in the two reordered sequences is not
+ *  necessarily the same as it was in the original sequence. A different algorithm,
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p stable_partition_copy
+ *  \see \p partition
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+/*! \p stable_partition is much like \p partition : it reorders the elements in the
+ *  range <tt>[first, last)</tt> based on the function object \p pred, such that all of
+ *  the elements that satisfy \p pred precede all of the elements that fail to satisfy
+ *  it. The postcondition is that, for some iterator \p middle in the range
+ *  <tt>[first, last)</tt>, <tt>pred(*i)</tt> is \c true for every iterator \c i in the
+ *  range <tt>[first,middle)</tt> and \c false for every iterator \c i in the range
+ *  <tt>[middle, last)</tt>. The return value of \p stable_partition is \c middle.
+ *
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
+ *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
+ *  <tt>[first, last)</tt>, and \c stencil_x and \c stencil_y are the stencil elements
+ *  in corresponding positions within <tt>[stencil, stencil + (last - first))</tt>,
+ *  and <tt>pred(stencil_x) == pred(stencil_y)</tt>, and if \c x precedes
+ *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements which do not satisfy pred.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition to reorder a
+ *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::stable_partition(thrust::host,
+ *                           A, A + N,
+ *                           is_even());
+ *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see \p partition
+ *  \see \p stable_partition_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred);
+
+
+/*! \p stable_partition is much like \p partition : it reorders the elements in the
+ *  range <tt>[first, last)</tt> based on the function object \p pred, such that all of
+ *  the elements that satisfy \p pred precede all of the elements that fail to satisfy
+ *  it. The postcondition is that, for some iterator \p middle in the range
+ *  <tt>[first, last)</tt>, <tt>pred(*i)</tt> is \c true for every iterator \c i in the
+ *  range <tt>[first,middle)</tt> and \c false for every iterator \c i in the range
+ *  <tt>[middle, last)</tt>. The return value of \p stable_partition is \c middle.
+ *
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
+ *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
+ *  <tt>[first, last)</tt>, and \c stencil_x and \c stencil_y are the stencil elements
+ *  in corresponding positions within <tt>[stencil, stencil + (last - first))</tt>,
+ *  and <tt>pred(stencil_x) == pred(stencil_y)</tt>, and if \c x precedes
+ *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
+ *
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements which do not satisfy pred.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition to reorder a
+ *  sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::stable_partition(A, A + N,
+ *                            is_even());
+ *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see \p partition
+ *  \see \p stable_partition_copy
+ */
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred);
+
+
+/*! \p stable_partition is much like \p partition: it reorders the elements in the
+ *  range <tt>[first, last)</tt> based on the function object \p pred applied to a stencil
+ *  range <tt>[stencil, stencil + (last - first))</tt>, such that all of
+ *  the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose
+ *  corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*stencil_i)</tt> is \c true for every iterator
+ *  \c stencil_i in the range <tt>[stencil,stencil + (middle - first))</tt> and \c false for every iterator \c stencil_i
+ *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
+ *  The return value of \p stable_partition is \c middle.
+ *
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
+ *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
+ *  <tt>[first, last)</tt>, such that <tt>pred(x) == pred(y)</tt>, and if \c x precedes
+ *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements whose stencil elements do not satisfy \p pred.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition to reorder a
+ *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::stable_partition(thrust::host, A, A + N, S, is_even());
+ *  // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  // S is unmodified
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see \p partition
+ *  \see \p stable_partition_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred);
+
+
+/*! \p stable_partition is much like \p partition: it reorders the elements in the
+ *  range <tt>[first, last)</tt> based on the function object \p pred applied to a stencil
+ *  range <tt>[stencil, stencil + (last - first))</tt>, such that all of
+ *  the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose
+ *  corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator
+ *  \c middle in the range <tt>[first, last)</tt>, <tt>pred(*stencil_i)</tt> is \c true for every iterator
+ *  \c stencil_i in the range <tt>[stencil,stencil + (middle - first))</tt> and \c false for every iterator \c stencil_i
+ *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
+ *  The return value of \p stable_partition is \c middle.
+ *
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
+ *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
+ *  <tt>[first, last)</tt>, such that <tt>pred(x) == pred(y)</tt>, and if \c x precedes
+ *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
+ *
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return An iterator referring to the first element of the second partition, that is,
+ *          the sequence of the elements whose stencil elements do not satisfy \p pred.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition to reorder a
+ *  sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::stable_partition(A, A + N, S, is_even());
+ *  // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  // S is unmodified
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see \p partition
+ *  \see \p stable_partition_copy
+ */
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred);
+
+
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
+ *  sequence is written to different output sequences, rather than in place.
+ *
+ *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred. All of the elements that satisfy \p pred are copied
+ *  to the range beginning at \p out_true and all the elements that fail to satisfy it
+ *  are copied to the range beginning at \p out_false.
+ *
+ *  \p stable_partition_copy differs from \p partition_copy in that
+ *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
+ *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
+ *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
+ *  after \p stable_partition_copy that \c x precedes \c y in the output.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
+ *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition_copy to
+ *  reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(thrust::host, A, A + N, evens, odds, is_even());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p partition_copy
+ *  \see \p stable_partition
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
+ *  sequence is written to different output sequences, rather than in place.
+ *
+ *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred. All of the elements that satisfy \p pred are copied
+ *  to the range beginning at \p out_true and all the elements that fail to satisfy it
+ *  are copied to the range beginning at \p out_false.
+ *
+ *  \p stable_partition_copy differs from \p partition_copy in that
+ *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
+ *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
+ *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
+ *  after \p stable_partition_copy that \c x precedes \c y in the output.
+ *
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
+ *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition_copy to
+ *  reorder a sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(A, A + N, evens, odds, is_even());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p partition_copy
+ *  \see \p stable_partition
+ */
+template<typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
+ *  sequence is written to different output sequences, rather than in place.
+ *
+ *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred which is applied to a range of stencil elements. All of the elements
+ *  whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true
+ *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
+ *  at \p out_false.
+ *
+ *  \p stable_partition_copy differs from \p partition_copy in that
+ *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
+ *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
+ *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
+ *  after \p stable_partition_copy that \c x precedes \c y in the output.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition_copy to
+ *  reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity<int>());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0,  1}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p partition_copy
+ *  \see \p stable_partition
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
+ *  sequence is written to different output sequences, rather than in place.
+ *
+ *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
+ *  function object \p pred which is applied to a range of stencil elements. All of the elements
+ *  whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true
+ *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
+ *  at \p out_false.
+ *
+ *  \p stable_partition_copy differs from \p partition_copy in that
+ *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
+ *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
+ *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
+ *  after \p stable_partition_copy that \c x precedes \c y in the output.
+ *
+ *  \param first The first element of the sequence to reorder.
+ *  \param last One position past the last element of the sequence to reorder.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param out_true The destination of the resulting sequence of elements which satisfy \p pred.
+ *  \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred.
+ *  \param pred A function object which decides to which partition each element of the
+ *              sequence <tt>[first, last)</tt> belongs.
+ *  \return A \p pair p such that <tt>p.first</tt> is the end of the output range beginning
+ *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
+ *          \p out_false.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap with either output range.
+ *
+ *  The following code snippet demonstrates how to use \p stable_partition_copy to
+ *  reorder a sequence so that even numbers precede odd numbers.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0,  1};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  int *evens = result;
+ *  int *odds  = result + 5;
+ *  thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity<int>());
+ *  // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0,  1}
+ *  // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
+ *  // evens points to {2, 4, 6, 8, 10}
+ *  // odds points to {1, 3, 5, 7, 9}
+ *  \endcode
+ *
+ *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
+ *  \see \p partition_copy
+ *  \see \p stable_partition
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+/*! \} // end stream_compaction
+ */
+
+/*! \} // end reordering
+ */
+
+/*! \addtogroup searching
+ *  \{
+ */
+
+
+/*! \p partition_point returns an iterator pointing to the end of the true
+ *  partition of a partitioned range. \p partition_point requires the input range
+ *  <tt>[first,last)</tt> to be a partition; that is, all elements which satisfy
+ *  <tt>pred</tt> shall appear before those that do not.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range to consider.
+ *  \param last The end of the range to consider.
+ *  \param pred A function object which decides to which partition each element of the
+ *              range <tt>[first, last)</tt> belongs.
+ *  \return An iterator \c mid such that <tt>all_of(first, mid, pred)</tt>
+ *          and <tt>none_of(mid, last, pred)</tt> are both true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
+ *
+ *  \note Though similar, \p partition_point is not redundant with \p find_if_not.
+ *        \p partition_point's precondition provides an opportunity for a
+ *        faster implemention.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *
+ *  int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9};
+ *  int * B = thrust::partition_point(thrust::host, A, A + 10, is_even());
+ *  // B - A is 5
+ *  // [A, B) contains only even values
+ *  \endcode
+ *
+ *  \see \p partition
+ *  \see \p find_if_not
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate>
+__host__ __device__
+  ForwardIterator partition_point(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred);
+
+
+/*! \p partition_point returns an iterator pointing to the end of the true
+ *  partition of a partitioned range. \p partition_point requires the input range
+ *  <tt>[first,last)</tt> to be a partition; that is, all elements which satisfy
+ *  <tt>pred</tt> shall appear before those that do not.
+ *  \param first The beginning of the range to consider.
+ *  \param last The end of the range to consider.
+ *  \param pred A function object which decides to which partition each element of the
+ *              range <tt>[first, last)</tt> belongs.
+ *  \return An iterator \c mid such that <tt>all_of(first, mid, pred)</tt>
+ *          and <tt>none_of(mid, last, pred)</tt> are both true.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
+ *
+ *  \note Though similar, \p partition_point is not redundant with \p find_if_not.
+ *        \p partition_point's precondition provides an opportunity for a
+ *        faster implemention.
+ *
+ *  \code
+ *  #include <thrust/partition.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *
+ *  int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9};
+ *  int * B = thrust::partition_point(A, A + 10, is_even());
+ *  // B - A is 5
+ *  // [A, B) contains only even values
+ *  \endcode
+ *
+ *  \see \p partition
+ *  \see \p find_if_not
+ */
+template<typename ForwardIterator, typename Predicate>
+  ForwardIterator partition_point(ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred);
+
+/*! \} // searching
+ */
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup predicates
+ *  \{
+ */
+
+
+/*! \p is_partitioned returns \c true if the given range 
+ *  is partitioned with respect to a predicate, and \c false otherwise.
+ *
+ *  Specifically, \p is_partitioned returns \c true if <tt>[first, last)</tt>
+ *  is empty of if <tt>[first, last)</tt> is partitioned by \p pred, i.e. if
+ *  all elements that satisfy \p pred appear before those that do not.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range to consider.
+ *  \param last The end of the range to consider.
+ *  \param pred A function object which decides to which partition each element of the
+ *         range <tt>[first, last)</tt> belongs.
+ *  \return \c true if the range <tt>[first, last)</tt> is partitioned with respect
+ *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  
+ *  \code
+ *  #include <thrust/partition.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *
+ *  int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9};
+ *  int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *
+ *  thrust::is_partitioned(thrust::host, A, A + 10, is_even()); // returns true
+ *  thrust::is_partitioned(thrust::host, B, B + 10, is_even()); // returns false
+ *  \endcode
+ *
+ *  \see \p partition
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+  bool is_partitioned(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+/*! \p is_partitioned returns \c true if the given range 
+ *  is partitioned with respect to a predicate, and \c false otherwise.
+ *
+ *  Specifically, \p is_partitioned returns \c true if <tt>[first, last)</tt>
+ *  is empty of if <tt>[first, last)</tt> is partitioned by \p pred, i.e. if
+ *  all elements that satisfy \p pred appear before those that do not.
+ *
+ *  \param first The beginning of the range to consider.
+ *  \param last The end of the range to consider.
+ *  \param pred A function object which decides to which partition each element of the
+ *         range <tt>[first, last)</tt> belongs.
+ *  \return \c true if the range <tt>[first, last)</tt> is partitioned with respect
+ *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  
+ *  \code
+ *  #include <thrust/partition.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int &x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *
+ *  int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9};
+ *  int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *
+ *  thrust::is_partitioned(A, A + 10, is_even()); // returns true
+ *  thrust::is_partitioned(B, B + 10, is_even()); // returns false
+ *  \endcode
+ *
+ *  \see \p partition
+ */
+template<typename InputIterator, typename Predicate>
+  bool is_partitioned(InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+/*! \} // end predicates
+ *  \} // end reductions
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/partition.inl>
+
diff --git a/thrust/thrust/per_device_resource.h b/thrust/thrust/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c0158aeeab185dba80ed9890e608fbb4d1336ff
--- /dev/null
+++ b/thrust/thrust/per_device_resource.h
@@ -0,0 +1,104 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/system/detail/generic/per_device_resource.h>
+#include <thrust/system/detail/adl/per_device_resource.h>
+#include <thrust/mr/allocator.h>
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/mr/allocator.h>
+
+namespace thrust
+{
+
+/*! Returns a global instance of \p MR for the current device of the provided system.
+ *
+ *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
+ *  \param system execution policy for which the resource is requested.
+ *  \returns a pointer to a global instance of \p MR for the current device.
+ */
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(const thrust::detail::execution_policy_base<DerivedPolicy> & system)
+{
+    using thrust::system::detail::generic::get_per_device_resource;
+
+    return get_per_device_resource<MR>(
+        thrust::detail::derived_cast(
+            thrust::detail::strip_const(system)));
+}
+
+/*! A helper allocator class that uses global per device instances of a given upstream memory resource. Requires the memory
+ *      resource to be default constructible.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam MR the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final.
+ *  \tparam ExecutionPolicy the execution policy of the system to be used to retrieve the resource for the current device.
+ */
+template<typename T, typename Upstream, typename ExecutionPolicy>
+class per_device_allocator : public thrust::mr::allocator<T, Upstream>
+{
+    typedef thrust::mr::allocator<T, Upstream> base;
+
+public:
+    /*! The \p rebind metafunction provides the type of an \p per_device_allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p per_device_allocator.
+         */
+        typedef per_device_allocator<U, Upstream, ExecutionPolicy> other;
+    };
+
+    /*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
+     *      \p allocator base subobject with that resource.
+     */
+    __host__
+    per_device_allocator() : base(get_per_device_resource<Upstream>(ExecutionPolicy()))
+    {
+    }
+
+    /*! Copy constructor. Copies the memory resource pointer. */
+    __host__ __device__
+    per_device_allocator(const per_device_allocator & other)
+        : base(other) {}
+
+    /*! Conversion constructor from an allocator of a different type. Copies the memory resource pointer. */
+    template<typename U>
+    __host__ __device__
+    per_device_allocator(const per_device_allocator<U, Upstream, ExecutionPolicy> & other)
+        : base(other) {}
+
+    /*! Destructor. */
+    __host__ __device__
+    ~per_device_allocator() {}
+};
+
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/thrust/random.h b/thrust/thrust/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0e9e2282414b6e891808337eef41d016abbbe7e
--- /dev/null
+++ b/thrust/thrust/random.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file random.h
+ *  \brief Pseudo-random number generators.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cstdint.h>
+
+// RNGs
+#include <thrust/random/discard_block_engine.h>
+#include <thrust/random/linear_congruential_engine.h>
+#include <thrust/random/linear_feedback_shift_engine.h>
+#include <thrust/random/subtract_with_carry_engine.h>
+#include <thrust/random/xor_combine_engine.h>
+
+// distributions
+#include <thrust/random/uniform_int_distribution.h>
+#include <thrust/random/uniform_real_distribution.h>
+#include <thrust/random/normal_distribution.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup random Random Number Generation
+ *  \{
+ */
+
+
+/*! \namespace thrust::random
+ *  \brief \p thrust::random is the namespace which contains random number engine class templates,
+ *  random number engine adaptor class templates, engines with predefined parameters,
+ *  and random number distribution class templates. They are provided in a separate namespace
+ *  for import convenience but are also aliased in the top-level \p thrust namespace for
+ *  easy access.
+ */
+namespace random
+{
+
+/*! \addtogroup predefined_random Random Number Engines with Predefined Parameters
+ *  \ingroup random
+ *  \{
+ */
+
+/*! \typedef ranlux24
+ *  \brief A random number engine with predefined parameters which implements the
+ *         RANLUX level-3 random number generation algorithm.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24
+ *        shall produce the value \c 9901578 .
+ */
+typedef discard_block_engine<ranlux24_base, 223, 23> ranlux24;
+
+
+/*! \typedef ranlux48
+ *  \brief A random number engine with predefined parameters which implements the
+ *         RANLUX level-4 random number generation algorithm.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48
+ *        shall produce the value \c 88229545517833 .
+ */
+typedef discard_block_engine<ranlux48_base, 389, 11> ranlux48;
+
+
+/*! \typedef taus88
+ *  \brief A random number engine with predefined parameters which implements
+ *         L'Ecuyer's 1996 three-component Tausworthe random number generator.
+ *
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p taus88
+ *        shall produce the value \c 3535848941 .
+ */
+typedef xor_combine_engine<
+  linear_feedback_shift_engine<thrust::detail::uint32_t, 32u, 31u, 13u, 12u>,
+  0,
+  xor_combine_engine<
+    linear_feedback_shift_engine<thrust::detail::uint32_t, 32u, 29u,  2u,  4u>, 0,
+    linear_feedback_shift_engine<thrust::detail::uint32_t, 32u, 28u,  3u, 17u>, 0
+  >,
+  0
+> taus88;
+
+/*! \typedef default_random_engine
+ *  \brief An implementation-defined "default" random number engine.
+ *  \note \p default_random_engine is currently an alias for \p minstd_rand, and may change
+ *        in a future version.
+ */
+typedef minstd_rand default_random_engine;
+
+/*! \} // end predefined_random
+ */
+
+} // end random
+
+
+/*! \} // end random
+ */
+
+// import names into thrust::
+using random::ranlux24;
+using random::ranlux48;
+using random::taus88;
+using random::default_random_engine;
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/discard_block_engine.inl b/thrust/thrust/random/detail/discard_block_engine.inl
new file mode 100644
index 0000000000000000000000000000000000000000..fca16c2bfd039b13001f189b0840629d971153b9
--- /dev/null
+++ b/thrust/thrust/random/detail/discard_block_engine.inl
@@ -0,0 +1,212 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/discard_block_engine.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  discard_block_engine<Engine,p,r>
+    ::discard_block_engine()
+      : m_e(), m_n(0)
+{}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  discard_block_engine<Engine,p,r>
+    ::discard_block_engine(result_type s)
+      : m_e(s), m_n(0)
+{}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  discard_block_engine<Engine,p,r>
+    ::discard_block_engine(const base_type &urng)
+      : m_e(urng), m_n(0)
+{}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  void discard_block_engine<Engine,p,r>
+    ::seed(void)
+{
+  m_e.seed();
+  m_n = 0;
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  void discard_block_engine<Engine,p,r>
+    ::seed(result_type s)
+{
+  m_e.seed(s);
+  m_n = 0;
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  typename discard_block_engine<Engine,p,r>::result_type
+    discard_block_engine<Engine,p,r>
+      ::operator()(void)
+{
+  if(m_n >= used_block)
+  {
+    m_e.discard(block_size - m_n);
+//    for(; m_n < block_size; ++m_n)
+//      m_e();
+    m_n = 0;
+  }
+
+  ++m_n;
+
+  return m_e();
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  void discard_block_engine<Engine,p,r>
+    ::discard(unsigned long long z)
+{
+  // XXX this should be accelerated
+  for(; z > 0; --z)
+  {
+    this->operator()();
+  } // end for
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  const typename discard_block_engine<Engine,p,r>::base_type &
+    discard_block_engine<Engine,p,r>
+      ::base(void) const
+{
+  return m_e;
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& discard_block_engine<Engine,p,r>
+      ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags & fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  // output the base engine followed by n
+  os << m_e << space << m_n;
+
+  // restore flags & fill character
+  os.flags(flags);
+  os.fill(fill);
+
+  return os;
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& discard_block_engine<Engine,p,r>
+      ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base  ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  // input the base engine and then n
+  is >> m_e >> m_n;
+
+  // restore old flags
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename Engine, size_t p, size_t r>
+  __host__ __device__
+  bool discard_block_engine<Engine,p,r>
+    ::equal(const discard_block_engine<Engine,p,r> &rhs) const
+{
+  return (m_e == rhs.m_e) && (m_n == rhs.m_n);
+}
+
+
+template<typename Engine, size_t p, size_t r,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const discard_block_engine<Engine,p,r> &e)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,e);
+}
+
+
+template<typename Engine, size_t p, size_t r,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           discard_block_engine<Engine,p,r> &e)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,e);
+}
+
+
+template<typename Engine, size_t p, size_t r>
+__host__ __device__
+bool operator==(const discard_block_engine<Engine,p,r> &lhs,
+                const discard_block_engine<Engine,p,r> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename Engine, size_t p, size_t r>
+__host__ __device__
+bool operator!=(const discard_block_engine<Engine,p,r> &lhs,
+                const discard_block_engine<Engine,p,r> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/linear_congruential_engine.inl b/thrust/thrust/random/detail/linear_congruential_engine.inl
new file mode 100644
index 0000000000000000000000000000000000000000..da0b03e15c2dcc2155479d431659868b8fc2dc0c
--- /dev/null
+++ b/thrust/thrust/random/detail/linear_congruential_engine.inl
@@ -0,0 +1,169 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/linear_congruential_engine.h>
+#include <thrust/random/detail/mod.h>
+#include <thrust/random/detail/random_core_access.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
+  linear_congruential_engine<UIntType,a,c,m>
+    ::linear_congruential_engine(result_type s)
+{
+  seed(s);
+} // end linear_congruential_engine::linear_congruential_engine()
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
+  void linear_congruential_engine<UIntType,a,c,m>
+    ::seed(result_type s)
+{
+  if((detail::mod<UIntType, 1, 0, m>(c) == 0) &&
+     (detail::mod<UIntType, 1, 0, m>(s) == 0))
+    m_x = detail::mod<UIntType, 1, 0, m>(1);
+  else
+    m_x = detail::mod<UIntType, 1, 0, m>(s);
+} // end linear_congruential_engine::seed()
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
+  typename linear_congruential_engine<UIntType,a,c,m>::result_type
+    linear_congruential_engine<UIntType,a,c,m>
+      ::operator()(void)
+{
+  m_x = detail::mod<UIntType,a,c,m>(m_x);
+  return m_x;
+} // end linear_congruential_engine::operator()()
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
+  void linear_congruential_engine<UIntType,a,c,m>
+    ::discard(unsigned long long z)
+{
+  thrust::random::detail::linear_congruential_engine_discard::discard(*this,z);
+} // end linear_congruential_engine::discard()
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& linear_congruential_engine<UIntType,a,c,m>
+      ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags & fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(os.widen(' '));
+
+  // output one word of state
+  os << m_x;
+
+  // restore flags & fill character
+  os.flags(flags);
+  os.fill(fill);
+
+  return os;
+}
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& linear_congruential_engine<UIntType,a,c,m>
+      ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base     ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::dec);
+
+  // input one word of state
+  is >> m_x;
+
+  // restore flags
+  is.flags(flags);
+
+  return is;
+}
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+__host__ __device__
+bool linear_congruential_engine<UIntType,a,c,m>
+  ::equal(const linear_congruential_engine<UIntType,a,c,m> &rhs) const
+{
+  return m_x == rhs.m_x;
+}
+
+
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_>
+__host__ __device__
+bool operator==(const linear_congruential_engine<UIntType_,a_,c_,m_> &lhs,
+                const linear_congruential_engine<UIntType_,a_,c_,m_> &rhs)
+{
+  return detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+__host__ __device__
+bool operator!=(const linear_congruential_engine<UIntType,a,c,m> &lhs,
+                const linear_congruential_engine<UIntType,a,c,m> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const linear_congruential_engine<UIntType_,a_,c_,m_> &e)
+{
+  return detail::random_core_access::stream_out(os,e);
+}
+
+
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           linear_congruential_engine<UIntType_,a_,c_,m_> &e)
+{
+  return detail::random_core_access::stream_in(is,e);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/linear_congruential_engine_discard.h b/thrust/thrust/random/detail/linear_congruential_engine_discard.h
new file mode 100644
index 0000000000000000000000000000000000000000..38159514408b91dc36b5a25a755852f69832d930
--- /dev/null
+++ b/thrust/thrust/random/detail/linear_congruential_engine_discard.h
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/random/detail/mod.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+namespace detail
+{
+
+
+template<typename UIntType, UIntType a, unsigned long long c, UIntType m>
+  struct linear_congruential_engine_discard_implementation
+{
+  __host__ __device__
+  static void discard(UIntType &state, unsigned long long z)
+  {
+    for(; z > 0; --z)
+    {
+      state = detail::mod<UIntType,a,c,m>(state);
+    }
+  }
+}; // end linear_congruential_engine_discard
+
+
+// specialize for small integers and c == 0
+// XXX figure out a robust implemenation of this for any unsigned integer type later
+template<thrust::detail::uint32_t a, thrust::detail::uint32_t m>
+  struct linear_congruential_engine_discard_implementation<thrust::detail::uint32_t,a,0,m>
+{
+  __host__ __device__
+  static void discard(thrust::detail::uint32_t &state, unsigned long long z)
+  {
+    const thrust::detail::uint32_t modulus = m;
+
+    // XXX we need to use unsigned long long here or we will encounter overflow in the
+    //     multiplies below
+    //     figure out a robust implementation of this later
+    unsigned long long multiplier = a;
+    unsigned long long multiplier_to_z = 1;
+    
+    // see http://en.wikipedia.org/wiki/Modular_exponentiation
+    while(z > 0)
+    {
+      if(z & 1)
+      {
+        // multiply in this bit's contribution while using modulus to keep result small
+        multiplier_to_z = (multiplier_to_z * multiplier) % modulus;
+      }
+
+      // move to the next bit of the exponent, square (and mod) the base accordingly
+      z >>= 1;
+      multiplier = (multiplier * multiplier) % modulus;
+    }
+
+    state = static_cast<thrust::detail::uint32_t>((multiplier_to_z * state) % modulus);
+  }
+}; // end linear_congruential_engine_discard
+
+
+struct linear_congruential_engine_discard
+{
+  template<typename LinearCongruentialEngine>
+  __host__ __device__
+  static void discard(LinearCongruentialEngine &lcg, unsigned long long z)
+  {
+    typedef typename LinearCongruentialEngine::result_type result_type;
+    const result_type c = LinearCongruentialEngine::increment;
+    const result_type a = LinearCongruentialEngine::multiplier;
+    const result_type m = LinearCongruentialEngine::modulus;
+    
+    // XXX WAR unused variable warnings
+    (void) c;
+    (void) a;
+    (void) m;
+
+    linear_congruential_engine_discard_implementation<result_type,a,c,m>::discard(lcg.m_x, z);
+  }
+}; // end linear_congruential_engine_discard
+
+
+} // end detail
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/linear_feedback_shift_engine.inl b/thrust/thrust/random/detail/linear_feedback_shift_engine.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b5d55be1527087e9f64b5a4c6aedf1b8bcf67e62
--- /dev/null
+++ b/thrust/thrust/random/detail/linear_feedback_shift_engine.inl
@@ -0,0 +1,165 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/linear_feedback_shift_engine.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
+  linear_feedback_shift_engine<UIntType,w,k,q,s>
+    ::linear_feedback_shift_engine(result_type value)
+{
+  seed(value);
+} // end linear_feedback_shift_engine::linear_feedback_shift_engine()
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
+  void linear_feedback_shift_engine<UIntType,w,k,q,s>
+    ::seed(result_type value)
+{
+  m_value = value;
+} // end linear_feedback_shift_engine::seed()
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
+  typename linear_feedback_shift_engine<UIntType,w,k,q,s>::result_type
+    linear_feedback_shift_engine<UIntType,w,k,q,s>
+      ::operator()(void)
+{
+  const UIntType b = (((m_value << q) ^ m_value) & wordmask) >> (k-s);
+  const UIntType mask = ( (~static_cast<UIntType>(0)) << (w-k) ) & wordmask;
+  m_value = ((m_value & mask) << s) ^ b;
+  return m_value;
+} // end linear_feedback_shift_engine::operator()()
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
+  void linear_feedback_shift_engine<UIntType,w,k,q,s>
+    ::discard(unsigned long long z)
+{
+  for(; z > 0; --z)
+  {
+    this->operator()();
+  } // end for
+} // end linear_feedback_shift_engine::discard()
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& linear_feedback_shift_engine<UIntType,w,k,q,s>
+      ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags & fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(os.widen(' '));
+
+  // output one word of state
+  os << m_value;
+
+  // restore flags & fill character
+  os.flags(flags);
+  os.fill(fill);
+
+  return os;
+}
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& linear_feedback_shift_engine<UIntType,w,k,q,s>
+      ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base     ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  // input one word of state
+  is >> m_value;
+
+  // restore flags
+  is.flags(flags);
+
+  return is;
+}
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
+  bool linear_feedback_shift_engine<UIntType,w,k,q,s>
+    ::equal(const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs) const
+{
+  return m_value == rhs.m_value;
+}
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+__host__ __device__
+bool operator==(const linear_feedback_shift_engine<UIntType,w,k,q,s> &lhs,
+                const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+__host__ __device__
+bool operator!=(const linear_feedback_shift_engine<UIntType,w,k,q,s> &lhs,
+                const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &e)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,e);
+}
+
+
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &e)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,e);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/linear_feedback_shift_engine_wordmask.h b/thrust/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..6669350eae5ce8049dee431ba1bb07d89ce86834
--- /dev/null
+++ b/thrust/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+namespace random
+{
+
+namespace detail
+{
+
+template<typename T, int w, int i = w-1>
+  struct linear_feedback_shift_engine_wordmask
+{
+  static const T value =
+    (T(1u) << i) |
+    linear_feedback_shift_engine_wordmask<T, w, i-1>::value;
+}; // end linear_feedback_shift_engine_wordmask
+
+template<typename T, int w>
+  struct linear_feedback_shift_engine_wordmask<T, w, 0>
+{
+  static const T value = 0;
+}; // end linear_feedback_shift_engine_wordmask
+
+} // end detail
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/mod.h b/thrust/thrust/random/detail/mod.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed6afcf03cefdc2113e720f6f7861e9019f27bc0
--- /dev/null
+++ b/thrust/thrust/random/detail/mod.h
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+namespace random
+{
+
+namespace detail
+{
+
+template<typename T, T a, T c, T m, bool = (m == 0)>
+  struct static_mod
+{
+  static const T q = m / a;
+  static const T r = m % a;
+
+  __host__ __device__
+  T operator()(T x) const
+  {
+    if(a == 1)
+    {
+      x %= m;
+    }
+    else
+    {
+      T t1 = a * (x % q);
+      T t2 = r * (x / q);
+      if(t1 >= t2)
+      {
+        x = t1 - t2;
+      }
+      else
+      {
+        x = m - t2 + t1;
+      }
+    }
+
+    if(c != 0)
+    {
+      const T d = m - x;
+      if(d > c)
+      {
+        x += c;
+      }
+      else
+      {
+        x = c - d;
+      }
+    }
+
+    return x;
+  }
+}; // end static_mod
+
+
+// Rely on machine overflow handling
+template<typename T, T a, T c, T m>
+  struct static_mod<T,a,c,m,true>
+{
+  __host__ __device__
+  T operator()(T x) const
+  {
+    return a * x + c;
+  }
+}; // end static_mod
+
+template<typename T, T a, T c, T m>
+__host__ __device__
+  T mod(T x)
+{
+  static_mod<T,a,c,m> f;
+  return f(x);
+} // end static_mod
+
+} // end detail
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/normal_distribution.inl b/thrust/thrust/random/detail/normal_distribution.inl
new file mode 100644
index 0000000000000000000000000000000000000000..099a977f3333353bd9fbfbf6d46993c0ea649d84
--- /dev/null
+++ b/thrust/thrust/random/detail/normal_distribution.inl
@@ -0,0 +1,255 @@
+/*
+ *
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/normal_distribution.h>
+#include <thrust/random/uniform_real_distribution.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/integer_traits.h>
+
+// for floating point infinity
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <math_constants.h>
+#else
+#include <limits>
+#endif
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename RealType>
+  __host__ __device__
+  normal_distribution<RealType>
+    ::normal_distribution(RealType a, RealType b)
+      :super_t(),m_param(a,b)
+{
+} // end normal_distribution::normal_distribution()
+
+
+template<typename RealType>
+  __host__ __device__
+  normal_distribution<RealType>
+    ::normal_distribution(const param_type &parm)
+      :super_t(),m_param(parm)
+{
+} // end normal_distribution::normal_distribution()
+
+
+template<typename RealType>
+  __host__ __device__
+  void normal_distribution<RealType>
+    ::reset(void)
+{
+  super_t::reset();
+} // end normal_distribution::reset()
+
+
+template<typename RealType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename normal_distribution<RealType>::result_type
+      normal_distribution<RealType>
+        ::operator()(UniformRandomNumberGenerator &urng)
+{
+  return operator()(urng, m_param);
+} // end normal_distribution::operator()()
+
+
+template<typename RealType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename normal_distribution<RealType>::result_type
+      normal_distribution<RealType>
+        ::operator()(UniformRandomNumberGenerator &urng,
+                     const param_type &parm)
+{
+  return super_t::sample(urng, parm.first, parm.second);
+} // end normal_distribution::operator()()
+
+
+template<typename RealType>
+  __host__ __device__
+  typename normal_distribution<RealType>::param_type
+    normal_distribution<RealType>
+      ::param(void) const
+{
+  return m_param;
+} // end normal_distribution::param()
+
+
+template<typename RealType>
+  __host__ __device__
+  void normal_distribution<RealType>
+    ::param(const param_type &parm)
+{
+  m_param = parm;
+} // end normal_distribution::param()
+
+
+template<typename RealType>
+  __host__ __device__
+  typename normal_distribution<RealType>::result_type
+    normal_distribution<RealType>
+      ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  return -this->max THRUST_PREVENT_MACRO_SUBSTITUTION ();
+} // end normal_distribution::min()
+
+
+template<typename RealType>
+  __host__ __device__
+  typename normal_distribution<RealType>::result_type
+    normal_distribution<RealType>
+      ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  // XXX this solution is pretty terrible
+  // we can't use numeric_traits<RealType>::max because nvcc will
+  // complain that it is a __host__ function
+  union
+  {
+    thrust::detail::uint32_t inf_as_int;
+    float result;
+  } hack;
+
+  hack.inf_as_int = 0x7f800000u;
+
+  return hack.result;
+} // end normal_distribution::max()
+
+
+template<typename RealType>
+  __host__ __device__
+  typename normal_distribution<RealType>::result_type
+    normal_distribution<RealType>
+      ::mean(void) const
+{
+  return m_param.first;
+} // end normal_distribution::mean()
+
+
+template<typename RealType>
+  __host__ __device__
+  typename normal_distribution<RealType>::result_type
+    normal_distribution<RealType>
+      ::stddev(void) const
+{
+  return m_param.second;
+} // end normal_distribution::stddev()
+
+
+template<typename RealType>
+  __host__ __device__
+  bool normal_distribution<RealType>
+    ::equal(const normal_distribution &rhs) const
+{
+  return m_param == rhs.param();
+}
+
+
+template<typename RealType>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>&
+      normal_distribution<RealType>
+        ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags and fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  os << mean() << space << stddev();
+
+  // restore old flags and fill character
+  os.flags(flags);
+  os.fill(fill);
+  return os;
+}
+
+
+template<typename RealType>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>&
+      normal_distribution<RealType>
+        ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base  ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  is >> m_param.first >> m_param.second;
+
+  // restore old flags
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename RealType>
+__host__ __device__
+bool operator==(const normal_distribution<RealType> &lhs,
+                const normal_distribution<RealType> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename RealType>
+__host__ __device__
+bool operator!=(const normal_distribution<RealType> &lhs,
+                const normal_distribution<RealType> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const normal_distribution<RealType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,d);
+}
+
+
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           normal_distribution<RealType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,d);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/normal_distribution_base.h b/thrust/thrust/random/detail/normal_distribution_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a3bd4470b576465a77a289fee9f959d027e5b03
--- /dev/null
+++ b/thrust/thrust/random/detail/normal_distribution_base.h
@@ -0,0 +1,149 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * Copyright Jens Maurer 2000-2001
+ * Distributed under the Boost Software License, Version 1.0. (See
+ * accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/random/uniform_real_distribution.h>
+#include <limits>
+#include <cmath>
+
+namespace thrust
+{
+namespace random
+{
+namespace detail
+{
+
+// this version samples the normal distribution directly
+// and uses the non-standard math function erfcinv
+template<typename RealType>
+  class normal_distribution_nvcc
+{
+  protected:
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev)
+    {
+      typedef typename UniformRandomNumberGenerator::result_type uint_type;
+      const uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
+
+      // Constants for conversion
+      const RealType S1 = static_cast<RealType>(1) / urng_range;
+      const RealType S2 = S1 / 2;
+
+      RealType S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
+
+      // Get the integer value
+      uint_type u = urng() - UniformRandomNumberGenerator::min;
+
+      // Ensure the conversion to float will give a value in the range [0,0.5)
+      if(u > (urng_range / 2))
+      {
+        u = urng_range - u;
+        S3 = -S3;
+      }
+
+      // Convert to floating point in [0,0.5)
+      RealType p = u*S1 + S2;
+
+      // Apply inverse error function
+      return mean + stddev * S3 * erfcinv(2 * p);
+    }
+
+    // no-op
+    __host__ __device__
+    void reset() {}
+};
+
+// this version samples the normal distribution using
+// Marsaglia's "polar method"
+template<typename RealType>
+  class normal_distribution_portable
+{
+  protected:
+    normal_distribution_portable()
+      : m_r1(), m_r2(), m_cached_rho(), m_valid(false)
+    {}
+
+    normal_distribution_portable(const normal_distribution_portable &other)
+      : m_r1(other.m_r1), m_r2(other.m_r2), m_cached_rho(other.m_cached_rho), m_valid(other.m_valid)
+    {}
+
+    void reset()
+    {
+      m_valid = false;
+    }
+
+    // note that we promise to call this member function with the same mean and stddev
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev)
+    {
+      // implementation from Boost
+      // allow for Koenig lookup
+      using std::sqrt; using std::log; using std::sin; using std::cos;
+
+      if(!m_valid)
+      {
+        uniform_real_distribution<RealType> u01;
+        m_r1 = u01(urng);
+        m_r2 = u01(urng);
+        m_cached_rho = sqrt(-RealType(2) * log(RealType(1)-m_r2));
+
+        m_valid = true;
+      }
+      else
+      {
+        m_valid = false;
+      }
+
+      const RealType pi = RealType(3.14159265358979323846);
+
+      RealType result = m_cached_rho * (m_valid ?
+                          cos(RealType(2)*pi*m_r1) :
+                          sin(RealType(2)*pi*m_r1));
+
+      return mean + stddev * result;
+    }
+
+  private:
+    RealType m_r1, m_r2, m_cached_rho;
+    bool m_valid;
+};
+
+template<typename RealType>
+  struct normal_distribution_base
+{
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(__NVCOMPILER_CUDA__)
+  typedef normal_distribution_nvcc<RealType> type;
+#else
+  typedef normal_distribution_portable<RealType> type;
+#endif
+};
+
+} // end detail
+} // end random
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/random_core_access.h b/thrust/thrust/random/detail/random_core_access.h
new file mode 100644
index 0000000000000000000000000000000000000000..f03060e0ab06806c3c42d4857bd8bb1acb3eff66
--- /dev/null
+++ b/thrust/thrust/random/detail/random_core_access.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+namespace random
+{
+
+namespace detail
+{
+
+struct random_core_access
+{
+
+template<typename OStream, typename EngineOrDistribution>
+static OStream &stream_out(OStream &os, const EngineOrDistribution &x)
+{
+  return x.stream_out(os);
+}
+
+template<typename IStream, typename EngineOrDistribution>
+static IStream &stream_in(IStream &is, EngineOrDistribution &x)
+{
+  return x.stream_in(is);
+}
+
+template<typename EngineOrDistribution>
+__host__ __device__
+static bool equal(const EngineOrDistribution &lhs, const EngineOrDistribution &rhs)
+{
+  return lhs.equal(rhs);
+}
+
+}; // end random_core_access
+
+} // end detail
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/subtract_with_carry_engine.inl b/thrust/thrust/random/detail/subtract_with_carry_engine.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9b4a4c45c71b2ce3871ebce6bd32f9026289581d
--- /dev/null
+++ b/thrust/thrust/random/detail/subtract_with_carry_engine.inl
@@ -0,0 +1,210 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/linear_congruential_engine.h>
+#include <thrust/random/subtract_with_carry_engine.h>
+#include <thrust/random/detail/mod.h>
+#include <thrust/random/detail/random_core_access.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  subtract_with_carry_engine<UIntType,w,s,r>
+    ::subtract_with_carry_engine(result_type value)
+{
+  seed(value);
+} // end subtract_with_carry_engine::subtract_with_carry_engine()
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  void subtract_with_carry_engine<UIntType,w,s,r>
+    ::seed(result_type value)
+{
+  thrust::random::linear_congruential_engine<result_type,
+    40014u, 0u, 2147483563u> e(value == 0u ? default_seed : value);
+
+  // initialize state
+  for(size_t i = 0; i < long_lag; ++i)
+  {
+    m_x[i] = detail::mod<UIntType, 1, 0, modulus>(e());
+  } // end for i
+
+  m_carry = (m_x[long_lag-1] == 0);
+  m_k = 0;
+} // end subtract_with_carry_engine::seed()
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  typename subtract_with_carry_engine<UIntType,w,s,r>::result_type
+    subtract_with_carry_engine<UIntType,w,s,r>
+      ::operator()(void)
+{
+  // XXX we probably need to cache these m_x[m_k] in a register
+  //     maybe we need to cache the use of all member variables
+  int short_index = m_k - short_lag;
+  if(short_index < 0)
+    short_index += long_lag;
+  result_type xi;
+  if (m_x[short_index] >= m_x[m_k] + m_carry)
+  {
+    // x(n) >= 0
+    xi =  m_x[short_index] - m_x[m_k] - m_carry;
+    m_carry = 0;
+  }
+  else
+  {
+    // x(n) < 0
+    xi = modulus - m_x[m_k] - m_carry + m_x[short_index];
+    m_carry = 1;
+  }
+  m_x[m_k] = xi;
+  ++m_k;
+  if(m_k >= long_lag)
+    m_k = 0;
+  return xi;
+} // end subtract_with_carry_engine::operator()()
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  void subtract_with_carry_engine<UIntType,w,s,r>
+    ::discard(unsigned long long z)
+{
+  for(; z > 0; --z)
+  {
+    this->operator()();
+  } // end for
+} // end subtract_with_carry_engine::discard()
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& subtract_with_carry_engine<UIntType,w,s,r>
+      ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base     ios_base;
+                  
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill  = os.fill();
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  const UIntType long_lag = r;
+                                                          
+  for(size_t i = 0; i < r; ++i)
+    os << m_x[(i + m_k) % long_lag] << space;
+  os << m_carry;
+                                                                          
+  os.flags(flags);
+  os.fill(fill);
+  return os;
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  template<typename CharType, typename Traits>
+    std::basic_istream<CharType,Traits>& subtract_with_carry_engine<UIntType,w,s,r>
+      ::stream_in(std::basic_istream<CharType,Traits> &is)
+{
+  typedef std::basic_istream<CharType,Traits> istream_type;
+  typedef typename istream_type::ios_base     ios_base;
+
+  const typename ios_base::fmtflags flags = is.flags();
+  is.flags(ios_base::dec | ios_base::skipws);
+
+  for(size_t i = 0; i < r; ++i)
+    is >> m_x[i];
+  is >> m_carry;
+
+  m_k = 0;
+
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  bool subtract_with_carry_engine<UIntType,w,s,r>
+    ::equal(const subtract_with_carry_engine<UIntType,w,s,r> &rhs) const
+{
+  const UIntType long_lag = r;
+
+  bool result = true;
+  for(size_t i = 0; i < r; ++i)
+  {
+    result &= (m_x[(i + m_k) % long_lag] == rhs.m_x[(i + rhs.m_k) % long_lag]);
+  }
+
+  // XXX not sure if this last check is necessary
+  result &= (m_carry == rhs.m_carry);
+
+  return result;
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r,
+         typename CharT, typename Traits>
+  std::basic_ostream<CharT,Traits>&
+    operator<<(std::basic_ostream<CharT,Traits> &os,
+               const subtract_with_carry_engine<UIntType,w,s,r> &e)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,e);
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r,
+         typename CharType, typename Traits>
+  std::basic_istream<CharType,Traits>&
+    operator>>(std::basic_istream<CharType,Traits> &is,
+               subtract_with_carry_engine<UIntType,w,s,r> &e)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,e);
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  bool operator==(const subtract_with_carry_engine<UIntType,w,s,r> &lhs,
+                  const subtract_with_carry_engine<UIntType,w,s,r> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
+  bool operator!=(const subtract_with_carry_engine<UIntType,w,s,r> &lhs,
+                  const subtract_with_carry_engine<UIntType,w,s,r> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/uniform_int_distribution.inl b/thrust/thrust/random/detail/uniform_int_distribution.inl
new file mode 100644
index 0000000000000000000000000000000000000000..18eb5194cc84cc26a83356d9579eca9015efd76f
--- /dev/null
+++ b/thrust/thrust/random/detail/uniform_int_distribution.inl
@@ -0,0 +1,246 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/uniform_int_distribution.h>
+#include <thrust/random/uniform_real_distribution.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename IntType>
+  __host__ __device__
+  uniform_int_distribution<IntType>
+    ::uniform_int_distribution(IntType a, IntType b)
+      :m_param(a,b)
+{
+} // end uniform_int_distribution::uniform_int_distribution()
+
+
+template<typename IntType>
+  __host__ __device__
+  uniform_int_distribution<IntType>
+    ::uniform_int_distribution(const param_type &parm)
+      :m_param(parm)
+{
+} // end uniform_int_distribution::uniform_int_distribution()
+
+
+template<typename IntType>
+  __host__ __device__
+  void uniform_int_distribution<IntType>
+    ::reset(void)
+{
+} // end uniform_int_distribution::reset()
+
+
+template<typename IntType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename uniform_int_distribution<IntType>::result_type
+      uniform_int_distribution<IntType>
+        ::operator()(UniformRandomNumberGenerator &urng)
+{
+  return operator()(urng, m_param);
+} // end uniform_int_distribution::operator()()
+
+
+template<typename IntType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename uniform_int_distribution<IntType>::result_type
+      uniform_int_distribution<IntType>
+        ::operator()(UniformRandomNumberGenerator &urng, const param_type &parm)
+{
+  // XXX this implementation is somewhat hacky and will skip
+  //     values if the range of the RNG is smaller than the range of the distribution
+  //     we should improve this implementation in a later version
+
+  typedef typename thrust::detail::largest_available_float::type float_type;
+
+  const float_type real_min(static_cast<float_type>(parm.first));
+  const float_type real_max(static_cast<float_type>(parm.second));
+
+  // add one to the right end of the interval because it is half-open
+  // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
+  uniform_real_distribution<float_type> real_dist(real_min, real_max + float_type(1));
+
+  return static_cast<result_type>(real_dist(urng));
+} // end uniform_int_distribution::operator()()
+
+
+template<typename IntType>
+  __host__ __device__
+  typename uniform_int_distribution<IntType>::result_type
+    uniform_int_distribution<IntType>
+      ::a(void) const
+{
+  return m_param.first;
+} // end uniform_int_distribution<IntType>::a()
+
+
+template<typename IntType>
+  __host__ __device__
+  typename uniform_int_distribution<IntType>::result_type
+    uniform_int_distribution<IntType>
+      ::b(void) const
+{
+  return m_param.second;
+} // end uniform_int_distribution::b()
+
+
+template<typename IntType>
+  __host__ __device__
+  typename uniform_int_distribution<IntType>::param_type
+    uniform_int_distribution<IntType>
+      ::param(void) const
+{
+  return m_param;
+} // end uniform_int_distribution::param()
+
+
+template<typename IntType>
+  __host__ __device__
+  void uniform_int_distribution<IntType>
+    ::param(const param_type &parm)
+{
+  m_param = parm;
+} // end uniform_int_distribution::param()
+
+
+template<typename IntType>
+  __host__ __device__
+  typename uniform_int_distribution<IntType>::result_type
+    uniform_int_distribution<IntType>
+      ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  return a();
+} // end uniform_int_distribution::min()
+
+
+template<typename IntType>
+  __host__ __device__
+  typename uniform_int_distribution<IntType>::result_type
+    uniform_int_distribution<IntType>
+      ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  return b();
+} // end uniform_int_distribution::max()
+
+
+template<typename IntType>
+  __host__ __device__
+  bool uniform_int_distribution<IntType>
+    ::equal(const uniform_int_distribution &rhs) const
+{
+  return param() == rhs.param();
+}
+
+
+template<typename IntType>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>&
+      uniform_int_distribution<IntType>
+        ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags and fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  os << a() << space << b();
+
+  // restore old flags and fill character
+  os.flags(flags);
+  os.fill(fill);
+  return os;
+}
+
+
+template<typename IntType>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>&
+      uniform_int_distribution<IntType>
+        ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base  ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  is >> m_param.first >> m_param.second;
+
+  // restore old flags
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename IntType>
+__host__ __device__
+bool operator==(const uniform_int_distribution<IntType> &lhs,
+                const uniform_int_distribution<IntType> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename IntType>
+__host__ __device__
+bool operator!=(const uniform_int_distribution<IntType> &lhs,
+                const uniform_int_distribution<IntType> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+template<typename IntType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const uniform_int_distribution<IntType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,d);
+}
+
+
+template<typename IntType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           uniform_int_distribution<IntType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,d);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/uniform_real_distribution.inl b/thrust/thrust/random/detail/uniform_real_distribution.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ec4f21e9e4405080f3b5eb3a2f5c870e810118ee
--- /dev/null
+++ b/thrust/thrust/random/detail/uniform_real_distribution.inl
@@ -0,0 +1,231 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/uniform_real_distribution.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+template<typename RealType>
+  __host__ __device__
+  uniform_real_distribution<RealType>
+    ::uniform_real_distribution(RealType a, RealType b)
+      :m_param(a,b)
+{
+} // end uniform_real_distribution::uniform_real_distribution()
+
+template<typename RealType>
+  __host__ __device__
+  uniform_real_distribution<RealType>
+    ::uniform_real_distribution(const param_type &parm)
+      :m_param(parm)
+{
+} // end uniform_real_distribution::uniform_real_distribution()
+
+template<typename RealType>
+  __host__ __device__
+  void uniform_real_distribution<RealType>
+    ::reset(void)
+{
+} // end uniform_real_distribution::reset()
+
+template<typename RealType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename uniform_real_distribution<RealType>::result_type
+      uniform_real_distribution<RealType>
+        ::operator()(UniformRandomNumberGenerator &urng)
+{
+  return operator()(urng, m_param);
+} // end uniform_real::operator()()
+
+template<typename RealType>
+  template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    typename uniform_real_distribution<RealType>::result_type
+      uniform_real_distribution<RealType>
+        ::operator()(UniformRandomNumberGenerator &urng,
+                     const param_type &parm)
+{
+  // call the urng & map its result to [0,1)
+  result_type result = static_cast<result_type>(urng() - UniformRandomNumberGenerator::min);
+
+  // adding one to the denominator ensures that the interval is half-open at 1.0
+  // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
+  // XXX OTOH adding 1 to what is potentially UINT_MAX also seems like a bad idea
+  // XXX we could statically check if 1u + (max - min) is representable and do that, otherwise use the current implementation
+  result /= (result_type(1) + static_cast<result_type>(UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min));
+
+  return (result * (parm.second - parm.first)) + parm.first;
+} // end uniform_real::operator()()
+
+template<typename RealType>
+  __host__ __device__
+  typename uniform_real_distribution<RealType>::result_type
+    uniform_real_distribution<RealType>
+      ::a(void) const
+{
+  return m_param.first;
+} // end uniform_real::a()
+
+template<typename RealType>
+  __host__ __device__
+  typename uniform_real_distribution<RealType>::result_type
+    uniform_real_distribution<RealType>
+      ::b(void) const
+{
+  return m_param.second;
+} // end uniform_real_distribution::b()
+
+template<typename RealType>
+  __host__ __device__
+  typename uniform_real_distribution<RealType>::param_type
+    uniform_real_distribution<RealType>
+      ::param(void) const
+{
+  return m_param;;
+} // end uniform_real_distribution::param()
+
+template<typename RealType>
+  __host__ __device__
+  void uniform_real_distribution<RealType>
+    ::param(const param_type &parm)
+{
+  m_param = parm;
+} // end uniform_real_distribution::param()
+
+template<typename RealType>
+  __host__ __device__
+  typename uniform_real_distribution<RealType>::result_type
+    uniform_real_distribution<RealType>
+      ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  return a();
+} // end uniform_real_distribution::min()
+
+template<typename RealType>
+  __host__ __device__
+  typename uniform_real_distribution<RealType>::result_type
+    uniform_real_distribution<RealType>
+      ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
+{
+  return b();
+} // end uniform_real_distribution::max()
+
+
+template<typename RealType>
+  __host__ __device__
+  bool uniform_real_distribution<RealType>
+    ::equal(const uniform_real_distribution &rhs) const
+{
+  return m_param == rhs.param();
+}
+
+
+template<typename RealType>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>&
+      uniform_real_distribution<RealType>
+        ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags and fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  os << a() << space << b();
+
+  // restore old flags and fill character
+  os.flags(flags);
+  os.fill(fill);
+  return os;
+}
+
+
+template<typename RealType>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>&
+      uniform_real_distribution<RealType>
+        ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base  ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  is >> m_param.first >> m_param.second;
+
+  // restore old flags
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename RealType>
+__host__ __device__
+bool operator==(const uniform_real_distribution<RealType> &lhs,
+                const uniform_real_distribution<RealType> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename RealType>
+__host__ __device__
+bool operator!=(const uniform_real_distribution<RealType> &lhs,
+                const uniform_real_distribution<RealType> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const uniform_real_distribution<RealType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,d);
+}
+
+
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           uniform_real_distribution<RealType> &d)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,d);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/xor_combine_engine.inl b/thrust/thrust/random/detail/xor_combine_engine.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d24865f68f448df24abdca51703d2afa08581f1b
--- /dev/null
+++ b/thrust/thrust/random/detail/xor_combine_engine.inl
@@ -0,0 +1,215 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/random/xor_combine_engine.h>
+#include <thrust/random/detail/random_core_access.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::xor_combine_engine(void)
+      :m_b1(),m_b2()
+{
+} // end xor_combine_engine::xor_combine_engine()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::xor_combine_engine(const base1_type &urng1, const base2_type &urng2)
+      :m_b1(urng1),m_b2(urng2)
+{
+} // end xor_combine_engine::xor_combine_engine()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::xor_combine_engine(result_type s)
+      :m_b1(s),m_b2(s)
+{
+} // end xor_combine_engine::xor_combine_engine()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  void xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::seed(void)
+{
+  m_b1.seed();
+  m_b2.seed();
+} // end xor_combine_engine::seed()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  void xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::seed(result_type s)
+{
+  m_b1.seed(s);
+  m_b2.seed(s);
+} // end xor_combine_engine::seed()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  const typename xor_combine_engine<Engine1,s1,Engine2,s2>::base1_type &
+    xor_combine_engine<Engine1,s1,Engine2,s2>
+      ::base1(void) const
+{
+  return m_b1;
+} // end xor_combine_engine::base1()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  const typename xor_combine_engine<Engine1,s1,Engine2,s2>::base2_type &
+    xor_combine_engine<Engine1,s1,Engine2,s2>
+      ::base2(void) const
+{
+  return m_b2;
+} // end xor_combine_engine::base2()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  typename xor_combine_engine<Engine1,s1,Engine2,s2>::result_type
+    xor_combine_engine<Engine1,s1,Engine2,s2>
+      ::operator()(void)
+{
+  return (result_type(m_b1() - base1_type::min) << shift1) ^
+         (result_type(m_b2() - base2_type::min) << shift2);
+} // end xor_combine_engine::operator()()
+
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2>
+  __host__ __device__
+  void xor_combine_engine<Engine1, s1, Engine2, s2>
+    ::discard(unsigned long long z)
+{
+  for(; z > 0; --z)
+  {
+    this->operator()();
+  } // end for
+} // end xor_combine_engine::discard()
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+  template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& xor_combine_engine<Engine1,s1,Engine2,s2>
+      ::stream_out(std::basic_ostream<CharT,Traits> &os) const
+{
+  typedef std::basic_ostream<CharT,Traits> ostream_type;
+  typedef typename ostream_type::ios_base  ios_base;
+
+  // save old flags and fill character
+  const typename ios_base::fmtflags flags = os.flags();
+  const CharT fill = os.fill();
+
+  const CharT space = os.widen(' ');
+  os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
+  os.fill(space);
+
+  // output each base engine in turn
+  os << base1() << space << base2();
+
+  // restore old flags and fill character
+  os.flags(flags);
+  os.fill(fill);
+  return os;
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+  template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& xor_combine_engine<Engine1,s1,Engine2,s2>
+      ::stream_in(std::basic_istream<CharT,Traits> &is)
+{
+  typedef std::basic_istream<CharT,Traits> istream_type;
+  typedef typename istream_type::ios_base  ios_base;
+
+  // save old flags
+  const typename ios_base::fmtflags flags = is.flags();
+
+  is.flags(ios_base::skipws);
+
+  // input each base engine in turn
+  is >> m_b1 >> m_b2;
+
+  // restore old flags
+  is.flags(flags);
+  return is;
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+  __host__ __device__
+  bool xor_combine_engine<Engine1,s1,Engine2,s2>
+    ::equal(const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs) const
+{
+  return (m_b1 == rhs.m_b1) && (m_b2 == rhs.m_b2);
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const xor_combine_engine<Engine1,s1,Engine2,s2> &e)
+{
+  return thrust::random::detail::random_core_access::stream_out(os,e);
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           xor_combine_engine<Engine1,s1,Engine2,s2> &e)
+{
+  return thrust::random::detail::random_core_access::stream_in(is,e);
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+__host__ __device__
+bool operator==(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
+                const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs)
+{
+  return thrust::random::detail::random_core_access::equal(lhs,rhs);
+}
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+__host__ __device__
+bool operator!=(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
+                const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/detail/xor_combine_engine_max.h b/thrust/thrust/random/detail/xor_combine_engine_max.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb5bdc831a601765c93aea82cb9cc9cd6bb8c91
--- /dev/null
+++ b/thrust/thrust/random/detail/xor_combine_engine_max.h
@@ -0,0 +1,324 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/mpl/math.h>
+#include <limits>
+#include <cstddef>
+
+namespace thrust
+{
+
+namespace random
+{
+
+namespace detail
+{
+
+
+namespace math = thrust::detail::mpl::math;
+
+
+namespace detail
+{
+
+// two cases for this function avoids compile-time warnings of overflow
+template<typename UIntType, UIntType w,
+         UIntType lhs, UIntType rhs,
+         bool shift_will_overflow>
+  struct lshift_w
+{
+  static const UIntType value = 0;
+};
+
+
+template<typename UIntType, UIntType w,
+         UIntType lhs, UIntType rhs>
+  struct lshift_w<UIntType,w,lhs,rhs,false>
+{
+  static const UIntType value = lhs << rhs;
+};
+
+} // end detail
+
+
+template<typename UIntType, UIntType w,
+         UIntType lhs, UIntType rhs>
+  struct lshift_w
+{
+  static const bool shift_will_overflow = rhs >= w;
+
+  static const UIntType value = detail::lshift_w<UIntType, w, lhs, rhs, shift_will_overflow>::value;
+};
+
+
+template<typename UIntType, UIntType lhs, UIntType rhs>
+  struct lshift
+    : lshift_w<UIntType, std::numeric_limits<UIntType>::digits, lhs, rhs>
+{};
+
+
+template<typename UIntType, int p>
+  struct two_to_the_power
+    : lshift<UIntType, 1, p>
+{};
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  class xor_combine_engine_max_aux_constants
+{
+  public:
+    static const result_type two_to_the_d = two_to_the_power<result_type, d>::value;
+    static const result_type c = lshift<result_type, a, d>::value;
+
+    static const result_type t =
+      math::max<
+        result_type,
+        c,
+        b
+      >::value;
+
+    static const result_type u =
+      math::min<
+        result_type,
+        c,
+        b
+      >::value;
+
+    static const result_type p            = math::log2<u>::value;
+    static const result_type two_to_the_p = two_to_the_power<result_type, p>::value;
+
+    static const result_type k = math::div<result_type, t, two_to_the_p>::value;
+};
+
+
+template<typename result_type, result_type, result_type, int> struct xor_combine_engine_max_aux;
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  struct xor_combine_engine_max_aux_case4
+{
+  typedef xor_combine_engine_max_aux_constants<result_type,a,b,d> constants;
+
+  static const result_type k_plus_1_times_two_to_the_p =
+    lshift<
+      result_type,
+      math::plus<result_type,constants::k,1>::value,
+      constants::p
+    >::value;
+
+  static const result_type M =
+    xor_combine_engine_max_aux<
+      result_type,
+      math::div<
+        result_type,
+        math::mod<
+          result_type,
+          constants::u,
+          constants::two_to_the_p
+        >::value,
+        constants::two_to_the_p
+      >::value,
+      math::mod<
+        result_type,
+        constants::t,
+        constants::two_to_the_p
+      >::value,
+      d
+    >::value;
+
+  static const result_type value = math::plus<result_type, k_plus_1_times_two_to_the_p, M>::value;
+};
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  struct xor_combine_engine_max_aux_case3
+{
+  typedef xor_combine_engine_max_aux_constants<result_type,a,b,d> constants;
+
+  static const result_type k_plus_1_times_two_to_the_p =
+    lshift<
+      result_type,
+      math::plus<result_type,constants::k,1>::value,
+      constants::p
+    >::value;
+
+  static const result_type M =
+    xor_combine_engine_max_aux<
+      result_type,
+      math::div<
+        result_type,
+        math::mod<
+          result_type,
+          constants::t,
+          constants::two_to_the_p
+        >::value,
+        constants::two_to_the_p
+      >::value,
+      math::mod<
+        result_type,
+        constants::u,
+        constants::two_to_the_p
+      >::value,
+      d
+    >::value;
+
+  static const result_type value = math::plus<result_type, k_plus_1_times_two_to_the_p, M>::value;
+};
+
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  struct xor_combine_engine_max_aux_case2
+{
+  typedef xor_combine_engine_max_aux_constants<result_type,a,b,d> constants;
+
+  static const result_type k_plus_1_times_two_to_the_p =
+    lshift<
+      result_type,
+      math::plus<result_type,constants::k,1>::value,
+      constants::p
+    >::value;
+
+  static const result_type value =
+    math::minus<
+      result_type,
+      k_plus_1_times_two_to_the_p,
+      1
+    >::value;
+};
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  struct xor_combine_engine_max_aux_case1
+{
+  static const result_type c     = lshift<result_type, a, d>::value;
+
+  static const result_type value = math::plus<result_type,c,b>::value;
+};
+
+
+template<typename result_type, result_type a, result_type b, int d>
+  struct xor_combine_engine_max_aux_2
+{
+  typedef xor_combine_engine_max_aux_constants<result_type,a,b,d> constants;
+
+  static const result_type value = 
+    thrust::detail::eval_if<
+      // if k is odd...
+      math::is_odd<result_type, constants::k>::value,
+      thrust::detail::identity_<
+        thrust::detail::integral_constant<
+          result_type,
+          xor_combine_engine_max_aux_case2<result_type,a,b,d>::value
+        >
+      >,
+      thrust::detail::eval_if<
+        // otherwise if a * 2^3 >= b, then case 3
+        a * constants::two_to_the_d >= b,
+        thrust::detail::identity_<
+          thrust::detail::integral_constant<
+            result_type,
+            xor_combine_engine_max_aux_case3<result_type,a,b,d>::value
+          >
+        >,
+        // otherwise, case 4
+        thrust::detail::identity_<
+          thrust::detail::integral_constant<
+            result_type,
+            xor_combine_engine_max_aux_case4<result_type,a,b,d>::value
+          >
+        >
+      >
+    >::type::value;
+};
+
+
+template<typename result_type,
+         result_type a,
+         result_type b,
+         int d,
+         bool use_case1 = (a == 0) || (b < two_to_the_power<result_type,d>::value)>
+  struct xor_combine_engine_max_aux_1
+    : xor_combine_engine_max_aux_case1<result_type,a,b,d>
+{};
+
+
+template<typename result_type,
+         result_type a,
+         result_type b,
+         int d>
+  struct xor_combine_engine_max_aux_1<result_type,a,b,d,false>
+    : xor_combine_engine_max_aux_2<result_type,a,b,d>
+{};
+
+
+template<typename result_type,
+         result_type a,
+         result_type b,
+         int d>
+  struct xor_combine_engine_max_aux
+    : xor_combine_engine_max_aux_1<result_type,a,b,d>
+{};
+
+
+template<typename Engine1, size_t s1, typename Engine2, size_t s2, typename result_type>
+  struct xor_combine_engine_max
+{
+  static const size_t w = std::numeric_limits<result_type>::digits;
+
+  static const result_type m1 =
+    math::min<
+      result_type,
+      result_type(Engine1::max - Engine1::min),
+      two_to_the_power<result_type, w-s1>::value - 1 
+    >::value;
+
+  static const result_type m2 =
+    math::min<
+      result_type,
+      result_type(Engine2::max - Engine2::min),
+      two_to_the_power<result_type, w-s2>::value - 1
+    >::value;
+
+  static const result_type s = s1 - s2;
+
+  static const result_type M =
+    xor_combine_engine_max_aux<
+      result_type,
+      m1,
+      m2,
+      s
+    >::value;
+
+  // the value is M(m1,m2,s) lshift_w s2
+  static const result_type value =
+    lshift_w<
+      result_type,
+      w,
+      M,
+      s2
+    >::value;
+}; // end xor_combine_engine_max
+
+} // end detail
+
+} // end random
+
+} // end thrust
+
diff --git a/thrust/thrust/random/discard_block_engine.h b/thrust/thrust/random/discard_block_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d73649c2d275be261cff580f88f39e8f2116c8e
--- /dev/null
+++ b/thrust/thrust/random/discard_block_engine.h
@@ -0,0 +1,252 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file discard_block_engine.h
+ *  \brief A random number engine which adapts a base engine and produces
+ *         numbers by discarding all but a contiguous blocks of its values.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/config.h>
+#include <iostream>
+#include <thrust/detail/cstdint.h>
+#include <thrust/random/detail/random_core_access.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+/*! \addtogroup random_number_engine_adaptors Random Number Engine Adaptor Class Templates
+ *  \ingroup random
+ *  \{
+ */
+
+/*! \class discard_block_engine
+ *  \brief A \p discard_block_engine adapts an existing base random number engine and produces
+ *         random values by discarding some of the values returned by its base engine.
+ *         Each cycle of the compound engine begins by returning \c r values successively produced
+ *         by the base engine and ends by discarding <tt>p-r</tt> such values. The engine's state
+ *         is the state of its base engine followed by the number of calls to <tt>operator()</tt>
+ *         that have occurred since the beginning of the current cycle.
+ *
+ *  \tparam Engine The type of the base random number engine to adapt.
+ *  \tparam p The discard cycle length.
+ *  \tparam r The number of values to return of the base engine. Because <tt>p-r</tt> will be
+ *            discarded, <tt>r <= p</tt>.
+ *
+ *  The following code snippet shows an example of using a \p discard_block_engine instance:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <thrust/random/discard_block_engine.h>
+ *  #include <iostream>
+ *
+ *  int main(void)
+ *  {
+ *    // create a discard_block_engine from minstd_rand, with a cycle length of 13
+ *    // keep every first 10 values, and discard the next 3
+ *    thrust::discard_block_engine<thrust::minstd_rand, 13, 10> rng;
+ *
+ *    // print a random number to standard output
+ *    std::cout << rng() << std::endl;
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ */         
+template<typename Engine, size_t p, size_t r>
+  class discard_block_engine
+{
+  public:
+    // types
+
+    /*! \typedef base_type
+     *  \brief The type of the adapted base random number engine.
+     */
+    typedef Engine base_type;
+
+    /*! \typedef result_type
+     *  \brief The type of the unsigned integer produced by this \p linear_congruential_engine.
+     */
+    typedef typename base_type::result_type result_type;
+
+    // engine characteristics
+
+    /*! The length of the production cycle.
+     */
+    static const size_t block_size = p;
+
+    /*! The number of used numbers per production cycle.
+     */
+    static const size_t used_block = r;
+
+    /*! The smallest value this \p discard_block_engine may potentially produce.
+     */
+    static const result_type min = base_type::min;
+
+    /*! The largest value this \p discard_block_engine may potentially produce.
+     */
+    static const result_type max = base_type::max;
+
+    // constructors and seeding functions
+
+    /*! This constructor constructs a new \p discard_block_engine and constructs
+     *  its \p base_type engine using its null constructor.
+     */
+    __host__ __device__
+    discard_block_engine();
+
+    /*! This constructor constructs a new \p discard_block_engine using
+     *  a given \p base_type engine to initialize its adapted base engine.
+     *
+     *  \param urng A \p base_type to use to initialize this \p discard_block_engine's
+     *         adapted base engine.
+     */
+    __host__ __device__
+    explicit discard_block_engine(const base_type &urng);
+
+    /*! This constructor initializes a new \p discard_block_engine with a given seed.
+     *  
+     *  \param s The seed used to intialize this \p discard_block_engine's adapted base engine.
+     */
+    __host__ __device__
+    explicit discard_block_engine(result_type s);
+
+    /*! This method initializes the state of this \p discard_block_engine's adapted base engine
+     *  by using its \p default_seed value.
+     */
+    __host__ __device__
+    void seed(void);
+
+    /*! This method initializes the state of this \p discard_block_engine's adapted base engine
+     *  by using the given seed.
+     *
+     *  \param s The seed with which to intialize this \p discard_block_engine's adapted base engine.
+     */
+    __host__ __device__
+    void seed(result_type s);
+
+    // generating functions
+    
+    /*! This member function produces a new random value and updates this \p discard_block_engine's state.
+     *  \return A new random number.
+     */
+    __host__ __device__
+    result_type operator()(void);
+
+    /*! This member function advances this \p discard_block_engine's state a given number of times
+     *  and discards the results.
+     *
+     *  \param z The number of random values to discard.
+     *  \note This function is provided because an implementation may be able to accelerate it.
+     */
+    __host__ __device__
+    void discard(unsigned long long z);
+
+    // property functions
+
+    /*! This member function returns a const reference to this \p discard_block_engine's
+     *  adapted base engine.
+     *
+     *  \return A const reference to the base engine this \p discard_block_engine adapts.
+     */
+    __host__ __device__
+    const base_type &base(void) const;
+
+    /*! \cond
+     */
+  private:
+    base_type m_e;
+    unsigned int m_n;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const discard_block_engine &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+    /*! \endcond
+     */
+}; // end discard_block_engine
+
+
+/*! This function checks two \p discard_block_engines for equality.
+ *  \param lhs The first \p discard_block_engine to test.
+ *  \param rhs The second \p discard_block_engine to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename Engine, size_t p, size_t r>
+__host__ __device__
+bool operator==(const discard_block_engine<Engine,p,r> &lhs,
+                const discard_block_engine<Engine,p,r> &rhs);
+
+
+/*! This function checks two \p discard_block_engines for inequality.
+ *  \param lhs The first \p discard_block_engine to test.
+ *  \param rhs The second \p discard_block_engine to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename Engine, size_t p, size_t r>
+__host__ __device__
+bool operator!=(const discard_block_engine<Engine,p,r> &lhs,
+                const discard_block_engine<Engine,p,r> &rhs);
+
+
+/*! This function streams a discard_block_engine to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param e The \p discard_block_engine to stream out.
+ *  \return \p os
+ */
+template<typename Engine, size_t p, size_t r,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const discard_block_engine<Engine,p,r> &e);
+
+
+/*! This function streams a discard_block_engine in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param e The \p discard_block_engine to stream in.
+ *  \return \p is
+ */
+template<typename Engine, size_t p, size_t r,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           discard_block_engine<Engine,p,r> &e);
+
+/*! \} // end random_number_engine_adaptors
+ */
+
+} // end random
+
+// import names into thrust::
+using random::discard_block_engine;
+
+} // end thrust
+
+#include <thrust/random/detail/discard_block_engine.inl>
+
diff --git a/thrust/thrust/random/linear_congruential_engine.h b/thrust/thrust/random/linear_congruential_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dc72b3b136e7c49ddd572d201d575b8a2d2320a
--- /dev/null
+++ b/thrust/thrust/random/linear_congruential_engine.h
@@ -0,0 +1,295 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file linear_congruential_engine.h
+ *  \brief A linear congruential pseudorandom number engine.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <iostream>
+#include <thrust/detail/cstdint.h>
+#include <thrust/random/detail/random_core_access.h>
+#include <thrust/random/detail/linear_congruential_engine_discard.h>
+
+namespace thrust
+{
+
+namespace random
+{
+
+/*! \addtogroup random_number_engine_templates Random Number Engine Class Templates
+ *  \ingroup random
+ *  \{
+ */
+
+/*! \class linear_congruential_engine
+ *  \brief A \p linear_congruential_engine random number engine produces unsigned integer
+ *         random numbers using a linear congruential random number generation algorithm.
+ *
+ *         The generation algorithm has the form <tt>x_i = (a * x_{i-1} + c) mod m</tt>.
+ *
+ *  \tparam UIntType The type of unsigned integer to produce.
+ *  \tparam a The multiplier used in the generation algorithm.
+ *  \tparam c The increment used in the generation algorithm.
+ *  \tparam m The modulus used in the generation algorithm.
+ *
+ *  \note Inexperienced users should not use this class template directly.  Instead, use
+ *  \p minstd_rand or \p minstd_rand0.
+ *
+ *  The following code snippet shows examples of use of a \p linear_congruential_engine instance:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <iostream>
+ *
+ *  int main(void)
+ *  {
+ *    // create a minstd_rand object, which is an instance of linear_congruential_engine
+ *    thrust::minstd_rand rng1;
+ *
+ *    // output some random values to cout
+ *    std::cout << rng1() << std::endl;
+ *
+ *    // a random value is printed
+ *
+ *    // create a new minstd_rand from a seed
+ *    thrust::minstd_rand rng2(13);
+ *
+ *    // discard some random values
+ *    rng2.discard(13);
+ *
+ *    // stream the object to an iostream
+ *    std::cout << rng2 << std::endl;
+ *
+ *    // rng2's current state is printed
+ *
+ *    // print the minimum and maximum values that minstd_rand can produce
+ *    std::cout << thrust::minstd_rand::min << std::endl;
+ *    std::cout << thrust::minstd_rand::max << std::endl;
+ *
+ *    // the range of minstd_rand is printed
+ *
+ *    // save the state of rng2 to a different object
+ *    thrust::minstd_rand rng3 = rng2;
+ *
+ *    // compare rng2 and rng3
+ *    std::cout << (rng2 == rng3) << std::endl;
+ *
+ *    // 1 is printed
+ *
+ *    // re-seed rng2 with a different seed
+ *    rng2.seed(7);
+ *
+ *    // compare rng2 and rng3
+ *    std::cout << (rng2 == rng3) << std::endl;
+ *
+ *    // 0 is printed
+ *
+ *    return 0;
+ *  }
+ *
+ *  \endcode
+ *
+ *  \see thrust::random::minstd_rand
+ *  \see thrust::random::minstd_rand0
+ */
+template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  class linear_congruential_engine
+{
+  public:
+    // types
+    
+    /*! \typedef result_type
+     *  \brief The type of the unsigned integer produced by this \p linear_congruential_engine.
+     */
+    typedef UIntType result_type;
+
+    // engine characteristics
+
+    /*! The multiplier used in the generation algorithm.
+     */
+    static const result_type multiplier = a;
+
+    /*! The increment used in the generation algorithm.
+     */
+    static const result_type increment = c;
+
+    /*! The modulus used in the generation algorithm.
+     */
+    static const result_type modulus = m;
+
+    /*! The smallest value this \p linear_congruential_engine may potentially produce.
+     */
+    static const result_type min = c == 0u ? 1u : 0u;
+
+    /*! The largest value this \p linear_congruential_engine may potentially produce.
+     */
+    static const result_type max = m - 1u;
+
+    /*! The default seed of this \p linear_congruential_engine.
+     */
+    static const result_type default_seed = 1u;
+
+    // constructors and seeding functions
+
+    /*! This constructor, which optionally accepts a seed, initializes a new
+     *  \p linear_congruential_engine.
+     *  
+     *  \param s The seed used to intialize this \p linear_congruential_engine's state.
+     */
+    __host__ __device__
+    explicit linear_congruential_engine(result_type s = default_seed);
+
+    /*! This method initializes this \p linear_congruential_engine's state, and optionally accepts
+     *  a seed value.
+     *
+     *  \param s The seed used to initializes this \p linear_congruential_engine's state.
+     */
+    __host__ __device__
+    void seed(result_type s = default_seed);
+
+    // generating functions
+
+    /*! This member function produces a new random value and updates this \p linear_congruential_engine's state.
+     *  \return A new random number.
+     */
+    __host__ __device__
+    result_type operator()(void);
+
+    /*! This member function advances this \p linear_congruential_engine's state a given number of times
+     *  and discards the results.
+     *
+     *  \param z The number of random values to discard.
+     *  \note This function is provided because an implementation may be able to accelerate it.
+     */
+    __host__ __device__
+    void discard(unsigned long long z);
+
+    /*! \cond
+     */
+  private:
+    result_type m_x;
+
+    static void transition(result_type &state);
+
+    friend struct thrust::random::detail::random_core_access;
+
+    friend struct thrust::random::detail::linear_congruential_engine_discard;
+
+    __host__ __device__
+    bool equal(const linear_congruential_engine &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+
+    /*! \endcond
+     */
+}; // end linear_congruential_engine
+
+
+/*! This function checks two \p linear_congruential_engines for equality.
+ *  \param lhs The first \p linear_congruential_engine to test.
+ *  \param rhs The second \p linear_congruential_engine to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_>
+__host__ __device__
+bool operator==(const linear_congruential_engine<UIntType_,a_,c_,m_> &lhs,
+                const linear_congruential_engine<UIntType_,a_,c_,m_> &rhs);
+
+
+/*! This function checks two \p linear_congruential_engines for inequality.
+ *  \param lhs The first \p linear_congruential_engine to test.
+ *  \param rhs The second \p linear_congruential_engine to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_>
+__host__ __device__
+bool operator!=(const linear_congruential_engine<UIntType_,a_,c_,m_> &lhs,
+                const linear_congruential_engine<UIntType_,a_,c_,m_> &rhs);
+
+
+/*! This function streams a linear_congruential_engine to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param e The \p linear_congruential_engine to stream out.
+ *  \return \p os
+ */
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const linear_congruential_engine<UIntType_,a_,c_,m_> &e);
+
+
+/*! This function streams a linear_congruential_engine in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param e The \p linear_congruential_engine to stream in.
+ *  \return \p is
+ */
+template<typename UIntType_, UIntType_ a_, UIntType_ c_, UIntType_ m_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           linear_congruential_engine<UIntType_,a_,c_,m_> &e);
+
+
+/*! \} // random_number_engine_templates
+ */
+
+
+/*! \addtogroup predefined_random
+ *  \{
+ */
+
+// XXX the type N2111 used here was uint_fast32_t
+
+/*! \typedef minstd_rand0
+ *  \brief A random number engine with predefined parameters which implements a version of
+ *         the Minimal Standard random number generation algorithm.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand0
+ *        shall produce the value \c 1043618065 .
+ */
+typedef linear_congruential_engine<thrust::detail::uint32_t, 16807, 0, 2147483647> minstd_rand0;
+
+
+/*! \typedef minstd_rand
+ *  \brief A random number engine with predefined parameters which implements a version of
+ *         the Minimal Standard random number generation algorithm.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand
+ *        shall produce the value \c 399268537 .
+ */
+typedef linear_congruential_engine<thrust::detail::uint32_t, 48271, 0, 2147483647> minstd_rand;
+
+/*! \} // predefined_random
+ */
+  
+} // end random
+
+// import names into thrust::
+using random::linear_congruential_engine;
+using random::minstd_rand;
+using random::minstd_rand0;
+
+} // end thrust
+
+#include <thrust/random/detail/linear_congruential_engine.inl>
+
diff --git a/thrust/thrust/random/linear_feedback_shift_engine.h b/thrust/thrust/random/linear_feedback_shift_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..90c572c9baa2eca22c663a8dd5b9d1a5dbc7a280
--- /dev/null
+++ b/thrust/thrust/random/linear_feedback_shift_engine.h
@@ -0,0 +1,230 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file linear_feedback_shift_engine.h
+ *  \brief A linear feedback shift pseudorandom number generator.
+ */
+
+/*
+ * Copyright Jens Maurer 2002
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/random/detail/linear_feedback_shift_engine_wordmask.h>
+#include <iostream>
+#include <cstddef> // for size_t
+#include <thrust/random/detail/random_core_access.h>
+
+namespace thrust
+{
+
+
+namespace random
+{
+
+/*! \addtogroup random_number_engine_templates
+ *  \{
+ */
+
+/*! \class linear_feedback_shift_engine
+ *  \brief A \p linear_feedback_shift_engine random number engine produces
+ *         unsigned integer random values using a linear feedback shift random number
+ *         generation algorithm.
+ *
+ *  \tparam UIntType The type of unsigned integer to produce.
+ *  \tparam w The word size of the produced values (<tt>w <= sizeof(UIntType)</tt>).
+ *  \tparam k The k parameter of Tausworthe's 1965 algorithm.
+ *  \tparam q The q exponent of Tausworthe's 1965 algorithm.
+ *  \tparam s The step size of Tausworthe's 1965 algorithm.
+ *
+ *  \note linear_feedback_shift_engine is based on the Boost Template Library's linear_feedback_shift.
+ */
+template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  class linear_feedback_shift_engine
+{
+  public:
+    // types
+
+    /*! \typedef result_type
+     *  \brief The type of the unsigned integer produced by this \p linear_feedback_shift_engine.
+     */
+    typedef UIntType result_type;
+
+    // engine characteristics
+
+    /*! The word size of the produced values.
+     */
+    static const size_t word_size = w;
+
+    /*! A constant used in the generation algorithm.
+     */
+    static const size_t exponent1 = k;
+
+    /*! A constant used in the generation algorithm.
+     */
+    static const size_t exponent2 = q;
+
+    /*! The step size used in the generation algorithm.
+     */
+    static const size_t step_size = s;
+
+    /*! \cond
+     */
+  private:
+    static const result_type wordmask =
+      detail::linear_feedback_shift_engine_wordmask<
+        result_type,
+        w
+      >::value;
+    /*! \endcond
+     */
+
+  public:
+
+    /*! The smallest value this \p linear_feedback_shift_engine may potentially produce.
+     */
+    static const result_type min = 0;
+
+    /*! The largest value this \p linear_feedback_shift_engine may potentially produce.
+     */
+    static const result_type max = wordmask;
+
+    /*! The default seed of this \p linear_feedback_shift_engine.
+     */
+    static const result_type default_seed = 341u;
+
+    // constructors and seeding functions
+
+    /*! This constructor, which optionally accepts a seed, initializes a new
+     *  \p linear_feedback_shift_engine.
+     *  
+     *  \param value The seed used to intialize this \p linear_feedback_shift_engine's state.
+     */
+    __host__ __device__
+    explicit linear_feedback_shift_engine(result_type value = default_seed);
+
+    /*! This method initializes this \p linear_feedback_shift_engine's state, and optionally accepts
+     *  a seed value.
+     *
+     *  \param value The seed used to initializes this \p linear_feedback_shift_engine's state.
+     */
+    __host__ __device__
+    void seed(result_type value = default_seed);
+
+    // generating functions
+    
+    /*! This member function produces a new random value and updates this \p linear_feedback_shift_engine's state.
+     *  \return A new random number.
+     */
+    __host__ __device__
+    result_type operator()(void);
+
+    /*! This member function advances this \p linear_feedback_shift_engine's state a given number of times
+     *  and discards the results.
+     *
+     *  \param z The number of random values to discard.
+     *  \note This function is provided because an implementation may be able to accelerate it.
+     */
+    __host__ __device__
+    void discard(unsigned long long z);
+
+    /*! \cond
+     */
+  private:
+    result_type m_value;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const linear_feedback_shift_engine &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+
+    /*! \endcond
+     */
+}; // end linear_feedback_shift_engine
+
+
+/*! This function checks two \p linear_feedback_shift_engines for equality.
+ *  \param lhs The first \p linear_feedback_shift_engine to test.
+ *  \param rhs The second \p linear_feedback_shift_engine to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_>
+__host__ __device__
+bool operator==(const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &lhs,
+                const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &rhs);
+
+
+/*! This function checks two \p linear_feedback_shift_engines for inequality.
+ *  \param lhs The first \p linear_feedback_shift_engine to test.
+ *  \param rhs The second \p linear_feedback_shift_engine to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_>
+__host__ __device__
+bool operator!=(const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &lhs,
+                const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &rhs);
+
+
+/*! This function streams a linear_feedback_shift_engine to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param e The \p linear_feedback_shift_engine to stream out.
+ *  \return \p os
+ */
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &e);
+
+
+/*! This function streams a linear_feedback_shift_engine in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param e The \p linear_feedback_shift_engine to stream in.
+ *  \return \p is
+ */
+template<typename UIntType_, size_t w_, size_t k_, size_t q_, size_t s_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           linear_feedback_shift_engine<UIntType_,w_,k_,q_,s_> &e);
+
+
+/*! \} // end random_number_engine_templates
+ */
+
+
+} // end random
+
+// import names into thrust::
+using random::linear_feedback_shift_engine;
+
+} // end thrust
+
+#include <thrust/random/detail/linear_feedback_shift_engine.inl>
+
diff --git a/thrust/thrust/random/normal_distribution.h b/thrust/thrust/random/normal_distribution.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac45e161a082aff69d496d37eb4ec327c994db64
--- /dev/null
+++ b/thrust/thrust/random/normal_distribution.h
@@ -0,0 +1,275 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file normal_distribution.h
+ *  \brief A normal (Gaussian) distribution of real-valued numbers.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/random/detail/random_core_access.h>
+#include <thrust/random/detail/normal_distribution_base.h>
+#include <iostream>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+/*! \addtogroup random_number_distributions
+ *  \{
+ */
+
+/*! \class normal_distribution
+ *  \brief A \p normal_distribution random number distribution produces floating point
+ *         Normally distributed random numbers.
+ *
+ *  \tparam RealType The type of floating point number to produce.
+ *
+ *  The following code snippet demonstrates examples of using a \p normal_distribution with a 
+ *  random number engine to produce random values drawn from the Normal distribution with a given
+ *  mean and variance:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <thrust/random/normal_distribution.h>
+ *
+ *  int main(void)
+ *  {
+ *    // create a minstd_rand object to act as our source of randomness
+ *    thrust::minstd_rand rng;
+ *
+ *    // create a normal_distribution to produce floats from the Normal distribution
+ *    // with mean 2.0 and standard deviation 3.5
+ *    thrust::random::normal_distribution<float> dist(2.0f, 3.5f);
+ *
+ *    // write a random number to standard output
+ *    std::cout << dist(rng) << std::endl;
+ *
+ *    // write the mean of the distribution, just in case we forgot
+ *    std::cout << dist.mean() << std::endl;
+ *
+ *    // 2.0 is printed
+ *
+ *    // and the standard deviation
+ *    std::cout << dist.stddev() << std::endl;
+ *
+ *    // 3.5 is printed
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ */
+template<typename RealType = double>
+  class normal_distribution
+    : public detail::normal_distribution_base<RealType>::type
+{
+  private:
+    typedef typename detail::normal_distribution_base<RealType>::type super_t;
+
+  public:
+    // types
+    
+    /*! \typedef result_type
+     *  \brief The type of the floating point number produced by this \p normal_distribution.
+     */
+    typedef RealType result_type;
+
+    /*! \typedef param_type
+     *  \brief The type of the object encapsulating this \p normal_distribution's parameters.
+     */
+    typedef thrust::pair<RealType,RealType> param_type;
+
+    // constructors and reset functions
+    
+    /*! This constructor creates a new \p normal_distribution from two values defining the
+     *  half-open interval of the distribution.
+     *  
+     *  \param mean The mean (expected value) of the distribution. Defaults to \c 0.0.
+     *  \param stddev The standard deviation of the distribution. Defaults to \c 1.0.
+     */
+    __host__ __device__
+    explicit normal_distribution(RealType mean = 0.0, RealType stddev = 1.0);
+
+    /*! This constructor creates a new \p normal_distribution from a \p param_type object
+     *  encapsulating the range of the distribution.
+     *  
+     *  \param parm A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of the distribution.
+     */
+    __host__ __device__
+    explicit normal_distribution(const param_type &parm);
+
+    /*! Calling this member function guarantees that subsequent uses of this
+     *  \p normal_distribution do not depend on values produced by any random
+     *  number generator prior to invoking this function.
+     */
+    __host__ __device__
+    void reset(void);
+
+    // generating functions
+
+    /*! This method produces a new Normal random integer drawn from this \p normal_distribution's
+     *  range using a \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng);
+
+    /*! This method produces a new Normal random integer as if by creating a new \p normal_distribution 
+     *  from the given \p param_type object, and calling its <tt>operator()</tt> method with the given
+     *  \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     *  \param parm A \p param_type object encapsulating the parameters of the \p normal_distribution
+     *              to draw from.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm);
+
+    // property functions
+
+    /*! This method returns the value of the parameter with which this \p normal_distribution
+     *  was constructed.
+     *
+     *  \return The mean (expected value) of this \p normal_distribution's output.
+     */
+    __host__ __device__
+    result_type mean(void) const;
+
+    /*! This method returns the value of the parameter with which this \p normal_distribution
+     *  was constructed.
+     *
+     *  \return The standard deviation of this \p uniform_real_distribution's output.
+     */
+    __host__ __device__
+    result_type stddev(void) const;
+
+    /*! This method returns a \p param_type object encapsulating the parameters with which this
+     *  \p normal_distribution was constructed.
+     *
+     *  \return A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of this \p normal_distribution.
+     */
+    __host__ __device__
+    param_type param(void) const;
+
+    /*! This method changes the parameters of this \p normal_distribution using the values encapsulated
+     *  in a given \p param_type object.
+     *
+     *  \param parm A \p param_type object encapsulating the new parameters (i.e., the mean and variance) of this \p normal_distribution.
+     */
+    __host__ __device__
+    void param(const param_type &parm);
+
+    /*! This method returns the smallest floating point number this \p normal_distribution can potentially produce.
+     *
+     *  \return The lower bound of this \p normal_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce.
+     *
+     *  \return The upper bound of this \p normal_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! \cond
+     */
+  private:
+    param_type m_param;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const normal_distribution &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+    /*! \endcond
+     */
+}; // end normal_distribution
+
+
+/*! This function checks two \p normal_distributions for equality.
+ *  \param lhs The first \p normal_distribution to test.
+ *  \param rhs The second \p normal_distribution to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename RealType>
+__host__ __device__
+bool operator==(const normal_distribution<RealType> &lhs,
+                const normal_distribution<RealType> &rhs);
+
+
+/*! This function checks two \p normal_distributions for inequality.
+ *  \param lhs The first \p normal_distribution to test.
+ *  \param rhs The second \p normal_distribution to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename RealType>
+__host__ __device__
+bool operator!=(const normal_distribution<RealType> &lhs,
+                const normal_distribution<RealType> &rhs);
+
+
+/*! This function streams a normal_distribution to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param d The \p normal_distribution to stream out.
+ *  \return \p os
+ */
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const normal_distribution<RealType> &d);
+
+
+/*! This function streams a normal_distribution in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param d The \p normal_distribution to stream in.
+ *  \return \p is
+ */
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           normal_distribution<RealType> &d);
+
+
+/*! \} // end random_number_distributions
+ */
+
+
+} // end random
+
+using random::normal_distribution;
+
+} // end thrust
+
+#include <thrust/random/detail/normal_distribution.inl>
+
diff --git a/thrust/thrust/random/subtract_with_carry_engine.h b/thrust/thrust/random/subtract_with_carry_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b12ca3530a5bed1d38b816359fcce4b99d6d9d5
--- /dev/null
+++ b/thrust/thrust/random/subtract_with_carry_engine.h
@@ -0,0 +1,256 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file subtract_with_carry_engine.h
+ *  \brief A subtract-with-carry pseudorandom number generator
+ *         based on Marsaglia & Zaman.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/random/detail/random_core_access.h>
+
+#include <thrust/detail/cstdint.h>
+#include <cstddef> // for size_t
+#include <iostream>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+/*! \addtogroup random_number_engine_templates
+ *  \{
+ */
+
+/*! \class subtract_with_carry_engine
+ *  \brief A \p subtract_with_carry_engine random number engine produces unsigned
+ *         integer random numbers using the subtract with carry algorithm of Marsaglia & Zaman.
+ *
+ *         The generation algorithm is performed as follows:
+ *         -# Let <tt>Y = X_{i-s}- X_{i-r} - c</tt>.
+ *         -# Set <tt>X_i</tt> to <tt>y = T mod m</tt>. Set \c c to \c 1 if <tt>Y < 0</tt>, otherwise set \c c to \c 0.
+ *
+ *         This algorithm corresponds to a modular linear function of the form
+ *
+ *         <tt>TA(x_i) = (a * x_i) mod b</tt>, where \c b is of the form <tt>m^r - m^s + 1</tt> and
+ *         <tt>a = b - (b-1)/m</tt>.
+ *
+ *  \tparam UIntType The type of unsigned integer to produce.
+ *  \tparam w The word size of the produced values (<tt> w <= sizeof(UIntType)</tt>).
+ *  \tparam s The short lag of the generation algorithm.
+ *  \tparam r The long lag of the generation algorithm.
+ *
+ *  \note Inexperienced users should not use this class template directly.  Instead, use
+ *  \p ranlux24_base or \p ranlux48_base, which are instances of \p subtract_with_carry_engine.
+ *
+ *  \see thrust::random::ranlux24_base
+ *  \see thrust::random::ranlux48_base
+ */
+template<typename UIntType, size_t w, size_t s, size_t r>
+  class subtract_with_carry_engine
+{
+    /*! \cond
+     */
+  private:
+    static const UIntType modulus = UIntType(1) << w;
+    /*! \endcond
+     */
+
+  public:
+    // types
+    
+    /*! \typedef result_type
+     *  \brief The type of the unsigned integer produced by this \p subtract_with_carry_engine.
+     */
+    typedef UIntType result_type;
+
+    // engine characteristics
+
+    /*! The word size of the produced values.
+     */
+    static const size_t word_size = w;
+
+    /*! The size of the short lag used in the generation algorithm.
+     */
+    static const size_t short_lag = s;
+
+    /*! The size of the long lag used in the generation algorithm.
+     */
+    static const size_t long_lag = r;
+
+    /*! The smallest value this \p subtract_with_carry_engine may potentially produce.
+     */
+    static const result_type min = 0;
+
+    /*! The largest value this \p subtract_with_carry_engine may potentially produce.
+     */
+    static const result_type max = modulus - 1;
+
+    /*! The default seed of this \p subtract_with_carry_engine.
+     */
+    static const result_type default_seed = 19780503u;
+
+    // constructors and seeding functions
+
+    /*! This constructor, which optionally accepts a seed, initializes a new
+     *  \p subtract_with_carry_engine.
+     *  
+     *  \param value The seed used to intialize this \p subtract_with_carry_engine's state.
+     */
+    __host__ __device__
+    explicit subtract_with_carry_engine(result_type value = default_seed);
+
+    /*! This method initializes this \p subtract_with_carry_engine's state, and optionally accepts
+     *  a seed value.
+     *
+     *  \param value The seed used to initializes this \p subtract_with_carry_engine's state.
+     */
+    __host__ __device__
+    void seed(result_type value = default_seed);
+
+    // generating functions
+    
+    /*! This member function produces a new random value and updates this \p subtract_with_carry_engine's state.
+     *  \return A new random number.
+     */
+    __host__ __device__
+    result_type operator()(void);
+
+    /*! This member function advances this \p subtract_with_carry_engine's state a given number of times
+     *  and discards the results.
+     *
+     *  \param z The number of random values to discard.
+     *  \note This function is provided because an implementation may be able to accelerate it.
+     */
+    __host__ __device__
+    void discard(unsigned long long z);
+
+    /*! \cond
+     */
+  private:
+    result_type m_x[long_lag];
+    unsigned int m_k;
+    int m_carry;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const subtract_with_carry_engine &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+
+    /*! \endcond
+     */
+}; // end subtract_with_carry_engine
+
+
+/*! This function checks two \p subtract_with_carry_engines for equality.
+ *  \param lhs The first \p subtract_with_carry_engine to test.
+ *  \param rhs The second \p subtract_with_carry_engine to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, size_t w_, size_t s_, size_t r_>
+__host__ __device__
+bool operator==(const subtract_with_carry_engine<UIntType_,w_,s_,r_> &lhs,
+                const subtract_with_carry_engine<UIntType_,w_,s_,r_> &rhs);
+
+
+/*! This function checks two \p subtract_with_carry_engines for inequality.
+ *  \param lhs The first \p subtract_with_carry_engine to test.
+ *  \param rhs The second \p subtract_with_carry_engine to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename UIntType_, size_t w_, size_t s_, size_t r_>
+__host__ __device__
+bool operator!=(const subtract_with_carry_engine<UIntType_,w_,s_,r_>&lhs,
+                const subtract_with_carry_engine<UIntType_,w_,s_,r_>&rhs);
+
+
+/*! This function streams a subtract_with_carry_engine to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param e The \p subtract_with_carry_engine to stream out.
+ *  \return \p os
+ */
+template<typename UIntType_, size_t w_, size_t s_, size_t r_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const subtract_with_carry_engine<UIntType_,w_,s_,r_> &e);
+
+
+/*! This function streams a subtract_with_carry_engine in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param e The \p subtract_with_carry_engine to stream in.
+ *  \return \p is
+ */
+template<typename UIntType_, size_t w_, size_t s_, size_t r_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           subtract_with_carry_engine<UIntType_,w_,s_,r_> &e);
+
+
+/*! \} // end random_number_engine_templates
+ */
+
+
+/*! \addtogroup predefined_random
+ *  \{
+ */
+
+// XXX N2111 uses uint_fast32_t here
+
+/*! \typedef ranlux24_base
+ *  \brief A random number engine with predefined parameters which implements the
+ *         base engine of the \p ranlux24 random number engine.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24_base
+ *        shall produce the value \c 7937952 .
+ */
+typedef subtract_with_carry_engine<thrust::detail::uint32_t, 24, 10, 24> ranlux24_base;
+
+
+// XXX N2111 uses uint_fast64_t here
+
+/*! \typedef ranlux48_base
+ *  \brief A random number engine with predefined parameters which implements the
+ *         base engine of the \p ranlux48 random number engine.
+ *  \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48_base
+ *        shall produce the value \c 192113843633948 .
+ */
+typedef subtract_with_carry_engine<thrust::detail::uint64_t, 48,  5, 12> ranlux48_base;
+
+/*! \} // end predefined_random
+ */
+
+} // end random
+
+// import names into thrust::
+using random::subtract_with_carry_engine;
+using random::ranlux24_base;
+using random::ranlux48_base;
+
+} // end thrust
+
+#include <thrust/random/detail/subtract_with_carry_engine.inl>
+
diff --git a/thrust/thrust/random/uniform_int_distribution.h b/thrust/thrust/random/uniform_int_distribution.h
new file mode 100644
index 0000000000000000000000000000000000000000..42d745781964e3b4b85add7530fbaa2029635511
--- /dev/null
+++ b/thrust/thrust/random/uniform_int_distribution.h
@@ -0,0 +1,276 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uniform_int_distribution.h
+ *  \brief A uniform distribution of integer-valued numbers
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/detail/integer_traits.h>
+#include <thrust/random/detail/random_core_access.h>
+#include <iostream>
+
+namespace thrust
+{
+
+namespace random
+{
+
+/*! \addtogroup random_number_distributions Random Number Distributions Class Templates
+ *  \ingroup random
+ *  \{
+ */
+
+/*! \class uniform_int_distribution
+ *  \brief A \p uniform_int_distribution random number distribution produces signed or unsigned integer
+ *         uniform random numbers from a given range.
+ *
+ *  \tparam IntType The type of integer to produce.
+ *
+ *  The following code snippet demonstrates examples of using a \p uniform_int_distribution with a 
+ *  random number engine to produce random integers drawn from a given range:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <thrust/random/uniform_int_distribution.h>
+ *
+ *  int main(void)
+ *  {
+ *    // create a minstd_rand object to act as our source of randomness
+ *    thrust::minstd_rand rng;
+ *
+ *    // create a uniform_int_distribution to produce ints from [-7,13]
+ *    thrust::uniform_int_distribution<int> dist(-7,13);
+ *
+ *    // write a random number from the range [-7,13] to standard output
+ *    std::cout << dist(rng) << std::endl;
+ *
+ *    // write the range of the distribution, just in case we forgot
+ *    std::cout << dist.min() << std::endl;
+ *
+ *    // -7 is printed
+ *
+ *    std::cout << dist.max() << std::endl;
+ *
+ *    // 13 is printed
+ *
+ *    // write the parameters of the distribution (which happen to be the bounds) to standard output
+ *    std::cout << dist.a() << std::endl;
+ *
+ *    // -7 is printed
+ *
+ *    std::cout << dist.b() << std::endl;
+ *
+ *    // 13 is printed
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ */
+template<typename IntType = int>
+  class uniform_int_distribution
+{
+  public:
+    // types
+
+    /*! \typedef result_type
+     *  \brief The type of the integer produced by this \p uniform_int_distribution.
+     */
+    typedef IntType result_type;
+
+    /*! \typedef param_type
+     *  \brief The type of the object encapsulating this \p uniform_int_distribution's parameters.
+     */
+    typedef thrust::pair<IntType,IntType> param_type;
+
+    // constructors and reset functions
+
+    /*! This constructor creates a new \p uniform_int_distribution from two values defining the
+     *  range of the distribution.
+     *  
+     *  \param a The smallest integer to potentially produce. Defaults to \c 0.
+     *  \param b The largest integer to potentially produce. Defaults to the largest representable integer in
+     *           the platform.
+     */
+    __host__ __device__
+    explicit uniform_int_distribution(IntType a = 0, IntType b = thrust::detail::integer_traits<IntType>::const_max);
+
+    /*! This constructor creates a new \p uniform_int_distribution from a \p param_type object
+     *  encapsulating the range of the distribution.
+     *  
+     *  \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution.
+     */
+    __host__ __device__
+    explicit uniform_int_distribution(const param_type &parm);
+
+    /*! This does nothing.  It is included to conform to the requirements of the RandomDistribution concept.
+     */
+    __host__ __device__
+    void reset(void);
+
+    // generating functions
+
+    /*! This method produces a new uniform random integer drawn from this \p uniform_int_distribution's
+     *  range using a \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng);
+
+    /*! This method produces a new uniform random integer as if by creating a new \p uniform_int_distribution 
+     *  from the given \p param_type object, and calling its <tt>operator()</tt> method with the given
+     *  \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     *  \param parm A \p param_type object encapsulating the parameters of the \p uniform_int_distribution
+     *              to draw from.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm);
+
+    // property functions
+    
+    /*! This method returns the value of the parameter with which this \p uniform_int_distribution
+     *  was constructed.
+     *
+     *  \return The lower bound of this \p uniform_int_distribution's range.
+     */
+    __host__ __device__
+    result_type a(void) const;
+
+    /*! This method returns the value of the parameter with which this \p uniform_int_distribution
+     *  was constructed.
+     *
+     *  \return The upper bound of this \p uniform_int_distribution's range.
+     */
+    __host__ __device__
+    result_type b(void) const;
+
+    /*! This method returns a \p param_type object encapsulating the parameters with which this
+     *  \p uniform_int_distribution was constructed.
+     *
+     *  \return A \p param_type object enapsulating the range of this \p uniform_int_distribution.
+     */
+    __host__ __device__
+    param_type param(void) const;
+
+    /*! This method changes the parameters of this \p uniform_int_distribution using the values encapsulated
+     *  in a given \p param_type object.
+     *
+     *  \param parm A \p param_type object encapsulating the new range of this \p uniform_int_distribution.
+     */
+    __host__ __device__
+    void param(const param_type &parm);
+
+    /*! This method returns the smallest integer this \p uniform_int_distribution can potentially produce.
+     *
+     *  \return The lower bound of this \p uniform_int_distribution's range.
+     */
+    __host__ __device__
+    result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! This method returns the largest integer this \p uniform_int_distribution can potentially produce.
+     *
+     *  \return The upper bound of this \p uniform_int_distribution's range.
+     */
+    __host__ __device__
+    result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! \cond
+     */
+  private:
+    param_type m_param;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const uniform_int_distribution &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+    /*! \endcond
+     */
+}; // end uniform_int_distribution
+
+
+/*! This function checks two \p uniform_int_distributions for equality.
+ *  \param lhs The first \p uniform_int_distribution to test.
+ *  \param rhs The second \p uniform_int_distribution to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename IntType>
+__host__ __device__
+bool operator==(const uniform_int_distribution<IntType> &lhs,
+                const uniform_int_distribution<IntType> &rhs);
+
+
+/*! This function checks two \p uniform_int_distributions for inequality.
+ *  \param lhs The first \p uniform_int_distribution to test.
+ *  \param rhs The second \p uniform_int_distribution to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename IntType>
+__host__ __device__
+bool operator!=(const uniform_int_distribution<IntType> &lhs,
+                const uniform_int_distribution<IntType> &rhs);
+
+
+/*! This function streams a uniform_int_distribution to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param d The \p uniform_int_distribution to stream out.
+ *  \return \p os
+ */
+template<typename IntType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const uniform_int_distribution<IntType> &d);
+
+
+/*! This function streams a uniform_int_distribution in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param d The \p uniform_int_distribution to stream in.
+ *  \return \p is
+ */
+template<typename IntType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           uniform_int_distribution<IntType> &d);
+
+
+/*! \} // end random_number_distributions
+ */
+
+
+} // end random
+
+using random::uniform_int_distribution;
+
+} // end thrust
+
+#include <thrust/random/detail/uniform_int_distribution.inl>
+
diff --git a/thrust/thrust/random/uniform_real_distribution.h b/thrust/thrust/random/uniform_real_distribution.h
new file mode 100644
index 0000000000000000000000000000000000000000..31210457087c65b689e769570af946dcd6d4c9e1
--- /dev/null
+++ b/thrust/thrust/random/uniform_real_distribution.h
@@ -0,0 +1,274 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uniform_real_distribution.h
+ *  \brief A uniform distribution of real-valued numbers
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/random/detail/random_core_access.h>
+#include <iostream>
+
+namespace thrust
+{
+
+namespace random
+{
+
+
+/*! \addtogroup random_number_distributions
+ *  \{
+ */
+
+/*! \class uniform_real_distribution
+ *  \brief A \p uniform_real_distribution random number distribution produces floating point
+ *         uniform random numbers from a half-open interval.
+ *
+ *  \tparam RealType The type of floating point number to produce.
+ *
+ *  The following code snippet demonstrates examples of using a \p uniform_real_distribution with a 
+ *  random number engine to produce random integers drawn from a given range:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <thrust/random/uniform_real_distribution.h>
+ *
+ *  int main(void)
+ *  {
+ *    // create a minstd_rand object to act as our source of randomness
+ *    thrust::minstd_rand rng;
+ *
+ *    // create a uniform_real_distribution to produce floats from [-7,13)
+ *    thrust::uniform_real_distribution<float> dist(-7,13);
+ *
+ *    // write a random number from the range [-7,13) to standard output
+ *    std::cout << dist(rng) << std::endl;
+ *
+ *    // write the range of the distribution, just in case we forgot
+ *    std::cout << dist.min() << std::endl;
+ *
+ *    // -7.0 is printed
+ *
+ *    std::cout << dist.max() << std::endl;
+ *
+ *    // 13.0 is printed
+ *
+ *    // write the parameters of the distribution (which happen to be the bounds) to standard output
+ *    std::cout << dist.a() << std::endl;
+ *
+ *    // -7.0 is printed
+ *
+ *    std::cout << dist.b() << std::endl;
+ *
+ *    // 13.0 is printed
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ */
+template<typename RealType = double>
+  class uniform_real_distribution
+{
+  public:
+    // types
+    
+    /*! \typedef result_type
+     *  \brief The type of the floating point number produced by this \p uniform_real_distribution.
+     */
+    typedef RealType result_type;
+
+    /*! \typedef param_type
+     *  \brief The type of the object encapsulating this \p uniform_real_distribution's parameters.
+     */
+    typedef thrust::pair<RealType,RealType> param_type;
+
+    // constructors and reset functions
+    
+    /*! This constructor creates a new \p uniform_real_distribution from two values defining the
+     *  half-open interval of the distribution.
+     *  
+     *  \param a The smallest floating point number to potentially produce. Defaults to \c 0.0.
+     *  \param b The smallest number larger than the largest floating point number to potentially produce. Defaults to \c 1.0.
+     */
+    __host__ __device__
+    explicit uniform_real_distribution(RealType a = 0.0, RealType b = 1.0);
+
+    /*! This constructor creates a new \p uniform_real_distribution from a \p param_type object
+     *  encapsulating the range of the distribution.
+     *  
+     *  \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution.
+     */
+    __host__ __device__
+    explicit uniform_real_distribution(const param_type &parm);
+
+    /*! This does nothing.  It is included to conform to the requirements of the RandomDistribution concept.
+     */
+    __host__ __device__
+    void reset(void);
+
+    // generating functions
+
+    /*! This method produces a new uniform random integer drawn from this \p uniform_real_distribution's
+     *  range using a \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng);
+
+    /*! This method produces a new uniform random integer as if by creating a new \p uniform_real_distribution 
+     *  from the given \p param_type object, and calling its <tt>operator()</tt> method with the given
+     *  \p UniformRandomNumberGenerator as a source of randomness.
+     *
+     *  \param urng The \p UniformRandomNumberGenerator to use as a source of randomness.
+     *  \param parm A \p param_type object encapsulating the parameters of the \p uniform_real_distribution
+     *              to draw from.
+     */
+    template<typename UniformRandomNumberGenerator>
+    __host__ __device__
+    result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm);
+
+    // property functions
+
+    /*! This method returns the value of the parameter with which this \p uniform_real_distribution
+     *  was constructed.
+     *
+     *  \return The lower bound of this \p uniform_real_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type a(void) const;
+
+    /*! This method returns the value of the parameter with which this \p uniform_real_distribution
+     *  was constructed.
+     *
+     *  \return The upper bound of this \p uniform_real_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type b(void) const;
+
+    /*! This method returns a \p param_type object encapsulating the parameters with which this
+     *  \p uniform_real_distribution was constructed.
+     *
+     *  \return A \p param_type object enapsulating the half-open interval of this \p uniform_real_distribution.
+     */
+    __host__ __device__
+    param_type param(void) const;
+
+    /*! This method changes the parameters of this \p uniform_real_distribution using the values encapsulated
+     *  in a given \p param_type object.
+     *
+     *  \param parm A \p param_type object encapsulating the new half-open interval of this \p uniform_real_distribution.
+     */
+    __host__ __device__
+    void param(const param_type &parm);
+
+    /*! This method returns the smallest floating point number this \p uniform_real_distribution can potentially produce.
+     *
+     *  \return The lower bound of this \p uniform_real_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce.
+     *
+     *  \return The upper bound of this \p uniform_real_distribution's half-open interval.
+     */
+    __host__ __device__
+    result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const;
+
+    /*! \cond
+     */
+  private:
+    param_type m_param;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const uniform_real_distribution &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+    /*! \endcond
+     */
+}; // end uniform_real_distribution
+
+
+/*! This function checks two \p uniform_real_distributions for equality.
+ *  \param lhs The first \p uniform_real_distribution to test.
+ *  \param rhs The second \p uniform_real_distribution to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename RealType>
+__host__ __device__
+bool operator==(const uniform_real_distribution<RealType> &lhs,
+                const uniform_real_distribution<RealType> &rhs);
+
+
+/*! This function checks two \p uniform_real_distributions for inequality.
+ *  \param lhs The first \p uniform_real_distribution to test.
+ *  \param rhs The second \p uniform_real_distribution to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename RealType>
+__host__ __device__
+bool operator!=(const uniform_real_distribution<RealType> &lhs,
+                const uniform_real_distribution<RealType> &rhs);
+
+
+/*! This function streams a uniform_real_distribution to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param d The \p uniform_real_distribution to stream out.
+ *  \return \p os
+ */
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const uniform_real_distribution<RealType> &d);
+
+
+/*! This function streams a uniform_real_distribution in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param d The \p uniform_real_distribution to stream in.
+ *  \return \p is
+ */
+template<typename RealType,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           uniform_real_distribution<RealType> &d);
+
+
+/*! \} // end random_number_distributions
+ */
+
+
+} // end random
+
+using random::uniform_real_distribution;
+
+} // end thrust
+
+#include <thrust/random/detail/uniform_real_distribution.inl>
+
diff --git a/thrust/thrust/random/xor_combine_engine.h b/thrust/thrust/random/xor_combine_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5e86b7a9dc782e10e5ddbfbb8e378fdae69b2e8
--- /dev/null
+++ b/thrust/thrust/random/xor_combine_engine.h
@@ -0,0 +1,271 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file xor_combine_engine.h
+ *  \brief A pseudorandom number generator which produces pseudorandom
+ *         numbers from two integer base engines by merging their
+ *         pseudorandom numbers with bitwise exclusive-or.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/random/detail/xor_combine_engine_max.h>
+#include <thrust/random/detail/random_core_access.h>
+#include <iostream>
+#include <cstddef> // for size_t
+
+namespace thrust
+{
+
+namespace random
+{
+
+/*! \addtogroup random_number_engine_adaptors
+ *  \{
+ */
+
+/*! \class xor_combine_engine
+ *  \brief An \p xor_combine_engine adapts two existing base random number engines and
+ *         produces random values by combining the values produced by each.
+ *
+ *  \tparam Engine1 The type of the first base random number engine to adapt.
+ *  \tparam s1 The size of the first shift to use in the generation algorithm.
+ *  \tparam Engine2 The type of the second base random number engine to adapt.
+ *  \tparam s2 The second of the second shift to use in the generation algorithm. Defaults to \c 0.
+ *
+ *  The following code snippet shows an example of using an \p xor_combine_engine instance:
+ *
+ *  \code
+ *  #include <thrust/random/linear_congruential_engine.h>
+ *  #include <thrust/random/xor_combine_engine.h>
+ *  #include <iostream>
+ *
+ *  int main(void)
+ *  {
+ *    // create an xor_combine_engine from minstd_rand and minstd_rand0
+ *    // use a shift of 0 for each
+ *    thrust::xor_combine_engine<thrust::minstd_rand,0,thrust::minstd_rand0,0> rng;
+ *
+ *    // print a random number to standard output
+ *    std::cout << rng() << std::endl;
+ *
+ *    return 0;
+ *  }
+ *  \endcode
+ */
+template<typename Engine1, size_t s1,
+         typename Engine2, size_t s2=0u>
+  class xor_combine_engine
+{
+  public:
+    // types
+
+    /*! \typedef base1_type
+     *  \brief The type of the first adapted base random number engine.
+     */
+    typedef Engine1 base1_type;
+
+    /*! \typedef base2_type
+     *  \brief The type of the second adapted base random number engine.
+     */
+    typedef Engine2 base2_type;
+
+    /*! \typedef result_type
+     *  \brief The type of the unsigned integer produced by this \p xor_combine_engine.
+     */
+    typedef typename thrust::detail::eval_if<
+      (sizeof(typename base2_type::result_type) > sizeof(typename base1_type::result_type)),
+      thrust::detail::identity_<typename base2_type::result_type>,
+      thrust::detail::identity_<typename base1_type::result_type>
+    >::type result_type;
+    
+    /*! The size of the first shift used in the generation algorithm.
+     */
+    static const size_t shift1 = s1;
+
+    /*! The size of the second shift used in the generation algorithm.
+     */
+    static const size_t shift2 = s2;
+
+    /*! The smallest value this \p xor_combine_engine may potentially produce.
+     */
+    static const result_type min = 0;
+
+    /*! The largest value this \p xor_combine_engine may potentially produce.
+     */
+    static const result_type max =
+      detail::xor_combine_engine_max<
+        Engine1, s1, Engine2, s2, result_type
+      >::value;
+
+    // constructors and seeding functions
+
+    /*! This constructor constructs a new \p xor_combine_engine and constructs
+     *  its adapted engines using their null constructors.
+     */
+    __host__ __device__
+    xor_combine_engine(void);
+
+    /*! This constructor constructs a new \p xor_combine_engine using
+     *  given \p base1_type and \p base2_type engines to initialize its adapted base engines.
+     *
+     *  \param urng1 A \p base1_type to use to initialize this \p xor_combine_engine's
+     *         first adapted base engine.
+     *  \param urng2 A \p base2_type to use to initialize this \p xor_combine_engine's
+     *         first adapted base engine.
+     */
+    __host__ __device__
+    xor_combine_engine(const base1_type &urng1, const base2_type &urng2);
+
+    /*! This constructor initializes a new \p xor_combine_engine with a given seed.
+     *  
+     *  \param s The seed used to intialize this \p xor_combine_engine's adapted base engines.
+     */
+    __host__ __device__
+    xor_combine_engine(result_type s);
+
+    /*! This method initializes the state of this \p xor_combine_engine's adapted base engines
+     *  by using their \p default_seed values.
+     */
+    __host__ __device__
+    void seed(void);
+
+    /*! This method initializes the state of this \p xor_combine_engine's adapted base engines
+     *  by using the given seed.
+     *
+     *  \param s The seed with which to intialize this \p xor_combine_engine's adapted base engines.
+     */
+    __host__ __device__
+    void seed(result_type s);
+
+    // generating functions
+
+    /*! This member function produces a new random value and updates this \p xor_combine_engine's state.
+     *  \return A new random number.
+     */
+    __host__ __device__
+    result_type operator()(void);
+
+    /*! This member function advances this \p xor_combine_engine's state a given number of times
+     *  and discards the results.
+     *
+     *  \param z The number of random values to discard.
+     *  \note This function is provided because an implementation may be able to accelerate it.
+     */
+    __host__ __device__
+    void discard(unsigned long long z);
+
+    // property functions
+
+    /*! This member function returns a const reference to this \p xor_combine_engine's
+     *  first adapted base engine.
+     *
+     *  \return A const reference to the first base engine this \p xor_combine_engine adapts.
+     */
+    __host__ __device__
+    const base1_type &base1(void) const;
+
+    /*! This member function returns a const reference to this \p xor_combine_engine's
+     *  second adapted base engine.
+     *
+     *  \return A const reference to the second base engine this \p xor_combine_engine adapts.
+     */
+    __host__ __device__
+    const base2_type &base2(void) const;
+
+    /*! \cond
+     */
+  private:
+    base1_type m_b1;
+    base2_type m_b2;
+
+    friend struct thrust::random::detail::random_core_access;
+
+    __host__ __device__
+    bool equal(const xor_combine_engine &rhs) const;
+
+    template<typename CharT, typename Traits>
+    std::basic_istream<CharT,Traits>& stream_in(std::basic_istream<CharT,Traits> &is);
+
+    template<typename CharT, typename Traits>
+    std::basic_ostream<CharT,Traits>& stream_out(std::basic_ostream<CharT,Traits> &os) const;
+
+    /*! \endcond
+     */
+}; // end xor_combine_engine
+
+
+/*! This function checks two \p xor_combine_engines for equality.
+ *  \param lhs The first \p xor_combine_engine to test.
+ *  \param rhs The second \p xor_combine_engine to test.
+ *  \return \c true if \p lhs is equal to \p rhs; \c false, otherwise.
+ */
+template<typename Engine1_, size_t s1_, typename Engine2_, size_t s2_>
+__host__ __device__
+bool operator==(const xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &lhs,
+                const xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &rhs);
+
+
+/*! This function checks two \p xor_combine_engines for inequality.
+ *  \param lhs The first \p xor_combine_engine to test.
+ *  \param rhs The second \p xor_combine_engine to test.
+ *  \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise.
+ */
+template<typename Engine1_, size_t s1_, typename Engine2_, size_t s2_>
+__host__ __device__
+bool operator!=(const xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &lhs,
+                const xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &rhs);
+
+
+/*! This function streams a xor_combine_engine to a \p std::basic_ostream.
+ *  \param os The \p basic_ostream to stream out to.
+ *  \param e The \p xor_combine_engine to stream out.
+ *  \return \p os
+ */
+template<typename Engine1_, size_t s1_, typename Engine2_, size_t s2_,
+         typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits> &os,
+           const xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &e);
+
+
+/*! This function streams a xor_combine_engine in from a std::basic_istream.
+ *  \param is The \p basic_istream to stream from.
+ *  \param e The \p xor_combine_engine to stream in.
+ *  \return \p is
+ */
+template<typename Engine1_, size_t s1_, typename Engine2_, size_t s2_,
+         typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits> &is,
+           xor_combine_engine<Engine1_,s1_,Engine2_,s2_> &e);
+
+
+/*! \} // end random_number_engine_adaptors
+ */
+
+
+} // end random
+
+// import names into thrust::
+using random::xor_combine_engine;
+
+} // end thrust
+
+#include <thrust/random/detail/xor_combine_engine.inl>
+
diff --git a/thrust/thrust/reduce.h b/thrust/thrust/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..cabb83c377660d94a0d0ca88c4d87d108a8e5b25
--- /dev/null
+++ b/thrust/thrust/reduce.h
@@ -0,0 +1,785 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/reduce.h
+ *  \brief Functions for reducing a range to a single value
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reductions
+ *  \{
+ */
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \c 0 as the initial value of the
+ *  reduction. \p reduce is similar to the C++ Standard Template Library's
+ *  <tt>std::accumulate</tt>. The primary difference between the two functions
+ *  is that <tt>std::accumulate</tt> guarantees the order of summation, while
+ *  \p reduce requires associativity of the binary operation to parallelize
+ *  the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case operator+) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \return The result of the reduction.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
+ *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
+ *          \c value_type. If \c T is \c InputIterator's \c value_type, then
+ *          <tt>T(0)</tt> is defined.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to compute
+ *  the sum of a sequence of integers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(thrust::host, data, data + 6);
+ *
+ *  // result == 9
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ */
+template<typename DerivedPolicy, typename InputIterator>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::value_type
+    reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, InputIterator first, InputIterator last);
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \c 0 as the initial value of the
+ *  reduction. \p reduce is similar to the C++ Standard Template Library's
+ *  <tt>std::accumulate</tt>. The primary difference between the two functions
+ *  is that <tt>std::accumulate</tt> guarantees the order of summation, while
+ *  \p reduce requires associativity of the binary operation to parallelize
+ *  the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case operator+) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \return The result of the reduction.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
+ *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
+ *          \c value_type. If \c T is \c InputIterator's \c value_type, then
+ *          <tt>T(0)</tt> is defined.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to compute
+ *  the sum of a sequence of integers.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(data, data + 6);
+ *
+ *  // result == 9
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ */
+template<typename InputIterator> typename
+  thrust::iterator_traits<InputIterator>::value_type reduce(InputIterator first, InputIterator last);
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \p init as the initial value of the
+ *  reduction. \p reduce is similar to the C++ Standard Template Library's
+ *  <tt>std::accumulate</tt>. The primary difference between the two functions
+ *  is that <tt>std::accumulate</tt> guarantees the order of summation, while
+ *  \p reduce requires associativity of the binary operation to parallelize
+ *  the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case operator+) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param init The initial value.
+ *  \return The result of the reduction.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
+ *          then <tt>x + y</tt> is defined and is convertible to \p T.
+ *  \tparam T is convertible to \p InputIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to compute
+ *  the sum of a sequence of integers including an intialization value using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(thrust::host, data, data + 6, 1);
+ *
+ *  // result == 10
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ */
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+  T reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+           InputIterator first,
+           InputIterator last,
+           T init);
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \p init as the initial value of the
+ *  reduction. \p reduce is similar to the C++ Standard Template Library's
+ *  <tt>std::accumulate</tt>. The primary difference between the two functions
+ *  is that <tt>std::accumulate</tt> guarantees the order of summation, while
+ *  \p reduce requires associativity of the binary operation to parallelize
+ *  the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case operator+) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param init The initial value.
+ *  \return The result of the reduction.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
+ *          then <tt>x + y</tt> is defined and is convertible to \p T.
+ *  \tparam T is convertible to \p InputIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to compute
+ *  the sum of a sequence of integers including an intialization value.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(data, data + 6, 1);
+ *
+ *  // result == 10
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ */
+template<typename InputIterator, typename T>
+  T reduce(InputIterator first,
+           InputIterator last,
+           T init);
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \p init as the initial value of the
+ *  reduction and \p binary_op as the binary function used for summation. \p reduce
+ *  is similar to the C++ Standard Template Library's <tt>std::accumulate</tt>.
+ *  The primary difference between the two functions is that <tt>std::accumulate</tt>
+ *  guarantees the order of summation, while \p reduce requires associativity of
+ *  \p binary_op to parallelize the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case \p binary_op) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param init The initial value.
+ *  \param binary_op The binary function used to 'sum' values.
+ *  \return The result of the reduction.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \c InputIterator's \c value_type is convertible to \c T.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to
+ *  compute the maximum value of a sequence of integers using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(thrust::host,
+ *                              data, data + 6,
+ *                              -1,
+ *                              thrust::maximum<int>());
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see transform_reduce
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename T,
+         typename BinaryFunction>
+__host__ __device__
+  T reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+           InputIterator first,
+           InputIterator last,
+           T init,
+           BinaryFunction binary_op);
+
+
+/*! \p reduce is a generalization of summation: it computes the sum (or some
+ *  other binary operation) of all the elements in the range <tt>[first,
+ *  last)</tt>. This version of \p reduce uses \p init as the initial value of the
+ *  reduction and \p binary_op as the binary function used for summation. \p reduce
+ *  is similar to the C++ Standard Template Library's <tt>std::accumulate</tt>.
+ *  The primary difference between the two functions is that <tt>std::accumulate</tt>
+ *  guarantees the order of summation, while \p reduce requires associativity of
+ *  \p binary_op to parallelize the reduction.
+ *
+ *  Note that \p reduce also assumes that the binary reduction operator (in this
+ *  case \p binary_op) is commutative.  If the reduction operator is not commutative
+ *  then \p thrust::reduce should not be used.  Instead, one could use 
+ *  \p inclusive_scan (which does not require commutativity) and select the
+ *  last element of the output array.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param init The initial value.
+ *  \param binary_op The binary function used to 'sum' values.
+ *  \return The result of the reduction.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *          and \c InputIterator's \c value_type is convertible to \c T.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
+ *
+ *  The following code snippet demonstrates how to use \p reduce to
+ *  compute the maximum value of a sequence of integers.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *  int result = thrust::reduce(data, data + 6,
+ *                              -1,
+ *                              thrust::maximum<int>());
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see transform_reduce
+ */
+template<typename InputIterator,
+         typename T,
+         typename BinaryFunction>
+  T reduce(InputIterator first,
+           InputIterator last,
+           T init,
+           BinaryFunction binary_op);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c plus and the result copied to \c values_output. 
+ *
+ *  This version of \p reduce_by_key uses the function object \c equal_to
+ *  to test for equality and \c plus to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c plus and the result copied to \c values_output. 
+ *
+ *  This version of \p reduce_by_key uses the function object \c equal_to
+ *  to test for equality and \c plus to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::reduce_by_key(A, A + N, B, C, D);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c plus and the result copied to \c values_output. 
+ *
+ *  This version of \p reduce_by_key uses the function object \c binary_pred
+ *  to test for equality and \c plus to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c plus and the result copied to \c values_output. 
+ *
+ *  This version of \p reduce_by_key uses the function object \c binary_pred
+ *  to test for equality and \c plus to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c BinaryFunction \c binary_op and the result copied to \c values_output. 
+ *  Specifically, if consecutive key iterators \c i and \c (i + 1) are 
+ *  such that <tt>binary_pred(*i, *(i+1))</tt> is \c true, then the corresponding
+ *  values are reduced to a single value with \c binary_op.
+ *
+ *  This version of \p reduce_by_key uses the function object \c binary_pred
+ *  to test for equality and \c binary_op to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \param binary_op The binary function used to accumulate values.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int> binary_op;
+ *  new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred, binary_op);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred,
+                BinaryFunction binary_op);
+
+
+/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p reduce_by_key copies the first element of the group to the
+ *  \c keys_output. The corresponding values in the range are reduced using the
+ *  \c BinaryFunction \c binary_op and the result copied to \c values_output. 
+ *  Specifically, if consecutive key iterators \c i and \c (i + 1) are 
+ *  such that <tt>binary_pred(*i, *(i+1))</tt> is \c true, then the corresponding
+ *  values are reduced to a single value with \c binary_op.
+ *
+ *  This version of \p reduce_by_key uses the function object \c binary_pred
+ *  to test for equality and \c binary_op to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_output The beginning of the output key range.
+ *  \param values_output The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \param binary_op The binary function used to accumulate values.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p reduce_by_key to
+ *  compact a sequence of key/value pairs and sum values with equal keys.
+ *
+ *  \code
+ *  #include <thrust/reduce.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int> binary_op;
+ *  new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred, binary_op);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4.
+ *  \endcode
+ *  
+ *  \see reduce
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see unique_by_key_copy
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  reduce_by_key(InputIterator1 keys_first, 
+                InputIterator1 keys_last,
+                InputIterator2 values_first,
+                OutputIterator1 keys_output,
+                OutputIterator2 values_output,
+                BinaryPredicate binary_pred,
+                BinaryFunction binary_op);
+
+
+/*! \} // end reductions
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/reduce.inl>
+
diff --git a/thrust/thrust/remove.h b/thrust/thrust/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e8ec41a60883cc81a67530ec3cf50dc9d00c730
--- /dev/null
+++ b/thrust/thrust/remove.h
@@ -0,0 +1,806 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file remove.h
+ *  \brief Functions for removing elements from a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup stream_compaction Stream Compaction
+ *  \ingroup reordering
+ *  \{
+ *
+ */
+
+
+/*! \p remove removes from the range <tt>[first, last)</tt> all elements that are
+ *  equal to \p value. That is, \p remove returns an iterator \p new_last such
+ *  that the range <tt>[first, new_last)</tt> contains no elements equal to
+ *  \p value. The iterators in the range <tt>[new_first,last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified. \p remove
+ *  is stable, meaning that the relative order of elements that are not equal to
+ *  \p value is unchanged.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param value The value to remove from the range <tt>[first, last)</tt>.
+ *         Elements which are equal to value are removed from the sequence.
+ *  \return A \p ForwardIterator pointing to the end of the resulting range of
+ *          elements which are not equal to \p value.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p remove to remove a number
+ *  of interest from a range using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {3, 1, 4, 1, 5, 9};
+ *  int *new_end = thrust::remove(A, A + N, 1);
+ *  // The first four values of A are now {3, 4, 5, 9}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any
+ *  iterators, and does not change the distance between \p first and \p last.
+ *  (There's no way that it could do anything of the sort.) So, for example, if
+ *  \c V is a device_vector, <tt>remove(V.begin(), V.end(), 0)</tt> does not
+ *  change <tt>V.size()</tt>: \c V will contain just as many elements as it did
+ *  before. \p remove returns an iterator that points to the end of the resulting
+ *  range after elements have been removed from it; it follows that the elements
+ *  after that iterator are of no interest, and may be discarded. If you are
+ *  removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  simply erase them. That is, a reasonable way of removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see remove_if
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  ForwardIterator remove(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value);
+
+
+/*! \p remove removes from the range <tt>[first, last)</tt> all elements that are
+ *  equal to \p value. That is, \p remove returns an iterator \p new_last such
+ *  that the range <tt>[first, new_last)</tt> contains no elements equal to
+ *  \p value. The iterators in the range <tt>[new_first,last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified. \p remove
+ *  is stable, meaning that the relative order of elements that are not equal to
+ *  \p value is unchanged.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param value The value to remove from the range <tt>[first, last)</tt>.
+ *         Elements which are equal to value are removed from the sequence.
+ *  \return A \p ForwardIterator pointing to the end of the resulting range of
+ *          elements which are not equal to \p value.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p remove to remove a number
+ *  of interest from a range.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {3, 1, 4, 1, 5, 9};
+ *  int *new_end = thrust::remove(A, A + N, 1);
+ *  // The first four values of A are now {3, 4, 5, 9}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any
+ *  iterators, and does not change the distance between \p first and \p last.
+ *  (There's no way that it could do anything of the sort.) So, for example, if
+ *  \c V is a device_vector, <tt>remove(V.begin(), V.end(), 0)</tt> does not
+ *  change <tt>V.size()</tt>: \c V will contain just as many elements as it did
+ *  before. \p remove returns an iterator that points to the end of the resulting
+ *  range after elements have been removed from it; it follows that the elements
+ *  after that iterator are of no interest, and may be discarded. If you are
+ *  removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  simply erase them. That is, a reasonable way of removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see remove_if
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename ForwardIterator,
+         typename T>
+  ForwardIterator remove(ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value);
+
+
+/*! \p remove_copy copies elements that are not equal to \p value from the range
+ *  <tt>[first, last)</tt> to a range beginning at \p result. The return value is
+ *  the end of the resulting range. This operation is stable, meaning that the
+ *  relative order of the elements that are copied is the same as in
+ *  the range <tt>[first, last)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param value The value to omit from the copied range.
+ *  \return An OutputIterator pointing to the end of the resulting range of elements
+ *          which are not equal to \p value.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy to copy
+ *  a sequence of numbers to an output range while omitting a value of interest using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[N-2];
+ *  thrust::remove_copy(thrust::host, V, V + N, result, 0);
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-2, -1, 1, 2}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see remove
+ *  \see remove_if
+ *  \see remove_copy_if
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator remove_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value);
+
+
+/*! \p remove_copy copies elements that are not equal to \p value from the range
+ *  <tt>[first, last)</tt> to a range beginning at \p result. The return value is
+ *  the end of the resulting range. This operation is stable, meaning that the
+ *  relative order of the elements that are copied is the same as in
+ *  the range <tt>[first, last)</tt>.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param value The value to omit from the copied range.
+ *  \return An OutputIterator pointing to the end of the resulting range of elements
+ *          which are not equal to \p value.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy to copy
+ *  a sequence of numbers to an output range while omitting a value of interest.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[N-2];
+ *  thrust::remove_copy(V, V + N, result, 0);
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-2, -1, 1, 2}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see remove
+ *  \see remove_if
+ *  \see remove_copy_if
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+  OutputIterator remove_copy(InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value);
+
+
+/*! \p remove_if removes from the range <tt>[first, last)</tt> every element \p x
+ *  such that <tt>pred(x)</tt> is \c true. That is, \p remove_if returns an
+ *  iterator \c new_last such that the range <tt>[first,new_last)</tt> contains
+ *  no elements for which \p pred is \c true. The iterators in the range
+ *  <tt>[new_last,last)</tt> are all still dereferenceable, but the elements that
+ *  they point to are unspecified. \p remove_if is stable, meaning that the
+ *  relative order of elements that are not removed is unchanged.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param pred A predicate to evaluate for each element of the range
+ *              <tt>[first,last)</tt>. Elements for which \p pred evaluates to
+ *              \c true are removed from the sequence.
+ *  \return A ForwardIterator pointing to the end of the resulting range of
+ *          elements for which \p pred evaluated to \c true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_if to remove
+ *  all even numbers from an array of integers using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  int *new_end = thrust::remove_if(thrust::host, A, A + N, is_even());
+ *  // The first three values of A are now {1, 5, 7}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The meaning of "removal" is somewhat subtle. \p remove_if does not
+ *  destroy any iterators, and does not change the distance between \p first and
+ *  \p last. (There's no way that it could do anything of the sort.) So, for
+ *  example, if \c V is a device_vector,
+ *  <tt>remove_if(V.begin(), V.end(), pred)</tt> does not change
+ *  <tt>V.size()</tt>: \c V will contain just as many elements as it did before.
+ *  \p remove_if returns an iterator that points to the end of the resulting
+ *  range after elements have been removed from it; it follows that the elements
+ *  after that iterator are of no interest, and may be discarded. If you are
+ *  removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  simply erase them. That is, a reasonable way of removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+/*! \p remove_if removes from the range <tt>[first, last)</tt> every element \p x
+ *  such that <tt>pred(x)</tt> is \c true. That is, \p remove_if returns an
+ *  iterator \c new_last such that the range <tt>[first,new_last)</tt> contains
+ *  no elements for which \p pred is \c true. The iterators in the range
+ *  <tt>[new_last,last)</tt> are all still dereferenceable, but the elements that
+ *  they point to are unspecified. \p remove_if is stable, meaning that the
+ *  relative order of elements that are not removed is unchanged.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param pred A predicate to evaluate for each element of the range
+ *              <tt>[first,last)</tt>. Elements for which \p pred evaluates to
+ *              \c true are removed from the sequence.
+ *  \return A ForwardIterator pointing to the end of the resulting range of
+ *          elements for which \p pred evaluated to \c true.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_if to remove
+ *  all even numbers from an array of integers.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  int *new_end = thrust::remove_if(A, A + N, is_even());
+ *  // The first three values of A are now {1, 5, 7}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The meaning of "removal" is somewhat subtle. \p remove_if does not
+ *  destroy any iterators, and does not change the distance between \p first and
+ *  \p last. (There's no way that it could do anything of the sort.) So, for
+ *  example, if \c V is a device_vector,
+ *  <tt>remove_if(V.begin(), V.end(), pred)</tt> does not change
+ *  <tt>V.size()</tt>: \c V will contain just as many elements as it did before.
+ *  \p remove_if returns an iterator that points to the end of the resulting
+ *  range after elements have been removed from it; it follows that the elements
+ *  after that iterator are of no interest, and may be discarded. If you are
+ *  removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  simply erase them. That is, a reasonable way of removing elements from a
+ *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+/*! \p remove_copy_if copies elements from the range <tt>[first,last)</tt> to a
+ *  range beginning at \p result, except that elements for which \p pred is
+ *  \c true are not copied. The return value is the end of the resulting range.
+ *  This operation is stable, meaning that the relative order of the elements that
+ *  are copied is the same as the range <tt>[first,last)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param pred A predicate to evaluate for each element of the range <tt>[first,last)</tt>.
+ *              Elements for which \p pred evaluates to \c false are not copied
+ *              to the resulting sequence.
+ *  \return An OutputIterator pointing to the end of the resulting range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy_if to copy
+ *  a sequence of numbers to an output range while omitting even numbers using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[2];
+ *  thrust::remove_copy_if(thrust::host, V, V + N, result, is_even());
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_if
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \p remove_copy_if copies elements from the range <tt>[first,last)</tt> to a
+ *  range beginning at \p result, except that elements for which \p pred is
+ *  \c true are not copied. The return value is the end of the resulting range.
+ *  This operation is stable, meaning that the relative order of the elements that
+ *  are copied is the same as the range <tt>[first,last)</tt>.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param pred A predicate to evaluate for each element of the range <tt>[first,last)</tt>.
+ *              Elements for which \p pred evaluates to \c false are not copied
+ *              to the resulting sequence.
+ *  \return An OutputIterator pointing to the end of the resulting range.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy_if to copy
+ *  a sequence of numbers to an output range while omitting even numbers.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(const int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int result[2];
+ *  thrust::remove_copy_if(V, V + N, result, is_even());
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_if
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \p remove_if removes from the range <tt>[first, last)</tt> every element \p x
+ *  such that <tt>pred(x)</tt> is \c true. That is, \p remove_if returns an
+ *  iterator \c new_last such that the range <tt>[first, new_last)</tt> contains
+ *  no elements for which \p pred of the corresponding stencil value is \c true. 
+ *  The iterators in the range <tt>[new_last,last)</tt> are all still dereferenceable,
+ *  but the elements that they point to are unspecified. \p remove_if is stable,
+ *  meaning that the relative order of elements that are not removed is unchanged.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A predicate to evaluate for each element of the range
+ *              <tt>[stencil, stencil + (last - first))</tt>. Elements for which \p pred evaluates to
+ *              \c true are removed from the sequence <tt>[first, last)</tt>
+ *  \return A ForwardIterator pointing to the end of the resulting range of
+ *          elements for which \p pred evaluated to \c true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_if to remove
+ *  specific elements from an array of integers using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  int S[N] = {0, 1, 1, 1, 0, 0};
+ *
+ *  int *new_end = thrust::remove_if(thrust::host, A, A + N, S, thrust::identity<int>());
+ *  // The first three values of A are now {1, 5, 7}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+/*! \p remove_if removes from the range <tt>[first, last)</tt> every element \p x
+ *  such that <tt>pred(x)</tt> is \c true. That is, \p remove_if returns an
+ *  iterator \c new_last such that the range <tt>[first, new_last)</tt> contains
+ *  no elements for which \p pred of the corresponding stencil value is \c true. 
+ *  The iterators in the range <tt>[new_last,last)</tt> are all still dereferenceable,
+ *  but the elements that they point to are unspecified. \p remove_if is stable,
+ *  meaning that the relative order of elements that are not removed is unchanged.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred A predicate to evaluate for each element of the range
+ *              <tt>[stencil, stencil + (last - first))</tt>. Elements for which \p pred evaluates to
+ *              \c true are removed from the sequence <tt>[first, last)</tt>
+ *  \return A ForwardIterator pointing to the end of the resulting range of
+ *          elements for which \p pred evaluated to \c true.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_if to remove
+ *  specific elements from an array of integers.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  int S[N] = {0, 1, 1, 1, 0, 0};
+ *
+ *  int *new_end = thrust::remove_if(A, A + N, S, thrust::identity<int>());
+ *  // The first three values of A are now {1, 5, 7}
+ *  // Values beyond new_end are unspecified
+ *  \endcode
+ *
+ *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_copy_if
+ */
+template<typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+/*! \p remove_copy_if copies elements from the range <tt>[first,last)</tt> to a
+ *  range beginning at \p result, except that elements for which \p pred of the 
+ *  corresponding stencil value is \c true are not copied. The return value is 
+ *  the end of the resulting range.  This operation is stable, meaning that the
+ *  relative order of the elements that are copied is the same as the 
+ *  range <tt>[first,last)</tt>.
+ *
+ *  The algorithm's execution policy is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param pred A predicate to evaluate for each element of the range <tt>[first,last)</tt>.
+ *              Elements for which \p pred evaluates to \c false are not copied
+ *              to the resulting sequence.
+ *  \return An OutputIterator pointing to the end of the resulting range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy_if to copy
+ *  a sequence of numbers to an output range while omitting specific elements using the \p thrust::host
+ *  execution policy for parallelization.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int S[N] = { 1, 1,  0, 1, 0, 1};
+ *  int result[2];
+ *  thrust::remove_copy_if(thrust::host, V, V + N, S, result, thrust::identity<int>());
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_if
+ *  \see copy_if
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \p remove_copy_if copies elements from the range <tt>[first,last)</tt> to a
+ *  range beginning at \p result, except that elements for which \p pred of the 
+ *  corresponding stencil value is \c true are not copied. The return value is 
+ *  the end of the resulting range.  This operation is stable, meaning that the
+ *  relative order of the elements that are copied is the same as the 
+ *  range <tt>[first,last)</tt>.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The resulting range is copied to the sequence beginning at this
+ *                location.
+ *  \param pred A predicate to evaluate for each element of the range <tt>[first,last)</tt>.
+ *              Elements for which \p pred evaluates to \c false are not copied
+ *              to the resulting sequence.
+ *  \return An OutputIterator pointing to the end of the resulting range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p remove_copy_if to copy
+ *  a sequence of numbers to an output range while omitting specific elements.
+ *
+ *  \code
+ *  #include <thrust/remove.h>
+ *  ...
+ *  const int N = 6;
+ *  int V[N] = {-2, 0, -1, 0, 1, 2};
+ *  int S[N] = { 1, 1,  0, 1, 0, 1};
+ *  int result[2];
+ *  thrust::remove_copy_if(V, V + N, S, result, thrust::identity<int>());
+ *  // V remains {-2, 0, -1, 0, 1, 2}
+ *  // result is now {-1, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see remove
+ *  \see remove_copy
+ *  \see remove_if
+ *  \see copy_if
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \} // end stream_compaction
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/remove.inl>
+
diff --git a/thrust/thrust/replace.h b/thrust/thrust/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..225cb060a6ee01d23eaac574f6d03ad7a964a22b
--- /dev/null
+++ b/thrust/thrust/replace.h
@@ -0,0 +1,823 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file replace.h
+ *  \brief Functions for replacing elements in a range with a particular value
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ *  \addtogroup replacing
+ *  \ingroup transformations
+ *  \{
+ */
+
+
+/*! \p replace replaces every element in the range [first, last) equal to \p old_value
+ *  with \p new_value. That is: for every iterator \c i, if <tt>*i == old_value</tt>
+ *  then it performs the <tt>assignment *i = new_value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param old_value The value to replace.
+ *  \param new_value The new value to replace \p old_value.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
+ *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *          objects of \p T may be compared for equality with objects of
+ *          \p ForwardIterator's \c value_type,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace to replace
+ *  a value of interest in a \c device_vector with another using the \p thrust::device
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] = 1;
+ *  A[1] = 2;
+ *  A[2] = 3;
+ *  A[3] = 1;
+ *
+ *  thrust::replace(thrust::device, A.begin(), A.end(), 1, 99);
+ *
+ *  // A contains [99, 2, 3, 99]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see \c replace_if
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void replace(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               ForwardIterator first, ForwardIterator last,
+               const T &old_value,
+               const T &new_value);
+
+
+/*! \p replace replaces every element in the range [first, last) equal to \p old_value
+ *  with \p new_value. That is: for every iterator \c i, if <tt>*i == old_value</tt>
+ *  then it performs the <tt>assignment *i = new_value</tt>.
+ *
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param old_value The value to replace.
+ *  \param new_value The new value to replace \p old_value.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
+ *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *          objects of \p T may be compared for equality with objects of
+ *          \p ForwardIterator's \c value_type,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace to replace
+ *  a value of interest in a \c device_vector with another.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] = 1;
+ *  A[1] = 2;
+ *  A[2] = 3;
+ *  A[3] = 1;
+ *
+ *  thrust::replace(A.begin(), A.end(), 1, 99);
+ *
+ *  // A contains [99, 2, 3, 99]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see \c replace_if
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename ForwardIterator, typename T>
+  void replace(ForwardIterator first, ForwardIterator last, const T &old_value,
+               const T &new_value);
+
+
+/*! \p replace_if replaces every element in the range <tt>[first, last)</tt> for which
+ *  \p pred returns \c true with \p new_value. That is: for every iterator \c i, if
+ *  <tt>pred(*i)</tt> is \c true then it performs the assignment <tt>*i = new_value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
+ *         to \c true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace_if to replace
+ *  a \c device_vector's negative elements with \c 0 using the \p thrust::device execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  1;
+ *  A[1] = -3;
+ *  A[2] =  2;
+ *  A[3] = -1;
+ *
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_if(thrust::device, A.begin(), A.end(), pred, 0);
+ *
+ *  // A contains [1, 0, 2, 0]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see \c replace
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  ForwardIterator first, ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value);
+
+
+/*! \p replace_if replaces every element in the range <tt>[first, last)</tt> for which
+ *  \p pred returns \c true with \p new_value. That is: for every iterator \c i, if
+ *  <tt>pred(*i)</tt> is \c true then it performs the assignment <tt>*i = new_value</tt>.
+ *
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
+ *         to \c true.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace_if to replace
+ *  a \c device_vector's negative elements with \c 0.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  1;
+ *  A[1] = -3;
+ *  A[2] =  2;
+ *  A[3] = -1;
+ *
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_if(A.begin(), A.end(), pred, 0);
+ *
+ *  // A contains [1, 0, 2, 0]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see \c replace
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename ForwardIterator, typename Predicate, typename T>
+  void replace_if(ForwardIterator first, ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value);
+
+
+/*! \p replace_if replaces every element in the range <tt>[first, last)</tt> for which
+ *  <tt>pred(*s)</tt> returns \c true with \p new_value. That is: for every iterator
+ *  \c i in the range <tt>[first, last)</tt>, and \c s in the range <tt>[stencil, stencil + (last - first))</tt>,
+ *  if <tt>pred(*s)</tt> is \c true then it performs the assignment <tt>*i = new_value</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
+ *         to \c true.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace_if to replace
+ *  a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero
+ *  using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  10;
+ *  A[1] =  20;
+ *  A[2] =  30;
+ *  A[3] =  40;
+ *
+ *  thrust::device_vector<int> S(4);
+ *  S[0] = -1;
+ *  S[1] =  0;
+ *  S[2] = -1;
+ *  S[3] =  0;
+ *
+ *  is_less_than_zero pred;
+ *  thrust::replace_if(thrust::device, A.begin(), A.end(), S.begin(), pred, 0);
+ *
+ *  // A contains [0, 20, 0, 40]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see \c replace
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  ForwardIterator first, ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value);
+
+
+/*! \p replace_if replaces every element in the range <tt>[first, last)</tt> for which
+ *  <tt>pred(*s)</tt> returns \c true with \p new_value. That is: for every iterator
+ *  \c i in the range <tt>[first, last)</tt>, and \c s in the range <tt>[stencil, stencil + (last - first))</tt>,
+ *  if <tt>pred(*s)</tt> is \c true then it performs the assignment <tt>*i = new_value</tt>.
+ *
+ *  \param first The beginning of the sequence of interest.
+ *  \param last The end of the sequence of interest.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
+ *         to \c true.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p replace_if to replace
+ *  a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  10;
+ *  A[1] =  20;
+ *  A[2] =  30;
+ *  A[3] =  40;
+ *
+ *  thrust::device_vector<int> S(4);
+ *  S[0] = -1;
+ *  S[1] =  0;
+ *  S[2] = -1;
+ *  S[3] =  0;
+ *
+ *  is_less_than_zero pred;
+ *  thrust::replace_if(A.begin(), A.end(), S.begin(), pred, 0);
+ *
+ *  // A contains [0, 20, 0, 40]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see \c replace
+ *  \see \c replace_copy
+ *  \see \c replace_copy_if
+ */
+template<typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+  void replace_if(ForwardIterator first, ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value);
+
+
+/*! \p replace_copy copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element equal to \p old_value
+ *  is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>, \p replace_copy
+ *  performs the assignment <tt>*(result+n) = new_value</tt> if <tt>*(first+n) == old_value</tt>,
+ *  and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param old_value The value to replace.
+ *  \param new_value The replacement value for which <tt>*i == old_value</tt> evaluates to \c true.
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          \p T may be compared for equality with \p InputIterator's \c value_type,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> A(4);
+ *  A[0] = 1;
+ *  A[1] = 2;
+ *  A[2] = 3;
+ *  A[3] = 1;
+ *
+ *  thrust::device_vector<int> B(4);
+ *
+ *  thrust::replace_copy(thrust::device, A.begin(), A.end(), B.begin(), 1, 99);
+ *
+ *  // B contains [99, 2, 3, 99]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see \c copy
+ *  \see \c replace
+ *  \see \c replace_if
+ *  \see \c replace_copy_if
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename T>
+__host__ __device__
+  OutputIterator replace_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator first, InputIterator last,
+                              OutputIterator result,
+                              const T &old_value,
+                              const T &new_value);
+
+
+/*! \p replace_copy copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element equal to \p old_value
+ *  is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>, \p replace_copy
+ *  performs the assignment <tt>*(result+n) = new_value</tt> if <tt>*(first+n) == old_value</tt>,
+ *  and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param old_value The value to replace.
+ *  \param new_value The replacement value for which <tt>*i == old_value</tt> evaluates to \c true.
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *          \p T may be compared for equality with \p InputIterator's \c value_type,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> A(4);
+ *  A[0] = 1;
+ *  A[1] = 2;
+ *  A[2] = 3;
+ *  A[3] = 1;
+ *
+ *  thrust::device_vector<int> B(4);
+ *
+ *  thrust::replace_copy(A.begin(), A.end(), B.begin(), 1, 99);
+ *
+ *  // B contains [99, 2, 3, 99]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see \c copy
+ *  \see \c replace
+ *  \see \c replace_if
+ *  \see \c replace_copy_if
+ */
+template<typename InputIterator, typename OutputIterator, typename T>
+  OutputIterator replace_copy(InputIterator first, InputIterator last,
+                              OutputIterator result, const T &old_value,
+                              const T &new_value);
+
+
+/*! \p replace_copy_if copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element for which \p pred
+ *  is \c true is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that 0 <= n < last-first,
+ *  \p replace_copy_if performs the assignment <tt>*(result+n) = new_value</tt> if
+ *  <tt>pred(*(first+n))</tt>, and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The replacement value to assign <tt>pred(*i)</tt> evaluates to \c true.
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  1;
+ *  A[1] = -3;
+ *  A[2] =  2;
+ *  A[3] = -1;
+ 
+ *  thrust::device_vector<int> B(4);
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_copy_if(thrust::device, A.begin(), A.end(), B.begin(), pred, 0);
+ *
+ *  // B contains [1, 0, 2, 0]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see \c replace
+ *  \see \c replace_if
+ *  \see \c replace_copy
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                 InputIterator first, InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+/*! \p replace_copy_if copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element for which \p pred
+ *  is \c true is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that 0 <= n < last-first,
+ *  \p replace_copy_if performs the assignment <tt>*(result+n) = new_value</tt> if
+ *  <tt>pred(*(first+n))</tt>, and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param pred The predicate to test on every value of the range <tt>[first,last)</tt>.
+ *  \param new_value The replacement value to assign <tt>pred(*i)</tt> evaluates to \c true.
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  1;
+ *  A[1] = -3;
+ *  A[2] =  2;
+ *  A[3] = -1;
+ 
+ *  thrust::device_vector<int> B(4);
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_copy_if(A.begin(), A.end(), B.begin(), pred, 0);
+ *
+ *  // B contains [1, 0, 2, 0]
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see \c replace
+ *  \see \c replace_if
+ *  \see \c replace_copy
+ */
+template<typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(InputIterator first, InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+/*! This version of \p replace_copy_if copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element whose corresponding stencil
+ *  element causes \p pred to be \c true is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p replace_copy_if performs the assignment <tt>*(result+n) = new_value</tt> if
+ *  <tt>pred(*(stencil+n))</tt>, and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param pred The predicate to test on every value of the range <tt>[stencil, stencil + (last - first))</tt>.
+ *  \param new_value The replacement value to assign when <tt>pred(*s)</tt> evaluates to \c true. 
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *  \pre \p stencil may equal \p result, but the ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  10;
+ *  A[1] =  20;
+ *  A[2] =  30;
+ *  A[3] =  40;
+ *
+ *  thrust::device_vector<int> S(4);
+ *  S[0] = -1;
+ *  S[1] =  0;
+ *  S[2] = -1;
+ *  S[3] =  0;
+ *
+ *  thrust::device_vector<int> B(4);
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_if(thrust::device, A.begin(), A.end(), S.begin(), B.begin(), pred, 0);
+ *
+ *  // B contains [0, 20, 0, 40]
+ *  \endcode
+ *
+ *  \see \c replace_copy
+ *  \see \c replace_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                 InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+/*! This version of \p replace_copy_if copies elements from the range <tt>[first, last)</tt> to the range
+ *  <tt>[result, result + (last-first))</tt>, except that any element whose corresponding stencil
+ *  element causes \p pred to be \c true is not copied; \p new_value is copied instead.
+ *
+ *  More precisely, for every integer \c n such that <tt>0 <= n < last-first</tt>,
+ *  \p replace_copy_if performs the assignment <tt>*(result+n) = new_value</tt> if
+ *  <tt>pred(*(stencil+n))</tt>, and <tt>*(result+n) = *(first+n)</tt> otherwise.
+ *
+ *  \param first The beginning of the sequence to copy from.
+ *  \param last The end of the sequence to copy from.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the sequence to copy to.
+ *  \param pred The predicate to test on every value of the range <tt>[stencil, stencil + (last - first))</tt>.
+ *  \param new_value The replacement value to assign when <tt>pred(*s)</tt> evaluates to \c true. 
+ *  \return <tt>result + (last-first)</tt>
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *  \pre \p stencil may equal \p result, but the ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  \code
+ *  #include <thrust/replace.h>
+ *  #include <thrust/device_vector.h>
+ *
+ *  struct is_less_than_zero
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x < 0;
+ *    }
+ *  };
+ *  
+ *  ...
+ *  
+ *  thrust::device_vector<int> A(4);
+ *  A[0] =  10;
+ *  A[1] =  20;
+ *  A[2] =  30;
+ *  A[3] =  40;
+ *
+ *  thrust::device_vector<int> S(4);
+ *  S[0] = -1;
+ *  S[1] =  0;
+ *  S[2] = -1;
+ *  S[3] =  0;
+ *
+ *  thrust::device_vector<int> B(4);
+ *  is_less_than_zero pred;
+ *
+ *  thrust::replace_if(A.begin(), A.end(), S.begin(), B.begin(), pred, 0);
+ *
+ *  // B contains [0, 20, 0, 40]
+ *  \endcode
+ *
+ *  \see \c replace_copy
+ *  \see \c replace_if
+ */
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+/*! \} // end replacing
+ *  \} // transformations
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/replace.inl>
+
diff --git a/thrust/thrust/reverse.h b/thrust/thrust/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..73bd9579f78edbac367d1f5bd4a237420a35c84c
--- /dev/null
+++ b/thrust/thrust/reverse.h
@@ -0,0 +1,215 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reverse.h
+ *  \brief Reverses the order of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reordering
+ *  \ingroup algorithms
+ */
+
+
+/*! \p reverse reverses a range. That is: for every <tt>i</tt> such that
+ *  <tt>0 <= i <= (last - first) / 2</tt>, it exchanges <tt>*(first + i)</tt>
+ *  and <tt>*(last - (i + 1))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range to reverse.
+ *  \param last The end of the range to reverse.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *          \p BidirectionalIterator is mutable.
+ *
+ *  The following code snippet demonstrates how to use \p reverse to reverse a
+ *  \p device_vector of integers using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/reverse.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int data[N] = {0, 1, 2, 3, 4, 5};
+ *  thrust::device_vector<int> v(data, data + N);
+ *  thrust::reverse(thrust::device, v.begin(), v.end());
+ *  // v is now {5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see \p reverse_copy
+ *  \see \p reverse_iterator
+ */
+template<typename DerivedPolicy, typename BidirectionalIterator>
+__host__ __device__
+  void reverse(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               BidirectionalIterator first,
+               BidirectionalIterator last);
+
+
+/*! \p reverse reverses a range. That is: for every <tt>i</tt> such that
+ *  <tt>0 <= i <= (last - first) / 2</tt>, it exchanges <tt>*(first + i)</tt>
+ *  and <tt>*(last - (i + 1))</tt>.
+ *
+ *  \param first The beginning of the range to reverse.
+ *  \param last The end of the range to reverse.
+ *
+ *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *          \p BidirectionalIterator is mutable.
+ *
+ *  The following code snippet demonstrates how to use \p reverse to reverse a
+ *  \p device_vector of integers.
+ *
+ *  \code
+ *  #include <thrust/reverse.h>
+ *  ...
+ *  const int N = 6;
+ *  int data[N] = {0, 1, 2, 3, 4, 5};
+ *  thrust::device_vector<int> v(data, data + N);
+ *  thrust::reverse(v.begin(), v.end());
+ *  // v is now {5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see \p reverse_copy
+ *  \see \p reverse_iterator
+ */
+template<typename BidirectionalIterator>
+  void reverse(BidirectionalIterator first,
+               BidirectionalIterator last);
+
+
+/*! \p reverse_copy differs from \p reverse only in that the reversed range
+ *  is written to a different output range, rather than inplace.
+ *
+ *  \p reverse_copy copies elements from the range <tt>[first, last)</tt> to the
+ *  range <tt>[result, result + (last - first))</tt> such that the copy is a 
+ *  reverse of the original range. Specifically: for every <tt>i</tt> such that
+ *  <tt>0 <= i < (last - first)</tt>, \p reverse_copy performs the assignment
+ *  <tt>*(result + (last - first) - i) = *(first + i)</tt>.
+ *
+ *  The return value is <tt>result + (last - first))</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range to reverse.
+ *  \param last The end of the range to reverse.
+ *  \param result The beginning of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p reverse_copy to reverse
+ *  an input \p device_vector of integers to an output \p device_vector using the \p thrust::device
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/reverse.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int data[N] = {0, 1, 2, 3, 4, 5};
+ *  thrust::device_vector<int> input(data, data + N);
+ *  thrust::device_vector<int> output(N);
+ *  thrust::reverse_copy(thrust::device, v.begin(), v.end(), output.begin());
+ *  // input is still {0, 1, 2, 3, 4, 5}
+ *  // output is now  {5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see \p reverse
+ *  \see \p reverse_iterator
+ */
+template<typename DerivedPolicy, typename BidirectionalIterator, typename OutputIterator>
+__host__ __device__
+  OutputIterator reverse_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result);
+
+
+/*! \p reverse_copy differs from \p reverse only in that the reversed range
+ *  is written to a different output range, rather than inplace.
+ *
+ *  \p reverse_copy copies elements from the range <tt>[first, last)</tt> to the
+ *  range <tt>[result, result + (last - first))</tt> such that the copy is a 
+ *  reverse of the original range. Specifically: for every <tt>i</tt> such that
+ *  <tt>0 <= i < (last - first)</tt>, \p reverse_copy performs the assignment
+ *  <tt>*(result + (last - first) - i) = *(first + i)</tt>.
+ *
+ *  The return value is <tt>result + (last - first))</tt>.
+ *
+ *  \param first The beginning of the range to reverse.
+ *  \param last The end of the range to reverse.
+ *  \param result The beginning of the output range.
+ *
+ *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p reverse_copy to reverse
+ *  an input \p device_vector of integers to an output \p device_vector.
+ *
+ *  \code
+ *  #include <thrust/reverse.h>
+ *  ...
+ *  const int N = 6;
+ *  int data[N] = {0, 1, 2, 3, 4, 5};
+ *  thrust::device_vector<int> input(data, data + N);
+ *  thrust::device_vector<int> output(N);
+ *  thrust::reverse_copy(v.begin(), v.end(), output.begin());
+ *  // input is still {0, 1, 2, 3, 4, 5}
+ *  // output is now  {5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see \p reverse
+ *  \see \p reverse_iterator
+ */
+template<typename BidirectionalIterator, typename OutputIterator>
+  OutputIterator reverse_copy(BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result);
+
+
+/*! \} // end reordering
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/reverse.inl>
+
diff --git a/thrust/thrust/scan.h b/thrust/thrust/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b79af04895ddab6df64b3080f713ac43e60173b
--- /dev/null
+++ b/thrust/thrust/scan.h
@@ -0,0 +1,1564 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan.h
+ *  \brief Functions for computing prefix sums
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+
+/*! \addtogroup prefixsums Prefix Sums
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p inclusive_scan computes an inclusive prefix sum operation. The
+ *  term 'inclusive' means that each result includes the corresponding
+ *  input operand in the partial sum. More precisely, <tt>*first</tt> is 
+ *  assigned to <tt>*result</tt> and the sum of <tt>*first</tt> and 
+ *  <tt>*(first + 1)</tt> is assigned to <tt>*(result + 1)</tt>, and so on. 
+ *  This version of \p inclusive_scan assumes plus as the associative operator.  
+ *  When the input and output sequences are the same, the scan is performed 
+ *  in-place.
+ 
+ *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
+ *  difference between the two functions is that \c std::partial_sum guarantees
+ *  a serial summation order, while \p inclusive_scan requires associativity of 
+ *  the binary operation to parallelize the prefix sum.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
+ *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
+ *                         defined.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place
+ *  prefix sum using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::inclusive_scan(thrust::host, data, data + 6, data); // in-place scan
+ *
+ *  // data is now {1, 1, 3, 5, 6, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+/*! \p inclusive_scan computes an inclusive prefix sum operation. The
+ *  term 'inclusive' means that each result includes the corresponding
+ *  input operand in the partial sum. More precisely, <tt>*first</tt> is 
+ *  assigned to <tt>*result</tt> and the sum of <tt>*first</tt> and 
+ *  <tt>*(first + 1)</tt> is assigned to <tt>*(result + 1)</tt>, and so on. 
+ *  This version of \p inclusive_scan assumes plus as the associative operator.  
+ *  When the input and output sequences are the same, the scan is performed 
+ *  in-place.
+ 
+ *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
+ *  difference between the two functions is that \c std::partial_sum guarantees
+ *  a serial summation order, while \p inclusive_scan requires associativity of 
+ *  the binary operation to parallelize the prefix sum.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
+ *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
+ *                         defined.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::inclusive_scan(data, data + 6, data); // in-place scan
+ *
+ *  // data is now {1, 1, 3, 5, 6, 9}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *
+ */
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator inclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+/*! \p inclusive_scan computes an inclusive prefix sum operation. The
+ *  term 'inclusive' means that each result includes the corresponding
+ *  input operand in the partial sum.  When the input and output sequences 
+ *  are the same, the scan is performed in-place.
+ *    
+ *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
+ *  difference between the two functions is that \c std::partial_sum guarantees
+ *  a serial summation order, while \p inclusive_scan requires associativity of 
+ *  the binary operation to parallelize the prefix sum.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *                         and \c OutputIterator's \c value_type is convertible to
+ *                         both \c AssociativeOperator's \c first_argument_type and
+ *                         \c second_argument_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place
+ *  prefix sum using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::maximum<int> binary_op;
+ *
+ *  thrust::inclusive_scan(thrust::host, data, data + 10, data, binary_op); // in-place scan
+ *
+ *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                AssociativeOperator binary_op);
+
+
+/*! \p inclusive_scan computes an inclusive prefix sum operation. The
+ *  term 'inclusive' means that each result includes the corresponding
+ *  input operand in the partial sum.  When the input and output sequences 
+ *  are the same, the scan is performed in-place.
+ *    
+ *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
+ *  difference between the two functions is that \c std::partial_sum guarantees
+ *  a serial summation order, while \p inclusive_scan requires associativity of 
+ *  the binary operation to parallelize the prefix sum.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *                         and \c OutputIterator's \c value_type is convertible to
+ *                         both \c AssociativeOperator's \c first_argument_type and
+ *                         \c second_argument_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan
+ *
+ *  \code
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::maximum<int> binary_op;
+ *
+ *  thrust::inclusive_scan(data, data + 10, data, binary_op); // in-place scan
+ *
+ *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename AssociativeOperator>
+  OutputIterator inclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  <tt>0</tt> is assigned to <tt>*result</tt> and the sum of 
+ *  <tt>0</tt> and <tt>*first</tt> is assigned to <tt>*(result + 1)</tt>,
+ *  and so on. This version of \p exclusive_scan assumes plus as the 
+ *  associative operator and \c 0 as the initial value.  When the input and 
+ *  output sequences are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
+ *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
+ *                         defined.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place
+ *  prefix sum using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::exclusive_scan(thrust::host, data, data + 6, data); // in-place scan
+ *
+ *  // data is now {0, 1, 1, 3, 5, 6}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  <tt>0</tt> is assigned to <tt>*result</tt> and the sum of 
+ *  <tt>0</tt> and <tt>*first</tt> is assigned to <tt>*(result + 1)</tt>,
+ *  and so on. This version of \p exclusive_scan assumes plus as the 
+ *  associative operator and \c 0 as the initial value.  When the input and 
+ *  output sequences are the same, the scan is performed in-place.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
+ *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
+ *                         defined.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::exclusive_scan(data, data + 6, data); // in-place scan
+ *
+ *  // data is now {0, 1, 1, 3, 5, 6}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename InputIterator,
+         typename OutputIterator>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  \p init is assigned to <tt>*result</tt> and the sum of \p init and 
+ *  <tt>*first</tt> is assigned to <tt>*(result + 1)</tt>, and so on. 
+ *  This version of \p exclusive_scan assumes plus as the associative 
+ *  operator but requires an initial value \p init.  When the input and 
+ *  output sequences are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param init The initial value.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place
+ *  prefix sum using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::exclusive_scan(thrust::host, data, data + 6, data, 4); // in-place scan
+ *
+ *  // data is now {4, 5, 5, 7, 9, 10}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  \p init is assigned to <tt>*result</tt> and the sum of \p init and 
+ *  <tt>*first</tt> is assigned to <tt>*(result + 1)</tt>, and so on. 
+ *  This version of \p exclusive_scan assumes plus as the associative 
+ *  operator but requires an initial value \p init.  When the input and 
+ *  output sequences are the same, the scan is performed in-place.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param init The initial value.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's
+ *                         \c value_type, then <tt>x + y</tt> is defined.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::exclusive_scan(data, data + 6, data, 4); // in-place scan
+ *
+ *  // data is now {4, 5, 5, 7, 9, 10}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  \p init is assigned to <tt>\*result</tt> and the value
+ *  <tt>binary_op(init, \*first)</tt> is assigned to <tt>\*(result + 1)</tt>,
+ *  and so on. This version of the function requires both an associative 
+ *  operator and an initial value \p init.  When the input and output
+ *  sequences are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param init The initial value.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *                         and \c OutputIterator's \c value_type is convertible to
+ *                         both \c AssociativeOperator's \c first_argument_type and
+ *                         \c second_argument_type.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place
+ *  prefix sum using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::maximum<int> binary_op;
+ *
+ *  thrust::exclusive_scan(thrust::host, data, data + 10, data, 1, binary_op); // in-place scan
+ *
+ *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan computes an exclusive prefix sum operation. The
+ *  term 'exclusive' means that each result does not include the 
+ *  corresponding input operand in the partial sum.  More precisely,
+ *  \p init is assigned to <tt>\*result</tt> and the value
+ *  <tt>binary_op(init, \*first)</tt> is assigned to <tt>\*(result + 1)</tt>,
+ *  and so on. This version of the function requires both an associative 
+ *  operator and an initial value \p init.  When the input and output
+ *  sequences are the same, the scan is performed in-place.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param init The initial value.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to
+ *                        \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *                         and \c OutputIterator's \c value_type is convertible to
+ *                         both \c AssociativeOperator's \c first_argument_type and
+ *                         \c second_argument_type.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::maximum<int> binary_op;
+ *
+ *  thrust::exclusive_scan(data, data + 10, data, 1, binary_op); // in-place scan
+ *
+ *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
+ *  \endcode
+ *  
+ *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename AssociativeOperator>
+  OutputIterator exclusive_scan(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                AssociativeOperator binary_op);
+
+
+/*! \addtogroup segmentedprefixsums Segmented Prefix Sums
+ *  \ingroup prefixsums
+ *  \{
+ */
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c equal_to as the binary
+ *  predicate used to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+ 
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c equal_to as the binary
+ *  predicate used to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key uses the binary predicate 
+ *  \c pred to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec. 
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param binary_pred  The binary predicate used to determine equality of keys.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key uses the binary predicate 
+ *  \c pred to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param binary_pred  The binary predicate used to determine equality of keys.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key uses the binary predicate 
+ *  \c pred to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key uses the associative operator 
+ *  \c binary_op to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param binary_pred  The binary predicate used to determine equality of keys.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int>     binary_op;
+ *
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'inclusive' means that each result includes 
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate inclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p inclusive_scan_by_key uses the binary predicate 
+ *  \c pred to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1)</tt>
+ *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
+ *  different segments otherwise.
+ *
+ *  This version of \p inclusive_scan_by_key uses the associative operator 
+ *  \c binary_op to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param binary_pred  The binary predicate used to determine equality of keys.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int>     binary_op;
+ *
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
+ *
+ *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ *  \endcode
+ *
+ *  \see inclusive_scan
+ *  \see exclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive segmented prefix 
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c 0 to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ * 
+ *  This version of \p exclusive_scan_by_key assumes \c equal_to as the binary
+ *  predicate used to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1</tt>
+ *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
+ *  different segments otherwise.
+ *
+ *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals); // in-place scan
+ *
+ *  // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive segmented prefix 
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c 0 to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key assumes \c plus as the associative
+ *  operator used to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ * 
+ *  This version of \p exclusive_scan_by_key assumes \c equal_to as the binary
+ *  predicate used to compare adjacent keys.  Specifically, consecutive iterators
+ *  <tt>i</tt> and <tt>i+1</tt> in the range <tt>[first1, last1</tt>
+ *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
+ *  different segments otherwise.
+ *
+ *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key.
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  thrust::exclusive_scan_by_key(key, key + 10, vals, vals); // in-place scan
+ *
+ *  // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \return The end of the output sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key using the \p
+ *  thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \return The end of the output sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ *  to compare adjacent keys.  Specifically, consecutive iterators <tt>i</tt> and
+ *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
+ *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \param binary_pred The binary predicate used to determine equality of keys.
+ *  \return The end of the output sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *
+ *  thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init, binary_pred); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ *  to compare adjacent keys.  Specifically, consecutive iterators <tt>i</tt> and
+ *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
+ *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \param binary_pred The binary predicate used to determine equality of keys.
+ *  \return The end of the output sequence.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *
+ *  thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init, binary_pred); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ *  to compare adjacent keys.  Specifically, consecutive iterators <tt>i</tt> and
+ *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if 
+ *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
+ *
+ *  This version of \p exclusive_scan_by_key uses the associative operator 
+ *  \c binary_op to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \param binary_pred The binary predicate used to determine equality of keys.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int>     binary_op;
+ *
+ *  thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init, binary_pred, binary_op); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix 
+ *  sum operation. The term 'exclusive' means that each result does not include
+ *  the corresponding input operand in the partial sum. The term 'segmented'
+ *  means that the partial sums are broken into distinct segments.  In other
+ *  words, within each segment a separate exclusive scan operation is computed.
+ *  Refer to the code sample below for example usage.
+ *
+ *  This version of \p exclusive_scan_by_key uses the value \c init to
+ *  initialize the exclusive scan operation.
+ *
+ *  This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ *  to compare adjacent keys.  Specifically, consecutive iterators <tt>i</tt> and
+ *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if 
+ *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
+ *
+ *  This version of \p exclusive_scan_by_key uses the associative operator 
+ *  \c binary_op to perform the prefix sum. When the input and output sequences
+ *  are the same, the scan is performed in-place.
+ *
+ *  \param first1 The beginning of the key sequence.
+ *  \param last1 The end of the key sequence.
+ *  \param first2 The beginning of the input value sequence.
+ *  \param result The beginning of the output value sequence.
+ *  \param init The initial of the exclusive sum value.
+ *  \param binary_pred The binary predicate used to determine equality of keys.
+ *  \param binary_op The associatve operator used to 'sum' values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
+ *                         <tt>binary_op(x,y)</tt> is defined.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ *  \code
+ *  #include <thrust/scan.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *  int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ *  int init = 5;
+ *
+ *  thrust::equal_to<int> binary_pred;
+ *  thrust::plus<int>     binary_op;
+ *
+ *  thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init, binary_pred, binary_op); // in-place scan
+ *
+ *  // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ *  \endcode
+ *
+ *  \see exclusive_scan
+ *  \see inclusive_scan_by_key
+ *
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \} // end segmentedprefixsums
+ */
+
+
+/*! \} // end prefix sums
+ */
+
+	
+} // end namespace thrust
+
+#include <thrust/detail/scan.inl>
+
diff --git a/thrust/thrust/scatter.h b/thrust/thrust/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..baaf1e63b1e28fbe8b071ca0fb6666145bfe7c1f
--- /dev/null
+++ b/thrust/thrust/scatter.h
@@ -0,0 +1,423 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scatter.h
+ *  \brief Irregular copying to a destination range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup scattering
+ *  \ingroup copying
+ *  \{
+ */
+
+
+/*! \p scatter copies elements from a source range into an output array
+ *  according to a map. For each iterator \c i in the range [\p first, \p last),
+ *  the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>. The 
+ *  output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>,
+ *  the result is undefined.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map  Beginning of the sequence of output indices.
+ *  \param result Destination of the source elements.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`.
+ *
+ *  The following code snippet demonstrates how to use \p scatter to
+ *  reorder a range using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  // mark even indices with a 1; odd indices with a 0
+ *  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // scatter all even indices into the first half of the
+ *  // range, and odd indices vice versa
+ *  int map[10]   = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10);
+ *  thrust::scatter(thrust::device,
+ *                  d_values.begin(), d_values.end(),
+ *                  d_map.begin(), d_output.begin());
+ *  // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  \endcode
+ *
+ *  \note \p scatter is the inverse of thrust::gather.
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator result);
+
+
+/*! \p scatter copies elements from a source range into an output array
+ *  according to a map. For each iterator \c i in the range [\p first, \p last),
+ *  the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>. The 
+ *  output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>,
+ *  the result is undefined.
+ *
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map  Beginning of the sequence of output indices.
+ *  \param result Destination of the source elements.
+ *
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`.
+ *
+ *  The following code snippet demonstrates how to use \p scatter to
+ *  reorder a range.
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  // mark even indices with a 1; odd indices with a 0
+ *  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ *  thrust::device_vector<int> d_values(values, values + 10);
+ *
+ *  // scatter all even indices into the first half of the
+ *  // range, and odd indices vice versa
+ *  int map[10]   = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9};
+ *  thrust::device_vector<int> d_map(map, map + 10);
+ *
+ *  thrust::device_vector<int> d_output(10);
+ *  thrust::scatter(d_values.begin(), d_values.end(),
+ *                  d_map.begin(), d_output.begin());
+ *  // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
+ *  \endcode
+ *
+ *  \note \p scatter is the inverse of thrust::gather.
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+  void scatter(InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator result);
+
+
+/*! \p scatter_if conditionally copies elements from a source range into an 
+ *  output array according to a map. For each iterator \c i in the 
+ *  range <tt>[first, last)</tt> such that <tt>*(stencil + (i - first))</tt> is
+ *  true, the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>.
+ *  The output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>
+ *  the result is undefined.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map Beginning of the sequence of output indices.
+ *  \param stencil Beginning of the sequence of predicate values.
+ *  \param output Beginning of the destination range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`.
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+ *  int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
+ *  int S[8] = {1, 0, 1, 0, 1, 0, 1, 0};
+ *  int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ * 
+ *  thrust::scatter_if(thrust::host, V, V + 8, M, S, D);
+ * 
+ *  // D contains [10, 30, 50, 70, 0, 0, 0, 0];
+ *  \endcode
+ *
+ *  \note \p scatter_if is the inverse of thrust::gather_if.
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output);
+
+
+/*! \p scatter_if conditionally copies elements from a source range into an 
+ *  output array according to a map. For each iterator \c i in the 
+ *  range <tt>[first, last)</tt> such that <tt>*(stencil + (i - first))</tt> is
+ *  true, the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>.
+ *  The output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>
+ *  the result is undefined.
+ *
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map Beginning of the sequence of output indices.
+ *  \param stencil Beginning of the sequence of predicate values.
+ *  \param output Beginning of the destination range.
+ *
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`.
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *  ...
+ *  int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+ *  int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
+ *  int S[8] = {1, 0, 1, 0, 1, 0, 1, 0};
+ *  int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ * 
+ *  thrust::scatter_if(V, V + 8, M, S, D);
+ * 
+ *  // D contains [10, 30, 50, 70, 0, 0, 0, 0];
+ *  \endcode
+ *
+ *  \note \p scatter_if is the inverse of thrust::gather_if.
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+  void scatter_if(InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output);
+
+
+/*! \p scatter_if conditionally copies elements from a source range into an 
+ *  output array according to a map. For each iterator \c i in the 
+ *  range <tt>[first, last)</tt> such that <tt>pred(*(stencil + (i - first)))</tt> is
+ *  \c true, the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>.
+ *  The output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>
+ *  the result is undefined.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map Beginning of the sequence of output indices.
+ *  \param stencil Beginning of the sequence of predicate values.
+ *  \param output Beginning of the destination range.
+ *  \param pred Predicate to apply to the stencil values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`.
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *
+ *  int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+ *  int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
+ *  int S[8] = {2, 1, 2, 1, 2, 1, 2, 1};
+ *  int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ * 
+ *  is_even pred;
+ *  thrust::scatter_if(thrust::host, V, V + 8, M, S, D, pred);
+ * 
+ *  // D contains [10, 30, 50, 70, 0, 0, 0, 0];
+ *  \endcode
+ *  
+ *  \note \p scatter_if is the inverse of thrust::gather_if.
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+__host__ __device__
+  void scatter_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred);
+                  
+
+/*! \p scatter_if conditionally copies elements from a source range into an 
+ *  output array according to a map. For each iterator \c i in the 
+ *  range <tt>[first, last)</tt> such that <tt>pred(*(stencil + (i - first)))</tt> is
+ *  \c true, the value \c *i is assigned to <tt>output[*(map + (i - first))]</tt>.
+ *  The output iterator must permit random access. If the same index 
+ *  appears more than once in the range <tt>[map, map + (last - first))</tt>
+ *  the result is undefined.
+ *
+ *  \param first Beginning of the sequence of values to scatter.
+ *  \param last End of the sequence of values to scatter.
+ *  \param map Beginning of the sequence of output indices.
+ *  \param stencil Beginning of the sequence of predicate values.
+ *  \param output Beginning of the destination range.
+ *  \param pred Predicate to apply to the stencil values.
+ *
+ *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`.
+ *
+ *  \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`.
+ *
+ *  \code
+ *  #include <thrust/scatter.h>
+ *
+ *  struct is_even
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return (x % 2) == 0;
+ *    }
+ *  };
+ *
+ *  ...
+ *
+ *  int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+ *  int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
+ *  int S[8] = {2, 1, 2, 1, 2, 1, 2, 1};
+ *  int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ * 
+ *  is_even pred;
+ *  thrust::scatter_if(V, V + 8, M, S, D, pred);
+ * 
+ *  // D contains [10, 30, 50, 70, 0, 0, 0, 0];
+ *  \endcode
+ *  
+ *  \note \p scatter_if is the inverse of thrust::gather_if.
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+  void scatter_if(InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred);
+
+
+/*! \} // end scattering
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/scatter.inl>
+
diff --git a/thrust/thrust/sequence.h b/thrust/thrust/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..e92391f64e1fd7d4fd82e08b662b45d285b45fa8
--- /dev/null
+++ b/thrust/thrust/sequence.h
@@ -0,0 +1,296 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file sequence.h
+ *  \brief Fills a range with a sequence of numbers
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ *  \{
+ */
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  (i - first)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(thrust::host, A, A + 10);
+ *  // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last);
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  (i - first)</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers.
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(A, A + 10);
+ *  // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename ForwardIterator>
+  void sequence(ForwardIterator first,
+                ForwardIterator last);
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  init + (i - first)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param init The first value of the sequence of numbers.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers starting from the value 1 using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(thrust::host, A, A + 10, 1);
+ *  // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init);
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  init + (i - first)</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param init The first value of the sequence of numbers.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers starting from the value 1.
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(A, A + 10, 1);
+ *  // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename ForwardIterator, typename T>
+  void sequence(ForwardIterator first,
+                ForwardIterator last,
+                T init);
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  init + step * (i - first)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param init The first value of the sequence of numbers
+ *  \param step The difference between consecutive elements.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers starting from the value 1 with a step size of 3 using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(thrust::host, A, A + 10, 1, 3);
+ *  // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step);
+
+
+/*! \p sequence fills the range <tt>[first, last)</tt> with a sequence of numbers.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, this version of 
+ *  \p sequence performs the assignment <tt>*i =  init + step * (i - first)</tt>.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param init The first value of the sequence of numbers
+ *  \param step The difference between consecutive elements.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and \p T is convertible to \p ForwardIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p sequence to fill a range
+ *  with a sequence of numbers starting from the value 1 with a step size of 3.
+ *
+ *  \code
+ *  #include <thrust/sequence.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::sequence(A, A + 10, 1, 3);
+ *  // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28}
+ *  \endcode
+ *
+ *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
+ *        guarantee on order of execution.
+ *
+ *  \see http://www.sgi.com/tech/stl/iota.html
+ */
+template<typename ForwardIterator, typename T>
+  void sequence(ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step);
+
+
+/*! \} // end transformations
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/sequence.inl>
+
diff --git a/thrust/thrust/set_operations.h b/thrust/thrust/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..a51eaed4351e52aaf3569c986cc5153640dd15d6
--- /dev/null
+++ b/thrust/thrust/set_operations.h
@@ -0,0 +1,2963 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file set_operations.h
+ *  \brief Set theoretic operations for sorted ranges
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup set_operations Set Operations
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p set_difference constructs a sorted range that is the set difference of the sorted
+ *  ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_difference performs the "difference" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt> and not contained in <tt>[first2, last1)</tt>. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[first1, last1)</tt> range shall be copied to the output range.
+ *
+ *  This version of \p set_difference compares elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference to compute the
+ *  set difference of two sets of integers sorted in ascending order using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A2[5] = {1, 3, 5, 7, 9};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  // result is now {0, 4, 6}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator set_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1                                              first1,
+                                InputIterator1                                              last1,
+                                InputIterator2                                              first2,
+                                InputIterator2                                              last2,
+                                OutputIterator                                              result);
+
+
+/*! \p set_difference constructs a sorted range that is the set difference of the sorted
+ *  ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_difference performs the "difference" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt> and not contained in <tt>[first2, last1)</tt>. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[first1, last1)</tt> range shall be copied to the output range.
+ *
+ *  This version of \p set_difference compares elements using \c operator<.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference to compute the
+ *  set difference of two sets of integers sorted in ascending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A2[5] = {1, 3, 5, 7, 9};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  // result is now {0, 4, 6}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_difference(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result);
+
+
+/*! \p set_difference constructs a sorted range that is the set difference of the sorted
+ *  ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_difference performs the "difference" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt> and not contained in <tt>[first2, last1)</tt>. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[first1, last1)</tt> range shall be copied to the output range.
+ *
+ *  This version of \p set_difference compares elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference to compute the
+ *  set difference of two sets of integers sorted in descending order using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A2[5] = {9, 7, 5, 3, 1};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  // result is now {6, 4, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator set_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1                                              first1,
+                                InputIterator1                                              last1,
+                                InputIterator2                                              first2,
+                                InputIterator2                                              last2,
+                                OutputIterator                                              result,
+                                StrictWeakCompare                                           comp);
+
+
+/*! \p set_difference constructs a sorted range that is the set difference of the sorted
+ *  ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_difference performs the "difference" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt> and not contained in <tt>[first2, last1)</tt>. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[first1, last1)</tt> range shall be copied to the output range.
+ *
+ *  This version of \p set_difference compares elements using a function object \p comp.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference to compute the
+ *  set difference of two sets of integers sorted in descending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A2[5] = {9, 7, 5, 3, 1};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  // result is now {6, 4, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+  OutputIterator set_difference(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result,
+                                StrictWeakCompare comp);
+
+
+/*! \p set_intersection constructs a sorted range that is the
+ *  intersection of sorted ranges <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt>. The return value is the end of the
+ *  output range.
+ *
+ *  In the simplest case, \p set_intersection performs the
+ *  "intersection" operation from set theory: the output range
+ *  contains a copy of every element that is contained in both
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The
+ *  general case is more complicated, because the input ranges may
+ *  contain duplicate elements. The generalization is that if a value
+ *  appears \c m times in <tt>[first1, last1)</tt> and \c n times in
+ *  <tt>[first2, last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the output range.
+ *  \p set_intersection is stable, meaning that both elements are
+ *  copied from the first range rather than the second, and that the
+ *  relative order of elements in the output range is the same as in
+ *  the first input range.
+ *
+ *  This version of \p set_intersection compares objects using
+ *  \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection to compute the
+ *  set intersection of two sets of integers sorted in ascending order using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {1, 3, 5, 7, 9, 11};
+ *  int A2[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int result[7];
+ *
+ *  int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result);
+ *  // result is now {1, 3, 5}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator set_intersection(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  InputIterator1                                              first1,
+                                  InputIterator1                                              last1,
+                                  InputIterator2                                              first2,
+                                  InputIterator2                                              last2,
+                                  OutputIterator                                              result);
+
+
+/*! \p set_intersection constructs a sorted range that is the
+ *  intersection of sorted ranges <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt>. The return value is the end of the
+ *  output range.
+ *
+ *  In the simplest case, \p set_intersection performs the
+ *  "intersection" operation from set theory: the output range
+ *  contains a copy of every element that is contained in both
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The
+ *  general case is more complicated, because the input ranges may
+ *  contain duplicate elements. The generalization is that if a value
+ *  appears \c m times in <tt>[first1, last1)</tt> and \c n times in
+ *  <tt>[first2, last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the output range.
+ *  \p set_intersection is stable, meaning that both elements are
+ *  copied from the first range rather than the second, and that the
+ *  relative order of elements in the output range is the same as in
+ *  the first input range.
+ *
+ *  This version of \p set_intersection compares objects using
+ *  \c operator<.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection to compute the
+ *  set intersection of two sets of integers sorted in ascending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[6] = {1, 3, 5, 7, 9, 11};
+ *  int A2[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int result[7];
+ *
+ *  int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result);
+ *  // result is now {1, 3, 5}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_intersection(InputIterator1 first1,
+                                  InputIterator1 last1,
+                                  InputIterator2 first2,
+                                  InputIterator2 last2,
+                                  OutputIterator result);
+
+
+/*! \p set_intersection constructs a sorted range that is the
+ *  intersection of sorted ranges <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt>. The return value is the end of the
+ *  output range.
+ *
+ *  In the simplest case, \p set_intersection performs the
+ *  "intersection" operation from set theory: the output range
+ *  contains a copy of every element that is contained in both
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The
+ *  general case is more complicated, because the input ranges may
+ *  contain duplicate elements. The generalization is that if a value
+ *  appears \c m times in <tt>[first1, last1)</tt> and \c n times in
+ *  <tt>[first2, last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the output range.
+ *  \p set_intersection is stable, meaning that both elements are
+ *  copied from the first range rather than the second, and that the
+ *  relative order of elements in the output range is the same as in
+ *  the first input range.
+ *
+ *  This version of \p set_intersection compares elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection to compute
+ *  the set intersection of sets of integers sorted in descending order using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {11, 9, 7, 5, 3, 1};
+ *  int A2[7] = {13, 8, 5, 3, 2,  1, 1};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result, thrust::greater<int>());
+ *  // result is now {5, 3, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator set_intersection(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  InputIterator1                                              first1,
+                                  InputIterator1                                              last1,
+                                  InputIterator2                                              first2,
+                                  InputIterator2                                              last2,
+                                  OutputIterator                                              result,
+                                  StrictWeakCompare                                           comp);
+
+
+/*! \p set_intersection constructs a sorted range that is the
+ *  intersection of sorted ranges <tt>[first1, last1)</tt> and
+ *  <tt>[first2, last2)</tt>. The return value is the end of the
+ *  output range.
+ *
+ *  In the simplest case, \p set_intersection performs the
+ *  "intersection" operation from set theory: the output range
+ *  contains a copy of every element that is contained in both
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The
+ *  general case is more complicated, because the input ranges may
+ *  contain duplicate elements. The generalization is that if a value
+ *  appears \c m times in <tt>[first1, last1)</tt> and \c n times in
+ *  <tt>[first2, last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the output range.
+ *  \p set_intersection is stable, meaning that both elements are
+ *  copied from the first range rather than the second, and that the
+ *  relative order of elements in the output range is the same as in
+ *  the first input range.
+ *
+ *  This version of \p set_intersection compares elements using a function object \p comp.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection to compute
+ *  the set intersection of sets of integers sorted in descending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[6] = {11, 9, 7, 5, 3, 1};
+ *  int A2[7] = {13, 8, 5, 3, 2,  1, 1};
+ *
+ *  int result[3];
+ *
+ *  int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result, thrust::greater<int>());
+ *  // result is now {5, 3, 1}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+  OutputIterator set_intersection(InputIterator1 first1,
+                                  InputIterator1 last1,
+                                  InputIterator2 first2,
+                                  InputIterator2 last2,
+                                  OutputIterator result,
+                                  StrictWeakCompare comp);
+
+
+/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric
+ *  difference of the sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *  The return value is the end of the output range.
+ *
+ *  In the simplest case, \p set_symmetric_difference performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[first1, last1)</tt> but not <tt>[first2, last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[first2, last2)</tt> but not <tt>[first1, last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[first2, last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[first1, last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[first2, last2)</tt> if <tt>m < n</tt>.
+ *
+ *  This version of \p set_union compares elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference to compute
+ *  the symmetric difference of two sets of integers sorted in ascending order using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A2[5] = {1, 1, 2, 5, 8};
+ *
+ *  int result[6];
+ *
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  // result = {0, 4, 5, 6, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_difference
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator1                                              first1,
+                                          InputIterator1                                              last1,
+                                          InputIterator2                                              first2,
+                                          InputIterator2                                              last2,
+                                          OutputIterator                                              result);
+
+
+/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric
+ *  difference of the sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *  The return value is the end of the output range.
+ *
+ *  In the simplest case, \p set_symmetric_difference performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[first1, last1)</tt> but not <tt>[first2, last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[first2, last2)</tt> but not <tt>[first1, last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[first2, last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[first1, last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[first2, last2)</tt> if <tt>m < n</tt>.
+ *
+ *  This version of \p set_union compares elements using \c operator<.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference to compute
+ *  the symmetric difference of two sets of integers sorted in ascending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A2[5] = {1, 1, 2, 5, 8};
+ *
+ *  int result[6];
+ *
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  // result = {0, 4, 5, 6, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_difference
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_symmetric_difference(InputIterator1 first1,
+                                          InputIterator1 last1,
+                                          InputIterator2 first2,
+                                          InputIterator2 last2,
+                                          OutputIterator result);
+
+
+/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric
+ *  difference of the sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *  The return value is the end of the output range.
+ *
+ *  In the simplest case, \p set_symmetric_difference performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[first1, last1)</tt> but not <tt>[first2, last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[first2, last2)</tt> but not <tt>[first1, last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[first2, last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[first1, last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[first2, last2)</tt> if <tt>m < n</tt>.
+ *
+ *  This version of \p set_union compares elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference to compute
+ *  the symmetric difference of two sets of integers sorted in descending order using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A2[5] = {8, 5, 2, 1, 1};
+ *
+ *  int result[6];
+ *
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  // result = {8, 7, 6, 5, 4, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_difference
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator1                                              first1,
+                                          InputIterator1                                              last1,
+                                          InputIterator2                                              first2,
+                                          InputIterator2                                              last2,
+                                          OutputIterator                                              result,
+                                          StrictWeakCompare                                           comp);
+
+
+/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric
+ *  difference of the sorted ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
+ *  The return value is the end of the output range.
+ *
+ *  In the simplest case, \p set_symmetric_difference performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[first1, last1)</tt> but not <tt>[first2, last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[first2, last2)</tt> but not <tt>[first1, last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[first2, last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[first1, last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[first2, last2)</tt> if <tt>m < n</tt>.
+ *
+ *  This version of \p set_union compares elements using a function object \p comp.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference to compute
+ *  the symmetric difference of two sets of integers sorted in descending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A2[5] = {8, 5, 2, 1, 1};
+ *
+ *  int result[6];
+ *
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  // result = {8, 7, 6, 5, 4, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_difference
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+  OutputIterator set_symmetric_difference(InputIterator1 first1,
+                                          InputIterator1 last1,
+                                          InputIterator2 first2,
+                                          InputIterator2 last2,
+                                          OutputIterator result,
+                                          StrictWeakCompare comp);
+
+
+/*! \p set_union constructs a sorted range that is the union of the sorted ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_union performs the "union" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt>, <tt>[first2, last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  This version of \p set_union compares elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_union to compute the union of
+ *  two sets of integers sorted in ascending order using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[7] = {0, 2, 4, 6, 8, 10, 12};
+ *  int A2[5] = {1, 3, 5, 7, 9};
+ *
+ *  int result[11];
+ *
+ *  int *result_end = thrust::set_union(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
+ *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              first1,
+                           InputIterator1                                              last1,
+                           InputIterator2                                              first2,
+                           InputIterator2                                              last2,
+                           OutputIterator                                              result);
+
+
+/*! \p set_union constructs a sorted range that is the union of the sorted ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_union performs the "union" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt>, <tt>[first2, last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  This version of \p set_union compares elements using \c operator<.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_union to compute the union of
+ *  two sets of integers sorted in ascending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A1[7] = {0, 2, 4, 6, 8, 10, 12};
+ *  int A2[5] = {1, 3, 5, 7, 9};
+ *
+ *  int result[11];
+ *
+ *  int *result_end = thrust::set_union(A1, A1 + 7, A2, A2 + 5, result);
+ *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+  OutputIterator set_union(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result);
+
+
+/*! \p set_union constructs a sorted range that is the union of the sorted ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_union performs the "union" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt>, <tt>[first2, last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  This version of \p set_union compares elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_union to compute the union of
+ *  two sets of integers sorted in ascending order using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A1[7] = {12, 10, 8, 6, 4, 2, 0};
+ *  int A2[5] = {9, 7, 5, 3, 1};
+ *
+ *  int result[11];
+ *
+ *  int *result_end = thrust::set_union(thrust::host, A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
+ *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+__host__ __device__
+  OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1                                              first1,
+                           InputIterator1                                              last1,
+                           InputIterator2                                              first2,
+                           InputIterator2                                              last2,
+                           OutputIterator                                              result,
+                           StrictWeakCompare                                           comp);
+
+
+/*! \p set_union constructs a sorted range that is the union of the sorted ranges
+ *  <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>. The return value is the
+ *  end of the output range.
+ *
+ *  In the simplest case, \p set_union performs the "union" operation from set
+ *  theory: the output range contains a copy of every element that is contained in
+ *  <tt>[first1, last1)</tt>, <tt>[first2, last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[first1, last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[first2, last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  This version of \p set_union compares elements using a function object \p comp.
+ *
+ *  \param first1 The beginning of the first input range.
+ *  \param last1 The end of the first input range.
+ *  \param first2 The beginning of the second input range.
+ *  \param last2 The end of the second input range.
+ *  \param result The beginning of the output range.
+ *  \param comp Comparison operator.
+ *  \return The end of the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting range shall not overlap with either input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_union to compute the union of
+ *  two sets of integers sorted in ascending order.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A1[7] = {12, 10, 8, 6, 4, 2, 0};
+ *  int A2[5] = {9, 7, 5, 3, 1};
+ *
+ *  int result[11];
+ *
+ *  int *result_end = thrust::set_union(A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
+ *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see \p merge
+ *  \see \p includes
+ *  \see \p set_union
+ *  \see \p set_intersection
+ *  \see \p set_symmetric_difference
+ *  \see \p sort
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakCompare>
+  OutputIterator set_union(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result,
+                           StrictWeakCompare comp);
+
+
+/*! \p set_difference_by_key performs a key-value difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_difference_by_key performs the "difference" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt> and not contained in <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[keys_first1, keys_last1)</tt> range shall be copied to the output range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_difference_by_key compares key elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference_by_key to compute the
+ *  set difference of two sets of integers sorted in ascending order with their values using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {1, 3, 5, 7, 9};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[3];
+ *  int vals_result[3];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 4, 6}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1                                              keys_first1,
+                          InputIterator1                                              keys_last1,
+                          InputIterator2                                              keys_first2,
+                          InputIterator2                                              keys_last2,
+                          InputIterator3                                              values_first1,
+                          InputIterator4                                              values_first2,
+                          OutputIterator1                                             keys_result,
+                          OutputIterator2                                             values_result);
+
+
+/*! \p set_difference_by_key performs a key-value difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_difference_by_key performs the "difference" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt> and not contained in <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[keys_first1, keys_last1)</tt> range shall be copied to the output range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_difference_by_key compares key elements using \c operator<.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference_by_key to compute the
+ *  set difference of two sets of integers sorted in ascending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A_keys[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {1, 3, 5, 7, 9};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[3];
+ *  int vals_result[3];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 4, 6}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(InputIterator1                             keys_first1,
+                          InputIterator1                             keys_last1,
+                          InputIterator2                             keys_first2,
+                          InputIterator2                             keys_last2,
+                          InputIterator3                             values_first1,
+                          InputIterator4                             values_first2,
+                          OutputIterator1                            keys_result,
+                          OutputIterator2                            values_result);
+
+
+/*! \p set_difference_by_key performs a key-value difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_difference_by_key performs the "difference" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt> and not contained in <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[keys_first1, keys_last1)</tt> range shall be copied to the output range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_difference_by_key compares key elements using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference_by_key to compute the
+ *  set difference of two sets of integers sorted in descending order with their values using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {9, 7, 5, 3, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[3];
+ *  int vals_result[3];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ *  // keys_result is now {0, 4, 6}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1                                              keys_first1,
+                          InputIterator1                                              keys_last1,
+                          InputIterator2                                              keys_first2,
+                          InputIterator2                                              keys_last2,
+                          InputIterator3                                              values_first1,
+                          InputIterator4                                              values_first2,
+                          OutputIterator1                                             keys_result,
+                          OutputIterator2                                             values_result,
+                          StrictWeakCompare                                           comp);
+
+
+/*! \p set_difference_by_key performs a key-value difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_difference_by_key performs the "difference" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt> and not contained in <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, the last <tt>max(m-n,0)</tt> elements from
+ *  <tt>[keys_first1, keys_last1)</tt> range shall be copied to the output range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_difference_by_key compares key elements using a function object \p comp.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_difference_by_key to compute the
+ *  set difference of two sets of integers sorted in descending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {9, 7, 5, 3, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[3];
+ *  int vals_result[3];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ *  // keys_result is now {0, 4, 6}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(InputIterator1                             keys_first1,
+                          InputIterator1                             keys_last1,
+                          InputIterator2                             keys_first2,
+                          InputIterator2                             keys_last2,
+                          InputIterator3                             values_first1,
+                          InputIterator4                             values_first2,
+                          OutputIterator1                            keys_result,
+                          OutputIterator2                            values_result,
+                          StrictWeakCompare                          comp);
+
+
+/*! \p set_intersection_by_key performs a key-value intersection operation from set theory.
+ *  \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in both
+ *  <tt>[keys_first1, keys_last1)</tt> <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if an element appears \c m times in <tt>[keys_first1, keys_last1)</tt>
+ *  and \c n times in <tt>[keys_first2, keys_last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the keys output range.
+ *  \p set_intersection_by_key is stable, meaning both that elements are copied from the first
+ *  input range rather than the second, and that the relative order of elements in the output range
+ *  is the same as the first input range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> to the keys output range,
+ *  the corresponding value element is copied from <tt>[values_first1, values_last1)</tt> to the values
+ *  output range.
+ *
+ *  This version of \p set_intersection_by_key compares objects using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
+ *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection_by_key to compute the
+ *  set intersection of two sets of integers sorted in ascending order with their values using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {1, 3, 5, 7, 9, 11};
+ *  int A_vals[6] = {0, 0, 0, 0, 0,  0};
+ *  
+ *  int B_keys[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int keys_result[7];
+ *  int vals_result[7];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result);
+ *
+ *  // keys_result is now {1, 3, 5}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            InputIterator1                                              keys_first1,
+                            InputIterator1                                              keys_last1,
+                            InputIterator2                                              keys_first2,
+                            InputIterator2                                              keys_last2,
+                            InputIterator3                                              values_first1,
+                            OutputIterator1                                             keys_result,
+                            OutputIterator2                                             values_result);
+
+
+/*! \p set_intersection_by_key performs a key-value intersection operation from set theory.
+ *  \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in both
+ *  <tt>[keys_first1, keys_last1)</tt> <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if an element appears \c m times in <tt>[keys_first1, keys_last1)</tt>
+ *  and \c n times in <tt>[keys_first2, keys_last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the keys output range.
+ *  \p set_intersection_by_key is stable, meaning both that elements are copied from the first
+ *  input range rather than the second, and that the relative order of elements in the output range
+ *  is the same as the first input range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> to the keys output range,
+ *  the corresponding value element is copied from <tt>[values_first1, values_last1)</tt> to the values
+ *  output range.
+ *
+ *  This version of \p set_intersection_by_key compares objects using \c operator<.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
+ *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection_by_key to compute the
+ *  set intersection of two sets of integers sorted in ascending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A_keys[6] = {1, 3, 5, 7, 9, 11};
+ *  int A_vals[6] = {0, 0, 0, 0, 0,  0};
+ *  
+ *  int B_keys[7] = {1, 1, 2, 3, 5,  8, 13};
+ *
+ *  int keys_result[7];
+ *  int vals_result[7];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result);
+ *
+ *  // keys_result is now {1, 3, 5}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(InputIterator1                             keys_first1,
+                            InputIterator1                             keys_last1,
+                            InputIterator2                             keys_first2,
+                            InputIterator2                             keys_last2,
+                            InputIterator3                             values_first1,
+                            OutputIterator1                            keys_result,
+                            OutputIterator2                            values_result);
+
+
+/*! \p set_intersection_by_key performs a key-value intersection operation from set theory.
+ *  \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in both
+ *  <tt>[keys_first1, keys_last1)</tt> <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if an element appears \c m times in <tt>[keys_first1, keys_last1)</tt>
+ *  and \c n times in <tt>[keys_first2, keys_last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the keys output range.
+ *  \p set_intersection_by_key is stable, meaning both that elements are copied from the first
+ *  input range rather than the second, and that the relative order of elements in the output range
+ *  is the same as the first input range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> to the keys output range,
+ *  the corresponding value element is copied from <tt>[values_first1, values_last1)</tt> to the values
+ *  output range.
+ *
+ *  This version of \p set_intersection_by_key compares objects using a function object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
+ *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection_by_key to compute the
+ *  set intersection of two sets of integers sorted in descending order with their values using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {11, 9, 7, 5, 3, 1};
+ *  int A_vals[6] = { 0, 0, 0, 0, 0, 0};
+ *  
+ *  int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
+ *
+ *  int keys_result[7];
+ *  int vals_result[7];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater<int>());
+ *
+ *  // keys_result is now {5, 3, 1}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            InputIterator1                                              keys_first1,
+                            InputIterator1                                              keys_last1,
+                            InputIterator2                                              keys_first2,
+                            InputIterator2                                              keys_last2,
+                            InputIterator3                                              values_first1,
+                            OutputIterator1                                             keys_result,
+                            OutputIterator2                                             values_result,
+                            StrictWeakCompare                                           comp);
+
+
+/*! \p set_intersection_by_key performs a key-value intersection operation from set theory.
+ *  \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set
+ *  theory: the keys output range contains a copy of every element that is contained in both
+ *  <tt>[keys_first1, keys_last1)</tt> <tt>[keys_first2, keys_last2)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if an element appears \c m times in <tt>[keys_first1, keys_last1)</tt>
+ *  and \c n times in <tt>[keys_first2, keys_last2)</tt> (where \c m may be zero), then it
+ *  appears <tt>min(m,n)</tt> times in the keys output range.
+ *  \p set_intersection_by_key is stable, meaning both that elements are copied from the first
+ *  input range rather than the second, and that the relative order of elements in the output range
+ *  is the same as the first input range.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> to the keys output range,
+ *  the corresponding value element is copied from <tt>[values_first1, values_last1)</tt> to the values
+ *  output range.
+ *
+ *  This version of \p set_intersection_by_key compares objects using a function object \p comp.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
+ *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_intersection_by_key to compute the
+ *  set intersection of two sets of integers sorted in descending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {11, 9, 7, 5, 3, 1};
+ *  int A_vals[6] = { 0, 0, 0, 0, 0, 0};
+ *  
+ *  int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
+ *
+ *  int keys_result[7];
+ *  int vals_result[7];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater<int>());
+ *
+ *  // keys_result is now {5, 3, 1}
+ *  // vals_result is now {0, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(InputIterator1                             keys_first1,
+                            InputIterator1                             keys_last1,
+                            InputIterator2                             keys_first2,
+                            InputIterator2                             keys_last2,
+                            InputIterator3                             values_first1,
+                            OutputIterator1                            keys_result,
+                            OutputIterator2                            values_result,
+                            StrictWeakCompare                          comp);
+
+
+/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[keys_first1, keys_last1)</tt> but not <tt>[keys_first2, keys_last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[keys_first2, keys_last2)</tt> but not <tt>[keys_first1, keys_last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[keys_first2, keys_last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[keys_first1, keys_last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[keys_first2, keys_last2)</tt> if <tt>m < n</tt>.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_symmetric_difference_by_key compares key elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in ascending order with their values using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {1, 1, 2, 5, 8};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[6];
+ *  int vals_result[6];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 4, 5, 6, 7, 8}
+ *  // vals_result is now {0, 0, 1, 0, 0, 1}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                    InputIterator1                                              keys_first1,
+                                    InputIterator1                                              keys_last1,
+                                    InputIterator2                                              keys_first2,
+                                    InputIterator2                                              keys_last2,
+                                    InputIterator3                                              values_first1,
+                                    InputIterator4                                              values_first2,
+                                    OutputIterator1                                             keys_result,
+                                    OutputIterator2                                             values_result);
+
+
+/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[keys_first1, keys_last1)</tt> but not <tt>[keys_first2, keys_last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[keys_first2, keys_last2)</tt> but not <tt>[keys_first1, keys_last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[keys_first2, keys_last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[keys_first1, keys_last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[keys_first2, keys_last2)</tt> if <tt>m < n</tt>.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_symmetric_difference_by_key compares key elements using \c operator<.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in ascending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A_keys[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {1, 1, 2, 5, 8};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[6];
+ *  int vals_result[6];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 4, 5, 6, 7, 8}
+ *  // vals_result is now {0, 0, 1, 0, 0, 1}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(InputIterator1                             keys_first1,
+                                    InputIterator1                             keys_last1,
+                                    InputIterator2                             keys_first2,
+                                    InputIterator2                             keys_last2,
+                                    InputIterator3                             values_first1,
+                                    InputIterator4                             values_first2,
+                                    OutputIterator1                            keys_result,
+                                    OutputIterator2                            values_result);
+
+
+/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[keys_first1, keys_last1)</tt> but not <tt>[keys_first2, keys_last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[keys_first2, keys_last2)</tt> but not <tt>[keys_first1, keys_last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[keys_first2, keys_last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[keys_first1, keys_last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[keys_first2, keys_last2)</tt> if <tt>m < n</tt>.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in descending order with their values using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {8, 5, 2, 1, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[6];
+ *  int vals_result[6];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {8, 7, 6, 5, 4, 0}
+ *  // vals_result is now {1, 0, 0, 1, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                    InputIterator1                                              keys_first1,
+                                    InputIterator1                                              keys_last1,
+                                    InputIterator2                                              keys_first2,
+                                    InputIterator2                                              keys_last2,
+                                    InputIterator3                                              values_first1,
+                                    InputIterator4                                              values_first2,
+                                    OutputIterator1                                             keys_result,
+                                    OutputIterator2                                             values_result,
+                                    StrictWeakCompare                                           comp);
+
+
+/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory.
+ *  \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ *  it constructs the union of the two sets A - B and B - A, where A and B are the two
+ *  input ranges. That is, the output range contains a copy of every element that is
+ *  contained in <tt>[keys_first1, keys_last1)</tt> but not <tt>[keys_first2, keys_last1)</tt>, and a copy of
+ *  every element that is contained in <tt>[keys_first2, keys_last2)</tt> but not <tt>[keys_first1, keys_last1)</tt>.
+ *  The general case is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements that are
+ *  equivalent to each other and <tt>[keys_first2, keys_last1)</tt> contains \c n elements that are
+ *  equivalent to them, then <tt>|m - n|</tt> of those elements shall be copied to the output
+ *  range: the last <tt>m - n</tt> elements from <tt>[keys_first1, keys_last1)</tt> if <tt>m > n</tt>, and
+ *  the last <tt>n - m</tt> of these elements from <tt>[keys_first2, keys_last2)</tt> if <tt>m < n</tt>.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in descending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A_vals[6] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {8, 5, 2, 1, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[6];
+ *  int vals_result[6];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {8, 7, 6, 5, 4, 0}
+ *  // vals_result is now {1, 0, 0, 1, 0, 0}
+ *  \endcode
+ *
+ *  \see \p set_union_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(InputIterator1                             keys_first1,
+                                    InputIterator1                             keys_last1,
+                                    InputIterator2                             keys_first2,
+                                    InputIterator2                             keys_last2,
+                                    InputIterator3                             values_first1,
+                                    InputIterator4                             values_first2,
+                                    OutputIterator1                            keys_result,
+                                    OutputIterator2                            values_result,
+                                    StrictWeakCompare                          comp);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ *  \p set_union_by_key constructs a sorted range that is the union of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ *  the output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt>, <tt>[keys_first2, keys_last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_union_by_key compares key elements using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in ascending order with their values using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {0, 2, 4, 6, 8, 10, 12};
+ *  int A_vals[6] = {0, 0, 0, 0, 0,  0,  0};
+ *
+ *  int B_keys[5] = {1, 3, 5, 7, 9};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[11];
+ *  int vals_result[11];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ *  // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  0,  0}
+ *  \endcode
+ *
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1                                              keys_first1,
+                     InputIterator1                                              keys_last1,
+                     InputIterator2                                              keys_first2,
+                     InputIterator2                                              keys_last2,
+                     InputIterator3                                              values_first1,
+                     InputIterator4                                              values_first2,
+                     OutputIterator1                                             keys_result,
+                     OutputIterator2                                             values_result);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ *  \p set_union_by_key constructs a sorted range that is the union of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ *  the output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt>, <tt>[keys_first2, keys_last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_union_by_key compares key elements using \c operator<.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in ascending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  ...
+ *  int A_keys[6] = {0, 2, 4, 6, 8, 10, 12};
+ *  int A_vals[6] = {0, 0, 0, 0, 0,  0,  0};
+ *
+ *  int B_keys[5] = {1, 3, 5, 7, 9};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[11];
+ *  int vals_result[11];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ *  // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ *  // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  0,  0}
+ *  \endcode
+ *
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1                             keys_first1,
+                     InputIterator1                             keys_last1,
+                     InputIterator2                             keys_first2,
+                     InputIterator2                             keys_last2,
+                     InputIterator3                             values_first1,
+                     InputIterator4                             values_first2,
+                     OutputIterator1                            keys_result,
+                     OutputIterator2                            values_result);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ *  \p set_union_by_key constructs a sorted range that is the union of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ *  the output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt>, <tt>[keys_first2, keys_last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_union_by_key compares key elements using a function object \c comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in descending order with their values using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  int A_keys[6] = {12, 10, 8, 6, 4, 2, 0};
+ *  int A_vals[6] = { 0,  0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {9, 7, 5, 3, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[11];
+ *  int vals_result[11];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ *  // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ *  // vals_result is now { 0,  1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}
+ *  \endcode
+ *
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1                                              keys_first1,
+                     InputIterator1                                              keys_last1,
+                     InputIterator2                                              keys_first2,
+                     InputIterator2                                              keys_last2,
+                     InputIterator3                                              values_first1,
+                     InputIterator4                                              values_first2,
+                     OutputIterator1                                             keys_result,
+                     OutputIterator2                                             values_result,
+                     StrictWeakCompare                                           comp);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ *  \p set_union_by_key constructs a sorted range that is the union of the sorted
+ *  ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt>. Associated
+ *  with each element from the input and output key ranges is a value element. The associated input
+ *  value ranges need not be sorted.
+ *
+ *  In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ *  the output range contains a copy of every element that is contained in
+ *  <tt>[keys_first1, keys_last1)</tt>, <tt>[keys_first2, keys_last1)</tt>, or both. The general case
+ *  is more complicated, because the input ranges may contain duplicate elements.
+ *  The generalization is that if <tt>[keys_first1, keys_last1)</tt> contains \c m elements
+ *  that are equivalent to each other and if <tt>[keys_first2, keys_last2)</tt> contains \c n
+ *  elements that are equivalent to them, then all \c m elements from the first
+ *  range shall be copied to the output range, in order, and then <tt>max(n - m, 0)</tt>
+ *  elements from the second range shall be copied to the output, in order.
+ *
+ *  Each time a key element is copied from <tt>[keys_first1, keys_last1)</tt> or
+ *  <tt>[keys_first2, keys_last2)</tt> is copied to the keys output range, the
+ *  corresponding value element is copied from the corresponding values input range (beginning at
+ *  \p values_first1 or \p values_first2) to the values output range.
+ *
+ *  This version of \p set_union_by_key compares key elements using a function object \c comp.
+ *
+ *  \param keys_first1 The beginning of the first input range of keys.
+ *  \param keys_last1 The end of the first input range of keys.
+ *  \param keys_first2 The beginning of the second input range of keys.
+ *  \param keys_last2 The end of the second input range of keys.
+ *  \param values_first1 The beginning of the first input range of values.
+ *  \param values_first2 The beginning of the first input range of values.
+ *  \param keys_result The beginning of the output range of keys.
+ *  \param values_result The beginning of the output range of values.
+ *  \param comp Comparison operator.
+ *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
+ *          and such that <tt>p.second</tt> is the end of the output range of values.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
+ *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
+ *  \pre The resulting ranges shall not overlap with any input range.
+ *
+ *  The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ *  symmetric difference of two sets of integers sorted in descending order with their values.
+ *
+ *  \code
+ *  #include <thrust/set_operations.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  int A_keys[6] = {12, 10, 8, 6, 4, 2, 0};
+ *  int A_vals[6] = { 0,  0, 0, 0, 0, 0, 0};
+ *
+ *  int B_keys[5] = {9, 7, 5, 3, 1};
+ *  int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ *  int keys_result[11];
+ *  int vals_result[11];
+ *
+ *  thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ *  // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ *  // vals_result is now { 0,  1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}
+ *  \endcode
+ *
+ *  \see \p set_symmetric_difference_by_key
+ *  \see \p set_intersection_by_key
+ *  \see \p set_difference_by_key
+ *  \see \p sort_by_key
+ *  \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1                             keys_first1,
+                     InputIterator1                             keys_last1,
+                     InputIterator2                             keys_first2,
+                     InputIterator2                             keys_last2,
+                     InputIterator3                             values_first1,
+                     InputIterator4                             values_first2,
+                     OutputIterator1                            keys_result,
+                     OutputIterator2                            values_result,
+                     StrictWeakCompare                          comp);
+
+
+/*! \} // end set_operations
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/set_operations.inl>
+
diff --git a/thrust/thrust/shuffle.h b/thrust/thrust/shuffle.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ed156e15227047072938bc80d8d90309093671e
--- /dev/null
+++ b/thrust/thrust/shuffle.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Reorders range by a uniform random permutation
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust {
+
+/*! \addtogroup reordering
+*  \ingroup algorithms
+*
+*  \addtogroup shuffling
+*  \ingroup reordering
+*  \{
+*/
+
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation
+ *  using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(thrust::host, A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g);
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *  \p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(thrust::host, A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result, URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *\p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g);
+
+}  // namespace thrust
+
+#include <thrust/detail/shuffle.inl>
+#endif
diff --git a/thrust/thrust/sort.h b/thrust/thrust/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..a100f960281394c5a178396c54cec2a73265ccc8
--- /dev/null
+++ b/thrust/thrust/sort.h
@@ -0,0 +1,1362 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/sort.h
+ *  \brief Functions for reorganizing ranges into sorted order
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup sorting
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! \p sort sorts the elements in <tt>[first, last)</tt> into
+ *  ascending order, meaning that if \c i and \c j are any two valid
+ *  iterators in <tt>[first, last)</tt> such that \c i precedes \c j,
+ *  then \c *j is not less than \c *i. Note: \c sort is not guaranteed
+ *  to be stable. That is, suppose that \c *i and \c *j are equivalent:
+ *  neither one is less than the other. It is not guaranteed that the
+ *  relative order of these two elements will be preserved by \p sort.
+ *
+ *  This version of \p sort compares objects using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *  The following code snippet demonstrates how to use \p sort to sort
+ *  a sequence of integers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(thrust::host, A, A + N);
+ *  // A is now {1, 2, 4, 5, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort
+ *  \see \p sort_by_key
+ */
+template<typename DerivedPolicy, typename RandomAccessIterator>
+__host__ __device__
+  void sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last);
+
+
+/*! \p sort sorts the elements in <tt>[first, last)</tt> into
+ *  ascending order, meaning that if \c i and \c j are any two valid
+ *  iterators in <tt>[first, last)</tt> such that \c i precedes \c j,
+ *  then \c *j is not less than \c *i. Note: \c sort is not guaranteed
+ *  to be stable. That is, suppose that \c *i and \c *j are equivalent:
+ *  neither one is less than the other. It is not guaranteed that the
+ *  relative order of these two elements will be preserved by \p sort.
+ *
+ *  This version of \p sort compares objects using \c operator<.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *  The following code snippet demonstrates how to use \p sort to sort
+ *  a sequence of integers.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(A, A + N);
+ *  // A is now {1, 2, 4, 5, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort
+ *  \see \p sort_by_key
+ */
+template<typename RandomAccessIterator>
+  void sort(RandomAccessIterator first,
+            RandomAccessIterator last);
+
+
+/*! \p sort sorts the elements in <tt>[first, last)</tt> into
+ *  ascending order, meaning that if \c i and \c j are any two valid
+ *  iterators in <tt>[first, last)</tt> such that \c i precedes \c j,
+ *  then \c *j is not less than \c *i. Note: \c sort is not guaranteed
+ *  to be stable. That is, suppose that \c *i and \c *j are equivalent:
+ *  neither one is less than the other. It is not guaranteed that the
+ *  relative order of these two elements will be preserved by \p sort.
+ *
+ *  This version of \p sort compares objects using a function object
+ *  \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param comp  Comparison operator.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code demonstrates how to sort integers in descending order
+ *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(thrust::host, A, A + N, thrust::greater<int>());
+ *  // A is now {8, 7, 5, 4, 2, 1};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort
+ *  \see \p sort_by_key
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp);
+
+
+/*! \p sort sorts the elements in <tt>[first, last)</tt> into
+ *  ascending order, meaning that if \c i and \c j are any two valid
+ *  iterators in <tt>[first, last)</tt> such that \c i precedes \c j,
+ *  then \c *j is not less than \c *i. Note: \c sort is not guaranteed
+ *  to be stable. That is, suppose that \c *i and \c *j are equivalent:
+ *  neither one is less than the other. It is not guaranteed that the
+ *  relative order of these two elements will be preserved by \p sort.
+ *
+ *  This version of \p sort compares objects using a function object
+ *  \p comp.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param comp  Comparison operator.
+ *
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code demonstrates how to sort integers in descending order
+ *  using the greater<int> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(A, A + N, thrust::greater<int>());
+ *  // A is now {8, 7, 5, 4, 2, 1};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort
+ *  \see \p sort_by_key
+ */
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort(RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp);
+
+
+/*! \p stable_sort is much like \c sort: it sorts the elements in
+ *  <tt>[first, last)</tt> into ascending order, meaning that if \c i
+ *  and \c j are any two valid iterators in <tt>[first, last)</tt> such
+ *  that \c i precedes \c j, then \c *j is not less than \c *i.
+ *
+ *  As the name suggests, \p stable_sort is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[first, last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort compares objects using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *  The following code snippet demonstrates how to use \p sort to sort
+ *  a sequence of integers using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::stable_sort(thrust::host, A, A + N);
+ *  // A is now {1, 2, 4, 5, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see \p sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename DerivedPolicy, typename RandomAccessIterator>
+__host__ __device__
+  void stable_sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last);
+
+
+/*! \p stable_sort is much like \c sort: it sorts the elements in
+ *  <tt>[first, last)</tt> into ascending order, meaning that if \c i
+ *  and \c j are any two valid iterators in <tt>[first, last)</tt> such
+ *  that \c i precedes \c j, then \c *j is not less than \c *i.
+ *
+ *  As the name suggests, \p stable_sort is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[first, last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort compares objects using \c operator<.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *  The following code snippet demonstrates how to use \p sort to sort
+ *  a sequence of integers.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::stable_sort(A, A + N);
+ *  // A is now {1, 2, 4, 5, 7, 8}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see \p sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename RandomAccessIterator>
+  void stable_sort(RandomAccessIterator first,
+                   RandomAccessIterator last);
+
+
+/*! \p stable_sort is much like \c sort: it sorts the elements in
+ *  <tt>[first, last)</tt> into ascending order, meaning that if \c i
+ *  and \c j are any two valid iterators in <tt>[first, last)</tt> such
+ *  that \c i precedes \c j, then \c *j is not less than \c *i.
+ *
+ *  As the name suggests, \p stable_sort is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[first, last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort compares objects using a function object
+ *  \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code demonstrates how to sort integers in descending order
+ *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(A, A + N, thrust::greater<int>());
+ *  // A is now {8, 7, 5, 4, 2, 1};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see \p sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+
+/*! \p stable_sort is much like \c sort: it sorts the elements in
+ *  <tt>[first, last)</tt> into ascending order, meaning that if \c i
+ *  and \c j are any two valid iterators in <tt>[first, last)</tt> such
+ *  that \c i precedes \c j, then \c *j is not less than \c *i.
+ *
+ *  As the name suggests, \p stable_sort is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[first, last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort compares objects using a function object
+ *  \p comp.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator is mutable,
+ *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code demonstrates how to sort integers in descending order
+ *  using the greater<int> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  const int N = 6;
+ *  int A[N] = {1, 4, 2, 8, 5, 7};
+ *  thrust::sort(A, A + N, thrust::greater<int>());
+ *  // A is now {8, 7, 5, 4, 2, 1};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see \p sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  void stable_sort(RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+
+///////////////
+// Key Value //
+///////////////
+
+
+/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the
+ *  elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that
+ *  \c *i and \c *j are equivalent: neither one is less than the other. It is not
+ *  guaranteed that the relative order of these two keys or the relative
+ *  order of their corresponding values will be preserved by \p sort_by_key.
+ *
+ *  This version of \p sort_by_key compares key objects using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::sort_by_key(thrust::host, keys, keys + N, values);
+ *  // keys is now   {  1,   2,   4,   5,   7,   8}
+ *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort_by_key
+ *  \see \p sort
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first);
+
+
+/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the
+ *  elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that
+ *  \c *i and \c *j are equivalent: neither one is less than the other. It is not
+ *  guaranteed that the relative order of these two keys or the relative
+ *  order of their corresponding values will be preserved by \p sort_by_key.
+ *
+ *  This version of \p sort_by_key compares key objects using \c operator<.
+ *
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::sort_by_key(keys, keys + N, values);
+ *  // keys is now   {  1,   2,   4,   5,   7,   8}
+ *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort_by_key
+ *  \see \p sort
+ */
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  void sort_by_key(RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first);
+
+
+/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the
+ *  elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that
+ *  \c *i and \c *j are equivalent: neither one is less than the other. It is not
+ *  guaranteed that the relative order of these two keys or the relative
+ *  order of their corresponding values will be preserved by \p sort_by_key.
+ *
+ *  This version of \p sort_by_key compares key objects using a function object
+ *  \c comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys using the \p thrust::host execution policy
+ *  for parallelization.The keys are sorted in descending order using the <tt>greater<int></tt> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::sort_by_key(thrust::host, keys, keys + N, values, thrust::greater<int>());
+ *  // keys is now   {  8,   7,   5,   4,   2,   1}
+ *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort_by_key
+ *  \see \p sort
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp);
+
+
+/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the
+ *  elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that
+ *  \c *i and \c *j are equivalent: neither one is less than the other. It is not
+ *  guaranteed that the relative order of these two keys or the relative
+ *  order of their corresponding values will be preserved by \p sort_by_key.
+ *
+ *  This version of \p sort_by_key compares key objects using a function object
+ *  \c comp.
+ *
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys.  The keys
+ *  are sorted in descending order using the greater<int> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::sort_by_key(keys, keys + N, values, thrust::greater<int>());
+ *  // keys is now   {  8,   7,   5,   4,   2,   1}
+ *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p stable_sort_by_key
+ *  \see \p sort
+ */
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void sort_by_key(RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp);
+
+
+/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key
+ *  sorts the elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  As the name suggests, \p stable_sort_by_key is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[keys_first, keys_last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort_by_key is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort_by_key compares key objects using \c operator<.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_sort_by_key to sort
+ *  an array of characters using integers as sorting keys using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::stable_sort_by_key(thrust::host, keys, keys + N, values);
+ *  // keys is now   {  1,   2,   4,   5,   7,   8}
+ *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void stable_sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first);
+
+
+/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key
+ *  sorts the elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  As the name suggests, \p stable_sort_by_key is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[keys_first, keys_last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort_by_key is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort_by_key compares key objects using \c operator<.
+ *
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
+ *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p stable_sort_by_key to sort
+ *  an array of characters using integers as sorting keys.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::stable_sort_by_key(keys, keys + N, values);
+ *  // keys is now   {  1,   2,   4,   5,   7,   8}
+ *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ */
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  void stable_sort_by_key(RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first);
+
+
+/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key
+ *  sorts the elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  As the name suggests, \p stable_sort_by_key is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[keys_first, keys_last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort_by_key is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort_by_key compares key objects using the function
+ *  object \p comp.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys using the \p thrust::host execution policy for
+ *  parallelization. The keys are sorted in descending order using the <tt>greater<int></tt> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::stable_sort_by_key(thrust::host, keys, keys + N, values, thrust::greater<int>());
+ *  // keys is now   {  8,   7,   5,   4,   2,   1}
+ *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
+ *  \endcode
+ *
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp);
+
+
+/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key
+ *  sorts the elements in <tt>[keys_first, keys_last)</tt> and <tt>[values_first,
+ *  values_first + (keys_last - keys_first))</tt> into ascending key order,
+ *  meaning that if \c i and \c j are any two valid iterators in <tt>[keys_first,
+ *  keys_last)</tt> such that \c i precedes \c j, and \c p and \c q are iterators
+ *  in <tt>[values_first, values_first + (keys_last - keys_first))</tt>
+ *  corresponding to \c i and \c j respectively, then \c *j is not less than
+ *  \c *i.
+ *
+ *  As the name suggests, \p stable_sort_by_key is stable: it preserves the
+ *  relative ordering of equivalent elements. That is, if \c x and \c y
+ *  are elements in <tt>[keys_first, keys_last)</tt> such that \c x precedes \c y,
+ *  and if the two elements are equivalent (neither <tt>x < y</tt> nor
+ *  <tt>y < x</tt>) then a postcondition of \p stable_sort_by_key is that \c x
+ *  still precedes \c y.
+ *
+ *  This version of \p stable_sort_by_key compares key objects using the function
+ *  object \p comp.
+ *
+ *  \param keys_first The beginning of the key sequence.
+ *  \param keys_last The end of the key sequence.
+ *  \param values_first The beginning of the value sequence.
+ *  \param comp Comparison operator.
+ *
+ *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *          \p RandomAccessIterator1 is mutable,
+ *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
+ *          \c first_argument_type and \c second_argument_type.
+ *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          and \p RandomAccessIterator2 is mutable.
+ *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
+ *
+ *  The following code snippet demonstrates how to use \p sort_by_key to sort
+ *  an array of character values using integers as sorting keys.  The keys
+ *  are sorted in descending order using the greater<int> comparison operator.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  ...
+ *  const int N = 6;
+ *  int    keys[N] = {  1,   4,   2,   8,   5,   7};
+ *  char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
+ *  thrust::stable_sort_by_key(keys, keys + N, values, thrust::greater<int>());
+ *  // keys is now   {  8,   7,   5,   4,   2,   1}
+ *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
+ *  \endcode
+ *
+ *
+ *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ */
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void stable_sort_by_key(RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp);
+
+
+/*! \} // end sorting
+ */
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup predicates
+ *  \{
+ */
+
+
+/*! \p is_sorted returns \c true if the range <tt>[first, last)</tt> is
+ *  sorted in ascending order, and \c false otherwise.
+ *
+ *  Specifically, this version of \p is_sorted returns \c false if for
+ *  some iterator \c i in the range <tt>[first, last - 1)</tt> the
+ *  expression <tt>*(i + 1) < *i</tt> is \c true.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return \c true, if the sequence is sorted; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
+ *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *
+ *  The following code demonstrates how to use \p is_sorted to test whether the
+ *  contents of a \c device_vector are stored in ascending order using the \p thrust::device execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> v(6);
+ *  v[0] = 1;
+ *  v[1] = 4;
+ *  v[2] = 2;
+ *  v[3] = 8;
+ *  v[4] = 5;
+ *  v[5] = 7;
+ *
+ *  bool result = thrust::is_sorted(thrust::device, v.begin(), v.end());
+ *
+ *  // result == false
+ *
+ *  thrust::sort(v.begin(), v.end());
+ *  result = thrust::is_sorted(thrust::device, v.begin(), v.end());
+ *
+ *  // result == true
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see is_sorted_until
+ *  \see \c sort
+ *  \see \c stable_sort
+ *  \see \c less<T>
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  bool is_sorted(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+/*! \p is_sorted returns \c true if the range <tt>[first, last)</tt> is
+ *  sorted in ascending order, and \c false otherwise.
+ *
+ *  Specifically, this version of \p is_sorted returns \c false if for
+ *  some iterator \c i in the range <tt>[first, last - 1)</tt> the
+ *  expression <tt>*(i + 1) < *i</tt> is \c true.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \return \c true, if the sequence is sorted; \c false, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
+ *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *
+ *
+ *  The following code demonstrates how to use \p is_sorted to test whether the
+ *  contents of a \c device_vector are stored in ascending order.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/sort.h>
+ *  ...
+ *  thrust::device_vector<int> v(6);
+ *  v[0] = 1;
+ *  v[1] = 4;
+ *  v[2] = 2;
+ *  v[3] = 8;
+ *  v[4] = 5;
+ *  v[5] = 7;
+ *
+ *  bool result = thrust::is_sorted(v.begin(), v.end());
+ *
+ *  // result == false
+ *
+ *  thrust::sort(v.begin(), v.end());
+ *  result = thrust::is_sorted(v.begin(), v.end());
+ *
+ *  // result == true
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see is_sorted_until
+ *  \see \c sort
+ *  \see \c stable_sort
+ *  \see \c less<T>
+ */
+template<typename ForwardIterator>
+  bool is_sorted(ForwardIterator first,
+                 ForwardIterator last);
+
+
+/*! \p is_sorted returns \c true if the range <tt>[first, last)</tt> is sorted in ascending 
+ *  order accoring to a user-defined comparison operation, and \c false otherwise.
+ *
+ *  Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in
+ *  the range <tt>[first, last - 1)</tt> the expression <tt>comp(*(i + 1), *i)</tt> is \c true.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp  Comparison operator.
+ *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
+ *          and \c second_argument_type.
+ *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted to test whether the
+ *  contents of a \c device_vector are stored in descending order using the \p thrust::device execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> v(6);
+ *  v[0] = 1;
+ *  v[1] = 4;
+ *  v[2] = 2;
+ *  v[3] = 8;
+ *  v[4] = 5;
+ *  v[5] = 7;
+ *
+ *  thrust::greater<int> comp;
+ *  bool result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp);
+ *
+ *  // result == false
+ *
+ *  thrust::sort(v.begin(), v.end(), comp);
+ *  result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp);
+ *
+ *  // result == true
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see \c sort
+ *  \see \c stable_sort
+ *  \see \c less<T>
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename Compare>
+__host__ __device__
+  bool is_sorted(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp);
+
+
+/*! \p is_sorted returns \c true if the range <tt>[first, last)</tt> is sorted in ascending 
+ *  order accoring to a user-defined comparison operation, and \c false otherwise.
+ *
+ *  Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in
+ *  the range <tt>[first, last - 1)</tt> the expression <tt>comp(*(i + 1), *i)</tt> is \c true.
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last  The end of the sequence.
+ *  \param comp  Comparison operator.
+ *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
+ *          and \c second_argument_type.
+ *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted to test whether the
+ *  contents of a \c device_vector are stored in descending order.
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> v(6);
+ *  v[0] = 1;
+ *  v[1] = 4;
+ *  v[2] = 2;
+ *  v[3] = 8;
+ *  v[4] = 5;
+ *  v[5] = 7;
+ *
+ *  thrust::greater<int> comp;
+ *  bool result = thrust::is_sorted(v.begin(), v.end(), comp);
+ *
+ *  // result == false
+ *
+ *  thrust::sort(v.begin(), v.end(), comp);
+ *  result = thrust::is_sorted(v.begin(), v.end(), comp);
+ *
+ *  // result == true
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see \c sort
+ *  \see \c stable_sort
+ *  \see \c less<T>
+ */
+template<typename ForwardIterator, typename Compare>
+  bool is_sorted(ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp);
+
+
+/*! This version of \p is_sorted_until returns the last iterator \c i in <tt>[first,last]</tt> for
+ *  which the range <tt>[first,last)</tt> is sorted using \c operator<. If <tt>distance(first,last) < 2</tt>,
+ *  \p is_sorted_until simply returns \p last.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \return The last iterator in the input range for which it is sorted.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
+ *  in an array where the data becomes unsorted using the \p thrust::host execution policy for
+ *  parallelization:
+ *  
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  ...
+ *   
+ *  int A[8] = {0, 1, 2, 3, 0, 1, 2, 3};
+ *  
+ *  int * B = thrust::is_sorted_until(thrust::host, A, A + 8);
+ *  
+ *  // B - A is 4
+ *  // [A, B) is sorted
+ *  \endcode
+ *
+ *  \see \p is_sorted
+ *  \see \p sort
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last);
+
+
+/*! This version of \p is_sorted_until returns the last iterator \c i in <tt>[first,last]</tt> for
+ *  which the range <tt>[first,last)</tt> is sorted using \c operator<. If <tt>distance(first,last) < 2</tt>,
+ *  \p is_sorted_until simply returns \p last.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \return The last iterator in the input range for which it is sorted.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
+ *  in an array where the data becomes unsorted:
+ *  
+ *  \code
+ *  #include <thrust/sort.h>
+ *
+ *  ...
+ *   
+ *  int A[8] = {0, 1, 2, 3, 0, 1, 2, 3};
+ *  
+ *  int * B = thrust::is_sorted_until(A, A + 8);
+ *  
+ *  // B - A is 4
+ *  // [A, B) is sorted
+ *  \endcode
+ *
+ *  \see \p is_sorted
+ *  \see \p sort
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename ForwardIterator>
+  ForwardIterator is_sorted_until(ForwardIterator first,
+                                  ForwardIterator last);
+
+
+/*! This version of \p is_sorted_until returns the last iterator \c i in <tt>[first,last]</tt> for
+ *  which the range <tt>[first,last)</tt> is sorted using the function object \c comp. If <tt>distance(first,last) < 2</tt>,
+ *  \p is_sorted_until simply returns \p last.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization:
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param comp The function object to use for comparison.
+ *  \return The last iterator in the input range for which it is sorted.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
+ *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
+ *  in an array where the data becomes unsorted in descending order using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  ...
+ *   
+ *  int A[8] = {3, 2, 1, 0, 3, 2, 1, 0};
+ *  
+ *  thrust::greater<int> comp;
+ *  int * B = thrust::is_sorted_until(thrust::host, A, A + 8, comp);
+ *  
+ *  // B - A is 4
+ *  // [A, B) is sorted in descending order
+ *  \endcode
+ *
+ *  \see \p is_sorted
+ *  \see \p sort
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename Compare>
+__host__ __device__
+  ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp);
+
+
+/*! This version of \p is_sorted_until returns the last iterator \c i in <tt>[first,last]</tt> for
+ *  which the range <tt>[first,last)</tt> is sorted using the function object \c comp. If <tt>distance(first,last) < 2</tt>,
+ *  \p is_sorted_until simply returns \p last.
+ *
+ *  \param first The beginning of the range of interest.
+ *  \param last The end of the range of interest.
+ *  \param comp The function object to use for comparison.
+ *  \return The last iterator in the input range for which it is sorted.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
+ *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *
+ *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
+ *  in an array where the data becomes unsorted in descending order:
+ *
+ *  \code
+ *  #include <thrust/sort.h>
+ *  #include <thrust/functional.h>
+ *
+ *  ...
+ *   
+ *  int A[8] = {3, 2, 1, 0, 3, 2, 1, 0};
+ *  
+ *  thrust::greater<int> comp;
+ *  int * B = thrust::is_sorted_until(A, A + 8, comp);
+ *  
+ *  // B - A is 4
+ *  // [A, B) is sorted in descending order
+ *  \endcode
+ *
+ *  \see \p is_sorted
+ *  \see \p sort
+ *  \see \p sort_by_key
+ *  \see \p stable_sort
+ *  \see \p stable_sort_by_key
+ */
+template<typename ForwardIterator, typename Compare>
+  ForwardIterator is_sorted_until(ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp);
+
+
+/*! \} // end predicates
+ *  \} // end reductions
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/sort.inl>
+
diff --git a/thrust/thrust/swap.h b/thrust/thrust/swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..246e8438781e91ecbf8dc51b36f279d9d58b96b1
--- /dev/null
+++ b/thrust/thrust/swap.h
@@ -0,0 +1,191 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file swap.h
+ *  \brief Functions for swapping the value of elements
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+// empty Doxygen comment below so namespace thrust's documentation will be extracted
+
+/*!
+ */
+namespace thrust
+{
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup swap
+ *  \{
+ */
+
+/*! \p swap assigns the contents of \c a to \c b and the
+ *  contents of \c b to \c a. This is used as a primitive operation
+ *  by many other algorithms.
+ *  
+ *  \param a The first value of interest. After completion,
+ *           the value of b will be returned here.
+ *  \param b The second value of interest. After completion,
+ *           the value of a will be returned here.
+ *
+ *  \tparam Assignable is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p swap to
+ *  swap the contents of two variables.
+ *
+ *  \code
+ *  #include <thrust/swap.h>
+ *  ...
+ *  int x = 1;
+ *  int y = 2;
+ *  thrust::swap(x,h);
+ *
+ *  // x == 2, y == 1
+ *  \endcode
+ */
+template<typename Assignable1, typename Assignable2>
+__host__ __device__ 
+inline void swap(Assignable1 &a, Assignable2 &b);
+
+/*! \} // swap
+ */
+
+/*! \} // utility
+ */
+
+
+/*! \addtogroup copying
+ *  \{
+ */
+
+
+/*! \p swap_ranges swaps each of the elements in the range <tt>[first1, last1)</tt>
+ *  with the corresponding element in the range <tt>[first2, first2 + (last1 - first1))</tt>.
+ *  That is, for each integer \c n such that <tt>0 <= n < (last1 - first1)</tt>, it swaps
+ *  <tt>*(first1 + n)</tt> and <tt>*(first2 + n)</tt>. The return value is
+ *  <tt>first2 + (last1 - first1)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first sequence to swap.
+ *  \param last1 One position past the last element of the first sequence to swap.
+ *  \param first2 The beginning of the second sequence to swap.
+ *  \return An iterator pointing to one position past the last element of the second
+ *          sequence to swap.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
+ *
+ *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p swap_ranges to
+ *  swap the contents of two \c thrust::device_vectors using the \p thrust::device execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/swap.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> v1(2), v2(2);
+ *  v1[0] = 1;
+ *  v1[1] = 2;
+ *  v2[0] = 3;
+ *  v2[1] = 4;
+ *
+ *  thrust::swap_ranges(thrust::device, v1.begin(), v1.end(), v2.begin());
+ *
+ *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see \c swap
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2);
+
+
+/*! \p swap_ranges swaps each of the elements in the range <tt>[first1, last1)</tt>
+ *  with the corresponding element in the range <tt>[first2, first2 + (last1 - first1))</tt>.
+ *  That is, for each integer \c n such that <tt>0 <= n < (last1 - first1)</tt>, it swaps
+ *  <tt>*(first1 + n)</tt> and <tt>*(first2 + n)</tt>. The return value is
+ *  <tt>first2 + (last1 - first1)</tt>.
+ *
+ *  \param first1 The beginning of the first sequence to swap.
+ *  \param last1 One position past the last element of the first sequence to swap.
+ *  \param first2 The beginning of the second sequence to swap.
+ *  \return An iterator pointing to one position past the last element of the second
+ *          sequence to swap.
+ *
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
+ *
+ *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p swap_ranges to
+ *  swap the contents of two \c thrust::device_vectors.
+ *
+ *  \code
+ *  #include <thrust/swap.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> v1(2), v2(2);
+ *  v1[0] = 1;
+ *  v1[1] = 2;
+ *  v2[0] = 3;
+ *  v2[1] = 4;
+ *
+ *  thrust::swap_ranges(v1.begin(), v1.end(), v2.begin());
+ *
+ *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see \c swap
+ */
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+  ForwardIterator2 swap_ranges(ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2);
+
+
+/*! \} // copying
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/swap.inl>
+
diff --git a/thrust/thrust/system/cpp/detail/adjacent_difference.h b/thrust/thrust/system/cpp/detail/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..b82242c7c0798b58c6d2c2d3da12770dd373d562
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/adjacent_difference.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits adjacent_difference
+#include <thrust/system/detail/sequential/adjacent_difference.h>
+
diff --git a/thrust/thrust/system/cpp/detail/assign_value.h b/thrust/thrust/system/cpp/detail/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5f14bd1636df4a18e36906fb34842a1efc56f66
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/assign_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits assign_value
+#include <thrust/system/detail/sequential/assign_value.h>
+
diff --git a/thrust/thrust/system/cpp/detail/binary_search.h b/thrust/thrust/system/cpp/detail/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2c33f32aad3991ba000f095b20e635a0844f718
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/binary_search.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/system/cpp/detail/execution_policy.h>
+
+// this system inherits the binary search algorithms
+#include <thrust/system/detail/sequential/binary_search.h>
+
diff --git a/thrust/thrust/system/cpp/detail/copy.h b/thrust/thrust/system/cpp/detail/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2f5b1bd45cd777827137f2d7e9b885ec9fd9c34
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/copy.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits copy
+#include <thrust/system/detail/sequential/copy.h>
+
diff --git a/thrust/thrust/system/cpp/detail/copy_if.h b/thrust/thrust/system/cpp/detail/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..10869e2a90215ecf1045b36882180acf3e791981
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/copy_if.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits copy_if
+#include <thrust/system/detail/sequential/copy_if.h>
+
diff --git a/thrust/thrust/system/cpp/detail/count.h b/thrust/thrust/system/cpp/detail/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/count.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/equal.h b/thrust/thrust/system/cpp/detail/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/equal.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/execution_policy.h b/thrust/thrust/system/cpp/detail/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..27e4db86264ba8c08ab34499565758f1bbce9bb9
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/execution_policy.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+// put the canonical tag in the same ns as the backend's entry points
+namespace cpp
+{
+namespace detail
+{
+
+// this awkward sequence of definitions arise
+// from the desire both for tag to derive
+// from execution_policy and for execution_policy
+// to convert to tag (when execution_policy is not
+// an ancestor of tag)
+
+// forward declaration of tag
+struct tag;
+
+// forward declaration of execution_policy
+template<typename> struct execution_policy;
+
+// specialize execution_policy for tag
+template<>
+  struct execution_policy<tag>
+    : thrust::system::detail::sequential::execution_policy<tag>
+{};
+
+// tag's definition comes before the
+// generic definition of execution_policy
+struct tag : execution_policy<tag> {};
+
+// allow conversion to tag when it is not a successor
+template<typename Derived>
+  struct execution_policy
+    : thrust::system::detail::sequential::execution_policy<Derived>
+{
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
+};
+
+} // end detail
+
+// alias execution_policy and tag here
+using thrust::system::cpp::detail::execution_policy;
+using thrust::system::cpp::detail::tag;
+
+} // end cpp
+} // end system
+
+// alias items at top-level
+namespace cpp
+{
+
+using thrust::system::cpp::execution_policy;
+using thrust::system::cpp::tag;
+
+} // end cpp
+} // end thrust
+
diff --git a/thrust/thrust/system/cpp/detail/extrema.h b/thrust/thrust/system/cpp/detail/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fbb8c55c287cb15330d991376c3c10d75829f25
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/extrema.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits extrema algorithms
+#include <thrust/system/detail/sequential/extrema.h>
+
diff --git a/thrust/thrust/system/cpp/detail/fill.h b/thrust/thrust/system/cpp/detail/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/fill.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/find.h b/thrust/thrust/system/cpp/detail/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c0dafc8ceea556c99666b7cdf22d07f5b458bd
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/find.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits find
+#include <thrust/system/detail/sequential/find.h>
+
diff --git a/thrust/thrust/system/cpp/detail/for_each.h b/thrust/thrust/system/cpp/detail/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ef45dfe62e47d9779c4b60839628efd82c2a5e1
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/for_each.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits for_each
+#include <thrust/system/detail/sequential/for_each.h>
+
diff --git a/thrust/thrust/system/cpp/detail/gather.h b/thrust/thrust/system/cpp/detail/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/gather.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/generate.h b/thrust/thrust/system/cpp/detail/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/generate.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/get_value.h b/thrust/thrust/system/cpp/detail/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..915001d37f4dba8a6173df49f635b50f88ef162d
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/get_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits get_value
+#include <thrust/system/detail/sequential/get_value.h>
+
diff --git a/thrust/thrust/system/cpp/detail/inner_product.h b/thrust/thrust/system/cpp/detail/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/inner_product.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/iter_swap.h b/thrust/thrust/system/cpp/detail/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a982becb3949b2229bbf4bbf3fcc40f20f34253
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/iter_swap.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits iter_swap
+#include <thrust/system/detail/sequential/iter_swap.h>
+
diff --git a/thrust/thrust/system/cpp/detail/logical.h b/thrust/thrust/system/cpp/detail/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/logical.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/malloc_and_free.h b/thrust/thrust/system/cpp/detail/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1ad7a7341bf701b7f333033059c18b98096c2d7
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/malloc_and_free.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits malloc & free
+#include <thrust/system/detail/sequential/malloc_and_free.h>
+
diff --git a/thrust/thrust/system/cpp/detail/memory.inl b/thrust/thrust/system/cpp/detail/memory.inl
new file mode 100644
index 0000000000000000000000000000000000000000..bbb0bab78bcac2ec7843ee892c63b0b9f04cb6f8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/memory.inl
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/memory.h>
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+#include <limits>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+
+pointer<void> malloc(std::size_t n)
+{
+  tag t;
+  return pointer<void>(thrust::system::detail::sequential::malloc(t, n));
+} // end malloc()
+
+template<typename T>
+pointer<T> malloc(std::size_t n)
+{
+  pointer<void> raw_ptr = thrust::system::cpp::malloc(sizeof(T) * n);
+  return pointer<T>(reinterpret_cast<T*>(raw_ptr.get()));
+} // end malloc()
+
+void free(pointer<void> ptr)
+{
+  tag t;
+  return thrust::system::detail::sequential::free(t, ptr);
+} // end free()
+
+} // end cpp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/cpp/detail/merge.h b/thrust/thrust/system/cpp/detail/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..20e17f2d40dd218d2e7b22a7744321f75ea6ab0c
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/merge.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits merge
+#include <thrust/system/detail/sequential/merge.h>
+
diff --git a/thrust/thrust/system/cpp/detail/mismatch.h b/thrust/thrust/system/cpp/detail/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/mismatch.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/par.h b/thrust/thrust/system/cpp/detail/par.h
new file mode 100644
index 0000000000000000000000000000000000000000..740c39e8b992f2071488079da19b013de762b9d3
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/par.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+namespace detail
+{
+
+
+struct par_t : thrust::system::cpp::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::cpp::detail::execution_policy>
+{
+  __host__ __device__
+  THRUST_CONSTEXPR par_t() : thrust::system::cpp::detail::execution_policy<par_t>() {}
+};
+
+
+} // end detail
+
+
+THRUST_INLINE_CONSTANT detail::par_t par;
+
+
+} // end cpp
+} // end system
+
+
+// alias par here
+namespace cpp
+{
+
+
+using thrust::system::cpp::par;
+
+
+} // end cpp
+} // end thrust
+
diff --git a/thrust/thrust/system/cpp/detail/partition.h b/thrust/thrust/system/cpp/detail/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..50d48e222aa53424d0fcdf8b2bc3e8bf4a9f4e54
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/partition.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits partition
+#include <thrust/system/detail/sequential/partition.h>
+
diff --git a/thrust/thrust/system/cpp/detail/per_device_resource.h b/thrust/thrust/system/cpp/detail/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d61f92169e0e09c3821e59218f0dcbb70cbe5
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/thrust/system/cpp/detail/pointer.inl b/thrust/thrust/system/cpp/detail/pointer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7d9de3e55af42dc538ece5dcba8d4bb0a3993623
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/pointer.inl
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+namespace thrust
+{
+
+// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
+//     pointer_raw_pointer for pointer by specializing it here
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+namespace detail
+{
+
+template<typename T>
+  struct pointer_raw_pointer< thrust::cpp::pointer<T> >
+{
+  typedef typename thrust::cpp::pointer<T>::raw_pointer type;
+}; // end pointer_raw_pointer
+
+} // end detail
+#endif
+
+namespace system
+{
+namespace cpp
+{
+
+template<typename T>
+  template<typename OtherT>
+    reference<T> &
+      reference<T>
+        ::operator=(const reference<OtherT> &other)
+{
+  return super_t::operator=(other);
+} // end reference::operator=()
+
+template<typename T>
+  reference<T> &
+    reference<T>
+      ::operator=(const value_type &x)
+{
+  return super_t::operator=(x);
+} // end reference::operator=()
+
+template<typename T>
+__host__ __device__
+void swap(reference<T> a, reference<T> b)
+{
+  a.swap(b);
+} // end swap()
+
+} // end cpp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/cpp/detail/reduce.h b/thrust/thrust/system/cpp/detail/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..e09652cd9fd6a30fd1673d1bbd33313b23450a3f
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/reduce.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits reduce
+#include <thrust/system/detail/sequential/reduce.h>
+
diff --git a/thrust/thrust/system/cpp/detail/reduce_by_key.h b/thrust/thrust/system/cpp/detail/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2c7744249bdd644f2c8bf8e4d8b86bb583ac332
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/reduce_by_key.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits reduce_by_key
+#include <thrust/system/detail/sequential/reduce_by_key.h>
+
diff --git a/thrust/thrust/system/cpp/detail/remove.h b/thrust/thrust/system/cpp/detail/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..a529f625d6a206ec684f5c08f9fd8c199e5fcba4
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/remove.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits remove
+#include <thrust/system/detail/sequential/remove.h>
+
diff --git a/thrust/thrust/system/cpp/detail/replace.h b/thrust/thrust/system/cpp/detail/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/replace.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/reverse.h b/thrust/thrust/system/cpp/detail/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/reverse.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/scan.h b/thrust/thrust/system/cpp/detail/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d38e648437322d078d49c9412ab9532b7cc8b69
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/scan.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits scan
+#include <thrust/system/detail/sequential/scan.h>
+
diff --git a/thrust/thrust/system/cpp/detail/scan_by_key.h b/thrust/thrust/system/cpp/detail/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e72e7c38f868bb83bc835c3d627751c54e2cd15
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/scan_by_key.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits the scan_by_key algorithms
+#include <thrust/system/detail/sequential/scan_by_key.h>
+
diff --git a/thrust/thrust/system/cpp/detail/scatter.h b/thrust/thrust/system/cpp/detail/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/scatter.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/sequence.h b/thrust/thrust/system/cpp/detail/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/sequence.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/set_operations.h b/thrust/thrust/system/cpp/detail/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbde20114c2c8348ceff8dfb226f7e5ed71cc026
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/set_operations.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits the set operations
+#include <thrust/system/detail/sequential/set_operations.h>
+
diff --git a/thrust/thrust/system/cpp/detail/sort.h b/thrust/thrust/system/cpp/detail/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae38b3ba8c7854eafc92fe9f35ff7d3220a02c20
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/sort.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits sort
+#include <thrust/system/detail/sequential/sort.h>
+
diff --git a/thrust/thrust/system/cpp/detail/swap_ranges.h b/thrust/thrust/system/cpp/detail/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..35cd5462207a0a9d02cdfad9be9730f74a511e82
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/swap_ranges.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// cpp has no special swap_ranges
+
diff --git a/thrust/thrust/system/cpp/detail/tabulate.h b/thrust/thrust/system/cpp/detail/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/tabulate.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/temporary_buffer.h b/thrust/thrust/system/cpp/detail/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2adfaf2810c67462e41f271e43ad0aff9cfbf75f
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/temporary_buffer.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special temporary buffer functions
+
diff --git a/thrust/thrust/system/cpp/detail/transform.h b/thrust/thrust/system/cpp/detail/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..895164ce5afbc15c1ceff3a921d3b3765e99f251
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/transform.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// cpp has no special transform
+
diff --git a/thrust/thrust/system/cpp/detail/transform_reduce.h b/thrust/thrust/system/cpp/detail/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/transform_reduce.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/transform_scan.h b/thrust/thrust/system/cpp/detail/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/transform_scan.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/uninitialized_copy.h b/thrust/thrust/system/cpp/detail/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/uninitialized_copy.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/uninitialized_fill.h b/thrust/thrust/system/cpp/detail/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/uninitialized_fill.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cpp/detail/unique.h b/thrust/thrust/system/cpp/detail/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ff23e9d3db060d3284c743a1178b20069026575
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/unique.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits unique
+#include <thrust/system/detail/sequential/unique.h>
+
diff --git a/thrust/thrust/system/cpp/detail/unique_by_key.h b/thrust/thrust/system/cpp/detail/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d40011787cb8eaea25a969c855c1c758a0225e4
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/unique_by_key.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits unique_by_key
+#include <thrust/system/detail/sequential/unique_by_key.h>
+
diff --git a/thrust/thrust/system/cpp/detail/vector.inl b/thrust/thrust/system/cpp/detail/vector.inl
new file mode 100644
index 0000000000000000000000000000000000000000..55a1fa4bac1724eaac30b55ad6346d23b1a1e0f8
--- /dev/null
+++ b/thrust/thrust/system/cpp/detail/vector.inl
@@ -0,0 +1,126 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/vector.h>
+#include <utility>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+
+template<typename T, typename Allocator>
+  vector<T,Allocator>
+    ::vector()
+      : super_t()
+{}
+
+template<typename T, typename Allocator>
+  vector<T,Allocator>
+    ::vector(size_type n)
+      : super_t(n)
+{}
+
+template<typename T, typename Allocator>
+  vector<T,Allocator>
+    ::vector(size_type n, const value_type &value)
+      : super_t(n,value)
+{}
+
+template<typename T, typename Allocator>
+  vector<T,Allocator>
+    ::vector(const vector &x)
+      : super_t(x)
+{}
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(vector &&x)
+        : super_t(std::move(x))
+  {}
+#endif
+
+template<typename T, typename Allocator>
+  template<typename OtherT, typename OtherAllocator>
+    vector<T,Allocator>
+      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
+        : super_t(x)
+{}
+
+template<typename T, typename Allocator>
+  template<typename OtherT, typename OtherAllocator>
+    vector<T,Allocator>
+      ::vector(const std::vector<OtherT,OtherAllocator> &x)
+        : super_t(x)
+{}
+
+template<typename T, typename Allocator>
+  template<typename InputIterator>
+    vector<T,Allocator>
+      ::vector(InputIterator first, InputIterator last)
+        : super_t(first,last)
+{}
+
+template<typename T, typename Allocator>
+  vector<T,Allocator> &
+    vector<T,Allocator>
+      ::operator=(const vector &x)
+{
+  super_t::operator=(x);
+  return *this;
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Allocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(vector &&x)
+  {
+    super_t::operator=(std::move(x));
+    return *this;
+  }
+#endif
+
+template<typename T, typename Allocator>
+  template<typename OtherT, typename OtherAllocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
+{
+  super_t::operator=(x);
+  return *this;
+}
+
+template<typename T, typename Allocator>
+  template<typename OtherT, typename OtherAllocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
+{
+  super_t::operator=(x);
+  return *this;
+}
+      
+} // end cpp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/cpp/execution_policy.h b/thrust/thrust/system/cpp/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bf521be348f834fe71f0a754425a9c2438a1526
--- /dev/null
+++ b/thrust/thrust/system/cpp/execution_policy.h
@@ -0,0 +1,157 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+/*! \file thrust/system/cpp/execution_policy.h
+ *  \brief Execution policies for Thrust's standard C++ system.
+ */
+
+#include <thrust/detail/config.h>
+
+// get the execution policies definitions first
+#include <thrust/system/cpp/detail/execution_policy.h>
+
+// get the definition of par
+#include <thrust/system/cpp/detail/par.h>
+
+// now get all the algorithm definitions
+
+#include <thrust/system/cpp/detail/adjacent_difference.h>
+#include <thrust/system/cpp/detail/assign_value.h>
+#include <thrust/system/cpp/detail/binary_search.h>
+#include <thrust/system/cpp/detail/copy.h>
+#include <thrust/system/cpp/detail/copy_if.h>
+#include <thrust/system/cpp/detail/count.h>
+#include <thrust/system/cpp/detail/equal.h>
+#include <thrust/system/cpp/detail/extrema.h>
+#include <thrust/system/cpp/detail/fill.h>
+#include <thrust/system/cpp/detail/find.h>
+#include <thrust/system/cpp/detail/for_each.h>
+#include <thrust/system/cpp/detail/gather.h>
+#include <thrust/system/cpp/detail/generate.h>
+#include <thrust/system/cpp/detail/get_value.h>
+#include <thrust/system/cpp/detail/inner_product.h>
+#include <thrust/system/cpp/detail/iter_swap.h>
+#include <thrust/system/cpp/detail/logical.h>
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+#include <thrust/system/cpp/detail/merge.h>
+#include <thrust/system/cpp/detail/mismatch.h>
+#include <thrust/system/cpp/detail/partition.h>
+#include <thrust/system/cpp/detail/reduce.h>
+#include <thrust/system/cpp/detail/reduce_by_key.h>
+#include <thrust/system/cpp/detail/remove.h>
+#include <thrust/system/cpp/detail/replace.h>
+#include <thrust/system/cpp/detail/reverse.h>
+#include <thrust/system/cpp/detail/scan.h>
+#include <thrust/system/cpp/detail/scan_by_key.h>
+#include <thrust/system/cpp/detail/scatter.h>
+#include <thrust/system/cpp/detail/sequence.h>
+#include <thrust/system/cpp/detail/set_operations.h>
+#include <thrust/system/cpp/detail/sort.h>
+#include <thrust/system/cpp/detail/swap_ranges.h>
+#include <thrust/system/cpp/detail/tabulate.h>
+#include <thrust/system/cpp/detail/transform.h>
+#include <thrust/system/cpp/detail/transform_reduce.h>
+#include <thrust/system/cpp/detail/transform_scan.h>
+#include <thrust/system/cpp/detail/uninitialized_copy.h>
+#include <thrust/system/cpp/detail/uninitialized_fill.h>
+#include <thrust/system/cpp/detail/unique.h>
+#include <thrust/system/cpp/detail/unique_by_key.h>
+
+
+// define these entities here for the purpose of Doxygenating them
+// they are actually defined elsewhere
+#if 0
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+
+
+/*! \addtogroup execution_policies
+ *  \{
+ */
+
+
+/*! \p thrust::system::cpp::execution_policy is the base class for all Thrust parallel execution
+ *  policies which are derived from Thrust's standard C++ backend system.
+ */
+template<typename DerivedPolicy>
+struct execution_policy : thrust::execution_policy<DerivedPolicy>
+{};
+
+
+/*! \p thrust::system::cpp::tag is a type representing Thrust's standard C++ backend system in C++'s type system.
+ *  Iterators "tagged" with a type which is convertible to \p cpp::tag assert that they may be
+ *  "dispatched" to algorithm implementations in the \p cpp system.
+ */
+struct tag : thrust::system::cpp::execution_policy<tag> { unspecified };
+
+
+/*! 
+ *  \p thrust::system::cpp::par is the parallel execution policy associated with Thrust's standard
+ *  C++ backend system.
+ *
+ *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may
+ *  directly target Thrust's C++ backend system by providing \p thrust::cpp::par as an algorithm
+ *  parameter.
+ *
+ *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such
+ *  as \p thrust::cpp::vector.
+ *
+ *  The type of \p thrust::cpp::par is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::cpp::par to explicitly dispatch an
+ *  invocation of \p thrust::for_each to the standard C++ backend system:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/system/cpp/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  int vec[3];
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::cpp::par, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ */
+static const unspecified par;
+
+
+/*! \}
+ */
+
+
+} // end cpp
+} // end system
+} // end thrust
+#endif
+
+
diff --git a/thrust/thrust/system/cpp/memory.h b/thrust/thrust/system/cpp/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..18b31e758de483d77fc1c84f515e4117575ce852
--- /dev/null
+++ b/thrust/thrust/system/cpp/memory.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cpp/memory.h
+ *  \brief Managing memory associated with Thrust's standard C++ system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/memory_resource.h>
+#include <thrust/memory.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/mr/allocator.h>
+#include <ostream>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+/*! Allocates an area of memory available to Thrust's <tt>cpp</tt> system.
+ *  \param n Number of bytes to allocate.
+ *  \return A <tt>cpp::pointer<void></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>cpp::pointer<void></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>cpp::pointer<void></tt> returned by this function must be
+ *        deallocated with \p cpp::free.
+ *  \see cpp::free
+ *  \see std::malloc
+ */
+inline pointer<void> malloc(std::size_t n);
+
+/*! Allocates a typed area of memory available to Thrust's <tt>cpp</tt> system.
+ *  \param n Number of elements to allocate.
+ *  \return A <tt>cpp::pointer<T></tt> pointing to the beginning of the newly
+ *          allocated elements. A null <tt>cpp::pointer<T></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>cpp::pointer<T></tt> returned by this function must be
+ *        deallocated with \p cpp::free.
+ *  \see cpp::free
+ *  \see std::malloc
+ */
+template<typename T>
+inline pointer<T> malloc(std::size_t n);
+
+/*! Deallocates an area of memory previously allocated by <tt>cpp::malloc</tt>.
+ *  \param ptr A <tt>cpp::pointer<void></tt> pointing to the beginning of an area
+ *         of memory previously allocated with <tt>cpp::malloc</tt>.
+ *  \see cpp::malloc
+ *  \see std::free
+ */
+inline void free(pointer<void> ptr);
+
+/*! \p cpp::allocator is the default allocator used by the \p cpp system's containers such as
+ *  <tt>cpp::vector</tt> if no user-specified allocator is provided. \p cpp::allocator allocates
+ *  (deallocates) storage with \p cpp::malloc (\p cpp::free).
+ */
+template<typename T>
+using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
+
+} // end cpp
+
+} // end system
+
+/*! \namespace thrust::cpp
+ *  \brief \p thrust::cpp is a top-level alias for thrust::system::cpp.
+ */
+namespace cpp
+{
+
+using thrust::system::cpp::malloc;
+using thrust::system::cpp::free;
+using thrust::system::cpp::allocator;
+
+} // end cpp
+
+} // end thrust
+
+#include <thrust/system/cpp/detail/memory.inl>
+
diff --git a/thrust/thrust/system/cpp/memory_resource.h b/thrust/thrust/system/cpp/memory_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..e89fd25fdecc1d0362e5c66c5866fc5eaa78d76c
--- /dev/null
+++ b/thrust/thrust/system/cpp/memory_resource.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cpp/memory_resource.h
+ *  \brief Memory resources for the CPP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/cpp/pointer.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::cpp::pointer<void>
+    > native_resource;
+}
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ */
+
+/*! The memory resource for the CPP system. Uses \p mr::new_delete_resource and tags it with \p cpp::pointer. */
+typedef detail::native_resource memory_resource;
+/*! An alias for \p cpp::memory_resource. */
+typedef detail::native_resource universal_memory_resource;
+/*! An alias for \p cpp::memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \}
+ */
+
+}
+}
+}
diff --git a/thrust/thrust/system/cpp/pointer.h b/thrust/thrust/system/cpp/pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8efeb33c46d2f5034586ac31cdacc8145eed6246
--- /dev/null
+++ b/thrust/thrust/system/cpp/pointer.h
@@ -0,0 +1,351 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace cpp
+{
+
+template<typename> class pointer;
+
+} // end cpp
+} // end system
+} // end thrust
+
+
+/*! \cond
+ */
+
+// specialize thrust::iterator_traits to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type
+// do this before pointer is defined so the specialization is correctly
+// used inside the definition
+namespace thrust
+{
+
+template<typename Element>
+  struct iterator_traits<thrust::system::cpp::pointer<Element> >
+{
+  private:
+    typedef thrust::system::cpp::pointer<Element> ptr;
+
+  public:
+    typedef typename ptr::iterator_category       iterator_category;
+    typedef typename ptr::value_type              value_type;
+    typedef typename ptr::difference_type         difference_type;
+    typedef ptr                                   pointer;
+    typedef typename ptr::reference               reference;
+}; // end iterator_traits
+
+} // end thrust
+
+/*! \endcond
+ */
+
+
+namespace thrust
+{
+namespace system
+{
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system::cpp
+ *  \brief \p thrust::system::cpp is the namespace containing functionality for allocating, manipulating,
+ *         and deallocating memory available to Thrust's standard C++ backend system.
+ *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
+ *         for import convenience but are also aliased in the top-level <tt>thrust::cpp</tt>
+ *         namespace for easy access.
+ *
+ */
+namespace cpp
+{
+
+// forward declaration of reference for pointer
+template<typename Element> class reference;
+
+/*! \cond
+ */
+
+// XXX nvcc + msvc have trouble instantiating reference below
+//     this is a workaround
+namespace detail
+{
+
+template<typename Element>
+  struct reference_msvc_workaround
+{
+  typedef thrust::system::cpp::reference<Element> type;
+}; // end reference_msvc_workaround
+
+} // end detail
+
+/*! \endcond
+ */
+
+
+/*! \p pointer stores a pointer to an object allocated in memory available to the cpp system.
+ *  This type provides type safety when dispatching standard algorithms on ranges resident
+ *  in cpp memory.
+ *
+ *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *
+ *  \p pointer can be created with the function \p cpp::malloc, or by explicitly calling its constructor
+ *  with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
+ *  or the \p raw_pointer_cast function.
+ *
+ *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
+ *  pointed to by \p pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cpp::malloc
+ *  \see cpp::free
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+  class pointer
+    : public thrust::pointer<
+               T,
+               thrust::system::cpp::tag,
+               thrust::system::cpp::reference<T>,
+               thrust::system::cpp::pointer<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::pointer<
+      T,
+      thrust::system::cpp::tag,
+      //thrust::system::cpp::reference<T>,
+      typename detail::reference_msvc_workaround<T>::type,
+      thrust::system::cpp::pointer<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    // note that cpp::pointer's member functions need __host__ __device__
+    // to interoperate with nvcc + iterators' dereference member function
+
+    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
+     */
+    __host__ __device__
+    pointer() : super_t() {}
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer(decltype(nullptr)) : super_t(nullptr) {}
+    #endif
+
+    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
+     *
+     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
+     *         accessible by the \p cpp system.
+     *  \tparam OtherT \p OtherT shall be convertible to \p T.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    explicit pointer(OtherT *ptr) : super_t(ptr) {}
+
+    /*! This constructor allows construction from another pointer-like object with related type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! This constructor allows construction from another pointer-like object with \p void type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::cpp::tag and its element type shall be \p void.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    explicit
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! Assignment operator allows assigning from another pointer-like object with related type.
+     *
+     *  \param other The other pointer-like object to assign from.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer,
+      pointer &
+    >::type
+    operator=(const OtherPointer &other)
+    {
+      return super_t::operator=(other);
+    }
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer& operator=(decltype(nullptr))
+    {
+      super_t::operator=(nullptr);
+      return *this;
+    }
+    #endif
+}; // end pointer
+
+/*! \p reference is a wrapped reference to an object stored in memory available to the \p cpp system.
+ *  \p reference is the type of the result of dereferencing a \p cpp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template<typename T>
+  class reference
+    : public thrust::reference<
+               T,
+               thrust::system::cpp::pointer<T>,
+               thrust::system::cpp::reference<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::reference<
+      T,
+      thrust::system::cpp::pointer<T>,
+      thrust::system::cpp::reference<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    /*! \cond
+     */
+
+    typedef typename super_t::value_type value_type;
+    typedef typename super_t::pointer    pointer;
+
+    /*! \endcond
+     */
+
+    /*! This constructor initializes this \p reference to refer to an object
+     *  pointed to by the given \p pointer. After this \p reference is constructed,
+     *  it shall refer to the object pointed to by \p ptr.
+     *
+     *  \param ptr A \p pointer to copy from.
+     */
+    __host__ __device__
+    explicit reference(const pointer &ptr)
+      : super_t(ptr)
+    {}
+
+    /*! This constructor accepts a const reference to another \p reference of related type.
+     *  After this \p reference is constructed, it shall refer to the same object as \p other.
+     *
+     *  \param other A \p reference to copy from.
+     *  \tparam OtherT The element type of the other \p reference.
+     *
+     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
+     *        from <tt>reference<T></tt>.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    reference(const reference<OtherT> &other,
+              typename thrust::detail::enable_if_convertible<
+                typename reference<OtherT>::pointer,
+                pointer
+              >::type * = 0)
+      : super_t(other)
+    {}
+
+    /*! Copy assignment operator copy assigns from another \p reference of related type.
+     *
+     *  \param other The other \p reference to assign from.
+     *  \return <tt>*this</tt>
+     *  \tparam OtherT The element type of the other \p reference.
+     */
+    template<typename OtherT>
+    reference &operator=(const reference<OtherT> &other);
+
+    /*! Assignment operator assigns from a \p value_type.
+     *
+     *  \param x The \p value_type to assign from.
+     *  \return <tt>*this</tt>
+     */
+    reference &operator=(const value_type &x);
+}; // end reference
+
+/*! Exchanges the values of two objects referred to by \p reference.
+ *  \p x The first \p reference of interest.
+ *  \p y The second \p reference of interest.
+ */
+template<typename T>
+__host__ __device__
+void swap(reference<T> x, reference<T> y);
+
+} // end cpp
+
+/*! \}
+ */
+
+} // end system
+
+namespace cpp
+{
+
+using thrust::system::cpp::pointer;
+using thrust::system::cpp::reference;
+
+} // end cpp
+
+} // end thrust
+
+#include <thrust/system/cpp/detail/pointer.inl>
diff --git a/thrust/thrust/system/cpp/vector.h b/thrust/thrust/system/cpp/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee5cfce6aa8d26a2d6d924361f42bfec99cf8601
--- /dev/null
+++ b/thrust/thrust/system/cpp/vector.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cpp/vector.h
+ *  \brief A dynamically-sizable array of elements which reside in memory available to
+ *         Thrust's standard C++ system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/memory.h>
+#include <thrust/detail/vector_base.h>
+#include <vector>
+
+namespace thrust
+{
+
+// forward declaration of host_vector
+template<typename T, typename Allocator> class host_vector;
+
+namespace system
+{
+namespace cpp
+{
+
+/*! \p cpp::vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p cpp::vector may vary dynamically; memory management is
+ *  automatic. The elements contained in a \p cpp::vector reside in memory
+ *  available to the \p cpp system.
+ *
+ *  \tparam T The element type of the \p cpp::vector.
+ *  \tparam Allocator The allocator type of the \p cpp::vector. Defaults to \p cpp::allocator.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cpp::vector
+ *  \see device_vector
+ */
+template<typename T, typename Allocator = allocator<T> >
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+} // end cpp
+} // end system
+
+// alias system::cpp names at top-level
+namespace cpp
+{
+
+using thrust::system::cpp::vector;
+
+} // end cpp
+
+} // end thrust
diff --git a/thrust/thrust/system/cuda/config.h b/thrust/thrust/system/cuda/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..246f2ccd05f30cb17e596335c2ee8675385b6b76
--- /dev/null
+++ b/thrust/thrust/system/cuda/config.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if defined(__CUDACC__)
+#  if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
+#    define __THRUST_HAS_CUDART__ 1
+#    define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
+#  else
+#    define __THRUST_HAS_CUDART__ 0
+#    define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
+#  endif
+#else
+#  define __THRUST_HAS_CUDART__ 0
+#  define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
+#endif
+
+#ifdef __CUDA_ARCH__
+#define THRUST_DEVICE_CODE
+#endif
+
+#ifdef THRUST_AGENT_ENTRY_NOINLINE
+#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__
+#else
+#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__
+#endif
+
+#define THRUST_DEVICE_FUNCTION __device__ __forceinline__
+#define THRUST_HOST_FUNCTION __host__     __forceinline__
+#define THRUST_FUNCTION __host__ __device__ __forceinline__
+#if 0
+#define THRUST_ARGS(...) __VA_ARGS__
+#define THRUST_STRIP_PARENS(X) X
+#define THRUST_AGENT_ENTRY(ARGS) THRUST_FUNCTION static void entry(THRUST_STRIP_PARENS(THRUST_ARGS ARGS))
+#else
+#define THRUST_AGENT_ENTRY(...) THRUST_AGENT_ENTRY_INLINE_ATTR __device__ static void entry(__VA_ARGS__)
+#endif
+
+#ifdef THRUST_DEBUG_SYNC
+#define THRUST_DEBUG_SYNC_FLAG true
+#else
+#define THRUST_DEBUG_SYNC_FLAG false
+#endif
+
+#define THRUST_CUB_NS_PREFIX namespace thrust {   namespace cuda_cub {
+#define THRUST_CUB_NS_POSTFIX }  }
+
+#ifndef THRUST_IGNORE_CUB_VERSION_CHECK
+#include <thrust/version.h>
+#include <cub/util_namespace.cuh> // This includes <cub/version.cuh> in newer releases.
+#if THRUST_VERSION != CUB_VERSION
+#error The version of CUB in your include path is not compatible with this release of Thrust. CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. Define THRUST_IGNORE_CUB_VERSION_CHECK to ignore this.
+#endif
+#endif
diff --git a/thrust/thrust/system/cuda/detail/adjacent_difference.h b/thrust/thrust/system/cuda/detail/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..648ddba3e9bea6bf2f7b4c7a8b1b8fc330ac1818
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/adjacent_difference.h
@@ -0,0 +1,540 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cub/device/device_select.cuh>
+#include <cub/block/block_adjacent_difference.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/functional.h>
+#include <thrust/distance.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+
+namespace thrust
+{
+
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__ OutputIterator
+adjacent_difference(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator                                               first,
+    InputIterator                                               last,
+    OutputIterator                                              result,
+    BinaryFunction                                              binary_op);
+
+namespace cuda_cub {
+
+namespace __adjacent_difference {
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  };
+
+  template<int INPUT_SIZE, int NOMINAL_4B_ITEMS_PER_THREAD>
+  struct items_per_thread
+  {
+    enum
+    {
+      value = (INPUT_SIZE <= 8)
+                  ? NOMINAL_4B_ITEMS_PER_THREAD
+                  : mpl::min<
+                        int,
+                        NOMINAL_4B_ITEMS_PER_THREAD,
+                        mpl::max<int,
+                                 1,
+                                 ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                                  INPUT_SIZE - 1) /
+                                     INPUT_SIZE>::value>::value
+    };
+  };
+
+  template<class Arch, class T>
+  struct Tuning;
+
+  template <class T>
+  struct Tuning<sm30, T>
+  {
+    enum
+    {
+      INPUT_SIZE                  = sizeof(T),
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = items_per_thread<INPUT_SIZE,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+  template <class T>
+  struct Tuning<sm35, T> : Tuning<sm30,T>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = items_per_thread<Tuning::INPUT_SIZE,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template <class InputIt,
+            class OutputIt,
+            class Size,
+            class BinaryOp>
+  struct AdjacentDifferenceAgent
+  {
+    typedef typename iterator_traits<InputIt>::value_type input_type;
+
+    // XXX output type must be result of BinaryOp(input_type,input_type);
+    typedef input_type output_type;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,input_type>::type
+    {
+      typedef Tuning<Arch,input_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
+      typedef typename core::BlockLoad<PtxPlan, LoadIt>::type     BlockLoad;
+
+      typedef typename core::BlockStore<PtxPlan, OutputIt, input_type>::type
+          BlockStore;
+
+      typedef cub::BlockAdjacentDifference<input_type,
+                                           PtxPlan::BLOCK_THREADS,
+                                           1,
+                                           1,
+                                           Arch::ver>
+          BlockAdjacentDifference;
+
+      union TempStorage
+      {
+        typename BlockAdjacentDifference::TempStorage discontinuity;
+        typename BlockLoad::TempStorage                load;
+        typename BlockStore::TempStorage               store;
+      }; // union TempStorage
+    }; // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::LoadIt      LoadIt;
+    typedef typename ptx_plan::BlockLoad   BlockLoad;
+    typedef typename ptx_plan::BlockStore  BlockStore;
+    typedef typename ptx_plan::BlockAdjacentDifference BlockAdjacentDifference;
+    typedef typename ptx_plan::TempStorage TempStorage;
+
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
+    };
+
+    struct impl
+    {
+
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &temp_storage;
+      LoadIt       load_it;                // iterator to the first element
+      input_type * first_tile_previous;    // iterator to the first element of previous tile value
+      OutputIt     output_it;
+      BinaryOp     binary_op;
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_remaining,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        input_type  input[ITEMS_PER_THREAD];
+        input_type  input_prev[ITEMS_PER_THREAD];
+        output_type output[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoad(temp_storage.load)
+              .Load(load_it + tile_base,
+                    input,
+                    num_remaining,
+                    *(load_it + tile_base));
+        }
+        else
+        {
+          BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
+        }
+
+
+        core::sync_threadblock();
+
+        if (IS_FIRST_TILE)
+        {
+          BlockAdjacentDifference(temp_storage.discontinuity)
+              .FlagHeads(output, input, input_prev, binary_op);
+          if (threadIdx.x == 0)
+            output[0] = input[0];
+        }
+        else
+        {
+          input_type tile_prev_input = first_tile_previous[tile_idx];
+          BlockAdjacentDifference(temp_storage.discontinuity)
+              .FlagHeads(output, input, input_prev, binary_op, tile_prev_input);
+        }
+
+        core::sync_threadblock();
+
+        if (IS_LAST_TILE)
+        {
+          BlockStore(temp_storage.store)
+              .Store(output_it + tile_base, output, num_remaining);
+        }
+        else
+        {
+          BlockStore(temp_storage.store).Store(output_it + tile_base, output);
+        }
+      }
+
+
+      template <bool IS_LAST_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile(int  num_remaining,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          consume_tile_impl<IS_LAST_TILE, true>(num_remaining,
+                                                tile_idx,
+                                                tile_base);
+        }
+        else
+        {
+          consume_tile_impl<IS_LAST_TILE, false>(num_remaining,
+                                                 tile_idx,
+                                                 tile_base);
+        }
+      }
+
+      void THRUST_DEVICE_FUNCTION
+      consume_range(Size num_items)
+      {
+        int  tile_idx      = blockIdx.x;
+        Size tile_base     = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
+        Size num_remaining = num_items - tile_base;
+
+        if (num_remaining > ITEMS_PER_TILE)    // not a last tile
+        {
+          consume_tile<false>(num_remaining, tile_idx, tile_base);
+        }
+        else if (num_remaining > 0)
+        {
+          consume_tile<true>(num_remaining, tile_idx, tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &temp_storage_,
+           InputIt      input_it_,
+           input_type * first_tile_previous_,
+           OutputIt     result_,
+           BinaryOp     binary_op_,
+           Size         num_items)
+          : temp_storage(temp_storage_),
+            load_it(core::make_load_iterator(ptx_plan(), input_it_)),
+            first_tile_previous(first_tile_previous_),
+            output_it(result_),
+            binary_op(binary_op_)
+      {
+        consume_range(num_items);
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(InputIt     first,
+                       input_type *first_element,
+                       OutputIt    result,
+                       BinaryOp    binary_op,
+                       Size        num_items,
+                       char *      shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+      impl(storage, first, first_element, result, binary_op, num_items);
+    }
+  }; // struct AdjacentDifferenceAgent
+
+  template <class InputIt,
+            class OutputIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(InputIt  first,
+                       OutputIt result,
+                       Size     num_tiles,
+                       int      items_per_tile,
+                       char *   /*shmem*/)
+    {
+      int tile_idx  = blockIdx.x * blockDim.x + threadIdx.x;
+      Size tile_base = static_cast<Size>(tile_idx) * items_per_tile;
+      if (tile_base > 0 && tile_idx < num_tiles)
+        result[tile_idx] = first[tile_base - 1];
+    }
+  }; // struct InitAgent
+
+  template <class InputIt,
+            class OutputIt,
+            class BinaryOp,
+            class Size>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *       d_temp_storage,
+            size_t &     temp_storage_bytes,
+            InputIt      first,
+            OutputIt     result,
+            BinaryOp     binary_op,
+            Size         num_items,
+            cudaStream_t stream,
+            bool         debug_sync)
+  {
+    if (num_items == 0)
+      return cudaSuccess;
+
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    cudaError_t status = cudaSuccess;
+
+    typedef AgentLauncher<
+        AdjacentDifferenceAgent<InputIt,
+                                OutputIt,
+                                Size,
+                                BinaryOp> >
+        difference_agent;
+
+    typedef typename iterator_traits<InputIt>::value_type input_type;
+    typedef AgentLauncher<InitAgent<InputIt, input_type *, Size> > init_agent;
+
+    AgentPlan difference_plan = difference_agent::get_plan(stream);
+    AgentPlan init_plan       = init_agent::get_plan();
+
+
+    Size tile_size = difference_plan.items_per_tile;
+    Size num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t tmp1        = num_tiles * sizeof(input_type);
+    size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
+                                           num_tiles);
+
+    size_t allocation_sizes[2] = {tmp1, vshmem_size};
+    void * allocations[2]      = {NULL, NULL};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_bytes,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    input_type *first_tile_previous = (input_type *)allocations[0];
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    init_agent ia(init_plan, num_tiles, stream, "adjacent_difference::init_agent", debug_sync);
+    ia.launch(first, first_tile_previous, num_tiles, tile_size);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    difference_agent da(difference_plan, num_items, stream, vshmem_ptr, "adjacent_difference::difference_agent", debug_sync);
+    da.launch(first,
+              first_tile_previous,
+              result,
+              binary_op,
+              num_items);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename InputIt,
+            typename OutputIt,
+            typename BinaryOp>
+  OutputIt THRUST_RUNTIME_FUNCTION
+  adjacent_difference(execution_policy<Derived>& policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      OutputIt                   result,
+                      BinaryOp                   binary_op)
+  {
+    typedef typename iterator_traits<InputIt>::difference_type size_type;
+
+    size_type    num_items    = thrust::distance(first, last);
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
+        (NULL, storage_size, first, result, binary_op,
+           num_items_fixed, stream, debug_sync));
+    cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
+        (ptr, storage_size, first, result, binary_op,
+           num_items_fixed, stream, debug_sync));
+    cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "adjacent_difference failed to synchronize");
+
+    return result + num_items;
+  }
+
+}    // namespace __adjacent_difference
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class BinaryOp>
+OutputIt __host__ __device__
+adjacent_difference(execution_policy<Derived> &policy,
+                    InputIt                    first,
+                    InputIt                    last,
+                    OutputIt                   result,
+                    BinaryOp                   binary_op)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __adjacent_difference::adjacent_difference(policy,
+        first,
+        last,
+        result,
+        binary_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
+                                      first,
+                                      last,
+                                      result,
+                                      binary_op);
+#endif
+  }
+
+  return ret;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+adjacent_difference(execution_policy<Derived> &policy,
+                    InputIt                    first,
+                    InputIt                    last,
+                    OutputIt                   result)
+{
+  typedef typename iterator_traits<InputIt>::value_type input_type;
+  return cuda_cub::adjacent_difference(policy,
+                                       first,
+                                       last,
+                                       result,
+                                       minus<input_type>());
+}
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+
+//
+#include <thrust/memory.h>
+#include <thrust/adjacent_difference.h>
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/assign_value.h b/thrust/thrust/system/cuda/detail/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6fd987bf3f814f389b01499a06b313517b69733
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/assign_value.h
@@ -0,0 +1,102 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/copy.h>
+
+
+namespace thrust
+{
+namespace cuda_cub {
+
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+inline __host__ __device__
+  void assign_value(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
+{
+  // XXX war nvbugs/881631
+  struct war_nvbugs_881631
+  {
+    __host__ inline static void host_path(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
+    {
+      cuda_cub::copy(exec, src, src + 1, dst);
+    }
+
+    __device__ inline static void device_path(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 dst, Pointer2 src)
+    {
+      *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src);
+    }
+  };
+
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      war_nvbugs_881631::host_path(exec,dst,src);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      war_nvbugs_881631::device_path(exec,dst,src);
+    #endif
+  }
+} // end assign_value()
+
+
+template<typename System1, typename System2, typename Pointer1, typename Pointer2>
+inline __host__ __device__
+  void assign_value(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
+{
+  // XXX war nvbugs/881631
+  struct war_nvbugs_881631
+  {
+    __host__ inline static void host_path(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
+    {
+      // rotate the systems so that they are ordered the same as (src, dst)
+      // for the call to thrust::copy
+      cross_system<System2,System1> rotated_systems = systems.rotate();
+      cuda_cub::copy(rotated_systems, src, src + 1, dst);
+    }
+
+    __device__ inline static void device_path(cross_system<System1,System2> &, Pointer1 dst, Pointer2 src)
+    {
+      // XXX forward the true cuda::execution_policy inside systems here
+      //     instead of materializing a tag
+      thrust::cuda::tag cuda_tag;
+      thrust::cuda_cub::assign_value(cuda_tag, dst, src);
+    }
+  };
+
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      war_nvbugs_881631::host_path(systems,dst,src);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      war_nvbugs_881631::device_path(systems,dst,src);
+    #endif
+  }
+} // end assign_value()
+
+
+
+
+} // end cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/async/copy.h b/thrust/thrust/system/cuda/detail/async/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b317cbb55a3322d2f097bdf6132c683d3e5d353
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/copy.h
@@ -0,0 +1,538 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/async/transform.h>
+#include <thrust/system/cuda/detail/cross_system.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/uninitialized_copy.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+// ContiguousIterator input and output iterators
+// TriviallyCopyable elements
+// Host to device, device to host, device to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    is_indirectly_trivially_relocatable_to<ForwardIt, OutputIt>::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(
+    select_device_system(from_exec, to_exec)
+  );
+
+  using pointer
+    = typename thrust::detail::allocator_traits<decltype(device_alloc)>::
+      template rebind_traits<void>::pointer;
+
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(
+    select_device_system(from_exec, to_exec)
+  );
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(from_exec))
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(to_exec))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        extract_dependencies(
+          std::move(thrust::detail::derived_cast(from_exec))
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(to_exec))
+        )
+      )
+    );
+  }
+
+  // Run copy.
+
+  thrust::cuda_cub::throw_on_error(
+    cudaMemcpyAsync(
+      thrust::raw_pointer_cast(&*output)
+    , thrust::raw_pointer_cast(&*first)
+    , sizeof(T) * n
+    , direction_of_copy(from_exec, to_exec)
+    , e.stream().native_handle()
+    )
+  , "after copy launch"
+  );
+
+  return e;
+}
+
+// Non-ContiguousIterator input or output, or non-TriviallyRelocatable value type
+// Device to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&   to_exec
+, ForwardIt                                   first
+, Size                                        n
+, OutputIt                                    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<
+        is_indirectly_trivially_relocatable_to<ForwardIt, OutputIt>
+      >
+    , decltype(is_device_to_device_copy(from_exec, to_exec))
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  return async_transform_n(
+    select_device_system(from_exec, to_exec)
+  , first, n, output, thrust::identity<T>()
+  );
+}
+
+template <typename OutputIt>
+void async_copy_n_compile_failure_no_cuda_to_non_contiguous_output()
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (negation<is_contiguous_iterator<OutputIt>>::value)
+  , "copying to non-ContiguousIterators in another system from the CUDA system "
+    "is not supported; use `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR(Iterator)` to "
+    "indicate that an iterator points to elements that are contiguous in memory."
+  );
+}
+
+// Non-ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Device to host, host to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<is_contiguous_iterator<OutputIt>>
+    , is_trivially_relocatable_to<
+        typename iterator_traits<ForwardIt>::value_type
+      , typename iterator_traits<OutputIt>::value_type
+      >
+    , disjunction<
+        decltype(is_host_to_device_copy(from_exec, to_exec))
+      , decltype(is_device_to_host_copy(from_exec, to_exec))
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  async_copy_n_compile_failure_no_cuda_to_non_contiguous_output<OutputIt>();
+
+  return {};
+}
+
+// Workaround for MSVC's lack of expression SFINAE and also for an NVCC bug.
+// In NVCC, when two SFINAE-enabled overloads are only distinguishable by a
+// part of a SFINAE condition that is in a `decltype`, NVCC thinks they are the
+// same overload and emits an error.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsH2DCopy = decltype(is_host_to_device_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
+>
+struct is_buffered_trivially_relocatable_host_to_device_copy
+  : thrust::integral_constant<
+      bool
+    ,    !is_contiguous_iterator<ForwardIt>::value
+      && is_contiguous_iterator<OutputIt>::value
+      && is_trivially_relocatable_to<
+            typename iterator_traits<ForwardIt>::value_type
+          , typename iterator_traits<OutputIt>::value_type
+          >::value
+      && IsH2DCopy::value
+    >
+{};
+
+// Non-ContiguousIterator input iterator, ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Host to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy&                               from_exec
+, thrust::cuda::execution_policy<ToPolicy>& to_exec
+, ForwardIt                                 first
+, Size                                      n
+, OutputIt                                  output
+) ->
+  typename std::enable_if<
+    is_buffered_trivially_relocatable_host_to_device_copy<
+      FromPolicy
+    , thrust::cuda::execution_policy<ToPolicy>
+    , ForwardIt, OutputIt
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const host_alloc = get_async_host_allocator(
+    from_exec
+  );
+
+  // Create host-side buffer.
+
+  auto buffer = uninitialized_allocate_unique_n<T>(host_alloc, n);
+
+  auto const buffer_ptr = buffer.get();
+
+  // Copy into host-side buffer.
+
+  // TODO: Switch to an async call once we have async interfaces for host
+  // systems and support for cross system dependencies.
+  uninitialized_copy_n(from_exec, first, n, buffer_ptr);
+
+  // Run device-side copy.
+
+  auto new_to_exec = thrust::detail::derived_cast(to_exec).rebind_after(
+    std::tuple_cat(
+      std::make_tuple(
+        std::move(buffer)
+      )
+    , extract_dependencies(
+        std::move(thrust::detail::derived_cast(from_exec))
+      )
+    , extract_dependencies(
+        std::move(thrust::detail::derived_cast(to_exec))
+      )
+    )
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(to_exec)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_to_exec)
+    )>::value
+  ));
+
+  return async_copy_n(
+    from_exec
+    // TODO: We have to cast back to the right execution_policy class. Ideally,
+    // we should be moving here.
+  , new_to_exec
+  , buffer_ptr
+  , n
+  , output
+  );
+}
+
+// Workaround for MSVC's lack of expression SFINAE and also for an NVCC bug.
+// In NVCC, when two SFINAE-enabled overloads are only distinguishable by a
+// part of a SFINAE condition that is in a `decltype`, NVCC thinks they are the
+// same overload and emits an error.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsD2HCopy = decltype(is_device_to_host_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
+>
+struct is_buffered_trivially_relocatable_device_to_host_copy
+  : thrust::integral_constant<
+      bool
+    ,    !is_contiguous_iterator<ForwardIt>::value
+      && is_contiguous_iterator<OutputIt>::value
+      && is_trivially_relocatable_to<
+            typename iterator_traits<ForwardIt>::value_type
+          , typename iterator_traits<OutputIt>::value_type
+          >::value
+      && IsD2HCopy::value
+    >
+{};
+
+// Non-ContiguousIterator input iterator, ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Device to host
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, ToPolicy&                                   to_exec
+, ForwardIt                                   first
+, Size                                        n
+, OutputIt                                    output
+) ->
+  typename std::enable_if<
+    is_buffered_trivially_relocatable_device_to_host_copy<
+      thrust::cuda::execution_policy<FromPolicy>
+    , ToPolicy
+    , ForwardIt, OutputIt
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(
+    from_exec
+  );
+
+  // Create device-side buffer.
+
+  auto buffer = uninitialized_allocate_unique_n<T>(device_alloc, n);
+
+  auto const buffer_ptr = buffer.get();
+
+  // Run device-side copy.
+
+  auto f0 = async_copy_n(
+    from_exec
+  , from_exec
+  , first
+  , n
+  , buffer_ptr
+  );
+
+  // Run copy back to host.
+
+  auto new_from_exec = thrust::detail::derived_cast(from_exec).rebind_after(
+    std::move(buffer)
+  , std::move(f0)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(from_exec)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_from_exec)
+    )>::value
+  ));
+
+  return async_copy_n(
+    new_from_exec
+  , to_exec
+  , buffer_ptr
+  , n
+  , output
+  );
+}
+
+template <typename InputType, typename OutputType>
+void async_copy_n_compile_failure_non_trivially_relocatable_elements()
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (is_trivially_relocatable_to<OutputType, InputType>::value)
+  , "only sequences of TriviallyRelocatable elements can be copied to and from "
+    "the CUDA system; use `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(T)` to "
+    "indicate that a type can be copied by bitwise (e.g. by `memcpy`)"
+  );
+}
+
+// Non-TriviallyRelocatable value type
+// Host to device, device to host
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<
+        is_trivially_relocatable_to<
+          typename iterator_traits<ForwardIt>::value_type
+        , typename iterator_traits<OutputIt>::value_type
+        >
+      >
+    , disjunction<
+        decltype(is_host_to_device_copy(from_exec, to_exec))
+      , decltype(is_device_to_host_copy(from_exec, to_exec))
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  // TODO: We could do more here with cudaHostRegister.
+
+  async_copy_n_compile_failure_non_trivially_relocatable_elements<
+    typename thrust::iterator_traits<ForwardIt>::value_type
+  , typename std::add_lvalue_reference<
+      typename thrust::iterator_traits<OutputIt>::value_type
+    >::type
+  >();
+
+  return {};
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cuda::execution_policy<FromPolicy>&         from_exec
+, thrust::cpp::execution_policy<ToPolicy>&            to_exec
+, ForwardIt                                           first
+, Sentinel                                            last
+, OutputIt                                            output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cpp::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&  to_exec
+, ForwardIt                                  first
+, Sentinel                                   last
+, OutputIt                                   output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&   to_exec
+, ForwardIt                                   first
+, Sentinel                                    last
+, OutputIt                                    output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/async/customization.h b/thrust/thrust/system/cuda/detail/async/customization.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb52c2cf02ce26f2083e8f39f436deb1a884a0dd
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/customization.h
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/execute_with_allocator.h>
+#include <thrust/system/cuda/memory_resource.h>
+#include <thrust/memory/detail/host_system_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/disjoint_sync_pool.h>
+#include <thrust/mr/sync_pool.h>
+#include <thrust/per_device_resource.h>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+using default_async_host_resource =
+  thrust::mr::synchronized_pool_resource<
+    thrust::host_memory_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_host_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::mr::stateless_resource_allocator<
+    thrust::detail::uint8_t, default_async_host_resource
+  >{}
+)
+
+///////////////////////////////////////////////////////////////////////////////
+
+using default_async_device_resource =
+  thrust::mr::disjoint_synchronized_pool_resource<
+    thrust::system::cuda::memory_resource
+  , thrust::mr::new_delete_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_device_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::per_device_allocator<
+    thrust::detail::uint8_t, default_async_device_resource, par_t
+  >{}
+)
+
+template <typename Allocator, template <typename> class BaseSystem>
+auto get_async_device_allocator(
+  thrust::detail::execute_with_allocator<Allocator, BaseSystem>& exec
+)
+THRUST_RETURNS(exec.get_allocator())
+
+template <typename Allocator, template <typename> class BaseSystem>
+auto get_async_device_allocator(
+  thrust::detail::execute_with_allocator_and_dependencies<
+    Allocator, BaseSystem
+  >& exec
+)
+THRUST_RETURNS(exec.get_allocator())
+
+///////////////////////////////////////////////////////////////////////////////
+
+using default_async_universal_host_pinned_resource =
+  thrust::mr::synchronized_pool_resource<
+    thrust::system::cuda::universal_host_pinned_memory_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_universal_host_pinned_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::mr::stateless_resource_allocator<
+    thrust::detail::uint8_t, default_async_universal_host_pinned_resource
+  >{}
+)
+
+}}} // namespace system::cuda::detail
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/async/for_each.h b/thrust/thrust/system/cuda/detail/async/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..750b7e829b58f26c8cdd2433cac26817afbad6d4
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/for_each.h
@@ -0,0 +1,159 @@
+
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <typename ForwardIt, typename UnaryFunction>
+struct async_for_each_fn
+{
+  ForwardIt first;
+  UnaryFunction f;
+
+  __host__ __device__
+  async_for_each_fn(ForwardIt&& first_, UnaryFunction&& f_)
+    : first(std::move(first_)), f(std::move(f_))
+  {}
+
+  template <typename Index>
+  __host__ __device__
+  void operator()(Index idx)
+  {
+    f(thrust::raw_reference_cast(first[idx]));
+  }
+};
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename UnaryFunction
+>
+auto async_for_each_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  UnaryFunction                    func
+) -> unique_eager_event
+{
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      extract_dependencies(
+        std::move(thrust::detail::derived_cast(policy))
+      )
+    );
+  }
+
+  // Run for_each.
+
+  async_for_each_fn<ForwardIt, UnaryFunction> wrapped(
+    std::move(first), std::move(func)
+  );
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__parallel_for::parallel_for(
+      n, std::move(wrapped), e.stream().native_handle()
+    )
+  , "after for_each launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename UnaryFunction
+>
+auto async_for_each(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  UnaryFunction&&                  func
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_for_each_n(
+    policy, first, distance(first, last), THRUST_FWD(func)
+  )
+);
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/async/reduce.h b/thrust/thrust/system/cuda/detail/async/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..906928b27f3107a72c68b57a6c532abe8e2af254
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/reduce.h
@@ -0,0 +1,350 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Optimize for thrust::plus
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename T, typename BinaryOp
+>
+auto async_reduce_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, T                                init
+, BinaryOp                         op
+) -> unique_eager_future<remove_cvref_t<T>>
+{
+  using U = remove_cvref_t<T>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  using pointer
+    = typename thrust::detail::allocator_traits<decltype(device_alloc)>::
+      template rebind_traits<U>::pointer;
+
+  unique_eager_future_promise_pair<U, pointer> fp;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<U*>(nullptr)
+    , n
+    , op
+    , init
+    , nullptr // Null stream, just for sizing.
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after reduction sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, sizeof(U) + tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+  U* const ret_ptr = thrust::detail::aligned_reinterpret_cast<U*>(
+    raw_pointer_cast(content_ptr)
+  );
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr + sizeof(U))
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    fp = make_dependent_future<U, pointer>(
+      [] (decltype(content) const& c)
+      {
+        return pointer(
+          thrust::detail::aligned_reinterpret_cast<U*>(
+            raw_pointer_cast(c.get())
+          )
+        );
+      }
+    , std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    fp = make_dependent_future<U, pointer>(
+      [] (decltype(content) const& c)
+      {
+        return pointer(
+          thrust::detail::aligned_reinterpret_cast<U*>(
+            raw_pointer_cast(c.get())
+          )
+        );
+      }
+    , std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run reduction.
+
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      tmp_ptr
+    , tmp_size
+    , first
+    , ret_ptr
+    , n
+    , op
+    , init
+    , fp.future.stream().native_handle()
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after reduction launch"
+  );
+
+  return std::move(fp.future);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+>
+auto async_reduce(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Sentinel                         last
+, T                                init
+, BinaryOp                         op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_reduce_n(
+    policy, first, distance(first, last), init, op
+  )
+)
+
+} // cuda_cub
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename OutputIt
+, typename T, typename BinaryOp
+>
+auto async_reduce_into_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, OutputIt                         output
+, T                                init
+, BinaryOp                         op
+) -> unique_eager_event
+{
+  using U = remove_cvref_t<T>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<U*>(nullptr)
+    , n
+    , op
+    , init
+    , nullptr // Null stream, just for sizing.
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after reduction sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run reduction.
+
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      tmp_ptr
+    , tmp_size
+    , first
+    , output
+    , n
+    , op
+    , init
+    , e.stream().native_handle()
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after reduction launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename T, typename BinaryOp
+>
+auto async_reduce_into(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Sentinel                         last
+, OutputIt                         output
+, T                                init
+, BinaryOp                         op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_reduce_into_n(
+    policy, first, distance(first, last), output, init, op
+  )
+)
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/async/sort.h b/thrust/thrust/system/cuda/detail/async/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e357fde691ad27f70058120653ea1bdc0b39e91
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/sort.h
@@ -0,0 +1,522 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/async/copy.h>
+#include <thrust/system/cuda/detail/sort.h>
+#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/is_operator_less_or_greater_function_object.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+// Non-ContiguousIterator input and output iterators
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    negation<is_contiguous_iterator<ForwardIt>>::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  // Create device-side buffer.
+
+  // FIXME: Combine this temporary allocation with the main one for CUB.
+  auto device_buffer = uninitialized_allocate_unique_n<T>(device_alloc, n);
+
+  auto const device_buffer_ptr = device_buffer.get();
+
+  // Synthesize a suitable new execution policy, because we don't want to
+  // try and extract twice from the one we were passed.
+  typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
+
+  // Copy from the input into the buffer.
+
+  auto new_policy0 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(device_buffer)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy0)
+    )>::value
+  ));
+
+  auto f0 = async_copy_n(
+    new_policy0
+  , tag_policy
+  , first
+  , n
+  , device_buffer_ptr
+  );
+
+  // Sort the buffer.
+
+  auto new_policy1 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(f0)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy1)
+    )>::value
+  ));
+
+  auto f1 = async_sort_n(
+    new_policy1
+  , tag_policy
+  , device_buffer_ptr
+  , n
+  , comp
+  );
+
+  // Copy from the buffer into the input.
+  // FIXME: Combine this with the potential memcpy at the end of the main sort
+  // routine.
+
+  auto new_policy2 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(f1)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy2)
+    )>::value
+  ));
+
+  return async_copy_n(
+    new_policy2
+  , tag_policy
+  , device_buffer_ptr
+  , n
+  , first
+  );
+}
+
+// ContiguousIterator iterators
+// Non-Scalar value type or user-defined StrictWeakOrdering
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    conjunction<
+      is_contiguous_iterator<ForwardIt>
+    , disjunction<
+        negation<
+          std::is_scalar<
+            typename iterator_traits<ForwardIt>::value_type
+          >
+        >
+      , negation<
+          is_operator_less_or_greater_function_object<StrictWeakOrdering>
+        >
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__merge_sort::doit_step<
+      /* Sort items? */ std::false_type, /* Stable? */ std::true_type
+    >(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<thrust::detail::uint8_t*>(nullptr) // Items.
+    , n
+    , comp
+    , nullptr // Null stream, just for sizing.
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after merge sort sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run merge sort.
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__merge_sort::doit_step<
+      /* Sort items? */ std::false_type, /* Stable? */ std::true_type
+    >(
+      tmp_ptr
+    , tmp_size
+    , first
+    , static_cast<thrust::detail::uint8_t*>(nullptr) // Items.
+    , n
+    , comp
+    , e.stream().native_handle()
+    , THRUST_DEBUG_SYNC_FLAG
+    )
+  , "after merge sort sizing"
+  );
+
+  return e;
+}
+
+template <typename T, typename Size, typename StrictWeakOrdering>
+typename std::enable_if<
+  is_operator_less_function_object<StrictWeakOrdering>::value
+, cudaError_t
+>::type
+invoke_radix_sort(
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
+, StrictWeakOrdering
+)
+{
+  return cub::DeviceRadixSort::SortKeys(
+    tmp_ptr
+  , tmp_size
+  , keys
+  , n
+  , 0
+  , sizeof(T) * 8
+  , stream
+  , THRUST_DEBUG_SYNC_FLAG
+  );
+}
+
+template <typename T, typename Size, typename StrictWeakOrdering>
+typename std::enable_if<
+  is_operator_greater_function_object<StrictWeakOrdering>::value
+, cudaError_t
+>::type
+invoke_radix_sort(
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
+, StrictWeakOrdering
+)
+{
+  return cub::DeviceRadixSort::SortKeysDescending(
+    tmp_ptr
+  , tmp_size
+  , keys
+  , n
+  , 0
+  , sizeof(T) * 8
+  , stream
+  , THRUST_DEBUG_SYNC_FLAG
+  );
+}
+
+// ContiguousIterator iterators
+// Scalar value type
+// operator< or operator>
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    conjunction<
+      is_contiguous_iterator<ForwardIt>
+    , std::is_scalar<
+        typename iterator_traits<ForwardIt>::value_type
+      >
+    , is_operator_less_or_greater_function_object<StrictWeakOrdering>
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  cub::DoubleBuffer<T> keys(
+    raw_pointer_cast(&*first), nullptr
+  );
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    invoke_radix_sort(
+      nullptr // Null stream, just for sizing.
+    , nullptr
+    , tmp_size
+    , keys
+    , n
+    , comp
+    )
+  , "after radix sort sizing"
+  );
+
+  // Allocate temporary storage.
+
+  size_t keys_temp_storage = thrust::detail::aligned_storage_size(
+    sizeof(T) * n, 128
+  );
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, keys_temp_storage + tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  keys.d_buffers[1] = thrust::detail::aligned_reinterpret_cast<T*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr + keys_temp_storage)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run radix sort.
+
+  thrust::cuda_cub::throw_on_error(
+    invoke_radix_sort(
+      e.stream().native_handle()
+    , tmp_ptr
+    , tmp_size
+    , keys
+    , n
+    , comp
+    )
+  , "after radix sort launch"
+  );
+
+  if (0 != keys.selector)
+  {
+    auto new_policy0 = thrust::detail::derived_cast(policy).rebind_after(
+      std::move(e)
+    );
+
+    THRUST_STATIC_ASSERT((
+      std::tuple_size<decltype(
+        extract_dependencies(policy)
+      )>::value + 1
+      <=
+      std::tuple_size<decltype(
+        extract_dependencies(new_policy0)
+      )>::value
+    ));
+
+    // Synthesize a suitable new execution policy, because we don't want to
+    // try and extract twice from the one we were passed.
+    typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
+
+    using return_future = decltype(e);
+    return return_future(async_copy_n(
+      new_policy0
+    , tag_policy
+    , keys.d_buffers[1]
+    , n
+    , keys.d_buffers[0]
+    ));
+  }
+  else
+    return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+auto async_stable_sort(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  StrictWeakOrdering               comp
+)
+// A GCC 5 bug requires an explicit trailing return type here, so stick with
+// THRUST_DECLTYPE_RETURNS for now.
+THRUST_DECLTYPE_RETURNS(
+  thrust::system::cuda::detail::async_stable_sort_n(
+    policy, first, distance(first, last), comp
+  )
+)
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/async/transform.h b/thrust/thrust/system/cuda/detail/async/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..544da5cb9efd4771f7638226e8bbcf8b74d14a3c
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/async/transform.h
@@ -0,0 +1,163 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+
+#include <type_traits>
+
+namespace thrust
+{
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <typename ForwardIt, typename OutputIt, typename UnaryOperation>
+struct async_transform_fn
+{
+  ForwardIt first_;
+  OutputIt output_;
+  UnaryOperation op_;
+
+  __host__ __device__
+  async_transform_fn(ForwardIt&& first, OutputIt&& output, UnaryOperation&& op)
+    : first_(std::move(first)), output_(std::move(output)), op_(std::move(op))
+  {}
+
+  template <typename Index>
+  __host__ __device__
+  void operator()(Index idx)
+  {
+    output_[idx] = op_(thrust::raw_reference_cast(first_[idx]));
+  }
+};
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename OutputIt, typename UnaryOperation
+>
+auto async_transform_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  OutputIt                         output,
+  UnaryOperation                   op
+) -> unique_eager_event
+{
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      extract_dependencies(
+        std::move(thrust::detail::derived_cast(policy))
+      )
+    );
+  }
+
+  // Run transform.
+
+  async_transform_fn<ForwardIt, OutputIt, UnaryOperation> wrapped(
+    std::move(first), std::move(output), std::move(op)
+  );
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__parallel_for::parallel_for(
+      n, std::move(wrapped), e.stream().native_handle()
+    )
+  , "after transform launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename UnaryOperation
+>
+auto async_transform(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  OutputIt                         output,
+  UnaryOperation&&                 op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_transform_n(
+    policy, first, distance(first, last), output, THRUST_FWD(op)
+  )
+);
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/binary_search.h b/thrust/thrust/system/cuda/detail/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..1859824b831566ffac987508a09184c2bdd6c82f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/binary_search.h
@@ -0,0 +1,781 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if 0
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/binary_search.h>
+#include <thrust/distance.h>
+
+#if 1
+#  define BS_SIMPLE
+#endif
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __binary_search {
+
+  template <class HaystackIt, class NeedlesIt>
+  struct lbf
+  {
+    typedef typename iterator_traits<HaystackIt>::difference_type result_type;
+    typedef typename iterator_traits<NeedlesIt>::value_type T;
+
+    template <class It, class CompareOp>
+    THRUST_DEVICE_FUNCTION result_type
+    operator()(It begin, It end, T const& value, CompareOp comp)
+    {
+      return system::detail::generic::scalar::lower_bound(begin,
+                                                          end,
+                                                          value,
+                                                          comp) -
+             begin;
+    }
+  };    // struct lbf
+
+  template<class HaystackIt, class NeedlesIt>
+  struct ubf
+  {
+    typedef typename iterator_traits<HaystackIt>::difference_type result_type;
+    typedef typename iterator_traits<NeedlesIt>::value_type T;
+
+    template <class It, class CompareOp>
+    THRUST_DEVICE_FUNCTION result_type
+    operator()(It begin, It end, T const& value, CompareOp comp)
+    {
+      return system::detail::generic::scalar::upper_bound(begin,
+                                                          end,
+                                                          value,
+                                                          comp) -
+             begin;
+    }
+  };    // struct ubf
+
+  template<class HaystackIt, class NeedlesIt>
+  struct bsf
+  {
+    typedef bool result_type;
+    typedef typename iterator_traits<NeedlesIt>::value_type T;
+
+    template <class It, class CompareOp>
+    THRUST_DEVICE_FUNCTION bool
+    operator()(It begin, It end, T const& value, CompareOp comp)
+    {
+      HaystackIt iter = system::detail::generic::scalar::lower_bound(begin,
+                                                                     end,
+                                                                     value,
+                                                                     comp);
+
+      detail::wrapped_function<CompareOp, bool> wrapped_comp(comp);
+
+      return iter != end && !wrapped_comp(value, *iter);
+    }
+  };    // struct bsf
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class BinaryPred>
+  THRUST_DEVICE_FUNCTION Size
+  merge_path(KeysIt1    keys1,
+             KeysIt2    keys2,
+             Size       keys1_count,
+             Size       keys2_count,
+             Size       diag,
+             BinaryPred binary_pred)
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
+
+    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
+    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
+
+    while (keys1_begin < keys1_end)
+    {
+      Size      mid  = (keys1_begin + keys1_end) >> 1;
+      key1_type key1 = keys1[mid];
+      key2_type key2 = keys2[diag - 1 - mid];
+      bool      pred = binary_pred(key2, key1);
+      if (pred)
+      {
+        keys1_end = mid;
+      }
+      else
+      {
+        keys1_begin = mid + 1;
+      }
+    }
+    return keys1_begin;
+  }
+
+  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
+  THRUST_DEVICE_FUNCTION void
+  serial_merge(It  keys_shared,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T2 (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+  {
+    int keys1_end = keys1_beg + keys1_count;
+    int keys2_end = keys2_beg + keys2_count;
+
+    typedef typename iterator_value<It>::type key_type;
+
+    key_type key1 = keys_shared[keys1_beg];
+    key_type key2 = keys_shared[keys2_beg];
+
+
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      bool p = (keys2_beg < keys2_end) &&
+               ((keys1_beg >= keys1_end) ||
+                compare_op(key2,key1));
+
+      output[ITEM]  = p ? key2 : key1;
+      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
+
+      if (p)
+      {
+        key2 = keys_shared[keys2_beg];
+      }
+      else
+      {
+        key1 = keys_shared[keys1_beg];
+      }
+    }
+  }
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  };    // PtxPolicy
+
+  template <class Arch, class T>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_TRANSPOSE>
+        type;
+  };
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template <class NeedlesIt,
+            class HaystackIt,
+            class Size,
+            class OutputIt,
+            class CompareOp,
+            class SearchOp>
+  struct VectorizedBinarySearchAgent
+  {
+    typedef typename iterator_traits<NeedlesIt>::value_type  needle_type;
+    typedef typename iterator_traits<HaystackIt>::value_type haystack_type;
+    typedef typename SearchOp::result_type                   result_type;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, needle_type>::type
+    {
+      typedef Tuning<Arch,needle_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, NeedlesIt>::type  NeedlesLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, HaystackIt>::type HaystackLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, NeedlesLoadIt>::type BlockLoadNeedles;
+
+      typedef typename core::BlockStore<PtxPlan, OutputIt, result_type>::type BlockStoreResult;
+
+      union TempStorage
+      {
+        typename BlockLoadNeedles::TempStorage load_needles;
+        typename BlockStoreResult::TempStorage store_result;
+
+#ifndef BS_SIMPLE
+        core::uninitialized_array<needle_type, PtxPlan::ITEMS_PER_TILE + 1> needles_shared;
+        core::uninitialized_array<result_type, PtxPlan::ITEMS_PER_TILE>     result_shared;
+        core::uninitialized_array<int, PtxPlan::ITEMS_PER_TILE>             indices_shared;
+#endif
+      };    // union TempStorage
+    };
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::NeedlesLoadIt    NeedlesLoadIt;
+    typedef typename ptx_plan::HaystackLoadIt   HaystackLoadIt;
+    typedef typename ptx_plan::BlockLoadNeedles BlockLoadNeedles;
+    typedef typename ptx_plan::BlockStoreResult BlockStoreResult;
+    typedef typename ptx_plan::TempStorage     TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      TempStorage&   storage;
+      NeedlesLoadIt  needles_load_it;
+      HaystackLoadIt haystack_load_it;
+      Size           needles_count;
+      Size           haystack_size;
+      OutputIt       result;
+      CompareOp      compare_op;
+      SearchOp       search_op;
+
+      THRUST_DEVICE_FUNCTION
+      void stable_odd_even_sort(needle_type (&needles)[ITEMS_PER_THREAD],
+                                int (&indices)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int I = 0; I < ITEMS_PER_THREAD; ++I)
+        {
+#pragma unroll
+          for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2)
+          {
+            if (compare_op(needles[J + 1], needles[J]))
+            {
+              using thrust::swap;
+              swap(needles[J], needles[J + 1]);
+              swap(indices[J], indices[J + 1]);
+            }
+          }    // inner loop
+        }      // outer loop
+      }
+
+      THRUST_DEVICE_FUNCTION void
+      block_mergesort(int tid,
+                      int count,
+                      needle_type (&needles_loc)[ITEMS_PER_THREAD],
+                      int (&indices_loc)[ITEMS_PER_THREAD])
+      {
+        using core::sync_threadblock;
+
+        // stable sort items in a single thread
+        //
+        stable_odd_even_sort(needles_loc,indices_loc);
+
+        // each thread has  sorted keys_loc
+        // merge sort keys_loc in shared memory
+        //
+#pragma unroll
+        for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
+        {
+          sync_threadblock();
+
+          // store keys in shmem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM;
+            storage.needles_shared[idx] = needles_loc[ITEM];
+          }
+
+          sync_threadblock();
+
+          int  indices[ITEMS_PER_THREAD];
+
+          int list  = ~(coop - 1) & tid;
+          int start = ITEMS_PER_THREAD * list;
+          int size  = ITEMS_PER_THREAD * (coop >> 1);
+
+          int diag = min(count, ITEMS_PER_THREAD * ((coop - 1) & tid));
+
+          int keys1_beg = min(count, start);
+          int keys1_end = min(count, keys1_beg + size);
+          int keys2_beg = keys1_end;
+          int keys2_end = min(count, keys2_beg + size);
+
+          int keys1_count = keys1_end - keys1_beg;
+          int keys2_count = keys2_end - keys2_beg;
+
+          int partition_diag = merge_path(&storage.needles_shared[keys1_beg],
+                                          &storage.needles_shared[keys2_beg],
+                                          keys1_count,
+                                          keys2_count,
+                                          diag,
+                                          compare_op);
+
+          int keys1_beg_loc   = keys1_beg + partition_diag;
+          int keys1_end_loc   = keys1_end;
+          int keys2_beg_loc   = keys2_beg + diag - partition_diag;
+          int keys2_end_loc   = keys2_end;
+          int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
+          int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
+          serial_merge(&storage.needles_shared[0],
+                       keys1_beg_loc,
+                       keys2_beg_loc,
+                       keys1_count_loc,
+                       keys2_count_loc,
+                       needles_loc,
+                       indices,
+                       compare_op);
+
+
+          sync_threadblock();
+
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM;
+            storage.indices_shared[idx] = indices_loc[ITEM];
+          }
+
+          sync_threadblock();
+
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            indices_loc[ITEM] = storage.indices_shared[indices[ITEM]];
+          }
+        }
+      }    // func block_merge_sort
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(int  tid,
+                   Size tile_idx,
+                   Size tile_base,
+                   int  num_remaining)
+      {
+        using core::sync_threadblock;
+
+        needle_type needles_loc[ITEMS_PER_THREAD];
+        BlockLoadNeedles(storage.load_needles)
+            .Load(needles_load_it + tile_base, needles_loc, num_remaining);
+
+#ifdef BS_SIMPLE
+
+        result_type results_loc[ITEMS_PER_THREAD];
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          results_loc[ITEM] = search_op(haystack_load_it,
+                                        haystack_load_it + haystack_size,
+                                        needles_loc[ITEM],
+                                        compare_op);
+        }
+
+
+#else
+
+        if (IS_LAST_TILE)
+        {
+          needle_type max_value = needles_loc[0];
+#pragma unroll
+          for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
+            {
+              max_value = compare_op(max_value, needles_loc[ITEM])
+                            ? needles_loc[ITEM]
+                            : max_value;
+            }
+            else
+            {
+              needles_loc[ITEM] = max_value;
+            }
+          }
+        }
+
+        sync_threadblock();
+
+        int indices_loc[ITEMS_PER_THREAD];
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM;
+          indices_loc[ITEM] = idx;
+        }
+
+        if (IS_LAST_TILE)
+        {
+          block_mergesort(tid,
+                          num_remaining,
+                          needles_loc,
+                          indices_loc);
+        }
+        else
+        {
+          block_mergesort(tid,
+                          ITEMS_PER_TILE,
+                          needles_loc,
+                          indices_loc);
+        }
+
+        sync_threadblock();
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = indices_loc[ITEM];
+          storage.result_shared[idx] =
+              search_op(haystack_load_it,
+                        haystack_load_it + haystack_size,
+                        needles_loc[ITEM],
+                        compare_op);
+        }
+
+        sync_threadblock();
+
+        result_type results_loc[ITEMS_PER_THREAD];
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM;
+          results_loc[ITEM] = storage.result_shared[idx];
+        }
+
+        sync_threadblock();
+#endif
+
+        BlockStoreResult(storage.store_result)
+            .Store(result + tile_base, results_loc, num_remaining);
+      }
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage& storage_,
+           NeedlesIt    needles_it_,
+           HaystackIt   haystack_it_,
+           Size         needles_count_,
+           Size         haystack_size_,
+           OutputIt     result_,
+           CompareOp    compare_op_,
+           SearchOp     search_op_)
+          : storage(storage_),
+            needles_load_it(core::make_load_iterator(ptx_plan(), needles_it_)),
+            haystack_load_it(core::make_load_iterator(ptx_plan(), haystack_it_)),
+            needles_count(needles_count_),
+            haystack_size(haystack_size_),
+            result(result_),
+            compare_op(compare_op_),
+            search_op(search_op_)
+      {
+        int  tid           = threadIdx.x;
+        Size tile_idx      = blockIdx.x;
+        Size num_tiles     = gridDim.x;
+        Size tile_base     = tile_idx * ITEMS_PER_TILE;
+        int  items_in_tile = min<int>(needles_count - tile_base, ITEMS_PER_TILE);
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(tid, tile_idx, tile_base, ITEMS_PER_TILE);
+        }
+        else
+        {
+          consume_tile<true>(tid, tile_idx, tile_base, items_in_tile);
+        }
+      }
+    };    // struct impl
+
+
+    THRUST_AGENT_ENTRY(NeedlesIt  needles_it,
+                       HaystackIt haystack_it,
+                       Size       needles_count,
+                       Size       haystack_size,
+                       OutputIt   result,
+                       CompareOp  compare_op,
+                       SearchOp   search_op,
+                       char*      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(storage,
+           needles_it,
+           haystack_it,
+           needles_count,
+           haystack_size,
+           result,
+           compare_op,
+           search_op);
+    }
+  };    // struct VectorizedBinarySearchAgent
+
+  template <class NeedlesIt,
+            class HaystackIt,
+            class Size,
+            class OutputIt,
+            class CompareOp,
+            class SearchOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_pass(void*        d_temp_storage,
+            size_t&      temp_storage_size,
+            NeedlesIt    needles_it,
+            HaystackIt   haystack_it,
+            Size         needles_count,
+            Size         haystack_size,
+            OutputIt     result,
+            CompareOp    compare_op,
+            SearchOp     search_op,
+            cudaStream_t stream,
+            bool         debug_sync)
+  {
+    if (needles_count == 0)
+      return cudaErrorNotSupported;
+
+    cudaError_t status = cudaSuccess;
+
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+
+    typedef AgentLauncher<
+        VectorizedBinarySearchAgent<NeedlesIt,
+                                    HaystackIt,
+                                    Size,
+                                    OutputIt,
+                                    CompareOp,
+                                    SearchOp> >
+        search_agent;
+
+    AgentPlan search_plan = search_agent::get_plan(stream);
+
+    temp_storage_size = 1;
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    search_agent sa(search_plan, needles_count, stream, "binary_search::search_agent", debug_sync);
+    sa.launch(needles_it,
+              haystack_it,
+              needles_count,
+              haystack_size,
+              result,
+              compare_op,
+              search_op);
+
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return status;
+  }
+
+  template <typename Derived,
+            typename NeedlesIt,
+            typename HaystackIt,
+            typename OutputIt,
+            typename CompareOp,
+            typename SearchOp>
+  OutputIt THRUST_RUNTIME_FUNCTION
+  doit(execution_policy<Derived>& policy,
+       HaystackIt                 haystack_begin,
+       HaystackIt                 haystack_end,
+       NeedlesIt                  needles_begin,
+       NeedlesIt                  needles_end,
+       OutputIt                   result,
+       CompareOp                  compare_op,
+       SearchOp                   search_op)
+  {
+    typedef typename iterator_traits<NeedlesIt>::difference_type size_type;
+
+    size_type needles_count = thrust::distance(needles_begin, needles_end);
+    size_type haystack_size = thrust::distance(haystack_begin, haystack_end);
+
+    if (needles_count == 0)
+      return result;
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError status;
+    status = doit_pass(NULL,
+                       storage_size,
+                       needles_begin,
+                       haystack_begin,
+                       needles_count,
+                       haystack_size,
+                       result,
+                       compare_op,
+                       search_op,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "binary_search: failed on 1st call");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_pass(ptr,
+                       storage_size,
+                       needles_begin,
+                       haystack_begin,
+                       needles_count,
+                       haystack_size,
+                       result,
+                       compare_op,
+                       search_op,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "binary_search: failed to synchronize");
+
+    return result + needles_count;
+  }
+
+  struct less
+  {
+    template <typename T1, typename T2>
+    THRUST_DEVICE_FUNCTION bool
+    operator()(const T1& lhs, const T2& rhs) const
+    {
+      return lhs < rhs;
+    }
+  };
+}    // namespace __binary_search
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class HaystackIt,
+          class NeedlesIt,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+lower_bound(execution_policy<Derived>& policy,
+            HaystackIt                 first,
+            HaystackIt                 last,
+            NeedlesIt                  values_first,
+            NeedlesIt                  values_last,
+            OutputIt                   result,
+            CompareOp                  compare_op)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __binary_search::doit(policy,
+                                first,
+                                last,
+                                values_first,
+                                values_last,
+                                result,
+                                compare_op,
+                                __binary_search::lbf<HaystackIt, NeedlesIt>());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::lower_bound(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              values_first,
+                              values_last,
+                              result);
+#endif
+  }
+  return ret;
+}
+
+
+template <class Derived,
+          class HaystackIt,
+          class NeedlesIt,
+          class OutputIt>
+OutputIt __host__ __device__
+lower_bound(execution_policy<Derived>& policy,
+            HaystackIt                 first,
+            HaystackIt                 last,
+            NeedlesIt                  values_first,
+            NeedlesIt                  values_last,
+            OutputIt                   result)
+{
+  return cuda_cub::lower_bound(policy,
+                               first,
+                               last,
+                               values_first,
+                               values_last,
+                               result,
+                               __binary_search::less());
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/copy.h b/thrust/thrust/system/cuda/detail/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef51e4a5bc88e1dfc9c480878529bef4ee401962
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/copy.h
@@ -0,0 +1,198 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/cross_system.h>
+
+namespace thrust
+{
+
+template <typename DerivedPolicy, typename InputIt, typename OutputIt>
+__host__ __device__ OutputIt
+copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+     InputIt                                                     first,
+     InputIt                                                     last,
+     OutputIt                                                    result);
+
+template <class DerivedPolicy, class InputIt, class Size, class OutputIt>
+__host__ __device__ OutputIt
+copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+       InputIt                                                     first,
+       Size                                                        n,
+       OutputIt                                                    result);
+
+namespace cuda_cub {
+
+// D->D copy requires NVCC compiler
+template <class System,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy(execution_policy<System> &system,
+     InputIterator             first,
+     InputIterator             last,
+     OutputIterator            result);
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__
+copy(cross_system<System1, System2> systems,
+     InputIterator  first,
+     InputIterator  last,
+     OutputIterator result);
+
+template <class System,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy_n(execution_policy<System> &system,
+       InputIterator             first,
+       Size                      n,
+       OutputIterator            result);
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__
+copy_n(cross_system<System1, System2> systems,
+       InputIterator  first,
+       Size           n,
+       OutputIterator result);
+
+}    // namespace cuda_
+} // end namespace thrust
+
+
+
+#include <thrust/system/cuda/detail/internal/copy_device_to_device.h>
+#include <thrust/system/cuda/detail/internal/copy_cross_system.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+// D->D copy requires NVCC compiler
+
+__thrust_exec_check_disable__
+template <class System,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy(execution_policy<System> &system,
+     InputIterator             first,
+     InputIterator             last,
+     OutputIterator            result)
+{
+  OutputIterator ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __copy::device_to_device(system, first, last, result);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::copy(cvt_to_seq(derived_cast(system)),
+                       first,
+                       last,
+                       result);
+#endif
+  }
+
+  return ret;
+}    // end copy()
+
+__thrust_exec_check_disable__
+template <class System,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy_n(execution_policy<System> &system,
+       InputIterator             first,
+       Size                      n,
+       OutputIterator            result)
+{
+  OutputIterator ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __copy::device_to_device(system, first, first + n, result);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
+#endif
+  }
+
+  return ret;
+} // end copy_n()
+#endif
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__
+copy(cross_system<System1, System2> systems,
+     InputIterator  first,
+     InputIterator  last,
+     OutputIterator result)
+{
+  return __copy::cross_system_copy(systems,first,last,result);
+} // end copy()
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__
+copy_n(cross_system<System1, System2> systems,
+       InputIterator  first,
+       Size           n,
+       OutputIterator result)
+{
+  return __copy::cross_system_copy_n(systems, first, n, result);
+} // end copy_n()
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/memory.h>
+#include <thrust/detail/temporary_array.h>
diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..d441862ab6cec2ef6ed87e21f5f926e81c32a5fd
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/copy_if.h
@@ -0,0 +1,857 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cub/device/device_select.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/detail/function.h>
+#include <thrust/distance.h>
+#include <thrust/detail/alignment.h>
+
+namespace thrust
+{
+// XXX declare generic copy_if interface
+// to avoid circulular dependency from thrust/copy.h
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
+__host__ __device__
+    OutputIterator
+    copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            InputIterator                                               first,
+            InputIterator                                               last,
+            OutputIterator                                              result,
+            Predicate                                                   pred);
+
+template <typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+__host__ __device__
+    OutputIterator
+    copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            InputIterator1                                              first,
+            InputIterator1                                              last,
+            InputIterator2                                              stencil,
+            OutputIterator                                              result,
+            Predicate                                                   pred);
+
+namespace cuda_cub {
+
+namespace __copy_if {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class, class>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm52, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+
+  template<class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 10,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+  template<class T>
+  struct Tuning<sm30, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<300>
+
+  struct no_stencil_tag_    {};
+  typedef no_stencil_tag_* no_stencil_tag;
+  template <class ItemsIt,
+            class StencilIt,
+            class OutputIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutputIt>
+  struct CopyIfAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type   item_type;
+    typedef typename iterator_traits<StencilIt>::value_type stencil_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch,item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type   ItemsLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, StencilIt>::type StencilLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type   BlockLoadItems;
+      typedef typename core::BlockLoad<PtxPlan, StencilLoadIt>::type BlockLoadStencil;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        };
+
+        typename BlockLoadItems::TempStorage   load_items;
+        typename BlockLoadStencil::TempStorage load_stencil;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE> raw_exchange;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
+    typedef typename ptx_plan::StencilLoadIt      StencilLoadIt;
+    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
+    typedef typename ptx_plan::BlockLoadStencil   BlockLoadStencil;
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan          BlockScan;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      USE_STENCIL      = !thrust::detail::is_same<StencilIt, no_stencil_tag>::value,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  storage;
+      ScanTileState &tile_state;
+      ItemsLoadIt    items_load_it;
+      StencilLoadIt  stencil_load_it;
+      OutputIt       output_it;
+      Predicate      predicate;
+      Size           num_items;
+
+      //------------------------------------------
+      // scatter results to memory
+      //------------------------------------------
+
+      THRUST_DEVICE_FUNCTION void
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  num_tile_selections,
+              Size num_selections_prefix)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            new (&storage.raw_exchange[local_scatter_offset]) item_type(items[ITEM]);
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          output_it[num_selections_prefix + item] = storage.raw_exchange[item];
+        }
+      }    // func scatter
+
+      //------------------------------------------
+      // specialize predicate on different types
+      //------------------------------------------
+
+      template <int T>
+      struct __tag {};
+
+      enum ItemStencil
+      {
+        ITEM,
+        STENCIL
+      };
+
+      template <bool TAG, class T>
+      struct wrap_value
+      {
+        T const &              x;
+        THRUST_DEVICE_FUNCTION wrap_value(T const &x) : x(x) {}
+
+        THRUST_DEVICE_FUNCTION T const &operator()() const { return x; };
+      };    // struct wrap_type
+
+      //------- item
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &x,
+                        __tag<false /* USE_STENCIL */>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+      //-------- stencil
+
+      template <class T>
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, T> const &x,
+                        __tag<true>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, no_stencil_tag_> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, stencil_type> const &,
+                        __tag<false>)
+      {
+        return false;
+      }
+
+      template <bool IS_LAST_TILE, ItemStencil TYPE, class T>
+      THRUST_DEVICE_FUNCTION void
+      compute_selection_flags(int num_tile_items,
+                              T (&values)[ITEMS_PER_THREAD],
+                              Size (&selection_flags)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Out-of-bounds items are selection_flags
+          selection_flags[ITEM] = 1;
+
+          if (!IS_LAST_TILE ||
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+          {
+            selection_flags[ITEM] =
+                predicate_wrapper(wrap_value<TYPE, T>(values[ITEM]),
+                                  __tag<USE_STENCIL>());
+          }
+        }
+      }
+
+      //------------------------------------------
+      // consume tiles
+      //------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE) {
+          BlockLoadItems(storage.load_items)
+              .Load(items_load_it + tile_base,
+                    items_loc,
+                    num_tile_items);
+        }
+        else
+        {
+          BlockLoadItems(storage.load_items)
+              .Load(items_load_it + tile_base,
+                    items_loc);
+        }
+
+        core::sync_threadblock();
+
+        if (USE_STENCIL)
+        {
+          stencil_type stencil_loc[ITEMS_PER_THREAD];
+
+          if (IS_LAST_TILE)
+          {
+            BlockLoadStencil(storage.load_stencil)
+                .Load(stencil_load_it + tile_base,
+                      stencil_loc,
+                      num_tile_items);
+          }
+          else
+          {
+            BlockLoadStencil(storage.load_stencil)
+                .Load(stencil_load_it + tile_base,
+                      stencil_loc);
+          }
+
+          compute_selection_flags<IS_LAST_TILE, STENCIL>(num_tile_items,
+                                                         stencil_loc,
+                                                         selection_flags);
+        }
+        else /* Use predicate on items rather then stencil */
+        {
+          compute_selection_flags<IS_LAST_TILE, ITEM>(num_tile_items,
+                                                      items_loc,
+                                                      selection_flags);
+        }
+
+        core::sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        core::sync_threadblock();
+
+        scatter(items_loc,
+                selection_flags,
+                selection_idx,
+                num_tile_selections,
+                num_selections_prefix);
+
+
+        return num_selections;
+      }    // func consume_tile_impl
+
+      template <bool         IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION Size
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }    // func consume_tile
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &       storage_,
+                                  ScanTileState &     tile_state_,
+                                  ItemsIt             items_it,
+                                  StencilIt           stencil_it,
+                                  OutputIt            output_it_,
+                                  Predicate           predicate_,
+                                  Size                num_items_,
+                                  int                 num_tiles,
+                                  NumSelectedOutputIt num_selected_out)
+          : storage(storage_),
+            tile_state(tile_state_),
+            items_load_it(core::make_load_iterator(ptx_plan(), items_it)),
+            stencil_load_it(core::make_load_iterator(ptx_plan(), stencil_it)),
+            output_it(output_it_),
+            predicate(predicate_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }    // ctor impl
+    };
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt             items_it,
+                       StencilIt           stencil_it,
+                       OutputIt            output_it,
+                       Predicate           predicate,
+                       Size                num_items,
+                       NumSelectedOutputIt num_selected_out,
+                       ScanTileState       tile_state,
+                       int                 num_tiles,
+                       char *              shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           items_it,
+           stencil_it,
+           output_it,
+           predicate,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };    // struct CopyIfAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+  };    // struct InitAgent
+
+  template <class ItemsIt,
+            class StencilIt,
+            class OutputIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            ItemsIt          items,
+            StencilIt        stencil,
+            OutputIt         output_it,
+            Predicate        predicate,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream,
+            bool             debug_sync)
+  {
+    if (num_items == 0)
+      return cudaSuccess;
+
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        CopyIfAgent<ItemsIt,
+                    StencilIt,
+                    OutputIt,
+                    Predicate,
+                    Size,
+                    NumSelectedOutIt> >
+        copy_if_agent;
+
+    typedef typename copy_if_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type    init_plan    = init_agent::get_plan();
+    typename get_plan<copy_if_agent>::type copy_if_plan = copy_if_agent::get_plan(stream);
+
+    int tile_size = copy_if_plan.items_per_tile;
+    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_size = core::vshmem_size(copy_if_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return status;
+
+    size_t allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    void* allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "copy_if::init_agent", debug_sync);
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
+
+    copy_if_agent pa(copy_if_plan, num_items, stream, vshmem_ptr, "copy_if::partition_agent", debug_sync);
+
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    pa.launch(items,
+              stencil,
+              output_it,
+              predicate,
+              num_items,
+              num_selected_out,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename InputIt,
+            typename StencilIt,
+            typename OutputIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  OutputIt copy_if(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   StencilIt                  stencil,
+                   OutputIt                   output,
+                   Predicate                  predicate)
+  {
+    typedef int size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    if (num_items == 0)
+      return output;
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       output,
+                       predicate,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "copy_if failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "copy_if failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "copy_if failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       output,
+                       predicate,
+                       d_num_selected_out,
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "copy_if failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "copy_if failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return output + num_selected;
+  }
+
+}    // namespace __copy_if
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIterator,
+          class OutputIterator,
+          class Predicate>
+OutputIterator __host__ __device__
+copy_if(execution_policy<Derived> &policy,
+        InputIterator              first,
+        InputIterator              last,
+        OutputIterator             result,
+        Predicate                  pred)
+{
+  OutputIterator ret = result;
+
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __copy_if::copy_if(policy,
+                             first,
+                             last,
+                             __copy_if::no_stencil_tag(),
+                             result,
+                             pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                          first,
+                          last,
+                          result,
+                          pred);
+#endif
+  }
+  return ret;
+} // func copy_if
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIterator,
+          class StencilIterator,
+          class OutputIterator,
+          class Predicate>
+OutputIterator __host__ __device__
+copy_if(execution_policy<Derived> &policy,
+        InputIterator              first,
+        InputIterator              last,
+        StencilIterator            stencil,
+        OutputIterator             result,
+        Predicate                  pred)
+{
+  OutputIterator ret = result;
+
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __copy_if::copy_if(policy,
+                             first,
+                             last,
+                             stencil,
+                             result,
+                             pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                          first,
+                          last,
+                          stencil,
+                          result,
+                          pred);
+#endif
+  }
+  return ret;
+}    // func copy_if
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/copy.h>
+#endif
diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..7788481c7b85124d0873be11b8563372e457e724
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
@@ -0,0 +1,1184 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <cassert>
+
+#if 0
+#define __THRUST__TEMPLATE_DEBUG
+#endif
+
+#if __THRUST__TEMPLATE_DEBUG
+template<int...> class ID_impl;
+template<int... I> class Foo { ID_impl<I...> t;};
+#endif
+
+namespace thrust
+{
+namespace cuda_cub {
+namespace core {
+
+
+#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__)
+#if 0
+  template <class Agent, class... Args>
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+      _kernel_agent(Args... args)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(args..., shmem);
+  }
+#else
+  template <class Agent, class _0>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, shmem);
+  }
+  template <class Agent, class _0, class _1>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, shmem);
+  }
+#endif
+
+  ////////////////////////////////////////////////////////////
+
+
+#if 0
+  template <class Agent, class... Args>
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+      _kernel_agent_vshmem(char* vshmem, Args... args)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(args..., vshmem);
+  }
+#else
+  template <class Agent, class _0>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, vshmem);
+  }
+  template <class Agent, class _0, class _1>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, vshmem);
+  }
+#endif
+#else
+#if 0
+  template <class , class... Args >
+  void __global__  _kernel_agent(Args... args) {}
+  template <class , class... Args >
+  void __global__  _kernel_agent_vshmem(char*, Args... args) {}
+#else
+  template <class, class _0>
+  void __global__ _kernel_agent(_0) {}
+  template <class, class _0, class _1>
+  void __global__ _kernel_agent(_0,_1) {}
+  template <class, class _0, class _1, class _2>
+  void __global__ _kernel_agent(_0,_1,_2) {}
+  template <class, class _0, class _1, class _2, class _3>
+  void __global__ _kernel_agent(_0,_1,_2,_3) {}
+  template <class, class _0, class _1, class _2, class _3, class _4>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {}
+  ////////////////////////////////////////////////////////////
+  template <class, class _0>
+  void __global__ _kernel_agent_vshmem(char*,_0) {}
+  template <class, class _0, class _1>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1) {}
+  template <class, class _0, class _1, class _2>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2) {}
+  template <class, class _0, class _1, class _2, class _3>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3) {}
+  template <class, class _0, class _1, class _2, class _3, class _4>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {}
+#endif
+#endif
+
+
+  template<class Agent>
+  struct AgentLauncher : Agent
+  {
+    core::AgentPlan plan;
+    size_t          count;
+    cudaStream_t    stream;
+    char const*     name;
+    bool            debug_sync;
+    unsigned int    grid;
+    char*           vshmem;
+    bool            has_shmem;
+    size_t          shmem_size;
+
+    enum
+    {
+      MAX_SHMEM_PER_BLOCK = 48 * 1024,
+    };
+    typedef
+        typename has_enough_shmem<Agent,
+                                  MAX_SHMEM_PER_BLOCK>::type has_enough_shmem_t;
+    typedef
+        has_enough_shmem<Agent,
+                                  MAX_SHMEM_PER_BLOCK> shm1;
+
+    template <class Size>
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  Size         count_,
+                  cudaStream_t stream_,
+                  char const*  name_,
+                  bool         debug_sync_)
+        : plan(plan_),
+          count((size_t)count_),
+          stream(stream_),
+          name(name_),
+          debug_sync(debug_sync_),
+          grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
+          vshmem(NULL),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(count > 0);
+    }
+
+    template <class Size>
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  Size         count_,
+                  cudaStream_t stream_,
+                  char*        vshmem,
+                  char const*  name_,
+                  bool         debug_sync_)
+        : plan(plan_),
+          count((size_t)count_),
+          stream(stream_),
+          name(name_),
+          debug_sync(debug_sync_),
+          grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
+          vshmem(vshmem),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(count > 0);
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  cudaStream_t stream_,
+                  char const*  name_,
+                  bool         debug_sync_)
+        : plan(plan_),
+          count(0),
+          stream(stream_),
+          name(name_),
+          debug_sync(debug_sync_),
+          grid(plan.grid_size),
+          vshmem(NULL),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(plan.grid_size > 0);
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  cudaStream_t stream_,
+                  char*        vshmem,
+                  char const*  name_,
+                  bool         debug_sync_)
+        : plan(plan_),
+          count(0),
+          stream(stream_),
+          name(name_),
+          debug_sync(debug_sync_),
+          grid(plan.grid_size),
+          vshmem(vshmem),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(plan.grid_size > 0);
+    }
+
+#if 0
+    THRUST_RUNTIME_FUNCTION
+    AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
+    {
+      // in separable compilation mode, we have no choice
+      // but to call kernel to get agent_plan
+      // otherwise the risk is something may fail
+      // if user mix & match ptx versions in a separably compiled function
+      // http://nvbugs/1772071
+      // XXX may be it is too string of a requirements, consider relaxing it in
+      // the future
+#ifdef __CUDACC_RDC__
+      return core::get_agent_plan<Agent>(s, d_ptr);
+#else
+      core::cuda_optional<int> ptx_version = core::get_ptx_version();
+      //CUDA_CUB_RET_IF_FAIL(ptx_version.status());
+      return get_agent_plan<Agent>(ptx_version);
+#endif
+    }
+    THRUST_RUNTIME_FUNCTION
+    AgentPlan static get_plan_default()
+    {
+      return get_agent_plan<Agent>(sm_arch<0>::type::ver);
+    }
+#endif
+
+    THRUST_RUNTIME_FUNCTION
+    typename core::get_plan<Agent>::type static get_plan(cudaStream_t , void* d_ptr = 0)
+    {
+      THRUST_UNUSED_VAR(d_ptr);
+      core::cuda_optional<int> ptx_version = core::get_ptx_version();
+      return get_agent_plan<Agent>(ptx_version);
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    typename core::get_plan<Agent>::type static get_plan()
+    {
+      return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
+    }
+
+    THRUST_RUNTIME_FUNCTION void sync() const
+    {
+      if (debug_sync)
+      {
+        if (THRUST_IS_DEVICE_CODE) {
+          #if THRUST_INCLUDE_DEVICE_CODE
+            cudaDeviceSynchronize();
+          #endif
+        } else {
+          #if THRUST_INCLUDE_HOST_CODE
+            cudaStreamSynchronize(stream);
+          #endif
+        }
+      }
+    }
+
+    template<class K>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    max_blocks_per_sm_impl(K k, int block_threads)
+    {
+      int occ;
+      cudaError_t status = cub::MaxSmOccupancy(occ, k, block_threads);
+      return cuda_optional<int>(status == cudaSuccess ? occ : -1, status);
+    }
+
+    template <class K>
+    cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    max_sm_occupancy(K k) const
+    {
+      return max_blocks_per_sm_impl(k, plan.block_threads);
+    }
+
+
+
+    template<class K>
+    THRUST_RUNTIME_FUNCTION
+    void print_info(K k) const
+    {
+      if (debug_sync)
+      {
+        cuda_optional<int> occ = max_sm_occupancy(k);
+        core::cuda_optional<int> ptx_version = core::get_ptx_version();
+        if (count > 0)
+        {
+          _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %llu items total, %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version \n",
+                  name,
+                  grid,
+                  plan.block_threads,
+                  (has_shmem ? (int)plan.shared_memory_size : 0),
+                  (long long)stream,
+                  (long long)count,
+                  plan.items_per_thread,
+                  (int)occ,
+                  (!has_shmem ? (int)plan.shared_memory_size : 0),
+                  (int)ptx_version);
+        }
+        else
+        {
+          _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version\n",
+                  name,
+                  grid,
+                  plan.block_threads,
+                  (has_shmem ? (int)plan.shared_memory_size : 0),
+                  (long long)stream,
+                  plan.items_per_thread,
+                  (int)occ,
+                  (!has_shmem ? (int)plan.shared_memory_size : 0),
+                  (int)ptx_version);
+        }
+      }
+    }
+
+    ////////////////////
+    //  Variadic code
+    ////////////////////
+
+#if 0
+    template<class... Args>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      return max_blocks_per_sm_impl(_kernel_agent<Agent, Args...>, plan.block_threads);
+    }
+#else
+    template<class _0>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0) = _kernel_agent<Agent, _0>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0, _1) = _kernel_agent<Agent, _0, _1>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2) = _kernel_agent<Agent, _0, _1, _2>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3) = _kernel_agent<Agent, _0, _1, _2,_3>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4) = _kernel_agent<Agent, _0, _1, _2,_3,_4>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+#endif
+
+
+
+#if 0
+
+    // If we are guaranteed to have enough shared memory
+    // don't compile other kernel which accepts pointer
+    // and save on compilations
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, Args... args) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      print_info(_kernel_agent<Agent, Args...>);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(_kernel_agent<Agent, Args...>, args...);
+    }
+
+    // If there is a risk of not having enough shared memory
+    // we compile generic kernel instead.
+    // This kernel is likely to be somewhat slower, but it can accomodate
+    // both shared and virtualized shared memories.
+    // Alternative option is to compile two kernels, one using shared and one
+    // using virtualized shared memory. While this can be slightly faster if we
+    // do actually have enough shared memory, the compilation time will double.
+    //
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, Args... args) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      print_info(_kernel_agent_vshmem<Agent, Args...>);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(_kernel_agent_vshmem<Agent, Args...>, vshmem, args...);
+    }
+
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch(Args... args) const
+    {
+#if __THRUST__TEMPLATE_DEBUG
+#ifdef __CUDA_ARCH__
+      typedef typename Foo<
+        shm1::v1,
+        shm1::v2,
+        shm1::v3,
+        shm1::v4,
+        shm1::v5>::t tt;
+#endif
+#endif
+      launch_impl(has_enough_shmem_t(),args...);
+      sync();
+    }
+#else
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0) = _kernel_agent_vshmem<Agent, _0>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0);
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1) = _kernel_agent_vshmem<Agent, _0, _1>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1);
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2) = _kernel_agent_vshmem<Agent, _0, _1, _2>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2);
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3);
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8>;
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD,_xE xE) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0) = _kernel_agent<Agent, _0>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0);
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0, _1) = _kernel_agent<Agent, _0, _1>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1);
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2) = _kernel_agent<Agent, _0, _1, _2>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2);
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3) = _kernel_agent<Agent, _0, _1, _2,_3>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3);
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4) = _kernel_agent<Agent, _0, _1, _2,_3,_4>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr,x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0) const
+    {
+      launch_impl(has_enough_shmem_t(), x0);
+      sync();
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1);
+      sync();
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+      sync();
+    }
+#endif
+
+
+  };
+
+}    // namespace core
+}
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/core/alignment.h b/thrust/thrust/system/cuda/detail/core/alignment.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dc21ebcec483730f869b1830924fa188bd97f04
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/alignment.h
@@ -0,0 +1,249 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// TODO: This can probably be removed.
+
+#pragma once
+
+#include <thrust/system/cuda/detail/util.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+namespace alignment_of_detail {
+
+
+  template <typename T>
+  class alignment_of_impl;
+
+  template <typename T, std::size_t size_diff>
+  struct helper
+  {
+    static const std::size_t value = size_diff;
+  };
+
+  template <typename T>
+  class helper<T, 0>
+  {
+  public:
+    static const std::size_t value = alignment_of_impl<T>::value;
+  };
+
+  template <typename T>
+  class alignment_of_impl
+  {
+  private:
+    struct big
+    {
+      T    x;
+      char c;
+    };
+
+  public:
+    static const std::size_t value = helper<big, sizeof(big) - sizeof(T)>::value;
+  };
+
+
+}    // end alignment_of_detail
+
+
+template <typename T>
+struct alignment_of
+    : alignment_of_detail::alignment_of_impl<T>
+{
+};
+
+
+template <std::size_t Align>
+struct aligned_type;
+
+// __align__ is CUDA-specific, so guard it
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+// implementing aligned_type portably is tricky:
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// implement aligned_type with specialization because MSVC
+// requires literals as arguments to declspec(align(n))
+template <>
+struct aligned_type<1>
+{
+  struct __align__(1) type{};
+};
+
+template <>
+struct aligned_type<2>
+{
+  struct __align__(2) type{};
+};
+
+template <>
+struct aligned_type<4>
+{
+  struct __align__(4) type{};
+};
+
+template <>
+struct aligned_type<8>
+{
+  struct __align__(8) type{};
+};
+
+template <>
+struct aligned_type<16>
+{
+  struct __align__(16) type{};
+};
+
+template <>
+struct aligned_type<32>
+{
+  struct __align__(32) type{};
+};
+
+template <>
+struct aligned_type<64>
+{
+  struct __align__(64) type{};
+};
+
+template <>
+struct aligned_type<128>
+{
+  struct __align__(128) type{};
+};
+
+template <>
+struct aligned_type<256>
+{
+  struct __align__(256) type{};
+};
+
+template <>
+struct aligned_type<512>
+{
+  struct __align__(512) type{};
+};
+
+template <>
+struct aligned_type<1024>
+{
+  struct __align__(1024) type{};
+};
+
+template <>
+struct aligned_type<2048>
+{
+  struct __align__(2048) type{};
+};
+
+template <>
+struct aligned_type<4096>
+{
+  struct __align__(4096) type{};
+};
+
+template <>
+struct aligned_type<8192>
+{
+  struct __align__(8192) type{};
+};
+#elif (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300)
+// implement aligned_type with specialization because gcc 4.2
+// requires literals as arguments to __attribute__(aligned(n))
+template <>
+struct aligned_type<1>
+{
+  struct __align__(1) type{};
+};
+
+template <>
+struct aligned_type<2>
+{
+  struct __align__(2) type{};
+};
+
+template <>
+struct aligned_type<4>
+{
+  struct __align__(4) type{};
+};
+
+template <>
+struct aligned_type<8>
+{
+  struct __align__(8) type{};
+};
+
+template <>
+struct aligned_type<16>
+{
+  struct __align__(16) type{};
+};
+
+template <>
+struct aligned_type<32>
+{
+  struct __align__(32) type{};
+};
+
+template <>
+struct aligned_type<64>
+{
+  struct __align__(64) type{};
+};
+
+template <>
+struct aligned_type<128>
+{
+  struct __align__(128) type{};
+};
+
+#else
+// assume the compiler allows template parameters as
+// arguments to __align__
+template <std::size_t Align>
+struct aligned_type
+{
+  struct __align__(Align) type{};
+};
+#endif    // THRUST_HOST_COMPILER
+#else
+template <std::size_t Align>
+struct aligned_type
+{
+  struct type
+  {
+  };
+};
+#endif    // THRUST_DEVICE_COMPILER
+
+
+template <std::size_t Len, std::size_t Align>
+struct aligned_storage
+{
+  union type
+  {
+    unsigned char data[Len];
+
+    typename aligned_type<Align>::type align;
+  };
+};
+
+
+}    // end cuda_
+
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
new file mode 100644
index 0000000000000000000000000000000000000000..deeffac9dae8bc567face7cf7f8483d41454bbab
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
@@ -0,0 +1,976 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/core/alignment.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <cassert>
+
+
+namespace thrust
+{
+
+namespace cuda_cub {
+namespace launcher {
+
+  struct triple_chevron
+  {
+    typedef size_t Size;
+    dim3 const grid;
+    dim3 const block;
+    Size const shared_mem;
+    cudaStream_t const stream;
+
+    THRUST_RUNTIME_FUNCTION
+    triple_chevron(dim3         grid_,
+                   dim3         block_,
+                   Size         shared_mem_ = 0,
+                   cudaStream_t stream_     = 0)
+        : grid(grid_),
+          block(block_),
+          shared_mem(shared_mem_),
+          stream(stream_) {}
+
+#if 0
+    template<class K, class... Args>
+    cudaError_t __host__
+    doit_host(K k, Args const&... args) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(args...);
+      return cudaPeekAtLastError();
+    }
+#else
+    template <class K, class _0>
+    cudaError_t __host__
+    doit_host(K k, _0 x0) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
+      return cudaPeekAtLastError();
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
+    cudaError_t __host__
+    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
+      return cudaPeekAtLastError();
+    }
+#endif
+
+    template<class T>
+    size_t __device__
+    align_up(size_t offset) const
+    {
+      size_t alignment = alignment_of<T>::value;
+      return alignment * ((offset + (alignment - 1))/ alignment);
+    }
+
+#if 0
+    size_t __device__ argument_pack_size(size_t size) const { return size; }
+    template <class Arg, class... Args>
+    size_t __device__
+    argument_pack_size(size_t size, Arg const& arg, Args const&... args) const
+    {
+      size = align_up<Arg>(size);
+      return argument_pack_size(size + sizeof(Arg), args...);
+    }
+#else
+    template <class Arg>
+    size_t __device__
+    argument_pack_size(size_t size, Arg) const
+    {
+      return align_up<Arg>(size) + sizeof(Arg);
+    }
+    template <class Arg, class _0>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0);
+    }
+    template <class Arg, class _0, class _1>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1);
+    }
+    template <class Arg, class _0, class _1, class _2>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
+    size_t __device__
+    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
+    {
+      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
+    }
+#endif /* variadic */
+
+    template <class Arg>
+    size_t __device__ copy_arg(char* buffer, size_t offset, Arg arg) const
+    {
+      offset = align_up<Arg>(offset);
+      for (int i = 0; i != sizeof(Arg); ++i)
+        buffer[offset+i] = *((char*)&arg + i);
+      return offset + sizeof(Arg);
+    }
+
+#if 0
+    void __device__ fill_arguments(char*, size_t) const {}
+    template<class Arg, class... Args>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg const& arg, Args const& ... args) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), args...);
+    }
+#else
+    template<class Arg>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg) const
+    {
+      copy_arg(buffer, offset, arg);
+    }
+    template<class Arg, class _0>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0);
+    }
+    template <class Arg, class _0, class _1>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1);
+    }
+    template <class Arg, class _0, class _1, class _2>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
+    void __device__
+    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
+    }
+#endif /* variadic */
+
+#if 0
+    template<class K, class... Args>
+    cudaError_t __device__
+    doit_device(K k, Args const&... args) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,args...);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, args...);
+      status = launch_device(k, param_buffer);
+#endif
+      return status;
+    }
+#else
+    template<class K, class _0>
+    cudaError_t __device__
+    doit_device(K k, _0 x0) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+      THRUST_UNUSED_VAR(xB);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+      THRUST_UNUSED_VAR(xB);
+      THRUST_UNUSED_VAR(xC);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+      THRUST_UNUSED_VAR(xB);
+      THRUST_UNUSED_VAR(xC);
+      THRUST_UNUSED_VAR(xD);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+      THRUST_UNUSED_VAR(xB);
+      THRUST_UNUSED_VAR(xC);
+      THRUST_UNUSED_VAR(xD);
+      THRUST_UNUSED_VAR(xE);
+#endif
+      return status;
+    }
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
+    cudaError_t __device__
+    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
+    {
+      cudaError_t status = cudaErrorNotSupported;
+#if __THRUST_HAS_CUDART__
+      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
+      status = launch_device(k, param_buffer);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(x0);
+      THRUST_UNUSED_VAR(x1);
+      THRUST_UNUSED_VAR(x2);
+      THRUST_UNUSED_VAR(x3);
+      THRUST_UNUSED_VAR(x4);
+      THRUST_UNUSED_VAR(x5);
+      THRUST_UNUSED_VAR(x6);
+      THRUST_UNUSED_VAR(x7);
+      THRUST_UNUSED_VAR(x8);
+      THRUST_UNUSED_VAR(x9);
+      THRUST_UNUSED_VAR(xA);
+      THRUST_UNUSED_VAR(xB);
+      THRUST_UNUSED_VAR(xC);
+      THRUST_UNUSED_VAR(xD);
+      THRUST_UNUSED_VAR(xE);
+      THRUST_UNUSED_VAR(xF);
+#endif
+      return status;
+    }
+#endif /* variadic */
+
+    template <class K>
+    cudaError_t __device__
+    launch_device(K k, void* buffer) const
+    {
+#if __THRUST_HAS_CUDART__
+      return cudaLaunchDevice((void*)k,
+                              buffer,
+                              dim3(grid),
+                              dim3(block),
+                              shared_mem,
+                              stream);
+#else
+      THRUST_UNUSED_VAR(k);
+      THRUST_UNUSED_VAR(buffer);
+      return cudaErrorNotSupported;
+#endif
+    }
+
+
+#if defined(__NVCOMPILER_CUDA__)
+#  define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \
+      (__builtin_is_device_code() ?              \
+          doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__))
+#elif defined(__CUDA_ARCH__)
+#  define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device
+#else
+#  define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host
+#endif
+
+#if 0
+    __thrust_exec_check_disable__
+    template <class K, class... Args>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, Args const&... args) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, args...);
+    }
+#else
+    __thrust_exec_check_disable__
+    template <class K, class _0>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+    __thrust_exec_check_disable__
+    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
+    cudaError_t THRUST_FUNCTION
+    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const
+    {
+      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
+    }
+#endif
+#undef THRUST_TRIPLE_LAUNCHER_HOSTDEVICE
+  }; // struct triple_chevron
+
+}    // namespace launcher
+}    // namespace cuda_
+
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea4ed6400b1d1070f83994db7c57636f14024d03
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/util.h
@@ -0,0 +1,773 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <cuda_occupancy.h>
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_scan.cuh>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+namespace core {
+
+#ifdef __NVCOMPILER_CUDA__
+#  if (__NVCOMPILER_CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  else
+#    define THRUST_TUNING_ARCH sm30
+#  endif
+#else
+#  if (__CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  elif (__CUDA_ARCH__ >= 300)
+#    define THRUST_TUNING_ARCH sm30
+#  elif !defined (__CUDA_ARCH__)
+#    define THRUST_TUNING_ARCH sm30
+#  endif
+#endif
+
+  // Typelist - a container of types, supports up to 10 types
+  // --------------------------------------------------------------------------
+
+  class _;
+  template <class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _>
+  struct typelist;
+
+  // -------------------------------------
+
+  // supported SM arch
+  // ---------------------
+  struct sm30  { enum { ver = 300, warpSize = 32 }; };
+  struct sm35  { enum { ver = 350, warpSize = 32 }; };
+  struct sm52  { enum { ver = 520, warpSize = 32 }; };
+  struct sm60  { enum { ver = 600, warpSize = 32 }; };
+
+  // list of sm, checked from left to right order
+  // the rightmost is the lowest sm arch supported
+  // --------------------------------------------
+  typedef typelist<sm60,sm52,sm35,sm30> sm_list;
+
+  // lowest supported SM arch
+  // --------------------------------------------------------------------------
+
+  template<class, class>
+  struct lowest_supported_sm_arch_impl;
+
+  template <class SM, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct lowest_supported_sm_arch_impl<SM, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : lowest_supported_sm_arch_impl<_0, typelist<    _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+  template <class SM>
+  struct lowest_supported_sm_arch_impl<SM, typelist<> >
+  {
+    typedef SM type;
+  };
+
+  typedef typename lowest_supported_sm_arch_impl<_,sm_list>::type lowest_supported_sm_arch;
+
+  // metafunction to match next viable PtxPlan specialization
+  // --------------------------------------------------------------------------
+
+  __THRUST_DEFINE_HAS_NESTED_TYPE(has_tuning_t, tuning)
+  __THRUST_DEFINE_HAS_NESTED_TYPE(has_type_t, type)
+
+  template <template <class> class, class, class>
+  struct specialize_plan_impl_loop;
+  template <template <class> class, class>
+  struct specialize_plan_impl_match;
+
+  // we loop through the sm_list
+  template <template <class> class P, class SM, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_loop<P, SM, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : specialize_plan_impl_loop<P, SM, typelist<    _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+
+  // until we find first lowest match
+  template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_loop <P, SM,  typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : specialize_plan_impl_match<P,      typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+
+  template<class, class>
+  struct has_sm_tuning_impl;
+
+  // specializing for Tunig which needs 1 arg
+  template <class SM,
+            template <class, class> class Tuning,
+            class _0>
+  struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0> > : has_type_t<Tuning<SM, _0> > {};
+
+  // specializing for Tunig which needs 2 args
+  template <class SM,
+            template <class, class,class> class Tuning,
+            class _0, class _1>
+  struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0, _1> > : has_type_t<Tuning<SM, _0, _1> > {};
+
+  template <template <class> class P, class SM>
+  struct has_sm_tuning : has_sm_tuning_impl<SM, typename P<lowest_supported_sm_arch>::tuning > {};
+
+  // once first match is found in sm_list, all remaining sm are possible
+  // candidate for tuning, so pick the first available
+  //   if the plan P has SM-level tuning then pick it,
+  //   otherwise move on to the next sm in the sm_list
+  template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_match<P, typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+      : thrust::detail::conditional<
+            has_sm_tuning<P, SM>::value,
+            P<SM>,
+            specialize_plan_impl_match<P, typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> > >::type {};
+
+    template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
+    struct specialize_plan_msvc10_war
+    {
+      // if Plan has tuning type, this means it has SM-specific tuning
+      // so loop through sm_list to find match,
+      // otherwise just specialize on provided SM
+      typedef thrust::detail::conditional<has_tuning_t<Plan<lowest_supported_sm_arch> >::value,
+                                  specialize_plan_impl_loop<Plan, SM, sm_list>,
+                                  Plan<SM> >
+          type;
+    };
+
+    template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
+    struct specialize_plan : specialize_plan_msvc10_war<Plan,SM>::type::type {};
+
+
+    /////////////////////////
+    /////////////////////////
+    /////////////////////////
+
+    // retrieve temp storage size from an Agent
+    // ---------------------------------------------------------------------------
+    // metafunction introspects Agent, and if it finds TempStorage type
+    // it will return its size
+
+    __THRUST_DEFINE_HAS_NESTED_TYPE(has_temp_storage, TempStorage)
+
+    template <class Agent, class U>
+    struct temp_storage_size_impl;
+
+    template <class Agent>
+    struct temp_storage_size_impl<Agent, thrust::detail::false_type>
+    {
+      enum
+      {
+        value = 0
+      };
+    };
+
+    template <class Agent>
+    struct temp_storage_size_impl<Agent, thrust::detail::true_type>
+    {
+      enum
+      {
+        value = sizeof(typename Agent::TempStorage)
+      };
+    };
+
+    template <class Agent>
+    struct temp_storage_size
+        : temp_storage_size_impl<Agent, typename has_temp_storage<Agent>::type>
+    {
+    };
+
+    // check whether all Agents requires < MAX_SHMEM shared memory
+    // ---------------------------------------------------------------------------
+    // if so, we can use simpler kernel for dispatch, which assumes that all
+    // shared memory is on chip.
+    // Otherwise, a kernel will be compiled which can also accept virtualized
+    // shared memory, in case there is not enough on chip. This kernel is about
+    // 10% slower
+
+    template <bool, class, size_t, class>
+    struct has_enough_shmem_impl;
+
+    template <bool V, class A, size_t S, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    struct has_enough_shmem_impl<V, A, S, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+        : has_enough_shmem_impl<
+              V && (temp_storage_size<specialize_plan<A::template PtxPlan, _0> >::value <= S),
+              A,
+              S,
+              typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> >
+    {
+    };
+    template <bool V, class A, size_t S>
+    struct has_enough_shmem_impl<V, A, S, typelist<> >
+    {
+      enum
+      {
+        value = V
+      };
+      typedef typename thrust::detail::conditional<value,
+                                           thrust::detail::true_type,
+                                           thrust::detail::false_type>::type type;
+    };
+
+    template <class Agent, size_t MAX_SHMEM>
+    struct has_enough_shmem : has_enough_shmem_impl<true, Agent, MAX_SHMEM, sm_list>
+    {
+    };
+
+    /////////////////////////
+    /////////////////////////
+    /////////////////////////
+
+    // AgentPlan structure and helpers
+    // --------------------------------
+
+    struct AgentPlan
+    {
+      int block_threads;
+      int items_per_thread;
+      int items_per_tile;
+      int shared_memory_size;
+      int grid_size;
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan() {}
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(int block_threads_,
+                int items_per_thread_,
+                int shared_memory_size_,
+                int grid_size_ = 0)
+          : block_threads(block_threads_),
+            items_per_thread(items_per_thread_),
+            items_per_tile(items_per_thread * block_threads),
+            shared_memory_size(shared_memory_size_),
+            grid_size(grid_size_)
+      {
+      }
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(AgentPlan const& plan)
+          : block_threads(plan.block_threads),
+            items_per_thread(plan.items_per_thread),
+            items_per_tile(plan.items_per_tile),
+            shared_memory_size(plan.shared_memory_size),
+            grid_size(plan.grid_size) {}
+
+      template <class PtxPlan>
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(PtxPlan,
+                typename thrust::detail::disable_if_convertible<
+                    PtxPlan,
+                    AgentPlan>::type* = NULL)
+          : block_threads(PtxPlan::BLOCK_THREADS),
+            items_per_thread(PtxPlan::ITEMS_PER_THREAD),
+            items_per_tile(PtxPlan::ITEMS_PER_TILE),
+            shared_memory_size(temp_storage_size<PtxPlan>::value),
+            grid_size(0)
+      {
+      }
+    };    // struct AgentPlan
+
+
+    __THRUST_DEFINE_HAS_NESTED_TYPE(has_Plan, Plan)
+
+    template <class Agent>
+    struct return_Plan
+    {
+      typedef typename Agent::Plan type;
+    };
+
+    template <class Agent>
+    struct get_plan : thrust::detail::conditional<
+                          has_Plan<Agent>::value,
+                          return_Plan<Agent>,
+                          thrust::detail::identity_<AgentPlan> >::type
+    {
+    };
+
+    // returns AgentPlan corresponding to a given ptx version
+    // ------------------------------------------------------
+
+    template<class, class>
+    struct get_agent_plan_impl;
+
+    template<class Agent, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    struct get_agent_plan_impl<Agent,typelist<SM,_1,_2,_3,_4,_5,_6,_7,_8,_9> >
+    {
+      typedef typename get_plan<Agent>::type Plan;
+      Plan THRUST_RUNTIME_FUNCTION
+      static get(int ptx_version)
+      {
+        if (ptx_version >= SM::ver)
+          return Plan(specialize_plan<Agent::template PtxPlan, SM>());
+        else
+          return get_agent_plan_impl<Agent,
+                                     typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> >::
+              get(ptx_version);
+      }
+    };
+
+    template<class Agent>
+    struct get_agent_plan_impl<Agent,typelist<lowest_supported_sm_arch> >
+    {
+      typedef typename get_plan<Agent>::type Plan;
+      Plan THRUST_RUNTIME_FUNCTION
+      static get(int /* ptx_version */)
+      {
+        typedef typename get_plan<Agent>::type Plan;
+        return Plan(specialize_plan<Agent::template PtxPlan, lowest_supported_sm_arch>());
+      }
+    };
+
+    template <class Agent>
+    typename get_plan<Agent>::type THRUST_RUNTIME_FUNCTION
+    get_agent_plan(int ptx_version)
+    {
+      // Use one path, with Agent::ptx_plan, for device code where device-side
+      // kernel launches are supported. The other path, with
+      // get_agent_plan_impl::get(version), is for host code and for device
+      // code without device-side kernel launches. NVCC and Feta check for
+      // these situations differently.
+      #ifdef __NVCOMPILER_CUDA__
+        #ifdef __THRUST_HAS_CUDART__
+          if (CUB_IS_DEVICE_CODE) {
+            return typename get_plan<Agent>::type(typename Agent::ptx_plan());
+          } else
+        #endif
+        {
+          return get_agent_plan_impl<Agent, sm_list>::get(ptx_version);
+        }
+      #else
+        #if (CUB_PTX_ARCH > 0) && defined(__THRUST_HAS_CUDART__)
+          typedef typename get_plan<Agent>::type Plan;
+          THRUST_UNUSED_VAR(ptx_version);
+          // We're on device, use default policy
+          return Plan(typename Agent::ptx_plan());
+        #else
+          return get_agent_plan_impl<Agent, sm_list>::get(ptx_version);
+        #endif
+      #endif
+    }
+
+// XXX keep this dead-code for now as a gentle reminder
+//     that kernel luunch which reats plan values is the most robust
+//     mechanism to extract sm-specific tuning parameters
+// TODO: since we are unable to afford kernel launch + cudaMemcpy ON EVERY
+//       algorithm invocation, we need to design a good caching strategy
+//       such that when the algorithm is called multiple times, only the
+//       first invocation will invoke kernel launch + cudaMemcpy, but
+//       the subsequent invocations, will just read cached values from host mem
+//       If launched from device, this is just a device-function call
+//       no caching is required.
+// ----------------------------------------------------------------------------
+  // if we don't know ptx version, we can call kernel
+  // to retrieve AgentPlan from device code. Slower, but guaranteed to work
+  // -----------------------------------------------------------------------
+#if 0
+  template<class Agent>
+  void __global__ get_agent_plan_kernel(AgentPlan *plan);
+
+  static __device__ AgentPlan agent_plan_device;
+
+  template<class Agent>
+  AgentPlan __device__ get_agent_plan_dev()
+  {
+    AgentPlan plan;
+    plan.block_threads      = Agent::ptx_plan::BLOCK_THREADS;
+    plan.items_per_thread   = Agent::ptx_plan::ITEMS_PER_THREAD;
+    plan.items_per_tile     = Agent::ptx_plan::ITEMS_PER_TILE;
+    plan.shared_memory_size = temp_storage_size<typename Agent::ptx_plan>::value;
+    return plan;
+  }
+
+  template <class Agent, class F>
+  AgentPlan __host__ __device__ __forceinline__
+  xget_agent_plan_impl(F f, cudaStream_t s, void* d_ptr)
+  {
+    AgentPlan plan;
+#ifdef __CUDA_ARCH__
+    plan = get_agent_plan_dev<Agent>();
+#else
+    static cub::Mutex mutex;
+    bool lock = false;
+    if (d_ptr == 0)
+    {
+      lock = true;
+      cudaGetSymbolAddress(&d_ptr, agent_plan_device);
+    }
+    if (lock)
+      mutex.Lock();
+    f<<<1,1,0,s>>>((AgentPlan*)d_ptr);
+    cudaMemcpyAsync((void*)&plan,
+                    d_ptr,
+                    sizeof(AgentPlan),
+                    cudaMemcpyDeviceToHost,
+                    s);
+    if (lock)
+      mutex.Unlock();
+    cudaStreamSynchronize(s);
+#endif
+    return plan;
+  }
+
+  template <class Agent>
+  AgentPlan THRUST_RUNTIME_FUNCTION
+  get_agent_plan(cudaStream_t s = 0, void *ptr = 0)
+  {
+    return xget_agent_plan_impl<Agent>(get_agent_plan_kernel<Agent>,
+                                        s,
+                                        ptr);
+  }
+
+  template<class Agent>
+  void __global__ get_agent_plan_kernel(AgentPlan *plan)
+  {
+    *plan = get_agent_plan_dev<Agent>();
+  }
+#endif
+
+  /////////////////////////
+  /////////////////////////
+  /////////////////////////
+
+  THRUST_RUNTIME_FUNCTION
+  int get_sm_count()
+  {
+    int dev_id;
+    cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
+                             "get_sm_count :"
+                             "failed to cudaGetDevice");
+
+    cudaError_t status;
+    int         i32value;
+    status = cudaDeviceGetAttribute(&i32value,
+                                    cudaDevAttrMultiProcessorCount,
+                                    dev_id);
+    cuda_cub::throw_on_error(status,
+                             "get_sm_count:"
+                             "failed to sm_count");
+    return i32value;
+  }
+
+  size_t THRUST_RUNTIME_FUNCTION
+  get_max_shared_memory_per_block()
+  {
+    int dev_id;
+    cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
+                             "get_max_shared_memory_per_block :"
+                             "failed to cudaGetDevice");
+
+    cudaError_t status;
+    int         i32value;
+    status = cudaDeviceGetAttribute(&i32value,
+                                    cudaDevAttrMaxSharedMemoryPerBlock,
+                                    dev_id);
+    cuda_cub::throw_on_error(status,
+                             "get_max_shared_memory_per_block :"
+                             "failed to get max shared memory per block");
+
+    return static_cast<size_t>(i32value);
+  }
+
+  size_t THRUST_RUNTIME_FUNCTION
+  virtual_shmem_size(size_t shmem_per_block)
+  {
+    size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+    if (shmem_per_block > max_shmem_per_block)
+      return shmem_per_block;
+    else
+      return 0;
+  }
+
+  size_t THRUST_RUNTIME_FUNCTION
+  vshmem_size(size_t shmem_per_block, size_t num_blocks)
+  {
+    size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+    if (shmem_per_block > max_shmem_per_block)
+      return shmem_per_block*num_blocks;
+    else
+      return 0;
+  }
+
+  // LoadIterator
+  // ------------
+  // if trivial iterator is passed, wrap loads into LDG
+  //
+  template <class PtxPlan, class It>
+  struct LoadIterator
+  {
+    typedef typename iterator_traits<It>::value_type      value_type;
+    typedef typename iterator_traits<It>::difference_type size_type;
+
+    typedef typename thrust::detail::conditional<
+        is_contiguous_iterator<It>::value,
+        cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER,
+                                        value_type,
+                                        size_type>,
+        It>::type type;
+  };    // struct Iterator
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */)
+  {
+    return raw_pointer_cast(&*it);
+  }
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
+  {
+    return it;
+  }
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator(PtxPlan const&, It it)
+  {
+    return make_load_iterator_impl<PtxPlan>(
+        it, typename is_contiguous_iterator<It>::type());
+  }
+
+  template<class>
+  struct get_arch;
+
+  template<template<class> class Plan, class Arch>
+  struct get_arch<Plan<Arch> > { typedef Arch type; };
+
+  // BlockLoad
+  // -----------
+  // a helper metaprogram that returns type of a block loader
+  template <class PtxPlan,
+            class It,
+            class T    = typename iterator_traits<It>::value_type>
+  struct BlockLoad
+  {
+    typedef cub::BlockLoad<T,
+                           PtxPlan::BLOCK_THREADS,
+                           PtxPlan::ITEMS_PER_THREAD,
+                           PtxPlan::LOAD_ALGORITHM,
+                           1,
+                           1,
+                           get_arch<PtxPlan>::type::ver>
+
+
+        type;
+  };
+
+  // BlockStore
+  // -----------
+  // a helper metaprogram that returns type of a block loader
+  template <class PtxPlan,
+            class It,
+            class T = typename iterator_traits<It>::value_type>
+  struct BlockStore
+  {
+    typedef cub::BlockStore<T,
+                            PtxPlan::BLOCK_THREADS,
+                            PtxPlan::ITEMS_PER_THREAD,
+                            PtxPlan::STORE_ALGORITHM,
+                            1,
+                            1,
+                            get_arch<PtxPlan>::type::ver>
+        type;
+  };
+  // cuda_otional
+  // --------------
+  // used for function that return cudaError_t along with the result
+  //
+  template <class T>
+  class cuda_optional
+  {
+    cudaError_t status_;
+    T           value_;
+
+  public:
+    __host__ __device__
+    cuda_optional() : status_(cudaSuccess) {}
+
+    __host__ __device__
+    cuda_optional(T v, cudaError_t status = cudaSuccess) : status_(status), value_(v) {}
+
+    bool __host__ __device__
+    isValid() const { return cudaSuccess == status_; }
+
+    cudaError_t __host__ __device__
+    status() const { return status_; }
+
+    __host__ __device__ T const &
+    value() const { return value_; }
+
+    __host__ __device__ operator T const &() const { return value_; }
+  };
+
+  cuda_optional<int> THRUST_RUNTIME_FUNCTION
+  get_ptx_version()
+  {
+    int ptx_version = 0;
+    cudaError_t status = cub::PtxVersion(ptx_version);
+    return cuda_optional<int>(ptx_version, status);
+  }
+
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  sync_stream(cudaStream_t stream)
+  {
+    return cub::SyncStream(stream);
+  }
+
+  inline void __device__ sync_threadblock()
+  {
+    cub::CTA_SYNC();
+  }
+
+#define CUDA_CUB_RET_IF_FAIL(e) \
+  {                             \
+    auto const error = (e);     \
+    if (cub::Debug(error, __FILE__, __LINE__)) return error; \
+  }
+
+  // uninitialized
+  // -------
+  // stores type in uninitialized form
+  //
+  template <class T>
+  struct uninitialized
+  {
+    typedef typename cub::UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+      WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    DeviceWord storage[WORDS];
+
+    __host__ __device__ __forceinline__ T& get()
+    {
+      return reinterpret_cast<T&>(*this);
+    }
+
+    __host__ __device__ __forceinline__ operator T&() { return get(); }
+  };
+
+  // uninitialized_array
+  // --------------
+  // allocates uninitialized data on stack
+  template<class T, size_t N>
+  struct array
+  {
+    typedef T value_type;
+    typedef T ref[N];
+    enum {SIZE = N};
+    private:
+      T data_[N];
+
+    public:
+      __host__ __device__ T* data() { return data_; }
+      __host__ __device__ const T* data() const { return data_; }
+      __host__ __device__ T& operator[](unsigned int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](unsigned int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ unsigned int size() const { return N; }
+      __host__ __device__ operator ref&() { return data_; }
+  };
+
+
+  // uninitialized_array
+  // --------------
+  // allocates uninitialized data on stack
+  template<class T, size_t N>
+  struct uninitialized_array
+  {
+    typedef T value_type;
+    typedef T ref[N];
+    enum {SIZE = N};
+    private:
+      char data_[N * sizeof(T)];
+
+    public:
+      __host__ __device__ T* data() { return data_; }
+      __host__ __device__ const T* data() const { return data_; }
+      __host__ __device__ T& operator[](unsigned int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](unsigned int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ T& operator[](int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ unsigned int size() const { return N; }
+      __host__ __device__ operator ref&() { return *reinterpret_cast<ref*>(data_); }
+      __host__ __device__ ref& get_ref() { return (ref&)*this; }
+  };
+
+  __host__ __device__ __forceinline__ size_t align_to(size_t n, size_t align)
+  {
+    return ((n+align-1)/align) * align;
+  }
+
+  namespace host {
+    inline cuda_optional<size_t> get_max_shared_memory_per_block()
+    {
+      cudaError_t status = cudaSuccess;
+      int         dev_id = 0;
+      status             = cudaGetDevice(&dev_id);
+      if (status != cudaSuccess) return cuda_optional<size_t>(0, status);
+
+      int max_shmem = 0;
+      status        = cudaDeviceGetAttribute(&max_shmem,
+                                      cudaDevAttrMaxSharedMemoryPerBlock,
+                                      dev_id);
+      if (status != cudaSuccess) return cuda_optional<size_t>(0, status);
+      return cuda_optional<size_t>(max_shmem, status);
+    }
+  }
+
+  template <int           ALLOCATIONS>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  alias_storage(void*   storage_ptr,
+                size_t& storage_size,
+                void* (&allocations)[ALLOCATIONS],
+                size_t (&allocation_sizes)[ALLOCATIONS])
+  {
+    return cub::AliasTemporaries(storage_ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+  }
+
+
+}    // namespace core
+using core::sm60;
+using core::sm52;
+using core::sm35;
+using core::sm30;
+} // namespace cuda_
+
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/count.h b/thrust/thrust/system/cuda/detail/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d8f0c02dd18bbb46ca145223677a6acc8c87240
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/count.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class UnaryPred>
+typename iterator_traits<InputIt>::difference_type __host__ __device__
+count_if(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         UnaryPred                  unary_pred)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  typedef transform_input_iterator_t<size_type,
+                                     InputIt,
+                                     UnaryPred>
+      flag_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            flag_iterator_t(first, unary_pred),
+                            thrust::distance(first, last),
+                            size_type(0),
+                            plus<size_type>());
+}
+
+template <class Derived,
+          class InputIt,
+          class Value>
+typename iterator_traits<InputIt>::difference_type __host__ __device__
+count(execution_policy<Derived> &policy,
+      InputIt                    first,
+      InputIt                    last,
+      Value const &              value)
+{
+  return cuda_cub::count_if(policy,
+                            first,
+                            last,
+                            thrust::detail::equal_to_value<Value>(value));
+}
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/cross_system.h b/thrust/thrust/system/cuda/detail/cross_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..f89f3dba8d3c9c07e259e0aba3ed7aed6dfa1f54
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/cross_system.h
@@ -0,0 +1,344 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+  template <class Sys1, class Sys2>
+  struct cross_system : execution_policy<cross_system<Sys1, Sys2> >
+  {
+    typedef thrust::execution_policy<Sys1> policy1;
+    typedef thrust::execution_policy<Sys2> policy2;
+
+    policy1 &sys1;
+    policy2 &sys2;
+
+    inline __host__ __device__
+    cross_system(policy1 &sys1, policy2 &sys2) : sys1(sys1), sys2(sys2) {}
+
+    inline __host__ __device__
+    cross_system<Sys2, Sys1> rotate() const
+    {
+      return cross_system<Sys2, Sys1>(sys2, sys1);
+    }
+  };
+
+#if THRUST_CPP_DIALECT >= 2011
+  // Device to host.
+  template <class Sys1, class Sys2>
+  THRUST_CONSTEXPR __host__ __device__ 
+  auto direction_of_copy(
+    thrust::system::cuda::execution_policy<Sys1> const&
+  , thrust::cpp::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToHost
+    >{}
+  )
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  THRUST_CONSTEXPR __host__ __device__
+  auto direction_of_copy(
+    thrust::cpp::execution_policy<Sys1> const&
+  , thrust::system::cuda::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyHostToDevice
+    >{}
+  )
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  THRUST_CONSTEXPR __host__ __device__
+  auto direction_of_copy(
+    thrust::system::cuda::execution_policy<Sys1> const&
+  , thrust::system::cuda::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToDevice
+    >{}
+  )
+
+  // Device to device.
+  template <class DerivedPolicy>
+  THRUST_CONSTEXPR __host__ __device__ 
+  auto direction_of_copy(execution_policy<DerivedPolicy> const &)
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToDevice
+    >{}
+  )
+
+  template <class Sys1, class Sys2>
+  THRUST_CONSTEXPR __host__ __device__
+  auto direction_of_copy(
+    execution_policy<cross_system<Sys1, Sys2>> const &systems
+  )
+  THRUST_DECLTYPE_RETURNS(
+    direction_of_copy(
+      derived_cast(derived_cast(systems).sys1)
+    , derived_cast(derived_cast(systems).sys2)
+    )
+  )
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_device_to_host_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  )
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyDeviceToHost == Direction::value
+      >
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_device_to_host_copy(ExecutionPolicy const& exec)
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyDeviceToHost == Direction::value
+      >
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_host_to_device_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  )
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyHostToDevice == Direction::value
+      >
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_host_to_device_copy(ExecutionPolicy const& exec)
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyHostToDevice == Direction::value
+      >
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_device_to_device_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  )
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyDeviceToDevice == Direction::value
+      >
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  THRUST_CONSTEXPR __host__ __device__
+  auto is_device_to_device_copy(ExecutionPolicy const& exec)
+    noexcept -> 
+      thrust::detail::integral_constant<
+        bool, cudaMemcpyDeviceToDevice == Direction::value
+      >
+  {
+    return {};
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> &sys1,
+                       thrust::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> const &sys1,
+                       thrust::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::execution_policy<Sys1> &,
+                       thrust::cuda::execution_policy<Sys2> &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::execution_policy<Sys1> const &,
+                       thrust::cuda::execution_policy<Sys2> const &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> &sys1,
+                       thrust::cuda::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> const &sys1,
+                       thrust::cuda::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  /////////////////////////////////////////////////////////////////////////////
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::cuda::execution_policy<Sys1> &,
+                     thrust::execution_policy<Sys2> &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::cuda::execution_policy<Sys1> const &,
+                     thrust::execution_policy<Sys2> const &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> &sys1,
+                     thrust::cuda::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> const &sys1,
+                     thrust::cuda::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> &sys1,
+                     thrust::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> const &sys1,
+                     thrust::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+#endif
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  cross_system<Sys1, Sys2>
+  select_system(execution_policy<Sys1> const &             sys1,
+                thrust::cpp::execution_policy<Sys2> const &sys2)
+  {
+    thrust::execution_policy<Sys1> &     non_const_sys1 = const_cast<execution_policy<Sys1> &>(sys1);
+    thrust::cpp::execution_policy<Sys2> &non_const_sys2 = const_cast<thrust::cpp::execution_policy<Sys2> &>(sys2);
+    return cross_system<Sys1, Sys2>(non_const_sys1, non_const_sys2);
+  }
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  cross_system<Sys1, Sys2>
+  select_system(thrust::cpp::execution_policy<Sys1> const &sys1,
+                execution_policy<Sys2> const &             sys2)
+  {
+    thrust::cpp::execution_policy<Sys1> &non_const_sys1 = const_cast<thrust::cpp::execution_policy<Sys1> &>(sys1);
+    thrust::execution_policy<Sys2> &     non_const_sys2 = const_cast<execution_policy<Sys2> &>(sys2);
+    return cross_system<Sys1, Sys2>(non_const_sys1, non_const_sys2);
+  }
+
+} // namespace cuda_cub
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..45b034217996c5c474e6b91009c57821337a0ef2
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/dispatch.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/preprocessor.h>
+#include <thrust/detail/integer_traits.h>
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call arguments; \
+    } \
+    else { \
+        thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call arguments; \
+    }
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ *
+ * This version of the macro supports providing two count variables, which is
+ * necessary for set algorithms.
+ */
+#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \
+    if (count1 + count2 <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        thrust::detail::int32_t THRUST_PP_CAT2(count1, _fixed) = count1; \
+        thrust::detail::int32_t THRUST_PP_CAT2(count2, _fixed) = count2; \
+        status = call arguments; \
+    } \
+    else { \
+        thrust::detail::int64_t THRUST_PP_CAT2(count1, _fixed) = count1; \
+        thrust::detail::int64_t THRUST_PP_CAT2(count2, _fixed) = count2; \
+        status = call arguments; \
+    }
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version allows using different token sequences for callables
+ * in both branches, and is intended to be used with CUB-style dispatch interfaces,
+ * where the "simple" interface always forces the size to be `int` (making it harder
+ * for us to use), but the complex interface that we end up using doesn't actually
+ * provide a way to fully deduce the type from just the call, making the size type
+ * appear in the token sequence of the callable.
+ *
+ * See reduce_n_impl to see an example of how this is meant to be used.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call_32 arguments; \
+    } \
+    else { \
+        thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
+        status = call_64 arguments; \
+    }
+
diff --git a/thrust/thrust/system/cuda/detail/equal.h b/thrust/thrust/system/cuda/detail/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd5e7d6863f378899330cc1e69d7667a87047338
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/equal.h
@@ -0,0 +1,74 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/mismatch.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+bool __host__ __device__
+equal(execution_policy<Derived>& policy,
+      InputIt1                   first1,
+      InputIt1                   last1,
+      InputIt2                   first2,
+      BinaryPred                 binary_pred)
+{
+  return cuda_cub::mismatch(policy, first1, last1, first2, binary_pred).first == last1;
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+bool __host__ __device__
+equal(execution_policy<Derived>& policy,
+      InputIt1                   first1,
+      InputIt1                   last1,
+      InputIt2                   first2)
+{
+  typedef typename thrust::iterator_value<InputIt1>::type InputType1;
+  return cuda_cub::equal(policy,
+                         first1,
+                         last1,
+                         first2,
+                         equal_to<InputType1>());
+}
+
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/error.inl b/thrust/thrust/system/cuda/detail/error.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5c689b49984eb3d8ae2c764431fbf37af9de1073
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/error.inl
@@ -0,0 +1,98 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/system/cuda/error.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+
+error_code make_error_code(cuda::errc::errc_t e)
+{
+  return error_code(static_cast<int>(e), cuda_category());
+} // end make_error_code()
+
+
+error_condition make_error_condition(cuda::errc::errc_t e)
+{
+  return error_condition(static_cast<int>(e), cuda_category());
+} // end make_error_condition()
+
+
+namespace cuda_cub
+{
+
+namespace detail
+{
+
+
+class cuda_error_category
+  : public error_category
+{
+  public:
+    inline cuda_error_category(void) {}
+
+    inline virtual const char *name(void) const
+    {
+      return "cuda";
+    }
+
+    inline virtual std::string message(int ev) const
+    {
+      char const* const unknown_str  = "unknown error";
+      char const* const unknown_name = "cudaErrorUnknown";
+      char const* c_str  = ::cudaGetErrorString(static_cast<cudaError_t>(ev));
+      char const* c_name = ::cudaGetErrorName(static_cast<cudaError_t>(ev));
+      return std::string(c_name ? c_name : unknown_name)
+           + ": " + (c_str ? c_str : unknown_str);
+    }
+
+    inline virtual error_condition default_error_condition(int ev) const
+    {
+      using namespace cuda::errc;
+
+      if(ev < ::cudaErrorApiFailureBase)
+      {
+        return make_error_condition(static_cast<errc_t>(ev));
+      }
+
+      return system_category().default_error_condition(ev);
+    }
+}; // end cuda_error_category
+
+} // end detail
+
+} // end namespace cuda_cub
+
+
+const error_category &cuda_category(void)
+{
+  static const thrust::system::cuda_cub::detail::cuda_error_category result;
+  return result;
+}
+
+
+} // end namespace system
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/detail/execution_policy.h b/thrust/thrust/system/cuda/detail/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee49a60cb44a3183e6788f3d0b847204afc36380
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/execution_policy.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/version.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/allocator_aware_execution_policy.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/dependencies_aware_execution_policy.h>
+#endif
+
+namespace thrust
+{
+
+namespace cuda_cub
+{
+
+struct tag;
+
+template <class>
+struct execution_policy;
+
+template <>
+struct execution_policy<tag> : thrust::execution_policy<tag>
+{
+  typedef tag tag_type;
+};
+
+struct tag : execution_policy<tag>
+, thrust::detail::allocator_aware_execution_policy<cuda_cub::execution_policy>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<cuda_cub::execution_policy>
+#endif
+{};
+
+template <class Derived>
+struct execution_policy : thrust::execution_policy<Derived>
+{
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
+};
+
+} // namespace cuda_cub
+
+namespace system { namespace cuda { namespace detail
+{
+
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
+
+}}} // namespace system::cuda::detail
+
+namespace system { namespace cuda
+{
+
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
+
+}} // namespace system::cuda
+
+namespace cuda
+{
+
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
+
+} // namespace cuda
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..40903cd9a9aca0ec22b5521a33964deea9961cd9
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -0,0 +1,568 @@
+/*******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/reduce.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __extrema {
+
+  template <class InputType, class IndexType, class Predicate>
+  struct arg_min_f
+  {
+    Predicate predicate;
+    typedef tuple<InputType, IndexType> pair_type;
+
+    __host__ __device__
+    arg_min_f(Predicate p) : predicate(p) {}
+
+    pair_type __device__
+    operator()(pair_type const &lhs, pair_type const &rhs)
+    {
+      InputType const &rhs_value = get<0>(rhs);
+      InputType const &lhs_value = get<0>(lhs);
+      IndexType const &rhs_key   = get<1>(rhs);
+      IndexType const &lhs_key   = get<1>(lhs);
+
+      // check values first
+      if (predicate(lhs_value, rhs_value))
+        return lhs;
+      else if (predicate(rhs_value, lhs_value))
+        return rhs;
+
+      // values are equivalent, prefer smaller index
+      if (lhs_key < rhs_key)
+        return lhs;
+      else
+        return rhs;
+    }
+  };    // struct arg_min_f
+
+  template <class InputType, class IndexType, class Predicate>
+  struct arg_max_f
+  {
+    Predicate predicate;
+    typedef tuple<InputType, IndexType> pair_type;
+
+    __host__ __device__
+    arg_max_f(Predicate p) : predicate(p) {}
+
+    pair_type __device__
+    operator()(pair_type const &lhs, pair_type const &rhs)
+    {
+      InputType const &rhs_value = get<0>(rhs);
+      InputType const &lhs_value = get<0>(lhs);
+      IndexType const &rhs_key   = get<1>(rhs);
+      IndexType const &lhs_key   = get<1>(lhs);
+
+      // check values first
+      if (predicate(lhs_value, rhs_value))
+        return rhs;
+      else if (predicate(rhs_value, lhs_value))
+        return lhs;
+
+      // values are equivalent, prefer smaller index
+      if (lhs_key < rhs_key)
+        return lhs;
+      else
+        return rhs;
+    }
+  };    // struct arg_max_f
+
+  template<class InputType, class IndexType, class Predicate>
+  struct arg_minmax_f
+  {
+    Predicate predicate;
+
+    typedef tuple<InputType, IndexType> pair_type;
+    typedef tuple<pair_type, pair_type> two_pairs_type;
+
+    typedef arg_min_f<InputType, IndexType, Predicate> arg_min_t;
+    typedef arg_max_f<InputType, IndexType, Predicate> arg_max_t;
+
+    __host__ __device__
+    arg_minmax_f(Predicate p) : predicate(p)
+    {
+    }
+
+    two_pairs_type __device__
+    operator()(two_pairs_type const &lhs, two_pairs_type const &rhs)
+    {
+      pair_type const &rhs_min = get<0>(rhs);
+      pair_type const &lhs_min = get<0>(lhs);
+      pair_type const &rhs_max = get<1>(rhs);
+      pair_type const &lhs_max = get<1>(lhs);
+      return thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min),
+                                arg_max_t(predicate)(lhs_max, rhs_max));
+    }
+
+    struct duplicate_tuple
+    {
+      __device__ two_pairs_type
+      operator()(pair_type const &t)
+      {
+        return thrust::make_tuple(t, t);
+      }
+    };
+  }; // struct arg_minmax_f
+
+  template <class T,
+            class InputIt,
+            class OutputIt,
+            class Size,
+            class ReductionOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *       d_temp_storage,
+            size_t &     temp_storage_bytes,
+            InputIt      input_it,
+            Size         num_items,
+            ReductionOp  reduction_op,
+            OutputIt     output_it,
+            cudaStream_t stream,
+            bool         debug_sync)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+    using core::get_agent_plan;
+    using core::cuda_optional;
+
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        __reduce::ReduceAgent<InputIt, OutputIt, T, Size, ReductionOp> >
+        reduce_agent;
+
+    typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream);
+
+    cudaError_t status = cudaSuccess;
+
+
+    if (num_items <= reduce_plan.items_per_tile)
+    {
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+
+      // small, single tile size
+      if (d_temp_storage == NULL)
+      {
+        temp_storage_bytes = max<size_t>(1, vshmem_size);
+        return status;
+      }
+      char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
+
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only", debug_sync);
+      ra.launch(input_it, output_it, num_items, reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+    else
+    {
+      // regular size
+      cuda_optional<int> sm_count = core::get_sm_count();
+      CUDA_CUB_RET_IF_FAIL(sm_count.status());
+
+      // reduction will not use more cta counts than requested
+      cuda_optional<int> max_blocks_per_sm =
+          reduce_agent::
+              template get_max_blocks_per_sm<InputIt,
+                                             OutputIt,
+                                             Size,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
+                                             ReductionOp>(reduce_plan);
+      CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
+
+
+
+      int reduce_device_occupancy = (int)max_blocks_per_sm * sm_count;
+
+      int sm_oversubscription = 5;
+      int max_blocks          = reduce_device_occupancy * sm_oversubscription;
+
+      cub::GridEvenShare<Size> even_share;
+      even_share.DispatchInit(num_items, max_blocks,
+                              reduce_plan.items_per_tile);
+
+      // we will launch at most "max_blocks" blocks in a grid
+      // so preallocate virtual shared memory storage for this if required
+      //
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size,
+                                             max_blocks);
+
+      // Temporary storage allocation requirements
+      void * allocations[3] = {NULL, NULL, NULL};
+      size_t allocation_sizes[3] =
+          {
+              max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              vshmem_size                                        // size of virtualized shared memory storage
+          };
+      status = cub::AliasTemporaries(d_temp_storage,
+                                     temp_storage_bytes,
+                                     allocations,
+                                     allocation_sizes);
+      CUDA_CUB_RET_IF_FAIL(status);
+      if (d_temp_storage == NULL)
+      {
+        return status;
+      }
+
+      T *d_block_reductions = (T*) allocations[0];
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
+      char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
+
+
+      // Get grid size for device_reduce_sweep_kernel
+      int reduce_grid_size = 0;
+      if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE)
+      {
+        // Work is distributed evenly
+        reduce_grid_size = even_share.grid_size;
+      }
+      else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
+      {
+        // Work is distributed dynamically
+        size_t num_tiles = (num_items + reduce_plan.items_per_tile - 1) /
+          reduce_plan.items_per_tile;
+
+        // if not enough to fill the device with threadblocks
+        // then fill the device with threadblocks
+        reduce_grid_size = static_cast<int>(min(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+
+        typedef AgentLauncher<__reduce::DrainAgent<Size> > drain_agent;
+        AgentPlan drain_plan = drain_agent::get_plan();
+        drain_plan.grid_size = 1;
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent", debug_sync);
+        da.launch(queue, num_items);
+        CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+      }
+      else
+      {
+        CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported);
+      }
+
+      reduce_plan.grid_size = reduce_grid_size;
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce", debug_sync);
+      ra.launch(input_it,
+                d_block_reductions,
+                num_items,
+                even_share,
+                queue,
+                reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+
+      typedef AgentLauncher<
+        __reduce::ReduceAgent<T*, OutputIt, T, Size, ReductionOp> >
+        reduce_agent_single;
+
+      reduce_plan.grid_size = 1;
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce", debug_sync);
+
+      ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    return status;
+  }    // func doit_step
+
+  // this is an init-less reduce, needed for min/max-element functionality
+  // this will avoid copying the first value from device->host
+  template <typename Derived,
+            typename InputIt,
+            typename Size,
+            typename BinaryOp,
+            typename T>
+  THRUST_RUNTIME_FUNCTION
+  T extrema(execution_policy<Derived>& policy,
+            InputIt                    first,
+            Size                       num_items,
+            BinaryOp                   binary_op,
+            T*)
+  {
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (NULL, temp_storage_bytes, first, num_items_fixed,
+            binary_op, reinterpret_cast<T*>(NULL), stream,
+            debug_sync));
+    cuda_cub::throw_on_error(status, "extrema failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");
+
+    T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
+
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (allocations[1], temp_storage_bytes, first,
+            num_items_fixed, binary_op, d_result, stream,
+            debug_sync));
+    cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "extrema failed to synchronize");
+
+    T result = cuda_cub::get_value(policy, d_result);
+
+    return result;
+  }
+
+  template <template <class, class, class> class ArgFunctor,
+            class Derived,
+            class ItemsIt,
+            class BinaryPred>
+  ItemsIt THRUST_RUNTIME_FUNCTION
+  element(execution_policy<Derived> &policy,
+          ItemsIt                    first,
+          ItemsIt                    last,
+          BinaryPred                 binary_pred)
+  {
+    if (first == last)
+      return last;
+
+    typedef typename iterator_traits<ItemsIt>::value_type      InputType;
+    typedef typename iterator_traits<ItemsIt>::difference_type IndexType;
+
+    IndexType num_items = static_cast<IndexType>(thrust::distance(first, last));
+
+    typedef tuple<ItemsIt, counting_iterator_t<IndexType> > iterator_tuple;
+    typedef zip_iterator<iterator_tuple> zip_iterator;
+
+    iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
+
+
+    typedef ArgFunctor<InputType, IndexType, BinaryPred> arg_min_t;
+    typedef tuple<InputType, IndexType> T;
+
+    zip_iterator begin = make_zip_iterator(iter_tuple);
+
+    T result = extrema(policy,
+                       begin,
+                       num_items,
+                       arg_min_t(binary_pred),
+                       (T *)(NULL));
+    return first + thrust::get<1>(result);
+  }
+
+
+}    // namespace __extrema
+
+/// min element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+ItemsIt __host__ __device__
+min_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            BinaryPred                 binary_pred)
+{
+  ItemsIt ret = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __extrema::element<__extrema::arg_min_f>(policy,
+                                                   first,
+                                                   last,
+                                                   binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::min_element(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt>
+ItemsIt __host__ __device__
+min_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::min_element(policy, first, last, less<value_type>());
+}
+
+/// max element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+ItemsIt __host__ __device__
+max_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            BinaryPred                 binary_pred)
+{
+  ItemsIt ret = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __extrema::element<__extrema::arg_max_f>(policy,
+                                                   first,
+                                                   last,
+                                                   binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::max_element(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt>
+ItemsIt __host__ __device__
+max_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::max_element(policy, first, last, less<value_type>());
+}
+
+/// minmax element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+pair<ItemsIt, ItemsIt> __host__ __device__
+minmax_element(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last,
+               BinaryPred                 binary_pred)
+{
+  pair<ItemsIt, ItemsIt> ret = thrust::make_pair(first, first);
+
+  if (__THRUST_HAS_CUDART__)
+  {
+    if (first == last)
+      return thrust::make_pair(last, last);
+
+    typedef typename iterator_traits<ItemsIt>::value_type      InputType;
+    typedef typename iterator_traits<ItemsIt>::difference_type IndexType;
+
+    IndexType num_items = static_cast<IndexType>(thrust::distance(first, last));
+
+
+    typedef tuple<ItemsIt, counting_iterator_t<IndexType> > iterator_tuple;
+    typedef zip_iterator<iterator_tuple> zip_iterator;
+
+    iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
+
+
+    typedef __extrema::arg_minmax_f<InputType, IndexType, BinaryPred> arg_minmax_t;
+    typedef typename arg_minmax_t::two_pairs_type  two_pairs_type;
+    typedef typename arg_minmax_t::duplicate_tuple duplicate_t;
+    typedef transform_input_iterator_t<two_pairs_type,
+                                       zip_iterator,
+                                       duplicate_t>
+        transform_t;
+
+    zip_iterator   begin  = make_zip_iterator(iter_tuple);
+    two_pairs_type result = __extrema::extrema(policy,
+                                               transform_t(begin, duplicate_t()),
+                                               num_items,
+                                               arg_minmax_t(binary_pred),
+                                               (two_pairs_type *)(NULL));
+    ret = thrust::make_pair(first + get<1>(get<0>(result)),
+                    first + get<1>(get<1>(result)));
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)),
+                                 first,
+                                 last,
+                                 binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt>
+pair<ItemsIt, ItemsIt> __host__ __device__
+minmax_element(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::minmax_element(policy, first, last, less<value_type>());
+}
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/fill.h b/thrust/thrust/system/cuda/detail/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..078e1b3781fda6e5de9824e1f96d61a529c6f839
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/fill.h
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __fill {
+
+  // fill functor
+  template<class Iterator, class T>
+  struct functor
+  {
+    Iterator it;
+    T value;
+
+    THRUST_FUNCTION
+    functor(Iterator it, T value)
+        : it(it), value(value) {}
+
+    template<class Size>
+    THRUST_DEVICE_FUNCTION void operator()(Size idx)
+    {
+      it[idx] = value;
+    }
+  }; // struct functor
+
+}    // namespace __fill
+
+template <class Derived, class OutputIterator, class Size, class T>
+OutputIterator __host__ __device__
+fill_n(execution_policy<Derived>& policy,
+       OutputIterator             first,
+       Size                       count,
+       const T&                   value)
+{
+  cuda_cub::parallel_for(policy,
+                         __fill::functor<OutputIterator, T>(
+                         first,
+                         value),
+                         count);
+
+  cuda_cub::throw_on_error(
+    cuda_cub::synchronize(policy)
+  , "fill_n: failed to synchronize"
+  );
+
+  return first + count;
+}    // func fill_n
+
+template <class Derived, class ForwardIterator, class T>
+void __host__ __device__
+fill(execution_policy<Derived>& policy,
+     ForwardIterator            first,
+     ForwardIterator            last,
+     const T&                   value)
+{
+  cuda_cub::fill_n(policy, first, thrust::distance(first,last), value);
+} // func filll
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/find.h b/thrust/thrust/system/cuda/detail/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6a1e59d105f0db35d65ee93a058afd143002b35
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/find.h
@@ -0,0 +1,219 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+// XXX forward declare to circumvent circular depedency
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if(execution_policy<Derived>& policy,
+        InputIt                    first,
+        InputIt                    last,
+        Predicate                  predicate);
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if_not(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            Predicate                  predicate);
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+find(execution_policy<Derived> &policy,
+     InputIt                    first,
+     InputIt                    last,
+     T const& value);
+
+}; // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/iterator/zip_iterator.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __find_if {
+
+  template <typename TupleType>
+  struct functor
+  {
+    THRUST_DEVICE_FUNCTION TupleType
+    operator()(const TupleType& lhs, const TupleType& rhs) const
+    {
+      // select the smallest index among true results
+      if (thrust::get<0>(lhs) && thrust::get<0>(rhs))
+      {
+        return TupleType(true, (thrust::min)(thrust::get<1>(lhs), thrust::get<1>(rhs)));
+      }
+      else if (thrust::get<0>(lhs))
+      {
+        return lhs;
+      }
+      else
+      {
+        return rhs;
+      }
+    }
+  };
+}    // namespace __find_if
+
+template <class Derived,
+          class InputIt,
+          class Size,
+          class Predicate>
+InputIt __host__ __device__
+find_if_n(execution_policy<Derived>& policy,
+          InputIt                    first,
+          Size                       num_items,
+          Predicate                  predicate)
+{
+  typedef typename thrust::tuple<bool,Size> result_type;
+  
+  // empty sequence
+  if(num_items == 0) return first;
+  
+  // this implementation breaks up the sequence into separate intervals
+  // in an attempt to early-out as soon as a value is found
+  //
+  // XXX compose find_if from a look-back prefix scan algorithm
+  //     and abort kernel when the first element is found
+
+
+  // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32
+  const Size interval_threshold = 1 << 20;
+  const Size interval_size = (thrust::min)(interval_threshold, num_items);
+  
+  // force transform_iterator output to bool
+  typedef transform_input_iterator_t<bool,
+                                     InputIt,
+                                     Predicate>
+      XfrmIterator;
+  typedef thrust::tuple<XfrmIterator,
+                        counting_iterator_t<Size> >
+      IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  IteratorTuple iter_tuple =
+      thrust::make_tuple(XfrmIterator(first, predicate),
+                         counting_iterator_t<Size>(0));
+
+  ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
+  ZipIterator end   = begin + num_items;
+
+  for (ZipIterator interval_begin = begin;
+       interval_begin < end;
+       interval_begin += interval_size)
+  {
+    ZipIterator interval_end = interval_begin + interval_size;
+    if(end < interval_end)
+    {
+      interval_end = end;
+    } // end if
+
+    result_type result = reduce(policy,
+                                interval_begin,
+                                interval_end,
+                                result_type(false, interval_end - begin),
+                                __find_if::functor<result_type>());
+
+    // see if we found something
+    if(thrust::get<0>(result))
+    {
+      return first + thrust::get<1>(result);
+    }
+  }
+  
+  //nothing was found if we reach here...
+  return first + num_items;
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if(execution_policy<Derived>& policy,
+        InputIt                    first,
+        InputIt                    last,
+        Predicate                  predicate)
+{
+  return cuda_cub::find_if_n(policy, first, thrust::distance(first,last), predicate);
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if_not(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            Predicate                  predicate)
+{
+  return cuda_cub::find_if(policy, first, last, thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+find(execution_policy<Derived> &policy,
+     InputIt                    first,
+     InputIt                    last,
+     T const& value)
+{
+  using thrust::placeholders::_1;
+
+  return cuda_cub::find_if(policy,
+                        first,
+                        last,
+                        _1 == value);
+}
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..542dcf754e752866324d38b630364a8d44a7b75f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/for_each.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/detail/function.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+  // for_each functor
+  template <class Input, class UnaryOp>
+  struct for_each_f
+  {
+    Input input;
+    UnaryOp op;
+
+    THRUST_FUNCTION
+    for_each_f(Input input, UnaryOp op)
+        : input(input), op(op) {}
+
+    template <class Size>
+    THRUST_DEVICE_FUNCTION void operator()(Size idx)
+    {
+      op(raw_reference_cast(input[idx]));
+    }
+  };
+
+  //-------------------------
+  // Thrust API entry points
+  //-------------------------
+
+  // for_each_n
+  template <class Derived,
+            class Input,
+            class Size,
+            class UnaryOp>
+  Input THRUST_FUNCTION
+  for_each_n(execution_policy<Derived> &policy,
+             Input                      first,
+             Size                       count,
+             UnaryOp                    op)
+  {
+    typedef thrust::detail::wrapped_function<UnaryOp, void> wrapped_t;
+    wrapped_t wrapped_op(op);
+
+    cuda_cub::parallel_for(policy,
+                           for_each_f<Input, wrapped_t>(first, wrapped_op),
+                           count);
+
+    cuda_cub::throw_on_error(
+      cuda_cub::synchronize(policy)
+    , "for_each: failed to synchronize"
+    );
+
+    return first + count;
+  }
+
+  // for_each
+  template <class Derived,
+            class Input,
+            class UnaryOp>
+  Input THRUST_FUNCTION
+  for_each(execution_policy<Derived> &policy,
+           Input                      first,
+           Input                      last,
+           UnaryOp                    op)
+  {
+    typedef typename iterator_traits<Input>::difference_type size_type;
+    size_type count = static_cast<size_type>(thrust::distance(first,last));
+    return cuda_cub::for_each_n(policy, first,  count, op);
+  }
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/future.inl b/thrust/thrust/system/cuda/detail/future.inl
new file mode 100644
index 0000000000000000000000000000000000000000..cfc9101952c2a1b2e5b1314ea3744a3c4f88433f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/future.inl
@@ -0,0 +1,1369 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+// TODO: Split into more granular headers (move unique_stream/unique_marker to
+// another header, etc).
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/optional.h>
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/integer_sequence.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/tuple_algorithms.h>
+#include <thrust/allocate_unique.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/execute_with_dependencies.h>
+#include <thrust/detail/event_error.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/get_value.h>
+
+#include <type_traits>
+#include <thrust/detail/memory_wrapper.h>
+
+namespace thrust
+{
+
+// Forward declaration.
+struct new_stream_t;
+
+namespace system { namespace cuda { namespace detail
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct nonowning_t final {};
+
+THRUST_INLINE_CONSTANT nonowning_t nonowning{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct marker_deleter final
+{
+  __host__
+  void operator()(CUevent_st* e) const
+  {
+    if (nullptr != e)
+      thrust::cuda_cub::throw_on_error(cudaEventDestroy(e));
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct unique_marker final
+{
+  using native_handle_type = CUevent_st*;
+
+private:
+  std::unique_ptr<CUevent_st, marker_deleter> handle_;
+
+public:
+  /// \brief Create a new stream and construct a handle to it. When the handle
+  ///        is destroyed, the stream is destroyed.
+  __host__
+  unique_marker()
+    : handle_(nullptr, marker_deleter())
+  {
+    native_handle_type e;
+    thrust::cuda_cub::throw_on_error(
+      cudaEventCreateWithFlags(&e, cudaEventDisableTiming)
+    );
+    handle_.reset(e);
+  }
+
+  __thrust_exec_check_disable__
+  unique_marker(unique_marker const&) = delete;
+  __thrust_exec_check_disable__
+  unique_marker(unique_marker&&) = default;
+  __thrust_exec_check_disable__
+  unique_marker& operator=(unique_marker const&) = delete;
+  __thrust_exec_check_disable__
+  unique_marker& operator=(unique_marker&&) = default;
+
+  __thrust_exec_check_disable__
+  ~unique_marker() = default;
+
+  __host__
+  auto get() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+  __host__
+  auto native_handle() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+
+  __host__
+  bool valid() const noexcept { return bool(handle_); }
+
+  __host__
+  bool ready() const
+  {
+    cudaError_t const err = cudaEventQuery(handle_.get());
+
+    if (cudaErrorNotReady == err)
+      return false;
+
+    // Throw on any other error.
+    thrust::cuda_cub::throw_on_error(err);
+
+    return true;
+  }
+
+  __host__
+  void wait() const
+  {
+    thrust::cuda_cub::throw_on_error(cudaEventSynchronize(handle_.get()));
+  }
+
+  __host__
+  bool operator==(unique_marker const& other) const
+  {
+    return other.handle_ == handle_;
+  }
+
+  __host__
+  bool operator!=(unique_marker const& other) const
+  {
+    return !(other == *this);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct stream_deleter final
+{
+  __host__
+  void operator()(CUstream_st* s) const
+  {
+    if (nullptr != s)
+      thrust::cuda_cub::throw_on_error(cudaStreamDestroy(s));
+  }
+};
+
+struct stream_conditional_deleter final
+{
+private:
+  bool const cond_;
+
+public:
+  __host__
+  constexpr stream_conditional_deleter() noexcept
+    : cond_(true) {}
+
+  __host__
+  explicit constexpr stream_conditional_deleter(nonowning_t) noexcept
+    : cond_(false) {}
+
+  __host__
+  void operator()(CUstream_st* s) const
+  {
+    if (cond_ && nullptr != s)
+    {
+      thrust::cuda_cub::throw_on_error(cudaStreamDestroy(s));
+    }
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct unique_stream final
+{
+  using native_handle_type = CUstream_st*;
+
+private:
+  std::unique_ptr<CUstream_st, stream_conditional_deleter> handle_;
+
+public:
+  /// \brief Create a new stream and construct a handle to it. When the handle
+  ///        is destroyed, the stream is destroyed.
+  __host__
+  unique_stream()
+    : handle_(nullptr, stream_conditional_deleter())
+  {
+    native_handle_type s;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking)
+    );
+    handle_.reset(s);
+  }
+
+  /// \brief Construct a non-owning handle to an existing stream. When the
+  ///        handle is destroyed, the stream is not destroyed.
+  __host__
+  explicit unique_stream(nonowning_t, native_handle_type handle)
+    : handle_(handle, stream_conditional_deleter(nonowning))
+  {}
+
+  __thrust_exec_check_disable__
+  unique_stream(unique_stream const&) = delete;
+  __thrust_exec_check_disable__
+  unique_stream(unique_stream&&) = default;
+  __thrust_exec_check_disable__
+  unique_stream& operator=(unique_stream const&) = delete;
+  __thrust_exec_check_disable__
+  unique_stream& operator=(unique_stream&&) = default;
+
+  __thrust_exec_check_disable__
+  ~unique_stream() = default;
+
+  __host__
+  auto get() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+  __host__
+  auto native_handle() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+
+  __host__
+  bool valid() const noexcept { return bool(handle_); }
+
+  __host__
+  bool ready() const
+  {
+    cudaError_t const err = cudaStreamQuery(handle_.get());
+
+    if (cudaErrorNotReady == err)
+      return false;
+
+    // Throw on any other error.
+    thrust::cuda_cub::throw_on_error(err);
+
+    return true;
+  }
+
+  __host__
+  void wait() const
+  {
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamSynchronize(handle_.get())
+    );
+  }
+
+  __host__
+  void depend_on(unique_marker& e)
+  {
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamWaitEvent(handle_.get(), e.get(), 0)
+    );
+  }
+
+  __host__
+  void depend_on(unique_stream& s)
+  {
+    if (s != *this)
+    {
+      unique_marker e;
+      s.record(e);
+      depend_on(e);
+    }
+  }
+
+  __host__
+  void record(unique_marker& e)
+  {
+    thrust::cuda_cub::throw_on_error(cudaEventRecord(e.get(), handle_.get()));
+  }
+
+  __host__
+  bool operator==(unique_stream const& other) const
+  {
+    return other.handle_ == handle_;
+  }
+
+  __host__
+  bool operator!=(unique_stream const& other) const
+  {
+    return !(other == *this);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Inheritance hierarchy of future/event shared state types.
+
+struct async_signal;
+
+template <typename KeepAlives>
+struct async_keep_alives /* : virtual async_signal */;
+
+template <typename T>
+struct async_value /* : virtual async_signal */;
+
+template <typename T, typename Pointer, typename KeepAlives>
+struct async_addressable_value_with_keep_alives
+/* : async_value<T>, async_keep_alives<KeepAlives> */;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Pointer>
+struct weak_promise;
+
+template <typename X, typename XPointer = pointer<X>>
+struct unique_eager_future_promise_pair final
+{
+  unique_eager_future<X>    future;
+  weak_promise<X, XPointer> promise;
+};
+
+struct acquired_stream final
+{
+  unique_stream stream;
+  optional<std::size_t> const acquired_from;
+  // `acquired_from` contains the index in the tuple of dependencies from which
+  // the stream was acquired. If `acquired_from` is empty, no stream could be
+  // acquired from a dependency, and then the stream was newly created.
+};
+
+// Precondition: `device` is the current CUDA device.
+template <typename X, typename Y, typename Deleter>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, std::unique_ptr<Y, Deleter>&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, unique_stream& stream) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, ready_event&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+template <typename X>
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, ready_future<X>&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_event& parent) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_future<X>& parent) noexcept;
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream(int device, Dependencies&... deps) noexcept;
+  
+template <typename... Dependencies>
+__host__
+unique_eager_event
+make_dependent_event(
+  std::tuple<Dependencies...>&& deps
+);
+
+template <
+  typename X, typename XPointer
+, typename ComputeContent, typename... Dependencies
+>
+__host__
+unique_eager_future_promise_pair<X, XPointer>
+make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps);
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct async_signal
+{
+protected:
+  unique_stream stream_;
+
+public:
+  // Constructs an `async_signal` which uses `stream`.
+  __host__
+  explicit async_signal(unique_stream&& stream)
+    : stream_(std::move(stream))
+  {}
+
+  __host__
+  virtual ~async_signal() {}
+
+  unique_stream&       stream()       noexcept { return stream_; }
+  unique_stream const& stream() const noexcept { return stream_; }
+};
+
+template <typename... KeepAlives>
+struct async_keep_alives<std::tuple<KeepAlives...>> : virtual async_signal
+{
+  using keep_alives_type = std::tuple<KeepAlives...>;
+
+protected:
+  keep_alives_type keep_alives_;
+
+public:
+  // Constructs an `async_keep_alives` which uses `stream`, and keeps the
+  // objects in the tuple `keep_alives` alive until the asynchronous signal is
+  // destroyed.
+  __host__
+  explicit async_keep_alives(
+    unique_stream&& stream, keep_alives_type&& keep_alives
+  )
+    : async_signal(std::move(stream))
+    , keep_alives_(std::move(keep_alives))
+  {}
+
+  __host__
+  virtual ~async_keep_alives() {}
+};
+
+template <typename T>
+struct async_value : virtual async_signal
+{
+  using value_type        = T;
+  using raw_const_pointer = value_type const*;
+
+  // Constructs an `async_value` which uses `stream` and has no content.
+  __host__
+  explicit async_value(unique_stream stream)
+    : async_signal(std::move(stream))
+  {}
+
+  __host__
+  virtual ~async_value() {}
+
+  __host__
+  virtual bool valid_content() const noexcept { return false; }
+
+  __host__
+  virtual value_type get()
+  {
+    throw thrust::event_error(event_errc::no_state);
+  }
+
+  __host__
+  virtual value_type extract()
+  {
+    throw thrust::event_error(event_errc::no_state);
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  __host__
+  virtual raw_const_pointer raw_data() const
+  {
+    return nullptr;
+  }
+  #endif
+};
+
+template <typename T, typename Pointer, typename... KeepAlives>
+struct async_addressable_value_with_keep_alives<
+  T, Pointer, std::tuple<KeepAlives...>
+> final
+  : async_value<T>, async_keep_alives<std::tuple<KeepAlives...>>
+{
+  using value_type        = typename async_value<T>::value_type;
+  using raw_const_pointer = typename async_value<T>::raw_const_pointer;
+
+  using keep_alives_type
+    = typename async_keep_alives<std::tuple<KeepAlives...>>::keep_alives_type;
+
+  using pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<value_type>::other;
+  using const_pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<value_type const>::other;
+
+private:
+  pointer content_;
+
+public:
+  // Constructs an `async_addressable_value_with_keep_alives` which uses
+  // `stream`, keeps the objects in the tuple `keep_alives` alive until the
+  // asynchronous value is destroyed, and determines the location of its
+  // content by evaluating `compute_content(content_keep_alive)`.
+  // NOTE: The use of a callback idiom is necessary if the content is stored in
+  // place in the content keep alive object, in which case we need to get its
+  // address after its been moved into the new signal we're constructing.
+  // NOTE: NVCC has a bug that causes it to reorder our base class initializers
+  // in generated host code, which leads to -Wreorder warnings.
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN
+  template <typename ComputeContent>
+  __host__
+  explicit async_addressable_value_with_keep_alives(
+    unique_stream&&    stream
+  , keep_alives_type&& keep_alives
+  , ComputeContent&&   compute_content
+  )
+    : async_signal(std::move(stream))
+    , async_value<T>(std::move(stream))
+    , async_keep_alives<keep_alives_type>(
+        std::move(stream), std::move(keep_alives)
+      )
+  {
+    content_ = THRUST_FWD(compute_content)(std::get<0>(this->keep_alives_));
+  }
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END
+
+  __host__
+  bool valid_content() const noexcept final override
+  {
+    return nullptr != content_;
+  }
+
+  // Precondition: `true == valid_content()`.
+  __host__
+  pointer data() 
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return content_;
+  }
+
+  // Precondition: `true == valid_content()`.
+  __host__
+  const_pointer data() const 
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return content_;
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type get() final override
+  {
+    this->stream().wait();
+    return *data();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type extract() final override
+  {
+    this->stream().wait();
+    return std::move(*data());
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  __host__
+  raw_const_pointer raw_data() const final override
+  {
+    return raw_pointer_cast(content_);
+  }
+  #endif
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Pointer>
+struct weak_promise final
+{
+  using value_type = typename async_value<T>::value_type;
+
+  using pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<T>::other;
+  using const_pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<T const>::other;
+
+private:
+  int device_ = 0;
+  pointer content_;
+
+  explicit weak_promise(int device, pointer content)
+    : device_(device), content_(std::move(content))
+  {}
+
+public:
+  __host__ __device__
+  weak_promise() : device_(0), content_{} {}
+
+  __thrust_exec_check_disable__
+  weak_promise(weak_promise const&) = default;
+  __thrust_exec_check_disable__
+  weak_promise(weak_promise&&) = default;
+  __thrust_exec_check_disable__
+  weak_promise& operator=(weak_promise const&) = default;
+  __thrust_exec_check_disable__
+  weak_promise& operator=(weak_promise&&) = default;
+
+  template <typename U>
+  __host__ __device__
+  void set_value(U&& value) &&
+  {
+    *content_ = THRUST_FWD(value);
+  }
+
+  template <
+    typename X, typename XPointer
+  , typename ComputeContent, typename... Dependencies
+  >
+  friend __host__
+  unique_eager_future_promise_pair<X, XPointer>
+  thrust::system::cuda::detail::make_dependent_future(
+    ComputeContent&& cc, std::tuple<Dependencies...>&& deps
+  );
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+struct ready_event final
+{
+  ready_event() = default;
+
+  template <typename U>
+  __host__ __device__
+  explicit ready_event(ready_future<U>) {}
+
+  __host__ __device__
+  static constexpr bool valid_content() noexcept { return true; }
+
+  __host__ __device__
+  static constexpr bool ready() noexcept { return true; }
+};
+
+template <typename T>
+struct ready_future final
+{
+  using value_type        = T;
+  using raw_const_pointer = T const*;
+
+private:
+  value_type value_;
+
+public:
+  __host__ __device__
+  ready_future() : value_{} {}
+
+  ready_future(ready_future&&) = default;
+  ready_future(ready_future const&) = default;
+  ready_future& operator=(ready_future&&) = default;
+  ready_future& operator=(ready_future const&) = default;
+
+  template <typename U>
+  __host__ __device__
+  explicit ready_future(U&& u) : value_(THRUST_FWD(u)) {}
+
+  __host__ __device__
+  static constexpr bool valid_content() noexcept { return true; }
+
+  __host__ __device__
+  static constexpr bool ready() noexcept { return true; }
+
+  __host__ __device__
+  value_type get() const
+  {
+    return value_;
+  }
+
+  THRUST_NODISCARD __host__ __device__
+  value_type extract() 
+  {
+    return std::move(value_);
+  }
+
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  // For testing only.
+  __host__ __device__
+  raw_const_pointer data() const
+  {
+    return addressof(value_);
+  }
+  #endif
+};
+
+struct unique_eager_event final
+{
+protected:
+  int device_ = 0;
+  std::unique_ptr<detail::async_signal> async_signal_;
+
+  __host__
+  explicit unique_eager_event(
+    int device, std::unique_ptr<detail::async_signal> async_signal
+  )
+    : device_(device), async_signal_(std::move(async_signal))
+  {}
+
+public:
+  __host__
+  unique_eager_event()
+    : device_(0), async_signal_()
+  {}
+
+  unique_eager_event(unique_eager_event&&) = default;
+  unique_eager_event(unique_eager_event const&) = delete;
+  unique_eager_event& operator=(unique_eager_event&&) = default;
+  unique_eager_event& operator=(unique_eager_event const&) = delete;
+
+  // Any `unique_eager_future<T>` can be explicitly converted to a
+  // `unique_eager_event<void>`.
+  template <typename U>
+  __host__
+  explicit unique_eager_event(unique_eager_future<U>&& other)
+    // NOTE: We upcast to `unique_ptr<async_signal>` here.
+    : device_(other.where()), async_signal_(std::move(other.async_signal_))
+  {}
+
+  __host__
+  // NOTE: We take `new_stream_t` by `const&` because it is incomplete here.
+  explicit unique_eager_event(new_stream_t const&)
+    : device_(0)
+    , async_signal_(new detail::async_signal(detail::unique_stream{}))
+  {
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_));
+  }
+
+  __host__
+  virtual ~unique_eager_event()
+  {
+    // FIXME: If we could asynchronously handle destruction of keep alives, we
+    // could avoid doing this.
+    if (valid_stream()) wait();
+  }
+
+  __host__
+  bool valid_stream() const noexcept
+  {
+    return bool(async_signal_);
+  }
+
+  __host__
+  bool ready() const noexcept
+  {
+    if (valid_stream())
+      return stream().ready();
+    else
+      return false;
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  detail::unique_stream& stream()
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+  detail::unique_stream const& stream() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+
+  __host__
+  int where() const noexcept { return device_; }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  void wait()
+  {
+    stream().wait();
+  }
+
+  friend __host__
+  optional<detail::unique_stream>
+  thrust::system::cuda::detail::try_acquire_stream(
+    int device, unique_eager_event& parent
+    ) noexcept;
+
+  template <typename... Dependencies>
+  friend __host__
+  unique_eager_event
+  thrust::system::cuda::detail::make_dependent_event(
+    std::tuple<Dependencies...>&& deps
+  );
+};
+
+template <typename T>
+struct unique_eager_future final
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (!std::is_same<T, remove_cvref_t<void>>::value)
+  , "`thrust::event` should be used to express valueless futures"
+  );
+
+  using value_type        = typename detail::async_value<T>::value_type;
+  using raw_const_pointer = typename detail::async_value<T>::raw_const_pointer;
+
+private:
+  int device_ = 0;
+  std::unique_ptr<detail::async_value<value_type>> async_signal_;
+
+  __host__
+  explicit unique_eager_future(
+    int device, std::unique_ptr<detail::async_value<value_type>> async_signal
+  )
+    : device_(device), async_signal_(std::move(async_signal))
+  {}
+
+public:
+  __host__
+  unique_eager_future()
+    : device_(0), async_signal_()
+  {}
+
+  unique_eager_future(unique_eager_future&&) = default;
+  unique_eager_future(unique_eager_future const&) = delete;
+  unique_eager_future& operator=(unique_eager_future&&) = default;
+  unique_eager_future& operator=(unique_eager_future const&) = delete;
+
+  __host__
+  // NOTE: We take `new_stream_t` by `const&` because it is incomplete here.
+  explicit unique_eager_future(new_stream_t const&)
+    : device_(0)
+    , async_signal_(new detail::async_value<value_type>(detail::unique_stream{}))
+  {
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_));
+  }
+
+  __host__
+  ~unique_eager_future()
+  {
+    // FIXME: If we could asynchronously handle destruction of keep alives, we
+    // could avoid doing this.
+    if (valid_stream()) wait();
+  }
+
+  __host__
+  bool valid_stream() const noexcept
+  {
+    return bool(async_signal_);
+  }
+
+  __host__
+  bool valid_content() const noexcept
+  {
+    if (!valid_stream())
+      return false;
+
+    // We might have been constructed with `new_stream_t`, in which case we'd
+    // have an async_value, but it doesn't have content.
+    return async_signal_->valid_content();
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  bool ready() const noexcept
+  {
+    if (valid_stream())
+      return stream().ready();
+    else
+      return false;
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  detail::unique_stream& stream()
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+  __host__
+  detail::unique_stream const& stream() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+
+  __host__
+  int where() const noexcept { return device_; }
+
+  // Blocks.
+  // Precondition: `true == valid_stream()`.
+  __host__
+  void wait()
+  {
+    stream().wait();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type get()
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return async_signal_->get();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  THRUST_NODISCARD __host__
+  value_type extract()
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    value_type tmp(async_signal_->extract());
+    async_signal_.reset();
+    return tmp;
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  // Precondition: `true == valid_stream()`.
+  __host__
+  raw_const_pointer raw_data() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->raw_data();
+  }
+  #endif
+
+  template <typename X>
+  friend __host__
+  optional<detail::unique_stream>
+  thrust::system::cuda::detail::try_acquire_stream(
+    int device, unique_eager_future<X>& parent
+    ) noexcept;
+
+  template <
+    typename X, typename XPointer
+  , typename ComputeContent, typename... Dependencies
+  >
+  friend __host__
+  detail::unique_eager_future_promise_pair<X, XPointer>
+  thrust::system::cuda::detail::make_dependent_future(
+    ComputeContent&& cc, std::tuple<Dependencies...>&& deps
+  );
+
+  friend struct unique_eager_event;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename X, typename Deleter>
+__host__
+optional<unique_stream>
+try_acquire_stream(int, std::unique_ptr<X, Deleter>&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, unique_stream& stream) noexcept
+{
+  return {std::move(stream)};
+}
+
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, ready_event&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int, ready_future<X>&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_event& parent) noexcept
+{
+  // We have unique ownership, so we can always steal the stream if the future
+  // has one as long as they are on the same device as us.
+  if (parent.valid_stream())
+    if (device == parent.device_)
+      return std::move(parent.async_signal_->stream());
+
+  return {};
+}
+
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_future<X>& parent) noexcept
+{
+  // We have unique ownership, so we can always steal the stream if the future
+  // has one as long as they are on the same device as us.
+  if (parent.valid_stream())
+    if (device == parent.device_)
+      return std::move(parent.async_signal_->stream());
+
+  return {};
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream_impl(
+  int, std::tuple<Dependencies...>&, index_sequence<>
+) noexcept
+{
+  // We tried to take a stream from all of our dependencies and failed every
+  // time, so we need to make a new stream.
+  return {unique_stream{}, {}};
+}
+
+template <typename... Dependencies, std::size_t I0, std::size_t... Is>
+__host__
+acquired_stream acquire_stream_impl(
+  int device
+, std::tuple<Dependencies...>& deps, index_sequence<I0, Is...>
+) noexcept
+{
+  auto tr = try_acquire_stream(device, std::get<I0>(deps));
+
+  if (tr)
+    return {std::move(*tr), {I0}};
+  else
+    return acquire_stream_impl(device, deps, index_sequence<Is...>{});
+}
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream(
+  int device
+, std::tuple<Dependencies...>& deps
+) noexcept
+{
+  return acquire_stream_impl(
+    device, deps, make_index_sequence<sizeof...(Dependencies)>{}
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename X, typename Deleter>
+__host__
+void create_dependency(
+  unique_stream&, std::unique_ptr<X, Deleter>&
+) noexcept
+{}
+
+inline __host__
+void create_dependency(
+  unique_stream&, ready_event&
+) noexcept
+{}
+
+template <typename T>
+__host__
+void create_dependency(
+  unique_stream&, ready_future<T>&
+) noexcept
+{}
+
+inline __host__
+void create_dependency(
+  unique_stream& child, unique_stream& parent
+)
+{
+  child.depend_on(parent);
+}
+
+inline __host__
+void create_dependency(
+  unique_stream& child, unique_eager_event& parent
+)
+{
+  child.depend_on(parent.stream());
+}
+
+template <typename X>
+__host__
+void create_dependency(
+  unique_stream& child, unique_eager_future<X>& parent
+)
+{
+  child.depend_on(parent.stream());
+}
+
+template <typename... Dependencies>
+__host__
+void create_dependencies_impl(
+  acquired_stream&
+, std::tuple<Dependencies...>&, index_sequence<>
+)
+{}
+
+template <typename... Dependencies, std::size_t I0, std::size_t... Is>
+__host__
+void create_dependencies_impl(
+  acquired_stream& as
+, std::tuple<Dependencies...>& deps, index_sequence<I0, Is...>
+)
+{
+  // We only need to wait on the current dependency if we didn't steal our
+  // stream from it.
+  if (!as.acquired_from || *as.acquired_from != I0)
+  {
+    create_dependency(as.stream, std::get<I0>(deps));
+  }
+
+  create_dependencies_impl(as, deps, index_sequence<Is...>{});
+}
+
+template <typename... Dependencies>
+__host__
+void create_dependencies(acquired_stream& as, std::tuple<Dependencies...>& deps)
+{
+  create_dependencies_impl(
+    as, deps, make_index_sequence<sizeof...(Dependencies)>{}
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Metafunction that determine which `Dependencies` need to be kept alive.
+// Returns the result as an `index_sequence` of indices into the parameter
+// pack.
+template <typename Tuple, typename Indices>
+  struct find_keep_alives_impl;
+template <typename Tuple>
+  using find_keep_alives
+    = typename find_keep_alives_impl<
+        Tuple, make_index_sequence<std::tuple_size<Tuple>::value>
+      >::type;
+
+template <>
+struct find_keep_alives_impl<
+  std::tuple<>, index_sequence<>
+>
+{
+  using type = index_sequence<>;
+};
+
+// User-provided stream.
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_stream, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Nothing to keep alive, skip this index.
+  using type = typename find_keep_alives_impl<
+    std::tuple<Dependencies...>, index_sequence<Is...>
+  >::type;
+};
+
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<ready_event, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Nothing to keep alive, skip this index.
+  using type = typename find_keep_alives_impl<
+    std::tuple<Dependencies...>, index_sequence<Is...>
+  >::type;
+};
+
+template <
+  typename T, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<ready_future<T>, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_eager_event, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+template <
+  typename X, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_eager_future<X>, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+// Content storage.
+template <
+  typename T, typename Deleter, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<std::unique_ptr<T, Deleter>, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Dependencies>
+__host__
+unique_eager_event make_dependent_event(std::tuple<Dependencies...>&& deps)
+{
+  int device = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device));
+
+  // First, either steal a stream from one of our children or make a new one.
+  auto as = acquire_stream(device, deps);
+
+  // Then, make the stream we've acquired asynchronously wait on all of our
+  // dependencies, except the one we stole the stream from.
+  create_dependencies(as, deps);
+
+  // Then, we determine which subset of dependencies need to be kept alive.
+  auto ka = tuple_subset(
+    std::move(deps)
+  , find_keep_alives<std::tuple<Dependencies...>>{}
+  );
+
+  // Next, we create the asynchronous signal.
+  using async_signal_type = async_keep_alives<decltype(ka)>;
+
+  std::unique_ptr<async_signal_type> sig(
+    new async_signal_type(std::move(as.stream), std::move(ka))
+  );
+
+  // Finally, we create the event object.
+  return unique_eager_event(device, std::move(sig));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename X, typename XPointer
+, typename ComputeContent, typename... Dependencies
+>
+__host__
+unique_eager_future_promise_pair<X, XPointer>
+make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps)
+{
+  int device = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device));
+
+  // First, either steal a stream from one of our children or make a new one.
+  auto as = acquire_stream(device, deps);
+
+  // Then, make the stream we've acquired asynchronously wait on all of our
+  // dependencies, except the one we stole the stream from.
+  create_dependencies(as, deps);
+
+  // Then, we determine which subset of dependencies need to be kept alive.
+  auto ka = tuple_subset(
+    std::move(deps)
+  , find_keep_alives<std::tuple<Dependencies...>>{}
+  );
+
+  // Next, we create the asynchronous value.
+  using async_signal_type = async_addressable_value_with_keep_alives<
+    X, XPointer, decltype(ka)
+  >;
+
+  std::unique_ptr<async_signal_type> sig(
+    new async_signal_type(std::move(as.stream), std::move(ka), std::move(cc))
+  );
+ 
+  // Finally, we create the promise and future objects.
+  weak_promise<X, XPointer> child_prom(device, sig->data());
+  unique_eager_future<X> child_fut(device, std::move(sig));
+
+  return unique_eager_future_promise_pair<X, XPointer>
+    {std::move(child_fut), std::move(child_prom)};
+}
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Events>
+__host__
+unique_eager_event when_all(Events&&... evs)
+// TODO: Constrain to events, futures, and maybe streams (currently allows keep
+// alives).
+{
+  return detail::make_dependent_event(std::make_tuple(std::move(evs)...)); 
+}
+
+// ADL hook for transparent `.after` move support.
+inline __host__
+auto capture_as_dependency(unique_eager_event& dependency)
+THRUST_DECLTYPE_RETURNS(std::move(dependency))
+
+// ADL hook for transparent `.after` move support.
+template <typename X>
+__host__
+auto capture_as_dependency(unique_eager_future<X>& dependency)
+THRUST_DECLTYPE_RETURNS(std::move(dependency))
+
+}} // namespace system::cuda
+
+} // end namespace thrust
+
+#endif 
+
diff --git a/thrust/thrust/system/cuda/detail/gather.h b/thrust/thrust/system/cuda/detail/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..31ca3fd561b71fb389ade0359ee30205ef290ed4
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/gather.h
@@ -0,0 +1,107 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class MapIt,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+gather(execution_policy<Derived>& policy,
+    MapIt map_first,
+    MapIt map_last,
+    ItemsIt items,
+    ResultIt result)
+{
+  return cuda_cub::transform(policy,
+                          thrust::make_permutation_iterator(items, map_first),
+                          thrust::make_permutation_iterator(items, map_last),
+                          result,
+                          identity());
+}
+
+
+template <class Derived,
+          class MapIt,
+          class StencilIt,
+          class ItemsIt,
+          class ResultIt,
+          class Predicate>
+ResultIt __host__ __device__
+gather_if(execution_policy<Derived>& policy,
+          MapIt                      map_first,
+          MapIt                      map_last,
+          StencilIt                  stencil,
+          ItemsIt                    items,
+          ResultIt                   result,
+          Predicate                  predicate)
+{
+  return cuda_cub::transform_if(policy,
+                              thrust::make_permutation_iterator(items, map_first),
+                              thrust::make_permutation_iterator(items, map_last),
+                              stencil,
+                              result,
+                              identity(),
+                              predicate);
+}
+
+template <class Derived,
+          class MapIt,
+          class StencilIt,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+gather_if(execution_policy<Derived>& policy,
+          MapIt                      map_first,
+          MapIt                      map_last,
+          StencilIt                  stencil,
+          ItemsIt                    items,
+          ResultIt                   result)
+{
+  return cuda_cub::gather_if(policy,
+                          map_first,
+                          map_last,
+                          stencil,
+                          items,
+                          result,
+                          identity());
+}
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/generate.h b/thrust/thrust/system/cuda/detail/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..df77901e219d07e76cbc294299445bb0eaad0dfc
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/generate.h
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/for_each.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+// for_each functor
+template <class Generator>
+struct generate_f
+{
+  Generator generator;
+
+  THRUST_FUNCTION
+  generate_f(Generator generator_) : generator(generator_) {}
+
+  template<class T>
+  THRUST_DEVICE_FUNCTION void operator()(T const& value)
+  {
+    T & lvalue = const_cast<T&>(value);
+    lvalue = generator();
+  }
+};
+
+// for_each_n
+template <class Derived,
+          class OutputIt,
+          class Size,
+          class Generator>
+OutputIt __host__ __device__
+generate_n(execution_policy<Derived> &policy,
+           OutputIt                   result,
+           Size                       count,
+           Generator                  generator)
+{
+  return cuda_cub::for_each_n(policy,
+                              result,
+                              count,
+                              generate_f<Generator>(generator));
+}
+
+  // for_each
+template <class Derived,
+          class OutputIt,
+          class Generator>
+void __host__ __device__
+generate(execution_policy<Derived> &policy,
+         OutputIt                   first,
+         OutputIt                   last,
+         Generator                  generator)
+{
+  cuda_cub::generate_n(policy, first, thrust::distance(first, last), generator);
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/get_value.h b/thrust/thrust/system/cuda/detail/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fbb0b548ca4068184685339f1da3fe13518eded
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/get_value.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cross_system.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+
+namespace
+{
+
+
+template<typename DerivedPolicy, typename Pointer>
+inline __host__ __device__
+  typename thrust::iterator_value<Pointer>::type
+    get_value_msvc2005_war(execution_policy<DerivedPolicy> &exec, Pointer ptr)
+{
+  typedef typename thrust::iterator_value<Pointer>::type result_type;
+
+  // XXX war nvbugs/881631
+  struct war_nvbugs_881631
+  {
+    __host__ inline static result_type host_path(execution_policy<DerivedPolicy> &exec, Pointer ptr)
+    {
+      // when called from host code, implement with assign_value
+      // note that this requires a type with default constructor
+      result_type result;
+
+      thrust::host_system_tag host_tag;
+      cross_system<thrust::host_system_tag, DerivedPolicy> systems(host_tag, exec);
+      assign_value(systems, &result, ptr);
+
+      return result;
+    }
+
+    __device__ inline static result_type device_path(execution_policy<DerivedPolicy> &, Pointer ptr)
+    {
+      // when called from device code, just do simple deref
+      return *thrust::raw_pointer_cast(ptr);
+    }
+  };
+
+  // The usual pattern for separating host and device code doesn't work here
+  // because it would result in a compiler warning, either about falling off
+  // the end of a non-void function, or about result_type's default constructor
+  // being a host-only function.
+  #ifdef __NVCOMPILER_CUDA__
+  if (THRUST_IS_HOST_CODE) {
+    return war_nvbugs_881631::host_path(exec, ptr);
+  } else {
+    return war_nvbugs_881631::device_path(exec, ptr);
+  }
+  #else
+    #ifndef __CUDA_ARCH__
+      return war_nvbugs_881631::host_path(exec, ptr);
+    #else
+      return war_nvbugs_881631::device_path(exec, ptr);
+    #endif // __CUDA_ARCH__
+  #endif
+  } // end get_value_msvc2005_war()
+} // end anon namespace
+
+
+template<typename DerivedPolicy, typename Pointer>
+inline __host__ __device__
+  typename thrust::iterator_value<Pointer>::type
+    get_value(execution_policy<DerivedPolicy> &exec, Pointer ptr)
+{
+  return get_value_msvc2005_war(exec,ptr);
+} // end get_value()
+
+
+} // end cuda_cub
+} // end namespace thrust
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/guarded_cuda_runtime_api.h b/thrust/thrust/system/cuda/detail/guarded_cuda_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b0f345a74a4aa3b69027774be52fd9e0a5d09cd
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/guarded_cuda_runtime_api.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to check for the existence of macros
+// such as __host__ and __device__, which may already be defined by thrust
+// and to undefine them before entering cuda_runtime_api.h (which will redefine them)
+
+// we only try to do this stuff if cuda/include/host_defines.h has been included
+#if !defined(__HOST_DEFINES_H__)
+
+#ifdef __host__
+#undef __host__
+#endif // __host__
+
+#ifdef __device__
+#undef __device__
+#endif // __device__
+
+#endif // __HOST_DEFINES_H__
+
+#include <cuda_runtime_api.h>
+
diff --git a/thrust/thrust/system/cuda/detail/guarded_driver_types.h b/thrust/thrust/system/cuda/detail/guarded_driver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..076964071cf78458de27fe54de3caf932ce93b40
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/guarded_driver_types.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include <driver_types.h> without causing
+// warnings from redefinitions of __host__ and __device__.
+// carefully save their definitions and restore them
+// can't tell exactly when push_macro & pop_macro were introduced to gcc; assume 4.5.0
+
+
+#if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500)
+#  ifdef __host__
+#    pragma push_macro("__host__")
+#    undef __host__
+#    define THRUST_HOST_NEEDS_RESTORATION
+#  endif
+#  ifdef __device__
+#    pragma push_macro("__device__")
+#    undef __device__
+#    define THRUST_DEVICE_NEEDS_RESTORATION
+#  endif
+#else // GNUC pre 4.5.0
+#  if !defined(__DRIVER_TYPES_H__)
+#    ifdef __host__
+#      undef __host__
+#    endif
+#    ifdef __device__
+#      undef __device__
+#    endif
+#  endif // __DRIVER_TYPES_H__
+#endif // __GNUC__
+
+
+#include <driver_types.h>
+
+
+#if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500)
+#  ifdef THRUST_HOST_NEEDS_RESTORATION
+#    pragma pop_macro("__host__")
+#    undef THRUST_HOST_NEEDS_RESTORATION
+#  endif
+#  ifdef THRUST_DEVICE_NEEDS_RESTORATION
+#    pragma pop_macro("__device__")
+#    undef THRUST_DEVICE_NEEDS_RESTORATION
+#  endif
+#endif // __GNUC__
+
diff --git a/thrust/thrust/system/cuda/detail/inner_product.h b/thrust/thrust/system/cuda/detail/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd6aec606c16e5eb4c5aa3276b7d374647b021cd
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/inner_product.h
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class T,
+          class ReduceOp,
+          class ProductOp>
+T __host__ __device__
+inner_product(execution_policy<Derived> &policy,
+              InputIt1                   first1,
+              InputIt1                   last1,
+              InputIt2                   first2,
+              T                          init,
+              ReduceOp                   reduce_op,
+              ProductOp                  product_op)
+{
+  typedef typename iterator_traits<InputIt1>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+  typedef transform_pair_of_input_iterators_t<T,
+                                              InputIt1,
+                                              InputIt2,
+                                              ProductOp>
+      binop_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            binop_iterator_t(first1, first2, product_op),
+                            num_items,
+                            init,
+                            reduce_op);
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class T>
+T __host__ __device__
+inner_product(execution_policy<Derived> &policy,
+              InputIt1                   first1,
+              InputIt1                   last1,
+              InputIt2                   first2,
+              T                          init)
+{
+  return cuda_cub::inner_product(policy,
+                                 first1,
+                                 last1,
+                                 first2,
+                                 init,
+                                 plus<T>(),
+                                 multiplies<T>());
+}
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_cross_system.h b/thrust/thrust/system/cuda/detail/internal/copy_cross_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab3b4e5bb7fe598a2f22da280b772fb72f4b3dd5
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/internal/copy_cross_system.h
@@ -0,0 +1,242 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditionu and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+// XXX
+// this file must not be included on its own, ever,
+// but must be part of include in thrust/system/cuda/detail/copy.h
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __copy {
+
+
+  template <class H,
+            class D,
+            class T,
+            class Size>
+  THRUST_HOST_FUNCTION void
+  trivial_device_copy(thrust::cpp::execution_policy<H>&      ,
+                      thrust::cuda_cub::execution_policy<D>& device_s,
+                      T*                                     dst,
+                      T const*                               src,
+                      Size                                   count)
+  {
+    cudaError status;
+    status = cuda_cub::trivial_copy_to_device(dst,
+                                              src,
+                                              count,
+                                              cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy::trivial_device_copy H->D: failed");
+  }
+
+  template <class D,
+            class H,
+            class T,
+            class Size>
+  THRUST_HOST_FUNCTION void
+  trivial_device_copy(thrust::cuda_cub::execution_policy<D>& device_s,
+                      thrust::cpp::execution_policy<H>&      ,
+                      T*                                     dst,
+                      T const*                               src,
+                      Size                                   count)
+  {
+    cudaError status;
+    status = cuda_cub::trivial_copy_from_device(dst,
+                                                src,
+                                                count,
+                                                cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "trivial_device_copy D->H failed");
+  }
+
+  template <class System1,
+            class System2,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::execution_policy<System1>& sys1,
+                      thrust::execution_policy<System2>& sys2,
+                      InputIt                            begin,
+                      Size                               n,
+                      OutputIt                           result,
+                      thrust::detail::true_type)    // trivial copy
+
+  {
+    typedef typename iterator_traits<InputIt>::value_type InputTy;
+
+    trivial_device_copy(derived_cast(sys1),
+                        derived_cast(sys2),
+                        reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
+                        reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*begin)),
+                        n);
+
+    return result + n;
+  }
+
+  // non-trivial H->D copy
+  template <class H,
+            class D,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::cpp::execution_policy<H>&      host_s,
+                      thrust::cuda_cub::execution_policy<D>& device_s,
+                      InputIt                                first,
+                      Size                                   num_items,
+                      OutputIt                               result,
+                      thrust::detail::false_type)    // non-trivial copy
+  {
+    // get type of the input data
+    typedef typename thrust::iterator_value<InputIt>::type InputTy;
+
+    // copy input data into host temp storage
+    InputIt last = first;
+    thrust::advance(last, num_items);
+    thrust::detail::temporary_array<InputTy, H> temp(host_s, num_items);
+
+    for (Size idx = 0; idx != num_items; idx++)
+    {
+      ::new (static_cast<void*>(temp.data().get()+idx)) InputTy(*first);
+      ++first;
+    }
+
+    // allocate device temporary storage
+    thrust::detail::temporary_array<InputTy, D> d_in_ptr(device_s, num_items);
+
+    // trivial copy data from host to device
+    cudaError status = cuda_cub::trivial_copy_to_device(d_in_ptr.data().get(),
+                                                        temp.data().get(),
+                                                        num_items,
+                                                        cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy:: H->D: failed");
+
+
+    // device->device copy
+    OutputIt ret = cuda_cub::copy_n(device_s, d_in_ptr.data(), num_items, result);
+
+    return ret;
+  }
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+  // non-trivial copy D->H, only supported with NVCC compiler
+  // because copy ctor must have  __device__ annotations, which is nvcc-only
+  // feature
+  template <class D,
+            class H,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::cuda_cub::execution_policy<D>& device_s,
+                      thrust::cpp::execution_policy<H>&   host_s,
+                      InputIt                             first,
+                      Size                                num_items,
+                      OutputIt                            result,
+                      thrust::detail::false_type)    // non-trivial copy
+
+  {
+    // get type of the input data
+    typedef typename thrust::iterator_value<InputIt>::type InputTy;
+
+    // allocate device temp storage 
+    thrust::detail::temporary_array<InputTy, D> d_in_ptr(device_s, num_items);
+
+    // uninitialize copy into temp device storage
+    cuda_cub::uninitialized_copy_n(device_s, first, num_items, d_in_ptr.data());
+
+    // allocate host temp storage
+    thrust::detail::temporary_array<InputTy, H> temp(host_s, num_items);
+
+    // trivial copy from device to host
+    cudaError status;
+    status = cuda_cub::trivial_copy_from_device(temp.data().get(),
+                                                d_in_ptr.data().get(),
+                                                num_items,
+                                                cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy:: D->H: failed");
+
+    // host->host copy
+    OutputIt ret = thrust::copy_n(host_s, temp.data(), num_items, result);
+
+    return ret;
+  }
+#endif
+
+  template <class System1,
+            class System2,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(cross_system<System1, System2> systems,
+                      InputIt  begin,
+                      Size     n,
+                      OutputIt result)
+  {
+    return cross_system_copy_n(
+        derived_cast(systems.sys1),
+        derived_cast(systems.sys2),
+        begin,
+        n,
+        result,
+        typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
+  }
+
+  template <class System1,
+            class System2,
+            class InputIterator,
+            class OutputIterator>
+  OutputIterator __host__
+  cross_system_copy(cross_system<System1, System2> systems,
+                    InputIterator  begin,
+                    InputIterator  end,
+                    OutputIterator result)
+  {
+    return cross_system_copy_n(systems,
+                               begin,
+                               thrust::distance(begin, end),
+                               result);
+  }
+
+}    // namespace __copy
+
+} // namespace cuda_cub
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a6631d90321bf52c5441aacfe86c7cf6ea71a5b
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -0,0 +1,64 @@
+
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __copy {
+
+  template <class Derived,
+            class InputIt,
+            class OutputIt>
+  OutputIt THRUST_RUNTIME_FUNCTION
+  device_to_device(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   OutputIt                   result)
+  {
+    typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+    return cuda_cub::transform(policy,
+                            first,
+                            last,
+                            result,
+                            thrust::identity<InputTy>());
+  }
+
+}    // namespace __copy
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/iter_swap.h b/thrust/thrust/system/cuda/detail/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..353bb1851388d2ecb02042ab114677a9e516688e
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/iter_swap.h
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/swap.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+inline __host__ __device__
+void iter_swap(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 a, Pointer2 b)
+{
+  // XXX war nvbugs/881631
+  struct war_nvbugs_881631
+  {
+    __host__ inline static void host_path(Pointer1 a, Pointer2 b)
+    {
+      thrust::swap_ranges(a, a + 1, b);
+    }
+
+    __device__ inline static void device_path(Pointer1 a, Pointer2 b)
+    {
+      using thrust::swap;
+      swap(*thrust::raw_pointer_cast(a),
+           *thrust::raw_pointer_cast(b));
+    }
+  };
+
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      war_nvbugs_881631::host_path(a, b);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      war_nvbugs_881631::device_path(a, b);
+    #endif
+  }
+} // end iter_swap()
+
+
+} // end cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/logical.h b/thrust/thrust/system/cuda/detail/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/logical.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cuda/detail/make_unsigned_special.h b/thrust/thrust/system/cuda/detail/make_unsigned_special.h
new file mode 100644
index 0000000000000000000000000000000000000000..683647cbede60d62a4160efe58e9e62ba53c9d12
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/make_unsigned_special.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2019 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace detail {
+
+    template<typename Size>
+    struct make_unsigned_special;
+
+    template<>
+    struct make_unsigned_special<int> { typedef unsigned int type; };
+
+    // this is special, because CUDA's atomicAdd doesn't have an overload
+    // for unsigned long, for some godforsaken reason
+    template<>
+    struct make_unsigned_special<long> { typedef unsigned long long type; };
+
+    template<>
+    struct make_unsigned_special<long long> { typedef unsigned long long type; };
+
+}
+}
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/detail/malloc_and_free.h b/thrust/thrust/system/cuda/detail/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d72381b5b5e3be37526000b9e2e637f0817f368
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/malloc_and_free.h
@@ -0,0 +1,104 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/seq.h>
+#include <thrust/system/cuda/config.h>
+#ifdef THRUST_CACHING_DEVICE_MALLOC
+#include <cub/util_allocator.cuh>
+#endif
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/detail/bad_alloc.h>
+#include <thrust/detail/malloc_and_free.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+#ifdef THRUST_CACHING_DEVICE_MALLOC
+#define __CUB_CACHING_MALLOC
+#ifndef __CUDA_ARCH__
+inline cub::CachingDeviceAllocator &get_allocator()
+{
+  static cub::CachingDeviceAllocator g_allocator(true);
+  return g_allocator;
+}
+#endif
+#endif
+
+
+// note that malloc returns a raw pointer to avoid
+// depending on the heavyweight thrust/system/cuda/memory.h header
+template<typename DerivedPolicy>
+__host__ __device__
+void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
+{
+  void *result = 0;
+
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      #ifdef __CUB_CACHING_MALLOC
+        cub::CachingDeviceAllocator &alloc = get_allocator();
+        cudaError_t status = alloc.DeviceAllocate(&result, n);
+      #else
+        cudaError_t status = cudaMalloc(&result, n);
+      #endif
+
+      if(status != cudaSuccess)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+      }
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+    #endif
+  }
+
+  return result;
+} // end malloc()
+
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void free(execution_policy<DerivedPolicy> &, Pointer ptr)
+{
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      #ifdef __CUB_CACHING_MALLOC
+        cub::CachingDeviceAllocator &alloc = get_allocator();
+        cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr));
+      #else
+        cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr));
+      #endif
+      cuda_cub::throw_on_error(status, "device free failed");
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      thrust::free(thrust::seq, ptr);
+    #endif
+  }
+} // end free()
+
+}    // namespace cuda_cub
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/managed_memory_pointer.h b/thrust/thrust/system/cuda/detail/managed_memory_pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4c9756be37a9ba03806132ba6fb3381c21354
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/managed_memory_pointer.h
@@ -0,0 +1,195 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/pointer.h>
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+// forward decl for iterator traits:
+template <typename T>
+class managed_memory_pointer;
+
+} // end namespace detail
+} // end namespace cuda
+} // end namespace system
+
+// Specialize iterator traits to define `pointer` to something meaningful.
+template <typename Element, typename Tag, typename Reference>
+struct iterator_traits<thrust::pointer<
+  Element,
+  Tag,
+  Reference,
+  thrust::system::cuda::detail::managed_memory_pointer<Element> > > {
+private:
+  typedef thrust::pointer<
+    Element,
+    Tag,
+    Reference,
+    thrust::system::cuda::detail::managed_memory_pointer<Element> >
+    ptr;
+
+public:
+  typedef typename ptr::iterator_category iterator_category;
+  typedef typename ptr::value_type value_type;
+  typedef typename ptr::difference_type difference_type;
+  typedef Element* pointer;
+  typedef typename ptr::reference reference;
+}; // end iterator_traits
+
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+/*! A version of thrust::cuda_cub::pointer that uses c++ references instead
+ * of thrust::cuda::reference. This is to allow managed memory pointers to
+ * be used with host-side code in standard libraries that are not compatible
+ * with proxy references.
+ */
+template <typename T>
+class managed_memory_pointer
+    : public thrust::pointer<
+        T,
+        thrust::cuda_cub::tag,
+        typename thrust::detail::add_reference<T>::type,
+        thrust::system::cuda::detail::managed_memory_pointer<T> >
+{
+private:
+  typedef thrust::pointer<
+    T,
+    thrust::cuda_cub::tag,
+    typename thrust::detail::add_reference<T>::type,
+    thrust::system::cuda::detail::managed_memory_pointer<T> >
+    super_t;
+
+public:
+  typedef typename super_t::raw_pointer pointer;
+
+  /*! \p managed_memory_pointer's no-argument constructor initializes its
+   * encapsulated pointer to \c 0.
+   */
+  __host__ __device__ managed_memory_pointer()
+      : super_t()
+  {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  // NOTE: This is needed so that Thrust smart pointers can be used in
+  // `std::unique_ptr`.
+  __host__ __device__ managed_memory_pointer(decltype(nullptr))
+      : super_t(nullptr)
+  {}
+#endif
+
+  /*! This constructor allows construction of a <tt><const T></tt> from a
+   * <tt>T*</tt>.
+   *
+   *  \param ptr A raw pointer to copy from, presumed to point to a location
+   * in memory accessible by the \p cuda system. \tparam OtherT \p OtherT
+   * shall be convertible to \p T.
+   */
+  template <typename OtherT>
+  __host__ __device__ explicit managed_memory_pointer(OtherT* ptr)
+      : super_t(ptr)
+  {}
+
+  /*! This constructor allows construction from another pointer-like object
+   * with related type.
+   *
+   *  \param other The \p OtherPointer to copy.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer
+   * shall be convertible to \p thrust::system::cuda::tag and its element
+   * type shall be convertible to \p T.
+   */
+  template <typename OtherPointer>
+  __host__ __device__ managed_memory_pointer(
+    const OtherPointer& other,
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      managed_memory_pointer>::type* = 0)
+      : super_t(other)
+  {}
+
+  /*! This constructor allows construction from another pointer-like object
+   * with \p void type.
+   *
+   *  \param other The \p OtherPointer to copy.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer
+   * shall be convertible to \p thrust::system::cuda::tag and its element
+   * type shall be \p void.
+   */
+  template <typename OtherPointer>
+  __host__ __device__ explicit managed_memory_pointer(
+    const OtherPointer& other,
+    typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+      OtherPointer,
+      managed_memory_pointer>::type* = 0)
+      : super_t(other)
+  {}
+
+  /*! Assignment operator allows assigning from another pointer-like object
+   * with related type.
+   *
+   *  \param other The other pointer-like object to assign from.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer
+   * shall be convertible to \p thrust::system::cuda::tag and its element
+   * type shall be convertible to \p T.
+   */
+  template <typename OtherPointer>
+  __host__ __device__ typename thrust::detail::enable_if_pointer_is_convertible<
+    OtherPointer,
+    managed_memory_pointer,
+    managed_memory_pointer&>::type
+  operator=(const OtherPointer& other)
+  {
+    return super_t::operator=(other);
+  }
+
+#if THRUST_CPP_DIALECT >= 2011
+  // NOTE: This is needed so that Thrust smart pointers can be used in
+  // `std::unique_ptr`.
+  __host__ __device__ managed_memory_pointer& operator=(decltype(nullptr))
+  {
+    super_t::operator=(nullptr);
+    return *this;
+  }
+#endif
+
+  __host__ __device__
+  pointer operator->() const
+  {
+    return this->get();
+  }
+
+}; // class managed_memory_pointer
+
+} // namespace detail
+} // namespace cuda
+} // namespace system
+} // namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/memory.inl b/thrust/thrust/system/cuda/detail/memory.inl
new file mode 100644
index 0000000000000000000000000000000000000000..82a04b67ddc389e86839118a53eeecb13c9b412f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/memory.inl
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/system/cuda/detail/malloc_and_free.h>
+#include <limits>
+
+namespace thrust
+{
+namespace cuda_cub
+{
+
+__host__ __device__
+pointer<void> malloc(std::size_t n)
+{
+  tag cuda_tag;
+  return pointer<void>(thrust::cuda_cub::malloc(cuda_tag, n));
+} // end malloc()
+
+template<typename T>
+__host__ __device__
+pointer<T> malloc(std::size_t n)
+{
+  pointer<void> raw_ptr = thrust::cuda_cub::malloc(sizeof(T) * n);
+  return pointer<T>(reinterpret_cast<T*>(raw_ptr.get()));
+} // end malloc()
+
+__host__ __device__
+void free(pointer<void> ptr)
+{
+  tag cuda_tag;
+  return thrust::cuda_cub::free(cuda_tag, ptr.get());
+} // end free()
+
+} // end cuda_cub
+} // end thrust
+
diff --git a/thrust/thrust/system/cuda/detail/merge.h b/thrust/thrust/system/cuda/detail/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a223b60604a60bccc603165963071c7b7b3a24a
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/merge.h
@@ -0,0 +1,1018 @@
+/******************************************************************************
+j * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/merge.h>
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __merge {
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class BinaryPred>
+  Size THRUST_DEVICE_FUNCTION
+  merge_path(KeysIt1    keys1,
+             KeysIt2    keys2,
+             Size       keys1_count,
+             Size       keys2_count,
+             Size       diag,
+             BinaryPred binary_pred)
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
+
+    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
+    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
+
+    while (keys1_begin < keys1_end)
+    {
+      Size mid = (keys1_begin + keys1_end) >> 1;
+      key1_type key1 = keys1[mid];
+      key2_type key2 = keys2[diag - 1 - mid];
+      bool pred = binary_pred(key2, key1);
+      if (pred)
+      {
+        keys1_end = mid;
+      }
+      else
+      {
+        keys1_begin = mid+1;
+      }
+    }
+    return keys1_begin;
+  }
+
+  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
+  THRUST_DEVICE_FUNCTION void
+  serial_merge(It  keys_shared,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T2 (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+  {
+    int keys1_end = keys1_beg + keys1_count;
+    int keys2_end = keys2_beg + keys2_count;
+
+    typedef typename iterator_value<It>::type key_type;
+
+    key_type key1 = keys_shared[keys1_beg];
+    key_type key2 = keys_shared[keys2_beg];
+
+
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      bool p = (keys2_beg < keys2_end) &&
+               ((keys1_beg >= keys1_end) ||
+                compare_op(key2,key1));
+
+      output[ITEM]  = p ? key2 : key1;
+      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
+
+      if (p)
+      {
+        key2 = keys_shared[keys2_beg];
+      }
+      else
+      {
+        key1 = keys_shared[keys1_beg];
+      }
+    }
+  }
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  };    // PtxPolicy
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class CompareOp>
+  struct PartitionAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<256> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    THRUST_AGENT_ENTRY(KeysIt1   keys1,
+                       KeysIt2   keys2,
+                       Size      keys1_count,
+                       Size      keys2_count,
+                       Size      num_partitions,
+                       Size*     merge_partitions,
+                       CompareOp compare_op,
+                       int       items_per_tile,
+                       char*     /*shmem*/)
+    {
+      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      if (partition_idx < num_partitions)
+      {
+        Size partition_at = thrust::min(partition_idx * items_per_tile,
+                                        keys1_count + keys2_count);
+        Size partition_diag = merge_path(keys1,
+                                         keys2,
+                                         keys1_count,
+                                         keys2_count,
+                                         partition_at,
+                                         compare_op);
+        merge_partitions[partition_idx] = partition_diag;
+      }
+    }
+  };    // struct PartitionAgent
+
+
+  template <class Arch, class TSize>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<size_t NOMINAL_4B_ITEMS_PER_THREAD, size_t INPUT_SIZE>
+  struct items_per_thread
+  {
+    enum
+    {
+      ITEMS_PER_THREAD =
+          mpl::min<
+              int,
+              NOMINAL_4B_ITEMS_PER_THREAD,
+              mpl::max<
+                  int,
+                  1,
+                  (NOMINAL_4B_ITEMS_PER_THREAD * 4 / INPUT_SIZE)>::value>::value,
+      value = mpl::is_odd<size_t, ITEMS_PER_THREAD>::value
+                  ? ITEMS_PER_THREAD
+                  : ITEMS_PER_THREAD + 1
+    };
+  };
+
+  template<class TSize>
+  struct Tuning<sm30,TSize>
+  {
+    const static int INPUT_SIZE = TSize::value;
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          INPUT_SIZE>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm300
+
+
+
+  template<class TSize>
+  struct Tuning<sm60,TSize> : Tuning<sm30,TSize>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 15,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm52
+
+  template<class TSize>
+  struct Tuning<sm52,TSize> : Tuning<sm30,TSize>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 13,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm52
+
+  template<class TSize>
+  struct Tuning<sm35,TSize> : Tuning<sm30,TSize>
+  {
+    const static int INPUT_SIZE = TSize::value;
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm350
+
+
+  template<size_t VALUE>
+  struct integer_constant : thrust::detail::integral_constant<size_t, VALUE> {};
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class ItemsIt1,
+            class ItemsIt2,
+            class Size,
+            class KeysOutputIt,
+            class ItemsOutputIt,
+            class CompareOp,
+            class MERGE_ITEMS>
+  struct MergeAgent
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type  key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type  key2_type;
+    typedef typename iterator_traits<ItemsIt1>::value_type item1_type;
+    typedef typename iterator_traits<ItemsIt2>::value_type item2_type;
+
+    typedef key1_type  key_type;
+    typedef item1_type item_type;
+
+    typedef typename thrust::detail::conditional<
+        MERGE_ITEMS::value,
+        integer_constant<sizeof(key_type) + sizeof(item_type)>,
+        integer_constant<sizeof(key_type)> >::type tuning_type;
+
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, tuning_type>::type
+    {
+      typedef Tuning<Arch,tuning_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt1>::type  KeysLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, KeysIt2>::type  KeysLoadIt2;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt1>::type ItemsLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt2>::type ItemsLoadIt2;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type  BlockLoadKeys1;
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type  BlockLoadKeys2;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt1>::type BlockLoadItems1;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt2>::type BlockLoadItems2;
+
+      typedef typename core::BlockStore<PtxPlan,
+                                        KeysOutputIt,
+                                        key_type>::type BlockStoreKeys;
+      typedef typename core::BlockStore<PtxPlan,
+                                        ItemsOutputIt,
+                                        item_type>::type BlockStoreItems;
+
+      // gather required temporary storage in a union
+      //
+      union TempStorage
+      {
+        typename BlockLoadKeys1::TempStorage  load_keys1;
+        typename BlockLoadKeys2::TempStorage  load_keys2;
+        typename BlockLoadItems1::TempStorage load_items1;
+        typename BlockLoadItems2::TempStorage load_items2;
+        typename BlockStoreKeys::TempStorage  store_keys;
+        typename BlockStoreItems::TempStorage store_items;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
+        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt1     KeysLoadIt1;
+    typedef typename ptx_plan::KeysLoadIt2     KeysLoadIt2;
+    typedef typename ptx_plan::ItemsLoadIt1    ItemsLoadIt1;
+    typedef typename ptx_plan::ItemsLoadIt2    ItemsLoadIt2;
+    typedef typename ptx_plan::BlockLoadKeys1  BlockLoadKeys1;
+    typedef typename ptx_plan::BlockLoadKeys2  BlockLoadKeys2;
+    typedef typename ptx_plan::BlockLoadItems1 BlockLoadItems1;
+    typedef typename ptx_plan::BlockLoadItems2 BlockLoadItems2;
+    typedef typename ptx_plan::BlockStoreKeys  BlockStoreKeys;
+    typedef typename ptx_plan::BlockStoreItems BlockStoreItems;
+    typedef typename ptx_plan::TempStorage     TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage&  storage;
+      KeysLoadIt1   keys1_in;
+      KeysLoadIt2   keys2_in;
+      ItemsLoadIt1  items1_in;
+      ItemsLoadIt2  items2_in;
+      Size          keys1_count;
+      Size          keys2_count;
+      KeysOutputIt  keys_out;
+      ItemsOutputIt items_out;
+      CompareOp     compare_op;
+      Size*         merge_partitions;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE, class T, class It1, class It2>
+      THRUST_DEVICE_FUNCTION void
+      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+                  It1 input1,
+                  It2 input2,
+                  int count1,
+                  int count2)
+      {
+        if (IS_FULL_TILE)
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1)
+              output[ITEM] = input1[idx];
+            else
+              output[ITEM] = input2[idx - count1];
+          }
+        }
+        else
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1 + count2)
+            {
+              if (idx < count1)
+                output[ITEM] = input1[idx];
+              else
+                output[ITEM] = input2[idx - count1];
+            }
+          }
+        }
+      }
+
+      template <class T, class It>
+      THRUST_DEVICE_FUNCTION void
+      reg_to_shared(It output,
+                    T (&input)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+          output[idx] = input[ITEM];
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile(Size tile_idx,
+                   Size tile_base,
+                   int  num_remaining)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        Size partition_beg = merge_partitions[tile_idx + 0];
+        Size partition_end = merge_partitions[tile_idx + 1];
+
+        Size diag0 = ITEMS_PER_TILE * tile_idx;
+        Size diag1 = thrust::min(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
+
+        // compute bounding box for keys1 & keys2
+        //
+        Size keys1_beg = partition_beg;
+        Size keys1_end = partition_end;
+        Size keys2_beg = diag0 - keys1_beg;
+        Size keys2_end = diag1 - keys1_end;
+
+        // number of keys per tile
+        //
+        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+        key_type keys_loc[ITEMS_PER_THREAD];
+        gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                  keys1_in + keys1_beg,
+                                  keys2_in + keys2_beg,
+                                  num_keys1,
+                                  num_keys2);
+        reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+        sync_threadblock();
+
+        // use binary search in shared memory
+        // to find merge path for each of thread
+        // we can use int type here, because the number of
+        // items in shared memory is limited
+        //
+        int diag0_loc = min<int>(num_keys1 + num_keys2,
+                                 ITEMS_PER_THREAD * threadIdx.x);
+
+        int keys1_beg_loc = merge_path(&storage.keys_shared[0],
+                                       &storage.keys_shared[num_keys1],
+                                       num_keys1,
+                                       num_keys2,
+                                       diag0_loc,
+                                       compare_op);
+        int keys1_end_loc = num_keys1;
+        int keys2_beg_loc = diag0_loc - keys1_beg_loc;
+        int keys2_end_loc = num_keys2;
+
+        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+        // perform serial merge
+        //
+        int indices[ITEMS_PER_THREAD];
+
+        serial_merge(&storage.keys_shared[0],
+                     keys1_beg_loc,
+                     keys2_beg_loc + num_keys1,
+                     num_keys1_loc,
+                     num_keys2_loc,
+                     keys_loc,
+                     indices,
+                     compare_op);
+
+        sync_threadblock();
+
+        // write keys
+        //
+        if (IS_FULL_TILE)
+        {
+          BlockStoreKeys(storage.store_keys)
+              .Store(keys_out + tile_base, keys_loc);
+        }
+        else
+        {
+          BlockStoreKeys(storage.store_keys)
+              .Store(keys_out + tile_base, keys_loc, num_remaining);
+        }
+
+        // if items are provided, merge them
+        if (MERGE_ITEMS::value)
+        {
+          item_type items_loc[ITEMS_PER_THREAD];
+          gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                    items1_in + keys1_beg,
+                                    items2_in + keys2_beg,
+                                    num_keys1,
+                                    num_keys2);
+
+          sync_threadblock();
+
+          reg_to_shared(&storage.items_shared[0], items_loc);
+
+          sync_threadblock();
+
+          // gather items from shared mem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+          }
+
+          sync_threadblock();
+
+          // write form reg to gmem
+          //
+          if (IS_FULL_TILE)
+          {
+            BlockStoreItems(storage.store_items)
+                .Store(items_out + tile_base, items_loc);
+          }
+          else
+          {
+            BlockStoreItems(storage.store_items)
+                .Store(items_out + tile_base, items_loc, num_remaining);
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage&  storage_,
+           KeysLoadIt1   keys1_in_,
+           KeysLoadIt2   keys2_in_,
+           ItemsLoadIt1  items1_in_,
+           ItemsLoadIt2  items2_in_,
+           Size          keys1_count_,
+           Size          keys2_count_,
+           KeysOutputIt  keys_out_,
+           ItemsOutputIt items_out_,
+           CompareOp     compare_op_,
+           Size*         merge_partitions_)
+          : storage(storage_),
+            keys1_in(keys1_in_),
+            keys2_in(keys2_in_),
+            items1_in(items1_in_),
+            items2_in(items2_in_),
+            keys1_count(keys1_count_),
+            keys2_count(keys2_count_),
+            keys_out(keys_out_),
+            items_out(items_out_),
+            compare_op(compare_op_),
+            merge_partitions(merge_partitions_)
+      {
+        // XXX with 8.5 chaging type to Size (or long long) results in error!
+        int  tile_idx      = blockIdx.x;
+        Size  tile_base     = tile_idx * ITEMS_PER_TILE;
+        int  items_in_tile = static_cast<int>(
+            min<Size>(ITEMS_PER_TILE,
+                      keys1_count + keys2_count - tile_base));
+        if (items_in_tile == ITEMS_PER_TILE)
+        {
+          // full tile
+          consume_tile<true>(tile_idx,
+                             tile_base,
+                             ITEMS_PER_TILE);
+        }
+        else
+        {
+          // partial tile
+          consume_tile<false>(tile_idx,
+                              tile_base,
+                              items_in_tile);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1       keys1_in,
+                       KeysIt2       keys2_in,
+                       ItemsIt1      items1_in,
+                       ItemsIt2      items2_in,
+                       Size          keys1_count,
+                       Size          keys2_count,
+                       KeysOutputIt  keys_out,
+                       ItemsOutputIt items_out,
+                       CompareOp     compare_op,
+                       Size*         merge_partitions,
+                       char*         shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(storage,
+           core::make_load_iterator(ptx_plan(), keys1_in),
+           core::make_load_iterator(ptx_plan(), keys2_in),
+           core::make_load_iterator(ptx_plan(), items1_in),
+           core::make_load_iterator(ptx_plan(), items2_in),
+           keys1_count,
+           keys2_count,
+           keys_out,
+           items_out,
+           compare_op,
+           merge_partitions);
+    }
+  };    // struct MergeAgent;
+
+  //---------------------------------------------------------------------
+  // Two-step internal API
+  //---------------------------------------------------------------------
+
+  template <class MERGE_ITEMS,
+            class KeysIt1,
+            class KeysIt2,
+            class ItemsIt1,
+            class ItemsIt2,
+            class Size,
+            class KeysOutputIt,
+            class ItemsOutputIt,
+            class CompareOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void*         d_temp_storage,
+            size_t&       temp_storage_bytes,
+            KeysIt1       keys1,
+            KeysIt2       keys2,
+            ItemsIt1      items1,
+            ItemsIt2      items2,
+            Size          num_keys1,
+            Size          num_keys2,
+            KeysOutputIt  keys_result,
+            ItemsOutputIt items_result,
+            CompareOp     compare_op,
+            cudaStream_t  stream,
+            bool          debug_sync)
+  {
+    if (num_keys1 + num_keys2 == 0)
+      return cudaErrorNotSupported;
+
+    using core::AgentPlan;
+    using core::get_agent_plan;
+    typedef core::AgentLauncher<
+        MergeAgent<KeysIt1,
+                   KeysIt2,
+                   ItemsIt1,
+                   ItemsIt2,
+                   Size,
+                   KeysOutputIt,
+                   ItemsOutputIt,
+                   CompareOp,
+                   MERGE_ITEMS> >
+        merge_agent;
+
+    typedef core::AgentLauncher<
+        PartitionAgent<KeysIt1,
+                       KeysIt2,
+                       Size,
+                       CompareOp> >
+        partition_agent;
+
+    cudaError_t status = cudaSuccess;
+
+    AgentPlan partition_plan = partition_agent::get_plan();
+    AgentPlan merge_plan     = merge_agent::get_plan(stream);
+
+    int  tile_size = merge_plan.items_per_tile;
+    Size num_tiles = (num_keys1 + num_keys2 + tile_size - 1) / tile_size;
+
+    size_t temp_storage1 = (1 + num_tiles) * sizeof(Size);
+    size_t temp_storage2 = core::vshmem_size(merge_plan.shared_memory_size,
+                                             num_tiles);
+
+    void*  allocations[2]      = {NULL, NULL};
+    size_t allocation_sizes[2] = {temp_storage1, temp_storage2};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_bytes,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    // partition data into work balanced tiles
+    Size* merge_partitions = (Size*)allocations[0];
+    char* vshmem_ptr       = temp_storage2 > 0 ? (char*)allocations[1] : NULL;
+
+    {
+      Size num_partitions = num_tiles + 1;
+
+      partition_agent(partition_plan, num_partitions, stream, "partition agent", debug_sync)
+          .launch(keys1,
+                  keys2,
+                  num_keys1,
+                  num_keys2,
+                  num_partitions,
+                  merge_partitions,
+                  compare_op,
+                  merge_plan.items_per_tile);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    merge_agent(merge_plan, num_keys1 + num_keys2, stream, vshmem_ptr, "merge agent", debug_sync)
+        .launch(keys1,
+                keys2,
+                items1,
+                items2,
+                num_keys1,
+                num_keys2,
+                keys_result,
+                items_result,
+                compare_op,
+                merge_partitions);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return status;
+  }
+
+  template <typename MERGE_ITEMS,
+            typename Derived,
+            typename KeysIt1,
+            typename KeysIt2,
+            typename ItemsIt1,
+            typename ItemsIt2,
+            typename KeysOutputIt,
+            typename ItemsOutputIt,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ItemsOutputIt>
+  merge(execution_policy<Derived>& policy,
+        KeysIt1                    keys1_first,
+        KeysIt1                    keys1_last,
+        KeysIt2                    keys2_first,
+        KeysIt2                    keys2_last,
+        ItemsIt1                   items1_first,
+        ItemsIt2                   items2_first,
+        KeysOutputIt               keys_result,
+        ItemsOutputIt              items_result,
+        CompareOp                  compare_op)
+  {
+    typedef typename iterator_traits<KeysIt1>::difference_type size_type;
+
+    size_type num_keys1
+      = static_cast<size_type>(thrust::distance(keys1_first, keys1_last));
+    size_type num_keys2
+      = static_cast<size_type>(thrust::distance(keys2_first, keys2_last));
+
+    size_type const count = num_keys1 + num_keys2;
+
+    if (count == 0)
+      return thrust::make_pair(keys_result, items_result);
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = doit_step<MERGE_ITEMS>(NULL,
+                                    storage_size,
+                                    keys1_first,
+                                    keys2_first,
+                                    items1_first,
+                                    items2_first,
+                                    num_keys1,
+                                    num_keys2,
+                                    keys_result,
+                                    items_result,
+                                    compare_op,
+                                    stream,
+                                    debug_sync);
+    cuda_cub::throw_on_error(status, "merge: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_step<MERGE_ITEMS>(ptr,
+                                    storage_size,
+                                    keys1_first,
+                                    keys2_first,
+                                    items1_first,
+                                    items2_first,
+                                    num_keys1,
+                                    num_keys2,
+                                    keys_result,
+                                    items_result,
+                                    compare_op,
+                                    stream,
+                                    debug_sync);
+    cuda_cub::throw_on_error(status, "merge: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "merge: failed to synchronize");
+
+    return thrust::make_pair(keys_result + count, items_result + count);
+  }
+}    // namespace __merge
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ResultIt,
+          class CompareOp>
+ResultIt __host__ __device__
+merge(execution_policy<Derived>& policy,
+      KeysIt1                    keys1_first,
+      KeysIt1                    keys1_last,
+      KeysIt2                    keys2_first,
+      KeysIt2                    keys2_last,
+      ResultIt                   result,
+      CompareOp                  compare_op)
+
+{
+  ResultIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
+    //
+    keys_type* null_ = NULL;
+    //
+    ret = __merge::merge<thrust::detail::false_type>(policy,
+                                                     keys1_first,
+                                                     keys1_last,
+                                                     keys2_first,
+                                                     keys2_last,
+                                                     null_,
+                                                     null_,
+                                                     result,
+                                                     null_,
+                                                     compare_op)
+              .first;
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::merge(cvt_to_seq(derived_cast(policy)),
+                        keys1_first,
+                        keys1_last,
+                        keys2_first,
+                        keys2_last,
+                        result,
+                        compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived, class KeysIt1, class KeysIt2, class ResultIt>
+ResultIt __host__ __device__
+merge(execution_policy<Derived>& policy,
+      KeysIt1                    keys1_first,
+      KeysIt1                    keys1_last,
+      KeysIt2                    keys2_first,
+      KeysIt2                    keys2_last,
+      ResultIt                   result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
+  return cuda_cub::merge(policy,
+                         keys1_first,
+                         keys1_last,
+                         keys2_first,
+                         keys2_last,
+                         result,
+                         less<keys_type>());
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+merge_by_key(execution_policy<Derived> &policy,
+             KeysIt1                    keys1_first,
+             KeysIt1                    keys1_last,
+             KeysIt2                    keys2_first,
+             KeysIt2                    keys2_last,
+             ItemsIt1                   items1_first,
+             ItemsIt2                   items2_first,
+             KeysOutputIt               keys_result,
+             ItemsOutputIt              items_result,
+             CompareOp                  compare_op)
+{
+  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    return __merge::merge<thrust::detail::true_type>(policy,
+                                                     keys1_first,
+                                                     keys1_last,
+                                                     keys2_first,
+                                                     keys2_last,
+                                                     items1_first,
+                                                     items2_first,
+                                                     keys_result,
+                                                     items_result,
+                                                     compare_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)),
+                               keys1_first,
+                               keys1_last,
+                               keys2_first,
+                               keys2_last,
+                               items1_first,
+                               items2_first,
+                               keys_result,
+                               items_result,
+                               compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+merge_by_key(execution_policy<Derived> &policy,
+             KeysIt1                    keys1_first,
+             KeysIt1                    keys1_last,
+             KeysIt2                    keys2_first,
+             KeysIt2                    keys2_last,
+             ItemsIt1                   items1_first,
+             ItemsIt2                   items2_first,
+             KeysOutputIt               keys_result,
+             ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
+  return cuda_cub::merge_by_key(policy,
+                                keys1_first,
+                                keys1_last,
+                                keys2_first,
+                                keys2_last,
+                                items1_first,
+                                items2_first,
+                                keys_result,
+                                items_result,
+                                thrust::less<keys_type>());
+}
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/mismatch.h b/thrust/thrust/system/cuda/detail/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..98c462e8446b7a54da43b90457ee90393188e225
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/mismatch.h
@@ -0,0 +1,117 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/pair.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2,
+         BinaryPred                 binary_pred);
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2);
+} // namespace cuda_
+} // end namespace thrust
+
+#include <thrust/system/cuda/detail/find.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2,
+         BinaryPred                 binary_pred)
+{
+  typedef transform_pair_of_input_iterators_t<bool,
+                                              InputIt1,
+                                              InputIt2,
+                                              BinaryPred>
+      transform_t;
+
+  transform_t transform_first = transform_t(first1, first2, binary_pred);
+
+  transform_t result = cuda_cub::find_if_not(policy,
+                                          transform_first,
+                                          transform_first + thrust::distance(first1, last1),
+                                          identity());
+
+  return thrust::make_pair(first1 + thrust::distance(transform_first,result),
+                           first2 + thrust::distance(transform_first,result));
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2)
+{
+  typedef typename thrust::iterator_value<InputIt1>::type InputType1;
+  return cuda_cub::mismatch(policy,
+                         first1,
+                         last1,
+                         first2,
+                         equal_to<InputType1>());
+}
+
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/par.h b/thrust/thrust/system/cuda/detail/par.h
new file mode 100644
index 0000000000000000000000000000000000000000..d232a6cfacb03f9b8b5a420542c2d690723a5622
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/par.h
@@ -0,0 +1,125 @@
+/******************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/detail/allocator_aware_execution_policy.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  include <thrust/detail/dependencies_aware_execution_policy.h>
+#endif
+
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived>
+struct execute_on_stream_base : execution_policy<Derived>
+{
+private:
+  cudaStream_t stream;
+
+public:
+  __host__ __device__
+  execute_on_stream_base(cudaStream_t stream_ = default_stream())
+      : stream(stream_) {}
+
+  THRUST_RUNTIME_FUNCTION
+  Derived
+  on(cudaStream_t const &s) const
+  {
+    Derived result = derived_cast(*this);
+    result.stream  = s;
+    return result;
+  }
+
+private:
+  friend __host__ __device__
+  cudaStream_t
+  get_stream(const execute_on_stream_base &exec)
+  {
+    return exec.stream;
+  }
+};
+
+struct execute_on_stream : execute_on_stream_base<execute_on_stream>
+{
+  typedef execute_on_stream_base<execute_on_stream> base_t;
+
+  __host__ __device__
+  execute_on_stream() : base_t(){};
+  __host__ __device__
+  execute_on_stream(cudaStream_t stream) : base_t(stream){};
+};
+
+
+struct par_t : execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    execute_on_stream_base>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<
+    execute_on_stream_base>
+#endif
+{
+  typedef execution_policy<par_t> base_t;
+
+  __host__ __device__
+  THRUST_CONSTEXPR par_t() : base_t() {}
+
+  typedef execute_on_stream stream_attachment_type;
+
+  THRUST_RUNTIME_FUNCTION
+  stream_attachment_type
+  on(cudaStream_t const &stream) const
+  {
+    return execute_on_stream(stream);
+  }
+};
+
+THRUST_INLINE_CONSTANT par_t par;
+}    // namespace cuda_
+
+namespace system {
+namespace cuda {
+  using thrust::cuda_cub::par;
+  namespace detail {
+    using thrust::cuda_cub::par_t;
+  }
+} // namesapce cuda
+} // namespace system
+
+namespace cuda {
+using thrust::cuda_cub::par;
+} // namespace cuda
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/detail/par_to_seq.h b/thrust/thrust/system/cuda/detail/par_to_seq.h
new file mode 100644
index 0000000000000000000000000000000000000000..22c4e58386e8c6dd0832bf3820072fadc53d34e8
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/par_to_seq.h
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/seq.h>
+#include <thrust/system/cuda/detail/par.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <int PAR>
+struct has_par : thrust::detail::true_type {};
+
+template <>
+struct has_par<0> : thrust::detail::false_type {};
+
+template<class Policy>
+struct cvt_to_seq_impl
+{
+  typedef thrust::detail::seq_t seq_t;
+
+  static seq_t __host__ __device__
+  doit(Policy&)
+  {
+    return seq_t();
+  }
+};    // cvt_to_seq_impl
+
+#if 0
+template <class Allocator>
+struct cvt_to_seq_impl<
+    thrust::detail::execute_with_allocator<Allocator,
+                                           execute_on_stream_base> >
+{
+  typedef thrust::detail::execute_with_allocator<Allocator,
+                                                 execute_on_stream_base>
+      Policy;
+  typedef thrust::detail::execute_with_allocator<
+      Allocator,
+      thrust::system::detail::sequential::execution_policy>
+      seq_t;
+
+
+  static seq_t __host__ __device__
+  doit(Policy& policy)
+  {
+    return seq_t(policy.m_alloc);
+  }
+};    // specialization of struct cvt_to_seq_impl
+#endif
+
+template <class Policy>
+typename cvt_to_seq_impl<Policy>::seq_t __host__ __device__
+cvt_to_seq(Policy& policy)
+{
+  return cvt_to_seq_impl<Policy>::doit(policy);
+}
+
+#if __THRUST_HAS_CUDART__
+#define THRUST_CUDART_DISPATCH par
+#else
+#define THRUST_CUDART_DISPATCH seq
+#endif
+
+} // namespace cuda_
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/detail/parallel_for.h b/thrust/thrust/system/cuda/detail/parallel_for.h
new file mode 100644
index 0000000000000000000000000000000000000000..17fa7e7a86b243c80e13bc6678e31c80ad1e3f5b
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/parallel_for.h
@@ -0,0 +1,178 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+namespace __parallel_for {
+
+  template <int _BLOCK_THREADS,
+            int _ITEMS_PER_THREAD = 1>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+  };    // struct PtxPolicy
+
+  template <class Arch, class F>
+  struct Tuning;
+
+  template <class F>
+  struct Tuning<sm30, F>
+  {
+    typedef PtxPolicy<256, 2> type;
+  };
+
+
+  template <class F,
+            class Size>
+  struct ParallelForAgent
+  {
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, F>::type
+    {
+      typedef Tuning<Arch, F> tuning;
+    };
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS
+    };
+
+    template <bool IS_FULL_TILE>
+    static void    THRUST_DEVICE_FUNCTION
+    consume_tile(F    f,
+                 Size tile_base,
+                 int  items_in_tile)
+    {
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        Size idx = BLOCK_THREADS * ITEM + threadIdx.x;
+        if (IS_FULL_TILE || idx < items_in_tile)
+          f(tile_base + idx);
+      }
+    }
+
+    THRUST_AGENT_ENTRY(F     f,
+                       Size  num_items,
+                       char * /*shmem*/ )
+    {
+      Size tile_base     = static_cast<Size>(blockIdx.x) * ITEMS_PER_TILE;
+      Size num_remaining = num_items - tile_base;
+      Size items_in_tile = static_cast<Size>(
+          num_remaining < ITEMS_PER_TILE ? num_remaining : ITEMS_PER_TILE);
+
+      if (items_in_tile == ITEMS_PER_TILE)
+      {
+        // full tile
+        consume_tile<true>(f, tile_base, ITEMS_PER_TILE);
+      }
+      else
+      {
+        // partial tile
+        consume_tile<false>(f, tile_base, items_in_tile);
+      }
+    }
+  };    // struct ParallelForEagent
+
+  template <class F,
+            class Size>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  parallel_for(Size         num_items,
+               F            f,
+               cudaStream_t stream)
+  {
+    if (num_items == 0)
+      return cudaSuccess;
+    using core::AgentLauncher;
+    using core::AgentPlan;
+
+    bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
+
+    typedef AgentLauncher<ParallelForAgent<F, Size> > parallel_for_agent;
+    AgentPlan parallel_for_plan = parallel_for_agent::get_plan(stream);
+
+    parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent", debug_sync);
+    pfa.launch(f, num_items);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return cudaSuccess;
+  }
+}    // __parallel_for
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class F,
+          class Size>
+void __host__ __device__
+parallel_for(execution_policy<Derived> &policy,
+             F                          f,
+             Size                       count)
+{
+  if (count == 0)
+    return;
+
+  if (__THRUST_HAS_CUDART__)
+  {
+    cudaStream_t stream = cuda_cub::stream(policy);
+    cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
+    cuda_cub::throw_on_error(status, "parallel_for failed");
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    for (Size idx = 0; idx != count; ++idx)
+      f(idx);
+#endif
+  }
+}
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..c69d02409f49478af09c2d06c60300d57de6a1d1
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/partition.h
@@ -0,0 +1,1146 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <cub/device/device_partition.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/partition.h>
+#include <thrust/pair.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __partition {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class, class>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 10,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+  template<class T>
+  struct Tuning<sm30, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<300>
+
+  template<int T>
+  struct __tag{};
+
+
+  struct no_stencil_tag_    {};
+  struct single_output_tag_
+  {
+    template<class T>
+    THRUST_DEVICE_FUNCTION T const& operator=(T const& t) const { return t; }
+  };
+
+  typedef no_stencil_tag_* no_stencil_tag;
+  typedef single_output_tag_* single_output_tag;;
+
+  template <class ItemsIt,
+            class StencilIt,
+            class SelectedOutIt,
+            class RejectedOutIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  struct PartitionAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type   item_type;
+    typedef typename iterator_traits<StencilIt>::value_type stencil_type;
+
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch,item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type   ItemsLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, StencilIt>::type StencilLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type   BlockLoadItems;
+      typedef typename core::BlockLoad<PtxPlan, StencilLoadIt>::type BlockLoadStencil;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        };
+
+        typename BlockLoadItems::TempStorage   load_items;
+        typename BlockLoadStencil::TempStorage load_stencil;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE> raw_exchange;
+      };    // union TempStorage
+    };    // struct PtxPlan
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
+    typedef typename ptx_plan::StencilLoadIt      StencilLoadIt;
+    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
+    typedef typename ptx_plan::BlockLoadStencil   BlockLoadStencil;
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan          BlockScan;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      SINGLE_OUTPUT    = thrust::detail::is_same<RejectedOutIt, single_output_tag>::value,
+      USE_STENCIL      = !thrust::detail::is_same<StencilIt, no_stencil_tag>::value,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  temp_storage;
+      ScanTileState &tile_state;
+      ItemsLoadIt    items_glob;
+      StencilLoadIt  stencil_glob;
+      SelectedOutIt  selected_out_glob;
+      RejectedOutIt  rejected_out_glob;
+      Predicate      predicate;
+      Size           num_items;
+
+      //---------------------------------------------------------------------
+      // Utilities
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  num_tile_items,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size num_rejected_prefix,
+              Size /*num_selections*/)
+      {
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first)
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int item_idx             = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+          int local_selection_idx  = selection_indices[ITEM] - num_selections_prefix;
+          int local_rejection_idx  = item_idx - local_selection_idx;
+          int local_scatter_offset = (selection_flags[ITEM])
+                                         ? tile_num_rejections + local_selection_idx
+                                         : local_rejection_idx;
+
+          temp_storage.raw_exchange[local_scatter_offset] = items[ITEM];
+        }
+
+        core::sync_threadblock();
+
+        // Gather items from shared memory and scatter to global
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int  item_idx       = (ITEM * BLOCK_THREADS) + threadIdx.x;
+          int  rejection_idx  = item_idx;
+          int  selection_idx  = item_idx - tile_num_rejections;
+          Size scatter_offset = (item_idx < tile_num_rejections)
+                                    ? num_items -
+                                          num_rejected_prefix - rejection_idx - 1
+                                    : num_selections_prefix + selection_idx;
+
+          item_type item = temp_storage.raw_exchange[item_idx];
+
+          if (!IS_LAST_TILE || (item_idx < num_tile_items))
+          {
+            if (SINGLE_OUTPUT || item_idx >= tile_num_rejections)
+            {
+              selected_out_glob[scatter_offset] = item;
+            }
+            else    // if !SINGLE_OUTPUT, scatter rejected items separately
+            {
+              rejected_out_glob[num_items - scatter_offset - 1] = item;
+            }
+          }
+        }
+      }    // func scatter
+
+      //------------------------------------------
+      // specialize predicate on different types
+      //------------------------------------------
+
+      enum ItemStencil
+      {
+        ITEM,
+        STENCIL
+      };
+
+      template <bool TAG, class T>
+      struct wrap_value
+      {
+        T const &              x;
+        THRUST_DEVICE_FUNCTION wrap_value(T const &x) : x(x) {}
+
+        THRUST_DEVICE_FUNCTION T const &operator()() const { return x; };
+      };    // struct wrap_type
+
+      //------- item
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &x,
+                        __tag<false /* USE_STENCIL */>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+      //-------- stencil
+
+      template <class T>
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, T> const &x,
+                        __tag<true>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, no_stencil_tag_> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, stencil_type> const &,
+                        __tag<false>)
+      {
+        return false;
+      }
+
+      template <bool IS_LAST_TILE, ItemStencil TYPE, class T>
+      THRUST_DEVICE_FUNCTION void
+      compute_selection_flags(int num_tile_items,
+                              T (&values)[ITEMS_PER_THREAD],
+                              Size (&selection_flags)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Out-of-bounds items are selection_flags
+          selection_flags[ITEM] = 1;
+
+          if (!IS_LAST_TILE ||
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+          {
+            selection_flags[ITEM] =
+                predicate_wrapper(wrap_value<TYPE, T>(values[ITEM]),
+                                  __tag<USE_STENCIL>());
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_glob + tile_base, items_loc, num_tile_items);
+        }
+        else
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_glob + tile_base, items_loc);
+        }
+
+        core::sync_threadblock();
+
+        if (USE_STENCIL)
+        {
+          stencil_type stencil_loc[ITEMS_PER_THREAD];
+
+          if (IS_LAST_TILE)
+          {
+            BlockLoadStencil(temp_storage.load_stencil)
+                .Load(stencil_glob + tile_base, stencil_loc, num_tile_items);
+          }
+          else
+          {
+            BlockLoadStencil(temp_storage.load_stencil)
+                .Load(stencil_glob + tile_base, stencil_loc);
+          }
+
+          compute_selection_flags<IS_LAST_TILE, STENCIL>(num_tile_items,
+                                                         stencil_loc,
+                                                         selection_flags);
+        }
+        else /* Use predicate on items rather then stencil */
+        {
+          compute_selection_flags<IS_LAST_TILE, ITEM>(num_tile_items,
+                                                      items_loc,
+                                                      selection_flags);
+        }
+
+        core::sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        Size num_rejected_prefix   = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+          num_rejected_prefix   = tile_base - num_selections_prefix;
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        core::sync_threadblock();
+
+        scatter<IS_LAST_TILE>(items_loc,
+                              selection_flags,
+                              selection_idx,
+                              num_tile_items,
+                              num_tile_selections,
+                              num_selections_prefix,
+                              num_rejected_prefix,
+                              num_selections);
+
+
+        return num_selections;
+      }
+
+
+      template <bool         IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION Size
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           ItemsLoadIt      items_glob_,
+           StencilLoadIt    stencil_glob_,
+           SelectedOutIt    selected_out_glob_,
+           RejectedOutIt    rejected_out_glob_,
+           Predicate        predicate_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            items_glob(items_glob_),
+            stencil_glob(stencil_glob_),
+            selected_out_glob(selected_out_glob_),
+            rejected_out_glob(rejected_out_glob_),
+            predicate(predicate_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }    //
+    };     //struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt          items,
+                       StencilIt        stencil,
+                       SelectedOutIt    selected_out,
+                       RejectedOutIt    rejected_out,
+                       Predicate        predicate,
+                       Size             num_items,
+                       NumSelectedOutIt num_selected_out,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), items),
+           core::make_load_iterator(ptx_plan(), stencil),
+           selected_out,
+           rejected_out,
+           predicate,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };       // struct PartitionAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+  template <class ItemsIt,
+            class StencilIt,
+            class SelectedOutIt,
+            class RejectedOutIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            ItemsIt          items,
+            StencilIt        stencil,
+            SelectedOutIt    selected_out,
+            RejectedOutIt    rejected_out,
+            Predicate        predicate,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream,
+            bool             debug_sync)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        PartitionAgent<ItemsIt,
+                       StencilIt,
+                       SelectedOutIt,
+                       RejectedOutIt,
+                       Predicate,
+                       Size,
+                       NumSelectedOutIt> >
+        partition_agent;
+
+    typedef typename partition_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type      init_plan      = init_agent::get_plan();
+    typename get_plan<partition_agent>::type partition_plan = partition_agent::get_plan(stream);
+
+    int tile_size = partition_plan.items_per_tile;
+    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_storage = core::vshmem_size(partition_plan.shared_memory_size,
+                                              num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return status;
+
+    size_t allocation_sizes[2] = {0, vshmem_storage};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    void* allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "partition::init_agent", debug_sync);
+
+    char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[1] : NULL;
+
+    partition_agent pa(partition_plan, num_items, stream, vshmem_ptr, "partition::partition_agent", debug_sync);
+
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    pa.launch(items,
+              stencil,
+              selected_out,
+              rejected_out,
+              predicate,
+              num_items,
+              num_selected_out,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+
+  }
+
+  template <typename Derived,
+            typename InputIt,
+            typename StencilIt,
+            typename SelectedOutIt,
+            typename RejectedOutIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  pair<SelectedOutIt, RejectedOutIt>
+  partition(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            StencilIt                  stencil,
+            SelectedOutIt              selected_result,
+            RejectedOutIt              rejected_result,
+            Predicate                  predicate)
+  {
+    typedef typename iterator_traits<InputIt>::difference_type size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       selected_result,
+                       rejected_result,
+                       predicate,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "partition failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "partition failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "partition failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       selected_result,
+                       rejected_result,
+                       predicate,
+                       d_num_selected_out,
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "partition failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "partition failed to synchronize");
+
+    size_type num_selected = 0;
+    if (num_items > 0)
+    {
+      num_selected = get_value(policy, d_num_selected_out);
+    }
+
+    return thrust::make_pair(selected_result + num_selected,
+                             rejected_result + num_items - num_selected);
+  }
+
+  template <typename Derived,
+            typename Iterator,
+            typename StencilIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  Iterator partition_inplace(execution_policy<Derived>& policy,
+                             Iterator                   first,
+                             Iterator                   last,
+                             StencilIt                  stencil,
+                             Predicate                  predicate)
+  {
+    typedef typename iterator_traits<Iterator>::difference_type size_type;
+    typedef typename iterator_traits<Iterator>::value_type      value_type;
+
+    size_type num_items = thrust::distance(first, last);
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<value_type, Derived> tmp(policy, num_items);
+
+    cuda_cub::uninitialized_copy(policy, first, last, tmp.begin());
+
+    pair<Iterator, single_output_tag> result =
+        partition(policy,
+                  tmp.data().get(),
+                  tmp.data().get() + num_items,
+                  stencil,
+                  first,
+                  single_output_tag(),
+                  predicate);
+
+    size_type num_selected = result.first - first;
+
+    return first + num_selected;
+  }
+}    // namespace __partition
+
+///// copy
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+partition_copy(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               StencilIt                  stencil,
+               SelectedOutIt              selected_result,
+               RejectedOutIt              rejected_result,
+               Predicate                  predicate)
+{
+  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition(policy,
+                            first,
+                            last,
+                            stencil,
+                            selected_result,
+                            rejected_result,
+                            predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                 first,
+                                 last,
+                                 stencil,
+                                 selected_result,
+                                 rejected_result,
+                                 predicate);
+#endif
+  }
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+partition_copy(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               SelectedOutIt              selected_result,
+               RejectedOutIt              rejected_result,
+               Predicate                  predicate)
+{
+  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition(policy,
+                                 first,
+                                 last,
+                                 __partition::no_stencil_tag(),
+                                 selected_result,
+                                 rejected_result,
+                                 predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                 first,
+                                 last,
+                                 selected_result,
+                                 rejected_result,
+                                 predicate);
+#endif
+  }
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+stable_partition_copy(execution_policy<Derived> &policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      SelectedOutIt              selected_result,
+                      RejectedOutIt              rejected_result,
+                      Predicate                  predicate)
+{
+  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition(policy,
+                                 first,
+                                 last,
+                                 __partition::no_stencil_tag(),
+                                 selected_result,
+                                 rejected_result,
+                                 predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                        first,
+                                        last,
+                                        selected_result,
+                                        rejected_result,
+                                        predicate);
+#endif
+  }
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+stable_partition_copy(execution_policy<Derived> &policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      StencilIt                  stencil,
+                      SelectedOutIt              selected_result,
+                      RejectedOutIt              rejected_result,
+                      Predicate                  predicate)
+{
+  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition(policy,
+                                 first,
+                                 last,
+                                 stencil,
+                                 selected_result,
+                                 rejected_result,
+                                 predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                        first,
+                                        last,
+                                        stencil,
+                                        selected_result,
+                                        rejected_result,
+                                        predicate);
+#endif
+  }
+  return ret;
+}
+
+/// inplace
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate>
+Iterator __host__ __device__
+partition(execution_policy<Derived> &policy,
+          Iterator                   first,
+          Iterator                   last,
+          StencilIt                  stencil,
+          Predicate                  predicate)
+{
+  Iterator ret = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition_inplace(policy, first, last, stencil, predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                            first,
+                            last,
+                            stencil,
+                            predicate);
+#endif
+  }
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class Predicate>
+Iterator __host__ __device__
+partition(execution_policy<Derived> &policy,
+          Iterator                   first,
+          Iterator                   last,
+          Predicate                  predicate)
+{
+  Iterator ret = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __partition::partition_inplace(policy,
+                                         first,
+                                         last,
+                                         __partition::no_stencil_tag(),
+                                         predicate);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                            first,
+                            last,
+                            predicate);
+#endif
+  }
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate>
+Iterator __host__ __device__
+stable_partition(execution_policy<Derived> &policy,
+                 Iterator                   first,
+                 Iterator                   last,
+                 StencilIt                  stencil,
+                 Predicate                  predicate)
+{
+  Iterator result = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    result = __partition::partition_inplace(policy,
+                                    first,
+                                    last,
+                                    stencil,
+                                    predicate);
+
+    // partition returns rejected values in reverese order
+    // so reverse the rejected elements to make it stable
+    cuda_cub::reverse(policy, result, last);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
+                                      first,
+                                      last,
+                                      stencil,
+                                      predicate);
+#endif
+  }
+  return result;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class Predicate>
+Iterator __host__ __device__
+stable_partition(execution_policy<Derived> &policy,
+                 Iterator                   first,
+                 Iterator                   last,
+                 Predicate                  predicate)
+{
+  Iterator result = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    result = __partition::partition_inplace(policy,
+                                       first,
+                                       last,
+                                       __partition::no_stencil_tag(),
+                                       predicate);
+
+    // partition returns rejected values in reverese order
+    // so reverse the rejected elements to make it stable
+    cuda_cub::reverse(policy, result, last);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
+                                      first,
+                                      last,
+                                      predicate);
+#endif
+  }
+  return result;
+}
+
+template <class Derived,
+          class ItemsIt,
+          class Predicate>
+bool __host__ __device__
+is_partitioned(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last,
+               Predicate                  predicate)
+{
+  ItemsIt boundary = cuda_cub::find_if_not(policy, first, last, predicate);
+  ItemsIt end      = cuda_cub::find_if(policy,boundary,last,predicate);
+  return end == last;
+}
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/per_device_resource.h b/thrust/thrust/system/cuda/detail/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..68f7194af34ad93736cff429152d381af31e8dff
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/per_device_resource.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <mutex>
+#include <unordered_map>
+
+namespace thrust
+{
+
+namespace cuda_cub
+{
+
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(execution_policy<DerivedPolicy>&)
+{
+    static std::mutex map_lock;
+    static std::unordered_map<int, MR> device_id_to_resource;
+
+    int device_id;
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
+
+    std::lock_guard<std::mutex> lock{map_lock};
+    return &device_id_to_resource[device_id];
+}
+
+}
+
+} // end namespace thrust
+
+#endif
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/detail/pointer.inl b/thrust/thrust/system/cuda/detail/pointer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..60f277f597204fb0570c954da47f92ff1ccc8302
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/pointer.inl
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+namespace thrust
+{
+
+// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
+//     pointer_raw_pointer for pointer by specializing it here
+//     note that we specialize it here, before the use of raw_pointer_cast
+//     below, which causes pointer_raw_pointer's instantiation
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+namespace detail
+{
+
+template<typename T>
+  struct pointer_raw_pointer< thrust::cuda_cub::pointer<T> >
+{
+  typedef typename thrust::cuda_cub::pointer<T>::raw_pointer type;
+}; // end pointer_raw_pointer
+
+} // end detail
+#endif
+
+namespace cuda_cub {
+
+template <typename T>
+template <typename OtherT>
+__host__ __device__ reference<T> &reference<T>::operator=(
+    const reference<OtherT> &other) {
+  return super_t::operator=(other);
+} // end reference::operator=()
+
+template <typename T>
+__host__ __device__ reference<T> &reference<T>::operator=(const value_type &x) {
+  return super_t::operator=(x);
+} // end reference::operator=()
+
+template<typename T>
+__host__ __device__
+void swap(reference<T> a, reference<T> b)
+{
+  a.swap(b);
+} // end swap()
+
+} // end cuda_cub
+} // end thrust
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fece97186a32ffb147c60a5b28f990a8600ba6f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -0,0 +1,1076 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <cub/device/device_reduce.cuh>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/make_unsigned_special.h>
+#include <thrust/functional.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+#include <thrust/detail/alignment.h>
+
+namespace thrust
+{
+
+// forward declare generic reduce
+// to circumvent circular dependency
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename T,
+          typename BinaryFunction>
+T __host__ __device__
+reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+       InputIterator                                               first,
+       InputIterator                                               last,
+       T                                                           init,
+       BinaryFunction                                              binary_op);
+
+namespace cuda_cub {
+
+namespace __reduce {
+
+  template<bool>
+  struct is_true : thrust::detail::false_type {};
+  template<>
+  struct is_true<true> : thrust::detail::true_type {};
+
+  template <int                       _BLOCK_THREADS,
+            int                       _ITEMS_PER_THREAD   = 1,
+            int                       _VECTOR_LOAD_LENGTH = 1,
+            cub::BlockReduceAlgorithm _BLOCK_ALGORITHM    = cub::BLOCK_REDUCE_RAKING,
+            cub::CacheLoadModifier    _LOAD_MODIFIER      = cub::LOAD_DEFAULT,
+            cub::GridMappingStrategy  _GRID_MAPPING       = cub::GRID_MAPPING_DYNAMIC>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
+    static const cub::CacheLoadModifier    LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::GridMappingStrategy  GRID_MAPPING    = _GRID_MAPPING;
+  }; // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  template <class T>
+  struct Tuning<sm30, T>
+  {
+    enum
+    {
+      // Relative size of T type to a 4-byte word
+      SCALE_FACTOR_4B = (sizeof(T) + 3) / 4,
+      // Relative size of T type to a 1-byte word
+      SCALE_FACTOR_1B = sizeof(T),
+    };
+
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / SCALE_FACTOR_4B),
+                      2,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_DEFAULT,
+                      cub::GRID_MAPPING_RAKE>
+        type;
+  }; // Tuning sm30
+
+  template <class T>
+  struct Tuning<sm35, T> : Tuning<sm30,T>
+  {
+    // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
+    typedef PtxPolicy<128,
+                      CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
+        ReducePolicy1B;
+
+    // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items)
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
+        ReducePolicy4B;
+
+    typedef typename thrust::detail::conditional<(sizeof(T) < 4),
+                                                 ReducePolicy1B,
+                                                 ReducePolicy4B>::type type;
+  };    // Tuning sm35
+
+  template <class InputIt,
+            class OutputIt,
+            class T,
+            class Size,
+            class ReductionOp>
+  struct ReduceAgent
+  {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,T>::type
+    {
+      // we need this type definition to indicate "specialize_plan" metafunction
+      // that this PtxPlan may have specializations for different Arch
+      // via Tuning<Arch,T> type.
+      //
+      typedef Tuning<Arch,T> tuning;
+
+      typedef typename cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH> Vector;
+      typedef typename core::LoadIterator<PtxPlan, InputIt>::type     LoadIt;
+      typedef cub::BlockReduce<T,
+                               PtxPlan::BLOCK_THREADS,
+                               PtxPlan::BLOCK_ALGORITHM,
+                               1,
+                               1,
+                               Arch::ver>
+          BlockReduce;
+
+      typedef cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER,
+                                              Vector,
+                                              Size>
+          VectorLoadIt;
+
+      struct TempStorage
+      {
+        typename BlockReduce::TempStorage reduce;
+        //
+        Size dequeue_offset;
+      };    // struct TempStorage
+
+
+    }; // struct PtxPlan
+
+    // Reduction need additional information which is not covered in
+    // default core::AgentPlan. We thus inherit from core::AgentPlan
+    // and add additional member fields that are needed.
+    // Other algorithms, e.g. merge, may not need additional information,
+    // and may use AgentPlan directly, instead of defining their own Plan type.
+    //
+    struct Plan : core::AgentPlan
+    {
+      cub::GridMappingStrategy grid_mapping;
+
+      template <class P>
+      THRUST_RUNTIME_FUNCTION
+          Plan(P) : core::AgentPlan(P()),
+                    grid_mapping(P::GRID_MAPPING)
+      {
+      }
+    };
+
+    // this specialized PtxPlan for a device-compiled Arch
+    // ptx_plan type *must* only be used from device code
+    // Its use from host code will result in *undefined behaviour*
+    //
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::TempStorage  TempStorage;
+    typedef typename ptx_plan::Vector       Vector;
+    typedef typename ptx_plan::LoadIt       LoadIt;
+    typedef typename ptx_plan::BlockReduce  BlockReduce;
+    typedef typename ptx_plan::VectorLoadIt VectorLoadIt;
+
+    enum
+    {
+      ITEMS_PER_THREAD   = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS      = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE     = ptx_plan::ITEMS_PER_TILE,
+      VECTOR_LOAD_LENGTH = ptx_plan::VECTOR_LOAD_LENGTH,
+
+      ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
+                              (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                              thrust::detail::is_pointer<InputIt>::value &&
+                              thrust::detail::is_arithmetic<
+                                  typename thrust::detail::remove_cv<T> >::value
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage &storage;
+      InputIt      input_it;
+      LoadIt       load_it;
+      ReductionOp  reduction_op;
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &storage_,
+                                  InputIt      input_it_,
+                                  ReductionOp  reduction_op_)
+          : storage(storage_),
+            input_it(input_it_),
+            load_it(core::make_load_iterator(ptx_plan(), input_it)),
+            reduction_op(reduction_op_) {}
+
+      //---------------------------------------------------------------------
+      // Utility
+      //---------------------------------------------------------------------
+
+
+      // Whether or not the input is aligned with the vector type
+      // (specialized for types we can vectorize)
+      //
+      template <class Iterator>
+      static THRUST_DEVICE_FUNCTION bool
+      is_aligned(Iterator d_in,
+                 thrust::detail::true_type /* can_vectorize */)
+      {
+        return (size_t(d_in) & (sizeof(Vector) - 1)) == 0;
+      }
+
+      // Whether or not the input is aligned with the vector type
+      // (specialized for types we cannot vectorize)
+      //
+      template <class Iterator>
+      static THRUST_DEVICE_FUNCTION bool
+      is_aligned(Iterator,
+                 thrust::detail::false_type /* can_vectorize */)
+      {
+        return false;
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      // Consume a full tile of input (non-vectorized)
+      //
+      template <int IS_FIRST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  /*valid_items*/,
+                   thrust::detail::true_type /* is_full_tile */,
+                   thrust::detail::false_type /* can_vectorize */)
+      {
+        T items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x,
+                                              load_it + block_offset,
+                                              items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate =
+            (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op)
+                            : cub::internal::ThreadReduce(items, reduction_op,
+                                                          thread_aggregate);
+      }
+
+      // Consume a full tile of input (vectorized)
+      //
+      template <int IS_FIRST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  /*valid_items*/,
+                   thrust::detail::true_type /* is_full_tile */,
+                   thrust::detail::true_type /* can_vectorize */)
+      {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum
+        {
+          WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH
+        };
+
+        T items[ITEMS_PER_THREAD];
+
+        Vector *vec_items = reinterpret_cast<Vector *>(items);
+
+        // Vector Input iterator wrapper type (for applying cache modifier)
+        T *d_in_unqualified = const_cast<T *>(input_it) +
+                              block_offset +
+                              (threadIdx.x * VECTOR_LOAD_LENGTH);
+        VectorLoadIt vec_load_it(reinterpret_cast<Vector *>(d_in_unqualified));
+
+#pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+        {
+          vec_items[i] = vec_load_it[BLOCK_THREADS * i];
+        }
+
+
+        // Reduce items within each thread stripe
+        thread_aggregate =
+            (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op)
+                            : cub::internal::ThreadReduce(items, reduction_op,
+                                                          thread_aggregate);
+      }
+
+
+      // Consume a partial tile of input
+      //
+      template <int IS_FIRST_TILE, class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  valid_items,
+                   thrust::detail::false_type /* is_full_tile */,
+                   CAN_VECTORIZE)
+      {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+          thread_aggregate = load_it[block_offset + thread_offset];
+          thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+          thread_aggregate = reduction_op(
+              thread_aggregate,
+              thrust::raw_reference_cast(load_it[block_offset + thread_offset]));
+          thread_offset += BLOCK_THREADS;
+        }
+      }
+
+      //---------------------------------------------------------------
+      // Consume a contiguous segment of tiles
+      //---------------------------------------------------------------------
+
+
+      // Reduce a contiguous segment of input tiles
+      //
+      template <class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION T
+      consume_range_impl(Size          block_offset,
+                         Size          block_end,
+                         CAN_VECTORIZE can_vectorize)
+      {
+        T thread_aggregate;
+
+        if (block_offset + ITEMS_PER_TILE > block_end)
+        {
+          // First tile isn't full (not all threads have valid items)
+          int valid_items = block_end - block_offset;
+          consume_tile<true>(thread_aggregate,
+                             block_offset,
+                             valid_items,
+                             thrust::detail::false_type(),
+                             can_vectorize);
+          return BlockReduce(storage.reduce)
+              .Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        consume_tile<true>(thread_aggregate,
+                           block_offset,
+                           ITEMS_PER_TILE,
+                           thrust::detail::true_type(),
+                           can_vectorize);
+        block_offset += ITEMS_PER_TILE;
+
+        // Consume subsequent full tiles of input
+        while (block_offset + ITEMS_PER_TILE <= block_end)
+        {
+          consume_tile<false>(thread_aggregate,
+                              block_offset,
+                              ITEMS_PER_TILE,
+                              thrust::detail::true_type(),
+                              can_vectorize);
+          block_offset += ITEMS_PER_TILE;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+          int valid_items = block_end - block_offset;
+          consume_tile<false>(thread_aggregate,
+                              block_offset,
+                              valid_items,
+                              thrust::detail::false_type(),
+                              can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduce(storage.reduce)
+            .Reduce(thread_aggregate, reduction_op);
+      }
+
+      // Reduce a contiguous segment of input tiles
+      //
+      THRUST_DEVICE_FUNCTION T consume_range(Size block_offset,
+                                             Size block_end)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>          attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION>  path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        return is_aligned(input_it + block_offset, attempt_vec())
+                   ? consume_range_impl(block_offset, block_end, path_a())
+                   : consume_range_impl(block_offset, block_end, path_b());
+      }
+
+      // Reduce a contiguous segment of input tiles
+      //
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles(Size /*num_items*/,
+                    cub::GridEvenShare<Size> &even_share,
+                    cub::GridQueue<UnsignedSize> & /*queue*/,
+                    thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>          attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION>  path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        // Initialize even-share descriptor for this thread block
+        even_share
+            .template BlockInit<ITEMS_PER_TILE, cub::GRID_MAPPING_RAKE>();
+
+        return is_aligned(input_it, attempt_vec())
+                   ? consume_range_impl(even_share.block_offset,
+                                        even_share.block_end,
+                                        path_a())
+                   : consume_range_impl(even_share.block_offset,
+                                        even_share.block_end,
+                                        path_b());
+      }
+
+
+      //---------------------------------------------------------------------
+      // Dynamically consume tiles
+      //---------------------------------------------------------------------
+
+      // Dequeue and reduce tiles of items as part of a inter-block reduction
+      //
+      template <class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles_impl(Size                         num_items,
+                         cub::GridQueue<UnsignedSize> queue,
+                         CAN_VECTORIZE                can_vectorize)
+      {
+        using core::sync_threadblock;
+
+        // We give each thread block at least one tile of input.
+        T    thread_aggregate;
+        Size block_offset    = blockIdx.x * ITEMS_PER_TILE;
+        Size even_share_base = gridDim.x * ITEMS_PER_TILE;
+
+        if (block_offset + ITEMS_PER_TILE > num_items)
+        {
+          // First tile isn't full (not all threads have valid items)
+          int valid_items = num_items - block_offset;
+          consume_tile<true>(thread_aggregate,
+                             block_offset,
+                             valid_items,
+                             thrust::detail::false_type(),
+                             can_vectorize);
+          return BlockReduce(storage.reduce)
+              .Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // Consume first full tile of input
+        consume_tile<true>(thread_aggregate,
+                           block_offset,
+                           ITEMS_PER_TILE,
+                           thrust::detail::true_type(),
+                           can_vectorize);
+
+        if (num_items > even_share_base)
+        {
+          // Dequeue a tile of items
+          if (threadIdx.x == 0)
+            storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) +
+                                     even_share_base;
+
+          sync_threadblock();
+
+          // Grab tile offset and check if we're done with full tiles
+          block_offset = storage.dequeue_offset;
+
+          // Consume more full tiles
+          while (block_offset + ITEMS_PER_TILE <= num_items)
+          {
+            consume_tile<false>(thread_aggregate,
+                                block_offset,
+                                ITEMS_PER_TILE,
+                                thrust::detail::true_type(),
+                                can_vectorize);
+
+            sync_threadblock();
+
+            // Dequeue a tile of items
+            if (threadIdx.x == 0)
+              storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) +
+                                       even_share_base;
+
+            sync_threadblock();
+
+            // Grab tile offset and check if we're done with full tiles
+            block_offset = storage.dequeue_offset;
+          }
+
+          // Consume partial tile
+          if (block_offset < num_items)
+          {
+            int valid_items = num_items - block_offset;
+            consume_tile<false>(thread_aggregate,
+                                block_offset,
+                                valid_items,
+                                thrust::detail::false_type(),
+                                can_vectorize);
+          }
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduce(storage.reduce)
+            .Reduce(thread_aggregate, reduction_op);
+      }
+
+
+      // Dequeue and reduce tiles of items as part of a inter-block reduction
+      //
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles(
+          Size                              num_items,
+          cub::GridEvenShare<Size> &/*even_share*/,
+          cub::GridQueue<UnsignedSize> &    queue,
+          thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>         attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION> path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        return is_aligned(input_it, attempt_vec())
+                   ? consume_tiles_impl(num_items, queue, path_a())
+                   : consume_tiles_impl(num_items, queue, path_b());
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry points
+    //---------------------------------------------------------------------
+
+    // single tile reduce entry point
+    //
+    THRUST_AGENT_ENTRY(InputIt     input_it,
+                       OutputIt    output_it,
+                       Size        num_items,
+                       ReductionOp reduction_op,
+                       char *      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      if (num_items == 0)
+      {
+        return;
+      }
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op).consume_range((Size)0, num_items);
+
+      if (threadIdx.x == 0)
+        *output_it = block_aggregate;
+    }
+
+    // single tile reduce entry point
+    //
+    THRUST_AGENT_ENTRY(InputIt     input_it,
+                       OutputIt    output_it,
+                       Size        num_items,
+                       ReductionOp reduction_op,
+                       T           init,
+                       char *      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      if (num_items == 0)
+      {
+        if (threadIdx.x == 0)
+          *output_it = init;
+        return;
+      }
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op).consume_range((Size)0, num_items);
+
+      if (threadIdx.x == 0)
+        *output_it = reduction_op(init, block_aggregate);
+    }
+
+    THRUST_AGENT_ENTRY(InputIt                          input_it,
+                       OutputIt                         output_it,
+                       Size                             num_items,
+                       cub::GridEvenShare<Size> even_share,
+                       cub::GridQueue<UnsignedSize>     queue,
+                       ReductionOp                      reduction_op,
+                       char *                           shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      typedef thrust::detail::integral_constant<cub::GridMappingStrategy, ptx_plan::GRID_MAPPING> grid_mapping;
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op)
+              .consume_tiles(num_items, even_share, queue, grid_mapping());
+
+      if (threadIdx.x == 0)
+        output_it[blockIdx.x] = block_aggregate;
+    }
+  };    // struct ReduceAgent
+
+  template<class Size>
+  struct DrainAgent
+  {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<1> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue,
+                       Size                         num_items,
+                       char * /*shmem*/)
+    {
+      grid_queue.FillAndResetDrain(num_items);
+    }
+  };    // struct DrainAgent;
+
+
+  template <class InputIt,
+            class OutputIt,
+            class Size,
+            class ReductionOp,
+            class T>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *       d_temp_storage,
+            size_t &     temp_storage_bytes,
+            InputIt      input_it,
+            Size         num_items,
+            T            init,
+            ReductionOp  reduction_op,
+            OutputIt     output_it,
+            cudaStream_t stream,
+            bool         debug_sync)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+    using core::get_agent_plan;
+    using core::cuda_optional;
+
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        ReduceAgent<InputIt, OutputIt, T, Size, ReductionOp> >
+        reduce_agent;
+
+    typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream);
+
+    cudaError_t status = cudaSuccess;
+
+
+    if (num_items <= reduce_plan.items_per_tile)
+    {
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+
+      // small, single tile size
+      if (d_temp_storage == NULL)
+      {
+        temp_storage_bytes = max<size_t>(1, vshmem_size);
+        return status;
+      }
+      char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
+
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only", debug_sync);
+      ra.launch(input_it, output_it, num_items, reduction_op, init);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+    else
+    {
+      // regular size
+      cuda_optional<int> sm_count = core::get_sm_count();
+      CUDA_CUB_RET_IF_FAIL(sm_count.status());
+
+      // reduction will not use more cta counts than requested
+      cuda_optional<int> max_blocks_per_sm =
+          reduce_agent::
+              template get_max_blocks_per_sm<InputIt,
+                                             OutputIt,
+                                             Size,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
+                                             ReductionOp>(reduce_plan);
+      CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
+
+
+
+      int reduce_device_occupancy = (int)max_blocks_per_sm * sm_count;
+
+      int sm_oversubscription = 5;
+      int max_blocks          = reduce_device_occupancy * sm_oversubscription;
+
+      cub::GridEvenShare<Size> even_share;
+      even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
+                              reduce_plan.items_per_tile);
+
+      // we will launch at most "max_blocks" blocks in a grid
+      // so preallocate virtual shared memory storage for this if required
+      //
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size,
+                                             max_blocks);
+
+      // Temporary storage allocation requirements
+      void * allocations[3] = {NULL, NULL, NULL};
+      size_t allocation_sizes[3] =
+          {
+              max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              vshmem_size                                        // size of virtualized shared memory storage
+          };
+      status = cub::AliasTemporaries(d_temp_storage,
+                                     temp_storage_bytes,
+                                     allocations,
+                                     allocation_sizes);
+      CUDA_CUB_RET_IF_FAIL(status);
+      if (d_temp_storage == NULL)
+      {
+        return status;
+      }
+
+      T *d_block_reductions = (T*) allocations[0];
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
+      char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
+
+
+      // Get grid size for device_reduce_sweep_kernel
+      int reduce_grid_size = 0;
+      if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE)
+      {
+        // Work is distributed evenly
+        reduce_grid_size = even_share.grid_size;
+      }
+      else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
+      {
+        // Work is distributed dynamically
+        size_t num_tiles = (num_items + reduce_plan.items_per_tile - 1) /
+          reduce_plan.items_per_tile;
+
+        // if not enough to fill the device with threadblocks
+        // then fill the device with threadblocks
+        reduce_grid_size = static_cast<int>(min(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+
+        typedef AgentLauncher<DrainAgent<Size> > drain_agent;
+        AgentPlan drain_plan = drain_agent::get_plan();
+        drain_plan.grid_size = 1;
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent", debug_sync);
+        da.launch(queue, num_items);
+        CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+      }
+      else
+      {
+        CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported);
+      }
+
+      reduce_plan.grid_size = reduce_grid_size;
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce", debug_sync);
+      ra.launch(input_it,
+                d_block_reductions,
+                num_items,
+                even_share,
+                queue,
+                reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+
+      typedef AgentLauncher<
+        ReduceAgent<T*, OutputIt, T, Size, ReductionOp> >
+        reduce_agent_single;
+
+      reduce_plan.grid_size = 1;
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce", debug_sync);
+
+      ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op, init);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    return status;
+  }    // func doit_step
+
+
+  template <typename Derived,
+            typename InputIt,
+            typename Size,
+            typename T,
+            typename BinaryOp>
+  THRUST_RUNTIME_FUNCTION
+  T reduce(execution_policy<Derived>& policy,
+           InputIt                    first,
+           Size                       num_items,
+           T                          init,
+           BinaryOp                   binary_op)
+  {
+    if (num_items == 0)
+      return init;
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       num_items,
+                       init,
+                       binary_op,
+                       reinterpret_cast<T*>(NULL),
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
+
+    T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       num_items,
+                       init,
+                       binary_op,
+                       d_result,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "reduce failed to synchronize");
+
+    T result = cuda_cub::get_value(policy, d_result);
+
+    return result;
+  }
+}    // namespace __reduce
+
+namespace detail {
+
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename T,
+          typename BinaryOp>
+THRUST_RUNTIME_FUNCTION
+T reduce_n_impl(execution_policy<Derived>& policy,
+                InputIt                    first,
+                Size                       num_items,
+                T                          init,
+                BinaryOp                   binary_op)
+{
+  cudaStream_t stream = cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+
+  THRUST_INDEX_TYPE_DISPATCH2(status,
+    cub::DeviceReduce::Reduce,
+    (cub::DispatchReduce<
+        InputIt, T*, Size, BinaryOp
+    >::Dispatch),
+    num_items,
+    (NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
+        num_items_fixed, binary_op, init, stream,
+        THRUST_DEBUG_SYNC_FLAG));
+  cuda_cub::throw_on_error(status, "after reduction step 1");
+
+  // Allocate temporary storage.
+
+  thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+    tmp(policy, sizeof(T) + tmp_size);
+
+  // Run reduction.
+
+  // `tmp.begin()` yields a `normal_iterator`, which dereferences to a
+  // `reference`, which has an `operator&` that returns a `pointer`, which
+  // has a `.get` method that returns a raw pointer, which we can (finally)
+  // `static_cast` to `void*`.
+  //
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
+  void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
+  THRUST_INDEX_TYPE_DISPATCH2(status,
+    cub::DeviceReduce::Reduce,
+    (cub::DispatchReduce<
+        InputIt, T*, Size, BinaryOp
+    >::Dispatch),
+    num_items,
+    (tmp_ptr, tmp_size, first, ret_ptr,
+        num_items_fixed, binary_op, init, stream,
+        THRUST_DEBUG_SYNC_FLAG));
+  cuda_cub::throw_on_error(status, "after reduction step 2");
+
+  // Synchronize the stream and get the value.
+
+  cuda_cub::throw_on_error(cuda_cub::synchronize(policy),
+    "reduce failed to synchronize");
+
+  // `tmp.begin()` yields a `normal_iterator`, which dereferences to a
+  // `reference`, which has an `operator&` that returns a `pointer`, which
+  // has a `.get` method that returns a raw pointer, which we can (finally)
+  // `static_cast` to `void*`.
+  //
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  return thrust::cuda_cub::get_value(policy,
+    thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get()));
+}
+
+} // namespace detail
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename T,
+          typename BinaryOp>
+__host__ __device__
+T reduce_n(execution_policy<Derived>& policy,
+           InputIt                    first,
+           Size                       num_items,
+           T                          init,
+           BinaryOp                   binary_op)
+{
+  if (__THRUST_HAS_CUDART__)
+    return thrust::cuda_cub::detail::reduce_n_impl(
+      policy, first, num_items, init, binary_op);
+
+  #if !__THRUST_HAS_CUDART__
+    return thrust::reduce(
+      cvt_to_seq(derived_cast(policy)), first, first + num_items, init, binary_op);
+  #endif
+}
+
+template <class Derived, class InputIt, class T, class BinaryOp>
+__host__ __device__
+T reduce(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         T                          init,
+         BinaryOp                   binary_op)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  // FIXME: Check for RA iterator.
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  return cuda_cub::reduce_n(policy, first, num_items, init, binary_op);
+}
+
+template <class Derived,
+          class InputIt,
+          class T>
+__host__ __device__
+T reduce(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         T                          init)
+{
+  return cuda_cub::reduce(policy, first, last, init, plus<T>());
+}
+
+template <class Derived,
+          class InputIt>
+__host__ __device__
+typename iterator_traits<InputIt>::value_type
+reduce(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last)
+{
+  typedef typename iterator_traits<InputIt>::value_type value_type;
+  return cuda_cub::reduce(policy, first, last, value_type(0));
+}
+
+
+} // namespace cuda_cub
+
+} // end namespace thrust
+
+#include <thrust/memory.h>
+#include <thrust/reduce.h>
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..673a64b827c35b3cf48794dc55ee1a4d857f0eb6
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h
@@ -0,0 +1,1168 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/detail/type_traits.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <cub/device/device_reduce.cuh>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/pair.h>
+#include <thrust/functional.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+#include <thrust/detail/alignment.h>
+
+namespace thrust
+{
+
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename BinaryPredicate>
+__host__ __device__ thrust::pair<OutputIterator1, OutputIterator2>
+reduce_by_key(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator1                                              keys_first,
+    InputIterator1                                              keys_last,
+    InputIterator2                                              values_first,
+    OutputIterator1                                             keys_output,
+    OutputIterator2                                             values_output,
+    BinaryPredicate                                             binary_pred);
+
+namespace cuda_cub {
+
+namespace __reduce_by_key {
+
+  template<bool> struct is_true : thrust::detail::false_type {};
+  template<> struct is_true<true> : thrust::detail::true_type {};
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template <class Arch, class Key, class Value>
+  struct Tuning;
+
+  template <class Key, class Value>
+  struct Tuning<sm30, Key, Value>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm30
+
+  template<class Key, class Value>
+  struct Tuning<sm35,Key,Value> : Tuning<sm30,Key,Value>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD =
+          (Tuning::MAX_INPUT_BYTES <= 8)
+              ? 6
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         Tuning::COMBINED_INPUT_BYTES - 1) /
+                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm35
+
+  template<class Key, class Value>
+  struct Tuning<sm52,Key,Value> : Tuning<sm30,Key,Value>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+
+      ITEMS_PER_THREAD =
+          (Tuning::MAX_INPUT_BYTES <= 8)
+              ? 9
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         Tuning::COMBINED_INPUT_BYTES - 1) /
+                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm52
+
+  template <class KeysInputIt,
+            class ValuesInputIt,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class EqualityOp,
+            class ReductionOp,
+            class NumRunsOutputIt,
+            class Size>
+  struct ReduceByKeyAgent
+  {
+    typedef typename iterator_traits<KeysInputIt>::value_type   key_type;
+    typedef typename iterator_traits<ValuesInputIt>::value_type value_type;
+    typedef Size                                                size_type;
+
+    typedef cub::KeyValuePair<size_type, value_type> size_value_pair_t;
+    typedef cub::KeyValuePair<key_type, value_type>  key_value_pair_t;
+
+    typedef cub::ReduceByKeyScanTileState<value_type, size_type> ScanTileState;
+    typedef cub::ReduceBySegmentOp<ReductionOp> ReduceBySegmentOp;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,key_type, value_type>::type
+    {
+      typedef Tuning<Arch, key_type, value_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysInputIt>::type    KeysLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ValuesInputIt>::type  ValuesLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt>::type   BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt>::type BlockLoadValues;
+
+      typedef cub::BlockDiscontinuity<key_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityKeys;
+
+      typedef cub::TilePrefixCallbackOp<size_value_pair_t,
+                                        ReduceBySegmentOp,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<size_value_pair_t,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage              scan;
+          typename TilePrefixCallback::TempStorage     prefix;
+          typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        };
+
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadValues::TempStorage load_values;
+
+        core::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1>
+          raw_exchange;
+      };    // union TempStorage
+    };  // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt             KeysLoadIt;
+    typedef typename ptx_plan::ValuesLoadIt           ValuesLoadIt;
+    typedef typename ptx_plan::BlockLoadKeys          BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadValues        BlockLoadValues;
+    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
+    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan              BlockScan;
+    typedef typename ptx_plan::TempStorage            TempStorage;
+
+    enum
+    {
+      BLOCK_THREADS     = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD  = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE    = ptx_plan::ITEMS_PER_TILE,
+      TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1),
+
+      // Whether or not the scan operation has a zero-valued identity value
+      // (true if we're performing addition on a primitive type)
+      HAS_IDENTITY_ZERO = thrust::detail::is_same<ReductionOp,
+                                                  plus<value_type> >::value &&
+                          thrust::detail::is_arithmetic<value_type>::value
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      storage;
+      KeysLoadIt                         keys_load_it;
+      ValuesLoadIt                       values_load_it;
+      KeysOutputIt                       keys_output_it;
+      ValuesOutputIt                     values_output_it;
+      NumRunsOutputIt                    num_runs_output_it;
+      cub::InequalityWrapper<EqualityOp> inequality_op;
+      ReduceBySegmentOp                  scan_op;
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods
+      //---------------------------------------------------------------------
+
+      // Scan with identity (first tile)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::true_type /* has_identity */)
+      {
+        size_value_pair_t identity;
+        identity.value = 0;
+        identity.key   = 0;
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items, scan_items, identity, scan_op, tile_aggregate);
+      }
+
+      // Scan without identity (first tile).
+      // Without an identity, the first output item is undefined.
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::false_type /* has_identity */)
+      {
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
+      }
+
+      // Scan with identity (subsequent tile)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::true_type /*  has_identity */)
+      {
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items,
+                           scan_items,
+                           scan_op,
+                           prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Scan without identity (subsequent tile).
+      // Without an identity, the first output item is undefined.
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::false_type /* has_identity */)
+      {
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items,
+                           scan_items,
+                           scan_op,
+                           prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      //---------------------------------------------------------------------
+      // Zip utility methods
+      //---------------------------------------------------------------------
+
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      zip_values_and_flags(size_type num_remaining,
+                           value_type (&values)[ITEMS_PER_THREAD],
+                           size_type (&segment_flags)[ITEMS_PER_THREAD],
+                           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set segment_flags for first out-of-bounds item, zero for others
+          if (IS_LAST_TILE &&
+              Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining)
+            segment_flags[ITEM] = 1;
+
+          scan_items[ITEM].value = values[ITEM];
+          scan_items[ITEM].key   = segment_flags[ITEM];
+        }
+      }
+
+      THRUST_DEVICE_FUNCTION void zip_keys_and_values(
+          key_type (&keys)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          scatter_items[ITEM].key   = keys[ITEM];
+          scatter_items[ITEM].value = scan_items[ITEM].value;
+          segment_indices[ITEM]     = scan_items[ITEM].key;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Scatter utility methods
+      //---------------------------------------------------------------------
+
+      // Directly scatter flagged items to output offsets
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false)
+      THRUST_DEVICE_FUNCTION void scatter_direct(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD])
+      {
+        // Scatter flagged keys and values
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (segment_flags[ITEM])
+          {
+            keys_output_it[segment_indices[ITEM]] = scatter_items[ITEM].key;
+            values_output_it[segment_indices[ITEM]] = scatter_items[ITEM].value;
+          }
+        }
+      }
+
+      // 2-phase scatter flagged items to output offsets
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false
+      //
+      // The exclusive scan causes each head flag to be paired with
+      // the previous value aggregate:
+      //   * the scatter offsets must be decremented for value aggregates
+      //
+      THRUST_DEVICE_FUNCTION void scatter_two_phase(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_type num_tile_segments,
+          size_type num_tile_segments_prefix)
+      {
+        using core::sync_threadblock;
+
+        sync_threadblock();
+
+        // Compact and scatter keys
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (segment_flags[ITEM])
+          {
+            storage.raw_exchange[segment_indices[ITEM] -
+                                 num_tile_segments_prefix] = scatter_items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+          size_type        idx  = num_tile_segments_prefix + item;
+          key_value_pair_t pair = storage.raw_exchange[item];
+          keys_output_it[idx]   = pair.key;
+          values_output_it[idx] = pair.value;
+        }
+      }
+
+
+      // Scatter flagged items
+      //
+      THRUST_DEVICE_FUNCTION void scatter(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_type num_tile_segments,
+          size_type num_tile_segments_prefix)
+      {
+        // Do a one-phase scatter if (a) two-phase is disabled or
+        // (b) the average number of selected items per thread is less than one
+        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+        {
+          scatter_two_phase(scatter_items,
+                            segment_flags,
+                            segment_indices,
+                            num_tile_segments,
+                            num_tile_segments_prefix);
+        }
+        else
+        {
+          scatter_direct(scatter_items,
+                         segment_flags,
+                         segment_indices);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Finalization utility methods
+      //---------------------------------------------------------------------
+
+      // Finalize the carry-out from the last tile
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false)
+      THRUST_DEVICE_FUNCTION void
+      finalize_last_tile(size_type num_segments,
+                         size_type num_remaining,
+                         key_type    last_key,
+                         value_type  last_value)
+      {
+        // Last thread will output final count and last item, if necessary
+        if (threadIdx.x == BLOCK_THREADS - 1)
+        {
+          // If the last tile is a whole tile, the inclusive prefix
+          // contains accumulated value reduction for the last segment
+          if (num_remaining == ITEMS_PER_TILE)
+          {
+            // Scatter key and value
+            keys_output_it[num_segments]   = last_key;
+            values_output_it[num_segments] = last_value;
+            num_segments++;
+          }
+
+          // Output the total number of items selected
+          *num_runs_output_it = num_segments;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Cooperatively scan a device-wide sequence of tiles with other CTAs
+      //---------------------------------------------------------------------
+
+      // Process first tile of input (dynamic chained scan).
+      // Returns the running  count of segments
+      // and aggregated values (including this tile)
+      //
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_first_tile(Size           num_remaining,
+                         Size           tile_offset,
+                         ScanTileState &tile_state)
+      {
+        using core::sync_threadblock;
+
+        key_type          keys[ITEMS_PER_THREAD];               // Tile keys
+        key_type          pred_keys[ITEMS_PER_THREAD];          // Tile keys shifted up (predecessor)
+        value_type        values[ITEMS_PER_THREAD];             // Tile values
+        size_type         segment_flags[ITEMS_PER_THREAD];      // Segment head flags
+        size_type         segment_indices[ITEMS_PER_THREAD];    // Segment indices
+        size_value_pair_t scan_items[ITEMS_PER_THREAD];         // Zipped values and segment flags|indices
+        key_value_pair_t  scatter_items[ITEMS_PER_THREAD];      // Zipped key value pairs for scattering
+
+        // Load keys (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset,
+                    keys,
+                    num_remaining,
+                    *(keys_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset, keys);
+        }
+
+        sync_threadblock();
+
+        // Load values (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset,
+                    values,
+                    num_remaining,
+                    *(values_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset, values);
+        }
+
+        sync_threadblock();
+
+        // Set head segment_flags.
+        // First tile sets the first flag for the first item
+        BlockDiscontinuityKeys(storage.discontinuity)
+            .FlagHeads(segment_flags, keys, pred_keys, inequality_op);
+
+        // Unset the flag for the first item in the first tile
+        // so we won't scatter it
+        //
+        if (threadIdx.x == 0)
+          segment_flags[0] = 0;
+
+        // Zip values and segment_flags
+        zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                           values,
+                                           segment_flags,
+                                           scan_items);
+
+        // Exclusive scan of values and segment_flags
+        size_value_pair_t tile_aggregate;
+        scan_tile(scan_items, tile_aggregate, is_true<HAS_IDENTITY_ZERO>());
+
+        if (threadIdx.x == 0)
+        {
+          // Update tile status if this is not the last tile
+          if (!IS_LAST_TILE)
+            tile_state.SetInclusive(0, tile_aggregate);
+
+          // Initialize the segment index for the first scan item if necessary
+          // (the exclusive prefix for the first item is garbage)
+          if (!HAS_IDENTITY_ZERO)
+            scan_items[0].key = 0;
+        }
+
+        // Unzip values and segment indices
+        zip_keys_and_values(pred_keys,
+                            segment_indices,
+                            scan_items,
+                            scatter_items);
+
+        // Scatter flagged items
+        scatter(scatter_items,
+                segment_flags,
+                segment_indices,
+                tile_aggregate.key,
+                0);
+
+        if (IS_LAST_TILE)
+        {
+          // Finalize the carry-out from the last tile
+          finalize_last_tile(tile_aggregate.key,
+                             num_remaining,
+                             keys[ITEMS_PER_THREAD - 1],
+                             tile_aggregate.value);
+        }
+      }
+
+      // Process subsequent tile of input (dynamic chained scan).
+      // Returns the running count of segments
+      // and aggregated values (including this tile)
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_subsequent_tile(Size           num_remaining,
+                              int            tile_idx,
+                              Size           tile_offset,
+                              ScanTileState &tile_state)
+      {
+        using core::sync_threadblock;
+
+        key_type          keys[ITEMS_PER_THREAD];               // Tile keys
+        key_type          pred_keys[ITEMS_PER_THREAD];          // Tile keys shifted up (predecessor)
+        value_type        values[ITEMS_PER_THREAD];             // Tile values
+        size_type         segment_flags[ITEMS_PER_THREAD];      // Segment head flags
+        size_type         segment_indices[ITEMS_PER_THREAD];    // Segment indices
+        size_value_pair_t scan_items[ITEMS_PER_THREAD];         // Zipped values and segment flags|indices
+        key_value_pair_t  scatter_items[ITEMS_PER_THREAD];      // Zipped key value pairs for scattering
+
+        // Load keys (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset,
+                    keys,
+                    num_remaining,
+                    *(keys_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset, keys);
+        }
+
+        key_type tile_pred_key = (threadIdx.x == 0)
+                                     ? keys_load_it[tile_offset - 1]
+                                     : key_type();
+
+        sync_threadblock();
+
+        // Load values (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset,
+                    values,
+                    num_remaining,
+                    *(values_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset, values);
+        }
+
+        sync_threadblock();
+
+        // Set head segment_flags
+        BlockDiscontinuityKeys(storage.discontinuity)
+            .FlagHeads(segment_flags,
+                       keys,
+                       pred_keys,
+                       inequality_op,
+                       tile_pred_key);
+
+        // Zip values and segment_flags
+        zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                           values,
+                                           segment_flags,
+                                           scan_items);
+
+        // Exclusive scan of values and segment_flags
+        size_value_pair_t  tile_aggregate;
+        TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
+        scan_tile(scan_items,
+                  tile_aggregate,
+                  prefix_op,
+                  is_true<HAS_IDENTITY_ZERO>());
+        size_value_pair_t tile_inclusive_prefix = prefix_op.GetInclusivePrefix();
+
+        // Unzip values and segment indices
+        zip_keys_and_values(pred_keys, segment_indices, scan_items, scatter_items);
+
+        // Scatter flagged items
+        scatter(scatter_items,
+                segment_flags,
+                segment_indices,
+                tile_aggregate.key,
+                prefix_op.GetExclusivePrefix().key);
+
+        if (IS_LAST_TILE)
+        {
+          // Finalize the carry-out from the last tile
+          finalize_last_tile(tile_inclusive_prefix.key,
+                             num_remaining,
+                             keys[ITEMS_PER_THREAD - 1],
+                             tile_inclusive_prefix.value);
+        }
+      }
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(size_type      num_remaining,
+                   int            tile_idx,
+                   size_type      tile_offset,
+                   ScanTileState &tile_state)
+      {
+        if (tile_idx == 0)
+        {
+          consume_first_tile<IS_LAST_TILE>(num_remaining,
+                                           tile_offset,
+                                           tile_state);
+        }
+        else
+        {
+          consume_subsequent_tile<IS_LAST_TILE>(num_remaining,
+                                                tile_idx,
+                                                tile_offset,
+                                                tile_state);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor : consume_range
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &   storage_,
+                                  KeysInputIt     keys_input_it_,
+                                  ValuesInputIt   values_input_it_,
+                                  KeysOutputIt    keys_output_it_,
+                                  ValuesOutputIt  values_output_it_,
+                                  NumRunsOutputIt num_runs_output_it_,
+                                  EqualityOp      equality_op_,
+                                  ReductionOp     reduction_op_,
+                                  Size            num_items,
+                                  int             /*num_tiles*/,
+                                  ScanTileState & tile_state)
+          : storage(storage_),
+            keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_)),
+            values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_)),
+            keys_output_it(keys_output_it_),
+            values_output_it(values_output_it_),
+            num_runs_output_it(num_runs_output_it_),
+            inequality_op(equality_op_),
+            scan_op(reduction_op_)
+      {
+        // Blocks are launched in increasing order,
+        // so just assign one tile per block
+        //
+        int  tile_idx          = blockIdx.x;
+        Size tile_offset       = tile_idx * ITEMS_PER_TILE;
+        Size num_remaining     = num_items - tile_offset;
+
+        if (num_remaining > ITEMS_PER_TILE)
+        {
+          // Not the last tile (full)
+          consume_tile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+          // The last tile (possibly partially-full)
+          consume_tile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysInputIt     keys_input_it,
+                       ValuesInputIt   values_input_it,
+                       KeysOutputIt    keys_output_it,
+                       ValuesOutputIt  values_output_it,
+                       NumRunsOutputIt num_runs_output_it,
+                       ScanTileState   tile_state,
+                       EqualityOp      equality_op,
+                       ReductionOp     reduction_op,
+                       Size            num_items,
+                       int             num_tiles,
+                       char *          shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(storage,
+           keys_input_it,
+           values_input_it,
+           keys_output_it,
+           values_output_it,
+           num_runs_output_it,
+           equality_op,
+           reduction_op,
+           num_items,
+           num_tiles,
+           tile_state);
+    }
+
+  };    // struct ReduceByKeyAgent
+
+  template <class ScanTileState,
+            class Size,
+            class NumSelectedIt>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+  }; // struct InitAgent
+
+  template <class KeysInputIt,
+            class ValuesInputIt,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class NumRunsOutputIt,
+            class EqualityOp,
+            class ReductionOp,
+            class Size>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *          d_temp_storage,
+            size_t &        temp_storage_bytes,
+            KeysInputIt     keys_input_it,
+            ValuesInputIt   values_input_it,
+            KeysOutputIt    keys_output_it,
+            ValuesOutputIt  values_output_it,
+            NumRunsOutputIt num_runs_output_it,
+            EqualityOp      equality_op,
+            ReductionOp     reduction_op,
+            Size            num_items,
+            cudaStream_t    stream,
+            bool            debug_sync)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        ReduceByKeyAgent<KeysInputIt,
+                         ValuesInputIt,
+                         KeysOutputIt,
+                         ValuesOutputIt,
+                         EqualityOp,
+                         ReductionOp,
+                         NumRunsOutputIt,
+                         Size> >
+        reduce_by_key_agent;
+
+    typedef typename reduce_by_key_agent::ScanTileState ScanTileState;
+    typedef AgentLauncher<
+        InitAgent<ScanTileState,
+                  Size,
+                  NumRunsOutputIt> >
+        init_agent;
+
+    AgentPlan reduce_by_key_plan = reduce_by_key_agent::get_plan(stream);
+    AgentPlan init_plan          = init_agent::get_plan();
+
+    // Number of input tiles
+    int  tile_size = reduce_by_key_plan.items_per_tile;
+    Size num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size,
+                                           num_tiles);
+
+    size_t allocation_sizes[2] = {9, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "reduce_by_key::init_agent", debug_sync);
+    ia.launch(tile_state, num_tiles, num_runs_output_it);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    reduce_by_key_agent rbka(reduce_by_key_plan,
+                             num_items,
+                             stream,
+                             vshmem_ptr,
+                             "reduce_by_keys::reduce_by_key_agent",
+                             debug_sync);
+    rbka.launch(keys_input_it,
+                values_input_it,
+                keys_output_it,
+                values_output_it,
+                num_runs_output_it,
+                tile_state,
+                equality_op,
+                reduction_op,
+                num_items,
+                num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename KeysInputIt,
+            typename ValuesInputIt,
+            typename KeysOutputIt,
+            typename ValuesOutputIt,
+            typename EqualityOp,
+            typename ReductionOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  reduce_by_key(execution_policy<Derived>& policy,
+                KeysInputIt                keys_first,
+                KeysInputIt                keys_last,
+                ValuesInputIt              values_first,
+                KeysOutputIt               keys_output,
+                ValuesOutputIt             values_output,
+                EqualityOp                 equality_op,
+                ReductionOp                reduction_op)
+  {
+    typedef int size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(keys_first, keys_last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    if (num_items == 0)
+      return thrust::make_pair(keys_output, values_output);
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       keys_first,
+                       values_first,
+                       keys_output,
+                       values_output,
+                       reinterpret_cast<size_type*>(NULL),
+                       equality_op,
+                       reduction_op,
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "reduce_by_key failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
+
+    size_type* d_num_runs_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       keys_first,
+                       values_first,
+                       keys_output,
+                       values_output,
+                       d_num_runs_out,
+                       equality_op,
+                       reduction_op,
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "reduce_by_key failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "reduce_by_key: failed to synchronize");
+
+    int num_runs_out = cuda_cub::get_value(policy, d_num_runs_out);
+
+    return thrust::make_pair(
+      keys_output + num_runs_out,
+      values_output + num_runs_out
+    );
+  }
+
+}    // namespace __reduce_by_key
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred,
+          class BinaryOp>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output,
+              BinaryPred                 binary_pred,
+              BinaryOp                   binary_op)
+{
+  pair<KeyOutputIt, ValOutputIt> ret = thrust::make_pair(keys_output, values_output);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __reduce_by_key::reduce_by_key(policy,
+                                         keys_first,
+                                         keys_last,
+                                         values_first,
+                                         keys_output,
+                                         values_output,
+                                         binary_pred,
+                                         binary_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys_first,
+                                keys_last,
+                                values_first,
+                                keys_output,
+                                values_output,
+                                binary_pred,
+                                binary_op);
+#endif
+  }
+  return ret;
+}
+
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output,
+              BinaryPred                 binary_pred)
+{
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_output_iterator<ValOutputIt>::value,
+    thrust::iterator_value<ValInputIt>,
+    thrust::iterator_value<ValOutputIt>
+  >::type value_type;
+  return cuda_cub::reduce_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              keys_output,
+                              values_output,
+                              binary_pred,
+                              plus<value_type>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output)
+{
+  typedef typename thrust::iterator_value<KeyInputIt>::type KeyT;
+  return cuda_cub::reduce_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              keys_output,
+                              values_output,
+                              equal_to<KeyT>());
+}
+
+} // namespace cuda_
+
+} // end namespace thrust
+
+#include <thrust/memory.h>
+#include <thrust/reduce.h>
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/remove.h b/thrust/thrust/system/cuda/detail/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..700c95f23ccd6dd55160e2284e211289cecbdb89
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/remove.h
@@ -0,0 +1,134 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/copy_if.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+// in-place
+  
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class Predicate>
+InputIt __host__ __device__
+remove_if(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          StencilIt                  stencil,
+          Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, stencil, first,
+    thrust::detail::not1(predicate));
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+remove_if(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, first,
+    thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+remove(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last,
+       const T &                  value)
+{
+  using thrust::placeholders::_1;
+
+  return cuda_cub::remove_if(policy, first, last, _1 == value);
+}
+
+// copy
+
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class OutputIt,
+          class Predicate>
+OutputIt __host__ __device__
+remove_copy_if(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               StencilIt                  stencil,
+               OutputIt                   result,
+               Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, stencil, result,
+    thrust::detail::not1(predicate));
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class Predicate>
+OutputIt __host__ __device__
+remove_copy_if(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               OutputIt                   result,
+               Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, result,
+    thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T>
+OutputIt __host__ __device__
+remove_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result,
+            const T &                  value)
+{
+  thrust::detail::equal_to_value<T> pred(value);
+  return cuda_cub::remove_copy_if(policy, first, last, result, pred);
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/replace.h b/thrust/thrust/system/cuda/detail/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bd685108fb112e11148229647f7a8d4f9f5cd54
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/replace.h
@@ -0,0 +1,213 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/detail/internal_functional.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+  namespace __replace
+  {
+    template<class T>
+    struct constant_f
+    {
+      T value;
+
+      THRUST_FUNCTION
+      constant_f(T const &x) : value(x) {}
+
+      template<class U>
+      THRUST_DEVICE_FUNCTION
+      T operator()(U const &)  const
+      {
+        return value;
+      }
+    }; // struct constant_f
+
+    template<class Predicate, class NewType, class OutputType>
+    struct new_value_if_f
+    {
+      Predicate pred;
+      NewType new_value;
+
+      THRUST_FUNCTION
+      new_value_if_f(Predicate pred_, NewType new_value_)
+          : pred(pred_), new_value(new_value_) {}
+
+      template<class T>
+      OutputType THRUST_DEVICE_FUNCTION
+      operator()(T const &x)
+      {
+        return pred(x) ? new_value : x;
+      }
+
+      template<class T, class P>
+      OutputType THRUST_DEVICE_FUNCTION
+      operator()(T const &x, P const& y)
+      {
+        return pred(y) ? new_value : x;
+      }
+    }; // struct new_value_if_f
+
+  } // namespace __replace
+
+template <class Derived,
+          class Iterator,
+          class T>
+void __host__ __device__
+replace(execution_policy<Derived> &policy,
+        Iterator                   first,
+        Iterator                   last,
+        T const &                  old_value,
+        T const &                  new_value)
+{
+  using thrust::placeholders::_1;
+
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      _1 == old_value);
+}
+
+template <class Derived,
+          class Iterator,
+          class Predicate,
+          class T>
+void __host__ __device__
+replace_if(execution_policy<Derived> &policy,
+           Iterator                   first,
+           Iterator                   last,
+           Predicate                  pred,
+           T const &                  new_value)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      pred);
+}
+
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate,
+          class T>
+void __host__ __device__
+replace_if(execution_policy<Derived> &policy,
+           Iterator                   first,
+           Iterator                   last,
+           StencilIt                  stencil,
+           Predicate                  pred,
+           T const &                  new_value)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      stencil,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      pred);
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class Predicate,
+          class T>
+OutputIt __host__ __device__
+replace_copy_if(execution_policy<Derived> &policy,
+                InputIt                    first,
+                InputIt                    last,
+                OutputIt                   result,
+                Predicate                  predicate,
+                T const &                  new_value)
+{
+  typedef typename iterator_traits<OutputIt>::value_type output_type;
+  typedef __replace::new_value_if_f<Predicate, T, output_type> new_value_if_t;
+  return cuda_cub::transform(policy,
+                             first,
+                             last,
+                             result,
+                             new_value_if_t(predicate, new_value));
+}
+
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class OutputIt,
+          class Predicate,
+          class T>
+OutputIt __host__ __device__
+replace_copy_if(execution_policy<Derived> &policy,
+                InputIt                    first,
+                InputIt                    last,
+                StencilIt                  stencil,
+                OutputIt                   result,
+                Predicate                  predicate,
+                T const &                  new_value)
+{
+  typedef typename iterator_traits<OutputIt>::value_type output_type;
+  typedef __replace::new_value_if_f<Predicate, T, output_type> new_value_if_t;
+  return cuda_cub::transform(policy,
+                           first,
+                           last,
+                           stencil,
+                           result,
+                           new_value_if_t(predicate, new_value));
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T>
+OutputIt __host__ __device__
+replace_copy(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             OutputIt                   result,
+             T const &                  old_value,
+             T const &                  new_value)
+{
+  return cuda_cub::replace_copy_if(policy,
+                                   first,
+                                   last,
+                                   result,
+                                   thrust::detail::equal_to_value<T>(old_value),
+                                   new_value);
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/reverse.h b/thrust/thrust/system/cuda/detail/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..955825217d0857720bccfe0241704b679f80504f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/reverse.h
@@ -0,0 +1,98 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived, class ItemsIt, class ResultIt>
+ResultIt __host__ __device__
+reverse_copy(execution_policy<Derived> &policy,
+             ItemsIt                    first,
+             ItemsIt                    last,
+             ResultIt                   result);
+
+template <class Derived, class ItemsIt>
+void __host__ __device__
+reverse(execution_policy<Derived> &policy,
+        ItemsIt                    first,
+        ItemsIt                    last);
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/advance.h>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/swap_ranges.h>
+#include <thrust/system/cuda/detail/copy.h>
+#include <thrust/iterator/reverse_iterator.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+reverse_copy(execution_policy<Derived> &policy,
+             ItemsIt                    first,
+             ItemsIt                    last,
+             ResultIt                   result)
+{
+  return cuda_cub::copy(policy,
+                        make_reverse_iterator(last),
+                        make_reverse_iterator(first),
+                        result);
+}
+
+template <class Derived,
+          class ItemsIt>
+void __host__ __device__
+reverse(execution_policy<Derived> &policy,
+        ItemsIt                    first,
+        ItemsIt                    last)
+{
+  typedef typename thrust::iterator_difference<ItemsIt>::type difference_type;
+
+  // find the midpoint of [first,last)
+  difference_type N = thrust::distance(first, last);
+  ItemsIt mid(first);
+  thrust::advance(mid, N / 2);
+
+  cuda_cub::swap_ranges(policy, first, mid, make_reverse_iterator(last));
+}
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c3cfefec7290d2a80036d1edfb84b2b0cd5f1b4
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/scan.h
@@ -0,0 +1,928 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/functional.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cub/device/device_scan.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename AssociativeOperator>
+__host__ __device__ OutputIterator
+inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               InputIterator                                               first,
+               InputIterator                                               last,
+               OutputIterator                                              result,
+               AssociativeOperator                                         binary_op);
+
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename T,
+          typename AssociativeOperator>
+__host__ __device__ OutputIterator
+exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+               InputIterator                                               first,
+               InputIterator                                               last,
+               OutputIterator                                              result,
+               T                                                           init,
+               AssociativeOperator                                         binary_op);
+} // end namespace thrust
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __scan {
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class>
+  struct WarpSize { enum { value = 32 }; };
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
+            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+
+  // Scale the number of warps to keep same amount of "tile" storage
+  // as the nominal configuration for 4B data.  Minimum of two warps.
+  //
+  template<class Arch, int NOMINAL_4B_BLOCK_THREADS, class T>
+  struct THRUST_BLOCK_THREADS
+  {
+    enum
+    {
+      value = mpl::min<int,
+                       NOMINAL_4B_BLOCK_THREADS,
+                       mpl::max<int,
+                                3,
+                                ((NOMINAL_4B_BLOCK_THREADS /
+                                  WarpSize<Arch>::value) *
+                                 4) /
+                                    sizeof(T)>::value *
+                           WarpSize<Arch>::value>::value
+    };
+  }; // struct THRUST_BLOCK_THREADS
+
+  // If necessary, scale down number of items per thread to keep
+  // the same amount of "tile" storage as the nominal configuration for 4B data.
+  // Minimum 1 item per thread
+  //
+  template <class Arch,
+            int NOMINAL_4B_ITEMS_PER_THREAD,
+            int NOMINAL_4B_BLOCK_THREADS,
+            class T>
+  struct THRUST_ITEMS_PER_THREAD
+  {
+    enum
+    {
+      value = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              (NOMINAL_4B_ITEMS_PER_THREAD *
+               NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) /
+                  THRUST_BLOCK_THREADS<Arch,
+                                       NOMINAL_4B_BLOCK_THREADS,
+                                       T>::value>::value>::value
+    };
+  };
+
+
+  template <class Arch, class T, class U>
+  struct Tuning;
+
+  template<class T, class U>
+  struct Tuning<sm30,T,U>
+  {
+    typedef sm30 Arch;
+    enum
+    {
+      NOMINAL_4B_BLOCK_THREADS    = 256,
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+    };
+
+    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
+                                           NOMINAL_4B_BLOCK_THREADS,
+                                           T>::value,
+                      THRUST_ITEMS_PER_THREAD<Arch,
+                                              NOMINAL_4B_ITEMS_PER_THREAD,
+                                              NOMINAL_4B_BLOCK_THREADS,
+                                              T>::value,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                      cub::BLOCK_SCAN_RAKING_MEMOIZE>
+        type;
+  };    // struct Tuning for sm30
+
+  template<class T, class U>
+  struct Tuning<sm35,T,U>
+  {
+    typedef sm35 Arch;
+    enum
+    {
+      NOMINAL_4B_BLOCK_THREADS    = 128,
+      NOMINAL_4B_ITEMS_PER_THREAD = 12,
+    };
+
+    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
+                                           NOMINAL_4B_BLOCK_THREADS,
+                                           T>::value,
+                      THRUST_ITEMS_PER_THREAD<Arch,
+                                              NOMINAL_4B_ITEMS_PER_THREAD,
+                                              NOMINAL_4B_BLOCK_THREADS,
+                                              T>::value,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                      cub::BLOCK_SCAN_RAKING>
+        type;
+  };    // struct Tuning for sm35
+
+  template<class T, class U>
+  struct Tuning<sm52,T,U>
+  {
+    typedef sm52 Arch;
+    enum
+    {
+      NOMINAL_4B_BLOCK_THREADS    = 128,
+      NOMINAL_4B_ITEMS_PER_THREAD = 12,
+    };
+
+    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
+                                           NOMINAL_4B_BLOCK_THREADS,
+                                           T>::value,
+                      THRUST_ITEMS_PER_THREAD<Arch,
+                                              NOMINAL_4B_ITEMS_PER_THREAD,
+                                              NOMINAL_4B_BLOCK_THREADS,
+                                              T>::value,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                      cub::BLOCK_SCAN_RAKING>
+        type;
+  };    // struct Tuning for sm52
+
+  template <class InputIt,
+            class OutputIt,
+            class ScanOp,
+            class Size,
+            class T,
+            class Inclusive>
+  struct ScanAgent
+  {
+    typedef cub::ScanTileState<T> ScanTileState;
+    typedef cub::BlockScanRunningPrefixOp<T, ScanOp> RunningPrefixCallback;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,T,T>::type
+    {
+      typedef Tuning<Arch, T, T> tuning;
+
+
+      typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
+      typedef typename core::BlockLoad<PtxPlan, LoadIt, T>::type    BlockLoad;
+      typedef typename core::BlockStore<PtxPlan, OutputIt, T>::type BlockStore;
+
+      typedef cub::TilePrefixCallbackOp<T, ScanOp, ScanTileState, Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<T,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      union TempStorage
+      {
+        typename BlockLoad::TempStorage  load;
+        typename BlockStore::TempStorage store;
+
+        struct
+        {
+          typename TilePrefixCallback::TempStorage prefix;
+          typename BlockScan::TempStorage          scan;
+        };
+      };    // struct TempStorage
+    };    // struct PtxPlan
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::LoadIt             LoadIt;
+    typedef typename ptx_plan::BlockLoad          BlockLoad;
+    typedef typename ptx_plan::BlockStore         BlockStore;
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan          BlockScan;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      INCLUSIVE        = Inclusive::value,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
+
+      SYNC_AFTER_LOAD = (ptx_plan::LOAD_ALGORITHM != cub::BLOCK_LOAD_DIRECT),
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage &storage;
+      ScanTileState &tile_state;
+      LoadIt load_it;
+      OutputIt output_it;
+      ScanOp scan_op;
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods (first tile)
+      //---------------------------------------------------------------------
+
+      // Exclusive scan specialization
+      //
+      template <class _ScanOp>
+      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            _ScanOp scan_op,
+                                            T &     block_aggregate,
+                                            thrust::detail::false_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, block_aggregate);
+      }
+
+      // Exclusive sum specialization
+      //
+      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            plus<T> /*scan_op*/,
+                                            T &     block_aggregate,
+                                            thrust::detail::false_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).ExclusiveSum(items, items, block_aggregate);
+      }
+
+      // Inclusive scan specialization
+      //
+      template <typename _ScanOp>
+      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            _ScanOp scan_op,
+                                            T &     block_aggregate,
+                                            thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+      }
+
+
+      // Inclusive sum specialization
+      //
+      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            plus<T> /*scan_op*/,
+                                            T &     block_aggregate,
+                                            thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).InclusiveSum(items, items, block_aggregate);
+      }
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods (subsequent tiles)
+      //---------------------------------------------------------------------
+
+      // Exclusive scan specialization (with prefix from predecessors)
+      //
+      template <class _ScanOp, class PrefixCallback>
+      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            _ScanOp         scan_op,
+                                            T &             block_aggregate,
+                                            PrefixCallback &prefix_op,
+                                            thrust::detail::false_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
+        block_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Exclusive sum specialization (with prefix from predecessors)
+      //
+      template <class PrefixCallback>
+      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            plus<T>         /*scan_op*/,
+                                            T &             block_aggregate,
+                                            PrefixCallback &prefix_op,
+                                            thrust::detail::false_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).ExclusiveSum(items, items, prefix_op);
+        block_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Inclusive scan specialization (with prefix from predecessors)
+      //
+      template <class _ScanOp, class PrefixCallback>
+      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            _ScanOp         scan_op,
+                                            T &             block_aggregate,
+                                            PrefixCallback &prefix_op,
+                                            thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
+        block_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Inclusive sum specialization (with prefix from predecessors)
+      //
+      template <class U, class PrefixCallback>
+      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
+                                            plus<T>         /*scan_op*/,
+                                            T &             block_aggregate,
+                                            PrefixCallback &prefix_op,
+                                            thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan).InclusiveSum(items, items, prefix_op);
+        block_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      //---------------------------------------------------------------------
+      // Cooperatively scan a device-wide sequence of tiles with other CTAs
+      //---------------------------------------------------------------------
+
+      // Process a tile of input (dynamic chained scan)
+      //
+      template <bool IS_FULL_TILE, class AddInitToExclusive>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(Size               /*num_items*/,
+                   Size               num_remaining,
+                   int                tile_idx,
+                   Size               tile_base,
+                   AddInitToExclusive add_init_to_exclusive_scan)
+      {
+        using core::sync_threadblock;
+
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (IS_FULL_TILE)
+        {
+          BlockLoad(storage.load).Load(load_it + tile_base, items);
+        }
+        else
+        {
+          // Fill last element with the first element
+          // because collectives are not suffix guarded
+          BlockLoad(storage.load)
+              .Load(load_it + tile_base,
+                    items,
+                    num_remaining,
+                    *(load_it + tile_base));
+        }
+
+        if (SYNC_AFTER_LOAD)
+          sync_threadblock();
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+          // Scan first tile
+          T block_aggregate;
+          scan_tile(items, scan_op, block_aggregate, Inclusive());
+
+          // Update tile status if there may be successor tiles (i.e., this tile is full)
+          if (IS_FULL_TILE && (threadIdx.x == 0))
+            tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+          // Scan non-first tile
+          T                  block_aggregate;
+          TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
+          scan_tile(items, scan_op, block_aggregate, prefix_op, Inclusive());
+        }
+
+        sync_threadblock();
+
+        add_init_to_exclusive_scan(items, tile_idx);
+
+        // Store items
+        if (IS_FULL_TILE)
+        {
+          BlockStore(storage.store).Store(output_it + tile_base, items);
+        }
+        else
+        {
+          BlockStore(storage.store).Store(output_it + tile_base, items, num_remaining);
+        }
+      }
+
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      // Dequeue and scan tiles of items as part of a dynamic chained scan
+      // with Init
+      template <class AddInitToExclusiveScan>
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &          storage_,
+           ScanTileState &        tile_state_,
+           InputIt                input_it,
+           OutputIt               output_it_,
+           ScanOp                 scan_op_,
+           Size                   num_items,
+           AddInitToExclusiveScan add_init_to_exclusive_scan)
+          : storage(storage_),
+            tile_state(tile_state_),
+            load_it(core::make_load_iterator(ptx_plan(), input_it)),
+            output_it(output_it_),
+            scan_op(scan_op_)
+      {
+        int  tile_idx      = blockIdx.x;
+        Size tile_base     = ITEMS_PER_TILE * tile_idx;
+        Size num_remaining = num_items - tile_base;
+
+        if (num_remaining > ITEMS_PER_TILE)
+        {
+          // Full tile
+          consume_tile<true>(num_items,
+                             num_remaining,
+                             tile_idx,
+                             tile_base,
+                             add_init_to_exclusive_scan);
+        }
+        else if (num_remaining > 0)
+        {
+          // Partially-full tile
+          consume_tile<false>(num_items,
+                              num_remaining,
+                              tile_idx,
+                              tile_base,
+                              add_init_to_exclusive_scan);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    template <class AddInitToExclusiveScan>
+    THRUST_AGENT_ENTRY(InputIt                input_it,
+                       OutputIt               output_it,
+                       ScanOp                 scan_op,
+                       Size                   num_items,
+                       ScanTileState          tile_state,
+                       AddInitToExclusiveScan add_init_to_exclusive_scan,
+                       char *                 shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+      impl(storage,
+           tile_state,
+           input_it,
+           output_it,
+           scan_op,
+           num_items,
+           add_init_to_exclusive_scan);
+    }
+  };    // struct ScanAgent
+
+  template <class ScanTileState,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+    }
+
+  }; // struct InitAgent
+
+  template<class T>
+  struct DoNothing
+  {
+    typedef T     type;
+    template <int ITEMS_PER_THREAD>
+    THRUST_DEVICE_FUNCTION void
+    operator()(T (&items)[ITEMS_PER_THREAD], int /*tile_idx*/)
+    {
+      THRUST_UNUSED_VAR(items);
+    }
+  };    // struct DoNothing
+
+  template<class T, class ScanOp>
+  struct AddInitToExclusiveScan
+  {
+    typedef T type;
+    T         init;
+    ScanOp    scan_op;
+
+    THRUST_RUNTIME_FUNCTION
+    AddInitToExclusiveScan(T init_, ScanOp scan_op_)
+        : init(init_), scan_op(scan_op_) {}
+
+    template <int ITEMS_PER_THREAD>
+    THRUST_DEVICE_FUNCTION void
+    operator()(T (&items)[ITEMS_PER_THREAD], int tile_idx)
+    {
+      if (tile_idx == 0 && threadIdx.x == 0)
+      {
+        items[0] = init;
+        for (int i = 1; i < ITEMS_PER_THREAD; ++i)
+          items[i] = scan_op(init, items[i]);
+      }
+      else
+      {
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+          items[i] = scan_op(init, items[i]);
+      }
+    }
+  };    // struct AddInitToExclusiveScan
+
+  template <class Inclusive,
+            class InputIt,
+            class OutputIt,
+            class ScanOp,
+            class Size,
+            class AddInitToExclusiveScan>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *                 d_temp_storage,
+            size_t &               temp_storage_bytes,
+            InputIt                input_it,
+            Size                   num_items,
+            AddInitToExclusiveScan add_init_to_exclusive_scan,
+            OutputIt               output_it,
+            ScanOp                 scan_op,
+            cudaStream_t           stream,
+            bool                   debug_sync)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef typename AddInitToExclusiveScan::type T;
+
+    typedef AgentLauncher<
+        ScanAgent<InputIt, OutputIt, ScanOp, Size, T, Inclusive> >
+        scan_agent;
+
+    typedef typename scan_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
+
+    AgentPlan scan_plan = scan_agent::get_plan(stream);
+    AgentPlan init_plan = init_agent::get_plan();
+
+    int tile_size = scan_plan.items_per_tile;
+    Size num_tiles = static_cast<Size>((num_items + tile_size - 1) / tile_size);
+
+    size_t vshmem_size = core::vshmem_size(scan_plan.shared_memory_size,
+                                           num_tiles);
+
+    size_t allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void* allocations[2] = {NULL, NULL};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_bytes,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
+
+    init_agent ia(init_plan, num_tiles, stream, "scan::init_agent", debug_sync);
+    ia.launch(tile_state, num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    scan_agent sa(scan_plan, num_items, stream, vshmem_ptr, "scan::scan_agent", debug_sync);
+    sa.launch(input_it,
+              output_it,
+              scan_op,
+              num_items,
+              tile_state,
+              add_init_to_exclusive_scan);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }    // func doit_step
+
+  template <typename Inclusive,
+            typename Derived,
+            typename InputIt,
+            typename OutputIt,
+            typename Size,
+            typename ScanOp,
+            typename AddInitToExclusiveScan>
+  THRUST_RUNTIME_FUNCTION
+  OutputIt scan(execution_policy<Derived>& policy,
+                InputIt                    input_it,
+                OutputIt                   output_it,
+                Size                       num_items,
+                ScanOp                     scan_op,
+                AddInitToExclusiveScan     add_init_to_exclusive_scan)
+  {
+    if (num_items == 0)
+      return output_it;
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH(status,
+                                doit_step<Inclusive>,
+                                num_items,
+                                (NULL,
+                                storage_size,
+                                input_it,
+                                num_items_fixed,
+                                add_init_to_exclusive_scan,
+                                output_it,
+                                scan_op,
+                                stream,
+                                debug_sync));
+    cuda_cub::throw_on_error(status, "scan failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    THRUST_INDEX_TYPE_DISPATCH(status,
+                                doit_step<Inclusive>,
+                                num_items,
+                                (ptr,
+                                storage_size,
+                                input_it,
+                                num_items_fixed,
+                                add_init_to_exclusive_scan,
+                                output_it,
+                                scan_op,
+                                stream,
+                                debug_sync));
+    cuda_cub::throw_on_error(status, "scan failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "scan failed to synchronize");
+
+    return output_it + num_items;
+  }    // func scan
+
+}    // namespace __scan
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class Size,
+          class OutputIt,
+          class ScanOp>
+OutputIt __host__ __device__
+inclusive_scan_n(execution_policy<Derived> &policy,
+                 InputIt                    first,
+                 Size                       num_items,
+                 OutputIt                   result,
+                 ScanOp                     scan_op)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typedef typename iterator_traits<InputIt>::value_type T;
+    ret = __scan::scan<thrust::detail::true_type>(policy,
+                                                  first,
+                                                  result,
+                                                  num_items,
+                                                  scan_op,
+                                                  __scan::DoNothing<T>());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                 first,
+                                 first + num_items,
+                                 result,
+                                 scan_op);
+#endif
+  }
+  return ret;
+}
+
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class ScanOp>
+OutputIt __host__ __device__
+inclusive_scan(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               OutputIt                   result,
+               ScanOp                     scan_op)
+{
+  typedef typename thrust::iterator_traits<InputIt>::difference_type diff_t;
+  diff_t num_items = thrust::distance(first, last);
+  return cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op);
+}
+
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+inclusive_scan(execution_policy<Derived> &policy,
+               InputIt                    first,
+               OutputIt                   last,
+               OutputIt                   result)
+{
+
+  typedef typename thrust::detail::eval_if<
+      thrust::detail::is_output_iterator<OutputIt>::value,
+      thrust::iterator_value<InputIt>,
+      thrust::iterator_value<OutputIt> >::type result_type;
+  return cuda_cub::inclusive_scan(policy, first, last, result, plus<result_type>());
+};
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class Size,
+          class OutputIt,
+          class T,
+          class ScanOp>
+OutputIt __host__ __device__
+exclusive_scan_n(execution_policy<Derived> &policy,
+                 InputIt                    first,
+                 Size                       num_items,
+                 OutputIt                   result,
+                 T                          init,
+                 ScanOp                     scan_op)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __scan::scan<thrust::detail::false_type>(
+        policy,
+        first,
+        result,
+        num_items,
+        scan_op,
+        __scan::AddInitToExclusiveScan<T, ScanOp>(init, scan_op));
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                 first,
+                                 first + num_items,
+                                 result,
+                                 init,
+                                 scan_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T,
+          class ScanOp>
+OutputIt __host__ __device__
+exclusive_scan(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               OutputIt                   result,
+               T                          init,
+               ScanOp                   scan_op)
+{
+  typedef typename thrust::iterator_traits<InputIt>::difference_type diff_t;
+  diff_t num_items = thrust::distance(first, last);
+  return cuda_cub::exclusive_scan_n(policy, first, num_items, result, init, scan_op);
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T>
+OutputIt __host__ __device__
+exclusive_scan(execution_policy<Derived> &policy,
+               InputIt                    first,
+               OutputIt                   last,
+               OutputIt                   result,
+               T                          init)
+{
+  return cuda_cub::exclusive_scan(policy, first, last, result, init, plus<T>());
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+exclusive_scan(execution_policy<Derived> &policy,
+               InputIt                    first,
+               OutputIt                   last,
+               OutputIt                   result)
+{
+  typedef typename thrust::detail::eval_if<
+      thrust::detail::is_output_iterator<OutputIt>::value,
+      thrust::iterator_value<InputIt>,
+      thrust::iterator_value<OutputIt>
+  >::type result_type;
+  return cuda_cub::exclusive_scan(policy, first, last, result, result_type(0));
+};
+
+} // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/scan.h>
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/scan_by_key.h b/thrust/thrust/system/cuda/detail/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..1744c9e8dbf70a77d56a13032f246b59373b80d3
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/scan_by_key.h
@@ -0,0 +1,1004 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __scan_by_key {
+  namespace mpl = thrust::detail::mpl::math;
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
+            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template <class Arch, class Key, class Value>
+  struct Tuning;
+
+  template <class Key, class Value>
+  struct Tuning<sm30, Key, Value>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm30
+
+  template <class Key, class Value>
+  struct Tuning<sm35, Key, Value> : Tuning<sm30, Key, Value>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD =
+          (Tuning::MAX_INPUT_BYTES <= 8)
+              ? 6
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         Tuning::COMBINED_INPUT_BYTES - 1) /
+                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm35
+
+  template <class Key, class Value>
+  struct Tuning<sm52, Key, Value> : Tuning<sm30, Key, Value>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+
+      ITEMS_PER_THREAD =
+          (Tuning::MAX_INPUT_BYTES <= 8)
+              ? 9
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         Tuning::COMBINED_INPUT_BYTES - 1) /
+                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm52
+
+  template <class KeysInputIt,
+            class ValuesInputIt,
+            class ValuesOutputIt,
+            class EqualityOp,
+            class ScanOp,
+            class Size,
+            class T,
+            class Inclusive>
+  struct ScanByKeyAgent
+  {
+    typedef typename iterator_traits<KeysInputIt>::value_type key_type;
+
+    typedef T    value_type;
+    typedef Size size_type;
+
+    typedef cub::KeyValuePair<size_type, value_type> size_value_pair_t;
+    typedef cub::KeyValuePair<key_type, value_type> key_value_pair_t;
+
+    typedef cub::ReduceByKeyScanTileState<value_type, size_type> ScanTileState;
+    typedef cub::ReduceBySegmentOp<ScanOp> ReduceBySegmentOp;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type, value_type>::type
+    {
+      typedef Tuning<Arch, key_type, value_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysInputIt>::type   KeysLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ValuesInputIt>::type ValuesLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt, key_type>::type     BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt, value_type>::type BlockLoadValues;
+
+      typedef typename core::BlockStore<PtxPlan,
+                                        ValuesOutputIt,
+                                        value_type>::type BlockStoreValues;
+
+      typedef cub::BlockDiscontinuity<key_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityKeys;
+
+      typedef cub::TilePrefixCallbackOp<size_value_pair_t,
+                                        ReduceBySegmentOp,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<size_value_pair_t,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage              scan;
+          typename TilePrefixCallback::TempStorage     prefix;
+          typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        };
+
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadValues::TempStorage load_values;
+
+        typename BlockStoreValues::TempStorage store_values;
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt   KeysLoadIt;
+    typedef typename ptx_plan::ValuesLoadIt ValuesLoadIt;
+
+    typedef typename ptx_plan::BlockLoadKeys    BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadValues  BlockLoadValues;
+    typedef typename ptx_plan::BlockStoreValues BlockStoreValues;
+
+    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
+    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan              BlockScan;
+    typedef typename ptx_plan::TempStorage            TempStorage;
+
+    enum
+    {
+      BLOCK_THREADS     = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD  = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE    = ptx_plan::ITEMS_PER_TILE,
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage &  storage;
+      ScanTileState &tile_state;
+
+      KeysLoadIt     keys_load_it;
+      ValuesLoadIt   values_load_it;
+      ValuesOutputIt values_output_it;
+
+      cub::InequalityWrapper<EqualityOp> inequality_op;
+      ReduceBySegmentOp                  scan_op;
+
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods (first tile)
+      //---------------------------------------------------------------------
+
+      // Exclusive scan specialization
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::false_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
+      }
+
+      // Inclusive scan specialization
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan)
+            .InclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
+      }
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods (subsequent tiles)
+      //---------------------------------------------------------------------
+
+      // Exclusive scan specialization (with prefix from predecessors)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::false_type /* is_incclusive */)
+      {
+        BlockScan(storage.scan)
+            .ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Inclusive scan specialization (with prefix from predecessors)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::true_type /* is_inclusive */)
+      {
+        BlockScan(storage.scan)
+            .InclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      //---------------------------------------------------------------------
+      // Zip utility methods
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      zip_values_and_flags(size_type num_remaining,
+                           value_type (&values)[ITEMS_PER_THREAD],
+                           size_type (&segment_flags)[ITEMS_PER_THREAD],
+                           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set segment_flags for first out-of-bounds item, zero for others
+          if (IS_LAST_TILE &&
+              Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining)
+            segment_flags[ITEM] = 1;
+
+          scan_items[ITEM].value = values[ITEM];
+          scan_items[ITEM].key   = segment_flags[ITEM];
+        }
+      }
+
+      THRUST_DEVICE_FUNCTION void unzip_values(
+          value_type (&values)[ITEMS_PER_THREAD],
+          size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          values[ITEM] = scan_items[ITEM].value;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Cooperatively scan a device-wide sequence of tiles with other CTAs
+      //---------------------------------------------------------------------
+
+      // Process a tile of input (dynamic chained scan)
+      //
+      template <bool IS_LAST_TILE, class AddInitToScan>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(Size          /*num_items*/,
+                   Size          num_remaining,
+                   int           tile_idx,
+                   Size          tile_base,
+                   AddInitToScan add_init_to_scan)
+      {
+        using core::sync_threadblock;
+
+        // Load items
+        key_type          keys[ITEMS_PER_THREAD];
+        value_type        values[ITEMS_PER_THREAD];
+        size_type         segment_flags[ITEMS_PER_THREAD];
+        size_value_pair_t scan_items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          // Fill last element with the first element
+          // because collectives are not suffix guarded
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_base,
+                    keys,
+                    num_remaining,
+                    *(keys_load_it + tile_base));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_base, keys);
+        }
+
+        sync_threadblock();
+
+        if (IS_LAST_TILE)
+        {
+          // Fill last element with the first element
+          // because collectives are not suffix guarded
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_base,
+                    values,
+                    num_remaining,
+                    *(values_load_it + tile_base));
+        }
+        else
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_base, values);
+        }
+
+        sync_threadblock();
+
+        // first tile
+        if (tile_idx == 0)
+        {
+          BlockDiscontinuityKeys(storage.discontinuity)
+            .FlagHeads(segment_flags, keys, inequality_op);
+
+          // Zip values and segment_flags
+          zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                             values,
+                                             segment_flags,
+                                             scan_items);
+
+          // Exclusive scan of values and segment_flags
+          size_value_pair_t tile_aggregate;
+          scan_tile(scan_items, tile_aggregate, Inclusive());
+
+          if (threadIdx.x == 0)
+          {
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, tile_aggregate);
+
+            scan_items[0].key = 0;
+          }
+        }
+        else
+        {
+          key_type tile_pred_key = (threadIdx.x == 0)
+                                       ? keys_load_it[tile_base - 1]
+                                       : key_type();
+          BlockDiscontinuityKeys(storage.discontinuity)
+              .FlagHeads(segment_flags,
+                         keys,
+                         inequality_op,
+                         tile_pred_key);
+
+          // Zip values and segment_flags
+          zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                             values,
+                                             segment_flags,
+                                             scan_items);
+
+          size_value_pair_t  tile_aggregate;
+          TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
+          scan_tile(scan_items, tile_aggregate, prefix_op, Inclusive());
+        }
+
+        sync_threadblock();
+
+        unzip_values(values, scan_items);
+
+        add_init_to_scan(values, segment_flags);
+
+        // Store items
+        if (IS_LAST_TILE)
+        {
+          BlockStoreValues(storage.store_values)
+            .Store(values_output_it + tile_base, values, num_remaining);
+        }
+        else
+        {
+          BlockStoreValues(storage.store_values)
+            .Store(values_output_it + tile_base, values);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      // Dequeue and scan tiles of items as part of a dynamic chained scan
+      // with Init functor
+      template <class AddInitToScan>
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &  storage_,
+           ScanTileState &tile_state_,
+           KeysInputIt    keys_input_it,
+           ValuesInputIt  values_input_it,
+           ValuesOutputIt values_output_it_,
+           EqualityOp     equality_op_,
+           ScanOp         scan_op_,
+           Size           num_items,
+           AddInitToScan  add_init_to_scan)
+          : storage(storage_),
+            tile_state(tile_state_),
+            keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it)),
+            values_load_it(core::make_load_iterator(ptx_plan(), values_input_it)),
+            values_output_it(values_output_it_),
+            inequality_op(equality_op_),
+            scan_op(scan_op_)
+      {
+        int  tile_idx      = blockIdx.x;
+        Size tile_base     = ITEMS_PER_TILE * tile_idx;
+        Size num_remaining = num_items - tile_base;
+
+        if (num_remaining > ITEMS_PER_TILE)
+        {
+          // Not the last tile (full)
+          consume_tile<false>(num_items,
+                              num_remaining,
+                              tile_idx,
+                              tile_base,
+                              add_init_to_scan);
+        }
+        else if (num_remaining > 0)
+        {
+          // The last tile (possibly partially-full)
+          consume_tile<true>(num_items,
+                             num_remaining,
+                             tile_idx,
+                             tile_base,
+                             add_init_to_scan);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    template <class AddInitToScan>
+    THRUST_AGENT_ENTRY(KeysInputIt    keys_input_it,
+                       ValuesInputIt  values_input_it,
+                       ValuesOutputIt values_output_it,
+                       EqualityOp     equaility_op,
+                       ScanOp         scan_op,
+                       ScanTileState  tile_state,
+                       Size           num_items,
+                       AddInitToScan  add_init_to_scan,
+                       char *         shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+      impl(storage,
+           tile_state,
+           keys_input_it,
+           values_input_it,
+           values_output_it,
+           equaility_op,
+           scan_op,
+           num_items,
+           add_init_to_scan);
+    }
+
+  };    // struct ScanByKeyAgent
+
+  template <class ScanTileState,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+    }
+  }; // struct InitAgent
+
+  template<class T>
+  struct DoNothing
+  {
+    typedef T     type;
+    template <int ITEMS_PER_THREAD, class Size>
+    THRUST_DEVICE_FUNCTION void
+    operator()(T (&/*items*/)[ITEMS_PER_THREAD],
+               Size (&/*flags*/)[ITEMS_PER_THREAD])
+    {
+    }
+  };    // struct DoNothing
+
+  template<class T, class ScanOp>
+  struct AddInitToScan
+  {
+    typedef T type;
+    T         init;
+    ScanOp    scan_op;
+
+    THRUST_RUNTIME_FUNCTION
+    AddInitToScan(T init_, ScanOp scan_op_)
+        : init(init_), scan_op(scan_op_) {}
+
+    template <int ITEMS_PER_THREAD, class Size>
+    THRUST_DEVICE_FUNCTION void
+    operator()(T (&items)[ITEMS_PER_THREAD],
+               Size (&flags)[ITEMS_PER_THREAD])
+    {
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        items[ITEM] = flags[ITEM] ? init : scan_op(init, items[ITEM]);
+      }
+    }
+  };    // struct AddInitToScan
+
+  template <class Inclusive,
+            class KeysInputIt,
+            class ValuesInputIt,
+            class ValuesOutputIt,
+            class EqualityOp,
+            class ScanOp,
+            class Size,
+            class AddInitToScan>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *         d_temp_storage,
+            size_t &       temp_storage_bytes,
+            KeysInputIt    keys_input_it,
+            ValuesInputIt  values_input_it,
+            Size           num_items,
+            ValuesOutputIt values_output_it,
+            EqualityOp     equality_op,
+            ScanOp         scan_op,
+            AddInitToScan  add_init_to_scan,
+            cudaStream_t   stream,
+            bool           debug_sync)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef typename AddInitToScan::type T;
+
+    typedef AgentLauncher<
+        ScanByKeyAgent<KeysInputIt,
+                       ValuesInputIt,
+                       ValuesOutputIt,
+                       EqualityOp,
+                       ScanOp,
+                       Size,
+                       T,
+                       Inclusive> >
+        scan_by_key_agent;
+
+    typedef typename scan_by_key_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
+
+    AgentPlan scan_by_key_plan = scan_by_key_agent::get_plan(stream);
+    AgentPlan init_plan        = init_agent::get_plan();
+
+    int tile_size = scan_by_key_plan.items_per_tile;
+    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_size = core::vshmem_size(scan_by_key_plan.shared_memory_size,
+                                           num_tiles);
+
+    size_t allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    status               = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
+
+    init_agent ia(init_plan, num_tiles, stream, "scan_by_key::init_agent", debug_sync);
+    ia.launch(tile_state, num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    scan_by_key_agent sbka(scan_by_key_plan, num_items, stream, vshmem_ptr, "scan_by_key::scan_agent", debug_sync);
+    sbka.launch(keys_input_it,
+                values_input_it,
+                values_output_it,
+                equality_op,
+                scan_op,
+                tile_state,
+                num_items,
+                add_init_to_scan);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }    // func doit_pass
+
+  template <typename Inclusive,
+            typename Derived,
+            typename KeysInputIt,
+            typename ValuesInputIt,
+            typename ValuesOutputIt,
+            typename EqualityOp,
+            typename ScanOp,
+            typename AddInitToScan>
+  THRUST_RUNTIME_FUNCTION
+  ValuesOutputIt scan_by_key(execution_policy<Derived>& policy,
+                             KeysInputIt                keys_first,
+                             KeysInputIt                keys_last,
+                             ValuesInputIt              values_first,
+                             ValuesOutputIt             values_result,
+                             EqualityOp                 equality_op,
+                             ScanOp                     scan_op,
+                             AddInitToScan              add_init_to_scan)
+  {
+    int          num_items    = static_cast<int>(thrust::distance(keys_first, keys_last));
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    if (num_items == 0)
+      return values_result;
+
+    cudaError_t status;
+    status = doit_step<Inclusive>(NULL,
+                                  storage_size,
+                                  keys_first,
+                                  values_first,
+                                  num_items,
+                                  values_result,
+                                  equality_op,
+                                  scan_op,
+                                  add_init_to_scan,
+                                  stream,
+                                  debug_sync);
+    cuda_cub::throw_on_error(status, "scan_by_key: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_step<Inclusive>(ptr,
+                                  storage_size,
+                                  keys_first,
+                                  values_first,
+                                  num_items,
+                                  values_result,
+                                  equality_op,
+                                  scan_op,
+                                  add_init_to_scan,
+                                  stream,
+                                  debug_sync);
+    cuda_cub::throw_on_error(status, "scan_by_key: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "scan_by_key: failed to synchronize");
+
+    return values_result + num_items;
+  }    // func doit
+}    // namspace scan_by_key
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+//---------------------------
+//   Inclusive scan
+//---------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class BinaryPred,
+          class ScanOp>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      BinaryPred                 binary_pred,
+                      ScanOp                     scan_op)
+{
+  ValOutputIt ret = value_result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typedef typename iterator_traits<ValInputIt>::value_type T;
+    ret = __scan_by_key::scan_by_key<thrust::detail::true_type>(policy,
+                                                        key_first,
+                                                        key_last,
+                                                        value_first,
+                                                        value_result,
+                                                        binary_pred,
+                                                        scan_op,
+                                                        __scan_by_key::DoNothing<T>());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                        key_first,
+                                        key_last,
+                                        value_first,
+                                        value_result,
+                                        binary_pred,
+                                        scan_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class BinaryPred>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      BinaryPred                 binary_pred)
+{
+  typedef typename thrust::iterator_traits<ValOutputIt>::value_type value_type;
+  return cuda_cub::inclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         binary_pred,
+                                         plus<value_type>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result)
+{
+  typedef typename thrust::iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::inclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         equal_to<key_type>());
+}
+
+
+//---------------------------
+//   Exclusive scan
+//---------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init,
+          class BinaryPred,
+          class ScanOp>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init,
+                      BinaryPred                 binary_pred,
+                      ScanOp                     scan_op)
+{
+  ValOutputIt ret = value_result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __scan_by_key::scan_by_key<thrust::detail::false_type>(
+        policy,
+        key_first,
+        key_last,
+        value_first,
+        value_result,
+        binary_pred,
+        scan_op,
+        __scan_by_key::AddInitToScan<Init, ScanOp>(init, scan_op));
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                        key_first,
+                                        key_last,
+                                        value_first,
+                                        value_result,
+                                        init,
+                                        binary_pred,
+                                        scan_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init,
+          class BinaryPred>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init,
+                      BinaryPred                 binary_pred)
+{
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         binary_pred,
+                                         plus<Init>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init)
+{
+  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         equal_to<key_type>());
+}
+
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result)
+{
+  typedef typename iterator_traits<ValOutputIt>::value_type value_type;
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         value_type(0));
+}
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/scan.h>
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/scatter.h b/thrust/thrust/system/cuda/detail/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ba0a4b743b3a4def4e17639cb3dcc263bddb788
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/scatter.h
@@ -0,0 +1,106 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class ResultIt>
+void __host__ __device__
+scatter(execution_policy<Derived>& policy,
+        ItemsIt                    first,
+        ItemsIt                    last,
+        MapIt                      map,
+        ResultIt                   result)
+{
+  cuda_cub::transform(policy,
+                   first,
+                   last,
+                   thrust::make_permutation_iterator(result, map),
+                   identity());
+}
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class StencilIt,
+          class ResultIt,
+          class Predicate>
+void __host__ __device__
+scatter_if(execution_policy<Derived>& policy,
+           ItemsIt                    first,
+           ItemsIt                    last,
+           MapIt                      map,
+           StencilIt                  stencil,
+           ResultIt                   result,
+           Predicate                  predicate)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      stencil,
+                      thrust::make_permutation_iterator(result, map),
+                      identity(),
+                      predicate);
+}
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class StencilIt,
+          class ResultIt,
+          class Predicate>
+void __host__ __device__
+scatter_if(execution_policy<Derived>& policy,
+           ItemsIt                    first,
+           ItemsIt                    last,
+           MapIt                      map,
+           StencilIt                  stencil,
+           ResultIt                   result)
+{
+  cuda_cub::scatter_if(policy,
+                    first,
+                    last,
+                    map,
+                    stencil,
+                    result,
+                    identity());
+}
+
+
+} // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/sequence.h b/thrust/thrust/system/cuda/detail/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/sequence.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..38ba1011d581b3187f3b6ac847070192d6f292d7
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/set_operations.h
@@ -0,0 +1,1998 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/set_operations.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+#include <thrust/detail/alignment.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+namespace __set_operations {
+
+  template <bool UpperBound,
+            class IntT,
+            class Size,
+            class It,
+            class T,
+            class Comp>
+  THRUST_DEVICE_FUNCTION void
+  binary_search_iteration(It   data,
+                          Size &begin,
+                          Size &end,
+                          T    key,
+                          int  shift,
+                          Comp comp)
+  {
+
+    IntT scale = (1 << shift) - 1;
+    Size mid   = (begin + scale * end) >> shift;
+
+    T    key2 = data[mid];
+    bool pred = UpperBound ? !comp(key, key2) : comp(key2, key);
+    if (pred)
+      begin = mid + 1;
+    else
+      end = mid;
+  }
+
+  template <bool UpperBound, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  binary_search(It data, Size count, T key, Comp comp)
+  {
+    Size begin = 0;
+    Size end   = count;
+    while (begin < end)
+      binary_search_iteration<UpperBound, int>(data,
+                                               begin,
+                                               end,
+                                               key,
+                                               1,
+                                               comp);
+    return begin;
+  }
+
+  template <bool UpperBound, class IntT, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  biased_binary_search(It data, Size count, T key, IntT levels, Comp comp)
+  {
+    Size begin = 0;
+    Size end   = count;
+
+    if (levels >= 4 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 9, comp);
+    if (levels >= 3 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 7, comp);
+    if (levels >= 2 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 5, comp);
+    if (levels >= 1 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 4, comp);
+
+    while (begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 1, comp);
+    return begin;
+  }
+
+  template <bool UpperBound, class Size, class It1, class It2, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  merge_path(It1 a, Size aCount, It2 b, Size bCount, Size diag, Comp comp)
+  {
+    typedef typename thrust::iterator_traits<It1>::value_type T;
+
+    Size begin = thrust::max<Size>(0, diag - bCount);
+    Size end   = thrust::min<Size>(diag, aCount);
+
+    while (begin < end)
+    {
+      Size  mid  = (begin + end) >> 1;
+      T    aKey = a[mid];
+      T    bKey = b[diag - 1 - mid];
+      bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey);
+      if (pred)
+        begin = mid + 1;
+      else
+        end = mid;
+    }
+    return begin;
+  }
+
+  template <class It1, class It2, class Size, class Size2, class CompareOp>
+  THRUST_DEVICE_FUNCTION pair<Size, Size>
+  balanced_path(It1       keys1,
+                It2       keys2,
+                Size      num_keys1,
+                Size      num_keys2,
+                Size      diag,
+                Size2     levels,
+                CompareOp compare_op)
+  {
+    typedef typename iterator_traits<It1>::value_type T;
+
+    Size index1 = merge_path<false>(keys1,
+                                    num_keys1,
+                                    keys2,
+                                    num_keys2,
+                                    diag,
+                                    compare_op);
+    Size index2 = diag - index1;
+
+    bool star = false;
+    if (index2 < num_keys2)
+    {
+      T x = keys2[index2];
+
+      // Search for the beginning of the duplicate run in both A and B.
+      Size start1 = biased_binary_search<false>(keys1,
+                                                index1,
+                                                x,
+                                                levels,
+                                                compare_op);
+      Size start2 = biased_binary_search<false>(keys2,
+                                                index2,
+                                                x,
+                                                levels,
+                                                compare_op);
+
+      // The distance between x's merge path and its lower_bound is its rank.
+      // We add up the a and b ranks and evenly distribute them to
+      // get a stairstep path.
+      Size run1      = index1 - start1;
+      Size run2      = index2 - start2;
+      Size total_run = run1 + run2;
+
+      // Attempt to advance b and regress a.
+      Size advance2 = max<Size>(total_run >> 1, total_run - run1);
+      Size end2     = min<Size>(num_keys2, start2 + advance2 + 1);
+
+      Size run_end2 = index2 + binary_search<true>(keys2 + index2,
+                                                   end2 - index2,
+                                                   x,
+                                                   compare_op);
+      run2 = run_end2 - start2;
+
+      advance2      = min<Size>(advance2, run2);
+      Size advance1 = total_run - advance2;
+
+      bool round_up      = (advance1 == advance2 + 1) && (advance2 < run2);
+      if (round_up) star = true;
+
+      index1 = start1 + advance1;
+    }
+    return thrust::make_pair(index1, (diag - index1) + star);
+  }    // func balanced_path
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD - 1
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
+  };    // PtxPolicy
+
+  template<class Arch, class T, class U>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, class U>
+  struct Tuning<sm30,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T),    // + sizeof(Value),
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm30
+
+  template<class T, class U>
+  struct Tuning<sm52,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T), // + sizeof(U),
+      NOMINAL_4B_ITEMS_PER_THREAD = 15,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm52
+
+  template<class T, class U>
+  struct Tuning<sm60,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T), // + sizeof(U),
+      NOMINAL_4B_ITEMS_PER_THREAD = 19,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm60
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class ValuesIt1,
+            class ValuesIt2,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class Size,
+            class CompareOp,
+            class SetOp,
+            class HAS_VALUES>
+  struct SetOpAgent
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type  key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type  key2_type;
+    typedef typename iterator_traits<ValuesIt1>::value_type value1_type;
+    typedef typename iterator_traits<ValuesIt2>::value_type value2_type;
+
+    typedef key1_type  key_type;
+    typedef value1_type value_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type, value_type>::type
+    {
+      typedef Tuning<Arch, key_type, value_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt1>::type   KeysLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, KeysIt2>::type   KeysLoadIt2;
+      typedef typename core::LoadIterator<PtxPlan, ValuesIt1>::type ValuesLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, ValuesIt2>::type ValuesLoadIt2;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type   BlockLoadKeys1;
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type   BlockLoadKeys2;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt1>::type BlockLoadValues1;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt2>::type BlockLoadValues2;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      // gather required temporary storage in a union
+      //
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        };
+
+        struct
+        {
+          core::uninitialized_array<int, PtxPlan::BLOCK_THREADS>
+              offset;
+          union
+          {
+            typename BlockLoadKeys1::TempStorage   load_keys1;
+            typename BlockLoadKeys2::TempStorage   load_keys2;
+            typename BlockLoadValues1::TempStorage load_values1;
+            typename BlockLoadValues2::TempStorage load_values2;
+
+            // Allocate extra shmem than truely neccessary
+            // This will permit to avoid range checks in
+            // serial set operations, e.g. serial_set_difference
+            core::uninitialized_array<
+                key_type,
+                PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS>
+                keys_shared;
+
+            core::uninitialized_array<
+                value_type,
+                PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS>
+                values_shared;
+          };
+        };
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt1   KeysLoadIt1;
+    typedef typename ptx_plan::KeysLoadIt2   KeysLoadIt2;
+    typedef typename ptx_plan::ValuesLoadIt1 ValuesLoadIt1;
+    typedef typename ptx_plan::ValuesLoadIt2 ValuesLoadIt2;
+
+    typedef typename ptx_plan::BlockLoadKeys1   BlockLoadKeys1;
+    typedef typename ptx_plan::BlockLoadKeys2   BlockLoadKeys2;
+    typedef typename ptx_plan::BlockLoadValues1 BlockLoadValues1;
+    typedef typename ptx_plan::BlockLoadValues2 BlockLoadValues2;
+
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan BlockScan;
+
+    typedef typename ptx_plan::TempStorage TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  storage;
+      ScanTileState &tile_state;
+      KeysLoadIt1    keys1_in;
+      KeysLoadIt2    keys2_in;
+      ValuesLoadIt1  values1_in;
+      ValuesLoadIt2  values2_in;
+      Size           keys1_count;
+      Size           keys2_count;
+      KeysOutputIt   keys_out;
+      ValuesOutputIt values_out;
+      CompareOp      compare_op;
+      SetOp          set_op;
+      pair<Size, Size> *partitions;
+      std::size_t *output_count;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE, class T, class It1, class It2>
+      THRUST_DEVICE_FUNCTION void
+      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+                  It1 input1,
+                  It2 input2,
+                  int count1,
+                  int count2)
+      {
+        if (IS_FULL_TILE)
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ++ITEM)
+          {
+            int idx      = BLOCK_THREADS * ITEM + threadIdx.x;
+            output[ITEM] = (idx < count1)
+                               ? static_cast<T>(input1[idx])
+                               : static_cast<T>(input2[idx - count1]);
+          }
+
+          // last ITEM might be a conditional load even for full tiles
+          // please check first before attempting to load.
+          int ITEM = ITEMS_PER_THREAD - 1;
+          int idx  = BLOCK_THREADS * ITEM + threadIdx.x;
+          if (idx < count1 + count2)
+            output[ITEM] = (idx < count1)
+                               ? static_cast<T>(input1[idx])
+                               : static_cast<T>(input2[idx - count1]);
+        }
+        else
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1 + count2)
+            {
+              output[ITEM] = (idx < count1)
+                                 ? static_cast<T>(input1[idx])
+                                 : static_cast<T>(input2[idx - count1]);
+            }
+          }
+        }
+      }
+
+      template <class T, class It>
+      THRUST_DEVICE_FUNCTION void
+      reg_to_shared(It output,
+                    T (&input)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+          output[idx] = input[ITEM];
+        }
+      }
+
+      template <class OutputIt, class T, class SharedIt>
+      void THRUST_DEVICE_FUNCTION
+      scatter(OutputIt output,
+              T (&input)[ITEMS_PER_THREAD],
+              SharedIt shared,
+              int      active_mask,
+              Size     thread_output_prefix,
+              Size     tile_output_prefix,
+              int      tile_output_count)
+      {
+        using core::sync_threadblock;
+
+
+
+        int local_scatter_idx = thread_output_prefix - tile_output_prefix;
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (active_mask & (1 << ITEM))
+          {
+            shared[local_scatter_idx++] = input[ITEM];
+          }
+        }
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < tile_output_count;
+             item += BLOCK_THREADS)
+        {
+          output[tile_output_prefix + item] = shared[item];
+        }
+      }
+
+      int THRUST_DEVICE_FUNCTION
+      serial_set_op(key_type *keys,
+                    int       keys1_beg,
+                    int       keys2_beg,
+                    int       keys1_count,
+                    int       keys2_count,
+                    key_type (&output)[ITEMS_PER_THREAD],
+                    int (&indices)[ITEMS_PER_THREAD],
+                    CompareOp compare_op,
+                    SetOp     set_op)
+      {
+        int active_mask = set_op(keys,
+                                 keys1_beg,
+                                 keys2_beg,
+                                 keys1_count,
+                                 keys2_count,
+                                 output,
+                                 indices,
+                                 compare_op);
+
+        return active_mask;
+      }
+
+      //---------------------------------------------------------------------
+      // Tile operations
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile(Size tile_idx)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        pair<Size, Size> partition_beg = partitions[tile_idx + 0];
+        pair<Size, Size> partition_end = partitions[tile_idx + 1];
+
+        Size keys1_beg = partition_beg.first;
+        Size keys1_end = partition_end.first;
+        Size keys2_beg = partition_beg.second;
+        Size keys2_end = partition_end.second;
+
+        // number of keys per tile
+        //
+        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+
+       // load keys into shared memory for further processing
+        key_type keys_loc[ITEMS_PER_THREAD];
+
+        gmem_to_reg<!IS_LAST_TILE>(keys_loc,
+                                   keys1_in + keys1_beg,
+                                   keys2_in + keys2_beg,
+                                   num_keys1,
+                                   num_keys2);
+
+        reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+        sync_threadblock();
+
+        int diag_loc = min<int>(ITEMS_PER_THREAD * threadIdx.x,
+                                num_keys1 + num_keys2);
+
+        pair<int, int> partition_loc =
+            balanced_path(&storage.keys_shared[0],
+                          &storage.keys_shared[num_keys1],
+                          num_keys1,
+                          num_keys2,
+                          diag_loc,
+                          4,
+                          compare_op);
+
+        int keys1_beg_loc = partition_loc.first;
+        int keys2_beg_loc = partition_loc.second;
+
+        // compute difference between next and current thread
+        // to obtain number of elements per thread
+        int value = threadIdx.x == 0
+                        ? (num_keys1 << 16) | num_keys2
+                        : (partition_loc.first << 16) | partition_loc.second;
+
+        int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1;
+        storage.offset[dst] = value;
+
+        core::sync_threadblock();
+
+        pair<int,int> partition1_loc = thrust::make_pair(
+          storage.offset[threadIdx.x] >> 16,
+          storage.offset[threadIdx.x] & 0xFFFF);
+
+        int keys1_end_loc = partition1_loc.first;
+        int keys2_end_loc = partition1_loc.second;
+
+        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+        // perform serial set operation
+        //
+        int indices[ITEMS_PER_THREAD];
+
+        int active_mask = serial_set_op(&storage.keys_shared[0],
+                                        keys1_beg_loc,
+                                        keys2_beg_loc + num_keys1,
+                                        num_keys1_loc,
+                                        num_keys2_loc,
+                                        keys_loc,
+                                        indices,
+                                        compare_op,
+                                        set_op);
+        sync_threadblock();
+#if 0
+        if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2)
+          active_mask = 0;
+#endif
+
+        // look-back scan over thread_output_count
+        // to compute global thread_output_base and tile_otput_count;
+        Size tile_output_count    = 0;
+        Size thread_output_prefix = 0;
+        Size tile_output_prefix   = 0;
+        Size thread_output_count = static_cast<Size>(__popc(active_mask));
+
+        if (tile_idx == 0)    // first tile
+        {
+          BlockScan(storage.scan)
+              .ExclusiveSum(thread_output_count,
+                            thread_output_prefix,
+                            tile_output_count);
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+            {
+              tile_state.SetInclusive(0, tile_output_count);
+            }
+          }
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+
+          BlockScan(storage.scan)
+              .ExclusiveSum(thread_output_count,
+                            thread_output_prefix,
+                            prefix_cb);
+          tile_output_count  = prefix_cb.GetBlockAggregate();
+          tile_output_prefix = prefix_cb.GetExclusivePrefix();
+        }
+
+        sync_threadblock();
+
+        // scatter results
+        //
+        scatter(keys_out,
+                keys_loc,
+                &storage.keys_shared[0],
+                active_mask,
+                thread_output_prefix,
+                tile_output_prefix,
+                tile_output_count);
+
+        if (HAS_VALUES::value)
+        {
+          value_type values_loc[ITEMS_PER_THREAD];
+          gmem_to_reg<!IS_LAST_TILE>(values_loc,
+                                     values1_in + keys1_beg,
+                                     values2_in + keys2_beg,
+                                     num_keys1,
+                                     num_keys2);
+
+          sync_threadblock();
+
+          reg_to_shared(&storage.values_shared[0], values_loc);
+
+          sync_threadblock();
+
+          // gather items from shared mem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            if (active_mask & (1 << ITEM))
+            {
+              values_loc[ITEM] = storage.values_shared[indices[ITEM]];
+            }
+          }
+
+          sync_threadblock();
+
+          scatter(values_out,
+                  values_loc,
+                  &storage.values_shared[0],
+                  active_mask,
+                  thread_output_prefix,
+                  tile_output_prefix,
+                  tile_output_count);
+        }
+
+        if (IS_LAST_TILE && threadIdx.x == 0)
+        {
+          *output_count = tile_output_prefix + tile_output_count;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &  storage_,
+           ScanTileState &tile_state_,
+           KeysIt1        keys1_,
+           KeysIt2        keys2_,
+           ValuesIt1      values1_,
+           ValuesIt2      values2_,
+           Size           keys1_count_,
+           Size           keys2_count_,
+           KeysOutputIt   keys_out_,
+           ValuesOutputIt values_out_,
+           CompareOp      compare_op_,
+           SetOp          set_op_,
+           pair<Size, Size> *partitions_,
+           std::size_t * output_count_)
+          : storage(storage_),
+            tile_state(tile_state_),
+            keys1_in(core::make_load_iterator(ptx_plan(), keys1_)),
+            keys2_in(core::make_load_iterator(ptx_plan(), keys2_)),
+            values1_in(core::make_load_iterator(ptx_plan(), values1_)),
+            values2_in(core::make_load_iterator(ptx_plan(), values2_)),
+            keys1_count(keys1_count_),
+            keys2_count(keys2_count_),
+            keys_out(keys_out_),
+            values_out(values_out_),
+            compare_op(compare_op_),
+            set_op(set_op_),
+            partitions(partitions_),
+            output_count(output_count_)
+      {
+        int  tile_idx      = blockIdx.x;
+        int  num_tiles     = gridDim.x;
+
+        if (tile_idx < num_tiles-1)
+        {
+          consume_tile<false>(tile_idx);
+        }
+        else
+        {
+          consume_tile<true>(tile_idx);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1        keys1,
+                       KeysIt2        keys2,
+                       ValuesIt1      values1,
+                       ValuesIt2      values2,
+                       Size           keys1_count,
+                       Size           keys2_count,
+                       KeysOutputIt   keys_output,
+                       ValuesOutputIt values_output,
+                       CompareOp      compare_op,
+                       SetOp          set_op,
+                       pair<Size, Size> *partitions,
+                       std::size_t *  output_count,
+                       ScanTileState tile_state,
+                       char *        shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           keys1,
+           keys2,
+           values1,
+           values2,
+           keys1_count,
+           keys2_count,
+           keys_output,
+           values_output,
+           compare_op,
+           set_op,
+           partitions,
+           output_count);
+    }
+  };    // struct SetOpAgent
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class CompareOp>
+  struct PartitionAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<256> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1 keys1,
+                       KeysIt2 keys2,
+                       Size    keys1_count,
+                       Size    keys2_count,
+                       Size    num_partitions,
+                       pair<Size, Size> *partitions,
+                       CompareOp compare_op,
+                       int       items_per_tile,
+                       char * /*shmem*/)
+    {
+      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      if (partition_idx < num_partitions)
+      {
+        Size partition_at = min<Size>(partition_idx * items_per_tile,
+                                      keys1_count + keys2_count);
+        pair<Size, Size> diag = balanced_path(keys1,
+                                              keys2,
+                                              keys1_count,
+                                              keys2_count,
+                                              partition_at,
+                                              4ll,
+                                              compare_op);
+        partitions[partition_idx] = diag;
+      }
+    }
+  };    // struct PartitionAgent
+
+  template <class ScanTileState,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+    }
+  }; // struct InitAgent
+
+  //---------------------------------------------------------------------
+  // Serial set operations
+  //---------------------------------------------------------------------
+
+  // serial_set_intersection
+  // -----------------------
+  // emit A if A and B are in range and equal.
+  struct serial_set_intersection
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pA = compare_op(aKey, bKey);
+        bool pB = compare_op(bKey, aKey);
+
+        // The outputs must come from A by definition of set interection.
+        output[i]  = aKey;
+        indices[i] = aBegin;
+
+        if ((aBegin < aEnd) && (bBegin < bEnd) && pA == pB)
+          active_mask |= 1 << i;
+
+        if (!pB) {aKey = keys[++aBegin]; }
+        if (!pA) {bKey = keys[++bBegin]; }
+      }
+      return active_mask;
+    }
+  };    // struct serial_set_intersection
+
+  // serial_set_symmetric_difference
+  // ---------------------
+  // emit A if A < B and emit B if B < A.
+  struct serial_set_symmetric_difference
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // The outputs must come from A by definition of set difference.
+        output[i]  = pA ? aKey : bKey;
+        indices[i] = pA ? aBegin : bBegin;
+
+        if (aBegin + bBegin < end && pA != pB)
+          active_mask |= 1 << i;
+
+        if (!pB) {aKey = keys[++aBegin]; }
+        if (!pA) {bKey = keys[++bBegin]; }
+
+      }
+      return active_mask;
+    }
+  };    // struct set_symmetric_difference
+
+  // serial_set_difference
+  // ---------------------
+  // emit A if A < B
+  struct serial_set_difference
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // The outputs must come from A by definition of set difference.
+        output[i]  = aKey;
+        indices[i] = aBegin;
+
+        if (aBegin + bBegin < end && pA)
+          active_mask |= 1 << i;
+
+        if (!pB) { aKey = keys[++aBegin]; }
+        if (!pA) { bKey = keys[++bBegin]; }
+      }
+      return active_mask;
+    }
+  };    // struct set_difference
+
+  // serial_set_union
+  // ----------------
+  // emit A if A <= B else emit B
+  struct serial_set_union
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // Output A in case of a tie, so check if b < a.
+        output[i]  = pB ? bKey : aKey;
+        indices[i] = pB ? bBegin : aBegin;
+
+        if (aBegin + bBegin < end)
+          active_mask |= 1 << i;
+
+        if (!pB) { aKey = keys[++aBegin]; }
+        if (!pA) { bKey = keys[++bBegin]; }
+
+      }
+      return active_mask;
+    }
+  };    // struct set_union
+
+  template <class HAS_VALUES,
+            class KeysIt1,
+            class KeysIt2,
+            class ValuesIt1,
+            class ValuesIt2,
+            class Size,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class CompareOp,
+            class SetOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *         d_temp_storage,
+            size_t &       temp_storage_size,
+            KeysIt1        keys1,
+            KeysIt2        keys2,
+            ValuesIt1      values1,
+            ValuesIt2      values2,
+            Size           num_keys1,
+            Size           num_keys2,
+            KeysOutputIt   keys_output,
+            ValuesOutputIt values_output,
+            std::size_t *  output_count,
+            CompareOp      compare_op,
+            SetOp          set_op,
+            cudaStream_t   stream,
+            bool           debug_sync)
+  {
+    Size keys_total = num_keys1 + num_keys2;
+    if (keys_total == 0)
+      return cudaErrorNotSupported;
+
+    cudaError_t status = cudaSuccess;
+
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    typedef AgentLauncher<
+        SetOpAgent<KeysIt1,
+                   KeysIt2,
+                   ValuesIt1,
+                   ValuesIt2,
+                   KeysOutputIt,
+                   ValuesOutputIt,
+                   Size,
+                   CompareOp,
+                   SetOp,
+                   HAS_VALUES> >
+        set_op_agent;
+
+    typedef AgentLauncher<PartitionAgent<KeysIt1, KeysIt2, Size, CompareOp> >
+        partition_agent;
+
+    typedef typename set_op_agent::ScanTileState ScanTileState;
+    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
+
+
+    AgentPlan set_op_plan    = set_op_agent::get_plan(stream);
+    AgentPlan init_plan      = init_agent::get_plan();
+    AgentPlan partition_plan = partition_agent::get_plan();
+
+    int  tile_size = set_op_plan.items_per_tile;
+    Size num_tiles = (keys_total + tile_size - 1) / tile_size;
+
+    size_t tile_agent_storage;
+    status = ScanTileState::AllocationSize(num_tiles, tile_agent_storage);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size,
+                                              num_tiles);
+    size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2;
+
+    void *allocations[3] = {NULL, NULL, NULL};
+    size_t allocation_sizes[3] = {tile_agent_storage,
+                                  partition_agent_storage,
+                                  vshmem_storage};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    pair<Size, Size> *partitions = (pair<Size, Size> *)allocations[1];
+    char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[2] : NULL;
+
+    init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent", debug_sync);
+    ia.launch(tile_state, num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    partition_agent pa(partition_plan, num_tiles+1, stream, "set_op::partition agent", debug_sync);
+    pa.launch(keys1,
+              keys2,
+              num_keys1,
+              num_keys2,
+              num_tiles+1,
+              partitions,
+              compare_op,
+              tile_size);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    set_op_agent sa(set_op_plan, keys_total, stream, vshmem_ptr, "set_op::set_op_agent", debug_sync);
+    sa.launch(keys1,
+              keys2,
+              values1,
+              values2,
+              num_keys1,
+              num_keys2,
+              keys_output,
+              values_output,
+              compare_op,
+              set_op,
+              partitions,
+              output_count,
+              tile_state);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return status;
+ }
+
+ template <typename HAS_VALUES,
+           typename Derived,
+           typename KeysIt1,
+           typename KeysIt2,
+           typename ValuesIt1,
+           typename ValuesIt2,
+           typename KeysOutputIt,
+           typename ValuesOutputIt,
+           typename CompareOp,
+           typename SetOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  set_operations(execution_policy<Derived>& policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ValuesIt1                  values1_first,
+                 ValuesIt2                  values2_first,
+                 KeysOutputIt               keys_output,
+                 ValuesOutputIt             values_output,
+                 CompareOp                  compare_op,
+                 SetOp                      set_op)
+  {
+    typedef typename iterator_traits<KeysIt1>::difference_type size_type;
+
+    size_type num_keys1 = static_cast<size_type>(thrust::distance(keys1_first, keys1_last));
+    size_type num_keys2 = static_cast<size_type>(thrust::distance(keys2_first, keys2_last));
+
+    if (num_keys1 + num_keys2 == 0)
+      return thrust::make_pair(keys_output, values_output);
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (NULL,
+                                   temp_storage_bytes,
+                                   keys1_first,
+                                   keys2_first,
+                                   values1_first,
+                                   values2_first,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
+                                   keys_output,
+                                   values_output,
+                                   reinterpret_cast<std::size_t*>(NULL),
+                                   compare_op,
+                                   set_op,
+                                   stream,
+                                   debug_sync));
+    cuda_cub::throw_on_error(status, "set_operations failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(std::size_t), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
+
+    std::size_t* d_output_count
+      = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
+
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (allocations[1],
+                                   temp_storage_bytes,
+                                   keys1_first,
+                                   keys2_first,
+                                   values1_first,
+                                   values2_first,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
+                                   keys_output,
+                                   values_output,
+                                   d_output_count,
+                                   compare_op,
+                                   set_op,
+                                   stream,
+                                   debug_sync));
+    cuda_cub::throw_on_error(status, "set_operations failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "set_operations failed to synchronize");
+
+    std::size_t output_count = cuda_cub::get_value(policy, d_output_count);
+
+    return thrust::make_pair(keys_output + output_count, values_output + output_count);
+  }
+}    // namespace __set_operations
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_difference(execution_policy<Derived> &policy,
+               ItemsIt1                   items1_first,
+               ItemsIt1                   items1_last,
+               ItemsIt2                   items2_first,
+               ItemsIt2                   items2_last,
+               OutputIt                   result,
+               CompareOp                  compare)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
+    //
+    ret = __set_operations::set_operations<thrust::detail::false_type>(
+              policy,
+              items1_first,
+              items1_last,
+              items2_first,
+              items2_last,
+              null_,
+              null_,
+              result,
+              null_,
+              compare,
+              __set_operations::serial_set_difference())
+              .first;
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_difference(cvt_to_seq(derived_cast(policy)),
+                                 items1_first,
+                                 items1_last,
+                                 items2_first,
+                                 items2_last,
+                                 result,
+                                 compare);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_difference(execution_policy<Derived> &policy,
+               ItemsIt1                   items1_first,
+               ItemsIt1                   items1_last,
+               ItemsIt2                   items2_first,
+               ItemsIt2                   items2_last,
+               OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_difference(policy,
+                                  items1_first,
+                                  items1_last,
+                                  items2_first,
+                                  items2_last,
+                                  result,
+                                  less<value_type>());
+}
+
+/*****************************/
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_intersection(execution_policy<Derived> &policy,
+                 ItemsIt1                   items1_first,
+                 ItemsIt1                   items1_last,
+                 ItemsIt2                   items2_first,
+                 ItemsIt2                   items2_last,
+                 OutputIt                   result,
+                 CompareOp                  compare)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
+    //
+    ret = __set_operations::set_operations<thrust::detail::false_type>(
+              policy,
+              items1_first,
+              items1_last,
+              items2_first,
+              items2_last,
+              null_,
+              null_,
+              result,
+              null_,
+              compare,
+              __set_operations::serial_set_intersection())
+              .first;
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_intersection(cvt_to_seq(derived_cast(policy)),
+                                   items1_first,
+                                   items1_last,
+                                   items2_first,
+                                   items2_last,
+                                   result,
+                                   compare);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_intersection(execution_policy<Derived> &policy,
+                 ItemsIt1                   items1_first,
+                 ItemsIt1                   items1_last,
+                 ItemsIt2                   items2_first,
+                 ItemsIt2                   items2_last,
+                 OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_intersection(policy,
+                                    items1_first,
+                                    items1_last,
+                                    items2_first,
+                                    items2_last,
+                                    result,
+                                    less<value_type>());
+}
+
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_symmetric_difference(execution_policy<Derived> &policy,
+                         ItemsIt1                   items1_first,
+                         ItemsIt1                   items1_last,
+                         ItemsIt2                   items2_first,
+                         ItemsIt2                   items2_last,
+                         OutputIt                   result,
+                         CompareOp                  compare)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
+    //
+    ret = __set_operations::set_operations<thrust::detail::false_type>(
+              policy,
+              items1_first,
+              items1_last,
+              items2_first,
+              items2_last,
+              null_,
+              null_,
+              result,
+              null_,
+              compare,
+              __set_operations::serial_set_symmetric_difference())
+              .first;
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)),
+                                           items1_first,
+                                           items1_last,
+                                           items2_first,
+                                           items2_last,
+                                           result,
+                                           compare);
+#endif
+  }
+  return ret;
+}
+
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_symmetric_difference(execution_policy<Derived> &policy,
+                         ItemsIt1                   items1_first,
+                         ItemsIt1                   items1_last,
+                         ItemsIt2                   items2_first,
+                         ItemsIt2                   items2_last,
+                         OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_symmetric_difference(policy,
+                                            items1_first,
+                                            items1_last,
+                                            items2_first,
+                                            items2_last,
+                                            result,
+                                            less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_union(execution_policy<Derived> &policy,
+          ItemsIt1                   items1_first,
+          ItemsIt1                   items1_last,
+          ItemsIt2                   items2_first,
+          ItemsIt2                   items2_last,
+          OutputIt                   result,
+          CompareOp                  compare)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
+    //
+    ret = __set_operations::set_operations<thrust::detail::false_type>(
+              policy,
+              items1_first,
+              items1_last,
+              items2_first,
+              items2_last,
+              null_,
+              null_,
+              result,
+              null_,
+              compare,
+              __set_operations::serial_set_union())
+              .first;
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_union(cvt_to_seq(derived_cast(policy)),
+                            items1_first,
+                            items1_last,
+                            items2_first,
+                            items2_last,
+                            result,
+                            compare);
+#endif
+  }
+  return ret;
+}
+
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_union(execution_policy<Derived> &policy,
+          ItemsIt1                   items1_first,
+          ItemsIt1                   items1_last,
+          ItemsIt2                   items2_first,
+          ItemsIt2                   items2_last,
+          OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_union(policy,
+                             items1_first,
+                             items1_last,
+                             items2_first,
+                             items2_last,
+                             result,
+                             less<value_type>());
+}
+
+
+/*****************************/
+/*****************************/
+/*****     *_by_key      *****/
+/*****************************/
+/*****************************/
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_difference_by_key(execution_policy<Derived> &policy,
+                      KeysIt1                    keys1_first,
+                      KeysIt1                    keys1_last,
+                      KeysIt2                    keys2_first,
+                      KeysIt2                    keys2_last,
+                      ItemsIt1                   items1_first,
+                      ItemsIt2                   items2_first,
+                      KeysOutputIt               keys_result,
+                      ItemsOutputIt              items_result,
+                      CompareOp                  compare_op)
+{
+  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __set_operations::set_operations<thrust::detail::true_type>(
+        policy,
+        keys1_first,
+        keys1_last,
+        keys2_first,
+        keys2_last,
+        items1_first,
+        items2_first,
+        keys_result,
+        items_result,
+        compare_op,
+        __set_operations::serial_set_difference());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                        keys1_first,
+                                        keys1_last,
+                                        keys2_first,
+                                        keys2_last,
+                                        items1_first,
+                                        items2_first,
+                                        keys_result,
+                                        items_result,
+                                        compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_difference_by_key(execution_policy<Derived> &policy,
+                      KeysIt1                    keys1_first,
+                      KeysIt1                    keys1_last,
+                      KeysIt2                    keys2_first,
+                      KeysIt2                    keys2_last,
+                      ItemsIt1                   items1_first,
+                      ItemsIt2                   items2_first,
+                      KeysOutputIt               keys_result,
+                      ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_difference_by_key(policy,
+                                         keys1_first,
+                                         keys1_last,
+                                         keys2_first,
+                                         keys2_last,
+                                         items1_first,
+                                         items2_first,
+                                         keys_result,
+                                         items_result,
+                                         less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_intersection_by_key(execution_policy<Derived> &policy,
+                        KeysIt1                    keys1_first,
+                        KeysIt1                    keys1_last,
+                        KeysIt2                    keys2_first,
+                        KeysIt2                    keys2_last,
+                        ItemsIt1                   items1_first,
+                        KeysOutputIt               keys_result,
+                        ItemsOutputIt              items_result,
+                        CompareOp                  compare_op)
+{
+  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __set_operations::set_operations<thrust::detail::true_type>(
+        policy,
+        keys1_first,
+        keys1_last,
+        keys2_first,
+        keys2_last,
+        items1_first,
+        items1_first,
+        keys_result,
+        items_result,
+        compare_op,
+        __set_operations::serial_set_intersection());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)),
+                                          keys1_first,
+                                          keys1_last,
+                                          keys2_first,
+                                          keys2_last,
+                                          items1_first,
+                                          keys_result,
+                                          items_result,
+                                          compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_intersection_by_key(execution_policy<Derived> &policy,
+                        KeysIt1                    keys1_first,
+                        KeysIt1                    keys1_last,
+                        KeysIt2                    keys2_first,
+                        KeysIt2                    keys2_last,
+                        ItemsIt1                   items1_first,
+                        KeysOutputIt               keys_result,
+                        ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_intersection_by_key(policy,
+                                           keys1_first,
+                                           keys1_last,
+                                           keys2_first,
+                                           keys2_last,
+                                           items1_first,
+                                           keys_result,
+                                           items_result,
+                                           less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_symmetric_difference_by_key(execution_policy<Derived> &policy,
+                                KeysIt1                    keys1_first,
+                                KeysIt1                    keys1_last,
+                                KeysIt2                    keys2_first,
+                                KeysIt2                    keys2_last,
+                                ItemsIt1                   items1_first,
+                                ItemsIt2                   items2_first,
+                                KeysOutputIt               keys_result,
+                                ItemsOutputIt              items_result,
+                                CompareOp                  compare_op)
+{
+  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __set_operations::set_operations<thrust::detail::true_type>(
+        policy,
+        keys1_first,
+        keys1_last,
+        keys2_first,
+        keys2_last,
+        items1_first,
+        items2_first,
+        keys_result,
+        items_result,
+        compare_op,
+        __set_operations::serial_set_symmetric_difference());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                                  keys1_first,
+                                                  keys1_last,
+                                                  keys2_first,
+                                                  keys2_last,
+                                                  items1_first,
+                                                  items2_first,
+                                                  keys_result,
+                                                  items_result,
+                                                  compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_symmetric_difference_by_key(execution_policy<Derived> &policy,
+                                KeysIt1                    keys1_first,
+                                KeysIt1                    keys1_last,
+                                KeysIt2                    keys2_first,
+                                KeysIt2                    keys2_last,
+                                ItemsIt1                   items1_first,
+                                ItemsIt2                   items2_first,
+                                KeysOutputIt               keys_result,
+                                ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_symmetric_difference_by_key(policy,
+                                                   keys1_first,
+                                                   keys1_last,
+                                                   keys2_first,
+                                                   keys2_last,
+                                                   items1_first,
+                                                   items2_first,
+                                                   keys_result,
+                                                   items_result,
+                                                   less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_union_by_key(execution_policy<Derived> &policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ItemsIt1                   items1_first,
+                 ItemsIt2                   items2_first,
+                 KeysOutputIt               keys_result,
+                 ItemsOutputIt              items_result,
+                 CompareOp                  compare_op)
+{
+  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __set_operations::set_operations<thrust::detail::true_type>(
+        policy,
+        keys1_first,
+        keys1_last,
+        keys2_first,
+        keys2_last,
+        items1_first,
+        items2_first,
+        keys_result,
+        items_result,
+        compare_op,
+        __set_operations::serial_set_union());
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)),
+                                   keys1_first,
+                                   keys1_last,
+                                   keys2_first,
+                                   keys2_last,
+                                   items1_first,
+                                   items2_first,
+                                   keys_result,
+                                   items_result,
+                                   compare_op);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_union_by_key(execution_policy<Derived> &policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ItemsIt1                   items1_first,
+                 ItemsIt2                   items2_first,
+                 KeysOutputIt               keys_result,
+                 ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_union_by_key(policy,
+                                    keys1_first,
+                                    keys1_last,
+                                    keys2_first,
+                                    keys2_last,
+                                    items1_first,
+                                    items2_first,
+                                    keys_result,
+                                    items_result,
+                                    less<value_type>());
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..0711c224fe4241402ad0b3283cbb7cd87600f8f8
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -0,0 +1,1748 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <cub/device/device_radix_sort.cuh>
+
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/detail/trivial_sequence.h>
+#include <thrust/detail/integer_math.h>
+#include <thrust/extrema.h>
+#include <thrust/sort.h>
+#include <thrust/distance.h>
+#include <thrust/sequence.h>
+#include <thrust/detail/alignment.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __merge_sort {
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class BinaryPred>
+  THRUST_DEVICE_FUNCTION Size
+  merge_path(KeysIt1    keys1,
+             KeysIt2    keys2,
+             Size       keys1_count,
+             Size       keys2_count,
+             Size       diag,
+             BinaryPred binary_pred)
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
+
+    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
+    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
+
+    while (keys1_begin < keys1_end)
+    {
+      Size      mid  = (keys1_begin + keys1_end) >> 1;
+      key1_type key1 = keys1[mid];
+      key2_type key2 = keys2[diag - 1 - mid];
+      bool      pred = binary_pred(key2, key1);
+      if (pred)
+      {
+        keys1_end = mid;
+      }
+      else
+      {
+        keys1_begin = mid + 1;
+      }
+    }
+    return keys1_begin;
+  }
+
+  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
+  THRUST_DEVICE_FUNCTION void
+  serial_merge(It  keys_shared,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T2 (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+  {
+    int keys1_end = keys1_beg + keys1_count;
+    int keys2_end = keys2_beg + keys2_count;
+
+    typedef typename iterator_value<It>::type key_type;
+
+    key_type key1 = keys_shared[keys1_beg];
+    key_type key2 = keys_shared[keys2_beg];
+
+
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      bool p = (keys2_beg < keys2_end) &&
+               ((keys1_beg >= keys1_end) ||
+                compare_op(key2,key1));
+
+      output[ITEM]  = p ? key2 : key1;
+      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
+
+      if (p)
+      {
+        key2 = keys_shared[keys2_beg];
+      }
+      else
+      {
+        key1 = keys_shared[keys1_beg];
+      }
+    }
+  }
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  }; // PtxPolicy
+
+
+  template<class Arch, class T>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm35,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 15,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template<class T>
+  struct Tuning<sm60,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 17,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };
+
+  template <class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp,
+            class SORT_ITEMS,
+            class STABLE>
+  struct BlockSortAgent
+  {
+    typedef typename iterator_traits<KeysIt>::value_type key_type;
+    typedef typename iterator_traits<ItemsIt>::value_type item_type;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type>::type
+    {
+      typedef Tuning<Arch,key_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt>::type  KeysLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type ItemsLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt>::type  BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type BlockLoadItems;
+
+      typedef typename core::BlockStore<PtxPlan, KeysIt>::type     BlockStoreKeysIt;
+      typedef typename core::BlockStore<PtxPlan, ItemsIt>::type    BlockStoreItemsIt;
+      typedef typename core::BlockStore<PtxPlan, key_type*>::type  BlockStoreKeysRaw;
+      typedef typename core::BlockStore<PtxPlan, item_type*>::type BlockStoreItemsRaw;
+
+      union TempStorage
+      {
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadItems::TempStorage  load_items;
+        typename BlockStoreKeysIt::TempStorage  store_keys_it;
+        typename BlockStoreItemsIt::TempStorage store_items_it;
+        typename BlockStoreKeysRaw::TempStorage  store_keys_raw;
+        typename BlockStoreItemsRaw::TempStorage store_items_raw;
+
+        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt         KeysLoadIt;
+    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
+    typedef typename ptx_plan::BlockLoadKeys      BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
+    typedef typename ptx_plan::BlockStoreKeysIt   BlockStoreKeysIt;
+    typedef typename ptx_plan::BlockStoreItemsIt  BlockStoreItemsIt;
+    typedef typename ptx_plan::BlockStoreKeysRaw  BlockStoreKeysRaw;
+    typedef typename ptx_plan::BlockStoreItemsRaw BlockStoreItemsRaw;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      bool         ping;
+      TempStorage& storage;
+      KeysLoadIt   keys_in;
+      ItemsLoadIt  items_in;
+      Size         keys_count;
+      KeysIt       keys_out_it;
+      ItemsIt      items_out_it;
+      key_type*    keys_out_raw;
+      item_type*   items_out_raw;
+      CompareOp    compare_op;
+
+      //---------------------------------------------------------------------
+      // Serial stable sort network
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD],
+                                item_type (&items)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int I = 0; I < ITEMS_PER_THREAD; ++I)
+        {
+#pragma unroll
+          for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2)
+          {
+            if (compare_op(keys[J + 1], keys[J]))
+            {
+              using thrust::swap;
+              swap(keys[J], keys[J + 1]);
+              if (SORT_ITEMS::value)
+              {
+                swap(items[J], items[J + 1]);
+              }
+            }
+          }    // inner loop
+        }      // outer loop
+      }
+
+      //---------------------------------------------------------------------
+      // Parallel thread block merge sort
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION void
+      block_mergesort(int tid,
+                      int count,
+                      key_type (&keys_loc)[ITEMS_PER_THREAD],
+                      item_type (&items_loc)[ITEMS_PER_THREAD])
+      {
+        using core::uninitialized_array;
+        using core::sync_threadblock;
+
+        // stable sort items in a single thread
+        //
+        stable_odd_even_sort(keys_loc,items_loc);
+
+        // each thread has  sorted keys_loc
+        // merge sort keys_loc in shared memory
+        //
+#pragma unroll
+        for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
+        {
+          sync_threadblock();
+
+          // store keys in shmem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx                  = ITEMS_PER_THREAD * threadIdx.x + ITEM;
+            storage.keys_shared[idx] = keys_loc[ITEM];
+          }
+
+          sync_threadblock();
+
+          int  indices[ITEMS_PER_THREAD];
+
+          int list  = ~(coop - 1) & tid;
+          int start = ITEMS_PER_THREAD * list;
+          int size  = ITEMS_PER_THREAD * (coop >> 1);
+
+          int diag = min(count,
+                         ITEMS_PER_THREAD * ((coop - 1) & tid));
+
+          int keys1_beg = min(count, start);
+          int keys1_end = min(count, keys1_beg + size);
+          int keys2_beg = keys1_end;
+          int keys2_end = min(count, keys2_beg + size);
+
+          int keys1_count = keys1_end - keys1_beg;
+          int keys2_count = keys2_end - keys2_beg;
+
+          int partition_diag = merge_path(&storage.keys_shared[keys1_beg],
+                                          &storage.keys_shared[keys2_beg],
+                                          keys1_count,
+                                          keys2_count,
+                                          diag,
+                                          compare_op);
+
+          int keys1_beg_loc   = keys1_beg + partition_diag;
+          int keys1_end_loc   = keys1_end;
+          int keys2_beg_loc   = keys2_beg + diag - partition_diag;
+          int keys2_end_loc   = keys2_end;
+          int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
+          int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
+          serial_merge(&storage.keys_shared[0],
+                       keys1_beg_loc,
+                       keys2_beg_loc,
+                       keys1_count_loc,
+                       keys2_count_loc,
+                       keys_loc,
+                       indices,
+                       compare_op);
+
+
+          if (SORT_ITEMS::value)
+          {
+            sync_threadblock();
+
+            // store keys in shmem
+            //
+#pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+              int idx                   = ITEMS_PER_THREAD * threadIdx.x + ITEM;
+              storage.items_shared[idx] = items_loc[ITEM];
+            }
+
+            sync_threadblock();
+
+            // gather items from shmem
+            //
+#pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+              items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+            }
+          }
+        }
+      }    // func block_merge_sort
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(int  tid,
+                   Size /*tile_idx*/,
+                   Size tile_base,
+                   int  num_remaining)
+      {
+        using core::uninitialized_array;
+        using core::sync_threadblock;
+
+        item_type items_loc[ITEMS_PER_THREAD];
+        if (SORT_ITEMS::value)
+        {
+          BlockLoadItems(storage.load_items)
+              .Load(items_in + tile_base,
+                    items_loc,
+                    num_remaining,
+                    *(items_in + tile_base));
+
+          sync_threadblock();
+        }
+
+        key_type keys_loc[ITEMS_PER_THREAD];
+        if (IS_LAST_TILE)
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_in + tile_base,
+                    keys_loc,
+                    num_remaining,
+                    *(keys_in + tile_base));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_in + tile_base, keys_loc);
+        }
+
+        if (IS_LAST_TILE)
+        {
+          // if last tile, find valid max_key
+          // and fill the remainig keys with it
+          //
+          key_type max_key = keys_loc[0];
+#pragma unroll
+          for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
+            {
+              max_key = compare_op(max_key, keys_loc[ITEM])
+                            ? keys_loc[ITEM]
+                            : max_key;
+            }
+            else
+            {
+              keys_loc[ITEM] = max_key;
+            }
+          }
+        }
+
+        sync_threadblock();
+
+        if (IS_LAST_TILE)
+        {
+          block_mergesort(tid,
+                          num_remaining,
+                          keys_loc,
+                          items_loc);
+        }
+        else
+        {
+          block_mergesort(tid,
+                          ITEMS_PER_TILE,
+                          keys_loc,
+                          items_loc);
+        }
+
+        sync_threadblock();
+
+        if (ping)
+        {
+          if (IS_LAST_TILE)
+          {
+            BlockStoreKeysIt(storage.store_keys_it)
+                .Store(keys_out_it + tile_base, keys_loc, num_remaining);
+          }
+          else
+          {
+            BlockStoreKeysIt(storage.store_keys_it)
+                .Store(keys_out_it + tile_base, keys_loc);
+          }
+
+          if (SORT_ITEMS::value)
+          {
+            sync_threadblock();
+
+            BlockStoreItemsIt(storage.store_items_it)
+                .Store(items_out_it + tile_base, items_loc, num_remaining);
+          }
+        }
+        else
+        {
+          if (IS_LAST_TILE)
+          {
+            BlockStoreKeysRaw(storage.store_keys_raw)
+                .Store(keys_out_raw + tile_base, keys_loc, num_remaining);
+          }
+          else
+          {
+            BlockStoreKeysRaw(storage.store_keys_raw)
+                .Store(keys_out_raw + tile_base, keys_loc);
+          }
+
+          if (SORT_ITEMS::value)
+          {
+            sync_threadblock();
+
+            BlockStoreItemsRaw(storage.store_items_raw)
+                .Store(items_out_raw + tile_base, items_loc, num_remaining);
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(bool         ping_,
+           TempStorage& storage_,
+           KeysLoadIt   keys_in_,
+           ItemsLoadIt  items_in_,
+           Size         keys_count_,
+           KeysIt       keys_out_it_,
+           ItemsIt      items_out_it_,
+           key_type*    keys_out_raw_,
+           item_type*   items_out_raw_,
+           CompareOp    compare_op_)
+          : ping(ping_),
+            storage(storage_),
+            keys_in(keys_in_),
+            items_in(items_in_),
+            keys_count(keys_count_),
+            keys_out_it(keys_out_it_),
+            items_out_it(items_out_it_),
+            keys_out_raw(keys_out_raw_),
+            items_out_raw(items_out_raw_),
+            compare_op(compare_op_)
+      {
+        int  tid           = threadIdx.x;
+        Size tile_idx      = blockIdx.x;
+        Size num_tiles     = gridDim.x;
+        Size tile_base     = tile_idx * ITEMS_PER_TILE;
+        int  items_in_tile = min<int>(keys_count - tile_base, ITEMS_PER_TILE);
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(tid, tile_idx, tile_base, ITEMS_PER_TILE);
+        }
+        else
+        {
+          consume_tile<true>(tid, tile_idx, tile_base, items_in_tile);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(bool       ping,
+                       KeysIt     keys_inout,
+                       ItemsIt    items_inout,
+                       Size       keys_count,
+                       key_type*  keys_out,
+                       item_type* items_out,
+                       CompareOp  compare_op,
+                       char*      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(ping,
+           storage,
+           core::make_load_iterator(ptx_plan(), keys_inout),
+           core::make_load_iterator(ptx_plan(), items_inout),
+           keys_count,
+           keys_inout,
+           items_inout,
+           keys_out,
+           items_out,
+           compare_op);
+    }
+  };    // struct BlockSortAgent
+
+  template <class KeysIt,
+            class Size,
+            class CompareOp>
+  struct PartitionAgent
+  {
+    typedef typename iterator_traits<KeysIt>::value_type key_type;
+    template<class Arch>
+    struct PtxPlan : PtxPolicy<256> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(bool      ping,
+                       KeysIt    keys_ping,
+                       key_type* keys_pong,
+                       Size      keys_count,
+                       Size      num_partitions,
+                       Size*     merge_partitions,
+                       CompareOp compare_op,
+                       Size      coop,
+                       int       items_per_tile,
+                       char*     /*shmem*/)
+    {
+      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      if (partition_idx < num_partitions)
+      {
+        Size list  = ~(coop - 1) & partition_idx;
+        Size start = items_per_tile * list;
+        Size size  = items_per_tile * (coop >> 1);
+
+        Size keys1_beg = min(keys_count, start);
+        Size keys1_end = min(keys_count, start + size);
+        Size keys2_beg = keys1_end;
+        Size keys2_end = min(keys_count, keys2_beg + size);
+
+
+        Size partition_at = min(keys2_end - keys1_beg,
+                                items_per_tile * ((coop - 1) & partition_idx));
+
+        Size partition_diag = ping ? merge_path(keys_ping + keys1_beg,
+                                                keys_ping + keys2_beg,
+                                                keys1_end - keys1_beg,
+                                                keys2_end - keys2_beg,
+                                                partition_at,
+                                                compare_op)
+                                   : merge_path(keys_pong + keys1_beg,
+                                                keys_pong + keys2_beg,
+                                                keys1_end - keys1_beg,
+                                                keys2_end - keys2_beg,
+                                                partition_at,
+                                                compare_op);
+
+
+        merge_partitions[partition_idx] = keys1_beg + partition_diag;
+      }
+    }
+  };    // struct PartitionAgent
+
+  template <class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp,
+            class MERGE_ITEMS>
+  struct MergeAgent
+  {
+    typedef typename iterator_traits<KeysIt>::value_type  key_type;
+    typedef typename iterator_traits<ItemsIt>::value_type item_type;
+
+    typedef KeysIt     KeysOutputPongIt;
+    typedef ItemsIt    ItemsOutputPongIt;
+    typedef key_type*  KeysOutputPingIt;
+    typedef item_type* ItemsOutputPingIt;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,key_type>::type
+    {
+      typedef Tuning<Arch,key_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt>::type     KeysLoadPingIt;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type    ItemsLoadPingIt;
+      typedef typename core::LoadIterator<PtxPlan, key_type*>::type  KeysLoadPongIt;
+      typedef typename core::LoadIterator<PtxPlan, item_type*>::type ItemsLoadPongIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadPingIt>::type  BlockLoadKeysPing;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadPingIt>::type BlockLoadItemsPing;
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadPongIt>::type  BlockLoadKeysPong;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadPongIt>::type BlockLoadItemsPong;
+
+      typedef typename core::BlockStore<PtxPlan, KeysOutputPongIt>::type  BlockStoreKeysPong;
+      typedef typename core::BlockStore<PtxPlan, ItemsOutputPongIt>::type BlockStoreItemsPong;
+      typedef typename core::BlockStore<PtxPlan, KeysOutputPingIt>::type  BlockStoreKeysPing;
+      typedef typename core::BlockStore<PtxPlan, ItemsOutputPingIt>::type BlockStoreItemsPing;
+
+      // gather required temporary storage in a union
+      //
+      union TempStorage
+      {
+        typename BlockLoadKeysPing::TempStorage  load_keys_ping;
+        typename BlockLoadItemsPing::TempStorage load_items_ping;
+        typename BlockLoadKeysPong::TempStorage  load_keys_pong;
+        typename BlockLoadItemsPong::TempStorage load_items_pong;
+
+        typename BlockStoreKeysPing::TempStorage  store_keys_ping;
+        typename BlockStoreItemsPing::TempStorage store_items_ping;
+        typename BlockStoreKeysPong::TempStorage  store_keys_pong;
+        typename BlockStoreItemsPong::TempStorage store_items_pong;
+
+        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadPingIt  KeysLoadPingIt;
+    typedef typename ptx_plan::ItemsLoadPingIt ItemsLoadPingIt;
+    typedef typename ptx_plan::KeysLoadPongIt  KeysLoadPongIt;
+    typedef typename ptx_plan::ItemsLoadPongIt ItemsLoadPongIt;
+
+    typedef typename ptx_plan::BlockLoadKeysPing  BlockLoadKeysPing;
+    typedef typename ptx_plan::BlockLoadItemsPing BlockLoadItemsPing;
+    typedef typename ptx_plan::BlockLoadKeysPong  BlockLoadKeysPong;
+    typedef typename ptx_plan::BlockLoadItemsPong BlockLoadItemsPong;
+
+    typedef typename ptx_plan::BlockStoreKeysPing  BlockStoreKeysPing;
+    typedef typename ptx_plan::BlockStoreItemsPing BlockStoreItemsPing;
+    typedef typename ptx_plan::BlockStoreKeysPong  BlockStoreKeysPong;
+    typedef typename ptx_plan::BlockStoreItemsPong BlockStoreItemsPong;
+
+    typedef typename ptx_plan::TempStorage     TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      bool            ping;
+      TempStorage&    storage;
+
+      KeysLoadPingIt  keys_in_ping;
+      ItemsLoadPingIt items_in_ping;
+      KeysLoadPongIt  keys_in_pong;
+      ItemsLoadPongIt items_in_pong;
+
+      Size            keys_count;
+
+      KeysOutputPongIt  keys_out_pong;
+      ItemsOutputPongIt items_out_pong;
+      KeysOutputPingIt  keys_out_ping;
+      ItemsOutputPingIt items_out_ping;
+
+      CompareOp       compare_op;
+      Size*           merge_partitions;
+      Size            coop;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE, class T, class It1, class It2>
+      THRUST_DEVICE_FUNCTION void
+      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+                  It1 input1,
+                  It2 input2,
+                  int count1,
+                  int count2)
+      {
+        if (IS_FULL_TILE)
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
+          }
+        }
+        else
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1 + count2)
+            {
+              output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
+            }
+          }
+        }
+      }
+
+      template <class T, class It>
+      THRUST_DEVICE_FUNCTION void
+      reg_to_shared(It output,
+                    T (&input)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+          output[idx] = input[ITEM];
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(int  tid,
+                   Size tile_idx,
+                   Size tile_base,
+                   int  count)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        Size partition_beg = merge_partitions[tile_idx + 0];
+        Size partition_end = merge_partitions[tile_idx + 1];
+
+        Size list = ~(coop - 1) & tile_idx;
+        Size start = ITEMS_PER_TILE * list;
+        Size size  = ITEMS_PER_TILE * (coop >> 1);
+
+        Size diag   = ITEMS_PER_TILE * tile_idx - start;
+
+        Size keys1_beg = partition_beg;
+        Size keys1_end = partition_end;
+        Size keys2_beg = min<Size>(keys_count, 2 * start + size + diag - partition_beg);
+        Size keys2_end = min<Size>(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end);
+
+        if (coop - 1 == ((coop - 1) & tile_idx))
+        {
+          keys1_end = min(keys_count, start + size);
+          keys2_end = min(keys_count, start + size * 2);
+        }
+
+        // number of keys per tile
+        //
+        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+        // load keys1 & keys2
+        key_type keys_loc[ITEMS_PER_THREAD];
+        if (ping)
+        {
+          gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                    keys_in_ping + keys1_beg,
+                                    keys_in_ping + keys2_beg,
+                                    num_keys1,
+                                    num_keys2);
+        }
+        else
+        {
+          gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                    keys_in_pong + keys1_beg,
+                                    keys_in_pong + keys2_beg,
+                                    num_keys1,
+                                    num_keys2);
+        }
+        reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+        // preload items into registers already
+        //
+        item_type items_loc[ITEMS_PER_THREAD];
+        if (MERGE_ITEMS::value)
+        {
+          if (ping)
+          {
+            gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                      items_in_ping + keys1_beg,
+                                      items_in_ping + keys2_beg,
+                                      num_keys1,
+                                      num_keys2);
+          }
+          else
+          {
+            gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                      items_in_pong + keys1_beg,
+                                      items_in_pong + keys2_beg,
+                                      num_keys1,
+                                      num_keys2);
+          }
+        }
+
+        sync_threadblock();
+
+        // use binary search in shared memory
+        // to find merge path for each of thread
+        // we can use int type here, because the number of
+        // items in shared memory is limited
+        //
+        int diag0_loc = min<Size>(num_keys1 + num_keys2,
+                                  ITEMS_PER_THREAD * tid);
+
+        int keys1_beg_loc = merge_path(&storage.keys_shared[0],
+                                       &storage.keys_shared[num_keys1],
+                                       num_keys1,
+                                       num_keys2,
+                                       diag0_loc,
+                                       compare_op);
+        int keys1_end_loc = num_keys1;
+        int keys2_beg_loc = diag0_loc - keys1_beg_loc;
+        int keys2_end_loc = num_keys2;
+
+        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+        // perform serial merge
+        //
+        int indices[ITEMS_PER_THREAD];
+
+        serial_merge(&storage.keys_shared[0],
+                     keys1_beg_loc,
+                     keys2_beg_loc + num_keys1,
+                     num_keys1_loc,
+                     num_keys2_loc,
+                     keys_loc,
+                     indices,
+                     compare_op);
+
+        sync_threadblock();
+
+        // write keys
+        //
+        if (ping)
+        {
+          if (IS_FULL_TILE)
+          {
+            BlockStoreKeysPing(storage.store_keys_ping)
+                .Store(keys_out_ping + tile_base, keys_loc);
+          }
+          else
+          {
+            BlockStoreKeysPing(storage.store_keys_ping)
+                .Store(keys_out_ping + tile_base, keys_loc, num_keys1 + num_keys2);
+          }
+        }
+        else
+        {
+          if (IS_FULL_TILE)
+          {
+            BlockStoreKeysPong(storage.store_keys_pong)
+                .Store(keys_out_pong + tile_base, keys_loc);
+          }
+          else
+          {
+            BlockStoreKeysPong(storage.store_keys_pong)
+                .Store(keys_out_pong + tile_base, keys_loc, num_keys1 + num_keys2);
+          }
+        }
+
+        // if items are provided, merge them
+        if (MERGE_ITEMS::value)
+        {
+          sync_threadblock();
+
+          reg_to_shared(&storage.items_shared[0], items_loc);
+
+          sync_threadblock();
+
+          // gather items from shared mem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+          }
+
+          sync_threadblock();
+
+          // write from reg to gmem
+          //
+          if (ping)
+          {
+            if (IS_FULL_TILE)
+            {
+              BlockStoreItemsPing(storage.store_items_ping)
+                  .Store(items_out_ping + tile_base, items_loc);
+            }
+            else
+            {
+              BlockStoreItemsPing(storage.store_items_ping)
+                  .Store(items_out_ping + tile_base, items_loc, count);
+            }
+          }
+          else
+          {
+            if (IS_FULL_TILE)
+            {
+              BlockStoreItemsPong(storage.store_items_pong)
+                  .Store(items_out_pong + tile_base, items_loc);
+            }
+            else
+            {
+              BlockStoreItemsPong(storage.store_items_pong)
+                  .Store(items_out_pong + tile_base, items_loc, count);
+            }
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(bool              ping_,
+           TempStorage&      storage_,
+           KeysLoadPingIt    keys_in_ping_,
+           ItemsLoadPingIt   items_in_ping_,
+           KeysLoadPongIt    keys_in_pong_,
+           ItemsLoadPongIt   items_in_pong_,
+           Size              keys_count_,
+           KeysOutputPingIt  keys_out_ping_,
+           ItemsOutputPingIt items_out_ping_,
+           KeysOutputPongIt  keys_out_pong_,
+           ItemsOutputPongIt items_out_pong_,
+           CompareOp         compare_op_,
+           Size*             merge_partitions_,
+           Size              coop_)
+          : ping(ping_),
+            storage(storage_),
+            keys_in_ping(keys_in_ping_),
+            items_in_ping(items_in_ping_),
+            keys_in_pong(keys_in_pong_),
+            items_in_pong(items_in_pong_),
+            keys_count(keys_count_),
+            keys_out_pong(keys_out_pong_),
+            items_out_pong(items_out_pong_),
+            keys_out_ping(keys_out_ping_),
+            items_out_ping(items_out_ping_),
+            compare_op(compare_op_),
+            merge_partitions(merge_partitions_),
+            coop(coop_)
+      {
+        // XXX with 8.5 chaging type to Size (or long long) results in error!
+        int  tile_idx      = blockIdx.x;
+        Size num_tiles     = gridDim.x;
+        Size tile_base     = Size(tile_idx) * ITEMS_PER_TILE;
+        int tid           = threadIdx.x;
+        int items_in_tile = static_cast<int>(min((Size)ITEMS_PER_TILE,
+                                                 keys_count - tile_base));
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<true>(tid,
+                             tile_idx,
+                             tile_base,
+                             ITEMS_PER_TILE);
+        }
+        else
+        {
+          consume_tile<false>(tid,
+                              tile_idx,
+                              tile_base,
+                              items_in_tile);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(bool       ping,
+                       KeysIt     keys_ping,
+                       ItemsIt    items_ping,
+                       Size       keys_count,
+                       key_type*  keys_pong,
+                       item_type* items_pong,
+                       CompareOp  compare_op,
+                       Size*      merge_partitions,
+                       Size       coop,
+                       char*      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(ping,
+           storage,
+           core::make_load_iterator(ptx_plan(), keys_ping),
+           core::make_load_iterator(ptx_plan(), items_ping),
+           core::make_load_iterator(ptx_plan(), keys_pong),
+           core::make_load_iterator(ptx_plan(), items_pong),
+           keys_count,
+           keys_pong,
+           items_pong,
+           keys_ping,
+           items_ping,
+           compare_op,
+           merge_partitions,
+           coop);
+    }
+  };    // struct MergeAgent;
+
+  /////////////////////////
+
+  template <class SORT_ITEMS,
+            class STABLE,
+            class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void*        d_temp_storage,
+            size_t&      temp_storage_bytes,
+            KeysIt       keys,
+            ItemsIt      items,
+            Size         keys_count,
+            CompareOp    compare_op,
+            cudaStream_t stream,
+            bool         debug_sync)
+  {
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef typename iterator_traits<KeysIt>::value_type  key_type;
+    typedef typename iterator_traits<ItemsIt>::value_type item_type;
+
+    typedef core::AgentLauncher<
+        BlockSortAgent<KeysIt,
+                       ItemsIt,
+                       Size,
+                       CompareOp,
+                       SORT_ITEMS,
+                       STABLE> >
+        block_sort_agent;
+
+    typedef core::AgentLauncher<PartitionAgent<KeysIt, Size, CompareOp> >
+        partition_agent;
+
+    typedef core::AgentLauncher<
+        MergeAgent<KeysIt,
+                   ItemsIt,
+                   Size,
+                   CompareOp,
+                   SORT_ITEMS> >
+        merge_agent;
+
+    cudaError_t status = cudaSuccess;
+
+    if (keys_count == 0)
+      return status;
+
+    typename core::get_plan<partition_agent>::type partition_plan =
+        partition_agent::get_plan();
+
+    typename core::get_plan<merge_agent>::type merge_plan =
+        merge_agent::get_plan(stream);
+
+    AgentPlan block_sort_plan = merge_plan;
+
+    int tile_size = merge_plan.items_per_tile;
+    Size num_tiles = (keys_count + tile_size - 1) / tile_size;
+
+    size_t temp_storage1 = (1 + num_tiles) * sizeof(Size);
+    size_t temp_storage2 = keys_count * sizeof(key_type);
+    size_t temp_storage3 = keys_count * sizeof(item_type) * SORT_ITEMS::value;
+    size_t temp_storage4 = core::vshmem_size(max(block_sort_plan.shared_memory_size,
+                                                 merge_plan.shared_memory_size),
+                                             num_tiles);
+
+    void*  allocations[4]      = {NULL, NULL, NULL, NULL};
+    size_t allocation_sizes[4] = {temp_storage1, temp_storage2, temp_storage3, temp_storage4};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_bytes,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    };
+
+    int num_passes = static_cast<int>(thrust::detail::log2_ri(num_tiles));
+    bool ping = !(1 & num_passes);
+
+    Size*      merge_partitions = (Size*)allocations[0];
+    key_type*  keys_buffer      = (key_type*)allocations[1];
+    item_type* items_buffer     = (item_type*)allocations[2];
+
+    char* vshmem_ptr = temp_storage4 > 0 ? (char*)allocations[3] : NULL;
+
+
+    block_sort_agent(block_sort_plan, keys_count, stream, vshmem_ptr, "block_sort_agent", debug_sync)
+        .launch(ping, keys, items, keys_count, keys_buffer, items_buffer, compare_op);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    size_t num_partitions = num_tiles + 1;
+
+    partition_agent pa(partition_plan, num_partitions, stream, "partition_agent", debug_sync);
+    merge_agent     ma(merge_plan, keys_count, stream, vshmem_ptr, "merge_agent", debug_sync);
+
+    for (int pass = 0; pass < num_passes; ++pass, ping = !ping)
+    {
+      Size coop = Size(2) << pass;
+
+      pa.launch(ping,
+                keys,
+                keys_buffer,
+                keys_count,
+                num_partitions,
+                merge_partitions,
+                compare_op,
+                coop,
+                merge_plan.items_per_tile);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+
+      ma.launch(ping,
+                keys,
+                items,
+                keys_count,
+                keys_buffer,
+                items_buffer,
+                compare_op,
+                merge_partitions,
+                coop);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    return status;
+  }
+
+  template <typename SORT_ITEMS,
+            typename STABLE,
+            typename Derived,
+            typename KeysIt,
+            typename ItemsIt,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  void merge_sort(execution_policy<Derived>& policy,
+                  KeysIt                     keys_first,
+                  KeysIt                     keys_last,
+                  ItemsIt                    items_first,
+                  CompareOp                  compare_op)
+
+  {
+    typedef typename iterator_traits<KeysIt>::difference_type size_type;
+
+    size_type count = static_cast<size_type>(thrust::distance(keys_first, keys_last));
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = doit_step<SORT_ITEMS, STABLE>(NULL,
+                                           storage_size,
+                                           keys_first,
+                                           items_first,
+                                           count,
+                                           compare_op,
+                                           stream,
+                                           debug_sync);
+    cuda_cub::throw_on_error(status, "merge_sort: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_step<SORT_ITEMS, STABLE>(ptr,
+                                           storage_size,
+                                           keys_first,
+                                           items_first,
+                                           count,
+                                           compare_op,
+                                           stream,
+                                           debug_sync);
+    cuda_cub::throw_on_error(status, "merge_sort: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "merge_sort: failed to synchronize");
+  }
+}    // namespace __merge_sort
+
+namespace __radix_sort {
+
+  template <class SORT_ITEMS, class Comparator>
+  struct dispatch;
+
+  // sort keys in ascending order
+  template <class K>
+  struct dispatch<thrust::detail::false_type, thrust::less<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& /*items_buffer*/,
+         Size                     count,
+         cudaStream_t             stream,
+         bool                     debug_sync)
+    {
+      return cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                            temp_storage_bytes,
+                                            keys_buffer,
+                                            static_cast<int>(count),
+                                            0,
+                                            static_cast<int>(sizeof(Key) * 8),
+                                            stream,
+                                            debug_sync);
+    }
+  }; // struct dispatch -- sort keys in ascending order;
+
+  // sort keys in descending order
+  template <class K>
+  struct dispatch<thrust::detail::false_type, thrust::greater<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& /*items_buffer*/,
+         Size                     count,
+         cudaStream_t             stream,
+         bool                     debug_sync)
+    {
+      return cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                                      temp_storage_bytes,
+                                                      keys_buffer,
+                                                      static_cast<int>(count),
+                                                      0,
+                                                      static_cast<int>(sizeof(Key) * 8),
+                                                      stream,
+                                                      debug_sync);
+    }
+  }; // struct dispatch -- sort keys in descending order;
+
+  // sort pairs in ascending order
+  template <class K>
+  struct dispatch<thrust::detail::true_type, thrust::less<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& items_buffer,
+         Size                     count,
+         cudaStream_t             stream,
+         bool                     debug_sync)
+    {
+      return cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                             temp_storage_bytes,
+                                             keys_buffer,
+                                             items_buffer,
+                                             static_cast<int>(count),
+                                             0,
+                                             static_cast<int>(sizeof(Key) * 8),
+                                             stream,
+                                             debug_sync);
+    }
+  }; // struct dispatch -- sort pairs in ascending order;
+
+  // sort pairs in descending order
+  template <class K>
+  struct dispatch<thrust::detail::true_type, thrust::greater<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& items_buffer,
+         Size                     count,
+         cudaStream_t             stream,
+         bool                     debug_sync)
+    {
+      return cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       keys_buffer,
+                                                       items_buffer,
+                                                       static_cast<int>(count),
+                                                       0,
+                                                       static_cast<int>(sizeof(Key) * 8),
+                                                       stream,
+                                                       debug_sync);
+    }
+  }; // struct dispatch -- sort pairs in descending order;
+
+  template <typename SORT_ITEMS,
+            typename Derived,
+            typename Key,
+            typename Item,
+            typename Size,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  void radix_sort(execution_policy<Derived>& policy,
+                  Key*                       keys,
+                  Item*                      items,
+                  Size                       count,
+                  CompareOp)
+  {
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cub::DoubleBuffer<Key>  keys_buffer(keys, NULL);
+    cub::DoubleBuffer<Item> items_buffer(items, NULL);
+
+    Size keys_count = count;
+    Size items_count = SORT_ITEMS::value ? count : 0;
+
+    cudaError_t status;
+
+    status = dispatch<SORT_ITEMS, CompareOp>::doit(NULL,
+                                                   temp_storage_bytes,
+                                                   keys_buffer,
+                                                   items_buffer,
+                                                   keys_count,
+                                                   stream,
+                                                   debug_sync);
+    cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step");
+
+    size_t keys_temp_storage  = core::align_to(sizeof(Key) * keys_count, 128);
+    size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128);
+
+    size_t storage_size = keys_temp_storage
+                        + items_temp_storage
+                        + temp_storage_bytes;
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+
+    keys_buffer.d_buffers[1]  = thrust::detail::aligned_reinterpret_cast<Key*>(
+      tmp.data().get()
+    );
+    items_buffer.d_buffers[1] = thrust::detail::aligned_reinterpret_cast<Item*>(
+      tmp.data().get() + keys_temp_storage
+    );
+    void *ptr = static_cast<void*>(
+      tmp.data().get() + keys_temp_storage + items_temp_storage
+    );
+
+    status = dispatch<SORT_ITEMS, CompareOp>::doit(ptr,
+                                                   temp_storage_bytes,
+                                                   keys_buffer,
+                                                   items_buffer,
+                                                   keys_count,
+                                                   stream,
+                                                   debug_sync);
+    cuda_cub::throw_on_error(status, "radix_sort: failed on 2nd step");
+
+    if (keys_buffer.selector != 0)
+    {
+      Key* temp_ptr = reinterpret_cast<Key*>(keys_buffer.d_buffers[1]);
+      cuda_cub::copy_n(policy, temp_ptr, keys_count, keys);
+    }
+    if (SORT_ITEMS::value && items_buffer.selector != 0)
+    {
+      Item* temp_ptr = reinterpret_cast<Item*>(items_buffer.d_buffers[1]);
+      cuda_cub::copy_n(policy, temp_ptr, items_count, items);
+    }
+  }
+}    // __radix_sort
+
+//---------------------------------------------------------------------
+// Smart sort picks at compile-time whether to dispatch radix or merge sort
+//---------------------------------------------------------------------
+
+namespace __smart_sort {
+
+  template <class Key, class CompareOp>
+  struct can_use_primitive_sort
+      : thrust::detail::and_<
+            thrust::detail::is_arithmetic<Key>,
+            thrust::detail::or_<
+                thrust::detail::is_same<CompareOp, thrust::less<Key> >,
+                thrust::detail::is_same<CompareOp, thrust::greater<Key> > > > {};
+
+  template <class Iterator, class CompareOp>
+  struct enable_if_primitive_sort
+      : thrust::detail::enable_if<
+            can_use_primitive_sort<typename iterator_value<Iterator>::type,
+                                   CompareOp>::value> {};
+
+  template <class Iterator, class CompareOp>
+  struct enable_if_comparison_sort
+      : thrust::detail::disable_if<
+            can_use_primitive_sort<typename iterator_value<Iterator>::type,
+                                   CompareOp>::value> {};
+
+
+  template <class SORT_ITEMS,
+            class STABLE,
+            class Policy,
+            class KeysIt,
+            class ItemsIt,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION typename enable_if_comparison_sort<KeysIt, CompareOp>::type
+  smart_sort(Policy&   policy,
+             KeysIt    keys_first,
+             KeysIt    keys_last,
+             ItemsIt   items_first,
+             CompareOp compare_op)
+  {
+    __merge_sort::merge_sort<SORT_ITEMS, STABLE>(policy,
+                                                 keys_first,
+                                                 keys_last,
+                                                 items_first,
+                                                 compare_op);
+
+  }
+
+  template <class SORT_ITEMS,
+            class STABLE,
+            class Policy,
+            class KeysIt,
+            class ItemsIt,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION typename enable_if_primitive_sort<KeysIt, CompareOp>::type
+  smart_sort(execution_policy<Policy>& policy,
+             KeysIt                    keys_first,
+             KeysIt                    keys_last,
+             ItemsIt                   items_first,
+             CompareOp                 compare_op)
+  {
+    // ensure sequences have trivial iterators
+    thrust::detail::trivial_sequence<KeysIt, Policy>
+        keys(policy, keys_first, keys_last);
+
+    if (SORT_ITEMS::value)
+    {
+      thrust::detail::trivial_sequence<ItemsIt, Policy>
+          values(policy, items_first, items_first + (keys_last - keys_first));
+
+      __radix_sort::radix_sort<SORT_ITEMS>(
+          policy,
+          thrust::raw_pointer_cast(&*keys.begin()),
+          thrust::raw_pointer_cast(&*values.begin()),
+          keys_last - keys_first,
+          compare_op);
+
+      if (!is_contiguous_iterator<ItemsIt>::value)
+      {
+        cuda_cub::copy(policy, values.begin(), values.end(), items_first);
+      }
+    }
+    else
+    {
+      __radix_sort::radix_sort<SORT_ITEMS>(
+          policy,
+          thrust::raw_pointer_cast(&*keys.begin()),
+          thrust::raw_pointer_cast(&*keys.begin()),
+          keys_last - keys_first,
+          compare_op);
+    }
+
+    // copy results back, if necessary
+    if (!is_contiguous_iterator<KeysIt>::value)
+    {
+      cuda_cub::copy(policy, keys.begin(), keys.end(), keys_first);
+    }
+  }
+}    // namespace __smart_sort
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived, class ItemsIt, class CompareOp>
+void __host__ __device__
+sort(execution_policy<Derived>& policy,
+     ItemsIt                    first,
+     ItemsIt                    last,
+     CompareOp                  compare_op)
+{
+  if (__THRUST_HAS_CUDART__)
+  {
+    typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+    __smart_sort::smart_sort<thrust::detail::false_type, thrust::detail::false_type>(
+        policy, first, last, (item_type*)NULL, compare_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);
+#endif
+  }
+}
+
+__thrust_exec_check_disable__
+template <class Derived, class ItemsIt, class CompareOp>
+void __host__ __device__
+stable_sort(execution_policy<Derived>& policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            CompareOp                  compare_op)
+{
+  if (__THRUST_HAS_CUDART__)
+  {
+    typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+    __smart_sort::smart_sort<thrust::detail::false_type, thrust::detail::true_type>(
+        policy, first, last, (item_type*)NULL, compare_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    thrust::stable_sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);
+#endif
+  }
+}
+
+__thrust_exec_check_disable__
+template <class Derived, class KeysIt, class ValuesIt, class CompareOp>
+void __host__ __device__
+sort_by_key(execution_policy<Derived>& policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values,
+            CompareOp                  compare_op)
+{
+  if (__THRUST_HAS_CUDART__)
+  {
+    __smart_sort::smart_sort<thrust::detail::true_type, thrust::detail::false_type>(
+        policy, keys_first, keys_last, values, compare_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    thrust::sort_by_key(
+        cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op);
+#endif
+  }
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt,
+          class ValuesIt,
+          class CompareOp>
+void __host__ __device__
+stable_sort_by_key(execution_policy<Derived> &policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values,
+            CompareOp                  compare_op)
+{
+  if (__THRUST_HAS_CUDART__)
+  {
+    __smart_sort::smart_sort<thrust::detail::true_type, thrust::detail::true_type>(
+        policy, keys_first, keys_last, values, compare_op);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    thrust::stable_sort_by_key(
+        cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op);
+#endif
+  }
+}
+
+// API with default comparator
+
+template <class Derived, class ItemsIt>
+void __host__ __device__
+sort(execution_policy<Derived>& policy,
+     ItemsIt                    first,
+     ItemsIt                    last)
+{
+  typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+  cuda_cub::sort(policy, first, last, less<item_type>());
+}
+
+template <class Derived, class ItemsIt>
+void __host__ __device__
+stable_sort(execution_policy<Derived>& policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+  cuda_cub::stable_sort(policy, first, last, less<item_type>());
+}
+
+template <class Derived, class KeysIt, class ValuesIt>
+void __host__ __device__
+sort_by_key(execution_policy<Derived>& policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values)
+{
+  typedef typename thrust::iterator_value<KeysIt>::type key_type;
+  cuda_cub::sort_by_key(policy, keys_first, keys_last, values, less<key_type>());
+}
+
+template <class Derived, class KeysIt, class ValuesIt>
+void __host__ __device__
+stable_sort_by_key(
+    execution_policy<Derived>& policy, KeysIt keys_first, KeysIt keys_last, ValuesIt values)
+{
+  typedef typename thrust::iterator_value<KeysIt>::type key_type;
+  cuda_cub::stable_sort_by_key(policy, keys_first, keys_last, values, less<key_type>());
+}
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/swap_ranges.h b/thrust/thrust/system/cuda/detail/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba3b47d9b9221e931ca5c3e49cdc04b67e199392
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/swap_ranges.h
@@ -0,0 +1,107 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/swap.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+namespace __swap_ranges {
+
+
+  template <class ItemsIt1, class ItemsIt2>
+  struct swap_f
+  {
+    ItemsIt1 items1;
+    ItemsIt2 items2;
+
+    typedef  typename iterator_traits<ItemsIt1>::value_type value1_type;
+    typedef  typename iterator_traits<ItemsIt2>::value_type value2_type;
+
+    THRUST_FUNCTION
+    swap_f(ItemsIt1 items1_, ItemsIt2 items2_)
+        : items1(items1_), items2(items2_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      value1_type item1 = items1[idx];
+      value2_type item2 = items2[idx];
+      // XXX thrust::swap is buggy
+      // if reference_type of ItemIt1/ItemsIt2
+      // is a proxy reference, then KABOOM!
+      // to avoid this, just copy the value first before swap
+      // *todo* specialize on real & proxy references
+      using thrust::swap;
+      swap(item1, item2);
+      items1[idx] = item1;
+      items2[idx] = item2;
+    }
+  };
+}    // namespace __swap_ranges
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2>
+ItemsIt2 __host__ __device__
+swap_ranges(execution_policy<Derived> &policy,
+            ItemsIt1                   first1,
+            ItemsIt1                   last1,
+            ItemsIt2                   first2)
+{
+  typedef typename iterator_traits<ItemsIt1>::difference_type size_type;
+
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+
+  cuda_cub::parallel_for(policy,
+                         __swap_ranges::swap_f<ItemsIt1,
+                                               ItemsIt2>(first1, first2),
+                         num_items);
+
+  cuda_cub::throw_on_error(
+    cuda_cub::synchronize(policy)
+  , "swap_ranges: failed to synchronize"
+  );
+
+  return first2 + num_items;
+}
+
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/tabulate.h b/thrust/thrust/system/cuda/detail/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..70b2720d9a9eb00d8d68f90d2e34fa0623572fb7
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/tabulate.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/distance.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+namespace __tabulate {
+
+  template <class Iterator, class TabulateOp, class Size>
+  struct functor
+  {
+    Iterator items;
+    TabulateOp op;
+
+    __host__ __device__
+    functor(Iterator items_, TabulateOp op_)
+        : items(items_), op(op_) {}
+
+    void __device__ operator()(Size idx)
+    {
+      items[idx] = op(idx);
+    }
+  };    // struct functor
+
+}    // namespace __tabulate
+
+template <class Derived,
+          class Iterator,
+          class TabulateOp>
+void __host__ __device__
+tabulate(execution_policy<Derived>& policy,
+         Iterator                   first,
+         Iterator                   last,
+         TabulateOp                 tabulate_op)
+{
+  typedef typename iterator_traits<Iterator>::difference_type size_type;
+
+  size_type count = thrust::distance(first, last);
+
+  typedef __tabulate::functor<Iterator, TabulateOp, size_type> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, tabulate_op),
+                         count);
+
+  cuda_cub::throw_on_error(
+    cuda_cub::synchronize(policy)
+  , "tabulate: failed to synchronize"
+  );
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/temporary_buffer.h b/thrust/thrust/system/cuda/detail/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b5276141625d61567d3adb06a363682b4df968b
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/temporary_buffer.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2016 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special temporary buffer functions
+
diff --git a/thrust/thrust/system/cuda/detail/terminate.h b/thrust/thrust/system/cuda/detail/terminate.h
new file mode 100644
index 0000000000000000000000000000000000000000..d14bed2ab3d4db55750a92b76cba5daaba38a684
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/terminate.h
@@ -0,0 +1,63 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cstdio>
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+
+inline __device__
+void terminate()
+{
+  thrust::cuda_cub::terminate();
+}
+
+
+inline __host__ __device__
+void terminate_with_message(const char* message)
+{
+  printf("%s\n", message);
+  thrust::cuda_cub::terminate();
+}
+
+
+} // end detail
+} // end cuda
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..053fe9095a9bba47a05cf8b21c4a1954107685aa
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/transform.h
@@ -0,0 +1,426 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+
+namespace __transform {
+
+  struct no_stencil_tag
+  {
+  };
+
+  struct always_true_predicate
+  {
+    template <class T>
+    bool THRUST_DEVICE_FUNCTION operator()(T const &) const
+    {
+      return true;
+    }
+  };
+
+  template <class InputIt,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  struct unary_transform_f
+  {
+    InputIt     input;
+    OutputIt    output;
+    StencilIt   stencil;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    unary_transform_f(InputIt     input_,
+                      OutputIt    output_,
+                      StencilIt   stencil_,
+                      TransformOp op_,
+                      Predicate   pred_)
+        : input(input_),
+          output(output_),
+          stencil(stencil_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(stencil[idx])))
+        output[idx] = op(raw_reference_cast(input[idx]));
+    }
+  }; // struct unary_transform_stencil_f
+
+  template <class InputIt,
+            class OutputIt,
+            class TransformOp,
+            class Predicate>
+  struct unary_transform_f<InputIt,
+                           OutputIt,
+                           no_stencil_tag,
+                           TransformOp,
+                           Predicate>
+  {
+    InputIt     input;
+    OutputIt    output;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    unary_transform_f(InputIt        input_,
+                      OutputIt       output_,
+                      no_stencil_tag,
+                      TransformOp    op_,
+                      Predicate      pred_)
+        : input(input_), output(output_), op(op_), pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(input[idx])))
+        output[idx] = op(raw_reference_cast(input[idx]));
+    }
+  }; // struct unary_transform_f
+
+  template <class InputIt1,
+            class InputIt2,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  struct binary_transform_f
+  {
+    InputIt1    input1;
+    InputIt2    input2;
+    OutputIt    output;
+    StencilIt   stencil;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    binary_transform_f(InputIt1    input1_,
+                       InputIt2    input2_,
+                       OutputIt    output_,
+                       StencilIt   stencil_,
+                       TransformOp op_,
+                       Predicate   pred_)
+        : input1(input1_),
+          input2(input2_),
+          output(output_),
+          stencil(stencil_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(stencil[idx])))
+        output[idx] = op(raw_reference_cast(input1[idx]),
+                         raw_reference_cast(input2[idx]));
+    }
+  }; // struct binary_transform_stencil_f
+
+  template <class InputIt1,
+            class InputIt2,
+            class OutputIt,
+            class TransformOp,
+            class Predicate>
+  struct binary_transform_f<InputIt1,
+                            InputIt2,
+                            OutputIt,
+                            no_stencil_tag,
+                            TransformOp,
+                            Predicate>
+  {
+    InputIt1    input1;
+    InputIt2    input2;
+    OutputIt    output;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    binary_transform_f(InputIt1       input1_,
+                       InputIt2       input2_,
+                       OutputIt       output_,
+                       no_stencil_tag ,
+                       TransformOp    op_,
+                       Predicate      pred_)
+        : input1(input1_),
+          input2(input2_),
+          output(output_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(input1[idx])))
+        output[idx] = op(raw_reference_cast(input1[idx]),
+                         raw_reference_cast(input2[idx]));
+    }
+  }; // struct binary_transform_f
+
+  template <class Policy,
+            class InputIt,
+            class Size,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  OutputIt THRUST_FUNCTION
+  unary(Policy &     policy,
+        InputIt      items,
+        OutputIt     result,
+        Size         num_items,
+        StencilIt    stencil,
+        TransformOp  transform_op,
+        Predicate    predicate)
+  {
+    if (num_items == 0)
+      return result;
+
+    typedef unary_transform_f<InputIt,
+                              OutputIt,
+                              StencilIt,
+                              TransformOp,
+                              Predicate>
+        unary_transform_t;
+
+    cuda_cub::parallel_for(policy,
+                           unary_transform_t(items,
+                                             result,
+                                             stencil,
+                                             transform_op,
+                                             predicate),
+                           num_items);
+
+    cuda_cub::throw_on_error(
+      cuda_cub::synchronize(policy)
+    , "transform: failed to synchronize"
+    );
+
+    return result + num_items;
+  }
+
+  template <class Policy,
+            class InputIt1,
+            class InputIt2,
+            class Size,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  OutputIt THRUST_FUNCTION
+  binary(Policy &    policy,
+         InputIt1    items1,
+         InputIt2    items2,
+         OutputIt    result,
+         Size        num_items,
+         StencilIt   stencil,
+         TransformOp transform_op,
+         Predicate   predicate)
+  {
+    if (num_items == 0)
+      return result;
+
+    typedef binary_transform_f<InputIt1,
+                               InputIt2,
+                               OutputIt,
+                               StencilIt,
+                               TransformOp,
+                               Predicate>
+        binary_transform_t;
+
+    cuda_cub::parallel_for(policy,
+                           binary_transform_t(items1,
+                                              items2,
+                                              result,
+                                              stencil,
+                                              transform_op,
+                                              predicate),
+                           num_items);
+
+    cuda_cub::throw_on_error(
+      cuda_cub::synchronize(policy)
+    , "transform: failed to synchronize"
+    );
+
+    return result + num_items;
+  }
+
+}    // namespace __transform
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+//-------------------------
+//  one input data stream
+//-------------------------
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class StencilInputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             StencilInputIt             stencil,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  return __transform::unary(policy,
+                            first,
+                            result,
+                            num_items,
+                            stencil,
+                            transform_op,
+                            predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  return cuda_cub::transform_if(policy,
+                                first,
+                                last,
+                                __transform::no_stencil_tag(),
+                                result,
+                                transform_op,
+                                predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp>
+OutputIt THRUST_FUNCTION
+transform(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          OutputIt                   result,
+          TransformOp                transform_op)
+{
+  return cuda_cub::transform_if(policy,
+                                first,
+                                last,
+                                result,
+                                transform_op,
+                                __transform::always_true_predicate());
+} // func transform
+
+//-------------------------
+// two input data streams
+//-------------------------
+
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class StencilInputIt,
+          class OutputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt1                   first1,
+             InputIt1                   last1,
+             InputIt2                   first2,
+             StencilInputIt             stencil,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  typedef typename iterator_traits<InputIt1>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+  return __transform::binary(policy,
+                             first1,
+                             first2,
+                             result,
+                             num_items,
+                             stencil,
+                             transform_op,
+                             predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class OutputIt,
+          class TransformOp>
+OutputIt THRUST_FUNCTION
+transform(execution_policy<Derived> &policy,
+          InputIt1                   first1,
+          InputIt1                   last1,
+          InputIt2                   first2,
+          OutputIt                   result,
+          TransformOp                transform_op)
+{
+  return cuda_cub::transform_if(policy,
+                                first1,
+                                last1,
+                                first2,
+                                __transform::no_stencil_tag(),
+                                result,
+                                transform_op,
+                                __transform::always_true_predicate());
+} // func transform
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/transform_reduce.h b/thrust/thrust/system/cuda/detail/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9a193f242433cbe360ca7482ab0a65f0078c38f
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/transform_reduce.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class TransformOp,
+          class T,
+          class ReduceOp>
+T __host__ __device__
+transform_reduce(execution_policy<Derived> &policy,
+                 InputIt                    first,
+                 InputIt                    last,
+                 TransformOp                transform_op,
+                 T                          init,
+                 ReduceOp                   reduce_op)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<T,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            transformed_iterator_t(first, transform_op),
+                            num_items,
+                            init,
+                            reduce_op);
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/transform_scan.h b/thrust/thrust/system/cuda/detail/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbf70b0a748803f61fac623482c349feaf0be86c
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/transform_scan.h
@@ -0,0 +1,111 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/scan.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class ScanOp>
+OutputIt __host__ __device__
+transform_inclusive_scan(execution_policy<Derived> &policy,
+                         InputIt                    first,
+                         InputIt                    last,
+                         OutputIt                   result,
+                         TransformOp                transform_op,
+                         ScanOp                     scan_op)
+{
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using input_type = typename thrust::iterator_value<InputIt>::type;
+#if THRUST_CPP_DIALECT < 2017
+  using result_type = typename std::result_of<TransformOp(input_type)>::type;
+#else
+  using result_type = std::invoke_result_t<TransformOp, input_type>;
+#endif
+
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<result_type,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::inclusive_scan_n(policy,
+                                 transformed_iterator_t(first, transform_op),
+                                 num_items,
+                                 result,
+                                 scan_op);
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class InitialValueType,
+          class ScanOp>
+OutputIt __host__ __device__
+transform_exclusive_scan(execution_policy<Derived> &policy,
+                         InputIt                    first,
+                         InputIt                    last,
+                         OutputIt                   result,
+                         TransformOp                transform_op,
+                         InitialValueType           init,
+                         ScanOp                     scan_op)
+{
+  // Use the initial value type per https://wg21.link/P0571
+  using result_type = InitialValueType;
+
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<result_type,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::exclusive_scan_n(policy,
+                                 transformed_iterator_t(first, transform_op),
+                                 num_items,
+                                 result,
+                                 init,
+                                 scan_op);
+}
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/uninitialized_copy.h b/thrust/thrust/system/cuda/detail/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d916e33ba2a09662839b0ef97277c5e1a671adb
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/uninitialized_copy.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+namespace __uninitialized_copy {
+
+  template <class InputIt, class OutputIt>
+  struct functor
+  {
+    InputIt  input;
+    OutputIt output;
+
+    typedef typename iterator_traits<InputIt>::value_type  InputType;
+    typedef typename iterator_traits<OutputIt>::value_type OutputType;
+
+    THRUST_FUNCTION
+    functor(InputIt input_, OutputIt output_)
+        : input(input_), output(output_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      InputType const &in  = raw_reference_cast(input[idx]);
+      OutputType &     out = raw_reference_cast(output[idx]);
+
+#if defined(__CUDA__) && defined(__clang__)
+      // XXX unsafe, but clang is seemngly unable to call in-place new
+      out = in;
+#else
+      ::new (static_cast<void *>(&out)) OutputType(in);
+#endif
+    }
+  };    // struct functor
+
+}    // namespace __uninitialized_copy
+
+template <class Derived,
+          class InputIt,
+          class Size,
+          class OutputIt>
+OutputIt __host__ __device__
+uninitialized_copy_n(execution_policy<Derived> &policy,
+                     InputIt                    first,
+                     Size                       count,
+                     OutputIt                   result)
+{
+  typedef __uninitialized_copy::functor<InputIt,OutputIt> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, result),
+                         count);
+
+  cuda_cub::throw_on_error(
+    cuda_cub::synchronize(policy)
+  , "uninitialized_copy_n: failed to synchronize"
+  );
+
+  return result + count;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+uninitialized_copy(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   OutputIt                   result)
+{
+  return cuda_cub::uninitialized_copy_n(policy,
+                                        first,
+                                        thrust::distance(first, last),
+                                        result);
+}
+
+}    // namespace cuda_
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/uninitialized_fill.h b/thrust/thrust/system/cuda/detail/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8f5fa80973dbf4e52fdab3fed18b6517af6fced
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/uninitialized_fill.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+namespace __uninitialized_fill {
+
+  template <class Iterator, class T>
+  struct functor
+  {
+    Iterator  items;
+    T         value;
+
+    typedef typename iterator_traits<Iterator>::value_type value_type;
+
+    THRUST_FUNCTION
+    functor(Iterator items_, T const& value_)
+        : items(items_), value(value_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      value_type& out = raw_reference_cast(items[idx]);
+
+#if defined(__CUDA__) && defined(__clang__)
+      // XXX unsafe. cuda-clang is seemingly unable to call ::new in device code
+      out = value;
+#else
+      ::new (static_cast<void *>(&out)) value_type(value);
+#endif
+    }
+  };    // struct functor
+
+}    // namespace __uninitialized_copy
+
+template <class Derived,
+          class Iterator,
+          class Size,
+          class T>
+Iterator __host__ __device__
+uninitialized_fill_n(execution_policy<Derived>& policy,
+                     Iterator                   first,
+                     Size                       count,
+                     T const&                   x)
+{
+  typedef __uninitialized_fill::functor<Iterator,T> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, x),
+                         count);
+
+  cuda_cub::throw_on_error(
+    cuda_cub::synchronize(policy)
+  , "uninitialized_fill_n: failed to synchronize"
+  );
+
+  return first + count;
+}
+
+template <class Derived,
+          class Iterator,
+          class T>
+void __host__ __device__
+uninitialized_fill(execution_policy<Derived>& policy,
+                   Iterator                   first,
+                   Iterator                   last,
+                   T const&                   x)
+{
+  cuda_cub::uninitialized_fill_n(policy,
+                              first,
+                              thrust::distance(first, last),
+                              x);
+}
+
+}    // namespace cuda_cub
+
+} // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2aff4c6489ccf47e76288ffd7c5afe7c43b2dc0
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/unique.h
@@ -0,0 +1,801 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <cub/device/device_select.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/functional.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__ ForwardIterator
+unique(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator                                             first,
+    ForwardIterator                                             last,
+    BinaryPredicate                                             binary_pred);
+
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename BinaryPredicate>
+__host__ __device__ OutputIterator
+unique_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator                                               first,
+    InputIterator                                               last,
+    OutputIterator                                              result,
+    BinaryPredicate                                             binary_pred);
+
+namespace cuda_cub {
+
+// XXX  it should be possible to unify unique & unique_by_key into a single
+//      agent with various specializations, similar to what is done
+//      with partition
+namespace __unique {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, size_t NOMINAL_4B_ITEMS_PER_THREAD>
+  struct items_per_thread
+  {
+    enum
+    {
+      value = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<int,
+                   1,
+                   (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                    sizeof(T))>::value>::value
+    };
+  };
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<64,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm52
+
+
+  template <class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm35
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm30
+
+  template <class ItemsIt,
+            class ItemsOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  struct UniqueAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type item_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch, item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type ItemsLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type BlockLoadItems;
+
+      typedef cub::BlockDiscontinuity<item_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityItems;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      typedef core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>
+          shared_items_t;
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage               scan;
+          typename TilePrefixCallback::TempStorage      prefix;
+          typename BlockDiscontinuityItems::TempStorage discontinuity;
+        };
+
+        typename BlockLoadItems::TempStorage  load_items;
+        shared_items_t shared_items;
+
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt             ItemsLoadIt;
+    typedef typename ptx_plan::BlockLoadItems          BlockLoadItems;
+    typedef typename ptx_plan::BlockDiscontinuityItems BlockDiscontinuityItems;
+    typedef typename ptx_plan::TilePrefixCallback      TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan               BlockScan;
+    typedef typename ptx_plan::shared_items_t          shared_items_t;
+    typedef typename ptx_plan::TempStorage             TempStorage;
+
+    enum
+    {
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      temp_storage;
+      ScanTileState &                    tile_state;
+      ItemsLoadIt                        items_in;
+      ItemsOutputIt                      items_out;
+      cub::InequalityWrapper<BinaryPred> predicate;
+      Size                               num_items;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      shared_items_t &get_shared()
+      {
+        return temp_storage.shared_items;
+      }
+
+      void THRUST_DEVICE_FUNCTION
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  /*num_tile_items*/,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size /*num_selections*/)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            get_shared()[local_scatter_offset] = items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          items_out[num_selections_prefix + item] = get_shared()[item];
+        }
+
+        sync_threadblock();
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_in + tile_base,
+                    items_loc,
+                    num_tile_items,
+                    *(items_in + tile_base));
+        }
+        else
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_in + tile_base, items_loc);
+        }
+
+
+        sync_threadblock();
+
+        if (IS_FIRST_TILE)
+        {
+          BlockDiscontinuityItems(temp_storage.discontinuity)
+              .FlagHeads(selection_flags, items_loc, predicate);
+        }
+        else
+        {
+          item_type tile_predecessor = items_in[tile_base - 1];
+          BlockDiscontinuityItems(temp_storage.discontinuity)
+              .FlagHeads(selection_flags, items_loc, predicate, tile_predecessor);
+        }
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set selection_flags for out-of-bounds items
+          if ((IS_LAST_TILE) &&
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+            selection_flags[ITEM] = 1;
+        }
+
+        sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        sync_threadblock();
+
+        scatter(items_loc,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+      }
+
+
+      template <bool IS_LAST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           ItemsLoadIt      items_in_,
+           ItemsOutputIt    items_out_,
+           BinaryPred       binary_pred_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            items_in(items_in_),
+            items_out(items_out_),
+            predicate(binary_pred_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt          items_in,
+                       ItemsOutputIt    items_out,
+                       BinaryPred       binary_pred,
+                       NumSelectedOutIt num_selected_out,
+                       Size             num_items,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), items_in),
+           items_out,
+           binary_pred,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };    // struct UniqueAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+  template <class ItemsInputIt,
+            class ItemsOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            ItemsInputIt     items_in,
+            ItemsOutputIt    items_out,
+            BinaryPred       binary_pred,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream,
+            bool             debug_sync)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        UniqueAgent<ItemsInputIt,
+                    ItemsOutputIt,
+                    BinaryPred,
+                    Size,
+                    NumSelectedOutIt> >
+        unique_agent;
+
+    typedef typename unique_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type   init_plan   = init_agent::get_plan();
+    typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
+
+
+    int tile_size = unique_plan.items_per_tile;
+    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    size_t      allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    //
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    num_tiles = max<size_t>(1,num_tiles);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent", debug_sync);
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    if (num_items == 0) { return status; }
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent", debug_sync);
+    ua.launch(items_in,
+              items_out,
+              binary_pred,
+              num_selected_out,
+              num_items,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename ItemsInputIt,
+            typename ItemsOutputIt,
+            typename BinaryPred>
+  THRUST_RUNTIME_FUNCTION
+  ItemsOutputIt unique(execution_policy<Derived>& policy,
+                       ItemsInputIt               items_first,
+                       ItemsInputIt               items_last,
+                       ItemsOutputIt              items_result,
+                       BinaryPred                 binary_pred)
+  {
+    //  typedef typename iterator_traits<ItemsInputIt>::difference_type size_type;
+    typedef int size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(items_first, items_last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       items_first,
+                       items_result,
+                       binary_pred,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "unique: failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       items_first,
+                       items_result,
+                       binary_pred,
+                       d_num_selected_out,
+                       num_items,
+                       stream,
+                       debug_sync);
+    cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "unique: failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return items_result + num_selected;
+  }
+}    // namespace __unique
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class BinaryPred>
+OutputIt __host__ __device__
+unique_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result,
+            BinaryPred                 binary_pred)
+{
+  OutputIt ret = result;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __unique::unique(policy,
+                           first,
+                           last,
+                           result,
+                           binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::unique_copy(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              result,
+                              binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+unique_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result)
+{
+  typedef typename iterator_traits<InputIt>::value_type input_type;
+  return cuda_cub::unique_copy(policy, first, last, result, equal_to<input_type>());
+}
+
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class BinaryPred>
+InputIt __host__ __device__
+unique(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last,
+       BinaryPred                 binary_pred)
+{
+  InputIt ret = first;
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::unique(cvt_to_seq(derived_cast(policy)),
+                         first,
+                         last,
+                         binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class InputIt>
+InputIt __host__ __device__
+unique(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last)
+{
+  typedef typename iterator_traits<InputIt>::value_type input_type;
+  return cuda_cub::unique(policy, first, last, equal_to<input_type>());
+}
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+//
+#include <thrust/memory.h>
+#include <thrust/unique.h>
+#endif
diff --git a/thrust/thrust/system/cuda/detail/unique_by_key.h b/thrust/thrust/system/cuda/detail/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..e20832131593afe2c63af6a5bb0854beca45bd44
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/unique_by_key.h
@@ -0,0 +1,934 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cub/device/device_select.cuh>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+#include <thrust/detail/alignment.h>
+
+namespace thrust
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIterator1,
+          typename ForwardIterator2>
+__host__ __device__ thrust::pair<ForwardIterator1, ForwardIterator2>
+unique_by_key(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator1                                            keys_first,
+    ForwardIterator1                                            keys_last,
+    ForwardIterator2                                            values_first);
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2>
+__host__ __device__ thrust::pair<OutputIterator1, OutputIterator2>
+unique_by_key_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator1                                              keys_first,
+    InputIterator1                                              keys_last,
+    InputIterator2                                              values_first,
+    OutputIterator1                                             keys_result,
+    OutputIterator2                                             values_result);
+
+
+namespace cuda_cub {
+
+// XXX  it should be possible to unify unique & unique_by_key into a single
+//      agent with various specializations, similar to what is done
+//      with partition
+namespace __unique_by_key {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, size_t NOMINAL_4B_ITEMS_PER_THREAD>
+  struct items_per_thread
+  {
+    enum
+    {
+      value = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<int,
+                   1,
+                   (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                    sizeof(T))>::value>::value
+    };
+  };
+
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<64,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm52
+
+  template<class T>
+  struct Tuning<sm35,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm35
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm30
+
+  template <class KeyInputIt,
+            class ValInputIt,
+            class KeyOutputIt,
+            class ValOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  struct UniqueByKeyAgent
+  {
+    typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+    typedef typename iterator_traits<ValInputIt>::value_type value_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type>::type
+    {
+      typedef Tuning<Arch, key_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeyInputIt>::type KeyLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ValInputIt>::type ValLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeyLoadIt>::type BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ValLoadIt>::type BlockLoadValues;
+
+      typedef cub::BlockDiscontinuity<key_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityKeys;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      typedef core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE>
+          shared_keys_t;
+      typedef core::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE>
+          shared_values_t;
+
+      union TempStorage
+      {
+        struct
+        {
+          typename BlockScan::TempStorage              scan;
+          typename TilePrefixCallback::TempStorage     prefix;
+          typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        };
+
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadValues::TempStorage load_values;
+
+        shared_keys_t   shared_keys;
+        shared_values_t shared_values;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeyLoadIt              KeyLoadIt;
+    typedef typename ptx_plan::ValLoadIt              ValLoadIt;
+    typedef typename ptx_plan::BlockLoadKeys          BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadValues        BlockLoadValues;
+    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
+    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan              BlockScan;
+    typedef typename ptx_plan::TempStorage            TempStorage;
+    typedef typename ptx_plan::shared_keys_t          shared_keys_t;
+    typedef typename ptx_plan::shared_values_t        shared_values_t;
+
+    enum
+    {
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      temp_storage;
+      ScanTileState &                    tile_state;
+      KeyLoadIt                          keys_in;
+      ValLoadIt                          values_in;
+      KeyOutputIt                        keys_out;
+      ValOutputIt                        values_out;
+      cub::InequalityWrapper<BinaryPred> predicate;
+      Size                               num_items;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      struct key_tag {};
+      struct value_tag {};
+
+      THRUST_DEVICE_FUNCTION
+      shared_keys_t &get_shared(key_tag)
+      {
+        return temp_storage.shared_keys;
+      }
+      THRUST_DEVICE_FUNCTION
+      shared_values_t &get_shared(value_tag)
+      {
+        return temp_storage.shared_values;
+      }
+
+
+      template <class Tag,
+                class OutputIt,
+                class T>
+      void THRUST_DEVICE_FUNCTION
+      scatter(Tag      tag,
+              OutputIt items_out,
+              T (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  /*num_tile_items*/,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size /*num_selections*/)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            get_shared(tag)[local_scatter_offset] = items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          items_out[num_selections_prefix + item] = get_shared(tag)[item];
+        }
+
+        sync_threadblock();
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        using core::sync_threadblock;
+
+        key_type keys[ITEMS_PER_THREAD];
+        Size     selection_flags[ITEMS_PER_THREAD];
+        Size     selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadKeys(temp_storage.load_keys)
+              .Load(keys_in + tile_base,
+                    keys,
+                    num_tile_items,
+                    *(keys_in + tile_base));
+        }
+        else
+        {
+          BlockLoadKeys(temp_storage.load_keys).Load(keys_in + tile_base, keys);
+        }
+
+
+        sync_threadblock();
+
+        value_type values[ITEMS_PER_THREAD];
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadValues(temp_storage.load_values)
+              .Load(values_in + tile_base,
+                    values,
+                    num_tile_items,
+                    *(values_in + tile_base));
+        }
+        else
+        {
+          BlockLoadValues(temp_storage.load_values)
+              .Load(values_in + tile_base, values);
+        }
+
+        sync_threadblock();
+
+        if (IS_FIRST_TILE)
+        {
+          BlockDiscontinuityKeys(temp_storage.discontinuity)
+              .FlagHeads(selection_flags, keys, predicate);
+        }
+        else
+        {
+          key_type tile_predecessor = keys_in[tile_base - 1];
+          BlockDiscontinuityKeys(temp_storage.discontinuity)
+              .FlagHeads(selection_flags, keys, predicate, tile_predecessor);
+        }
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set selection_flags for out-of-bounds items
+          if ((IS_LAST_TILE) && (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+            selection_flags[ITEM] = 1;
+        }
+
+        sync_threadblock();
+
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        sync_threadblock();
+
+        scatter(key_tag(),
+                keys_out,
+                keys,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        sync_threadblock();
+
+        scatter(value_tag(),
+                values_out,
+                values,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+      }
+
+
+      template <bool IS_LAST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           KeyLoadIt        keys_in_,
+           ValLoadIt        values_in_,
+           KeyOutputIt      keys_out_,
+           ValOutputIt      values_out_,
+           BinaryPred       binary_pred_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          // filed ctors
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            keys_in(keys_in_),
+            values_in(values_in_),
+            keys_out(keys_out_),
+            values_out(values_out_),
+            predicate(binary_pred_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeyInputIt       keys_in,
+                       ValInputIt       values_in,
+                       KeyOutputIt      keys_out,
+                       ValOutputIt      values_out,
+                       BinaryPred       binary_pred,
+                       NumSelectedOutIt num_selected_out,
+                       Size             num_items,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), keys_in),
+           core::make_load_iterator(ptx_plan(), values_in),
+           keys_out,
+           values_out,
+           binary_pred,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  }; // struct UniqueByKeyAgent
+
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+
+  template <class KeyInputIt,
+            class ValInputIt,
+            class KeyOutputIt,
+            class ValOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            KeyInputIt       keys_in,
+            ValInputIt       values_in,
+            KeyOutputIt      keys_out,
+            ValOutputIt      values_out,
+            BinaryPred       binary_pred,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream,
+            bool             debug_sync)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        UniqueByKeyAgent<KeyInputIt,
+                         ValInputIt,
+                         KeyOutputIt,
+                         ValOutputIt,
+                         BinaryPred,
+                         Size,
+                         NumSelectedOutIt> >
+        unique_agent;
+
+    typedef typename unique_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type   init_plan   = init_agent::get_plan();
+    typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
+
+
+    int tile_size = unique_plan.items_per_tile;
+    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+
+    size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    size_t      allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    //
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    num_tiles = max<size_t>(1,num_tiles);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent", debug_sync);
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    if (num_items == 0) { return status; }
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent", debug_sync);
+    ua.launch(keys_in,
+              values_in,
+              keys_out,
+              values_out,
+              binary_pred,
+              num_selected_out,
+              num_items,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename KeyInputIt,
+            typename ValInputIt,
+            typename KeyOutputIt,
+            typename ValOutputIt,
+            typename BinaryPred>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeyOutputIt, ValOutputIt>
+  unique_by_key(execution_policy<Derived>& policy,
+                KeyInputIt                 keys_first,
+                KeyInputIt                 keys_last,
+                ValInputIt                 values_first,
+                KeyOutputIt                keys_result,
+                ValOutputIt                values_result,
+                BinaryPred                 binary_pred)
+  {
+
+    typedef int size_type;
+
+    size_type num_items
+      = static_cast<size_type>(thrust::distance(keys_first, keys_last));
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
+
+    cudaError_t status;
+    status = __unique_by_key::doit_step(NULL,
+                                        temp_storage_bytes,
+                                        keys_first,
+                                        values_first,
+                                        keys_result,
+                                        values_result,
+                                        binary_pred,
+                                        reinterpret_cast<size_type*>(NULL),
+                                        num_items,
+                                        stream,
+                                        debug_sync);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = __unique_by_key::doit_step(allocations[1],
+                                        temp_storage_bytes,
+                                        keys_first,
+                                        values_first,
+                                        keys_result,
+                                        values_result,
+                                        binary_pred,
+                                        d_num_selected_out,
+                                        num_items,
+                                        stream,
+                                        debug_sync);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return thrust::make_pair(
+      keys_result + num_selected,
+      values_result + num_selected
+    );
+  }
+
+} // namespace __unique_by_key
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+unique_by_key_copy(execution_policy<Derived> &policy,
+                   KeyInputIt                 keys_first,
+                   KeyInputIt                 keys_last,
+                   ValInputIt                 values_first,
+                   KeyOutputIt                keys_result,
+                   ValOutputIt                values_result,
+                   BinaryPred                 binary_pred)
+{
+  pair<KeyOutputIt, ValOutputIt> ret = thrust::make_pair(keys_result, values_result);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = __unique_by_key::unique_by_key(policy,
+                                keys_first,
+                                keys_last,
+                                values_first,
+                                keys_result,
+                                values_result,
+                                binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)),
+                                     keys_first,
+                                     keys_last,
+                                     values_first,
+                                     keys_result,
+                                     values_result,
+                                     binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+unique_by_key_copy(execution_policy<Derived> &policy,
+                   KeyInputIt                 keys_first,
+                   KeyInputIt                 keys_last,
+                   ValInputIt                 values_first,
+                   KeyOutputIt                keys_result,
+                   ValOutputIt                values_result)
+{
+  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::unique_by_key_copy(policy,
+                                   keys_first,
+                                   keys_last,
+                                   values_first,
+                                   keys_result,
+                                   values_result,
+                                   equal_to<key_type>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class BinaryPred>
+pair<KeyInputIt, ValInputIt> __host__ __device__
+unique_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              BinaryPred                 binary_pred)
+{
+  pair<KeyInputIt, ValInputIt> ret = thrust::make_pair(keys_first, values_first);
+  if (__THRUST_HAS_CUDART__)
+  {
+    ret = cuda_cub::unique_by_key_copy(policy,
+                                       keys_first,
+                                       keys_last,
+                                       values_first,
+                                       keys_first,
+                                       values_first,
+                                       binary_pred);
+  }
+  else
+  {
+#if !__THRUST_HAS_CUDART__
+    ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys_first,
+                                keys_last,
+                                values_first,
+                                binary_pred);
+#endif
+  }
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt>
+pair<KeyInputIt, ValInputIt> __host__ __device__
+unique_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first)
+{
+  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::unique_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              equal_to<key_type>());
+}
+
+
+
+}    // namespace cuda_cub
+} // end namespace thrust
+
+#include <thrust/memory.h>
+#include <thrust/unique.h>
+
+#endif
diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..07ee7d9a1c86217a4108bd291e7eb45a6f297665
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/util.h
@@ -0,0 +1,589 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights meserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <cstdio>
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <cub/util_arch.cuh>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+namespace thrust
+{
+
+namespace cuda_cub {
+
+inline __host__ __device__
+cudaStream_t
+default_stream()
+{
+#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
+  return cudaStreamPerThread;
+#else
+  return cudaStreamLegacy;
+#endif
+}
+
+// Fallback implementation of the customization point.
+template <class Derived>
+__host__ __device__
+cudaStream_t
+get_stream(execution_policy<Derived> &)
+{
+  return default_stream();
+}
+
+// Entry point/interface.
+template <class Derived>
+__host__ __device__ cudaStream_t
+stream(execution_policy<Derived> &policy)
+{
+  return get_stream(derived_cast(policy));
+}
+
+// Fallback implementation of the customization point.
+__thrust_exec_check_disable__
+template <class Derived>
+__host__ __device__
+cudaError_t
+synchronize_stream(execution_policy<Derived> &policy)
+{
+  cudaError_t result;
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      cudaStreamSynchronize(stream(policy));
+      result = cudaGetLastError();
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      #if __THRUST_HAS_CUDART__
+        THRUST_UNUSED_VAR(policy);
+        cudaDeviceSynchronize();
+        result = cudaGetLastError();
+      #else
+        THRUST_UNUSED_VAR(policy);
+        result = cudaSuccess;
+      #endif
+    #endif
+  }
+  return result;
+}
+
+// Entry point/interface.
+template <class Policy>
+__host__ __device__
+cudaError_t
+synchronize(Policy &policy)
+{
+  return synchronize_stream(derived_cast(policy));
+}
+
+template <class Type>
+THRUST_HOST_FUNCTION cudaError_t
+trivial_copy_from_device(Type *       dst,
+                         Type const * src,
+                         size_t       count,
+                         cudaStream_t stream)
+{
+  cudaError status = cudaSuccess;
+  if (count == 0) return status;
+
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyDeviceToHost,
+                             stream);
+  cudaStreamSynchronize(stream);
+  return status;
+}
+
+template <class Type>
+THRUST_HOST_FUNCTION cudaError_t
+trivial_copy_to_device(Type *       dst,
+                       Type const * src,
+                       size_t       count,
+                       cudaStream_t stream)
+{
+  cudaError status = cudaSuccess;
+  if (count == 0) return status;
+
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyHostToDevice,
+                             stream);
+  cudaStreamSynchronize(stream);
+  return status;
+}
+
+template <class Policy, class Type>
+__host__ __device__ cudaError_t
+trivial_copy_device_to_device(Policy &    policy,
+                              Type *      dst,
+                              Type const *src,
+                              size_t      count)
+{
+  cudaError_t  status = cudaSuccess;
+  if (count == 0) return status;
+
+  cudaStream_t stream = cuda_cub::stream(policy);
+  //
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyDeviceToDevice,
+                             stream);
+  cuda_cub::synchronize(policy);
+  return status;
+}
+
+inline void __host__ __device__
+terminate()
+{
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      asm("trap;");
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      std::terminate();
+    #endif
+  }
+}
+
+__host__  __device__
+inline void throw_on_error(cudaError_t status)
+{
+#if __THRUST_HAS_CUDART__
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+  cudaGetLastError();
+#endif
+
+  if (cudaSuccess != status)
+  {
+    if (THRUST_IS_HOST_CODE) {
+      #if THRUST_INCLUDE_HOST_CODE
+        throw thrust::system_error(status, thrust::cuda_category());
+      #endif
+    } else {
+      #if THRUST_INCLUDE_DEVICE_CODE
+        #if __THRUST_HAS_CUDART__
+          printf("Thrust CUDA backend error: %s: %s\n",
+                 cudaGetErrorName(status),
+                 cudaGetErrorString(status));
+        #else
+          printf("Thrust CUDA backend error: %d\n",
+                 static_cast<int>(status));
+        #endif
+        cuda_cub::terminate();
+      #endif
+    }
+  }
+}
+
+__host__ __device__
+inline void throw_on_error(cudaError_t status, char const *msg)
+{
+#if __THRUST_HAS_CUDART__
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+  cudaGetLastError();
+#endif
+
+  if (cudaSuccess != status)
+  {
+    if (THRUST_IS_HOST_CODE) {
+      #if THRUST_INCLUDE_HOST_CODE
+        throw thrust::system_error(status, thrust::cuda_category(), msg);
+      #endif
+    } else {
+      #if THRUST_INCLUDE_DEVICE_CODE
+        #if __THRUST_HAS_CUDART__
+          printf("Thrust CUDA backend error: %s: %s: %s\n",
+                 cudaGetErrorName(status),
+                 cudaGetErrorString(status),
+                 msg);
+        #else
+          printf("Thrust CUDA backend error: %d: %s \n",
+                 static_cast<int>(status),
+                 msg);
+        #endif
+        cuda_cub::terminate();
+      #endif
+    }
+  }
+}
+
+// FIXME: Move the iterators elsewhere.
+
+template <class ValueType,
+          class InputIt,
+          class UnaryOp>
+struct transform_input_iterator_t
+{
+  typedef transform_input_iterator_t                         self_t;
+  typedef typename iterator_traits<InputIt>::difference_type difference_type;
+  typedef ValueType                                          value_type;
+  typedef void                                               pointer;
+  typedef value_type                                         reference;
+  typedef std::random_access_iterator_tag                    iterator_category;
+
+  InputIt         input;
+  mutable UnaryOp op;
+
+  __host__ __device__ __forceinline__
+  transform_input_iterator_t(InputIt input, UnaryOp op)
+      : input(input), op(op) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  transform_input_iterator_t(const self_t &) = default;
+#endif
+
+  // UnaryOp might not be copy assignable, such as when it is a lambda.  Define
+  // an explicit copy assignment operator that doesn't try to assign it.
+  self_t& operator=(const self_t& o)
+  {
+    input = o.input;
+    return *this;
+  }
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++input;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++input;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    typename thrust::iterator_value<InputIt>::type x = *input;
+    return op(x);
+  }
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    typename thrust::iterator_value<InputIt>::type x = *input;
+    return op(x);
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(input + n, op);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    input += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(input - n, op);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    input -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return input - other.input;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return op(input[n]);
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (input == rhs.input);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (input != rhs.input);
+  }
+};    // struct transform_input_iterarot_t
+
+template <class ValueType,
+          class InputIt1,
+          class InputIt2,
+          class BinaryOp>
+struct transform_pair_of_input_iterators_t
+{
+  typedef transform_pair_of_input_iterators_t                 self_t;
+  typedef typename iterator_traits<InputIt1>::difference_type difference_type;
+  typedef ValueType                                           value_type;
+  typedef void                                                pointer;
+  typedef value_type                                          reference;
+  typedef std::random_access_iterator_tag                     iterator_category;
+
+  InputIt1         input1;
+  InputIt2         input2;
+  mutable BinaryOp op;
+
+  __host__ __device__ __forceinline__
+  transform_pair_of_input_iterators_t(InputIt1 input1_,
+                                      InputIt2 input2_,
+                                      BinaryOp op_)
+      : input1(input1_), input2(input2_), op(op_) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  transform_pair_of_input_iterators_t(const self_t &) = default;
+#endif
+
+  // BinaryOp might not be copy assignable, such as when it is a lambda.
+  // Define an explicit copy assignment operator that doesn't try to assign it.
+  self_t& operator=(const self_t& o)
+  {
+    input1 = o.input1;
+    input2 = o.input2;
+    return *this;
+  }
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++input1;
+    ++input2;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++input1;
+    ++input2;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    return op(*input1, *input2);
+  }
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    return op(*input1, *input2);
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(input1 + n, input2 + n, op);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    input1 += n;
+    input2 += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(input1 - n, input2 - n, op);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    input1 -= n;
+    input2 -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return input1 - other.input1;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return op(input1[n], input2[n]);
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (input1 == rhs.input1) && (input2 == rhs.input2);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (input1 != rhs.input1) || (input2 != rhs.input2);
+  }
+
+};    // struct transform_pair_of_input_iterators_t
+
+
+struct identity
+{
+  template <class T>
+  __host__ __device__ T const &
+  operator()(T const &t) const
+  {
+    return t;
+  }
+
+  template <class T>
+  __host__ __device__ T &
+  operator()(T &t) const
+  {
+    return t;
+  }
+};
+
+
+template <class T>
+struct counting_iterator_t
+{
+  typedef counting_iterator_t             self_t;
+  typedef T                               difference_type;
+  typedef T                               value_type;
+  typedef void                            pointer;
+  typedef T                               reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  T count;
+
+  __host__ __device__ __forceinline__
+  counting_iterator_t(T count_) : count(count_) {}
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++count;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++count;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    return count;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    return count;
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(count + n);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    count += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(count - n);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    count -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return count - other.count;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return count + n;
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (count == rhs.count);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (count != rhs.count);
+  }
+
+};    // struct count_iterator_t
+
+}    // cuda_
+
+} // end namespace thrust
diff --git a/thrust/thrust/system/cuda/error.h b/thrust/thrust/system/cuda/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcbadd8551a8c30e1faa67ccace47b448dd79429
--- /dev/null
+++ b/thrust/thrust/system/cuda/error.h
@@ -0,0 +1,183 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/system/cuda/error.h
+ *  \brief CUDA-specific error reporting
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/error_code.h>
+#include <thrust/system/cuda/detail/guarded_driver_types.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+namespace cuda
+{
+
+// To construct an error_code after a CUDA Runtime error:
+//
+//   error_code(::cudaGetLastError(), cuda_category())
+
+// XXX N3000 prefers enum class errc { ... }
+/*! Namespace for CUDA Runtime errors.
+ */
+namespace errc
+{
+
+/*! \p errc_t enumerates the kinds of CUDA Runtime errors.
+ */
+enum errc_t
+{
+  // from cuda/include/driver_types.h
+  // mirror their order
+  success                            = cudaSuccess,
+  missing_configuration              = cudaErrorMissingConfiguration,
+  memory_allocation                  = cudaErrorMemoryAllocation,
+  initialization_error               = cudaErrorInitializationError,
+  launch_failure                     = cudaErrorLaunchFailure,
+  prior_launch_failure               = cudaErrorPriorLaunchFailure,
+  launch_timeout                     = cudaErrorLaunchTimeout,
+  launch_out_of_resources            = cudaErrorLaunchOutOfResources,
+  invalid_device_function            = cudaErrorInvalidDeviceFunction,
+  invalid_configuration              = cudaErrorInvalidConfiguration,
+  invalid_device                     = cudaErrorInvalidDevice,
+  invalid_value                      = cudaErrorInvalidValue,
+  invalid_pitch_value                = cudaErrorInvalidPitchValue,
+  invalid_symbol                     = cudaErrorInvalidSymbol,
+  map_buffer_object_failed           = cudaErrorMapBufferObjectFailed,
+  unmap_buffer_object_failed         = cudaErrorUnmapBufferObjectFailed,
+  invalid_host_pointer               = cudaErrorInvalidHostPointer,
+  invalid_device_pointer             = cudaErrorInvalidDevicePointer,
+  invalid_texture                    = cudaErrorInvalidTexture,
+  invalid_texture_binding            = cudaErrorInvalidTextureBinding,
+  invalid_channel_descriptor         = cudaErrorInvalidChannelDescriptor,
+  invalid_memcpy_direction           = cudaErrorInvalidMemcpyDirection,
+  address_of_constant_error          = cudaErrorAddressOfConstant,
+  texture_fetch_failed               = cudaErrorTextureFetchFailed,
+  texture_not_bound                  = cudaErrorTextureNotBound,
+  synchronization_error              = cudaErrorSynchronizationError,
+  invalid_filter_setting             = cudaErrorInvalidFilterSetting,
+  invalid_norm_setting               = cudaErrorInvalidNormSetting,
+  mixed_device_execution             = cudaErrorMixedDeviceExecution,
+  cuda_runtime_unloading             = cudaErrorCudartUnloading,
+  unknown                            = cudaErrorUnknown,
+  not_yet_implemented                = cudaErrorNotYetImplemented,
+  memory_value_too_large             = cudaErrorMemoryValueTooLarge,
+  invalid_resource_handle            = cudaErrorInvalidResourceHandle,
+  not_ready                          = cudaErrorNotReady,
+  insufficient_driver                = cudaErrorInsufficientDriver,
+  set_on_active_process_error        = cudaErrorSetOnActiveProcess,
+  no_device                          = cudaErrorNoDevice,
+  ecc_uncorrectable                  = cudaErrorECCUncorrectable,
+
+#if CUDART_VERSION >= 4020
+  shared_object_symbol_not_found     = cudaErrorSharedObjectSymbolNotFound,
+  shared_object_init_failed          = cudaErrorSharedObjectInitFailed,
+  unsupported_limit                  = cudaErrorUnsupportedLimit,
+  duplicate_variable_name            = cudaErrorDuplicateVariableName,
+  duplicate_texture_name             = cudaErrorDuplicateTextureName,
+  duplicate_surface_name             = cudaErrorDuplicateSurfaceName,
+  devices_unavailable                = cudaErrorDevicesUnavailable,
+  invalid_kernel_image               = cudaErrorInvalidKernelImage,
+  no_kernel_image_for_device         = cudaErrorNoKernelImageForDevice,
+  incompatible_driver_context        = cudaErrorIncompatibleDriverContext,
+  peer_access_already_enabled        = cudaErrorPeerAccessAlreadyEnabled,
+  peer_access_not_enabled            = cudaErrorPeerAccessNotEnabled,
+  device_already_in_use              = cudaErrorDeviceAlreadyInUse,
+  profiler_disabled                  = cudaErrorProfilerDisabled,
+  assert_triggered                   = cudaErrorAssert,
+  too_many_peers                     = cudaErrorTooManyPeers,
+  host_memory_already_registered     = cudaErrorHostMemoryAlreadyRegistered,
+  host_memory_not_registered         = cudaErrorHostMemoryNotRegistered,
+  operating_system_error             = cudaErrorOperatingSystem,
+#endif
+
+#if CUDART_VERSION >= 5000
+  peer_access_unsupported            = cudaErrorPeerAccessUnsupported,
+  launch_max_depth_exceeded          = cudaErrorLaunchMaxDepthExceeded,
+  launch_file_scoped_texture_used    = cudaErrorLaunchFileScopedTex,
+  launch_file_scoped_surface_used    = cudaErrorLaunchFileScopedSurf,
+  sync_depth_exceeded                = cudaErrorSyncDepthExceeded,
+  attempted_operation_not_permitted  = cudaErrorNotPermitted,
+  attempted_operation_not_supported  = cudaErrorNotSupported,
+#endif
+
+  startup_failure                    = cudaErrorStartupFailure
+}; // end errc_t
+
+
+} // end namespace errc
+
+} // end namespace cuda_cub
+
+/*! \return A reference to an object of a type derived from class \p thrust::error_category.
+ *  \note The object's \p equivalent virtual functions shall behave as specified
+ *        for the class \p thrust::error_category. The object's \p name virtual function shall
+ *        return a pointer to the string <tt>"cuda"</tt>. The object's
+ *        \p default_error_condition virtual function shall behave as follows:
+ *
+ *        If the argument <tt>ev</tt> corresponds to a CUDA error value, the function
+ *        shall return <tt>error_condition(ev,cuda_category())</tt>.
+ *        Otherwise, the function shall return <tt>system_category.default_error_condition(ev)</tt>.
+ */
+inline const error_category &cuda_category(void);
+
+
+// XXX N3000 prefers is_error_code_enum<cuda::errc>
+
+/*! Specialization of \p is_error_code_enum for \p cuda::errc::errc_t
+ */
+template<> struct is_error_code_enum<cuda::errc::errc_t> : thrust::detail::true_type {};
+
+
+// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x
+/*! \return <tt>error_code(static_cast<int>(e), cuda::error_category())</tt>
+ */
+inline error_code make_error_code(cuda::errc::errc_t e);
+
+
+// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x
+/*! \return <tt>error_condition(static_cast<int>(e), cuda::error_category())</tt>.
+ */
+inline error_condition make_error_condition(cuda::errc::errc_t e);
+
+} // end system
+
+namespace cuda_cub
+{
+namespace errc = system::cuda::errc;
+} // end cuda_cub
+
+namespace cuda
+{
+// XXX replace with using system::cuda_errc upon c++0x
+namespace errc = system::cuda::errc;
+} // end cuda
+
+using system::cuda_category;
+
+} // end namespace thrust
+
+#include <thrust/system/cuda/detail/error.inl>
+
diff --git a/thrust/thrust/system/cuda/execution_policy.h b/thrust/thrust/system/cuda/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..39bbb7927efd9fc1037f3a050429d0769e328ad5
--- /dev/null
+++ b/thrust/thrust/system/cuda/execution_policy.h
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+// histogram
+// sort (radix-sort, merge-sort)
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/par.h>
+
+// pass
+// ----------------
+#include <thrust/system/cuda/detail/adjacent_difference.h>
+#include <thrust/system/cuda/detail/copy.h>
+#include <thrust/system/cuda/detail/copy_if.h>
+#include <thrust/system/cuda/detail/count.h>
+#include <thrust/system/cuda/detail/equal.h>
+#include <thrust/system/cuda/detail/extrema.h>
+#include <thrust/system/cuda/detail/fill.h>
+#include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/cuda/detail/for_each.h>
+#include <thrust/system/cuda/detail/gather.h>
+#include <thrust/system/cuda/detail/generate.h>
+#include <thrust/system/cuda/detail/inner_product.h>
+#include <thrust/system/cuda/detail/mismatch.h>
+#include <thrust/system/cuda/detail/partition.h>
+#include <thrust/system/cuda/detail/reduce_by_key.h>
+#include <thrust/system/cuda/detail/remove.h>
+#include <thrust/system/cuda/detail/replace.h>
+#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/system/cuda/detail/scatter.h>
+#include <thrust/system/cuda/detail/swap_ranges.h>
+#include <thrust/system/cuda/detail/tabulate.h>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/system/cuda/detail/transform_reduce.h>
+#include <thrust/system/cuda/detail/transform_scan.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/uninitialized_fill.h>
+#include <thrust/system/cuda/detail/unique.h>
+#include <thrust/system/cuda/detail/unique_by_key.h>
+
+// fail
+// ----------------
+// fails with mixed types
+#include <thrust/system/cuda/detail/reduce.h>
+
+// mixed types are not compiling, commented in testing/scan.cu
+#include <thrust/system/cuda/detail/scan.h>
+
+// stubs passed
+// ----------------
+#include <thrust/system/cuda/detail/binary_search.h>
+#include <thrust/system/cuda/detail/merge.h>
+#include <thrust/system/cuda/detail/scan_by_key.h>
+#include <thrust/system/cuda/detail/set_operations.h>
+#include <thrust/system/cuda/detail/sort.h>
+
+// work in progress
+
diff --git a/thrust/thrust/system/cuda/experimental/pinned_allocator.h b/thrust/thrust/system/cuda/experimental/pinned_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..50e00cad33a6fcedfc6f26c079c3cb339b450cb9
--- /dev/null
+++ b/thrust/thrust/system/cuda/experimental/pinned_allocator.h
@@ -0,0 +1,244 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/experimental/pinned_allocator.h
+ *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <stdexcept>
+#include <limits>
+#include <string>
+#include <thrust/system/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+namespace cuda
+{
+
+namespace experimental
+{
+
+/*! \addtogroup memory_management_classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p pinned_allocator is a CUDA-specific host memory allocator
+ *  that employs \c cudaMallocHost for allocation.
+ *
+ *  \see http://www.sgi.com/tech/stl/Allocators.html
+ */
+template<typename T> class pinned_allocator;
+
+template<>
+  class pinned_allocator<void>
+{
+  public:
+    typedef void           value_type;
+    typedef void       *   pointer;
+    typedef const void *   const_pointer;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    // convert a pinned_allocator<void> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+}; // end pinned_allocator
+
+
+template<typename T>
+  class pinned_allocator
+{
+  public:
+    //! \{
+    typedef T              value_type;
+    typedef T*             pointer;
+    typedef const T*       const_pointer;
+    typedef T&             reference;
+    typedef const T&       const_reference;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+    //! \}
+
+    // convert a pinned_allocator<T> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+
+    /*! \p pinned_allocator's null constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator() {}
+
+    /*! \p pinned_allocator's null destructor does nothing.
+     */
+    __host__ __device__
+    inline ~pinned_allocator() {}
+
+    /*! \p pinned_allocator's copy constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator const &) {}
+
+    /*! This version of \p pinned_allocator's copy constructor
+     *  is templated on the \c value_type of the \p pinned_allocator
+     *  to copy from.  It is provided merely for convenience; it
+     *  does nothing.
+     */
+    template<typename U>
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator<U> const &) {}
+
+    /*! This method returns the address of a \c reference of
+     *  interest.
+     *
+     *  \p r The \c reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+
+    /*! This method returns the address of a \c const_reference
+     *  of interest.
+     *
+     *  \p r The \c const_reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! This method allocates storage for objects in pinned host
+     *  memory.
+     *
+     *  \p cnt The number of objects to allocate.
+     *  \return a \c pointer to the newly allocated objects.
+     *  \note This method does not invoke \p value_type's constructor.
+     *        It is the responsibility of the caller to initialize the
+     *        objects at the returned \c pointer.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = 0)
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      } // end if
+
+      pointer result(0);
+      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw std::bad_alloc();
+      } // end if
+
+      return result;
+    } // end allocate()
+
+    /*! This method deallocates pinned host memory previously allocated
+     *  with this \c pinned_allocator.
+     *
+     *  \p p A \c pointer to the previously allocated memory.
+     *  \p cnt The number of objects previously allocated at
+     *         \p p.
+     *  \note This method does not invoke \p value_type's destructor.
+     *        It is the responsibility of the caller to destroy
+     *        the objects stored at \p p.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type /*cnt*/)
+    {
+      cudaError_t error = cudaFreeHost(p);
+
+      cudaGetLastError(); // Clear global CUDA error state.
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw thrust::system_error(error, thrust::cuda_category());
+      } // end if
+    } // end deallocate()
+
+    /*! This method returns the maximum size of the \c cnt parameter
+     *  accepted by the \p allocate() method.
+     *
+     *  \return The maximum number of objects that may be allocated
+     *          by a single call to \p allocate().
+     */
+    inline size_type max_size() const
+    {
+      return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    } // end max_size()
+
+    /*! This method tests this \p pinned_allocator for equality to
+     *  another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c true.
+     */
+    __host__ __device__
+    inline bool operator==(pinned_allocator const& x) const { return true; }
+
+    /*! This method tests this \p pinned_allocator for inequality
+     *  to another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c false.
+     */
+    __host__ __device__
+    inline bool operator!=(pinned_allocator const &x) const { return !operator==(x); }
+}; // end pinned_allocator
+
+/*! \}
+ */
+
+} // end experimental
+
+} // end cuda
+
+} // end system
+
+// alias cuda's members at top-level
+namespace cuda
+{
+
+namespace experimental
+{
+
+using thrust::system::cuda::experimental::pinned_allocator;
+
+} // end experimental
+
+} // end cuda
+
+} // end thrust
+
diff --git a/thrust/thrust/system/cuda/future.h b/thrust/thrust/system/cuda/future.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc2986f8b2238d9aca56db1a99c7dc6b0d7fd259
--- /dev/null
+++ b/thrust/thrust/system/cuda/future.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/system/cuda/pointer.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+namespace thrust
+{
+
+namespace system { namespace cuda
+{
+
+struct ready_event;
+
+template <typename T>
+struct ready_future;
+
+struct unique_eager_event;
+
+template <typename T>
+struct unique_eager_future;
+
+template <typename... Events>
+__host__
+unique_eager_event when_all(Events&&... evs);
+
+}} // namespace system::cuda
+
+namespace cuda
+{
+
+using thrust::system::cuda::ready_event;
+
+using thrust::system::cuda::ready_future;
+
+using thrust::system::cuda::unique_eager_event;
+using event = unique_eager_event;
+
+using thrust::system::cuda::unique_eager_future;
+template <typename T> using future = unique_eager_future<T>;
+
+using thrust::system::cuda::when_all;
+
+} // namespace cuda
+
+template <typename DerivedPolicy>
+__host__ 
+thrust::cuda::unique_eager_event
+unique_eager_event_type(
+  thrust::cuda::execution_policy<DerivedPolicy> const&
+) noexcept;
+
+template <typename T, typename DerivedPolicy>
+__host__ 
+thrust::cuda::unique_eager_future<T>
+unique_eager_future_type(
+  thrust::cuda::execution_policy<DerivedPolicy> const&
+) noexcept;
+
+} // end namespace thrust
+
+#include <thrust/system/cuda/detail/future.inl>
+
+#endif
+
diff --git a/thrust/thrust/system/cuda/memory.h b/thrust/thrust/system/cuda/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..f20ce352a2340f83871d31769ecffe872c7193d2
--- /dev/null
+++ b/thrust/thrust/system/cuda/memory.h
@@ -0,0 +1,93 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in ccudaliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/memory.h
+ *  \brief Managing memory associated with Thrust's CUDA system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/memory_resource.h>
+#include <thrust/memory.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/mr/allocator.h>
+#include <ostream>
+
+namespace thrust
+{
+namespace cuda_cub {
+
+/*! Allocates an area of memory available to Thrust's <tt>cuda</tt> system.
+ *  \param n Number of bytes to allocate.
+ *  \return A <tt>cuda::pointer<void></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>cuda::pointer<void></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>cuda::pointer<void></tt> returned by this function must be
+ *        deallocated with \p cuda::free.
+ *  \see cuda::free
+ *  \see std::malloc
+ */
+inline __host__ __device__ pointer<void> malloc(std::size_t n);
+
+/*! Allocates a typed area of memory available to Thrust's <tt>cuda</tt> system.
+ *  \param n Number of elements to allocate.
+ *  \return A <tt>cuda::pointer<T></tt> pointing to the beginning of the newly
+ *          allocated elements. A null <tt>cuda::pointer<T></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>cuda::pointer<T></tt> returned by this function must be
+ *        deallocated with \p cuda::free.
+ *  \see cuda::free
+ *  \see std::malloc
+ */
+template <typename T>
+inline __host__ __device__ pointer<T> malloc(std::size_t n);
+
+/*! Deallocates an area of memory previously allocated by <tt>cuda::malloc</tt>.
+ *  \param ptr A <tt>cuda::pointer<void></tt> pointing to the beginning of an area
+ *         of memory previously allocated with <tt>cuda::malloc</tt>.
+ *  \see cuda::malloc
+ *  \see std::free
+ */
+inline __host__ __device__ void free(pointer<void> ptr);
+
+/*! \p cuda::allocator is the default allocator used by the \p cuda system's containers such as
+ *  <tt>cuda::vector</tt> if no user-specified allocator is provided. \p cuda::allocator allocates
+ *  (deallocates) storage with \p cuda::malloc (\p cuda::free).
+ */
+template<typename T>
+using allocator = thrust::mr::stateless_resource_allocator<T, system::cuda::memory_resource>;
+
+}    // namespace cuda_cub
+
+namespace system {
+namespace cuda {
+using thrust::cuda_cub::malloc;
+using thrust::cuda_cub::free;
+using thrust::cuda_cub::allocator;
+} // namespace cuda
+} // namespace system
+
+namespace cuda {
+using thrust::cuda_cub::malloc;
+using thrust::cuda_cub::free;
+using thrust::cuda_cub::allocator;
+}    // end cuda
+
+} // end namespace thrust
+
+#include <thrust/system/cuda/detail/memory.inl>
+
diff --git a/thrust/thrust/system/cuda/memory_resource.h b/thrust/thrust/system/cuda/memory_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..9110e0af45845ed4a045e09011a1afaa3a66321f
--- /dev/null
+++ b/thrust/thrust/system/cuda/memory_resource.h
@@ -0,0 +1,111 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cuda/memory_resource.h
+ *  \brief Memory resources for the CUDA system.
+ */
+
+#pragma once
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cuda/detail/managed_memory_pointer.h>
+#include <thrust/system/cuda/pointer.h>
+#include <thrust/system/detail/bad_alloc.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/memory/detail/host_system_resource.h>
+
+namespace thrust
+{
+
+namespace system
+{
+namespace cuda
+{
+
+//! \cond
+namespace detail
+{
+
+    typedef cudaError_t (*allocation_fn)(void **, std::size_t);
+    typedef cudaError_t (*deallocation_fn)(void *);
+
+    template<allocation_fn Alloc, deallocation_fn Dealloc, typename Pointer>
+    class cuda_memory_resource THRUST_FINAL : public mr::memory_resource<Pointer>
+    {
+    public:
+        Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+        {
+            (void)alignment;
+
+            void * ret;
+            cudaError_t status = Alloc(&ret, bytes);
+
+            if (status != cudaSuccess)
+            {
+                cudaGetLastError(); // Clear the CUDA global error state.
+                throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+            }
+
+            return Pointer(ret);
+        }
+
+        void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+        {
+            (void)bytes;
+            (void)alignment;
+
+            cudaError_t status = Dealloc(thrust::detail::pointer_traits<Pointer>::get(p));
+
+            if (status != cudaSuccess)
+            {
+                thrust::cuda_cub::throw_on_error(status, "CUDA free failed");
+            }
+        }
+    };
+
+    inline cudaError_t cudaMallocManaged(void ** ptr, std::size_t bytes)
+    {
+        return ::cudaMallocManaged(ptr, bytes, cudaMemAttachGlobal);
+    }
+
+    typedef detail::cuda_memory_resource<cudaMalloc, cudaFree,
+        thrust::cuda::pointer<void> >
+        device_memory_resource;
+    typedef detail::cuda_memory_resource<detail::cudaMallocManaged, cudaFree,
+        detail::managed_memory_pointer<void> >
+        managed_memory_resource;
+    typedef detail::cuda_memory_resource<cudaMallocHost, cudaFreeHost,
+        thrust::host_memory_resource::pointer>
+        pinned_memory_resource;
+
+} // end detail
+//! \endcond
+
+/*! The memory resource for the CUDA system. Uses <tt>cudaMalloc</tt> and wraps the result with \p cuda::pointer. */
+typedef detail::device_memory_resource memory_resource;
+/*! The universal memory resource for the CUDA system. Uses <tt>cudaMallocManaged</tt> and wraps the result with \p cuda::pointer. */
+typedef detail::managed_memory_resource universal_memory_resource;
+/*! The host pinned memory resource for the CUDA system. Uses <tt>cudaMallocHost</tt> and wraps the result with \p cuda::pointer. */
+typedef detail::pinned_memory_resource universal_host_pinned_memory_resource;
+
+} // end cuda
+} // end system
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/cuda/pointer.h b/thrust/thrust/system/cuda/pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f198385ce23fe6c391cb999e39c769f789f4729b
--- /dev/null
+++ b/thrust/thrust/system/cuda/pointer.h
@@ -0,0 +1,321 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in ccudaliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+namespace thrust
+{
+namespace cuda_cub
+{
+
+template <typename>
+class pointer;
+
+} // end cuda_cub
+} // end thrust
+
+
+// specialize thrust::iterator_traits to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type
+// do this before pointer is defined so the specialization is correctly
+// used inside the definition
+namespace thrust
+{
+
+template <typename Element>
+struct iterator_traits<thrust::cuda_cub::pointer<Element> >
+{
+private:
+  typedef thrust::cuda_cub::pointer<Element> ptr;
+
+public:
+  typedef typename ptr::iterator_category iterator_category;
+  typedef typename ptr::value_type        value_type;
+  typedef typename ptr::difference_type   difference_type;
+  typedef ptr                             pointer;
+  typedef typename ptr::reference         reference;
+};    // end iterator_traits
+
+namespace cuda_cub {
+
+// forward declaration of reference for pointer
+template <typename Element>
+class reference;
+
+// XXX nvcc + msvc have trouble instantiating reference below
+//     this is a workaround
+template <typename Element>
+struct reference_msvc_workaround
+{
+  typedef thrust::cuda_cub::reference<Element> type;
+};    // end reference_msvc_workaround
+
+
+/*! \p pointer stores a pointer to an object allocated in memory available to the cuda system.
+ *  This type provides type safety when dispatching standard algorithms on ranges resident
+ *  in cuda memory.
+ *
+ *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *
+ *  \p pointer can be created with the function \p cuda::malloc, or by explicitly calling its constructor
+ *  with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
+ *  or the \p raw_pointer_cast function.
+ *
+ *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
+ *  pointed to by \p pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cuda::malloc
+ *  \see cuda::free
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+class pointer
+    : public thrust::pointer<
+          T,
+          thrust::cuda_cub::tag,
+          thrust::cuda_cub::reference<T>,
+          thrust::cuda_cub::pointer<T> >
+{
+
+private:
+  typedef thrust::pointer<
+      T,
+      thrust::cuda_cub::tag,
+      typename reference_msvc_workaround<T>::type,
+      thrust::cuda_cub::pointer<T> >
+      super_t;
+
+public:
+  /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
+   */
+  __host__ __device__
+  pointer() : super_t() {}
+
+  #if THRUST_CPP_DIALECT >= 2011
+  // NOTE: This is needed so that Thrust smart pointers can be used in
+  // `std::unique_ptr`.
+  __host__ __device__
+  pointer(decltype(nullptr)) : super_t(nullptr) {}
+  #endif
+
+  /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
+   *
+   *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
+   *         accessible by the \p cuda system.
+   *  \tparam OtherT \p OtherT shall be convertible to \p T.
+   */
+  template <typename OtherT>
+  __host__ __device__ explicit pointer(OtherT *ptr) : super_t(ptr)
+  {
+  }
+
+  /*! This constructor allows construction from another pointer-like object with related type.
+   *
+   *  \param other The \p OtherPointer to copy.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+   *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
+   */
+  template <typename OtherPointer>
+  __host__ __device__
+  pointer(const OtherPointer &other,
+          typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer>::type * = 0) : super_t(other)
+  {
+  }
+
+  /*! This constructor allows construction from another pointer-like object with \p void type.
+   *
+   *  \param other The \p OtherPointer to copy.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+   *          to \p thrust::system::cuda::tag and its element type shall be \p void.
+   */
+  template <typename OtherPointer>
+  __host__ __device__
+  explicit
+  pointer(const OtherPointer &other,
+          typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer>::type * = 0) : super_t(other)
+  {
+  }
+
+  /*! Assignment operator allows assigning from another pointer-like object with related type.
+   *
+   *  \param other The other pointer-like object to assign from.
+   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+   *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
+   */
+  template <typename OtherPointer>
+  __host__ __device__
+      typename thrust::detail::enable_if_pointer_is_convertible<
+          OtherPointer,
+          pointer,
+          pointer &>::type
+      operator=(const OtherPointer &other)
+  {
+    return super_t::operator=(other);
+  }
+
+  #if THRUST_CPP_DIALECT >= 2011
+  // NOTE: This is needed so that Thrust smart pointers can be used in
+  // `std::unique_ptr`.
+  __host__ __device__
+  pointer& operator=(decltype(nullptr))
+  {
+    super_t::operator=(nullptr);
+    return *this;
+  }
+  #endif
+};    // struct pointer
+
+/*! \p reference is a wrapped reference to an object stored in memory available to the \p cuda system.
+ *  \p reference is the type of the result of dereferencing a \p cuda::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template <typename T>
+class reference
+    : public thrust::reference<
+          T,
+          thrust::cuda_cub::pointer<T>,
+          thrust::cuda_cub::reference<T> >
+{
+
+private:
+  typedef thrust::reference<
+      T,
+      thrust::cuda_cub::pointer<T>,
+      thrust::cuda_cub::reference<T> >
+      super_t;
+
+public:
+  /*! \cond
+   */
+
+  typedef typename super_t::value_type value_type;
+  typedef typename super_t::pointer    pointer;
+
+  /*! \endcond
+   */
+
+  /*! This constructor initializes this \p reference to refer to an object
+   *  pointed to by the given \p pointer. After this \p reference is constructed,
+   *  it shall refer to the object pointed to by \p ptr.
+   *
+   *  \param ptr A \p pointer to copy from.
+   */
+  __host__ __device__ explicit reference(const pointer &ptr)
+      : super_t(ptr)
+  {
+  }
+
+  /*! This constructor accepts a const reference to another \p reference of related type.
+   *  After this \p reference is constructed, it shall refer to the same object as \p other.
+   *
+   *  \param other A \p reference to copy from.
+   *  \tparam OtherT The element type of the other \p reference.
+   *
+   *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
+   *        from <tt>reference<T></tt>.
+   */
+  template <typename OtherT>
+  __host__ __device__
+  reference(const reference<OtherT> &other,
+            typename thrust::detail::enable_if_convertible<
+                typename reference<OtherT>::pointer,
+                pointer>::type * = 0)
+      : super_t(other)
+  {
+  }
+
+  /*! Copy assignment operator copy assigns from another \p reference of related type.
+   *
+   *  \param other The other \p reference to assign from.
+   *  \return <tt>*this</tt>
+   *  \tparam OtherT The element type of the other \p reference.
+   */
+  template <typename OtherT>
+  __host__ __device__
+      reference &
+      operator=(const reference<OtherT> &other);
+
+  /*! Assignment operator assigns from a \p value_type.
+   *
+   *  \param x The \p value_type to assign from.
+   *  \return <tt>*this</tt>
+   */
+  __host__ __device__
+      reference &
+      operator=(const value_type &x);
+};    // struct reference
+
+/*! Exchanges the values of two objects referred to by \p reference.
+ *  \p x The first \p reference of interest.
+ *  \p y The second \p reference of interest.
+ */
+template <typename T>
+__host__ __device__ void swap(reference<T> x, reference<T> y);
+
+} // end cuda_cub
+
+namespace system {
+
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system::cuda
+ *  \brief \p thrust::system::cuda is the namespace containing functionality for allocating, manipulating,
+ *         and deallocating memory available to Thrust's CUDA backend system.
+ *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
+ *         for import convenience but are also aliased in the top-level <tt>thrust::cuda</tt>
+ *         namespace for easy access.
+ *
+ */
+
+namespace cuda {
+using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::reference;
+} // end cuda
+
+/*! \}
+ */
+
+} // end system
+
+/*! \namespace thrust::cuda
+ *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda. */
+namespace cuda {
+using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::reference;
+} // end cuda
+
+} // end thrust
+
+#include <thrust/system/cuda/detail/pointer.inl>
diff --git a/thrust/thrust/system/cuda/vector.h b/thrust/thrust/system/cuda/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..9348057a70ba58fc459e7578ebbbff12c5bc3c0b
--- /dev/null
+++ b/thrust/thrust/system/cuda/vector.h
@@ -0,0 +1,72 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in ccudaliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/vector.h
+ *  \brief A dynamically-sizable array of elements which reside in memory available to
+ *         Thrust's CUDA system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/detail/vector_base.h>
+#include <vector>
+
+namespace thrust
+{
+
+// forward declaration of host_vector
+template<typename T, typename Allocator> class host_vector;
+
+namespace cuda_cub
+{
+
+/*! \p cuda_bulk::vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p cuda_bulk::vector may vary dynamically; memory management is
+ *  automatic. The elements contained in a \p cuda_bulk::vector reside in memory
+ *  available to the \p cuda_bulk system.
+ *
+ *  \tparam T The element type of the \p cuda_bulk::vector.
+ *  \tparam Allocator The allocator type of the \p cuda_bulk::vector. Defaults to \p cuda_bulk::allocator.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cuda_bulk::vector
+ *  \see device_vector
+ */
+template<typename T, typename Allocator = allocator<T> >
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+} // end cuda_cub
+
+// alias system::cuda_bulk names at top-level
+namespace cuda
+{
+
+using thrust::cuda_cub::vector;
+
+} // end cuda_bulk
+
+namespace system {
+namespace cuda {
+using thrust::cuda_cub::vector;
+}
+}
+
+} // end thrust
diff --git a/thrust/thrust/system/detail/adl/adjacent_difference.h b/thrust/thrust/system/detail/adl/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6f6c72820a2fe158d7389d03d49334e1675438b
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/adjacent_difference.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the adjacent_difference.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch adjacent_difference
+
+#include <thrust/system/detail/sequential/adjacent_difference.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/adjacent_difference.h>
+#include <thrust/system/cuda/detail/adjacent_difference.h>
+#include <thrust/system/omp/detail/adjacent_difference.h>
+#include <thrust/system/tbb/detail/adjacent_difference.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/adjacent_difference.h>
+#include __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER
+#undef __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/adjacent_difference.h>
+#include __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/assign_value.h b/thrust/thrust/system/detail/adl/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..d38934affd1a0a51fb64e011106b9af83dca8cdb
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/assign_value.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the assign_value.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch assign_value
+
+#include <thrust/system/detail/sequential/assign_value.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/assign_value.h>
+#include <thrust/system/cuda/detail/assign_value.h>
+#include <thrust/system/omp/detail/assign_value.h>
+#include <thrust/system/tbb/detail/assign_value.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/assign_value.h>
+#include __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER
+#undef __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/assign_value.h>
+#include __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/async/copy.h b/thrust/thrust/system/detail/adl/async/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..72debb3b66715b284056aea1648fcaf5589afd70
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/async/copy.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/copy.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async copy.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/copy.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/copy.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/copy.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/async/for_each.h b/thrust/thrust/system/detail/adl/async/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..08347f659279c487ac9ba5d29a6db3e4b9a6ddb9
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/async/for_each.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/for_each.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async for_each.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/for_each.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/for_each.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/for_each.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/async/reduce.h b/thrust/thrust/system/detail/adl/async/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..f13ab02fdca0b0fdf416f3bd117ee239e54df15c
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/async/reduce.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/reduce.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async reduce.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/reduce.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/reduce.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/reduce.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/async/sort.h b/thrust/thrust/system/detail/adl/async/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3a83ad404f8943d52dbeeca9a183997d8dff7c3
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/async/sort.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/sort.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async sort.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/sort.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/sort.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/sort.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/async/transform.h b/thrust/thrust/system/detail/adl/async/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..abb2163ead0654a805deb3b31ca29f8c576ac9e9
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/async/transform.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a transform of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/transform.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async transform.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/transform.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/transform.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/transform.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/binary_search.h b/thrust/thrust/system/detail/adl/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f9ac06df9bce42d261cf650b143d6389734988e
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/binary_search.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the binary_search.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch binary_search
+
+#include <thrust/system/detail/sequential/binary_search.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/binary_search.h>
+#include <thrust/system/cuda/detail/binary_search.h>
+#include <thrust/system/omp/detail/binary_search.h>
+#include <thrust/system/tbb/detail/binary_search.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/binary_search.h>
+#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
+#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/binary_search.h>
+#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER
+#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/copy.h b/thrust/thrust/system/detail/adl/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..0035b83efee0265e87fdc34f76c66c1f62d1cf60
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/copy.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the copy.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch copy
+
+#include <thrust/system/detail/sequential/copy.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/copy.h>
+#include <thrust/system/cuda/detail/copy.h>
+#include <thrust/system/omp/detail/copy.h>
+#include <thrust/system/tbb/detail/copy.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy.h>
+#include __THRUST_HOST_SYSTEM_COPY_HEADER
+#undef __THRUST_HOST_SYSTEM_COPY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy.h>
+#include __THRUST_DEVICE_SYSTEM_COPY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_COPY_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/copy_if.h b/thrust/thrust/system/detail/adl/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..31adaf8e1a7366071924404561873a1cf6b89042
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/copy_if.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy_if.h of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the copy_if.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch copy_if
+
+#include <thrust/system/detail/sequential/copy_if.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/copy_if.h>
+#include <thrust/system/cuda/detail/copy_if.h>
+#include <thrust/system/omp/detail/copy_if.h>
+#include <thrust/system/tbb/detail/copy_if.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_COPY_IF_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy_if.h>
+#include __THRUST_HOST_SYSTEM_COPY_IF_HEADER
+#undef __THRUST_HOST_SYSTEM_COPY_IF_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy_if.h>
+#include __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER
+#undef __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/count.h b/thrust/thrust/system/detail/adl/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d6f1f748ffea9d1b3a33c764cc2ac307b51a5f8
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/count.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a count of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the count.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch count
+
+#include <thrust/system/detail/sequential/count.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/count.h>
+#include <thrust/system/cuda/detail/count.h>
+#include <thrust/system/omp/detail/count.h>
+#include <thrust/system/tbb/detail/count.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_COUNT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/count.h>
+#include __THRUST_HOST_SYSTEM_COUNT_HEADER
+#undef __THRUST_HOST_SYSTEM_COUNT_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_COUNT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/count.h>
+#include __THRUST_DEVICE_SYSTEM_COUNT_HEADER
+#undef __THRUST_DEVICE_SYSTEM_COUNT_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/equal.h b/thrust/thrust/system/detail/adl/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b02e33b857eb9d7efae5747cc9bcbde6b8c0b17
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/equal.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a equal of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the equal.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch equal
+
+#include <thrust/system/detail/sequential/equal.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/equal.h>
+#include <thrust/system/cuda/detail/equal.h>
+#include <thrust/system/omp/detail/equal.h>
+#include <thrust/system/tbb/detail/equal.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_EQUAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/equal.h>
+#include __THRUST_HOST_SYSTEM_EQUAL_HEADER
+#undef __THRUST_HOST_SYSTEM_EQUAL_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_EQUAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/equal.h>
+#include __THRUST_DEVICE_SYSTEM_EQUAL_HEADER
+#undef __THRUST_DEVICE_SYSTEM_EQUAL_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/extrema.h b/thrust/thrust/system/detail/adl/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..62fb39be922d5dd0eb0be84b513ace96bcad3e7d
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/extrema.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a extrema of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the extrema.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch extrema
+
+#include <thrust/system/detail/sequential/extrema.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/extrema.h>
+#include <thrust/system/cuda/detail/extrema.h>
+#include <thrust/system/omp/detail/extrema.h>
+#include <thrust/system/tbb/detail/extrema.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_EXTREMA_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/extrema.h>
+#include __THRUST_HOST_SYSTEM_EXTREMA_HEADER
+#undef __THRUST_HOST_SYSTEM_EXTREMA_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/extrema.h>
+#include __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER
+#undef __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/fill.h b/thrust/thrust/system/detail/adl/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..f76a81b4f3477d87abe5b88c71f89e4158d68d28
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/fill.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the fill.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch fill
+
+#include <thrust/system/detail/sequential/fill.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/fill.h>
+#include <thrust/system/cuda/detail/fill.h>
+#include <thrust/system/omp/detail/fill.h>
+#include <thrust/system/tbb/detail/fill.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/fill.h>
+#include __THRUST_HOST_SYSTEM_FILL_HEADER
+#undef __THRUST_HOST_SYSTEM_FILL_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/fill.h>
+#include __THRUST_DEVICE_SYSTEM_FILL_HEADER
+#undef __THRUST_DEVICE_SYSTEM_FILL_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/find.h b/thrust/thrust/system/detail/adl/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d85e09a3ab713bfa9dd668bb0beb791cfc3db1b
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/find.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the find.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch find
+
+#include <thrust/system/detail/sequential/find.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/find.h>
+#include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/omp/detail/find.h>
+#include <thrust/system/tbb/detail/find.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_FIND_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/find.h>
+#include __THRUST_HOST_SYSTEM_FIND_HEADER
+#undef __THRUST_HOST_SYSTEM_FIND_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_FIND_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/find.h>
+#include __THRUST_DEVICE_SYSTEM_FIND_HEADER
+#undef __THRUST_DEVICE_SYSTEM_FIND_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/for_each.h b/thrust/thrust/system/detail/adl/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..8509edca3630f2c9d313e34efeca10abbee03e7b
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/for_each.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the for_each.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch for_each
+
+#include <thrust/system/detail/sequential/for_each.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/for_each.h>
+#include <thrust/system/cuda/detail/for_each.h>
+#include <thrust/system/omp/detail/for_each.h>
+#include <thrust/system/tbb/detail/for_each.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/for_each.h>
+#include __THRUST_HOST_SYSTEM_FOR_EACH_HEADER
+#undef __THRUST_HOST_SYSTEM_FOR_EACH_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/for_each.h>
+#include __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER
+#undef __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/gather.h b/thrust/thrust/system/detail/adl/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..242da3c9095757a2c7de9e0b97ae5fe4118c8172
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/gather.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the gather.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch gather
+
+#include <thrust/system/detail/sequential/gather.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/gather.h>
+#include <thrust/system/cuda/detail/gather.h>
+#include <thrust/system/omp/detail/gather.h>
+#include <thrust/system/tbb/detail/gather.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_GATHER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/gather.h>
+#include __THRUST_HOST_SYSTEM_GATHER_HEADER
+#undef __THRUST_HOST_SYSTEM_GATHER_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_GATHER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/gather.h>
+#include __THRUST_DEVICE_SYSTEM_GATHER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_GATHER_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/generate.h b/thrust/thrust/system/detail/adl/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b1d7b4bac5f24a40072e461d9bce530dfa12319
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/generate.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the generate.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch generate
+
+#include <thrust/system/detail/sequential/generate.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/generate.h>
+#include <thrust/system/cuda/detail/generate.h>
+#include <thrust/system/omp/detail/generate.h>
+#include <thrust/system/tbb/detail/generate.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_GENERATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/generate.h>
+#include __THRUST_HOST_SYSTEM_GENERATE_HEADER
+#undef __THRUST_HOST_SYSTEM_GENERATE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_GENERATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/generate.h>
+#include __THRUST_DEVICE_SYSTEM_GENERATE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_GENERATE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/get_value.h b/thrust/thrust/system/detail/adl/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..306eb423eb4b1bc55c01c12eca0087a95b0ff376
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/get_value.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the get_value.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch get_value
+
+#include <thrust/system/detail/sequential/get_value.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/get_value.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/omp/detail/get_value.h>
+#include <thrust/system/tbb/detail/get_value.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_GET_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/get_value.h>
+#include __THRUST_HOST_SYSTEM_GET_VALUE_HEADER
+#undef __THRUST_HOST_SYSTEM_GET_VALUE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/get_value.h>
+#include __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/inner_product.h b/thrust/thrust/system/detail/adl/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..9423b1bdbf19deba9f7ec91d9ce4417b0c2f5145
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/inner_product.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the inner_product.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch inner_product
+
+#include <thrust/system/detail/sequential/inner_product.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/inner_product.h>
+#include <thrust/system/cuda/detail/inner_product.h>
+#include <thrust/system/omp/detail/inner_product.h>
+#include <thrust/system/tbb/detail/inner_product.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/inner_product.h>
+#include __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER
+#undef __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/inner_product.h>
+#include __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER
+#undef __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/iter_swap.h b/thrust/thrust/system/detail/adl/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9da52a6274c151e8602f41b72a0a9dafed13c26
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/iter_swap.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the iter_swap.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch iter_swap
+
+#include <thrust/system/detail/sequential/iter_swap.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/iter_swap.h>
+#include <thrust/system/cuda/detail/iter_swap.h>
+#include <thrust/system/omp/detail/iter_swap.h>
+#include <thrust/system/tbb/detail/iter_swap.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/iter_swap.h>
+#include __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER
+#undef __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/iter_swap.h>
+#include __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/logical.h b/thrust/thrust/system/detail/adl/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdaad4d293a2695f0a1218e0cf828eb12406ad16
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/logical.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the logical.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch logical
+
+#include <thrust/system/detail/sequential/logical.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/logical.h>
+#include <thrust/system/cuda/detail/logical.h>
+#include <thrust/system/omp/detail/logical.h>
+#include <thrust/system/tbb/detail/logical.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_LOGICAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/logical.h>
+#include __THRUST_HOST_SYSTEM_LOGICAL_HEADER
+#undef __THRUST_HOST_SYSTEM_LOGICAL_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/logical.h>
+#include __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER
+#undef __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/malloc_and_free.h b/thrust/thrust/system/detail/adl/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..c36db0270ddfd2ee2b56f2fa04272b6dcbce8796
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/malloc_and_free.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the malloc_and_free.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch malloc_and_free
+
+#include <thrust/system/detail/sequential/malloc_and_free.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+#include <thrust/system/cuda/detail/malloc_and_free.h>
+#include <thrust/system/omp/detail/malloc_and_free.h>
+#include <thrust/system/tbb/detail/malloc_and_free.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/malloc_and_free.h>
+#include __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER
+#undef __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/malloc_and_free.h>
+#include __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/merge.h b/thrust/thrust/system/detail/adl/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..7abca9bcf3f1697196277c40c4718c2c5004de9a
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/merge.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the merge.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch merge
+
+#include <thrust/system/detail/sequential/merge.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/merge.h>
+#include <thrust/system/cuda/detail/merge.h>
+#include <thrust/system/omp/detail/merge.h>
+#include <thrust/system/tbb/detail/merge.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_MERGE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/merge.h>
+#include __THRUST_HOST_SYSTEM_MERGE_HEADER
+#undef __THRUST_HOST_SYSTEM_MERGE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_MERGE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/merge.h>
+#include __THRUST_DEVICE_SYSTEM_MERGE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MERGE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/mismatch.h b/thrust/thrust/system/detail/adl/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..74feb826928851a2d09e92cb5403dc0375cda288
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/mismatch.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the mismatch.h header
+// of the host and device systems. It should be #included in any
+// code which uses adl to dispatch mismatch
+
+#include <thrust/system/detail/sequential/mismatch.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/mismatch.h>
+#include <thrust/system/cuda/detail/mismatch.h>
+#include <thrust/system/omp/detail/mismatch.h>
+#include <thrust/system/tbb/detail/mismatch.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_MISMATCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/mismatch.h>
+#include __THRUST_HOST_SYSTEM_MISMATCH_HEADER
+#undef __THRUST_HOST_SYSTEM_MISMATCH_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/mismatch.h>
+#include __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/partition.h b/thrust/thrust/system/detail/adl/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..a45f845a5c6ec5bc0016bdfb823e3b9b3d695276
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/partition.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the partition.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch partition
+
+#include <thrust/system/detail/sequential/partition.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/partition.h>
+#include <thrust/system/cuda/detail/partition.h>
+#include <thrust/system/omp/detail/partition.h>
+#include <thrust/system/tbb/detail/partition.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_PARTITION_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/partition.h>
+#include __THRUST_HOST_SYSTEM_PARTITION_HEADER
+#undef __THRUST_HOST_SYSTEM_PARTITION_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_PARTITION_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/partition.h>
+#include __THRUST_DEVICE_SYSTEM_PARTITION_HEADER
+#undef __THRUST_DEVICE_SYSTEM_PARTITION_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/per_device_resource.h b/thrust/thrust/system/detail/adl/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..721f49e03fd49c5db5b1094575a62630d0509fc1
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/per_device_resource.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the per_device_resource.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch per_device_resource
+
+#include <thrust/system/detail/sequential/per_device_resource.h>
+
+#if 0
+#include <thrust/system/cpp/detail/per_device_resource.h>
+#include <thrust/system/cuda/detail/per_device_resource.h>
+#include <thrust/system/omp/detail/per_device_resource.h>
+#include <thrust/system/tbb/detail/per_device_resource.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/per_device_resource.h>
+#include __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+#undef __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/per_device_resource.h>
+#include __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/reduce.h b/thrust/thrust/system/detail/adl/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a9673b3f957e590c60d7667fc57d4f50069c409
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/reduce.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the reduce.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch reduce
+
+#include <thrust/system/detail/sequential/reduce.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reduce.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/system/omp/detail/reduce.h>
+#include <thrust/system/tbb/detail/reduce.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce.h>
+#include __THRUST_HOST_SYSTEM_REDUCE_HEADER
+#undef __THRUST_HOST_SYSTEM_REDUCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce.h>
+#include __THRUST_DEVICE_SYSTEM_REDUCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_REDUCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/reduce_by_key.h b/thrust/thrust/system/detail/adl/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..0605f9befaf97bea651e2fde12c790fcd7103744
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/reduce_by_key.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the reduce_by_key.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch reduce_by_key
+
+#include <thrust/system/detail/sequential/reduce_by_key.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reduce_by_key.h>
+#include <thrust/system/cuda/detail/reduce_by_key.h>
+#include <thrust/system/omp/detail/reduce_by_key.h>
+#include <thrust/system/tbb/detail/reduce_by_key.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce_by_key.h>
+#include __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER
+#undef __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce_by_key.h>
+#include __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/remove.h b/thrust/thrust/system/detail/adl/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..c281379d54ed74d0a55453b92a39557999f2ad01
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/remove.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the remove.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch remove
+
+#include <thrust/system/detail/sequential/remove.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/remove.h>
+#include <thrust/system/cuda/detail/remove.h>
+#include <thrust/system/omp/detail/remove.h>
+#include <thrust/system/tbb/detail/remove.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_REMOVE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/remove.h>
+#include __THRUST_HOST_SYSTEM_REMOVE_HEADER
+#undef __THRUST_HOST_SYSTEM_REMOVE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_REMOVE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/remove.h>
+#include __THRUST_DEVICE_SYSTEM_REMOVE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_REMOVE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/replace.h b/thrust/thrust/system/detail/adl/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8fb5746f1ced28be6571c8535ce0d8615863234
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/replace.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the replace.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch replace
+
+#include <thrust/system/detail/sequential/replace.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/replace.h>
+#include <thrust/system/cuda/detail/replace.h>
+#include <thrust/system/omp/detail/replace.h>
+#include <thrust/system/tbb/detail/replace.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_REPLACE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/replace.h>
+#include __THRUST_HOST_SYSTEM_REPLACE_HEADER
+#undef __THRUST_HOST_SYSTEM_REPLACE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_REPLACE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/replace.h>
+#include __THRUST_DEVICE_SYSTEM_REPLACE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_REPLACE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/reverse.h b/thrust/thrust/system/detail/adl/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6bd8947ee4bf3715441e9516160082427aa1491
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/reverse.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the reverse.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch reverse
+
+#include <thrust/system/detail/sequential/reverse.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reverse.h>
+#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/system/omp/detail/reverse.h>
+#include <thrust/system/tbb/detail/reverse.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_REVERSE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reverse.h>
+#include __THRUST_HOST_SYSTEM_REVERSE_HEADER
+#undef __THRUST_HOST_SYSTEM_REVERSE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_REVERSE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reverse.h>
+#include __THRUST_DEVICE_SYSTEM_REVERSE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_REVERSE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/scan.h b/thrust/thrust/system/detail/adl/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..a24910410589c68c6bb122be24a50c04e44a4204
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/scan.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the scan.h header
+// of the host and device systems. It should be #included in any
+// code which uses adl to dispatch scan
+
+#include <thrust/system/detail/sequential/scan.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scan.h>
+#include <thrust/system/cuda/detail/scan.h>
+#include <thrust/system/omp/detail/scan.h>
+#include <thrust/system/tbb/detail/scan.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan.h>
+#include __THRUST_HOST_SYSTEM_SCAN_HEADER
+#undef __THRUST_HOST_SYSTEM_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan.h>
+#include __THRUST_DEVICE_SYSTEM_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SCAN_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/scan_by_key.h b/thrust/thrust/system/detail/adl/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..94f73503cac55cb65419bfd81dc9e6c9e7e63c0c
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/scan_by_key.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the scan_by_key.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch scan_by_key
+
+#include <thrust/system/detail/sequential/scan_by_key.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scan_by_key.h>
+#include <thrust/system/cuda/detail/scan_by_key.h>
+#include <thrust/system/omp/detail/scan_by_key.h>
+#include <thrust/system/tbb/detail/scan_by_key.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan_by_key.h>
+#include __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER
+#undef __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan_by_key.h>
+#include __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/scatter.h b/thrust/thrust/system/detail/adl/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9f42b28b13cfa7928c54e76c950224b4bcfb66a
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/scatter.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the scatter.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch scatter
+
+#include <thrust/system/detail/sequential/scatter.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scatter.h>
+#include <thrust/system/cuda/detail/scatter.h>
+#include <thrust/system/omp/detail/scatter.h>
+#include <thrust/system/tbb/detail/scatter.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SCATTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scatter.h>
+#include __THRUST_HOST_SYSTEM_SCATTER_HEADER
+#undef __THRUST_HOST_SYSTEM_SCATTER_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SCATTER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scatter.h>
+#include __THRUST_DEVICE_SYSTEM_SCATTER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SCATTER_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/sequence.h b/thrust/thrust/system/detail/adl/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3c2a20f47e6f7c7c7e79a8d348ab30a7a1eb7d8
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/sequence.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the sequence.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch sequence
+
+#include <thrust/system/detail/sequential/sequence.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/sequence.h>
+#include <thrust/system/cuda/detail/sequence.h>
+#include <thrust/system/omp/detail/sequence.h>
+#include <thrust/system/tbb/detail/sequence.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SEQUENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sequence.h>
+#include __THRUST_HOST_SYSTEM_SEQUENCE_HEADER
+#undef __THRUST_HOST_SYSTEM_SEQUENCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sequence.h>
+#include __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/set_operations.h b/thrust/thrust/system/detail/adl/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d09355e12ad1beaf3ad6af34558d5a8692bd62a
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/set_operations.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the set_operations.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch set_operations
+
+#include <thrust/system/detail/sequential/set_operations.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/set_operations.h>
+#include <thrust/system/cuda/detail/set_operations.h>
+#include <thrust/system/omp/detail/set_operations.h>
+#include <thrust/system/tbb/detail/set_operations.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/set_operations.h>
+#include __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER
+#undef __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/set_operations.h>
+#include __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/sort.h b/thrust/thrust/system/detail/adl/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f6118c90bf6345a1fd4d6eb2f05d2630911fa64
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/sort.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the sort.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch sort
+
+#include <thrust/system/detail/sequential/sort.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/sort.h>
+#include <thrust/system/cuda/detail/sort.h>
+#include <thrust/system/omp/detail/sort.h>
+#include <thrust/system/tbb/detail/sort.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sort.h>
+#include __THRUST_HOST_SYSTEM_SORT_HEADER
+#undef __THRUST_HOST_SYSTEM_SORT_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SORT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sort.h>
+#include __THRUST_DEVICE_SYSTEM_SORT_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SORT_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/swap_ranges.h b/thrust/thrust/system/detail/adl/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ca3719d96c1ed8de468a4b2ff4ab549bd4e88c0
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/swap_ranges.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the swap_ranges.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch swap_ranges
+
+#include <thrust/system/detail/sequential/swap_ranges.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/swap_ranges.h>
+#include <thrust/system/cuda/detail/swap_ranges.h>
+#include <thrust/system/omp/detail/swap_ranges.h>
+#include <thrust/system/tbb/detail/swap_ranges.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/swap_ranges.h>
+#include __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER
+#undef __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/swap_ranges.h>
+#include __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER
+#undef __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/tabulate.h b/thrust/thrust/system/detail/adl/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ae2b22a5cbd3d2705cf4b13757c050b7c6161cc
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/tabulate.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the tabulate.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch tabulate
+
+#include <thrust/system/detail/sequential/tabulate.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/tabulate.h>
+#include <thrust/system/cuda/detail/tabulate.h>
+#include <thrust/system/omp/detail/tabulate.h>
+#include <thrust/system/tbb/detail/tabulate.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_TABULATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/tabulate.h>
+#include __THRUST_HOST_SYSTEM_TABULATE_HEADER
+#undef __THRUST_HOST_SYSTEM_TABULATE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_TABULATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/tabulate.h>
+#include __THRUST_DEVICE_SYSTEM_TABULATE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TABULATE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/temporary_buffer.h b/thrust/thrust/system/detail/adl/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cada5ee4b10a9fc36d19f80a276bb19ef7fff6d
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/temporary_buffer.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the temporary_buffer.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch get_temporary_buffer or return_temporary_buffer
+
+#include <thrust/system/detail/sequential/temporary_buffer.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/temporary_buffer.h>
+#include <thrust/system/cuda/detail/temporary_buffer.h>
+#include <thrust/system/omp/detail/temporary_buffer.h>
+#include <thrust/system/tbb/detail/temporary_buffer.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/temporary_buffer.h>
+#include __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER
+#undef __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/temporary_buffer.h>
+#include __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/transform.h b/thrust/thrust/system/detail/adl/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..b70333093fd48b6c23fa2e8ec3ab20a8e51cad9f
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/transform.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the transform.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch transform
+
+#include <thrust/system/detail/sequential/transform.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform.h>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/system/omp/detail/transform.h>
+#include <thrust/system/tbb/detail/transform.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform.h>
+#include __THRUST_HOST_SYSTEM_TRANSFORM_HEADER
+#undef __THRUST_HOST_SYSTEM_TRANSFORM_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform.h>
+#include __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/transform_reduce.h b/thrust/thrust/system/detail/adl/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3f9494dfa6e54bbfdeb2a51fabd8bebc2188e98
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/transform_reduce.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the transform_reduce.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch transform_reduce
+
+#include <thrust/system/detail/sequential/transform_reduce.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform_reduce.h>
+#include <thrust/system/cuda/detail/transform_reduce.h>
+#include <thrust/system/omp/detail/transform_reduce.h>
+#include <thrust/system/tbb/detail/transform_reduce.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_reduce.h>
+#include __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER
+#undef __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_reduce.h>
+#include __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/transform_scan.h b/thrust/thrust/system/detail/adl/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a05c7eeed9afc549d9d6bd1e28a64e67b4a4578
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/transform_scan.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the transform_scan.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch transform_scan
+
+#include <thrust/system/detail/sequential/transform_scan.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform_scan.h>
+#include <thrust/system/cuda/detail/transform_scan.h>
+#include <thrust/system/omp/detail/transform_scan.h>
+#include <thrust/system/tbb/detail/transform_scan.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_scan.h>
+#include __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER
+#undef __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_scan.h>
+#include __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/uninitialized_copy.h b/thrust/thrust/system/detail/adl/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a13b18aa8d73dae57b450a0a53e2fb97de2165ea
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/uninitialized_copy.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the uninitialized_copy.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch uninitialized_copy
+
+#include <thrust/system/detail/sequential/uninitialized_copy.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/omp/detail/uninitialized_copy.h>
+#include <thrust/system/tbb/detail/uninitialized_copy.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_copy.h>
+#include __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER
+#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_copy.h>
+#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/uninitialized_fill.h b/thrust/thrust/system/detail/adl/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..98b57836e124dcd75fcd7a7fe75c0646ed3f76ba
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/uninitialized_fill.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the uninitialized_fill.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch uninitialized_fill
+
+#include <thrust/system/detail/sequential/uninitialized_fill.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/uninitialized_fill.h>
+#include <thrust/system/cuda/detail/uninitialized_fill.h>
+#include <thrust/system/omp/detail/uninitialized_fill.h>
+#include <thrust/system/tbb/detail/uninitialized_fill.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_fill.h>
+#include __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER
+#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_fill.h>
+#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER
+#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/unique.h b/thrust/thrust/system/detail/adl/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..4082f5299269e77aacbae174754d57977d45ebdd
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/unique.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the unique.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch unique
+
+#include <thrust/system/detail/sequential/unique.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/unique.h>
+#include <thrust/system/cuda/detail/unique.h>
+#include <thrust/system/omp/detail/unique.h>
+#include <thrust/system/tbb/detail/unique.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_UNIQUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique.h>
+#include __THRUST_HOST_SYSTEM_UNIQUE_HEADER
+#undef __THRUST_HOST_SYSTEM_UNIQUE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique.h>
+#include __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER
+
diff --git a/thrust/thrust/system/detail/adl/unique_by_key.h b/thrust/thrust/system/detail/adl/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcf9acd42cd730a8f42bedb01407cf75137a86fb
--- /dev/null
+++ b/thrust/thrust/system/detail/adl/unique_by_key.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a fill of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the unique_by_key.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch unique_by_key
+
+#include <thrust/system/detail/sequential/unique_by_key.h>
+
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/unique_by_key.h>
+#include <thrust/system/cuda/detail/unique_by_key.h>
+#include <thrust/system/omp/detail/unique_by_key.h>
+#include <thrust/system/tbb/detail/unique_by_key.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique_by_key.h>
+#include __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER
+#undef __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique_by_key.h>
+#include __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER
+
diff --git a/thrust/thrust/system/detail/bad_alloc.h b/thrust/thrust/system/detail/bad_alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..461704fd6b74a33f3c9c789f0f02833bf49586d3
--- /dev/null
+++ b/thrust/thrust/system/detail/bad_alloc.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <new>
+#include <string>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+
+// define our own bad_alloc so we can set its .what()
+class bad_alloc
+  : public std::bad_alloc
+{
+  public:
+    inline bad_alloc(const std::string &w)
+      : std::bad_alloc(), m_what()
+    {
+      m_what = std::bad_alloc::what();
+      m_what += ": ";
+      m_what += w;
+    } // end bad_alloc()
+
+    inline virtual ~bad_alloc(void) throw () {};
+
+    inline virtual const char *what(void) const throw()
+    {
+      return m_what.c_str();
+    } // end what()
+
+  private:
+    std::string m_what;
+}; // end bad_alloc
+  
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/errno.h b/thrust/thrust/system/detail/errno.h
new file mode 100644
index 0000000000000000000000000000000000000000..78aec2acea7b9a2ddc1b131c33624872b79db13d
--- /dev/null
+++ b/thrust/thrust/system/detail/errno.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// The rationale for the existence of these apparently redundant definitions is
+// to provide them portably and to avoid bringing in system headers which might
+// pollute the global namespace. These identifiers are in lowercase to avoid
+// colliding with the real macros in errno.h.
+
+namespace thrust
+{
+
+namespace system
+{
+
+namespace detail
+{
+
+static const int eafnosupport    = 9901;
+static const int eaddrinuse      = 9902;
+static const int eaddrnotavail   = 9903;
+static const int eisconn         = 9904;
+static const int ebadmsg         = 9905;
+static const int econnaborted    = 9906;
+static const int ealready        = 9907;
+static const int econnrefused    = 9908;
+static const int econnreset      = 9909;
+static const int edestaddrreq    = 9910;
+static const int ehostunreach    = 9911;
+static const int eidrm           = 9912;
+static const int emsgsize        = 9913;
+static const int enetdown        = 9914;
+static const int enetreset       = 9915;
+static const int enetunreach     = 9916;
+static const int enobufs         = 9917;
+static const int enolink         = 9918;
+static const int enodata         = 9919;
+static const int enomsg          = 9920;
+static const int enoprotoopt     = 9921;
+static const int enosr           = 9922;
+static const int enotsock        = 9923;
+static const int enostr          = 9924;
+static const int enotconn        = 9925;
+static const int enotsup         = 9926;
+static const int ecanceled       = 9927;
+static const int einprogress     = 9928;
+static const int eopnotsupp      = 9929;
+static const int ewouldblock     = 9930;
+static const int eownerdead      = 9931;
+static const int eproto          = 9932;
+static const int eprotonosupport = 9933;
+static const int enotrecoverable = 9934;
+static const int etime           = 9935;
+static const int etxtbsy         = 9936;
+static const int etimedout       = 9938;
+static const int eloop           = 9939;
+static const int eoverflow       = 9940;
+static const int eprototype      = 9941;
+static const int enosys          = 9942;
+static const int einval          = 9943;
+static const int erange          = 9944;
+static const int eilseq          = 9945;
+static const int e2big           = 9946;
+static const int edom            = 9947;
+static const int efault          = 9948;
+static const int ebadf           = 9949;
+static const int epipe           = 9950;
+static const int exdev           = 9951;
+static const int ebusy           = 9952;
+static const int enotempty       = 9953;
+static const int enoexec         = 9954;
+static const int eexist          = 9955;
+static const int efbig           = 9956;
+static const int enametoolong    = 9957;
+static const int enotty          = 9958;
+static const int eintr           = 9959;
+static const int espipe          = 9960;
+static const int eio             = 9961;
+static const int eisdir          = 9962;
+static const int echild          = 9963;
+static const int enolck          = 9964;
+static const int enospc          = 9965;
+static const int enxio           = 9966;
+static const int enodev          = 9967;
+static const int enoent          = 9968;
+static const int esrch           = 9969;
+static const int enotdir         = 9970;
+static const int enomem          = 9971;
+static const int eperm           = 9972;
+static const int eacces          = 9973;
+static const int erofs           = 9974;
+static const int edeadlk         = 9975;
+static const int eagain          = 9976;
+static const int enfile          = 9977;
+static const int emfile          = 9978;
+static const int emlink          = 9979;
+
+} // end detail
+
+} // end system
+
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/error_category.inl b/thrust/thrust/system/detail/error_category.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4602b0f300709de4cb91a7a6553f289d61293844
--- /dev/null
+++ b/thrust/thrust/system/detail/error_category.inl
@@ -0,0 +1,236 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/system/error_code.h>
+#include <thrust/system/detail/errno.h>
+#include <thrust/functional.h>
+#include <cstring>
+
+namespace thrust
+{
+
+namespace system
+{
+
+error_category
+  ::~error_category(void)
+{
+  ;
+} // end error_category::~error_category()
+
+
+error_condition error_category
+  ::default_error_condition(int ev) const
+{
+  return error_condition(ev, *this);
+} // end error_category::default_error_condition()
+
+
+bool error_category
+  ::equivalent(int code, const error_condition &condition) const
+{
+  return default_error_condition(code) == condition;
+} // end error_condition::equivalent()
+
+
+bool error_category
+  ::equivalent(const error_code &code, int condition) const
+{
+  bool result = (this->operator==(code.category())) && (code.value() == condition);
+  return result;
+} // end error_code::equivalent()
+
+
+bool error_category
+  ::operator==(const error_category &rhs) const
+{
+  return this == &rhs;
+} // end error_category::operator==()
+
+
+bool error_category
+  ::operator!=(const error_category &rhs) const
+{
+  return !this->operator==(rhs);
+} // end error_category::operator!=()
+
+
+bool error_category
+  ::operator<(const error_category &rhs) const
+{
+  return thrust::less<const error_category*>()(this,&rhs);
+} // end error_category::operator<()
+
+
+namespace detail
+{
+
+
+class generic_error_category
+  : public error_category
+{
+  public:
+    inline generic_error_category(void) {}
+
+    inline virtual const char *name(void) const
+    {
+      return "generic";
+    }
+
+    inline virtual std::string message(int ev) const
+    {
+      static const std::string unknown_err("Unknown error");
+
+      // XXX strerror is not thread-safe:
+      //     prefer strerror_r (which is not provided on windows)
+      THRUST_DISABLE_MSVC_WARNING_BEGIN(4996)
+      const char *c_str = std::strerror(ev);
+      THRUST_DISABLE_MSVC_WARNING_END(4996)
+      return c_str ? std::string(c_str) : unknown_err;
+    }
+}; // end generic_category_result
+
+
+class system_error_category
+  : public error_category
+{
+  public:
+    inline system_error_category(void) {}
+
+    inline virtual const char *name(void) const
+    {
+      return "system";
+    }
+
+    inline virtual std::string message(int ev) const
+    {
+      return generic_category().message(ev);
+    }
+
+    inline virtual error_condition default_error_condition(int ev) const
+    {
+      using namespace errc;
+
+      switch(ev)
+      {
+        case eafnosupport:    return make_error_condition(address_family_not_supported);
+        case eaddrinuse:      return make_error_condition(address_in_use);
+        case eaddrnotavail:   return make_error_condition(address_not_available);
+        case eisconn:         return make_error_condition(already_connected);
+        case e2big:           return make_error_condition(argument_list_too_long);
+        case edom:            return make_error_condition(argument_out_of_domain);
+        case efault:          return make_error_condition(bad_address);
+        case ebadf:           return make_error_condition(bad_file_descriptor);
+        case ebadmsg:         return make_error_condition(bad_message);
+        case epipe:           return make_error_condition(broken_pipe);
+        case econnaborted:    return make_error_condition(connection_aborted);
+        case ealready:        return make_error_condition(connection_already_in_progress);
+        case econnrefused:    return make_error_condition(connection_refused);
+        case econnreset:      return make_error_condition(connection_reset);
+        case exdev:           return make_error_condition(cross_device_link);
+        case edestaddrreq:    return make_error_condition(destination_address_required);
+        case ebusy:           return make_error_condition(device_or_resource_busy);
+        case enotempty:       return make_error_condition(directory_not_empty);
+        case enoexec:         return make_error_condition(executable_format_error);
+        case eexist:          return make_error_condition(file_exists);
+        case efbig:           return make_error_condition(file_too_large);
+        case enametoolong:    return make_error_condition(filename_too_long);
+        case enosys:          return make_error_condition(function_not_supported);
+        case ehostunreach:    return make_error_condition(host_unreachable);
+        case eidrm:           return make_error_condition(identifier_removed);
+        case eilseq:          return make_error_condition(illegal_byte_sequence);
+        case enotty:          return make_error_condition(inappropriate_io_control_operation);
+        case eintr:           return make_error_condition(interrupted);
+        case einval:          return make_error_condition(invalid_argument);
+        case espipe:          return make_error_condition(invalid_seek);
+        case eio:             return make_error_condition(io_error);
+        case eisdir:          return make_error_condition(is_a_directory);
+        case emsgsize:        return make_error_condition(message_size);
+        case enetdown:        return make_error_condition(network_down);
+        case enetreset:       return make_error_condition(network_reset);
+        case enetunreach:     return make_error_condition(network_unreachable);
+        case enobufs:         return make_error_condition(no_buffer_space);
+        case echild:          return make_error_condition(no_child_process);
+        case enolink:         return make_error_condition(no_link);
+        case enolck:          return make_error_condition(no_lock_available);
+        case enodata:         return make_error_condition(no_message_available);
+        case enomsg:          return make_error_condition(no_message);
+        case enoprotoopt:     return make_error_condition(no_protocol_option);
+        case enospc:          return make_error_condition(no_space_on_device);
+        case enosr:           return make_error_condition(no_stream_resources);
+        case enxio:           return make_error_condition(no_such_device_or_address);
+        case enodev:          return make_error_condition(no_such_device);
+        case enoent:          return make_error_condition(no_such_file_or_directory);
+        case esrch:           return make_error_condition(no_such_process);
+        case enotdir:         return make_error_condition(not_a_directory);
+        case enotsock:        return make_error_condition(not_a_socket);
+        case enostr:          return make_error_condition(not_a_stream);
+        case enotconn:        return make_error_condition(not_connected);
+        case enomem:          return make_error_condition(not_enough_memory);
+        case enotsup:         return make_error_condition(not_supported);
+        case ecanceled:       return make_error_condition(operation_canceled);
+        case einprogress:     return make_error_condition(operation_in_progress);
+        case eperm:           return make_error_condition(operation_not_permitted);
+        case eopnotsupp:      return make_error_condition(operation_not_supported);
+        case ewouldblock:     return make_error_condition(operation_would_block);
+        case eownerdead:      return make_error_condition(owner_dead);
+        case eacces:          return make_error_condition(permission_denied);
+        case eproto:          return make_error_condition(protocol_error);
+        case eprotonosupport: return make_error_condition(protocol_not_supported);
+        case erofs:           return make_error_condition(read_only_file_system);
+        case edeadlk:         return make_error_condition(resource_deadlock_would_occur);
+        case eagain:          return make_error_condition(resource_unavailable_try_again);
+        case erange:          return make_error_condition(result_out_of_range);
+        case enotrecoverable: return make_error_condition(state_not_recoverable);
+        case etime:           return make_error_condition(stream_timeout);
+        case etxtbsy:         return make_error_condition(text_file_busy);
+        case etimedout:       return make_error_condition(timed_out);
+        case enfile:          return make_error_condition(too_many_files_open_in_system);
+        case emfile:          return make_error_condition(too_many_files_open);
+        case emlink:          return make_error_condition(too_many_links);
+        case eloop:           return make_error_condition(too_many_symbolic_link_levels);
+        case eoverflow:       return make_error_condition(value_too_large);
+        case eprototype:      return make_error_condition(wrong_protocol_type);
+        default:              return error_condition(ev,system_category());
+      }
+    }
+}; // end system_category_result
+
+
+} // end detail
+
+
+const error_category &generic_category(void)
+{
+  static const detail::generic_error_category result;
+  return result;
+}
+
+
+const error_category &system_category(void)
+{
+  static const detail::system_error_category result;
+  return result;
+}
+
+
+} // end system
+
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/error_code.inl b/thrust/thrust/system/detail/error_code.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6631f486fef2fc2622a8deb5010fb0c7aab8a8ab
--- /dev/null
+++ b/thrust/thrust/system/detail/error_code.inl
@@ -0,0 +1,197 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/system/error_code.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+error_code
+  ::error_code(void)
+    :m_val(0),m_cat(&system_category())
+{
+  ;
+} // end error_code::error_code()
+
+
+error_code
+  ::error_code(int val, const error_category &cat)
+    :m_val(val),m_cat(&cat)
+{
+  ;
+} // end error_code::error_code()
+
+
+template <typename ErrorCodeEnum>
+  error_code
+    ::error_code(ErrorCodeEnum e
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                 , typename thrust::detail::enable_if<is_error_code_enum<ErrorCodeEnum>::value>::type *
+#endif // THRUST_HOST_COMPILER_MSVC
+                )
+{
+  *this = make_error_code(e);
+} // end error_code::error_code()
+
+
+void error_code
+  ::assign(int val, const error_category &cat)
+{
+  m_val = val;
+  m_cat = &cat;
+} // end error_code::assign()
+
+
+template <typename ErrorCodeEnum>
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+  typename thrust::detail::enable_if<is_error_code_enum<ErrorCodeEnum>::value, error_code>::type &
+#else
+  error_code &
+#endif // THRUST_HOST_COMPILER_MSVC
+    error_code
+      ::operator=(ErrorCodeEnum e)
+{
+  *this = make_error_code(e);
+  return *this;
+} // end error_code::operator=()
+
+
+void error_code
+  ::clear(void)
+{
+  m_val = 0;
+  m_cat = &system_category();
+} // end error_code::clear()
+
+
+int error_code
+  ::value(void) const
+{
+  return m_val;
+} // end error_code::value()
+
+
+const error_category &error_code
+  ::category(void) const
+{
+  return *m_cat;
+} // end error_code::category()
+
+
+error_condition error_code
+  ::default_error_condition(void) const
+{
+  return category().default_error_condition(value());
+} // end error_code::default_error_condition()
+
+
+std::string error_code
+  ::message(void) const
+{
+  return category().message(value());
+} // end error_code::message()
+
+
+error_code
+  ::operator bool (void) const
+{
+  return value() != 0;
+} // end error_code::operator bool ()
+
+
+error_code make_error_code(errc::errc_t e)
+{
+  return error_code(static_cast<int>(e), generic_category());
+} // end make_error_code()
+
+
+bool operator<(const error_code &lhs, const error_code &rhs)
+{
+  bool result = lhs.category().operator<(rhs.category());
+  result = result || lhs.category().operator==(rhs.category());
+  result = result || lhs.value() < rhs.value();
+  return result;
+} // end operator==()
+
+
+template<typename charT, typename traits>
+  std::basic_ostream<charT,traits>&
+    operator<<(std::basic_ostream<charT,traits> &os, const error_code &ec)
+{
+  return os << ec.category().name() << ':' << ec.value();
+} // end operator<<()
+
+
+bool operator==(const error_code &lhs, const error_code &rhs)
+{
+  return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value();
+} // end operator==()
+
+
+bool operator==(const error_code &lhs, const error_condition &rhs)
+{
+  return lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value());
+} // end operator==()
+
+
+bool operator==(const error_condition &lhs, const error_code &rhs)
+{
+  return rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value());
+} // end operator==()
+
+
+bool operator==(const error_condition &lhs, const error_condition &rhs)
+{
+  return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value();
+} // end operator==()
+
+
+bool operator!=(const error_code &lhs, const error_code &rhs)
+{
+  return !(lhs == rhs);
+} // end operator!=()
+
+
+bool operator!=(const error_code &lhs, const error_condition &rhs)
+{
+  return !(lhs == rhs);
+} // end operator!=()
+
+
+bool operator!=(const error_condition &lhs, const error_code &rhs)
+{
+  return !(lhs == rhs);
+} // end operator!=()
+
+
+bool operator!=(const error_condition &lhs, const error_condition &rhs)
+{
+  return !(lhs == rhs);
+} // end operator!=()
+
+
+} // end system
+
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/error_condition.inl b/thrust/thrust/system/detail/error_condition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9dc493bcc9ee5007f0787b391eda5ad0a7faf479
--- /dev/null
+++ b/thrust/thrust/system/detail/error_condition.inl
@@ -0,0 +1,133 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/system/detail/error_condition.inl>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+error_condition
+  ::error_condition(void)
+    :m_val(0),m_cat(&generic_category())
+{
+  ;
+} // end error_condition::error_condition()
+
+
+error_condition
+  ::error_condition(int val, const error_category &cat)
+    :m_val(val),m_cat(&cat)
+{
+  ;
+} // end error_condition::error_condition()
+
+
+template<typename ErrorConditionEnum>
+  error_condition
+    ::error_condition(ErrorConditionEnum e
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                      , typename thrust::detail::enable_if<is_error_condition_enum<ErrorConditionEnum>::value>::type *
+#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                     )
+{
+  *this = make_error_condition(e);
+} // end error_condition::error_condition()
+
+
+void error_condition
+  ::assign(int val, const error_category &cat)
+{
+  m_val = val;
+  m_cat = &cat;
+} // end error_category::assign()
+
+
+template<typename ErrorConditionEnum>
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+  typename thrust::detail::enable_if<is_error_condition_enum<ErrorConditionEnum>::value, error_condition>::type &
+#else
+  error_condition &
+#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+    error_condition
+      ::operator=(ErrorConditionEnum e)
+{
+  *this = make_error_condition(e);
+  return *this;
+} // end error_condition::operator=()
+
+
+void error_condition
+  ::clear(void)
+{
+  m_val = 0;
+  m_cat = &generic_category();
+} // end error_condition::clear()
+
+
+int error_condition
+  ::value(void) const
+{
+  return m_val;
+} // end error_condition::value()
+
+
+const error_category &error_condition
+  ::category(void) const
+{
+  return *m_cat;
+} // end error_condition::category()
+
+
+std::string error_condition
+  ::message(void) const
+{
+  return category().message(value());
+} // end error_condition::message()
+
+
+error_condition
+  ::operator bool (void) const
+{
+  return value() != 0;
+} // end error_condition::operator bool ()
+
+
+error_condition make_error_condition(errc::errc_t e)
+{
+  return error_condition(static_cast<int>(e), generic_category());
+} // end make_error_condition()
+
+
+bool operator<(const error_condition &lhs,
+               const error_condition &rhs)
+{
+  return lhs.category().operator<(rhs.category()) || (lhs.category().operator==(rhs.category()) && (lhs.value() < rhs.value()));
+} // end operator<()
+
+
+} // end system
+
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/adjacent_difference.h b/thrust/thrust/system/detail/generic/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e4caaa88b904788d3a7e026bf487c01f74348e2
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/adjacent_difference.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file adjacent_difference.h
+ *  \brief Generic implementation of adjacent_difference.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result);
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__
+OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/adjacent_difference.inl>
+
diff --git a/thrust/thrust/system/detail/generic/adjacent_difference.inl b/thrust/thrust/system/detail/generic/adjacent_difference.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ad4ad1cd4c234ba10e42bc64449b8942646a425c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/adjacent_difference.inl
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/adjacent_difference.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/transform.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
+  thrust::minus<InputType> binary_op;
+
+  return thrust::adjacent_difference(exec, first, last, result, binary_op);
+} // end adjacent_difference()
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__
+OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
+
+  if(first == last)
+  {
+    // empty range, nothing to do
+    return result; 
+  }
+  else 
+  {
+    // an in-place operation is requested, copy the input and call the entry point
+    // XXX a special-purpose kernel would be faster here since
+    // only block boundaries need to be copied
+    thrust::detail::temporary_array<InputType, DerivedPolicy> input_copy(exec, first, last);
+    
+    *result = *first;
+    thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op); 
+  }
+
+  return result + (last - first);
+}
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/advance.h b/thrust/thrust/system/detail/generic/advance.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9cab587b374b9349ee7bfff8128a42462ad17ab
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/advance.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/advance.inl>
+
diff --git a/thrust/thrust/system/detail/generic/advance.inl b/thrust/thrust/system/detail/generic/advance.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ae98d596bce8073812b4798c1d33dec696a7b8ee
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/advance.inl
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/advance.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+__thrust_exec_check_disable__
+template<typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n, thrust::incrementable_traversal_tag)
+{
+  while(n)
+  {
+    ++i;
+    --n;
+  } // end while
+} // end advance()
+
+__thrust_exec_check_disable__
+template<typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n, thrust::random_access_traversal_tag)
+{
+  i += n;
+} // end advance()
+
+} // end detail
+
+template<typename InputIterator, typename Distance>
+__host__ __device__
+void advance(InputIterator& i, Distance n)
+{
+  // dispatch on iterator traversal
+  thrust::system::detail::generic::detail::advance(i, n,
+    typename thrust::iterator_traversal<InputIterator>::type());
+} // end advance()
+
+} // end namespace detail
+} // end namespace generic
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/binary_search.h b/thrust/thrust/system/detail/generic/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cd85c63f30b2484d7d9c2111d6e51f957a8a282
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/binary_search.h
@@ -0,0 +1,174 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file binary_search.h
+ *  \brief Generic implementations of binary search functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec, 
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value);
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value);
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec, 
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator begin,
+                   ForwardIterator end,
+                   const T& value);
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator begin,
+                   ForwardIterator end,
+                   const T& value, 
+                   StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output,
+                           StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output,
+                           StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                             ForwardIterator begin, 
+                             ForwardIterator end,
+                             InputIterator values_begin, 
+                             InputIterator values_end,
+                             OutputIterator output);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                             ForwardIterator begin, 
+                             ForwardIterator end,
+                             InputIterator values_begin, 
+                             InputIterator values_end,
+                             OutputIterator output,
+                             StrictWeakOrdering comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator>
+equal_range(thrust::execution_policy<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable &value);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable, typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator>
+equal_range(thrust::execution_policy<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable &value,
+            StrictWeakOrdering comp);
+
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/binary_search.inl>
+
diff --git a/thrust/thrust/system/detail/generic/binary_search.inl b/thrust/thrust/system/detail/generic/binary_search.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b7c72f1cb3b418a89268411d9c49a116b27faa84
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/binary_search.inl
@@ -0,0 +1,402 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file binary_search.inl
+ *  \brief Inline file for binary_search.h
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/distance.h>
+#include <thrust/binary_search.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/binary_search.h>
+
+#include <thrust/for_each.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/generic/scalar/binary_search.h>
+#include <thrust/system/detail/generic/select_system.h>
+
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// XXX WAR circular #inclusion with this forward declaration
+template<typename,typename> class temporary_array;
+
+
+} // end detail
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+// short names to avoid nvcc bug
+struct lbf
+{
+  template<typename RandomAccessIterator, typename T, typename StrictWeakOrdering>
+  __host__ __device__
+  typename thrust::iterator_traits<RandomAccessIterator>::difference_type
+    operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp)
+  {
+    return thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp) - begin;
+  }
+};
+
+
+struct ubf
+{
+  template<typename RandomAccessIterator, typename T, typename StrictWeakOrdering>
+  __host__ __device__
+  typename thrust::iterator_traits<RandomAccessIterator>::difference_type
+    operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp)
+  {
+    return thrust::system::detail::generic::scalar::upper_bound(begin, end, value, comp) - begin;
+  }
+};
+
+
+struct bsf
+{
+  template<typename RandomAccessIterator, typename T, typename StrictWeakOrdering>
+  __host__ __device__
+  bool operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp)
+  {
+    RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp);
+    
+    thrust::detail::wrapped_function<StrictWeakOrdering,bool> wrapped_comp(comp);
+    
+    return iter != end && !wrapped_comp(value, *iter);
+  }
+};
+
+
+template<typename ForwardIterator, typename StrictWeakOrdering, typename BinarySearchFunction>
+struct binary_search_functor
+{
+  ForwardIterator begin;
+  ForwardIterator end;
+  StrictWeakOrdering comp;
+  BinarySearchFunction func;
+  
+  __host__ __device__
+  binary_search_functor(ForwardIterator begin, ForwardIterator end, StrictWeakOrdering comp, BinarySearchFunction func)
+    : begin(begin), end(end), comp(comp), func(func) {}
+  
+  template<typename Tuple>
+  __host__ __device__
+  void operator()(Tuple t)
+  {
+    thrust::get<1>(t) = func(begin, end, thrust::get<0>(t), comp);
+  }
+}; // binary_search_functor
+
+
+// Vector Implementation
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering, typename BinarySearchFunction>
+__host__ __device__
+OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                             ForwardIterator begin, 
+                             ForwardIterator end,
+                             InputIterator values_begin, 
+                             InputIterator values_end,
+                             OutputIterator output,
+                             StrictWeakOrdering comp,
+                             BinarySearchFunction func)
+{
+  thrust::for_each(exec,
+                   thrust::make_zip_iterator(thrust::make_tuple(values_begin, output)),
+                   thrust::make_zip_iterator(thrust::make_tuple(values_end, output + thrust::distance(values_begin, values_end))),
+                   detail::binary_search_functor<ForwardIterator, StrictWeakOrdering, BinarySearchFunction>(begin, end, comp, func));
+  
+  return output + thrust::distance(values_begin, values_end);
+}
+
+   
+
+// Scalar Implementation
+template<typename OutputType, typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering, typename BinarySearchFunction>
+__host__ __device__
+OutputType binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator begin,
+                         ForwardIterator end,
+                         const T& value, 
+                         StrictWeakOrdering comp,
+                         BinarySearchFunction func)
+{
+  // use the vectorized path to implement the scalar version
+
+  // allocate device buffers for value and output
+  thrust::detail::temporary_array<T,DerivedPolicy>          d_value(exec,1);
+  thrust::detail::temporary_array<OutputType,DerivedPolicy> d_output(exec,1);
+
+  { // copy value to device
+    typedef typename thrust::iterator_system<const T*>::type value_in_system_t;
+    value_in_system_t value_in_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(value_in_system)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(exec))),
+                   &value, 1, d_value.begin());
+  }
+
+  // perform the query
+  thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func);
+
+  OutputType output;
+  { // copy result to host and return
+    typedef typename thrust::iterator_system<OutputType*>::type result_out_system_t;
+    result_out_system_t result_out_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(result_out_system))),
+                   d_output.begin(), 1, &output);
+  }
+
+  return output;
+}
+
+
+// this functor differs from thrust::less<T>
+// because it allows the types of lhs & rhs to differ
+// which is required by the binary search functions
+// XXX use C++14 thrust::less<> when it's ready
+struct binary_search_less
+{
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(const T1& lhs, const T2& rhs) const
+  {
+    return lhs < rhs;
+  }
+};
+
+   
+} // end namespace detail
+
+
+//////////////////////
+// Scalar Functions //
+//////////////////////
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value)
+{
+  namespace p = thrust::placeholders;
+  return thrust::lower_bound(exec, begin, end, value, detail::binary_search_less());
+}
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
+  
+  return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::lbf());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value)
+{
+  namespace p = thrust::placeholders;
+  return thrust::upper_bound(exec, begin, end, value, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
+  
+  return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::ubf());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator begin,
+                   ForwardIterator end,
+                   const T& value)
+{
+  return thrust::binary_search(exec, begin, end, value, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator begin,
+                   ForwardIterator end,
+                   const T& value, 
+                   StrictWeakOrdering comp)
+{
+  return detail::binary_search<bool>(exec, begin, end, value, comp, detail::bsf());
+}
+
+
+//////////////////////
+// Vector Functions //
+//////////////////////
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output)
+{
+  namespace p = thrust::placeholders;
+  return thrust::lower_bound(exec, begin, end, values_begin, values_end, output, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+  return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::lbf());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output)
+{
+  namespace p = thrust::placeholders;
+  return thrust::upper_bound(exec, begin, end, values_begin, values_end, output, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
+                           ForwardIterator begin, 
+                           ForwardIterator end,
+                           InputIterator values_begin, 
+                           InputIterator values_end,
+                           OutputIterator output,
+                           StrictWeakOrdering comp)
+{
+  return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::ubf());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
+__host__ __device__
+OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                             ForwardIterator begin, 
+                             ForwardIterator end,
+                             InputIterator values_begin, 
+                             InputIterator values_end,
+                             OutputIterator output)
+{
+  namespace p = thrust::placeholders;
+  return thrust::binary_search(exec, begin, end, values_begin, values_end, output, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
+                             ForwardIterator begin, 
+                             ForwardIterator end,
+                             InputIterator values_begin, 
+                             InputIterator values_end,
+                             OutputIterator output,
+                             StrictWeakOrdering comp)
+{
+  return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::bsf());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator>
+equal_range(thrust::execution_policy<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const LessThanComparable &value)
+{
+  return thrust::equal_range(exec, first, last, value, detail::binary_search_less());
+}
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator>
+equal_range(thrust::execution_policy<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T &value,
+            StrictWeakOrdering comp)
+{
+  ForwardIterator lb = thrust::lower_bound(exec, first, last, value, comp);
+  ForwardIterator ub = thrust::upper_bound(exec, first, last, value, comp);
+  return thrust::make_pair(lb, ub);
+}
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/copy.h b/thrust/thrust/system/detail/generic/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..e22535618efd8c896b8e04ba21b636e4832743ea
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/copy.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(thrust::execution_policy<DerivedPolicy> &exec,
+                      InputIterator  first,
+                      InputIterator  last,
+                      OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator  first,
+                        Size           n,
+                        OutputIterator result);
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/copy.inl>
+
diff --git a/thrust/thrust/system/detail/generic/copy.inl b/thrust/thrust/system/detail/generic/copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9763a06823e5229c6e95693f3287fe95d61bed07
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/copy.inl
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/copy.h>
+#include <thrust/functional.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/transform.h>
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/detail/minimum_system.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(thrust::execution_policy<DerivedPolicy> &exec,
+                      InputIterator                            first,
+                      InputIterator                            last,
+                      OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator>::type T;
+  return thrust::transform(exec, first, last, result, thrust::identity<T>());
+} // end copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator                            first,
+                        Size                                     n,
+                        OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator>::type value_type;
+  typedef thrust::identity<value_type>                         xfrm_type;
+
+  typedef thrust::detail::unary_transform_functor<xfrm_type> functor_type;
+
+  typedef thrust::tuple<InputIterator,OutputIterator> iterator_tuple;
+  typedef thrust::zip_iterator<iterator_tuple>        zip_iter;
+
+  zip_iter zipped = thrust::make_zip_iterator(thrust::make_tuple(first,result));
+
+  return thrust::get<1>(thrust::for_each_n(exec, zipped, n, functor_type(xfrm_type())).get_iterator_tuple());
+} // end copy_n()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/copy_if.h b/thrust/thrust/system/detail/generic/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e3fb73a67e05abf633fdc6ef154df99b671759c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/copy_if.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+   OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator result,
+                          Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/copy_if.inl>
+
diff --git a/thrust/thrust/system/detail/generic/copy_if.inl b/thrust/thrust/system/detail/generic/copy_if.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4bdafe382ddb904631a979109b05e94e82b10d8e
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/copy_if.inl
@@ -0,0 +1,161 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/copy_if.h>
+#include <thrust/detail/copy_if.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/functional.h>
+#include <thrust/distance.h>
+#include <thrust/transform.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/integer_traits.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+#include <limits>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+template<typename IndexType,
+         typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 first,
+                       InputIterator1 last,
+                       InputIterator2 stencil,
+                       OutputIterator result,
+                       Predicate pred)
+{
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(IndexType n = thrust::distance(first, last));
+  
+  // compute {0,1} predicates
+  thrust::detail::temporary_array<IndexType, DerivedPolicy> predicates(exec, n);
+  thrust::transform(exec,
+                    stencil,
+                    stencil + n,
+                    predicates.begin(),
+                    thrust::detail::predicate_to_integral<Predicate,IndexType>(pred));
+  
+  // scan {0,1} predicates
+  thrust::detail::temporary_array<IndexType, DerivedPolicy> scatter_indices(exec, n);
+  thrust::exclusive_scan(exec,
+                         predicates.begin(),
+                         predicates.end(),
+                         scatter_indices.begin(),
+                         static_cast<IndexType>(0),
+                         thrust::plus<IndexType>());
+  
+  // scatter the true elements
+  thrust::scatter_if(exec,
+                     first,
+                     last,
+                     scatter_indices.begin(),
+                     predicates.begin(),
+                     result,
+                     thrust::identity<IndexType>());
+  
+  // find the end of the new sequence
+  IndexType output_size = scatter_indices[n - 1] + predicates[n - 1];
+  
+  return result + output_size;
+}
+
+
+} // end namespace detail
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  // XXX it's potentially expensive to send [first,last) twice
+  //     we should probably specialize this case for POD
+  //     since we can safely keep the input in a temporary instead
+  //     of doing two loads
+  return thrust::copy_if(exec, first, last, first, result, pred);
+} // end copy_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+   OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator result,
+                          Predicate pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::difference_type difference_type;
+  
+  // empty sequence
+  if(first == last)
+    return result;
+  
+  difference_type n = thrust::distance(first, last);
+  
+  // create an unsigned version of n (we know n is positive from the comparison above)
+  // to avoid a warning in the compare below
+  typename thrust::detail::make_unsigned<difference_type>::type unsigned_n(n);
+  
+  // use 32-bit indices when possible (almost always)
+  if(sizeof(difference_type) > sizeof(unsigned int) && unsigned_n > thrust::detail::integer_traits<unsigned int>::const_max)
+  {
+    result = detail::copy_if<difference_type>(exec, first, last, stencil, result, pred);
+  } // end if
+  else
+  {
+    result = detail::copy_if<unsigned int>(exec, first, last, stencil, result, pred);
+  } // end else
+
+  return result;
+} // end copy_if()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/count.h b/thrust/thrust/system/detail/generic/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..218369e386e18219906a043171b4a99c489a643a
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/count.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template <typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
+__host__ __device__
+typename thrust::iterator_traits<InputIterator>::difference_type
+count(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value);
+
+
+template <typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+typename thrust::iterator_traits<InputIterator>::difference_type
+count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/count.inl>
+
diff --git a/thrust/thrust/system/detail/generic/count.inl b/thrust/thrust/system/detail/generic/count.inl
new file mode 100644
index 0000000000000000000000000000000000000000..f12f0122e8c3b01b06f0f88cf4006918f87d5609
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/count.inl
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/count.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/detail/internal_functional.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template <typename InputType, typename Predicate, typename CountType>
+struct count_if_transform
+{
+  __host__ __device__ 
+  count_if_transform(Predicate _pred) : pred(_pred){}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  CountType operator()(const InputType& val)
+  {
+    if(pred(val))
+      return 1;
+    else
+      return 0;
+  } // end operator()
+
+  Predicate pred;
+}; // end count_if_transform
+
+
+template <typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
+__host__ __device__
+typename thrust::iterator_traits<InputIterator>::difference_type
+count(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value)
+{
+  using thrust::placeholders::_1;
+
+  return thrust::count_if(exec, first, last, _1 == value);
+} // end count()
+
+
+template <typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+typename thrust::iterator_traits<InputIterator>::difference_type
+count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
+  typedef typename thrust::iterator_traits<InputIterator>::difference_type CountType;
+  
+  thrust::system::detail::generic::count_if_transform<InputType, Predicate, CountType> unary_op(pred);
+  thrust::plus<CountType> binary_op;
+  return thrust::transform_reduce(exec, first, last, unary_op, CountType(0), binary_op);
+} // end count_if()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/distance.h b/thrust/thrust/system/detail/generic/distance.h
new file mode 100644
index 0000000000000000000000000000000000000000..03b0fb5564a8efd630e78dac14ce52794281d603
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/distance.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/distance.inl>
+
diff --git a/thrust/thrust/system/detail/generic/distance.inl b/thrust/thrust/system/detail/generic/distance.inl
new file mode 100644
index 0000000000000000000000000000000000000000..930d0844cbcc2a7a91c908a1375af0b413a87e7c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/distance.inl
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag)
+{
+  typename thrust::iterator_traits<InputIterator>::difference_type result(0);
+
+  while(first != last)
+  {
+    ++first;
+    ++result;
+  } // end while
+
+  return result;
+} // end advance()
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last, thrust::random_access_traversal_tag)
+{
+  return last - first;
+} // end distance()
+
+
+} // end detail
+
+__thrust_exec_check_disable__
+template<typename InputIterator>
+inline __host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    distance(InputIterator first, InputIterator last)
+{
+  // dispatch on iterator traversal
+  return thrust::system::detail::generic::detail::distance(first, last,
+    typename thrust::iterator_traversal<InputIterator>::type());
+} // end advance()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/equal.h b/thrust/thrust/system/detail/generic/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..8962b1bd1428a3c845924a9b7a7d2ef3b2147322
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/equal.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/equal.inl>
+
diff --git a/thrust/thrust/system/detail/generic/equal.inl b/thrust/thrust/system/detail/generic/equal.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7c9dec4bc45cb619b83f0bc2859823699730927d
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/equal.inl
@@ -0,0 +1,58 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/equal.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/mismatch.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
+  
+  return thrust::equal(exec, first1, last1, first2, thrust::detail::equal_to<InputType1>());
+}
+
+
+// the == below could be a __host__ function in the case of std::vector::iterator::operator==
+// we make this exception for equal and use __thrust_exec_check_disable__ because it is used in vector's implementation
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred)
+{
+  return thrust::mismatch(exec, first1, last1, first2, binary_pred).first == last1;
+}
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/extrema.h b/thrust/thrust/system/detail/generic/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3ee8188971687249b7052ef4f062f5adf972768
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/extrema.h
@@ -0,0 +1,89 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file extrema.h
+ *  \brief Generic device implementations of extrema functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator max_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator max_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            BinaryPredicate comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator min_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator min_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            BinaryPredicate comp);
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last);
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/extrema.inl>
+
diff --git a/thrust/thrust/system/detail/generic/extrema.inl b/thrust/thrust/system/detail/generic/extrema.inl
new file mode 100644
index 0000000000000000000000000000000000000000..22183db9a3c450637a647031bc34b0cbd472db8a
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/extrema.inl
@@ -0,0 +1,263 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file distance.h
+ *  \brief Device implementations for distance.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/get_iterator_value.h>
+#include <thrust/extrema.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/reduce.h>
+#include <thrust/transform_reduce.h>
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+//////////////
+// Functors //
+//////////////
+//
+
+// return the smaller/larger element making sure to prefer the 
+// first occurance of the minimum/maximum element
+template <typename InputType, typename IndexType, typename BinaryPredicate>
+struct min_element_reduction
+{
+  BinaryPredicate comp;
+
+  __host__ __device__ 
+  min_element_reduction(BinaryPredicate comp) : comp(comp){}
+
+  __host__ __device__ 
+  thrust::tuple<InputType, IndexType>
+  operator()(const thrust::tuple<InputType, IndexType>& lhs, 
+             const thrust::tuple<InputType, IndexType>& rhs )
+  {
+    if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs)))
+      return lhs;
+    if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs)))
+      return rhs;
+
+    // values are equivalent, prefer value with smaller index
+    if(thrust::get<1>(lhs) < thrust::get<1>(rhs))
+      return lhs;
+    else
+      return rhs;
+  } // end operator()()
+}; // end min_element_reduction
+
+
+template <typename InputType, typename IndexType, typename BinaryPredicate>
+struct max_element_reduction
+{
+  BinaryPredicate comp;
+
+  __host__ __device__ 
+  max_element_reduction(BinaryPredicate comp) : comp(comp){}
+
+  __host__ __device__ 
+  thrust::tuple<InputType, IndexType>
+  operator()(const thrust::tuple<InputType, IndexType>& lhs, 
+             const thrust::tuple<InputType, IndexType>& rhs )
+  {
+    if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs)))
+      return rhs;
+    if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs)))
+      return lhs;
+
+    // values are equivalent, prefer value with smaller index
+    if(thrust::get<1>(lhs) < thrust::get<1>(rhs))
+      return lhs;
+    else
+      return rhs;
+  } // end operator()()
+}; // end max_element_reduction
+
+
+// return the smaller & larger element making sure to prefer the 
+// first occurance of the minimum/maximum element
+template <typename InputType, typename IndexType, typename BinaryPredicate>
+struct minmax_element_reduction
+{
+  BinaryPredicate comp;
+
+  __host__ __device__
+  minmax_element_reduction(BinaryPredicate comp) : comp(comp){}
+
+  __host__ __device__ 
+  thrust::tuple< thrust::tuple<InputType,IndexType>, thrust::tuple<InputType,IndexType> >
+  operator()(const thrust::tuple< thrust::tuple<InputType,IndexType>, thrust::tuple<InputType,IndexType> >& lhs, 
+             const thrust::tuple< thrust::tuple<InputType,IndexType>, thrust::tuple<InputType,IndexType> >& rhs )
+  {
+
+    return thrust::make_tuple(min_element_reduction<InputType, IndexType, BinaryPredicate>(comp)(thrust::get<0>(lhs), thrust::get<0>(rhs)),
+                              max_element_reduction<InputType, IndexType, BinaryPredicate>(comp)(thrust::get<1>(lhs), thrust::get<1>(rhs)));
+  } // end operator()()
+}; // end minmax_element_reduction
+
+
+template <typename InputType, typename IndexType>
+struct duplicate_tuple
+{
+  __host__ __device__ 
+  thrust::tuple< thrust::tuple<InputType,IndexType>, thrust::tuple<InputType,IndexType> >
+  operator()(const thrust::tuple<InputType,IndexType>& t)
+  {
+    return thrust::make_tuple(t, t);
+  }
+}; // end duplicate_tuple
+
+
+} // end namespace detail
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator min_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type value_type;
+
+  return thrust::min_element(exec, first, last, thrust::less<value_type>());
+} // end min_element()
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator min_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  if (first == last)
+    return last;
+
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type      InputType;
+  typedef typename thrust::iterator_traits<ForwardIterator>::difference_type IndexType;
+
+  thrust::tuple<InputType, IndexType> result =
+    thrust::reduce
+      (exec,
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
+       thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec), first), 0),
+       detail::min_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
+
+  return first + thrust::get<1>(result);
+} // end min_element()
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+ForwardIterator max_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type value_type;
+
+  return thrust::max_element(exec, first, last, thrust::less<value_type>());
+} // end max_element()
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+ForwardIterator max_element(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  if (first == last)
+    return last;
+
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type      InputType;
+  typedef typename thrust::iterator_traits<ForwardIterator>::difference_type IndexType;
+
+  thrust::tuple<InputType, IndexType> result =
+    thrust::reduce
+      (exec,
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
+       thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec),first), 0),
+       detail::max_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
+
+  return first + thrust::get<1>(result);
+} // end max_element()
+
+
+template <typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type value_type;
+
+  return thrust::minmax_element(exec, first, last, thrust::less<value_type>());
+} // end minmax_element()
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp)
+{
+  if (first == last)
+    return thrust::make_pair(last, last);
+
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type      InputType;
+  typedef typename thrust::iterator_traits<ForwardIterator>::difference_type IndexType;
+
+  thrust::tuple< thrust::tuple<InputType,IndexType>, thrust::tuple<InputType,IndexType> > result = 
+    thrust::transform_reduce
+      (exec,
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
+       thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
+       detail::duplicate_tuple<InputType, IndexType>(),
+       detail::duplicate_tuple<InputType, IndexType>()(
+         thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec),first), 0)),
+       detail::minmax_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
+
+  return thrust::make_pair(first + thrust::get<1>(thrust::get<0>(result)), first + thrust::get<1>(thrust::get<1>(result)));
+} // end minmax_element()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/fill.h b/thrust/thrust/system/detail/generic/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c4f2ed4e76920bc632e342558b5dcc24c103cf3
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/fill.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/internal_functional.h>
+#include <thrust/generate.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename OutputIterator, typename Size, typename T>
+__host__ __device__
+  OutputIterator fill_n(thrust::execution_policy<DerivedPolicy> &exec,
+                        OutputIterator first,
+                        Size n,
+                        const T &value)
+{
+  // XXX consider using the placeholder expression _1 = value
+  return thrust::generate_n(exec, first, n, thrust::detail::fill_functor<T>(value));
+}
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void fill(thrust::execution_policy<DerivedPolicy> &exec,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T &value)
+{
+  // XXX consider using the placeholder expression _1 = value
+  thrust::generate(exec, first, last, thrust::detail::fill_functor<T>(value));
+}
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/find.h b/thrust/thrust/system/detail/generic/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..00e11e53c61d8916d51d044eba11f34092cf597c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/find.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+InputIterator find(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   const T& value);
+
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/find.inl>
+
diff --git a/thrust/thrust/system/detail/generic/find.inl b/thrust/thrust/system/detail/generic/find.inl
new file mode 100644
index 0000000000000000000000000000000000000000..a7126825d99fcc206238ed493787e42550eb4b42
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/find.inl
@@ -0,0 +1,150 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/find.h>
+#include <thrust/reduce.h>
+
+#include <thrust/tuple.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/internal_functional.h>
+
+
+// Contributed by Erich Elsen
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+InputIterator find(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   const T& value)
+{
+  using thrust::placeholders::_1;
+
+  return thrust::find_if(exec, first, last, _1 == value);
+} // end find()
+
+
+template<typename TupleType>
+struct find_if_functor
+{
+  __host__ __device__
+  TupleType operator()(const TupleType& lhs, const TupleType& rhs) const
+  {
+    // select the smallest index among true results
+    if(thrust::get<0>(lhs) && thrust::get<0>(rhs))
+    {
+      return TupleType(true, (thrust::min)(thrust::get<1>(lhs), thrust::get<1>(rhs)));
+    }
+    else if(thrust::get<0>(lhs))
+    {
+      return lhs;
+    }
+    else
+    {
+      return rhs;
+    }
+  }
+};
+    
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::difference_type difference_type;
+  typedef typename thrust::tuple<bool,difference_type> result_type;
+  
+  // empty sequence
+  if(first == last) return last;
+  
+  const difference_type n = thrust::distance(first, last);
+  
+  // this implementation breaks up the sequence into separate intervals
+  // in an attempt to early-out as soon as a value is found
+  
+  // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32
+  const difference_type interval_threshold = 1 << 20;
+  const difference_type interval_size = (thrust::min)(interval_threshold, n);
+  
+  // force transform_iterator output to bool
+  typedef thrust::transform_iterator<Predicate, InputIterator, bool> XfrmIterator;
+  typedef thrust::tuple<XfrmIterator, thrust::counting_iterator<difference_type> > IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+  
+  IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, pred),
+                                                thrust::counting_iterator<difference_type>(0));
+  
+  ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
+  ZipIterator end   = begin + n;
+  
+  for(ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
+  {
+    ZipIterator interval_end = interval_begin + interval_size;
+    if(end < interval_end)
+    {
+      interval_end = end;
+    } // end if
+    
+    result_type result = thrust::reduce(exec,
+                                        interval_begin, interval_end,
+                                        result_type(false,interval_end - begin),
+                                        find_if_functor<result_type>());
+    
+    // see if we found something
+    if(thrust::get<0>(result))
+    {
+      return first + thrust::get<1>(result);
+    }
+  }
+  
+  //nothing was found if we reach here...
+  return first + n;
+}
+
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          Predicate pred)
+{
+  return thrust::find_if(exec, first, last, thrust::detail::not1(pred));
+} // end find()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/for_each.h b/thrust/thrust/system/detail/generic/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4add43052d3be2808ceae4bf64514bfc41e6d67
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/for_each.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file for_each.h
+ *  \brief Generic implementation of for_each & for_each_n.
+ *         It is an error to call these functions; they have no implementation.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/detail/static_assert.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each(thrust::execution_policy<DerivedPolicy> &,
+                       InputIterator first,
+                       InputIterator ,
+                       UnaryFunction )
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
+  return first;
+} // end for_each()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each_n(thrust::execution_policy<DerivedPolicy> &,
+                         InputIterator first,
+                         Size ,
+                         UnaryFunction )
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
+  return first;
+} // end for_each_n()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/gather.h b/thrust/thrust/system/detail/generic/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..d587572f0b46ae1cb3dfb7b4ca19dc54d5f60b32
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/gather.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator                            map_first,
+                        InputIterator                            map_last,
+                        RandomAccessIterator                     input_first,
+                        OutputIterator                           result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather_if(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1                           map_first,
+                           InputIterator1                           map_last,
+                           InputIterator2                           stencil,
+                           RandomAccessIterator                     input_first,
+                           OutputIterator                           result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator gather_if(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1                           map_first,
+                           InputIterator1                           map_last,
+                           InputIterator2                           stencil,
+                           RandomAccessIterator                     input_first,
+                           OutputIterator                           result,
+                           Predicate                                pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/gather.inl>
+
diff --git a/thrust/thrust/system/detail/generic/gather.inl b/thrust/thrust/system/detail/generic/gather.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4f4289ecb2af7c42b4ffb64850a5b3bfb12029b0
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/gather.inl
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/gather.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator                            map_first,
+                        InputIterator                            map_last,
+                        RandomAccessIterator                     input_first,
+                        OutputIterator                           result)
+{
+  return thrust::transform(exec,
+                           thrust::make_permutation_iterator(input_first, map_first),
+                           thrust::make_permutation_iterator(input_first, map_last),
+                           result,
+                           thrust::identity<typename thrust::iterator_value<RandomAccessIterator>::type>());
+} // end gather()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator gather_if(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1                           map_first,
+                           InputIterator1                           map_last,
+                           InputIterator2                           stencil,
+                           RandomAccessIterator                     input_first,
+                           OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator2>::type StencilType;
+  return thrust::gather_if(exec,
+                           map_first,
+                           map_last,
+                           stencil,
+                           input_first,
+                           result,
+                           thrust::identity<StencilType>());
+} // end gather_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator gather_if(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1                           map_first,
+                           InputIterator1                           map_last,
+                           InputIterator2                           stencil,
+                           RandomAccessIterator                     input_first,
+                           OutputIterator                           result,
+                           Predicate                                pred)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type InputType;
+  return thrust::transform_if(exec,
+                              thrust::make_permutation_iterator(input_first, map_first),
+                              thrust::make_permutation_iterator(input_first, map_last),
+                              stencil,
+                              result,
+                              thrust::identity<InputType>(),
+                              pred);
+} // end gather_if()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/generate.h b/thrust/thrust/system/detail/generic/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..edc2cc5eb3582a11ab7afa0cd78030b2b26688f2
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/generate.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Generator>
+__host__ __device__
+  void generate(thrust::execution_policy<ExecutionPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                Generator gen);
+
+template<typename ExecutionPolicy,
+         typename OutputIterator,
+         typename Size,
+         typename Generator>
+__host__ __device__
+  OutputIterator generate_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                            OutputIterator first,
+                            Size n,
+                            Generator gen);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/generate.inl>
+
diff --git a/thrust/thrust/system/detail/generic/generate.inl b/thrust/thrust/system/detail/generic/generate.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9ca319b99992f221dec6273eb4a056568a680f15
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/generate.inl
@@ -0,0 +1,99 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/generate.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/for_each.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Generator>
+__host__ __device__
+  void generate(thrust::execution_policy<ExecutionPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                Generator gen)
+{
+  // this static assert is necessary due to a workaround in generate_functor
+  // it takes a const reference to accept temporaries from proxy iterators
+  // and then const_casts the constness away
+  //
+  // this had the weird side effect of allowing generate (and fill, and whatever
+  // else is implemented in terms of generate) to fill through const iterators.
+  // this might become unnecessary once Thrust is C++11-and-above only, since the
+  // other solution is to take an rvalue reference in a second overload of
+  // operator() of the function object, but until we support pre-11, this is a
+  // nice solution that validates the const_cast and doesn't take away any
+  // functionality.
+  THRUST_STATIC_ASSERT_MSG(
+    !thrust::detail::is_const<
+      typename thrust::detail::remove_reference<
+        typename thrust::iterator_traits<ForwardIterator>::reference
+      >::type
+    >::value
+  , "generating to `const` iterators is not allowed"
+  );
+  thrust::for_each(exec, first, last, typename thrust::detail::generate_functor<ExecutionPolicy,Generator>::type(gen));
+} // end generate()
+
+template<typename ExecutionPolicy,
+         typename OutputIterator,
+         typename Size,
+         typename Generator>
+__host__ __device__
+  OutputIterator generate_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                            OutputIterator first,
+                            Size n,
+                            Generator gen)
+{
+  // this static assert is necessary due to a workaround in generate_functor
+  // it takes a const reference to accept temporaries from proxy iterators
+  // and then const_casts the constness away
+  //
+  // this had the weird side effect of allowing generate (and fill, and whatever
+  // else is implemented in terms of generate) to fill through const iterators.
+  // this might become unnecessary once Thrust is C++11-and-above only, since the
+  // other solution is to take an rvalue reference in a second overload of
+  // operator() of the function object, but until we support pre-11, this is a
+  // nice solution that validates the const_cast and doesn't take away any
+  // functionality.
+  THRUST_STATIC_ASSERT_MSG(
+    !thrust::detail::is_const<
+      typename thrust::detail::remove_reference<
+        typename thrust::iterator_traits<OutputIterator>::reference
+      >::type
+    >::value
+  , "generating to `const` iterators is not allowed"
+  );
+  return thrust::for_each_n(exec, first, n, typename thrust::detail::generate_functor<ExecutionPolicy,Generator>::type(gen));
+} // end generate()
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/inner_product.h b/thrust/thrust/system/detail/generic/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..71e1a92705b0570734aa544899e2fab7a681bb37
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/inner_product.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputType>
+__host__ __device__
+  OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputType init);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputType, typename BinaryFunction1, typename BinaryFunction2>
+__host__ __device__
+OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init, 
+                         BinaryFunction1 binary_op1,
+                         BinaryFunction2 binary_op2);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/inner_product.inl>
+
diff --git a/thrust/thrust/system/detail/generic/inner_product.inl b/thrust/thrust/system/detail/generic/inner_product.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0a50386be88e6b051206000b4bcbf2d04689055d
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/inner_product.inl
@@ -0,0 +1,72 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/inner_product.h>
+#include <thrust/functional.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/transform_reduce.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputType>
+__host__ __device__
+OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init)
+{
+  thrust::plus<OutputType>       binary_op1;
+  thrust::multiplies<OutputType> binary_op2;
+  return thrust::inner_product(exec, first1, last1, first2, init, binary_op1, binary_op2);
+} // end inner_product()
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputType, typename BinaryFunction1, typename BinaryFunction2>
+__host__ __device__
+OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator1 first1,
+                         InputIterator1 last1,
+                         InputIterator2 first2,
+                         OutputType init, 
+                         BinaryFunction1 binary_op1,
+                         BinaryFunction2 binary_op2)
+{
+  typedef thrust::zip_iterator<thrust::tuple<InputIterator1,InputIterator2> > ZipIter;
+
+  ZipIter first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2));
+
+  // only the first iterator in the tuple is relevant for the purposes of last
+  ZipIter last  = thrust::make_zip_iterator(thrust::make_tuple(last1, first2));
+
+  return thrust::transform_reduce(exec, first, last, thrust::detail::zipped_binary_op<OutputType,BinaryFunction2>(binary_op2), init, binary_op1);
+} // end inner_product()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/logical.h b/thrust/thrust/system/detail/generic/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..702dbad852d9e074147368a87b28a082fcfa8242
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/logical.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/find.h>
+#include <thrust/logical.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool all_of(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return thrust::find_if(exec, first, last, thrust::detail::not1(pred)) == last;
+}
+
+
+template<typename ExecutionPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool any_of(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return thrust::find_if(exec, first, last, pred) != last;
+}
+
+
+template<typename ExecutionPolicy, typename InputIterator, typename Predicate>
+__host__ __device__
+bool none_of(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return !thrust::any_of(exec, first, last, pred);
+}
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/memory.h b/thrust/thrust/system/detail/generic/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..344b3673d11023557e5d2c483146624aac402cde
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/memory.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file generic/memory.h
+ *  \brief Generic implementation of memory functions.
+ *         Calling some of these is an error. They have no implementation.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy, typename Size>
+__host__ __device__
+void malloc(thrust::execution_policy<DerivedPolicy> &, Size);
+
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+thrust::pointer<T,DerivedPolicy> malloc(thrust::execution_policy<DerivedPolicy> &s, std::size_t n);
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void free(thrust::execution_policy<DerivedPolicy> &, Pointer);
+
+template<typename Pointer1, typename Pointer2>
+__host__ __device__
+void assign_value(tag, Pointer1, Pointer2);
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void get_value(thrust::execution_policy<DerivedPolicy> &, Pointer);
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+__host__ __device__
+void iter_swap(thrust::execution_policy<DerivedPolicy>&, Pointer1, Pointer2);
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/memory.inl>
+
diff --git a/thrust/thrust/system/detail/generic/memory.inl b/thrust/thrust/system/detail/generic/memory.inl
new file mode 100644
index 0000000000000000000000000000000000000000..eadf39ae96f4f8c49151b07db80b855944c9fbd4
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/memory.inl
@@ -0,0 +1,104 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/system/detail/generic/memory.h>
+#include <thrust/system/detail/adl/malloc_and_free.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/malloc_and_free.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename Size>
+__host__ __device__
+  void malloc(thrust::execution_policy<DerivedPolicy> &, Size)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Size, false>::value)
+  , "unimplemented for this system"
+  );
+}
+
+
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+  thrust::pointer<T,DerivedPolicy>
+    malloc(thrust::execution_policy<DerivedPolicy> &exec, std::size_t n)
+{
+  thrust::pointer<void,DerivedPolicy> void_ptr = thrust::malloc(exec, sizeof(T) * n);
+
+  return pointer<T,DerivedPolicy>(static_cast<T*>(void_ptr.get()));
+} // end malloc()
+
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void free(thrust::execution_policy<DerivedPolicy> &, Pointer)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer, false>::value)
+  , "unimplemented for this system"
+  );
+}
+
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+__host__ __device__
+void assign_value(thrust::execution_policy<DerivedPolicy> &, Pointer1, Pointer2)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer1, false>::value)
+  , "unimplemented for this system"
+  );
+}
+
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void get_value(thrust::execution_policy<DerivedPolicy> &, Pointer)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer, false>::value)
+  , "unimplemented for this system"
+  );
+}
+
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+__host__ __device__
+void iter_swap(thrust::execution_policy<DerivedPolicy> &, Pointer1, Pointer2)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer1, false>::value)
+  , "unimplemented for this system"
+  );
+}
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/merge.h b/thrust/thrust/system/detail/generic/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..d80906e3d31faa5f01519ab5c7963fe8762f77bb
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/merge.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+// XXX calling this function is an error; there is no implementation
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result,
+                       StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2, typename Compare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 Compare comp);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/merge.inl>
+
diff --git a/thrust/thrust/system/detail/generic/merge.inl b/thrust/thrust/system/detail/generic/merge.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2938e8c9232c31f029c62ad45af7054ab997ebb7
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/merge.inl
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/system/detail/generic/merge.h>
+#include <thrust/merge.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/internal_functional.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &,
+                       InputIterator1,
+                       InputIterator1,
+                       InputIterator2,
+                       InputIterator2,
+                       OutputIterator result,
+                       StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end merge()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       InputIterator2 last2,
+                       OutputIterator result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::merge(exec,first1,last1,first2,last2,result,thrust::less<value_type>());
+} // end merge()
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2, typename Compare>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result,
+                 Compare comp)
+{
+  typedef thrust::tuple<InputIterator1, InputIterator3>   iterator_tuple1;
+  typedef thrust::tuple<InputIterator2, InputIterator4>   iterator_tuple2;
+  typedef thrust::tuple<OutputIterator1, OutputIterator2> iterator_tuple3;
+
+  typedef thrust::zip_iterator<iterator_tuple1> zip_iterator1;
+  typedef thrust::zip_iterator<iterator_tuple2> zip_iterator2;
+  typedef thrust::zip_iterator<iterator_tuple3> zip_iterator3;
+
+  zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1));
+  zip_iterator1 zipped_last1  = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1));
+
+  zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2));
+  zip_iterator2 zipped_last2  = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2));
+
+  zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result));
+
+  thrust::detail::compare_first<Compare> comp_first(comp);
+
+  iterator_tuple3 result = thrust::merge(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple();
+
+  return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result));
+} // end merge_by_key()
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator1, typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    merge_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                 InputIterator1 keys_first1, InputIterator1 keys_last1,
+                 InputIterator2 keys_first2, InputIterator2 keys_last2,
+                 InputIterator3 values_first1, InputIterator4 values_first2,
+                 OutputIterator1 keys_result,
+                 OutputIterator2 values_result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::merge_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less<value_type>());
+} // end merge_by_key()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/mismatch.h b/thrust/thrust/system/detail/generic/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..50e9f678b1ff6a85c2d32e5ab45aed88a1c7224b
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/mismatch.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+  thrust::pair<InputIterator1, InputIterator2>
+    mismatch(thrust::execution_policy<DerivedPolicy> &exec,
+             InputIterator1 first1,
+             InputIterator1 last1,
+             InputIterator2 first2);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<InputIterator1, InputIterator2>
+    mismatch(thrust::execution_policy<DerivedPolicy> &exec,
+             InputIterator1 first1,
+             InputIterator1 last1,
+             InputIterator2 first2,
+             BinaryPredicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/mismatch.inl>
+
diff --git a/thrust/thrust/system/detail/generic/mismatch.inl b/thrust/thrust/system/detail/generic/mismatch.inl
new file mode 100644
index 0000000000000000000000000000000000000000..8348374a5a94297b6552571e60d788e8f6efbda1
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/mismatch.inl
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/mismatch.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/find.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
+__host__ __device__
+  thrust::pair<InputIterator1, InputIterator2>
+    mismatch(thrust::execution_policy<DerivedPolicy> &exec,
+             InputIterator1 first1,
+             InputIterator1 last1,
+             InputIterator2 first2)
+{
+  using namespace thrust::placeholders;
+
+  return thrust::mismatch(exec, first1, last1, first2, _1 == _2);
+} // end mismatch()
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<InputIterator1, InputIterator2>
+    mismatch(thrust::execution_policy<DerivedPolicy> &exec,
+             InputIterator1 first1,
+             InputIterator1 last1,
+             InputIterator2 first2,
+             BinaryPredicate pred)
+{
+  // Contributed by Erich Elsen
+  typedef thrust::tuple<InputIterator1,InputIterator2> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple>          ZipIterator;
+  
+  ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2));
+  ZipIterator zipped_last  = thrust::make_zip_iterator(thrust::make_tuple(last1, first2));
+  
+  ZipIterator result = thrust::find_if_not(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate<BinaryPredicate>(pred));
+  
+  return thrust::make_pair(thrust::get<0>(result.get_iterator_tuple()),
+                           thrust::get<1>(result.get_iterator_tuple()));
+} // end mismatch()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/partition.h b/thrust/thrust/system/detail/generic/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdd158c4cc13ebb81f6fc407276aeada8d1201c5
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/partition.h
@@ -0,0 +1,170 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file partition.h
+ *  \brief Generic implementations of partition functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(thrust::execution_policy<ExecutionPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred);
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(thrust::execution_policy<ExecutionPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(thrust::execution_policy<ExecutionPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(thrust::execution_policy<ExecutionPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition_point(thrust::execution_policy<ExecutionPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  bool is_partitioned(thrust::execution_policy<ExecutionPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/partition.inl>
+
diff --git a/thrust/thrust/system/detail/generic/partition.inl b/thrust/thrust/system/detail/generic/partition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..73a8a286edaa494cebf043d500a43438d3890ca5
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/partition.inl
@@ -0,0 +1,248 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/partition.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/pair.h>
+
+#include <thrust/remove.h>
+#include <thrust/count.h>
+#include <thrust/advance.h>
+#include <thrust/partition.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/temporary_array.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(thrust::execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+
+  // copy input to temp buffer
+  thrust::detail::temporary_array<InputType,DerivedPolicy> temp(exec, first, last);
+
+  // count the size of the true partition
+  typename thrust::iterator_difference<ForwardIterator>::type num_true = thrust::count_if(exec, first,last,pred);
+
+  // point to the beginning of the false partition
+  ForwardIterator out_false = first;
+  thrust::advance(out_false, num_true);
+
+  return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), first, out_false, pred).first;
+} // end stable_partition()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(thrust::execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+
+  // copy input to temp buffer
+  thrust::detail::temporary_array<InputType,DerivedPolicy> temp(exec, first, last);
+
+  // count the size of the true partition
+  InputIterator stencil_last = stencil;
+  thrust::advance(stencil_last, temp.size());
+  typename thrust::iterator_difference<InputIterator>::type num_true = thrust::count_if(exec, stencil, stencil_last, pred);
+
+  // point to the beginning of the false partition
+  ForwardIterator out_false = first;
+  thrust::advance(out_false, num_true);
+
+  return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), stencil, first, out_false, pred).first;
+} // end stable_partition()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  thrust::detail::unary_negate<Predicate> not_pred(pred);
+
+  // remove_copy_if the true partition to out_true
+  OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, out_true, not_pred);
+
+  // remove_copy_if the false partition to out_false
+  OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, out_false, pred);
+
+  return thrust::make_pair(end_of_true_partition, end_of_false_partition);
+} // end stable_partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  thrust::detail::unary_negate<Predicate> not_pred(pred);
+
+  // remove_copy_if the true partition to out_true
+  OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, stencil, out_true, not_pred);
+
+  // remove_copy_if the false partition to out_false
+  OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, stencil, out_false, pred);
+
+  return thrust::make_pair(end_of_true_partition, end_of_false_partition);
+} // end stable_partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  return thrust::stable_partition(exec, first, last, pred);
+} // end partition()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  return thrust::stable_partition(exec, first, last, stencil, pred);
+} // end partition()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator first,
+                   InputIterator last,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  return thrust::stable_partition_copy(exec,first,last,out_true,out_false,pred);
+} // end partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    partition_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator1 first,
+                   InputIterator1 last,
+                   InputIterator2 stencil,
+                   OutputIterator1 out_true,
+                   OutputIterator2 out_false,
+                   Predicate pred)
+{
+  return thrust::stable_partition_copy(exec,first,last,stencil,out_true,out_false,pred);
+} // end partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition_point(thrust::execution_policy<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Predicate pred)
+{
+  return thrust::find_if_not(exec, first, last, pred);
+} // end partition_point()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  bool is_partitioned(thrust::execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  return thrust::is_sorted(exec,
+                           thrust::make_transform_iterator(first, thrust::detail::not1(pred)),
+                           thrust::make_transform_iterator(last,  thrust::detail::not1(pred)));
+} // end is_partitioned()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/per_device_resource.h b/thrust/thrust/system/detail/generic/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..9378940f3a66305b0dcc8834c13e00a49825689b
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/per_device_resource.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/mr/memory_resource.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(thrust::detail::execution_policy_base<DerivedPolicy>&)
+{
+    return mr::get_global_resource<MR>();
+}
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/reduce.h b/thrust/thrust/system/detail/generic/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3e7af0d28bc6a4a0d8f8c893bd7348f1b5e59b6
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reduce.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::value_type
+    reduce(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last);
+
+
+template<typename DerivedPolicy, typename InputIterator, typename T>
+__host__ __device__
+  T reduce(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, T init);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename T,
+         typename BinaryFunction>
+__host__ __device__
+  T reduce(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, T init, BinaryFunction binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/reduce.inl>
+
diff --git a/thrust/thrust/system/detail/generic/reduce.inl b/thrust/thrust/system/detail/generic/reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b866e86dc4947244fbb323bfee2e93305eb223b9
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reduce.inl
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/reduce.h>
+#include <thrust/system/detail/generic/reduce.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/functional.h>
+#include <thrust/detail/static_assert.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy, typename InputIterator>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::value_type
+    reduce(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator first, InputIterator last)
+{
+  typedef typename thrust::iterator_value<InputIterator>::type InputType;
+
+  // use InputType(0) as init by default
+  return thrust::reduce(exec, first, last, InputType(0));
+} // end reduce()
+
+
+template<typename ExecutionPolicy, typename InputIterator, typename T>
+__host__ __device__
+  T reduce(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator first, InputIterator last, T init)
+{
+  // use plus<T> by default
+  return thrust::reduce(exec, first, last, init, thrust::plus<T>());
+} // end reduce()
+
+
+template<typename ExecutionPolicy,
+         typename RandomAccessIterator,
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType reduce(thrust::execution_policy<ExecutionPolicy> &,
+                    RandomAccessIterator,
+                    RandomAccessIterator,
+                    OutputType,
+                    BinaryFunction)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value)
+  , "unimplemented for this system"
+  );
+  return OutputType();
+} // end reduce()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/reduce_by_key.h b/thrust/thrust/system/detail/generic/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..aaa5959a427f8b098085722d3821aa92d180ad97
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reduce_by_key.h
@@ -0,0 +1,89 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/reduce_by_key.inl>
+
diff --git a/thrust/thrust/system/detail/generic/reduce_by_key.inl b/thrust/thrust/system/detail/generic/reduce_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..86640ea9f8447b21caf534d95b56cb3ff5eb2302
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reduce_by_key.inl
@@ -0,0 +1,197 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce_by_key.inl
+ *  \brief Inline file for reduce_by_key.h.
+ */
+
+#pragma once
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/detail/type_traits/function_traits.h>
+#include <thrust/transform.h>
+#include <thrust/scatter.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <limits>
+
+#include <thrust/detail/internal_functional.h>
+#include <thrust/scan.h>
+#include <thrust/detail/temporary_array.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+template <typename ValueType, typename TailFlagType, typename AssociativeOperator>
+struct reduce_by_key_functor
+{
+  AssociativeOperator binary_op;
+  
+  typedef typename thrust::tuple<ValueType, TailFlagType> result_type;
+  
+  __host__ __device__
+  reduce_by_key_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
+  
+  __host__ __device__
+  result_type operator()(result_type a, result_type b)
+  {
+    return result_type(thrust::get<1>(b) ? thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)),
+                       thrust::get<1>(a) | thrust::get<1>(b));
+  }
+};
+
+
+} // end namespace detail
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op)
+{
+    typedef typename thrust::iterator_traits<InputIterator1>::difference_type difference_type;
+
+    typedef unsigned int FlagType;  // TODO use difference_type
+
+    // Use the input iterator's value type per https://wg21.link/P0571
+    using ValueType = typename thrust::iterator_value<InputIterator2>::type;
+
+    if (keys_first == keys_last)
+        return thrust::make_pair(keys_output, values_output);
+
+    // input size
+    difference_type n = keys_last - keys_first;
+
+    InputIterator2 values_last = values_first + n;
+    
+    // compute head flags
+    thrust::detail::temporary_array<FlagType,ExecutionPolicy> head_flags(exec, n);
+    thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, head_flags.begin() + 1, thrust::detail::not2(binary_pred));
+    head_flags[0] = 1;
+
+    // compute tail flags
+    thrust::detail::temporary_array<FlagType,ExecutionPolicy> tail_flags(exec, n); //COPY INSTEAD OF TRANSFORM
+    thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, tail_flags.begin(), thrust::detail::not2(binary_pred));
+    tail_flags[n-1] = 1;
+
+    // scan the values by flag
+    thrust::detail::temporary_array<ValueType,ExecutionPolicy> scanned_values(exec, n);
+    thrust::detail::temporary_array<FlagType,ExecutionPolicy>  scanned_tail_flags(exec, n);
+    
+    thrust::inclusive_scan
+        (exec,
+         thrust::make_zip_iterator(thrust::make_tuple(values_first,           head_flags.begin())),
+         thrust::make_zip_iterator(thrust::make_tuple(values_last,            head_flags.end())),
+         thrust::make_zip_iterator(thrust::make_tuple(scanned_values.begin(), scanned_tail_flags.begin())),
+         detail::reduce_by_key_functor<ValueType, FlagType, BinaryFunction>(binary_op));
+
+    thrust::exclusive_scan(exec, tail_flags.begin(), tail_flags.end(), scanned_tail_flags.begin(), FlagType(0), thrust::plus<FlagType>());
+
+    // number of unique keys
+    FlagType N = scanned_tail_flags[n - 1] + 1;
+    
+    // scatter the keys and accumulated values    
+    thrust::scatter_if(exec, keys_first,            keys_last,             scanned_tail_flags.begin(), head_flags.begin(), keys_output);
+    thrust::scatter_if(exec, scanned_values.begin(), scanned_values.end(), scanned_tail_flags.begin(), tail_flags.begin(), values_output);
+
+    return thrust::make_pair(keys_output + N, values_output + N); 
+} // end reduce_by_key()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type KeyType;
+
+  // use equal_to<KeyType> as default BinaryPredicate
+  return thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, thrust::equal_to<KeyType>());
+} // end reduce_by_key()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred)
+{
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_output_iterator<OutputIterator2>::value,
+    thrust::iterator_value<InputIterator2>,
+    thrust::iterator_value<OutputIterator2>
+  >::type T;
+
+  // use plus<T> as default BinaryFunction
+  return thrust::reduce_by_key(exec,
+                               keys_first, keys_last, 
+                               values_first,
+                               keys_output,
+                               values_output,
+                               binary_pred,
+                               thrust::plus<T>());
+} // end reduce_by_key()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/remove.h b/thrust/thrust/system/detail/generic/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..343f643e9da5f60a9c53076f136dbca7ca7631e0
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/remove.h
@@ -0,0 +1,113 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file remove.h
+ *  \brief Generic implementations of remove functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  ForwardIterator remove(thrust::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator remove_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/remove.inl>
+
diff --git a/thrust/thrust/system/detail/generic/remove.inl b/thrust/thrust/system/detail/generic/remove.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6cb5a694ba2c9fc1d8602d4b215850fcc34c6c71
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/remove.inl
@@ -0,0 +1,150 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file remove.inl
+ *  \brief Inline file for remove.h
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/remove.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/copy_if.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/remove.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  ForwardIterator remove(thrust::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         const T &value)
+{
+  thrust::detail::equal_to_value<T> pred(value);
+
+  // XXX consider using a placeholder here
+  return thrust::remove_if(exec, first, last, pred);
+} // end remove()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator remove_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             const T &value)
+{
+  thrust::detail::equal_to_value<T> pred(value);
+
+  // XXX consider using a placeholder here
+  return thrust::remove_copy_if(exec, first, last, result, pred);
+} // end remove_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+
+  // create temporary storage for an intermediate result
+  thrust::detail::temporary_array<InputType,DerivedPolicy> temp(exec, first, last);
+
+  // remove into temp
+  return thrust::remove_copy_if(exec, temp.begin(), temp.end(), temp.begin(), first, pred);
+} // end remove_if()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(thrust::execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+
+  // create temporary storage for an intermediate result
+  thrust::detail::temporary_array<InputType,DerivedPolicy> temp(exec, first, last);
+
+  // remove into temp
+  return thrust::remove_copy_if(exec, temp.begin(), temp.end(), stencil, first, pred);
+} // end remove_if() 
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  return thrust::remove_copy_if(exec, first, last, first, result, pred);
+} // end remove_copy_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  return thrust::copy_if(exec, first, last, stencil, result, thrust::detail::not1(pred));
+} // end remove_copy_if()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/replace.h b/thrust/thrust/system/detail/generic/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..6167f711ad16ce3015df0c892394788f317680b2
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/replace.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                 InputIterator first,
+                                 InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                 InputIterator1 first,
+                                 InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename T>
+__host__ __device__
+  OutputIterator replace_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              OutputIterator result,
+                              const T &old_value,
+                              const T &new_value);
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator first,
+                  ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value);
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator first,
+                  ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value);
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void replace(thrust::execution_policy<DerivedPolicy> &exec,
+               ForwardIterator first,
+               ForwardIterator last,
+               const T &old_value,
+               const T &new_value);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/replace.inl>
+
diff --git a/thrust/thrust/system/detail/generic/replace.inl b/thrust/thrust/system/detail/generic/replace.inl
new file mode 100644
index 0000000000000000000000000000000000000000..eea70ccd173c3f3b37a61c0a8b45137bcf06d97b
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/replace.inl
@@ -0,0 +1,178 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/system/detail/generic/replace.h>
+#include <thrust/transform.h>
+#include <thrust/replace.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+// this functor receives x, and returns a new_value if predicate(x) is true; otherwise,
+// it returns x
+template<typename Predicate, typename NewType, typename OutputType>
+  struct new_value_if
+{
+  __host__ __device__
+  new_value_if(Predicate p, NewType nv):pred(p),new_value(nv){}
+
+  template<typename InputType>
+  __host__ __device__
+  OutputType operator()(const InputType &x) const
+  {
+    return pred(x) ? new_value : x;
+  } // end operator()()
+
+  // this version of operator()() works like the previous but
+  // feeds its second argument to pred
+  template<typename InputType, typename PredicateArgumentType>
+  __host__ __device__
+  OutputType operator()(const InputType &x, const PredicateArgumentType &y)
+  {
+    return pred(y) ? new_value : x;
+  } // end operator()()
+  
+  Predicate pred;
+  NewType new_value;
+}; // end new_value_if
+
+
+// this unary functor ignores its argument and returns a constant
+template<typename T>
+  struct constant_unary
+{
+  __host__ __device__
+  constant_unary(T _c):c(_c){}
+
+  template<typename U>
+  __host__ __device__
+  T operator()(U &)
+  {
+    return c;
+  } // end operator()()
+
+  T c;
+}; // end constant_unary
+
+
+} // end detail
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                 InputIterator first,
+                                 InputIterator last,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
+
+  detail::new_value_if<Predicate,T,OutputType> op(pred,new_value);
+  return thrust::transform(exec, first, last, result, op);
+} // end replace_copy_if()
+
+
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+__host__ __device__
+  OutputIterator replace_copy_if(thrust::execution_policy<DerivedPolicy> &exec,
+                                 InputIterator1 first,
+                                 InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value)
+{
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
+
+  detail::new_value_if<Predicate,T,OutputType> op(pred,new_value);
+  return thrust::transform(exec, first, last, stencil, result, op);
+} // end replace_copy_if()
+
+
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename T>
+__host__ __device__
+  OutputIterator replace_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              OutputIterator result,
+                              const T &old_value,
+                              const T &new_value)
+{
+  using thrust::placeholders::_1;
+
+  return thrust::replace_copy_if(exec, first, last, result, _1 == old_value, new_value);
+} // end replace_copy()
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator first,
+                  ForwardIterator last,
+                  Predicate pred,
+                  const T &new_value)
+{
+  detail::constant_unary<T> f(new_value);
+  thrust::transform_if(exec, first, last, first, first, f, pred);
+} // end replace_if()
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
+__host__ __device__
+  void replace_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator first,
+                  ForwardIterator last,
+                  InputIterator stencil,
+                  Predicate pred,
+                  const T &new_value)
+{
+  detail::constant_unary<T> f(new_value);
+  thrust::transform_if(exec, first, last, stencil, first, f, pred);
+} // end replace_if()
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void replace(thrust::execution_policy<DerivedPolicy> &exec,
+               ForwardIterator first,
+               ForwardIterator last,
+               const T &old_value,
+               const T &new_value)
+{
+  using thrust::placeholders::_1;
+
+  return thrust::replace_if(exec, first, last, _1 == old_value, new_value);
+} // end replace()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/reverse.h b/thrust/thrust/system/detail/generic/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..11421d41b43e6eb731edd31c0b0b75ea94085215
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reverse.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename BidirectionalIterator>
+__host__ __device__
+  void reverse(thrust::execution_policy<DerivedPolicy> &exec,
+               BidirectionalIterator first,
+               BidirectionalIterator last);
+
+
+template<typename DerivedPolicy,
+         typename BidirectionalIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator reverse_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                              BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/reverse.inl>
+
diff --git a/thrust/thrust/system/detail/generic/reverse.inl b/thrust/thrust/system/detail/generic/reverse.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b77c75b6fca5491230916b163846e709dfac7886
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/reverse.inl
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/reverse.h>
+#include <thrust/advance.h>
+#include <thrust/distance.h>
+#include <thrust/detail/copy.h>
+#include <thrust/swap.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/reverse_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy, typename BidirectionalIterator>
+__host__ __device__
+  void reverse(thrust::execution_policy<ExecutionPolicy> &exec,
+               BidirectionalIterator first,
+               BidirectionalIterator last)
+{
+  typedef typename thrust::iterator_difference<BidirectionalIterator>::type difference_type;
+
+  // find the midpoint of [first,last)
+  difference_type N = thrust::distance(first, last);
+  BidirectionalIterator mid(first);
+  thrust::advance(mid, N / 2);
+
+  // swap elements of [first,mid) with [last - 1, mid)
+  thrust::swap_ranges(exec, first, mid, thrust::make_reverse_iterator(last));
+} // end reverse()
+
+
+template<typename ExecutionPolicy,
+         typename BidirectionalIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator reverse_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                              BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result)
+{
+  return thrust::copy(exec,
+                      thrust::make_reverse_iterator(last),
+                      thrust::make_reverse_iterator(first),
+                      result);
+} // end reverse_copy()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+
diff --git a/thrust/thrust/system/detail/generic/scalar/binary_search.h b/thrust/thrust/system/detail/generic/scalar/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..373b59a606affd84e68edbf8fe3df44da9e24df6
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scalar/binary_search.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+namespace detail
+{
+
+namespace generic
+{
+
+namespace scalar
+{
+
+template<typename RandomAccessIterator, typename Size, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator lower_bound_n(RandomAccessIterator first,
+                                   Size n,
+                                   const T &val,
+                                   BinaryPredicate comp);
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last,
+                                 const T &val,
+                                 BinaryPredicate comp);
+
+template<typename RandomAccessIterator, typename Size, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator upper_bound_n(RandomAccessIterator first,
+                                   Size n,
+                                   const T &val,
+                                   BinaryPredicate comp);
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last,
+                                 const T &val,
+                                 BinaryPredicate comp);
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+  pair<RandomAccessIterator,RandomAccessIterator>
+    equal_range(RandomAccessIterator first, RandomAccessIterator last,
+                const T &val,
+                BinaryPredicate comp);
+
+template<typename RandomAccessIterator, typename T, typename Compare>
+__host__ __device__
+bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp);
+
+} // end scalar
+
+} // end generic
+
+} // end detail
+
+} // end system
+
+} // end thrust
+
+#include <thrust/system/detail/generic/scalar/binary_search.inl>
+
diff --git a/thrust/thrust/system/detail/generic/scalar/binary_search.inl b/thrust/thrust/system/detail/generic/scalar/binary_search.inl
new file mode 100644
index 0000000000000000000000000000000000000000..06a240f1ebcacef511271e7d4e5f4d65f7330659
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scalar/binary_search.inl
@@ -0,0 +1,159 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/detail/function.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+namespace detail
+{
+
+namespace generic
+{
+
+namespace scalar
+{
+
+template<typename RandomAccessIterator, typename Size, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator lower_bound_n(RandomAccessIterator first,
+                                   Size n,
+                                   const T &val,
+                                   BinaryPredicate comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    BinaryPredicate,
+    bool
+  > wrapped_comp(comp);
+
+  Size start = 0, i;
+  while(start < n)
+  {
+    i = (start + n) / 2;
+    if(wrapped_comp(first[i], val))
+    {
+      start = i + 1;
+    }
+    else
+    {
+      n = i;
+    }
+  } // end while
+  
+  return first + start;
+}
+
+// XXX generalize these upon implementation of scalar::distance & scalar::advance
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last,
+                                 const T &val,
+                                 BinaryPredicate comp)
+{
+  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;
+  return lower_bound_n(first, n, val, comp);
+}
+
+template<typename RandomAccessIterator, typename Size, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator upper_bound_n(RandomAccessIterator first,
+                                   Size n,
+                                   const T &val,
+                                   BinaryPredicate comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    BinaryPredicate,
+    bool
+  > wrapped_comp(comp);
+
+  Size start = 0, i;
+  while(start < n)
+  {
+    i = (start + n) / 2;
+    if(wrapped_comp(val, first[i]))
+    {
+      n = i;
+    }
+    else
+    {
+      start = i + 1;
+    }
+  } // end while
+  
+  return first + start;
+}
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last,
+                                 const T &val,
+                                 BinaryPredicate comp)
+{
+  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;
+  return upper_bound_n(first, n, val, comp);
+}
+
+template<typename RandomAccessIterator, typename T, typename BinaryPredicate>
+__host__ __device__
+  pair<RandomAccessIterator,RandomAccessIterator>
+    equal_range(RandomAccessIterator first, RandomAccessIterator last,
+                const T &val,
+                BinaryPredicate comp)
+{
+  RandomAccessIterator lb = thrust::system::detail::generic::scalar::lower_bound(first, last, val, comp);
+  return thrust::make_pair(lb, thrust::system::detail::generic::scalar::upper_bound(lb, last, val, comp));
+}
+
+
+template<typename RandomAccessIterator, typename T, typename Compare>
+__host__ __device__
+bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp)
+{
+  RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(first, last, value, comp);
+
+  // wrap comp
+  thrust::detail::wrapped_function<
+    Compare,
+    bool
+  > wrapped_comp(comp);
+
+  return iter != last && !wrapped_comp(value,*iter);
+}
+
+} // end scalar
+
+} // end generic
+
+} // end detail
+
+} // end system
+
+} // end thrust
+
+#include <thrust/system/detail/generic/scalar/binary_search.inl>
+
diff --git a/thrust/thrust/system/detail/generic/scan.h b/thrust/thrust/system/detail/generic/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..c32b0f2b9fba5a921aa66fa00c8d75c54d132293
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scan.h
@@ -0,0 +1,99 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+// XXX it is an error to call this function; it has no implementation 
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                BinaryFunction binary_op);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init);
+
+
+// XXX it is an error to call this function; it has no implementation 
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                BinaryFunction binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/scan.inl>
+
diff --git a/thrust/thrust/system/detail/generic/scan.inl b/thrust/thrust/system/detail/generic/scan.inl
new file mode 100644
index 0000000000000000000000000000000000000000..300b697b26877e9b813aa3ccfceb287e7bf34b96
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scan.inl
@@ -0,0 +1,130 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/system/detail/generic/scan.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/scan.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  // assume plus as the associative operator
+  return thrust::inclusive_scan(exec, first, last, result, thrust::plus<>());
+} // end inclusive_scan()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result)
+{
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+
+  // assume 0 as the initialization value
+  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+} // end exclusive_scan()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init)
+{
+  // assume plus as the associative operator
+  return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus<>());
+} // end exclusive_scan()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &,
+                                InputIterator,
+                                InputIterator,
+                                OutputIterator result,
+                                BinaryFunction)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end inclusive_scan
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &,
+                                InputIterator,
+                                InputIterator,
+                                OutputIterator result,
+                                T,
+                                BinaryFunction)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end exclusive_scan()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/scan_by_key.h b/thrust/thrust/system/detail/generic/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c2ea7931b38e16ca52076364b6b4ee24b7932d8
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scan_by_key.h
@@ -0,0 +1,144 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan_by_key.h
+ *  \brief Generic implementations of key-value scans.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/scan_by_key.inl>
+
diff --git a/thrust/thrust/system/detail/generic/scan_by_key.inl b/thrust/thrust/system/detail/generic/scan_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d3d1667a9f323a71f91a35af54bbfa0b981578c2
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scan_by_key.inl
@@ -0,0 +1,246 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/scan_by_key.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/replace.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/scan.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+template <typename OutputType, typename HeadFlagType, typename AssociativeOperator>
+struct segmented_scan_functor
+{
+  AssociativeOperator binary_op;
+  
+  typedef typename thrust::tuple<OutputType, HeadFlagType> result_type;
+  
+  __host__ __device__
+  segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
+  
+  __host__ __device__
+  result_type operator()(result_type a, result_type b)
+  {
+    return result_type(thrust::get<1>(b) ? thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)),
+                       thrust::get<1>(a) | thrust::get<1>(b));
+  }
+};
+
+
+} // end namespace detail
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<InputType1>());
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred)
+{
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<>());
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
+  typedef unsigned int HeadFlagType;
+
+  const size_t n = last1 - first1;
+
+  if(n != 0)
+  {
+    // compute head flags
+    thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
+    flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
+
+    // scan key-flag tuples, 
+    // For additional details refer to Section 2 of the following paper
+    //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
+    //    NVIDIA Technical Report NVR-2008-003, December 2008
+    //    http://mgarland.org/files/papers/nvr-2008-003.pdf
+    thrust::inclusive_scan(exec,
+                           thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())),
+                           thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n,
+                           thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())),
+                           detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
+  }
+
+  return result + n;
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result)
+{
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0));
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<InputType1>());
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred)
+{
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<>());
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op)
+{
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
+  typedef unsigned int HeadFlagType;
+
+  const size_t n = last1 - first1;
+
+  if(n != 0)
+  {
+    InputIterator2 last2 = first2 + n;
+
+    // compute head flags
+    thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
+    flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
+
+    // shift input one to the right and initialize segments with init
+    thrust::detail::temporary_array<OutputType,DerivedPolicy> temp(exec, n);
+    thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate<HeadFlagType>(), init);
+    temp[0] = init;
+
+    // scan key-flag tuples, 
+    // For additional details refer to Section 2 of the following paper
+    //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
+    //    NVIDIA Technical Report NVR-2008-003, December 2008
+    //    http://mgarland.org/files/papers/nvr-2008-003.pdf
+    thrust::inclusive_scan(exec,
+                           thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())),
+                           thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n,
+                           thrust::make_zip_iterator(thrust::make_tuple(result,       flags.begin())),
+                           detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
+  }
+
+  return result + n;
+}
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/scatter.h b/thrust/thrust/system/detail/generic/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a65a4cc01ea23211330192f69999532f6d60575
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scatter.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter(thrust::execution_policy<DerivedPolicy> &exec,
+               InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator output);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+__host__ __device__
+  void scatter_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/scatter.inl>
+
diff --git a/thrust/thrust/system/detail/generic/scatter.inl b/thrust/thrust/system/detail/generic/scatter.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7a1f5229800fab2bb921d3ae20fc55fa3c1834be
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/scatter.inl
@@ -0,0 +1,96 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/scatter.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter(thrust::execution_policy<DerivedPolicy> &exec,
+               InputIterator1 first,
+               InputIterator1 last,
+               InputIterator2 map,
+               RandomAccessIterator output)
+{
+  thrust::transform(exec,
+                    first,
+                    last,
+                    thrust::make_permutation_iterator(output, map),
+                    thrust::identity<typename thrust::iterator_value<InputIterator1>::type>());
+} // end scatter()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator>
+__host__ __device__
+  void scatter_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output)
+{
+  // default predicate is identity
+  typedef typename thrust::iterator_value<InputIterator3>::type StencilType;
+  thrust::scatter_if(exec, first, last, map, stencil, output, thrust::identity<StencilType>());
+} // end scatter_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename RandomAccessIterator,
+         typename Predicate>
+__host__ __device__
+  void scatter_if(thrust::execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 first,
+                  InputIterator1 last,
+                  InputIterator2 map,
+                  InputIterator3 stencil,
+                  RandomAccessIterator output,
+                  Predicate pred)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type InputType;
+  thrust::transform_if(exec, first, last, stencil, thrust::make_permutation_iterator(output, map), thrust::identity<InputType>(), pred);
+} // end scatter_if()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/select_system.h b/thrust/thrust/system/detail/generic/select_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b5d7750362f6f8af42ee3c81930b7ed7d113aa3
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/select_system.h
@@ -0,0 +1,125 @@
+
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename Tag>
+  struct select_system1_exists;
+
+template<typename Tag1, typename Tag2>
+  struct select_system2_exists;
+
+template<typename Tag1, typename Tag2, typename Tag3>
+  struct select_system3_exists;
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4>
+  struct select_system4_exists;
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5>
+  struct select_system5_exists;
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5, typename Tag6>
+  struct select_system6_exists;
+
+template<typename System>
+__host__ __device__
+  typename thrust::detail::disable_if<
+    select_system1_exists<System>::value,
+    System &
+  >::type
+    select_system(thrust::execution_policy<System> &system);
+
+template<typename System1, typename System2>
+__host__ __device__
+  typename thrust::detail::enable_if_defined<
+    thrust::detail::minimum_system<System1,System2>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2);
+
+template<typename System1, typename System2, typename System3>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system3_exists<System1,System2,System3>::value,
+    thrust::detail::minimum_system<System1,System2,System3>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3);
+
+template<typename System1, typename System2, typename System3, typename System4>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system4_exists<System1,System2,System3,System4>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4);
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system5_exists<System1,System2,System3,System4,System5>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5);
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5, typename System6>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system6_exists<System1,System2,System3,System4,System5,System6>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5,System6>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5,
+                   thrust::execution_policy<System6> &system6);
+
+// Map a single any_system_tag to device_system_tag.
+inline __host__ __device__
+thrust::device_system_tag select_system(thrust::any_system_tag);
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/select_system.inl>
diff --git a/thrust/thrust/system/detail/generic/select_system.inl b/thrust/thrust/system/detail/generic/select_system.inl
new file mode 100644
index 0000000000000000000000000000000000000000..fbe3094bed8b769cbd9a68a4d6d25aea47843dc6
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/select_system.inl
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/detail/generic/select_system_exists.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace select_system_detail
+{
+
+
+// min_system case 1: both systems have the same type, just return the first one
+template<typename System>
+__host__ __device__
+System &min_system(thrust::execution_policy<System> &system1,
+                   thrust::execution_policy<System> &)
+{
+  return thrust::detail::derived_cast(system1);
+} // end min_system()
+
+
+// min_system case 2: systems have differing type and the first type is considered the minimum
+template<typename System1, typename System2>
+__host__ __device__
+typename thrust::detail::enable_if<
+  thrust::detail::is_same<
+    System1,
+    typename thrust::detail::minimum_system<System1,System2>::type
+  >::value,
+  System1 &
+>::type
+  min_system(thrust::execution_policy<System1> &system1, thrust::execution_policy<System2> &)
+{
+  return thrust::detail::derived_cast(system1);
+} // end min_system()
+
+
+// min_system case 3: systems have differing type and the second type is considered the minimum
+template<typename System1, typename System2>
+__host__ __device__
+typename thrust::detail::enable_if<
+  thrust::detail::is_same<
+    System2,
+    typename thrust::detail::minimum_system<System1,System2>::type
+  >::value,
+    System2 &
+  >::type
+    min_system(thrust::execution_policy<System1> &, thrust::execution_policy<System2> &system2)
+{
+  return thrust::detail::derived_cast(system2);
+} // end min_system()
+
+
+} // end select_system_detail
+
+
+template<typename System>
+__host__ __device__
+  typename thrust::detail::disable_if<
+    select_system1_exists<System>::value,
+    System &
+  >::type
+    select_system(thrust::execution_policy<System> &system)
+{
+  return thrust::detail::derived_cast(system);
+} // end select_system()
+
+
+template<typename System1, typename System2>
+__host__ __device__
+  typename thrust::detail::enable_if_defined<
+    thrust::detail::minimum_system<System1,System2>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2)
+{
+  return select_system_detail::min_system(system1,system2);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system3_exists<System1,System2,System3>::value,
+    thrust::detail::minimum_system<System1,System2,System3>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3)
+{
+  return select_system(select_system(system1,system2), system3);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system4_exists<System1,System2,System3,System4>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4)
+{
+  return select_system(select_system(system1,system2,system3), system4);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system5_exists<System1,System2,System3,System4,System5>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5)
+{
+  return select_system(select_system(system1,system2,system3,system4), system5);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5, typename System6>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system6_exists<System1,System2,System3,System4,System5,System6>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5,System6>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5,
+                   thrust::execution_policy<System6> &system6)
+{
+  return select_system(select_system(system1,system2,system3,system4,system5), system6);
+} // end select_system()
+
+
+// map a single any_system_tag to device_system_tag
+inline __host__ __device__
+thrust::device_system_tag select_system(thrust::any_system_tag)
+{
+  return thrust::device_system_tag();
+} // end select_system()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/select_system_exists.h b/thrust/thrust/system/detail/generic/select_system_exists.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba8ef8bb7eb1a27f84b40c5e2973133bc6ba3539
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/select_system_exists.h
@@ -0,0 +1,168 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file generic/type_traits.h
+ *  \brief Introspection for free functions defined in generic.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+// forward declaration of any_system_tag for any_conversion below
+struct any_system_tag;
+
+namespace system
+{
+namespace detail
+{
+
+// we must define these traits outside of generic's namespace
+namespace generic_type_traits_ns
+{
+
+typedef char yes;
+typedef char (&no)[2];
+
+struct any_conversion
+{
+  template<typename T> any_conversion(const T &);
+
+  // add this extra constructor to disambiguate conversion from any_system_tag
+  any_conversion(const any_system_tag &);
+};
+
+namespace select_system_exists_ns
+{
+  no select_system(const any_conversion &);
+  no select_system(const any_conversion &, const any_conversion &);
+  no select_system(const any_conversion &, const any_conversion &, const any_conversion &);
+  no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &);
+  no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &);
+  no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &);
+
+  template<typename T> yes check(const T &);
+
+  no check(no);
+
+  template<typename Tag>
+    struct select_system1_exists
+  {
+    static Tag &tag;
+
+    static const bool value = sizeof(check(select_system(tag))) == sizeof(yes);
+  };
+
+  template<typename Tag1, typename Tag2>
+    struct select_system2_exists
+  {
+    static Tag1 &tag1;
+    static Tag2 &tag2;
+
+    static const bool value = sizeof(check(select_system(tag1,tag2))) == sizeof(yes);
+  };
+
+  template<typename Tag1, typename Tag2, typename Tag3>
+    struct select_system3_exists
+  {
+    static Tag1 &tag1;
+    static Tag2 &tag2;
+    static Tag3 &tag3;
+
+    static const bool value = sizeof(check(select_system(tag1,tag2,tag3))) == sizeof(yes);
+  };
+
+  template<typename Tag1, typename Tag2, typename Tag3, typename Tag4>
+    struct select_system4_exists
+  {
+    static Tag1 &tag1;
+    static Tag2 &tag2;
+    static Tag3 &tag3;
+    static Tag4 &tag4;
+
+    static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4))) == sizeof(yes);
+  };
+
+  template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5>
+    struct select_system5_exists
+  {
+    static Tag1 &tag1;
+    static Tag2 &tag2;
+    static Tag3 &tag3;
+    static Tag4 &tag4;
+    static Tag5 &tag5;
+
+    static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5))) == sizeof(yes);
+  };
+
+  template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5, typename Tag6>
+    struct select_system6_exists
+  {
+    static Tag1 &tag1;
+    static Tag2 &tag2;
+    static Tag3 &tag3;
+    static Tag4 &tag4;
+    static Tag5 &tag5;
+    static Tag6 &tag6;
+
+    static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5,tag6))) == sizeof(yes);
+  };
+} // end select_system_exists_ns
+
+} // end generic_type_traits_ns
+
+namespace generic
+{
+
+template<typename Tag>
+  struct select_system1_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system1_exists<Tag>
+{};
+
+template<typename Tag1, typename Tag2>
+  struct select_system2_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system2_exists<Tag1,Tag2>
+{};
+
+template<typename Tag1, typename Tag2, typename Tag3>
+  struct select_system3_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system3_exists<Tag1,Tag2,Tag3>
+{};
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4>
+  struct select_system4_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system4_exists<Tag1,Tag2,Tag3,Tag4>
+{};
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5>
+  struct select_system5_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system5_exists<Tag1,Tag2,Tag3,Tag4,Tag5>
+{};
+
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5, typename Tag6>
+  struct select_system6_exists
+    : generic_type_traits_ns::select_system_exists_ns::select_system6_exists<Tag1,Tag2,Tag3,Tag4,Tag5,Tag6>
+{};
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/sequence.h b/thrust/thrust/system/detail/generic/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7bc842ae67307abcc1568021a2fcaf52e9db555
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/sequence.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last);
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init);
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/sequence.inl>
+
diff --git a/thrust/thrust/system/detail/generic/sequence.inl b/thrust/thrust/system/detail/generic/sequence.inl
new file mode 100644
index 0000000000000000000000000000000000000000..16631c7f43b83d4a29d826982d81a4f9a6f22d7c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/sequence.inl
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/system/detail/generic/sequence.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/tabulate.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type T;
+
+  thrust::sequence(exec, first, last, T(0));
+} // end sequence()
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init)
+{
+  thrust::sequence(exec, first, last, init, T(1));
+} // end sequence()
+
+
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void sequence(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                T init,
+                T step)
+{
+  using thrust::placeholders::_1;
+
+  thrust::tabulate(exec, first, last, init + step * _1);
+} // end sequence()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/set_operations.h b/thrust/thrust/system/detail/generic/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..4dbee0ae40102a62e78dd804b683daa35cb15e7a
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/set_operations.h
@@ -0,0 +1,319 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_difference(thrust::execution_policy<ExecutionPolicy> &exec,
+                              InputIterator1                             first1,
+                              InputIterator1                             last1,
+                              InputIterator2                             first2,
+                              InputIterator2                             last2,
+                              OutputIterator                             result);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_difference(thrust::execution_policy<ExecutionPolicy> &exec,
+                              InputIterator1                             first1,
+                              InputIterator1                             last1,
+                              InputIterator2                             first2,
+                              InputIterator2                             last2,
+                              OutputIterator                             result,
+                              StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                        InputIterator1                             keys_first1,
+                        InputIterator1                             keys_last1,
+                        InputIterator2                             keys_first2,
+                        InputIterator2                             keys_last2,
+                        InputIterator3                             values_first1,
+                        InputIterator4                             values_first2,
+                        OutputIterator1                            keys_result,
+                        OutputIterator2                            values_result);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                        InputIterator1                             keys_first1,
+                        InputIterator1                             keys_last1,
+                        InputIterator2                             keys_first2,
+                        InputIterator2                             keys_last2,
+                        InputIterator3                             values_first1,
+                        InputIterator4                             values_first2,
+                        OutputIterator1                            keys_result,
+                        OutputIterator2                            values_result,
+                        StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_intersection(thrust::execution_policy<ExecutionPolicy> &system,
+                                InputIterator1                             first1,
+                                InputIterator1                             last1,
+                                InputIterator2                             first2,
+                                InputIterator2                             last2,
+                                OutputIterator                             result);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_intersection(thrust::execution_policy<StrictWeakOrdering> &system,
+                                InputIterator1                                first1,
+                                InputIterator1                                last1,
+                                InputIterator2                                first2,
+                                InputIterator2                                last2,
+                                OutputIterator                                result,
+                                StrictWeakOrdering                            comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                          InputIterator1                             keys_first1,
+                          InputIterator1                             keys_last1,
+                          InputIterator2                             keys_first2,
+                          InputIterator2                             keys_last2,
+                          InputIterator3                             values_first1,
+                          OutputIterator1                            keys_result,
+                          OutputIterator2                            values_result);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                          InputIterator1                             keys_first1,
+                          InputIterator1                             keys_last1,
+                          InputIterator2                             keys_first2,
+                          InputIterator2                             keys_last2,
+                          InputIterator3                             values_first1,
+                          OutputIterator1                            keys_result,
+                          OutputIterator2                            values_result,
+                          StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_symmetric_difference(thrust::execution_policy<ExecutionPolicy> &system,
+                                        InputIterator1                             first1,
+                                        InputIterator1                             last1,
+                                        InputIterator2                             first2,
+                                        InputIterator2                             last2,
+                                        OutputIterator                             result);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_symmetric_difference(thrust::execution_policy<ExecutionPolicy> &system,
+                                        InputIterator1                             first1,
+                                        InputIterator1                             last1,
+                                        InputIterator2                             first2,
+                                        InputIterator2                             last2,
+                                        OutputIterator                             result,
+                                        StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                                  InputIterator1                             keys_first1,
+                                  InputIterator1                             keys_last1,
+                                  InputIterator2                             keys_first2,
+                                  InputIterator2                             keys_last2,
+                                  InputIterator3                             values_first1,
+                                  InputIterator4                             values_first2,
+                                  OutputIterator1                            keys_result,
+                                  OutputIterator2                            values_result);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                                  InputIterator1                             keys_first1,
+                                  InputIterator1                             keys_last1,
+                                  InputIterator2                             keys_first2,
+                                  InputIterator2                             keys_last2,
+                                  InputIterator3                             values_first1,
+                                  InputIterator4                             values_first2,
+                                  OutputIterator1                            keys_result,
+                                  OutputIterator2                            values_result,
+                                  StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_union(thrust::execution_policy<ExecutionPolicy> &system,
+                         InputIterator1                             first1,
+                         InputIterator1                             last1,
+                         InputIterator2                             first2,
+                         InputIterator2                             last2,
+                         OutputIterator                             result);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_union(thrust::execution_policy<ExecutionPolicy> &system,
+                         InputIterator1                             first1,
+                         InputIterator1                             last1,
+                         InputIterator2                             first2,
+                         InputIterator2                             last2,
+                         OutputIterator                             result,
+                         StrictWeakOrdering                         comp);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                   InputIterator1                             keys_first1,
+                   InputIterator1                             keys_last1,
+                   InputIterator2                             keys_first2,
+                   InputIterator2                             keys_last2,
+                   InputIterator3                             values_first1,
+                   InputIterator4                             values_first2,
+                   OutputIterator1                            keys_result,
+                   OutputIterator2                            values_result);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(thrust::execution_policy<ExecutionPolicy> &system,
+                   InputIterator1                             keys_first1,
+                   InputIterator1                             keys_last1,
+                   InputIterator2                             keys_first2,
+                   InputIterator2                             keys_last2,
+                   InputIterator3                             values_first1,
+                   InputIterator4                             values_first2,
+                   OutputIterator1                            keys_result,
+                   OutputIterator2                            values_result,
+                   StrictWeakOrdering                         comp);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/set_operations.inl>
+
diff --git a/thrust/thrust/system/detail/generic/set_operations.inl b/thrust/thrust/system/detail/generic/set_operations.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6264aff167f9e3fcf9d7a5cadb51985fbc34c786
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/set_operations.inl
@@ -0,0 +1,477 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/system/detail/generic/set_operations.h>
+#include <thrust/functional.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                              InputIterator1                           first1,
+                              InputIterator1                           last1,
+                              InputIterator2                           first2,
+                              InputIterator2                           last2,
+                              OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_difference(exec, first1, last1, first2, last2, result, thrust::less<value_type>());
+} // end set_difference()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator1                           keys_first1,
+                        InputIterator1                           keys_last1,
+                        InputIterator2                           keys_first2,
+                        InputIterator2                           keys_last2,
+                        InputIterator3                           values_first1,
+                        InputIterator4                           values_first2,
+                        OutputIterator1                          keys_result,
+                        OutputIterator2                          values_result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less<value_type>());
+} // end set_difference_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_difference_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                        InputIterator1                           keys_first1,
+                        InputIterator1                           keys_last1,
+                        InputIterator2                           keys_first2,
+                        InputIterator2                           keys_last2,
+                        InputIterator3                           values_first1,
+                        InputIterator4                           values_first2,
+                        OutputIterator1                          keys_result,
+                        OutputIterator2                          values_result,
+                        StrictWeakOrdering                       comp)
+{
+  typedef thrust::tuple<InputIterator1, InputIterator3>   iterator_tuple1;
+  typedef thrust::tuple<InputIterator2, InputIterator4>   iterator_tuple2;
+  typedef thrust::tuple<OutputIterator1, OutputIterator2> iterator_tuple3;
+
+  typedef thrust::zip_iterator<iterator_tuple1> zip_iterator1;
+  typedef thrust::zip_iterator<iterator_tuple2> zip_iterator2;
+  typedef thrust::zip_iterator<iterator_tuple3> zip_iterator3;
+
+  zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1));
+  zip_iterator1 zipped_last1  = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1));
+
+  zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2));
+  zip_iterator2 zipped_last2  = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2));
+
+  zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result));
+
+  thrust::detail::compare_first<StrictWeakOrdering> comp_first(comp);
+
+  iterator_tuple3 result = thrust::set_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple();
+
+  return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result));
+} // end set_difference_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_intersection(thrust::execution_policy<DerivedPolicy> &exec,
+                                InputIterator1                           first1,
+                                InputIterator1                           last1,
+                                InputIterator2                           first2,
+                                InputIterator2                           last2,
+                                OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_intersection(exec, first1, last1, first2, last2, result, thrust::less<value_type>());
+} // end set_intersection()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator1                           keys_first1,
+                          InputIterator1                           keys_last1,
+                          InputIterator2                           keys_first2,
+                          InputIterator2                           keys_last2,
+                          InputIterator3                           values_first1,
+                          OutputIterator1                          keys_result,
+                          OutputIterator2                          values_result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_intersection_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, thrust::less<value_type>());
+} // end set_intersection_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_intersection_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                          InputIterator1                           keys_first1,
+                          InputIterator1                           keys_last1,
+                          InputIterator2                           keys_first2,
+                          InputIterator2                           keys_last2,
+                          InputIterator3                           values_first1,
+                          OutputIterator1                          keys_result,
+                          OutputIterator2                          values_result,
+                          StrictWeakOrdering                       comp)
+{
+  typedef typename thrust::iterator_value<InputIterator3>::type value_type1;
+  typedef thrust::constant_iterator<value_type1>                constant_iterator;
+
+  typedef thrust::tuple<InputIterator1, InputIterator3>     iterator_tuple1;
+  typedef thrust::tuple<InputIterator2, constant_iterator>  iterator_tuple2;
+  typedef thrust::tuple<OutputIterator1, OutputIterator2>   iterator_tuple3;
+
+  typedef thrust::zip_iterator<iterator_tuple1> zip_iterator1;
+  typedef thrust::zip_iterator<iterator_tuple2> zip_iterator2;
+  typedef thrust::zip_iterator<iterator_tuple3> zip_iterator3;
+
+  // fabricate a values_first2 by repeating a default-constructed value_type1
+  // XXX assumes value_type1 is default-constructible
+  constant_iterator values_first2 = thrust::make_constant_iterator(value_type1());
+
+  zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1));
+  zip_iterator1 zipped_last1  = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1));
+
+  zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2));
+  zip_iterator2 zipped_last2  = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2));
+
+  zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result));
+
+  thrust::detail::compare_first<StrictWeakOrdering> comp_first(comp);
+
+  iterator_tuple3 result = thrust::set_intersection(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple();
+
+  return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result));
+} // end set_intersection_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_symmetric_difference(thrust::execution_policy<DerivedPolicy> &exec,
+                                        InputIterator1                           first1,
+                                        InputIterator1                           last1,
+                                        InputIterator2                           first2,
+                                        InputIterator2                           last2,
+                                        OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_symmetric_difference(exec, first1, last1, first2, last2, result, thrust::less<value_type>());
+} // end set_symmetric_difference()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                  InputIterator1                           keys_first1,
+                                  InputIterator1                           keys_last1,
+                                  InputIterator2                           keys_first2,
+                                  InputIterator2                           keys_last2,
+                                  InputIterator3                           values_first1,
+                                  InputIterator4                           values_first2,
+                                  OutputIterator1                          keys_result,
+                                  OutputIterator2                          values_result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_symmetric_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less<value_type>());
+} // end set_symmetric_difference_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_symmetric_difference_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                                  InputIterator1                           keys_first1,
+                                  InputIterator1                           keys_last1,
+                                  InputIterator2                           keys_first2,
+                                  InputIterator2                           keys_last2,
+                                  InputIterator3                           values_first1,
+                                  InputIterator4                           values_first2,
+                                  OutputIterator1                          keys_result,
+                                  OutputIterator2                          values_result,
+                                  StrictWeakOrdering                       comp)
+{
+  typedef thrust::tuple<InputIterator1, InputIterator3>   iterator_tuple1;
+  typedef thrust::tuple<InputIterator2, InputIterator4>   iterator_tuple2;
+  typedef thrust::tuple<OutputIterator1, OutputIterator2> iterator_tuple3;
+
+  typedef thrust::zip_iterator<iterator_tuple1> zip_iterator1;
+  typedef thrust::zip_iterator<iterator_tuple2> zip_iterator2;
+  typedef thrust::zip_iterator<iterator_tuple3> zip_iterator3;
+
+  zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1));
+  zip_iterator1 zipped_last1  = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1));
+
+  zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2));
+  zip_iterator2 zipped_last2  = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2));
+
+  zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result));
+
+  thrust::detail::compare_first<StrictWeakOrdering> comp_first(comp);
+
+  iterator_tuple3 result = thrust::set_symmetric_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple();
+
+  return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result));
+} // end set_symmetric_difference_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &exec,
+                         InputIterator1                           first1,
+                         InputIterator1                           last1,
+                         InputIterator2                           first2,
+                         InputIterator2                           last2,
+                         OutputIterator                           result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_union(exec, first1, last1, first2, last2, result, thrust::less<value_type>());
+} // end set_union()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator1                           keys_first1,
+                   InputIterator1                           keys_last1,
+                   InputIterator2                           keys_first2,
+                   InputIterator2                           keys_last2,
+                   InputIterator3                           values_first1,
+                   InputIterator4                           values_first2,
+                   OutputIterator1                          keys_result,
+                   OutputIterator2                          values_result)
+{
+  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
+  return thrust::set_union_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less<value_type>());
+} // end set_union_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  set_union_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   InputIterator1                           keys_first1,
+                   InputIterator1                           keys_last1,
+                   InputIterator2                           keys_first2,
+                   InputIterator2                           keys_last2,
+                   InputIterator3                           values_first1,
+                   InputIterator4                           values_first2,
+                   OutputIterator1                          keys_result,
+                   OutputIterator2                          values_result,
+                   StrictWeakOrdering                       comp)
+{
+  typedef thrust::tuple<InputIterator1, InputIterator3>   iterator_tuple1;
+  typedef thrust::tuple<InputIterator2, InputIterator4>   iterator_tuple2;
+  typedef thrust::tuple<OutputIterator1, OutputIterator2> iterator_tuple3;
+
+  typedef thrust::zip_iterator<iterator_tuple1> zip_iterator1;
+  typedef thrust::zip_iterator<iterator_tuple2> zip_iterator2;
+  typedef thrust::zip_iterator<iterator_tuple3> zip_iterator3;
+
+  zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1));
+  zip_iterator1 zipped_last1  = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1));
+
+  zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2));
+  zip_iterator2 zipped_last2  = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2));
+
+  zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result));
+
+  thrust::detail::compare_first<StrictWeakOrdering> comp_first(comp);
+
+  iterator_tuple3 result = thrust::set_union(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple();
+
+  return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result));
+} // end set_union_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_difference(thrust::execution_policy<DerivedPolicy> &,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              InputIterator2,
+                              OutputIterator  result,
+                              StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end set_difference()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_intersection(thrust::execution_policy<DerivedPolicy> &,
+                                InputIterator1,
+                                InputIterator1,
+                                InputIterator2,
+                                InputIterator2,
+                                OutputIterator result,
+                                StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end set_intersection()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_symmetric_difference(thrust::execution_policy<DerivedPolicy> &,
+                                        InputIterator1,
+                                        InputIterator1,
+                                        InputIterator2,
+                                        InputIterator2,
+                                        OutputIterator result,
+                                        StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end set_symmetric_difference()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &,
+                         InputIterator1,
+                         InputIterator1,
+                         InputIterator2,
+                         InputIterator2,
+                         OutputIterator result,
+                         StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+  return result;
+} // end set_union()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/shuffle.h b/thrust/thrust/system/detail/generic/shuffle.h
new file mode 100644
index 0000000000000000000000000000000000000000..a690c11c59e0cefafa4e1573e16d8d5dd5a478b2
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/shuffle.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Generic implementations of shuffle functions.
+ */
+
+#pragma once
+
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust {
+namespace system {
+namespace detail {
+namespace generic {
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g);
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g);
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+}  // end namespace thrust
+
+#include <thrust/system/detail/generic/shuffle.inl>
+
+#endif
diff --git a/thrust/thrust/system/detail/generic/shuffle.inl b/thrust/thrust/system/detail/generic/shuffle.inl
new file mode 100644
index 0000000000000000000000000000000000000000..80b45dc024511f9368c6551c7b4c0263e96efad1
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/shuffle.inl
@@ -0,0 +1,213 @@
+/*
+ *  Copyright 2008-20120 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/temporary_array.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/random.h>
+#include <thrust/scan.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+namespace thrust {
+namespace system {
+namespace detail {
+namespace generic {
+
+// An implementation of a Feistel cipher for operating on 64 bit keys
+class feistel_bijection {
+ private:
+  struct round_state {
+    uint32_t left;
+    uint32_t right;
+  };
+
+ public:
+  template <class URBG>
+  __host__ __device__ feistel_bijection(uint64_t m, URBG&& g) {
+    uint64_t total_bits = get_cipher_bits(m);
+    // Half bits rounded down
+    left_side_bits = total_bits / 2;
+    left_side_mask = (1ull << left_side_bits) - 1;
+    // Half the bits rounded up
+    right_side_bits = total_bits - left_side_bits;
+    right_side_mask = (1ull << right_side_bits) - 1;
+
+    for (uint64_t i = 0; i < num_rounds; i++) {
+      key[i] = g();
+    }
+  }
+
+  __host__ __device__ uint64_t nearest_power_of_two() const {
+    return 1ull << (left_side_bits + right_side_bits);
+  }
+  __host__ __device__ uint64_t operator()(const uint64_t val) const {
+    // Extract the right and left sides of the input
+    uint32_t left = (uint32_t)(val >> right_side_bits);
+    uint32_t right = (uint32_t)(val & right_side_mask);
+    round_state state = {left, right};
+
+    for (uint64_t i = 0; i < num_rounds; i++) {
+      state = do_round(state, i);
+    }
+
+    // Check we have the correct number of bits on each side
+    assert((state.left >> left_side_bits) == 0);
+    assert((state.right >> right_side_bits) == 0);
+
+    // Combine the left and right sides together to get result
+    return state.left << right_side_bits | state.right;
+  }
+
+ private:
+  // Find the nearest power of two
+  __host__ __device__ uint64_t get_cipher_bits(uint64_t m) {
+    uint64_t i = 0;
+    while (m != 0) {
+      i++;
+      m >>= 1;
+    }
+    return i;
+  }
+
+  // Round function, a 'pseudorandom function' whos output is indistinguishable
+  // from random for each key value input. This is not cryptographically secure
+  // but sufficient for generating permutations. We hash the value with the
+  // tau88 engine and combine it with the random bits of the key (provided by
+  // the user-defined engine).
+  __host__ __device__ uint32_t round_function(uint64_t value,
+                                              const uint64_t key) const {
+    uint64_t value_hash = thrust::random::taus88(value)();
+    return (value_hash ^ key) & left_side_mask;
+  }
+
+  __host__ __device__ round_state do_round(const round_state state,
+                                           const uint64_t round) const {
+    const uint32_t new_left = state.right & left_side_mask;
+    const uint32_t round_function_res =
+        state.left ^ round_function(state.right, key[round]);
+    if (right_side_bits != left_side_bits) {
+      // Upper bit of the old right becomes lower bit of new right if we have
+      // odd length feistel
+      const uint32_t new_right =
+          (round_function_res << 1ull) | state.right >> left_side_bits;
+      return {new_left, new_right};
+    }
+    return {new_left, round_function_res};
+  }
+
+  static const uint64_t num_rounds = 8;
+  uint64_t right_side_bits;
+  uint64_t left_side_bits;
+  uint64_t right_side_mask;
+  uint64_t left_side_mask;
+  uint64_t key[num_rounds];
+};
+
+struct key_flag_tuple {
+  uint64_t key;
+  uint64_t flag;
+};
+
+// scan only flags
+struct key_flag_scan_op {
+  __host__ __device__ key_flag_tuple operator()(const key_flag_tuple& a,
+                                                const key_flag_tuple& b) {
+    return {b.key, a.flag + b.flag};
+  }
+};
+
+struct construct_key_flag_op {
+  uint64_t m;
+  feistel_bijection bijection;
+  __host__ __device__ construct_key_flag_op(uint64_t m,
+                                            feistel_bijection bijection)
+      : m(m), bijection(bijection) {}
+  __host__ __device__ key_flag_tuple operator()(uint64_t idx) {
+    auto gather_key = bijection(idx);
+    return key_flag_tuple{gather_key, (gather_key < m) ? 1ull : 0ull};
+  }
+};
+
+template <typename InputIterT, typename OutputIterT>
+struct write_output_op {
+  uint64_t m;
+  InputIterT in;
+  OutputIterT out;
+  // flag contains inclusive scan of valid keys
+  // perform gather using valid keys
+  __thrust_exec_check_disable__
+  __host__ __device__ size_t operator()(key_flag_tuple x) {
+    if (x.key < m) {
+      // -1 because inclusive scan
+      out[x.flag - 1] = in[x.key];
+    }
+    return 0;  // Discarded
+  }
+};
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g) {
+  typedef
+      typename thrust::iterator_traits<RandomIterator>::value_type InputType;
+
+  // copy input to temp buffer
+  thrust::detail::temporary_array<InputType, ExecutionPolicy> temp(exec, first,
+                                                                   last);
+  thrust::shuffle_copy(exec, temp.begin(), temp.end(), first, g);
+}
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g) {
+  // m is the length of the input
+  // we have an available bijection of length n via a feistel cipher
+  size_t m = last - first;
+  feistel_bijection bijection(m, g);
+  uint64_t n = bijection.nearest_power_of_two();
+
+  // perform stream compaction over length n bijection to get length m
+  // pseudorandom bijection over the original input
+  thrust::counting_iterator<uint64_t> indices(0);
+  thrust::transform_iterator<construct_key_flag_op, decltype(indices),
+                             key_flag_tuple>
+      key_flag_it(indices, construct_key_flag_op(m, bijection));
+  write_output_op<RandomIterator, decltype(result)> write_functor{m, first,
+                                                                  result};
+  auto gather_output_it = thrust::make_transform_output_iterator(
+      thrust::discard_iterator<size_t>(), write_functor);
+  // the feistel_bijection outputs a stream of permuted indices in range [0,n)
+  // flag each value < m and compact it, so we have a set of permuted indices in
+  // range [0,m) each thread gathers an input element according to its
+  // pseudorandom permuted index
+  thrust::inclusive_scan(exec, key_flag_it, key_flag_it + n, gather_output_it,
+                         key_flag_scan_op());
+}
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+}  // end namespace thrust
+#endif
diff --git a/thrust/thrust/system/detail/generic/sort.h b/thrust/thrust/system/detail/generic/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d4ac199810cd7e8dcc815c8f90c43f36cb84d61
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/sort.h
@@ -0,0 +1,154 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+  void sort(thrust::execution_policy<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort(thrust::execution_policy<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+  void stable_sort(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void stable_sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first);
+
+
+// XXX it is an error to call this function; it has no implementation
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  bool is_sorted(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Compare>
+__host__ __device__
+  bool is_sorted(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp);
+
+
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator is_sorted_until(thrust::execution_policy<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Compare>
+__host__ __device__
+  ForwardIterator is_sorted_until(thrust::execution_policy<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp);
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/sort.inl>
+
diff --git a/thrust/thrust/system/detail/generic/sort.inl b/thrust/thrust/system/detail/generic/sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5f0fb7ebf3e7a137235726084be6bcad2eb20731
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/sort.inl
@@ -0,0 +1,220 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/sort.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/find.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+#include <thrust/detail/internal_functional.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+  void sort(thrust::execution_policy<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type; 
+  thrust::sort(exec, first, last, thrust::less<value_type>());
+} // end sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort(thrust::execution_policy<DerivedPolicy> &exec,
+            RandomAccessIterator first,
+            RandomAccessIterator last,
+            StrictWeakOrdering comp)
+{
+  // implement with stable_sort
+  thrust::stable_sort(exec, first, last, comp);
+} // end sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
+  thrust::sort_by_key(exec, keys_first, keys_last, values_first, thrust::less<value_type>());
+} // end sort_by_key()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator1 keys_first,
+                   RandomAccessIterator1 keys_last,
+                   RandomAccessIterator2 values_first,
+                   StrictWeakOrdering comp)
+{
+  // implement with stable_sort_by_key
+  thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp);
+} // end sort_by_key()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+  void stable_sort(thrust::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
+  thrust::stable_sort(exec, first, last, thrust::less<value_type>());
+} // end stable_sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+  void stable_sort_by_key(thrust::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first)
+{
+  typedef typename iterator_value<RandomAccessIterator1>::type value_type;
+  thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, thrust::less<value_type>());
+} // end stable_sort_by_key()
+
+
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  bool is_sorted(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  return thrust::is_sorted_until(exec, first, last) == last;
+} // end is_sorted()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Compare>
+__host__ __device__
+  bool is_sorted(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 Compare comp)
+{
+  return thrust::is_sorted_until(exec, first, last, comp) == last;
+} // end is_sorted()
+
+
+template<typename DerivedPolicy, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator is_sorted_until(thrust::execution_policy<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type InputType;
+
+  return thrust::is_sorted_until(exec, first, last, thrust::less<InputType>());
+} // end is_sorted_until()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Compare>
+__host__ __device__
+  ForwardIterator is_sorted_until(thrust::execution_policy<DerivedPolicy> &exec,
+                                  ForwardIterator first,
+                                  ForwardIterator last,
+                                  Compare comp)
+{
+  if(thrust::distance(first,last) < 2) return last;
+
+  typedef thrust::tuple<ForwardIterator,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple>            ZipIterator;
+
+  ForwardIterator first_plus_one = first;
+  thrust::advance(first_plus_one, 1);
+
+  ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first_plus_one, first));
+  ZipIterator zipped_last  = thrust::make_zip_iterator(thrust::make_tuple(last, first));
+
+  return thrust::get<0>(thrust::find_if(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate<Compare>(comp)).get_iterator_tuple());
+} // end is_sorted_until()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort(thrust::execution_policy<DerivedPolicy> &,
+                   RandomAccessIterator,
+                   RandomAccessIterator,
+                   StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value)
+  , "unimplemented for this system"
+  );
+} // end stable_sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+  void stable_sort_by_key(thrust::execution_policy<DerivedPolicy> &,
+                          RandomAccessIterator1,
+                          RandomAccessIterator1,
+                          RandomAccessIterator2,
+                          StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator1, false>::value)
+  , "unimplemented for this system"
+  );
+} // end stable_sort_by_key()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/swap_ranges.h b/thrust/thrust/system/detail/generic/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..78769715c5b3f07ce74ddc3807369c8692af2426
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/swap_ranges.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  ForwardIterator2 swap_ranges(thrust::execution_policy<DerivedPolicy> &exec,
+                               ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/swap_ranges.inl>
+
diff --git a/thrust/thrust/system/detail/generic/swap_ranges.inl b/thrust/thrust/system/detail/generic/swap_ranges.inl
new file mode 100644
index 0000000000000000000000000000000000000000..81977adc236a0a9f34c91d4133cfc2d08a54036c
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/swap_ranges.inl
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/swap_ranges.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/for_each.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+
+// XXX define this here rather than in internal_functional.h
+// to avoid circular dependence between swap.h & internal_functional.h
+struct swap_pair_elements
+{
+  template <typename Tuple>
+  __host__ __device__
+  void operator()(Tuple t)
+  {
+    // use unqualified swap to allow ADL to catch any user-defined swap
+    using thrust::swap;
+    swap(thrust::get<0>(t), thrust::get<1>(t));
+  }
+}; // end swap_pair_elements
+
+
+} // end detail
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  ForwardIterator2 swap_ranges(thrust::execution_policy<DerivedPolicy> &exec,
+                               ForwardIterator1 first1,
+                               ForwardIterator1 last1,
+                               ForwardIterator2 first2)
+{
+  typedef thrust::tuple<ForwardIterator1,ForwardIterator2> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple>              ZipIterator;
+
+  ZipIterator result = thrust::for_each(exec,
+                                        thrust::make_zip_iterator(thrust::make_tuple(first1, first2)),
+                                        thrust::make_zip_iterator(thrust::make_tuple(last1,  first2)),
+                                        detail::swap_pair_elements());
+  return thrust::get<1>(result.get_iterator_tuple());
+} // end swap_ranges()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/tabulate.h b/thrust/thrust/system/detail/generic/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cb75e9281d56c68c6c8a5856f6b20a863894a99
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/tabulate.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename UnaryOperation>
+__host__ __device__
+  void tabulate(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/tabulate.inl>
+
diff --git a/thrust/thrust/system/detail/generic/tabulate.inl b/thrust/thrust/system/detail/generic/tabulate.inl
new file mode 100644
index 0000000000000000000000000000000000000000..1a740d26a88df790de3f2ba651091a972aa5c39d
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/tabulate.inl
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tabulate.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/transform.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename UnaryOperation>
+__host__ __device__
+  void tabulate(thrust::execution_policy<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op)
+{
+  typedef typename iterator_difference<ForwardIterator>::type difference_type;
+
+  // by default, counting_iterator uses a 64b difference_type on 32b platforms to avoid overflowing its counter.
+  // this causes problems when a zip_iterator is created in transform's implementation -- ForwardIterator is
+  // incremented by a 64b difference_type and some compilers warn
+  // to avoid this, specify the counting_iterator's difference_type to be the same as ForwardIterator's.
+  thrust::counting_iterator<difference_type, thrust::use_default, thrust::use_default, difference_type> iter(0);
+
+  thrust::transform(exec, iter, iter + thrust::distance(first, last), first, unary_op);
+} // end tabulate()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+
diff --git a/thrust/thrust/system/detail/generic/tag.h b/thrust/thrust/system/detail/generic/tag.h
new file mode 100644
index 0000000000000000000000000000000000000000..4da1e79ced95412ab558d1380c52b3c2294942e7
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/tag.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file generic/tag.h
+ *  \brief Implementation of the generic backend's tag.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+// tag exists only to make the generic entry points the least priority match
+// during ADL. tag should not be derived from and is constructible from anything
+struct tag
+{
+  template<typename T>
+  __host__ __device__ inline
+  tag(const T &) {}
+};
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/temporary_buffer.h b/thrust/thrust/system/detail/generic/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cf389ca15e904934360ab0a2335da403afff00b
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/temporary_buffer.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/pair.h>
+#include <thrust/detail/pointer.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+  thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
+    get_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n);
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p);
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/temporary_buffer.inl>
+
diff --git a/thrust/thrust/system/detail/generic/temporary_buffer.inl b/thrust/thrust/system/detail/generic/temporary_buffer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..20f33bdaa55158426904165122d72870221f3050
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/temporary_buffer.inl
@@ -0,0 +1,85 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/temporary_buffer.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/malloc_and_free.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename T, typename DerivedPolicy>
+__host__ __device__
+  thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
+    get_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n)
+{
+  thrust::pointer<T,DerivedPolicy> ptr = thrust::malloc<T>(exec, n);
+
+  // check for a failed malloc
+  if(!ptr.get())
+  {
+    n = 0;
+  } // end if
+
+  return thrust::make_pair(ptr, n);
+} // end get_temporary_buffer()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t)
+{
+  // If we are here, no user customization of the three-argument signature with
+  // a size parameter of `return_temporary_buffer` was found. There may be an
+  // old two-argument signature `return_temporary_buffer` though, so we make
+  // another ADL call to try and find one.
+  //
+  // The interface layer downcast and then did ADL dispatch - there were no
+  // matches for DerivedPolicy (aka no one customized the three-argument
+  // signature), so this overload got found an implicit upcast to
+  // `execution_policy<DerivedPolicy>` was done. Now, we're looking for a
+  // customization of the two-argument signature so we need to downcast again.
+  return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p);
+} // end return_temporary_buffer()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p)
+{
+  // If we are here, no user customization of either the old two-argument
+  // signature or the new three-argument signature with a size parameter of
+  // `return_temporary_buffer` was found.
+  thrust::free(exec, p);
+} // end return_temporary_buffer()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/transform.h b/thrust/thrust/system/detail/generic/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..1aa2f4993fead2b6de01cc2faa29f2a49d950fd3
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+__host__ __device__
+  OutputIterator transform(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op);
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator first,
+                               InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator1 first,
+                               InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator1 first1,
+                               InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/transform.inl>
+
diff --git a/thrust/thrust/system/detail/generic/transform.inl b/thrust/thrust/system/detail/generic/transform.inl
new file mode 100644
index 0000000000000000000000000000000000000000..589eb65c71e3ba26e6f38ba089a8503ebfce0f90
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform.inl
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/transform.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/internal_functional.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+__host__ __device__
+  OutputIterator transform(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op)
+{
+  typedef thrust::detail::unary_transform_functor<UnaryFunction> UnaryTransformFunctor;
+
+  // make an iterator tuple
+  typedef thrust::tuple<InputIterator,OutputIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_result =
+    thrust::for_each(exec,
+                     thrust::make_zip_iterator(thrust::make_tuple(first,result)),
+                     thrust::make_zip_iterator(thrust::make_tuple(last,result)),
+                     UnaryTransformFunctor(op));
+
+  return thrust::get<1>(zipped_result.get_iterator_tuple());
+} // end transform()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op)
+{
+  // given the minimal system, determine the binary transform functor we need
+  typedef thrust::detail::binary_transform_functor<BinaryFunction> BinaryTransformFunctor;
+
+  // make an iterator tuple
+  typedef thrust::tuple<InputIterator1,InputIterator2,OutputIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_result =
+    thrust::for_each(exec,
+                     thrust::make_zip_iterator(thrust::make_tuple(first1,first2,result)),
+                     thrust::make_zip_iterator(thrust::make_tuple(last1,first2,result)),
+                     BinaryTransformFunctor(op));
+
+  return thrust::get<2>(zipped_result.get_iterator_tuple());
+} // end transform()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator first,
+                               InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred)
+{
+  typedef thrust::detail::unary_transform_if_functor<UnaryFunction,Predicate> UnaryTransformIfFunctor;
+
+  // make an iterator tuple
+  typedef thrust::tuple<InputIterator,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_result =
+    thrust::for_each(exec,
+                     thrust::make_zip_iterator(thrust::make_tuple(first,result)),
+                     thrust::make_zip_iterator(thrust::make_tuple(last,result)),
+                     UnaryTransformIfFunctor(unary_op,pred));
+
+  return thrust::get<1>(zipped_result.get_iterator_tuple());
+} // end transform_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator1 first,
+                               InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction unary_op,
+                               Predicate pred)
+{
+  typedef thrust::detail::unary_transform_if_with_stencil_functor<UnaryFunction,Predicate> UnaryTransformIfFunctor;
+
+  // make an iterator tuple
+  typedef thrust::tuple<InputIterator1,InputIterator2,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_result =
+    thrust::for_each(exec,
+                     thrust::make_zip_iterator(thrust::make_tuple(first,stencil,result)),
+                     thrust::make_zip_iterator(thrust::make_tuple(last,stencil,result)),
+                     UnaryTransformIfFunctor(unary_op,pred));
+
+  return thrust::get<2>(zipped_result.get_iterator_tuple());
+} // end transform_if()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(thrust::execution_policy<DerivedPolicy> &exec,
+                               InputIterator1 first1,
+                               InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred)
+{
+  typedef thrust::detail::binary_transform_if_functor<BinaryFunction,Predicate> BinaryTransformIfFunctor;
+
+  // make an iterator tuple
+  typedef thrust::tuple<InputIterator1,InputIterator2,InputIterator3,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_result =
+    thrust::for_each(exec,
+                     thrust::make_zip_iterator(thrust::make_tuple(first1,first2,stencil,result)),
+                     thrust::make_zip_iterator(thrust::make_tuple(last1,first2,stencil,result)),
+                     BinaryTransformIfFunctor(binary_op,pred));
+
+  return thrust::get<3>(zipped_result.get_iterator_tuple());
+} // end transform_if()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/transform_reduce.h b/thrust/thrust/system/detail/generic/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..23123fa4952f7b30619ad502ff13f7de7a245445
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform_reduce.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType transform_reduce(thrust::execution_policy<ExecutionPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/transform_reduce.inl>
+
diff --git a/thrust/thrust/system/detail/generic/transform_reduce.inl b/thrust/thrust/system/detail/generic/transform_reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7340f8355cda39476a76490ffb57ff03c58b23b0
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform_reduce.inl
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/transform_reduce.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType transform_reduce(thrust::execution_policy<DerivedPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op)
+{
+  thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_first(first, unary_op);
+  thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_last(last, unary_op);
+
+  return thrust::reduce(exec, xfrm_first, xfrm_last, init, binary_op);
+} // end transform_reduce()
+
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/generic/transform_scan.h b/thrust/thrust/system/detail/generic/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f81434fc4f49afeb616d1b18678807909acebe3
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform_scan.h
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform_inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          BinaryFunction binary_op);
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          T init,
+                                          AssociativeOperator binary_op);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/transform_scan.inl>
+
diff --git a/thrust/thrust/system/detail/generic/transform_scan.inl b/thrust/thrust/system/detail/generic/transform_scan.inl
new file mode 100644
index 0000000000000000000000000000000000000000..31053cd10550499edaacf3f14fe919da1cf35cbf
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/transform_scan.inl
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/transform_scan.h>
+#include <thrust/scan.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/function_traits.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform_inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          BinaryFunction binary_op)
+{
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using InputType = typename thrust::iterator_value<InputIterator>::type;
+#if THRUST_CPP_DIALECT < 2017
+  using ValueType = typename std::result_of<UnaryFunction(InputType)>::type;
+#else
+  using ValueType = std::invoke_result_t<UnaryFunction, InputType>;
+#endif
+
+  thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
+  thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
+
+  return thrust::inclusive_scan(exec, _first, _last, result, binary_op);
+} // end transform_inclusive_scan()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename InitialValueType,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          InitialValueType init,
+                                          AssociativeOperator binary_op)
+{
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
+
+  thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
+  thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
+
+  return thrust::exclusive_scan(exec, _first, _last, result, init, binary_op);
+} // end transform_exclusive_scan()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/uninitialized_copy.h b/thrust/thrust/system/detail/generic/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d1b0010dfbfd8587bac2167b25cd4982d3ad468
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/uninitialized_copy.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result);
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename Size,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/uninitialized_copy.inl>
+
diff --git a/thrust/thrust/system/detail/generic/uninitialized_copy.inl b/thrust/thrust/system/detail/generic/uninitialized_copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..d6babf65c08997f86d25cd5326727e5ad5102c19
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/uninitialized_copy.inl
@@ -0,0 +1,193 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/uninitialized_copy.h>
+#include <thrust/copy.h>
+#include <thrust/for_each.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+template<typename InputType,
+         typename OutputType>
+  struct uninitialized_copy_functor
+{
+  template<typename Tuple>
+  __host__ __device__
+  void operator()(Tuple t)
+  {
+    const InputType &in = thrust::get<0>(t);
+    OutputType &out = thrust::get<1>(t);
+
+    ::new(static_cast<void*>(&out)) OutputType(in);
+  } // end operator()()
+}; // end uninitialized_copy_functor
+
+
+// non-trivial copy constructor path
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result,
+                                     thrust::detail::false_type) // has_trivial_copy_constructor
+{
+  // zip up the iterators
+  typedef thrust::tuple<InputIterator,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result));
+  ZipIterator end = begin;
+
+  // get a zip_iterator pointing to the end
+  const typename thrust::iterator_difference<InputIterator>::type n = thrust::distance(first,last);
+  thrust::advance(end, n);
+
+  // create a functor
+  typedef typename iterator_traits<InputIterator>::value_type InputType;
+  typedef typename iterator_traits<ForwardIterator>::value_type OutputType;
+
+  detail::uninitialized_copy_functor<InputType, OutputType> f;
+
+  // do the for_each
+  thrust::for_each(exec, begin, end, f);
+
+  // return the end of the output range
+  return thrust::get<1>(end.get_iterator_tuple());
+} // end uninitialized_copy()
+
+
+// trivial copy constructor path
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result,
+                                     thrust::detail::true_type) // has_trivial_copy_constructor
+{
+  return thrust::copy(exec, first, last, result);
+} // end uninitialized_copy()
+
+
+// non-trivial copy constructor path
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename Size,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result,
+                                       thrust::detail::false_type) // has_trivial_copy_constructor
+{
+  // zip up the iterators
+  typedef thrust::tuple<InputIterator,ForwardIterator> IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first,result));
+
+  // create a functor
+  typedef typename iterator_traits<InputIterator>::value_type   InputType;
+  typedef typename iterator_traits<ForwardIterator>::value_type OutputType;
+
+  detail::uninitialized_copy_functor<InputType, OutputType> f;
+
+  // do the for_each_n
+  ZipIterator zipped_last = thrust::for_each_n(exec, zipped_first, n, f);
+
+  // return the end of the output range
+  return thrust::get<1>(zipped_last.get_iterator_tuple());
+} // end uninitialized_copy_n()
+
+
+// trivial copy constructor path
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename Size,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result,
+                                       thrust::detail::true_type) // has_trivial_copy_constructor
+{
+  return thrust::copy_n(exec, first, n, result);
+} // end uninitialized_copy_n()
+
+
+} // end detail
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result)
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ResultType;
+
+  typedef typename thrust::detail::has_trivial_copy_constructor<ResultType>::type ResultTypeHasTrivialCopyConstructor;
+
+  return thrust::system::detail::generic::detail::uninitialized_copy(exec, first, last, result, ResultTypeHasTrivialCopyConstructor());
+} // end uninitialized_copy()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename Size,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(thrust::execution_policy<ExecutionPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result)
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ResultType;
+
+  typedef typename thrust::detail::has_trivial_copy_constructor<ResultType>::type ResultTypeHasTrivialCopyConstructor;
+
+  return thrust::system::detail::generic::detail::uninitialized_copy_n(exec, first, n, result, ResultTypeHasTrivialCopyConstructor());
+} // end uninitialized_copy_n()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/uninitialized_fill.h b/thrust/thrust/system/detail/generic/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..6acc65d083d82c568fd6c2dd9240a4c09920f13a
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/uninitialized_fill.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  void uninitialized_fill(thrust::execution_policy<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x);
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Size,
+         typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(thrust::execution_policy<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x);
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/uninitialized_fill.inl>
+
diff --git a/thrust/thrust/system/detail/generic/uninitialized_fill.inl b/thrust/thrust/system/detail/generic/uninitialized_fill.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0d4cf3f54122f5741dfcaa4116a484ac43f926f5
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/uninitialized_fill.inl
@@ -0,0 +1,134 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/uninitialized_fill.h>
+#include <thrust/fill.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  void uninitialized_fill(thrust::execution_policy<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x,
+                          thrust::detail::true_type) // has_trivial_copy_constructor
+{
+  thrust::fill(exec, first, last, x);
+} // end uninitialized_fill()
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  void uninitialized_fill(thrust::execution_policy<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x,
+                          thrust::detail::false_type) // has_trivial_copy_constructor
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ValueType;
+
+  thrust::for_each(exec, first, last, thrust::detail::uninitialized_fill_functor<ValueType>(x));
+} // end uninitialized_fill()
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Size,
+         typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(thrust::execution_policy<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x,
+                                       thrust::detail::true_type) // has_trivial_copy_constructor
+{
+  return thrust::fill_n(exec, first, n, x);
+} // end uninitialized_fill()
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Size,
+         typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(thrust::execution_policy<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x,
+                                       thrust::detail::false_type) // has_trivial_copy_constructor
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ValueType;
+
+  return thrust::for_each_n(exec, first, n, thrust::detail::uninitialized_fill_functor<ValueType>(x));
+} // end uninitialized_fill()
+
+} // end detail
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T>
+__host__ __device__
+  void uninitialized_fill(thrust::execution_policy<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x)
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ValueType;
+
+  typedef thrust::detail::has_trivial_copy_constructor<ValueType> ValueTypeHasTrivialCopyConstructor;
+
+  thrust::system::detail::generic::detail::uninitialized_fill(exec, first, last, x,
+    ValueTypeHasTrivialCopyConstructor());
+} // end uninitialized_fill()
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Size,
+         typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(thrust::execution_policy<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x)
+{
+  typedef typename iterator_traits<ForwardIterator>::value_type ValueType;
+
+  typedef thrust::detail::has_trivial_copy_constructor<ValueType> ValueTypeHasTrivialCopyConstructor;
+
+  return thrust::system::detail::generic::detail::uninitialized_fill_n(exec, first, n, x,
+    ValueTypeHasTrivialCopyConstructor());
+} // end uninitialized_fill()
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/unique.h b/thrust/thrust/system/detail/generic/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..04388cbc008f031de63fc814b95d11485ec27fac
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/unique.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+ForwardIterator unique(thrust::execution_policy<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+ForwardIterator unique(thrust::execution_policy<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last,
+                       BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator output);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator output,
+                           BinaryPredicate binary_pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/unique.inl>
+
diff --git a/thrust/thrust/system/detail/generic/unique.inl b/thrust/thrust/system/detail/generic/unique.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4cd3459fd67791ccdf8963b18bfe37126c833ba7
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/unique.inl
@@ -0,0 +1,113 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file unique.inl
+ *  \brief Inline file for unique.h.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/unique.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/transform.h>
+#include <thrust/unique.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/copy_if.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/detail/range/head_flags.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  ForwardIterator unique(thrust::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+
+  return thrust::unique(exec, first, last, thrust::equal_to<InputType>());
+} // end unique()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  ForwardIterator unique(thrust::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
+  
+  thrust::detail::temporary_array<InputType,DerivedPolicy> input(exec, first, last);
+  
+  return thrust::unique_copy(exec, input.begin(), input.end(), first, binary_pred);
+} // end unique()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output)
+{
+  typedef typename thrust::iterator_value<InputIterator>::type value_type;
+  return thrust::unique_copy(exec, first,last,output,thrust::equal_to<value_type>());
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred)
+{
+  thrust::detail::head_flags<InputIterator, BinaryPredicate> stencil(first, last, binary_pred);
+  
+  using namespace thrust::placeholders;
+  
+  return thrust::copy_if(exec, first, last, stencil.begin(), output, _1);
+} // end unique_copy()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/generic/unique_by_key.h b/thrust/thrust/system/detail/generic/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb03179deed6ca3b4adfcd06cc4dabab1e0a3744
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/unique_by_key.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first);
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred);
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/generic/unique_by_key.inl>
+
diff --git a/thrust/thrust/system/detail/generic/unique_by_key.inl b/thrust/thrust/system/detail/generic/unique_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ff8c5b554bfb1f97c95c2d9bd1ca81cb227c971b
--- /dev/null
+++ b/thrust/thrust/system/detail/generic/unique_by_key.inl
@@ -0,0 +1,140 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/unique_by_key.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/internal_functional.h>
+#include <thrust/detail/copy_if.h>
+#include <thrust/unique.h>
+#include <thrust/detail/range/head_flags.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+              ForwardIterator1 keys_first,
+              ForwardIterator1 keys_last,
+              ForwardIterator2 values_first)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator1>::value_type KeyType;
+  return thrust::unique_by_key(exec, keys_first, keys_last, values_first, thrust::equal_to<KeyType>());
+} // end unique_by_key()
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+              ForwardIterator1 keys_first,
+              ForwardIterator1 keys_last,
+              ForwardIterator2 values_first,
+              BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType1;
+  typedef typename thrust::iterator_traits<ForwardIterator2>::value_type InputType2;
+
+  ForwardIterator2 values_last = values_first + (keys_last - keys_first);
+
+  thrust::detail::temporary_array<InputType1,ExecutionPolicy> keys(exec, keys_first, keys_last);
+  thrust::detail::temporary_array<InputType2,ExecutionPolicy> vals(exec, values_first, values_last);
+
+  return thrust::unique_by_key_copy(exec, keys.begin(), keys.end(), vals.begin(), keys_first, values_first, binary_pred);
+} // end unique_by_key()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator1 keys_first,
+                   InputIterator1 keys_last,
+                   InputIterator2 values_first,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
+  return thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_output, values_output, thrust::equal_to<KeyType>());
+} // end unique_by_key_copy()
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator1 keys_first,
+                   InputIterator1 keys_last,
+                   InputIterator2 values_first,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output,
+                   BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::difference_type difference_type;
+
+  difference_type n = thrust::distance(keys_first, keys_last);
+
+  thrust::detail::head_flags<InputIterator1, BinaryPredicate> stencil(keys_first, keys_last, binary_pred);
+
+  using namespace thrust::placeholders;
+  thrust::zip_iterator< thrust::tuple<OutputIterator1, OutputIterator2> > result =
+    thrust::copy_if(exec,
+                    thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
+                    thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)) + n,
+                    stencil.begin(),
+                    thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)),
+                    _1);
+
+  difference_type output_size = result - thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output));
+
+  return thrust::make_pair(keys_output + output_size, values_output + output_size);
+} // end unique_by_key_copy()
+
+
+} // end namespace generic
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/internal/decompose.h b/thrust/thrust/system/detail/internal/decompose.h
new file mode 100644
index 0000000000000000000000000000000000000000..e949f202485e7356dd1296c258a26bdd28e40840
--- /dev/null
+++ b/thrust/thrust/system/detail/internal/decompose.h
@@ -0,0 +1,114 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace internal
+{
+
+  template <typename IndexType>
+    class index_range
+    {
+      public:
+        typedef IndexType index_type;
+
+        __host__ __device__
+          index_range(index_type begin, index_type end) : m_begin(begin), m_end(end) {}
+
+        __host__ __device__
+          index_type begin(void) const { return m_begin; }
+
+        __host__ __device__
+          index_type end(void)   const { return m_end; }
+
+        __host__ __device__
+          index_type size(void)  const { return m_end - m_begin; }
+
+      private:
+        index_type m_begin;
+        index_type m_end;
+    };
+
+  template <typename IndexType>
+    class uniform_decomposition
+    {
+      public:
+        typedef IndexType               index_type;
+        typedef index_range<index_type> range_type;
+
+        __host__ __device__
+        uniform_decomposition(index_type N, index_type granularity, index_type max_intervals)
+          : m_N(N),
+	    m_intervals((N + granularity - 1) / granularity),
+	    m_threshold(0),
+	    m_small_interval(granularity),
+	    m_large_interval(0)
+        {
+	  if(m_intervals > max_intervals)
+          {
+	    m_small_interval = granularity * (m_intervals / max_intervals);
+	    m_large_interval = m_small_interval + granularity;
+	    m_threshold      = m_intervals % max_intervals;
+	    m_intervals      = max_intervals;
+	  }
+        }
+
+        __host__ __device__
+          index_range<index_type> operator[](const index_type& i) const
+          {
+            if (i < m_threshold)
+            {
+              index_type begin = m_large_interval * i;
+              index_type end   = begin + m_large_interval;
+              return range_type(begin, end);
+            }
+            else
+            {
+              index_type begin = m_large_interval * m_threshold + m_small_interval * (i - m_threshold);
+              index_type end   = (begin + m_small_interval < m_N) ? begin + m_small_interval : m_N;
+              return range_type(begin, end);
+            }
+          }
+
+        __host__ __device__
+          index_type size(void) const
+          {
+            return m_intervals;
+          }
+
+      private:
+
+        index_type m_N;
+        index_type m_intervals;
+        index_type m_threshold;
+        index_type m_small_interval;
+        index_type m_large_interval;
+    };
+
+
+} // end namespace internal
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/adjacent_difference.h b/thrust/thrust/system/detail/sequential/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6b0ee1b246ef2f18968a05458204236d2d8d53d
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/adjacent_difference.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file adjacent_difference.h
+ *  \brief Sequential implementation of adjacent_difference.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+OutputIterator adjacent_difference(sequential::execution_policy<DerivedPolicy> &,
+                                   InputIterator first,
+                                   InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
+
+  if(first == last)
+    return result;
+
+  InputType curr = *first;
+
+  *result = curr;
+
+  while(++first != last)
+  {
+    InputType next = *first;
+    *(++result) = binary_op(next, curr);
+    curr = next;
+  }
+
+  return ++result;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/assign_value.h b/thrust/thrust/system/detail/sequential/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..699bcbcd7847ccfa14f8fb8ffe1591f7ced8f957
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/assign_value.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/detail/raw_pointer_cast.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+__host__ __device__
+  void assign_value(sequential::execution_policy<DerivedPolicy> &, Pointer1 dst, Pointer2 src)
+{
+  *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src);
+} // end assign_value()
+
+} // end sequential
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/sequential/binary_search.h b/thrust/thrust/system/detail/sequential/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..54534143ecd7a4712094461b60c6e2b902a6781e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/binary_search.h
@@ -0,0 +1,157 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file binary_search.h
+ *  \brief Sequential implementation of binary search algorithms.
+ */
+
+#pragma once
+
+#include <thrust/advance.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T,
+         typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator lower_bound(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T& val,
+                            StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  typedef typename thrust::iterator_difference<ForwardIterator>::type difference_type;
+
+  difference_type len = thrust::distance(first, last);
+
+  while(len > 0)
+  {
+    difference_type half = len >> 1;
+    ForwardIterator middle = first;
+
+    thrust::advance(middle, half);
+
+    if(wrapped_comp(*middle, val))
+    {
+      first = middle;
+      ++first;
+      len = len - half - 1;
+    }
+    else
+    {
+      len = half;
+    }
+  }
+
+  return first;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T,
+         typename StrictWeakOrdering>
+__host__ __device__
+ForwardIterator upper_bound(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            const T& val, 
+                            StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  typedef typename thrust::iterator_difference<ForwardIterator>::type difference_type;
+
+  difference_type len = thrust::distance(first, last);
+
+  while(len > 0)
+  {
+    difference_type half = len >> 1;
+    ForwardIterator middle = first;
+
+    thrust::advance(middle, half);
+
+    if(wrapped_comp(val, *middle))
+    {
+      len = half;
+    }
+    else
+    {
+      first = middle;
+      ++first;
+      len = len - half - 1;
+    }
+  }
+
+  return first;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename T,
+         typename StrictWeakOrdering>
+__host__ __device__
+bool binary_search(sequential::execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator first,
+                   ForwardIterator last,
+                   const T& val, 
+                   StrictWeakOrdering comp)
+{
+  ForwardIterator iter = sequential::lower_bound(exec, first, last, val, comp);
+
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  return iter != last && !wrapped_comp(val,*iter);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/copy.h b/thrust/thrust/system/detail/sequential/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..80853f670020fe3926c38f716cc359e8a94f5e70
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/copy.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file copy.h
+ *  \brief Sequential implementations of copy algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(sequential::execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(sequential::execution_policy<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/copy.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/copy.inl b/thrust/thrust/system/detail/sequential/copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..8027681d0fad3cc3b3c1b22b570d6231fe293315
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/copy.inl
@@ -0,0 +1,145 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/copy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/detail/sequential/general_copy.h>
+#include <thrust/system/detail/sequential/trivial_copy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace copy_detail
+{
+
+
+// returns the raw pointer associated with a Pointer-like thing
+template<typename Pointer>
+__host__ __device__
+  typename thrust::detail::pointer_traits<Pointer>::raw_pointer
+    get(Pointer ptr)
+{
+  return thrust::detail::pointer_traits<Pointer>::get(ptr);
+}
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::detail::true_type)  // is_indirectly_trivially_relocatable_to
+{
+  typedef typename thrust::iterator_difference<InputIterator>::type Size;
+
+  const Size n = last - first;
+  thrust::system::detail::sequential::trivial_copy_n(get(&*first), n, get(&*result));
+  return result + n;
+} // end copy()
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::detail::false_type)  // is_indirectly_trivially_relocatable_to
+{
+  return thrust::system::detail::sequential::general_copy(first,last,result);
+} // end copy()
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::detail::true_type)  // is_indirectly_trivially_relocatable_to
+{
+  thrust::system::detail::sequential::trivial_copy_n(get(&*first), n, get(&*result));
+  return result + n;
+} // end copy_n()
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::detail::false_type)  // is_indirectly_trivially_relocatable_to
+{
+  return thrust::system::detail::sequential::general_copy_n(first,n,result);
+} // end copy_n()
+
+
+} // end namespace copy_detail
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy(sequential::execution_policy<DerivedPolicy> &,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result)
+{
+  return thrust::system::detail::sequential::copy_detail::copy(first, last, result,
+    typename thrust::is_indirectly_trivially_relocatable_to<InputIterator,OutputIterator>::type());
+} // end copy()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator copy_n(sequential::execution_policy<DerivedPolicy> &,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result)
+{
+  return thrust::system::detail::sequential::copy_detail::copy_n(first, n, result,
+    typename thrust::is_indirectly_trivially_relocatable_to<InputIterator,OutputIterator>::type());
+} // end copy_n()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/copy_backward.h b/thrust/thrust/system/detail/sequential/copy_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..e825436b109b8c5db96c973747f32e69dc7f5fa1
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/copy_backward.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename BidirectionalIterator1,
+         typename BidirectionalIterator2>
+__host__ __device__
+BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, 
+                                     BidirectionalIterator1 last, 
+                                     BidirectionalIterator2 result)
+{
+  while (first != last)
+  {
+    --last;
+    --result;
+    *result = *last;
+  }
+
+  return result;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/copy_if.h b/thrust/thrust/system/detail/sequential/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb29ccdebbf4eb49fb15ad8edbc9fb3c0e6cad59
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/copy_if.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file copy_if.h
+ *  \brief Sequential implementation of copy_if.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator copy_if(sequential::execution_policy<DerivedPolicy> &,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  thrust::detail::wrapped_function<Predicate,bool> wrapped_pred(pred);
+
+  while(first != last)
+  {
+    if(wrapped_pred(*stencil))
+    {
+      *result = *first;
+      ++result;
+    } // end if
+
+    ++first;
+    ++stencil;
+  } // end while
+
+  return result;
+} // end copy_if()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/count.h b/thrust/thrust/system/detail/sequential/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed3cadffe364db5a6b63407a2d35ddd9d18b0a3b
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/count.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special count functions
+
diff --git a/thrust/thrust/system/detail/sequential/equal.h b/thrust/thrust/system/detail/sequential/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d31e70f6fdaedcb3215a737888a6c5ac11621ab
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/equal.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special equal functions
+
diff --git a/thrust/thrust/system/detail/sequential/execution_policy.h b/thrust/thrust/system/detail/sequential/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..81d52f14087e8681e425d176c6c8e3991a5cfda7
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/execution_policy.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+// this awkward sequence of definitions arises
+// from the desire both for tag to derive
+// from execution_policy and for execution_policy
+// to convert to tag (when execution_policy is not
+// an ancestor of tag)
+
+// forward declaration of tag
+struct tag;
+
+// forward declaration of execution_policy
+template<typename> struct execution_policy;
+
+// specialize execution_policy for tag
+template<>
+  struct execution_policy<tag>
+    : thrust::execution_policy<tag>
+{};
+
+// tag's definition comes before the generic definition of execution_policy
+struct tag : execution_policy<tag>
+{
+  __host__ __device__ THRUST_CONSTEXPR tag() {}
+};
+
+// allow conversion to tag when it is not a successor
+template<typename Derived>
+  struct execution_policy
+    : thrust::execution_policy<Derived>
+{
+  // allow conversion to tag
+  inline operator tag () const
+  {
+    return tag();
+  }
+};
+
+
+THRUST_INLINE_CONSTANT tag seq;
+
+
+} // end sequential
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/sequential/extrema.h b/thrust/thrust/system/detail/sequential/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bfa5a17d996990e38fdc3fe43ccfae90609a681
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/extrema.h
@@ -0,0 +1,139 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file extrema.h
+ *  \brief Sequential implementations of extrema functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+ForwardIterator min_element(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    BinaryPredicate,
+    bool
+  > wrapped_comp(comp);
+
+  ForwardIterator imin = first;
+
+  for(; first != last; ++first)
+  {
+    if(wrapped_comp(*first, *imin))
+    {
+      imin = first;
+    }
+  }
+
+  return imin;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+ForwardIterator max_element(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    BinaryPredicate,
+    bool
+  > wrapped_comp(comp);
+
+  ForwardIterator imax = first;
+
+  for(; first != last; ++first)
+  {
+    if(wrapped_comp(*imax, *first))
+    {
+      imax = first;
+    }
+  }
+
+  return imax;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(sequential::execution_policy<DerivedPolicy> &,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    BinaryPredicate,
+    bool
+  > wrapped_comp(comp);
+  
+  ForwardIterator imin = first;
+  ForwardIterator imax = first;
+
+  for(; first != last; ++first)
+  {
+    if(wrapped_comp(*first, *imin))
+    {
+      imin = first;
+    }
+
+    if(wrapped_comp(*imax, *first))
+    {
+      imax = first;
+    }
+  }
+
+  return thrust::make_pair(imin, imax);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/fill.h b/thrust/thrust/system/detail/sequential/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..20c636096c25e9a2d951ad2f50a4de72d0d1b968
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/fill.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special fill functions
+
diff --git a/thrust/thrust/system/detail/sequential/find.h b/thrust/thrust/system/detail/sequential/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e551b74a66e56f3a01186ae82c3dd914741a074
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/find.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file find.h
+ *  \brief Sequential implementation of find_if. 
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+InputIterator find_if(execution_policy<DerivedPolicy> &,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  while(first != last)
+  {
+    if (wrapped_pred(*first))
+      return first;
+
+    ++first;
+  }
+
+  // return first so zip_iterator works correctly
+  return first;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/for_each.h b/thrust/thrust/system/detail/sequential/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e83d18c127027dfb0d11906db47909b896cf053
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/for_each.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file for_each.h
+ *  \brief Sequential implementations of for_each functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each(sequential::execution_policy<DerivedPolicy> &,
+                       InputIterator first,
+                       InputIterator last,
+                       UnaryFunction f)
+{
+  // wrap f
+  thrust::detail::wrapped_function<
+    UnaryFunction,
+    void
+  > wrapped_f(f);
+
+  for(; first != last; ++first)
+  {
+    wrapped_f(*first);
+  }
+
+  return first;
+} // end for_each()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename UnaryFunction>
+__host__ __device__
+InputIterator for_each_n(sequential::execution_policy<DerivedPolicy> &,
+                         InputIterator first,
+                         Size n,
+                         UnaryFunction f)
+{
+  // wrap f
+  thrust::detail::wrapped_function<
+    UnaryFunction,
+    void
+  > wrapped_f(f);
+
+  for(Size i = 0; i != n; i++)
+  {
+    // we can dereference an OutputIterator if f does not
+    // try to use the reference for anything besides assignment
+    wrapped_f(*first);
+    ++first;
+  }
+
+  return first;
+} // end for_each_n()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/gather.h b/thrust/thrust/system/detail/sequential/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5657585eae6cb63adb21ea6b2444cad2fba2c4e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/gather.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special gather functions
+
diff --git a/thrust/thrust/system/detail/sequential/general_copy.h b/thrust/thrust/system/detail/sequential/general_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..9546b72e5ef17b082ceda709e1e4ef71c8b864eb
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/general_copy.h
@@ -0,0 +1,147 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file general_copy.h
+ *  \brief Sequential copy algorithms for general iterators.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace general_copy_detail
+{
+
+
+template<typename T1, typename T2>
+struct lazy_is_assignable
+  : thrust::detail::is_assignable<
+      typename T1::type,
+      typename T2::type
+    >
+{};
+
+
+// sometimes OutputIterator's reference type is reported as void
+// in that case, just assume that we're able to assign to it OK
+template<typename InputIterator, typename OutputIterator>
+struct reference_is_assignable
+  : thrust::detail::eval_if<
+      thrust::detail::is_same<
+        typename thrust::iterator_reference<OutputIterator>::type, void
+      >::value,
+      thrust::detail::true_type,
+      lazy_is_assignable<
+        thrust::iterator_reference<OutputIterator>,
+        thrust::iterator_reference<InputIterator>
+      >
+    >::type
+{};
+
+
+// introduce an iterator assign helper to deal with assignments from
+// a wrapped reference
+
+__thrust_exec_check_disable__
+template<typename OutputIterator, typename InputIterator>
+inline __host__ __device__
+typename thrust::detail::enable_if<
+  reference_is_assignable<InputIterator,OutputIterator>::value
+>::type
+iter_assign(OutputIterator dst, InputIterator src)
+{
+  *dst = *src;
+}
+
+
+__thrust_exec_check_disable__
+template<typename OutputIterator, typename InputIterator>
+inline __host__ __device__
+typename thrust::detail::disable_if<
+  reference_is_assignable<InputIterator,OutputIterator>::value
+>::type
+iter_assign(OutputIterator dst, InputIterator src)
+{
+  typedef typename thrust::iterator_value<InputIterator>::type value_type;
+
+  // insert a temporary and hope for the best
+  *dst = static_cast<value_type>(*src);
+}
+
+
+} // end general_copy_detail
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator general_copy(InputIterator first,
+                              InputIterator last,
+                              OutputIterator result)
+{
+  for(; first != last; ++first, ++result)
+  {
+    // gcc 4.2 crashes while instantiating iter_assign
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300)
+    *result = *first;
+#else
+    general_copy_detail::iter_assign(result, first);
+#endif
+  }
+
+  return result;
+} // end general_copy()
+
+
+__thrust_exec_check_disable__
+template<typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+__host__ __device__
+  OutputIterator general_copy_n(InputIterator first,
+                                Size n,
+                                OutputIterator result)
+{
+  for(; n > Size(0); ++first, ++result, --n)
+  {
+    // gcc 4.2 crashes while instantiating iter_assign
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300)
+    *result = *first;
+#else
+    general_copy_detail::iter_assign(result, first);
+#endif
+  }
+
+  return result;
+} // end general_copy_n()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/generate.h b/thrust/thrust/system/detail/sequential/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac38be51617dc0cd61008035bc3e64a7544ac0c1
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/generate.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special generate functions
+
diff --git a/thrust/thrust/system/detail/sequential/get_value.h b/thrust/thrust/system/detail/sequential/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f8eb040e01a381613661e13fe46c6d2de011e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/get_value.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/detail/raw_pointer_cast.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  typename thrust::iterator_value<Pointer>::type
+    get_value(sequential::execution_policy<DerivedPolicy> &, Pointer ptr)
+{
+  return *thrust::raw_pointer_cast(ptr);
+} // end get_value()
+
+
+} // end sequential
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/sequential/inner_product.h b/thrust/thrust/system/detail/sequential/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6b3c0ae174718d7dc5e6a2e64cee509634c96c0
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/inner_product.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special inner_product functions
+
diff --git a/thrust/thrust/system/detail/sequential/insertion_sort.h b/thrust/thrust/system/detail/sequential/insertion_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0bb9bc5f3a2bc47cf2e5862955cab87933db89c
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/insertion_sort.h
@@ -0,0 +1,153 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/copy_backward.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void insertion_sort(RandomAccessIterator first,
+                    RandomAccessIterator last,
+                    StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
+
+  if(first == last) return;
+
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  for(RandomAccessIterator i = first + 1; i != last; ++i)
+  {
+    value_type tmp = *i;
+
+    if(wrapped_comp(tmp, *first))
+    {
+      // tmp is the smallest value encountered so far
+      sequential::copy_backward(first, i, i + 1);
+
+      *first = tmp;
+    }
+    else
+    {
+      // tmp is not the smallest value, can avoid checking for j == first
+      RandomAccessIterator j = i;
+      RandomAccessIterator k = i - 1;
+
+      while(wrapped_comp(tmp, *k))
+      {
+        *j = *k;
+        j = k;
+        --k;
+      }
+
+      *j = tmp;
+    }
+  }
+}
+
+
+__thrust_exec_check_disable__
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void insertion_sort_by_key(RandomAccessIterator1 first1,
+                           RandomAccessIterator1 last1,
+                           RandomAccessIterator2 first2,
+                           StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
+
+  if(first1 == last1) return;
+
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  RandomAccessIterator1 i1 = first1 + 1;
+  RandomAccessIterator2 i2 = first2 + 1;
+
+  for(; i1 != last1; ++i1, ++i2)
+  {
+    value_type1 tmp1 = *i1;
+    value_type2 tmp2 = *i2;
+
+    if(wrapped_comp(tmp1, *first1))
+    {
+      // tmp is the smallest value encountered so far
+      sequential::copy_backward(first1, i1, i1 + 1);
+      sequential::copy_backward(first2, i2, i2 + 1);
+
+      *first1 = tmp1;
+      *first2 = tmp2;
+    }
+    else
+    {
+      // tmp is not the smallest value, can avoid checking for j == first
+      RandomAccessIterator1 j1 = i1;
+      RandomAccessIterator1 k1 = i1 - 1;
+
+      RandomAccessIterator2 j2 = i2;
+      RandomAccessIterator2 k2 = i2 - 1;
+
+      while(wrapped_comp(tmp1, *k1))
+      {
+        *j1 = *k1;
+        *j2 = *k2;
+
+        j1 = k1;
+        j2 = k2;
+
+        --k1;
+        --k2;
+      }
+
+      *j1 = tmp1;
+      *j2 = tmp2;
+    }
+  }
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/iter_swap.h b/thrust/thrust/system/detail/sequential/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c8fde6e75e6126a46da767b291fa68e200aecd9
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/iter_swap.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/swap.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
+__host__ __device__
+  void iter_swap(sequential::execution_policy<DerivedPolicy> &, Pointer1 a, Pointer2 b)
+{
+  using thrust::swap;
+  swap(*thrust::raw_pointer_cast(a), *thrust::raw_pointer_cast(b));
+} // end iter_swap()
+
+
+} // end sequential
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/sequential/logical.h b/thrust/thrust/system/detail/sequential/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee4586273566707b780b50a7ba13e2a3a038ac6e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/logical.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special logical functions
+
diff --git a/thrust/thrust/system/detail/sequential/malloc_and_free.h b/thrust/thrust/system/detail/sequential/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c545250e396611dd3190a3cd95e3302ab345efb
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/malloc_and_free.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <cstdlib> // for malloc & free
+#include <thrust/detail/raw_pointer_cast.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy>
+inline __host__ __device__
+void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
+{
+  return std::malloc(n);
+} // end mallc()
+
+
+template<typename DerivedPolicy, typename Pointer>
+inline __host__ __device__
+void free(sequential::execution_policy<DerivedPolicy> &, Pointer ptr)
+{
+  std::free(thrust::raw_pointer_cast(ptr));
+} // end mallc()
+
+
+} // end sequential
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/detail/sequential/merge.h b/thrust/thrust/system/detail/sequential/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cd314dc7f323b84cdd9fab46587dca3d6c6f460
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/merge.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file merge.h
+ *  \brief Sequential implementation of merge algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator merge(sequential::execution_policy<DerivedPolicy> &exec,
+                     InputIterator1 first1,
+                     InputIterator1 last1,
+                     InputIterator2 first2,
+                     InputIterator2 last2,
+                     OutputIterator result,
+                     StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  merge_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+               InputIterator1 keys_first1,
+               InputIterator1 keys_last1,
+               InputIterator2 keys_first2,
+               InputIterator2 keys_last2,
+               InputIterator3 values_first1,
+               InputIterator4 values_first2,
+               OutputIterator1 keys_result,
+               OutputIterator2 values_result,
+               StrictWeakOrdering comp);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/merge.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/merge.inl b/thrust/thrust/system/detail/sequential/merge.inl
new file mode 100644
index 0000000000000000000000000000000000000000..ae28ba97d2293f970afa33240be503e08a01db50
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/merge.inl
@@ -0,0 +1,153 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/merge.h>
+#include <thrust/detail/copy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+OutputIterator merge(sequential::execution_policy<DerivedPolicy> &exec,
+                     InputIterator1 first1,
+                     InputIterator1 last1,
+                     InputIterator2 first2,
+                     InputIterator2 last2,
+                     OutputIterator result,
+                     StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(first1 != last1 && first2 != last2)
+  {
+    if(wrapped_comp(*first2, *first1))
+    {
+      *result = *first2;
+      ++first2;
+    } // end if
+    else
+    {
+      *result = *first1;
+      ++first1;
+    } // end else
+
+    ++result;
+  } // end while
+
+  return thrust::copy(exec, first2, last2, thrust::copy(exec, first1, last1, result));
+} // end merge()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+  merge_by_key(sequential::execution_policy<DerivedPolicy> &,
+               InputIterator1 keys_first1,
+               InputIterator1 keys_last1,
+               InputIterator2 keys_first2,
+               InputIterator2 keys_last2,
+               InputIterator3 values_first1,
+               InputIterator4 values_first2,
+               OutputIterator1 keys_result,
+               OutputIterator2 values_result,
+               StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(keys_first1 != keys_last1 && keys_first2 != keys_last2)
+  {
+    if(!wrapped_comp(*keys_first2, *keys_first1))
+    {
+      // *keys_first1 <= *keys_first2
+      *keys_result   = *keys_first1;
+      *values_result = *values_first1;
+      ++keys_first1;
+      ++values_first1;
+    }
+    else
+    {
+      // *keys_first1 > keys_first2
+      *keys_result   = *keys_first2;
+      *values_result = *values_first2;
+      ++keys_first2;
+      ++values_first2;
+    }
+
+    ++keys_result;
+    ++values_result;
+  }
+
+  while(keys_first1 != keys_last1)
+  {
+    *keys_result   = *keys_first1;
+    *values_result = *values_first1;
+    ++keys_first1;
+    ++values_first1;
+    ++keys_result;
+    ++values_result;
+  }
+
+  while(keys_first2 != keys_last2)
+  {
+    *keys_result   = *keys_first2;
+    *values_result = *values_first2;
+    ++keys_first2;
+    ++values_first2;
+    ++keys_result;
+    ++values_result;
+  }
+
+  return thrust::make_pair(keys_result, values_result);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/mismatch.h b/thrust/thrust/system/detail/sequential/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6094d261a0f10e388885c1eadcd7083b6448e09
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/mismatch.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special mismatch functions
+
diff --git a/thrust/thrust/system/detail/sequential/partition.h b/thrust/thrust/system/detail/sequential/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..66996d637034e694a1d4a43609cefeb00df9c171
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/partition.h
@@ -0,0 +1,339 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file partition.h
+ *  \brief Sequential implementations of partition functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace detail
+{
+
+
+// XXX WAR an unfortunate circular #inclusion problem
+template<typename,typename> class temporary_array;
+
+
+} // end detail
+
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+void iter_swap(ForwardIterator1 iter1, ForwardIterator2 iter2)
+{
+  // XXX this isn't correct because it doesn't use thrust::swap
+  using namespace thrust::detail;
+
+  typedef typename thrust::iterator_value<ForwardIterator1>::type T;
+
+  T temp = *iter1;
+  *iter1 = *iter2;
+  *iter2 = temp;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  if(first == last)
+    return first;
+
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  while(wrapped_pred(*first))
+  {
+    if(++first == last)
+      return first;
+  }
+
+  ForwardIterator next = first;
+
+  while(++next != last)
+  {
+    if(wrapped_pred(*next))
+    {
+      iter_swap(first, next);
+      ++first;
+    }
+  }
+
+  return first;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator partition(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil_first,
+                            Predicate pred)
+{
+  if(first == last)
+    return first;
+
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  while(wrapped_pred(*stencil_first))
+  {
+    ++stencil_first;
+    if(++first == last)
+    {
+      return first;
+    }
+  }
+
+  ForwardIterator next = first;
+
+  // advance stencil to next element as well
+  ++stencil_first;
+
+  while(++next != last)
+  {
+    if(wrapped_pred(*stencil_first))
+    {
+      iter_swap(first, next);
+      ++first;
+    }
+
+    ++stencil_first;
+  }
+
+  return first;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(sequential::execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  typedef typename thrust::iterator_value<ForwardIterator>::type T;
+
+  typedef thrust::detail::temporary_array<T,DerivedPolicy> TempRange;
+  typedef typename TempRange::iterator                     TempIterator;
+
+  TempRange temp(exec, first, last);
+
+  for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter)
+  {
+    if(wrapped_pred(*iter))
+    {
+      *first = *iter;
+      ++first;
+    }
+  }
+
+  ForwardIterator middle = first;
+
+  for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter)
+  {
+    if(!wrapped_pred(*iter))
+    {
+      *first = *iter;
+      ++first;
+    }
+  }
+
+  return middle;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator stable_partition(sequential::execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  typedef typename thrust::iterator_value<ForwardIterator>::type T;
+
+  typedef thrust::detail::temporary_array<T,DerivedPolicy> TempRange;
+  typedef typename TempRange::iterator                     TempIterator;
+
+  TempRange temp(exec, first, last);
+
+  InputIterator stencil_iter = stencil;
+  for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter)
+  {
+    if(wrapped_pred(*stencil_iter))
+    {
+      *first = *iter;
+      ++first;
+    }
+  }
+
+  ForwardIterator middle = first;
+  stencil_iter = stencil;
+
+  for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter)
+  {
+    if(!wrapped_pred(*stencil_iter))
+    {
+      *first = *iter;
+      ++first;
+    }
+  }
+
+  return middle;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(sequential::execution_policy<DerivedPolicy> &,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  for(; first != last; ++first)
+  {
+    if(wrapped_pred(*first))
+    {
+      *out_true = *first;
+      ++out_true;
+    } // end if
+    else
+    {
+      *out_false = *first;
+      ++out_false;
+    } // end else
+  }
+
+  return thrust::make_pair(out_true, out_false);
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(sequential::execution_policy<DerivedPolicy> &,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  for(; first != last; ++first, ++stencil)
+  {
+    if(wrapped_pred(*stencil))
+    {
+      *out_true = *first;
+      ++out_true;
+    } // end if
+    else
+    {
+      *out_false = *first;
+      ++out_false;
+    } // end else
+  }
+
+  return thrust::make_pair(out_true, out_false);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/per_device_resource.h b/thrust/thrust/system/detail/sequential/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d61f92169e0e09c3821e59218f0dcbb70cbe5
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/thrust/system/detail/sequential/reduce.h b/thrust/thrust/system/detail/sequential/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..55e92acb9afc75787955a74808fb6cca96c45964
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/reduce.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief Sequential implementation of reduce algorithm.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType reduce(sequential::execution_policy<DerivedPolicy> &,
+                    InputIterator begin,
+                    InputIterator end,
+                    OutputType init,
+                    BinaryFunction binary_op)
+{
+  // wrap binary_op
+  thrust::detail::wrapped_function<
+    BinaryFunction,
+    OutputType
+  > wrapped_binary_op(binary_op);
+
+  // initialize the result
+  OutputType result = init;
+
+  while(begin != end)
+  {
+    result = wrapped_binary_op(result, *begin);
+    ++begin;
+  } // end while
+
+  return result;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/reduce_by_key.h b/thrust/thrust/system/detail/sequential/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e07413654eac991170460aabcdc2f557855d63f
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/reduce_by_key.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/pair.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(sequential::execution_policy<DerivedPolicy> &,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type  InputKeyType;
+  typedef typename thrust::iterator_traits<InputIterator2>::value_type  InputValueType;
+
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using TemporaryType = typename thrust::iterator_value<InputIterator2>::type;
+
+  if(keys_first != keys_last)
+  {
+    InputKeyType  temp_key   = *keys_first;
+    TemporaryType temp_value = *values_first;
+
+    for(++keys_first, ++values_first;
+        keys_first != keys_last;
+        ++keys_first, ++values_first)
+    {
+      InputKeyType    key  = *keys_first;
+      InputValueType value = *values_first;
+
+      if(binary_pred(temp_key, key))
+      {
+        temp_value = binary_op(temp_value, value);
+      }
+      else
+      {
+        *keys_output   = temp_key;
+        *values_output = temp_value;
+
+        ++keys_output;
+        ++values_output;
+
+        temp_key   = key;
+        temp_value = value;
+      }
+    }
+
+    *keys_output   = temp_key;
+    *values_output = temp_value;
+
+    ++keys_output;
+    ++values_output;
+  }
+
+  return thrust::make_pair(keys_output, values_output);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/remove.h b/thrust/thrust/system/detail/sequential/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..48de522dfdc5b1f6e0e274eb31f98d352943fccd
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/remove.h
@@ -0,0 +1,202 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file remove.h
+ *  \brief Sequential implementations of remove functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  // advance iterators until wrapped_pred(*first) is true or we reach the end of input
+  while(first != last && !wrapped_pred(*first))
+    ++first;
+
+  if(first == last)
+    return first;
+
+  // result always trails first 
+  ForwardIterator result = first;
+
+  ++first;
+
+  while(first != last)
+  {
+    if(!wrapped_pred(*first))
+    {
+      *result = *first;
+      ++result;
+    }
+    ++first;
+  }
+
+  return result;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator remove_if(sequential::execution_policy<DerivedPolicy> &,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  // advance iterators until wrapped_pred(*stencil) is true or we reach the end of input
+  while(first != last && !wrapped_pred(*stencil))
+  {
+    ++first;
+    ++stencil;
+  }
+
+  if(first == last)
+    return first;
+
+  // result always trails first 
+  ForwardIterator result = first;
+
+  ++first;
+  ++stencil;
+
+  while(first != last)
+  {
+    if(!wrapped_pred(*stencil))
+    {
+      *result = *first;
+      ++result;
+    }
+    ++first;
+    ++stencil;
+  }
+
+  return result;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(sequential::execution_policy<DerivedPolicy> &,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  while (first != last)
+  {
+    if (!wrapped_pred(*first))
+    {
+      *result = *first;
+      ++result;
+    }
+
+    ++first;
+  }
+
+  return result;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+__host__ __device__
+  OutputIterator remove_copy_if(sequential::execution_policy<DerivedPolicy> &,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // wrap pred
+  thrust::detail::wrapped_function<
+    Predicate,
+    bool
+  > wrapped_pred(pred);
+
+  while (first != last)
+  {
+    if (!wrapped_pred(*stencil))
+    {
+      *result = *first;
+      ++result;
+    }
+
+    ++first;
+    ++stencil;
+  }
+
+  return result;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/replace.h b/thrust/thrust/system/detail/sequential/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8e83857175ff54bc97f6d3909518d2ff4c295
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/replace.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special replace functions
+
diff --git a/thrust/thrust/system/detail/sequential/reverse.h b/thrust/thrust/system/detail/sequential/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..f80974e8a8d752c575a554018cd42e94600d3ab5
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/reverse.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special reverse functions
+
diff --git a/thrust/thrust/system/detail/sequential/scan.h b/thrust/thrust/system/detail/sequential/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bffc93d79d3c19d59364d9c986017d635b489e2
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/scan.h
@@ -0,0 +1,122 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan.h
+ *  \brief Sequential implementations of scan functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/function_traits.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/detail/function.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator inclusive_scan(sequential::execution_policy<DerivedPolicy> &,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                BinaryFunction binary_op)
+{
+  using namespace thrust::detail;
+
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+
+  // wrap binary_op
+  thrust::detail::wrapped_function<
+    BinaryFunction,
+    ValueType
+  > wrapped_binary_op(binary_op);
+
+  if(first != last)
+  {
+    ValueType sum = *first;
+
+    *result = *first;
+
+    for(++first, ++result; first != last; ++first, ++result)
+      *result = sum = wrapped_binary_op(sum,*first);
+  }
+
+  return result;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename InitialValueType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator exclusive_scan(sequential::execution_policy<DerivedPolicy> &,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                InitialValueType init,
+                                BinaryFunction binary_op)
+{
+  using namespace thrust::detail;
+
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
+
+  if(first != last)
+  {
+    ValueType tmp = *first;  // temporary value allows in-situ scan
+    ValueType sum = init;
+
+    *result = sum;
+    sum = binary_op(sum, tmp);
+
+    for(++first, ++result; first != last; ++first, ++result)
+    {
+      tmp = *first;
+      *result = sum;
+      sum = binary_op(sum, tmp);
+    }
+  }
+
+  return result;
+} 
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/scan_by_key.h b/thrust/thrust/system/detail/sequential/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e0471b37458b8aa861a0eb1ef69457b76572657
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/scan_by_key.h
@@ -0,0 +1,150 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan_by_key.h
+ *  \brief Sequential implementation of scan_by_key functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator inclusive_scan_by_key(sequential::execution_policy<DerivedPolicy> &,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+
+  // wrap binary_op
+  thrust::detail::wrapped_function<
+    BinaryFunction,
+    ValueType
+  > wrapped_binary_op(binary_op);
+
+  if(first1 != last1)
+  {
+    KeyType   prev_key   = *first1;
+    ValueType prev_value = *first2;
+
+    *result = prev_value;
+
+    for(++first1, ++first2, ++result;
+        first1 != last1;
+        ++first1, ++first2, ++result)
+    {
+      KeyType key = *first1;
+
+      if(binary_pred(prev_key, key))
+        *result = prev_value = wrapped_binary_op(prev_value,*first2);
+      else
+        *result = prev_value = *first2;
+
+      prev_key = key;
+    }
+  }
+
+  return result;
+}
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename T,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator exclusive_scan_by_key(sequential::execution_policy<DerivedPolicy> &,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
+  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+
+  if(first1 != last1)
+  {
+    KeyType   temp_key   = *first1;
+    ValueType temp_value = *first2;
+
+    ValueType next = init;
+
+    // first one is init
+    *result = next;
+
+    next = binary_op(next, temp_value);
+
+    for(++first1, ++first2, ++result;
+        first1 != last1;
+        ++first1, ++first2, ++result)
+    {
+      KeyType key = *first1;
+
+      // use temp to permit in-place scans
+      temp_value = *first2;
+
+      if (!binary_pred(temp_key, key))
+        next = init;  // reset sum
+
+      *result = next;  
+      next = binary_op(next, temp_value);
+
+      temp_key = key;
+    }
+  }
+
+  return result;
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/scatter.h b/thrust/thrust/system/detail/sequential/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6817b4cb26b5cd1c1df763cba26bfed74ad47f1
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/scatter.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special scatter functions
+
diff --git a/thrust/thrust/system/detail/sequential/sequence.h b/thrust/thrust/system/detail/sequential/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c090f96f348f0443a0a76e6372a0e07a72ea5a1
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/sequence.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special sequence functions
+
diff --git a/thrust/thrust/system/detail/sequential/set_operations.h b/thrust/thrust/system/detail/sequential/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9b1cc688cd29795982c99bcc3bc2f5f2977f8ca
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/set_operations.h
@@ -0,0 +1,224 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file set_operations.h
+ *  \brief Sequential implementation of set operation functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/detail/copy.h>
+#include <thrust/detail/function.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator set_difference(sequential::execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result,
+                                StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(first1 != last1 && first2 != last2)
+  {
+    if(wrapped_comp(*first1,*first2))
+    {
+      *result = *first1;
+      ++first1;
+      ++result;
+    } // end if
+    else if(wrapped_comp(*first2,*first1))
+    {
+      ++first2;
+    } // end else if
+    else
+    {
+      ++first1;
+      ++first2;
+    } // end else
+  } // end while
+
+  return thrust::copy(exec, first1, last1, result);
+} // end set_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator set_intersection(sequential::execution_policy<DerivedPolicy> &,
+                                  InputIterator1 first1,
+                                  InputIterator1 last1,
+                                  InputIterator2 first2,
+                                  InputIterator2 last2,
+                                  OutputIterator result,
+                                  StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(first1 != last1 && first2 != last2)
+  {
+    if(wrapped_comp(*first1,*first2))
+    {
+      ++first1;
+    } // end if
+    else if(wrapped_comp(*first2,*first1))
+    {
+      ++first2;
+    } // end else if
+    else
+    {
+      *result = *first1;
+      ++first1;
+      ++first2;
+      ++result;
+    } // end else
+  } // end while
+
+  return result;
+} // end set_intersection()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator set_symmetric_difference(sequential::execution_policy<DerivedPolicy> &exec,
+                                          InputIterator1 first1,
+                                          InputIterator1 last1,
+                                          InputIterator2 first2,
+                                          InputIterator2 last2,
+                                          OutputIterator result,
+                                          StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(first1 != last1 && first2 != last2)
+  {
+    if(wrapped_comp(*first1,*first2))
+    {
+      *result = *first1;
+      ++first1;
+      ++result;
+    } // end if
+    else if(wrapped_comp(*first2,*first1))
+    {
+      *result = *first2;
+      ++first2;
+      ++result;
+    } // end else if
+    else
+    {
+      ++first1;
+      ++first2;
+    } // end else
+  } // end while
+
+  return thrust::copy(exec, first2, last2, thrust::copy(exec, first1, last1, result));
+} // end set_symmetric_difference()
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+  OutputIterator set_union(sequential::execution_policy<DerivedPolicy> &exec,
+                           InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result,
+                           StrictWeakOrdering comp)
+{
+  // wrap comp
+  thrust::detail::wrapped_function<
+    StrictWeakOrdering,
+    bool
+  > wrapped_comp(comp);
+
+  while(first1 != last1 && first2 != last2)
+  {
+    if(wrapped_comp(*first1,*first2))
+    {
+      *result = *first1;
+      ++first1;
+    } // end if
+    else if(wrapped_comp(*first2,*first1))
+    {
+      *result = *first2;
+      ++first2;
+    } // end else if
+    else
+    {
+      *result = *first1;
+      ++first1;
+      ++first2;
+    } // end else
+
+    ++result;
+  } // end while
+
+  return thrust::copy(exec, first2, last2, thrust::copy(exec, first1, last1, result));
+} // end set_union()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/sort.h b/thrust/thrust/system/detail/sequential/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..0900743d8d4f3106afe19a2373bac45657b41247
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/sort.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file sort.h
+ *  \brief Sequential implementations of sort algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 first1,
+                        RandomAccessIterator1 last1,
+                        RandomAccessIterator2 first2,
+                        StrictWeakOrdering comp);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/sort.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/sort.inl b/thrust/thrust/system/detail/sequential/sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..bbc18a0b26068aac2f80ad4c6b9c524dd74c7ac6
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/sort.inl
@@ -0,0 +1,204 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/reverse.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/sequential/stable_merge_sort.h>
+#include <thrust/system/detail/sequential/stable_primitive_sort.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace sort_detail
+{
+
+
+////////////////////
+// Primitive Sort //
+////////////////////
+
+
+template<typename KeyType, typename Compare>
+struct needs_reverse
+  : thrust::detail::integral_constant<
+      bool,
+      thrust::detail::is_same<Compare, typename thrust::greater<KeyType> >::value
+    >
+{};
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering,
+                 thrust::detail::true_type)
+{
+  thrust::system::detail::sequential::stable_primitive_sort(exec, first, last);
+        
+  // if comp is greater<T> then reverse the keys
+  typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
+
+  if(needs_reverse<KeyType,StrictWeakOrdering>::value)
+  {
+    thrust::reverse(exec, first, last);
+  }
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 first1,
+                        RandomAccessIterator1 last1,
+                        RandomAccessIterator2 first2,
+                        StrictWeakOrdering,
+                        thrust::detail::true_type)
+{
+  // if comp is greater<T> then reverse the keys and values
+  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
+
+  // note, we also have to reverse the (unordered) input to preserve stability
+  if(needs_reverse<KeyType,StrictWeakOrdering>::value)
+  {
+    thrust::reverse(exec, first1,  last1);
+    thrust::reverse(exec, first2, first2 + (last1 - first1));
+  }
+
+  thrust::system::detail::sequential::stable_primitive_sort_by_key(exec, first1, last1, first2);
+
+  if(needs_reverse<KeyType,StrictWeakOrdering>::value)
+  {
+    thrust::reverse(exec, first1,  last1);
+    thrust::reverse(exec, first2, first2 + (last1 - first1));
+  }
+}
+
+
+////////////////
+// Merge Sort //
+////////////////
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp,
+                 thrust::detail::false_type)
+{
+  thrust::system::detail::sequential::stable_merge_sort(exec, first, last, comp);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 first1,
+                        RandomAccessIterator1 last1,
+                        RandomAccessIterator2 first2,
+                        StrictWeakOrdering comp,
+                        thrust::detail::false_type)
+{
+  thrust::system::detail::sequential::stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+}
+
+
+template<typename KeyType, typename Compare>
+struct use_primitive_sort
+  : thrust::detail::and_<
+      thrust::detail::is_arithmetic<KeyType>,
+      thrust::detail::or_<
+        thrust::detail::is_same<Compare, thrust::less<KeyType> >,
+        thrust::detail::is_same<Compare, thrust::greater<KeyType> >
+      >
+    >
+{};
+
+
+} // end namespace sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp)
+{
+
+  // the compilation time of stable_primitive_sort is too expensive to use within a single CUDA thread
+#ifndef __CUDA_ARCH__
+  typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
+  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
+#else
+  thrust::detail::false_type use_primitive_sort;
+#endif
+
+  sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 first1,
+                        RandomAccessIterator1 last1,
+                        RandomAccessIterator2 first2,
+                        StrictWeakOrdering comp)
+{
+
+  // the compilation time of stable_primitive_sort_by_key is too expensive to use within a single CUDA thread
+#ifndef __CUDA_ARCH__
+  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
+  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
+#else
+  thrust::detail::false_type use_primitive_sort;
+#endif
+
+  sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/stable_merge_sort.h b/thrust/thrust/system/detail/sequential/stable_merge_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..359ba8d7b43666153cd5f44c5f2a1d15cad932ab
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_merge_sort.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                       RandomAccessIterator begin,
+                       RandomAccessIterator end,
+                       StrictWeakOrdering comp);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                              RandomAccessIterator1 keys_begin,
+                              RandomAccessIterator1 keys_end,
+                              RandomAccessIterator2 values_begin,
+                              StrictWeakOrdering comp);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/stable_merge_sort.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/stable_merge_sort.inl b/thrust/thrust/system/detail/sequential/stable_merge_sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2939e06686284ee67249c97873f7ada5c52c60ae
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_merge_sort.inl
@@ -0,0 +1,397 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/merge.h>
+#include <thrust/system/detail/sequential/insertion_sort.h>
+#include <thrust/detail/minmax.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace stable_merge_sort_detail
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void inplace_merge(sequential::execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator middle,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
+
+  thrust::detail::temporary_array<value_type, DerivedPolicy> a(exec, first, middle);
+  thrust::detail::temporary_array<value_type, DerivedPolicy> b(exec, middle, last);
+
+  thrust::merge(exec, a.begin(), a.end(), b.begin(), b.end(), first, comp);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void inplace_merge_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 first1,
+                          RandomAccessIterator1 middle1,
+                          RandomAccessIterator1 last1,
+                          RandomAccessIterator2 first2,
+                          StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
+
+  RandomAccessIterator2 middle2 = first2 + (middle1 - first1);
+  RandomAccessIterator2 last2   = first2 + (last1   - first1);
+
+  thrust::detail::temporary_array<value_type1, DerivedPolicy> lhs1(exec, first1, middle1);
+  thrust::detail::temporary_array<value_type1, DerivedPolicy> rhs1(exec, middle1, last1);
+  thrust::detail::temporary_array<value_type2, DerivedPolicy> lhs2(exec, first2, middle2);
+  thrust::detail::temporary_array<value_type2, DerivedPolicy> rhs2(exec, middle2, last2);
+
+  thrust::merge_by_key(exec,
+                       lhs1.begin(), lhs1.end(),
+                       rhs1.begin(), rhs1.end(),
+                       lhs2.begin(), rhs2.begin(),
+                       first1, first2,
+                       comp);
+}
+
+
+template<typename RandomAccessIterator,
+         typename Size,
+         typename StrictWeakOrdering>
+__host__ __device__
+void insertion_sort_each(RandomAccessIterator first,
+                         RandomAccessIterator last,
+                         Size partition_size,
+                         StrictWeakOrdering comp)
+{
+  if(partition_size > 1)
+  {
+    for(; first < last; first += partition_size)
+    {
+      RandomAccessIterator partition_last = thrust::min(last, first + partition_size);
+
+      thrust::system::detail::sequential::insertion_sort(first, partition_last, comp);
+    } // end for
+  } // end if
+} // end insertion_sort_each()
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Size,
+         typename StrictWeakOrdering>
+__host__ __device__
+void insertion_sort_each_by_key(RandomAccessIterator1 keys_first,
+                                RandomAccessIterator1 keys_last,
+                                RandomAccessIterator2 values_first,
+                                Size partition_size,
+                                StrictWeakOrdering comp)
+{
+  if(partition_size > 1)
+  {
+    for(; keys_first < keys_last; keys_first += partition_size, values_first += partition_size)
+    {
+      RandomAccessIterator1 keys_partition_last = thrust::min(keys_last, keys_first + partition_size);
+
+      thrust::system::detail::sequential::insertion_sort_by_key(keys_first, keys_partition_last, values_first, comp);
+    } // end for
+  } // end if
+} // end insertion_sort_each()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename Size,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void merge_adjacent_partitions(sequential::execution_policy<DerivedPolicy> &exec,
+                               RandomAccessIterator1 first,
+                               RandomAccessIterator1 last,
+                               Size partition_size,
+                               RandomAccessIterator2 result,
+                               StrictWeakOrdering comp)
+{
+  for(; first < last; first += 2 * partition_size, result += 2 * partition_size)
+  {
+    RandomAccessIterator1 interval_middle = thrust::min(last, first + partition_size);
+    RandomAccessIterator1 interval_last   = thrust::min(last, interval_middle + partition_size);
+
+    thrust::merge(exec,
+                  first, interval_middle,
+                  interval_middle, interval_last,
+                  result,
+                  comp);
+  } // end for
+} // end merge_adjacent_partitions()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Size,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename StrictWeakOrdering>
+__host__ __device__
+void merge_adjacent_partitions_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                      RandomAccessIterator1 keys_first,
+                                      RandomAccessIterator1 keys_last,
+                                      RandomAccessIterator2 values_first,
+                                      Size partition_size,
+                                      RandomAccessIterator3 keys_result,
+                                      RandomAccessIterator4 values_result,
+                                      StrictWeakOrdering comp)
+{
+  Size stride = 2 * partition_size;
+
+  for(;
+      keys_first < keys_last;
+      keys_first += stride, values_first += stride, keys_result += stride, values_result += stride)
+  {
+    RandomAccessIterator1 keys_interval_middle = thrust::min(keys_last, keys_first + partition_size);
+    RandomAccessIterator1 keys_interval_last   = thrust::min(keys_last, keys_interval_middle + partition_size);
+
+    RandomAccessIterator2 values_first2 = values_first + (keys_interval_middle - keys_first);
+
+    thrust::merge_by_key(exec,
+                         keys_first, keys_interval_middle,
+                         keys_interval_middle, keys_interval_last,
+                         values_first,
+                         values_first2,
+                         keys_result,
+                         values_result,
+                         comp);
+  } // end for
+} // end merge_adjacent_partitions()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void iterative_stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                                 RandomAccessIterator first,
+                                 RandomAccessIterator last,
+                                 StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
+  typedef typename thrust::iterator_difference<RandomAccessIterator>::type difference_type;
+
+  difference_type n = last - first;
+
+  thrust::detail::temporary_array<value_type, DerivedPolicy> temp(exec, n);
+
+  // insertion sort each 32 element partition
+  difference_type partition_size = 32;
+  insertion_sort_each(first, last, partition_size, comp);
+
+  // ping indicates whether or not the latest data is in the source range [first, last)
+  bool ping = true;
+
+  // merge adjacent partitions until the partition size covers the entire range
+  for(;
+      partition_size < n;
+      partition_size *= 2, ping = !ping)
+  {
+    if(ping)
+    {
+      merge_adjacent_partitions(exec, first, last, partition_size, temp.begin(), comp);
+    } // end if
+    else
+    {
+      merge_adjacent_partitions(exec, temp.begin(), temp.end(), partition_size, first, comp);
+    } // end else
+  } // end for m
+
+  if(!ping)
+  {
+    thrust::copy(exec, temp.begin(), temp.end(), first);
+  } // end if
+} // end iterative_stable_merge_sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void iterative_stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                        RandomAccessIterator1 keys_first,
+                                        RandomAccessIterator1 keys_last,
+                                        RandomAccessIterator2 values_first,
+                                        StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type      value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type      value_type2;
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference_type;
+
+  difference_type n = keys_last - keys_first;
+
+  thrust::detail::temporary_array<value_type1, DerivedPolicy> keys_temp(exec, n);
+  thrust::detail::temporary_array<value_type2, DerivedPolicy> values_temp(exec, n);
+
+  // insertion sort each 32 element partition
+  difference_type partition_size = 32;
+  insertion_sort_each_by_key(keys_first, keys_last, values_first, partition_size, comp);
+
+  // ping indicates whether or not the latest data is in the source range [first, last)
+  bool ping = true;
+
+  // merge adjacent partitions until the partition size covers the entire range
+  for(;
+      partition_size < n;
+      partition_size *= 2, ping = !ping)
+  {
+    if(ping)
+    {
+      merge_adjacent_partitions_by_key(exec, keys_first, keys_last, values_first, partition_size, keys_temp.begin(), values_temp.begin(), comp);
+    } // end if
+    else
+    {
+      merge_adjacent_partitions_by_key(exec, keys_temp.begin(), keys_temp.end(), values_temp.begin(), partition_size, keys_first, values_first, comp);
+    } // end else
+  } // end for m
+
+  if(!ping)
+  {
+    thrust::copy(exec, keys_temp.begin(), keys_temp.end(), keys_first);
+    thrust::copy(exec, values_temp.begin(), values_temp.end(), values_first);
+  } // end if
+} // end iterative_stable_merge_sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void recursive_stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                                 RandomAccessIterator first,
+                                 RandomAccessIterator last,
+                                 StrictWeakOrdering comp)
+{
+  if(last - first <= 32)
+  {
+    thrust::system::detail::sequential::insertion_sort(first, last, comp);
+  } // end if
+  else
+  {
+    RandomAccessIterator middle = first + (last - first) / 2;
+
+    stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, middle, comp);
+    stable_merge_sort_detail::recursive_stable_merge_sort(exec, middle,  last, comp);
+    stable_merge_sort_detail::inplace_merge(exec, first, middle, last, comp);
+  } // end else
+} // end recursive_stable_merge_sort()
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void recursive_stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                        RandomAccessIterator1 first1,
+                                        RandomAccessIterator1 last1,
+                                        RandomAccessIterator2 first2,
+                                        StrictWeakOrdering comp)
+{
+  if(last1 - first1 <= 32)
+  {
+    thrust::system::detail::sequential::insertion_sort_by_key(first1, last1, first2, comp);
+  } // end if
+  else
+  {
+    RandomAccessIterator1 middle1 = first1 + (last1 - first1) / 2;
+    RandomAccessIterator2 middle2 = first2 + (last1 - first1) / 2;
+
+    stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, middle1, first2,  comp);
+    stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, middle1,  last1, middle2, comp);
+    stable_merge_sort_detail::inplace_merge_by_key(exec, first1, middle1, last1, first2, comp);
+  } // end else
+} // end recursive_stable_merge_sort_by_key()
+
+
+} // end namespace stable_merge_sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                       RandomAccessIterator first,
+                       RandomAccessIterator last,
+                       StrictWeakOrdering comp)
+{
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      // avoid recursion in CUDA threads
+      stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp);
+    #endif
+  }
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+__host__ __device__
+void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                              RandomAccessIterator1 first1,
+                              RandomAccessIterator1 last1,
+                              RandomAccessIterator2 first2,
+                              StrictWeakOrdering comp)
+{
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      // avoid recursion in CUDA threads
+      stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+    #endif
+  }
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/stable_primitive_sort.h b/thrust/thrust/system/detail/sequential/stable_primitive_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..3426f953a08ddbc5155e35fdbc1c4447d2831f19
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_primitive_sort.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+void stable_primitive_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                           RandomAccessIterator first,
+                           RandomAccessIterator last);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                  RandomAccessIterator1 keys_first,
+                                  RandomAccessIterator1 keys_last,
+                                  RandomAccessIterator2 values_first);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/stable_primitive_sort.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/stable_primitive_sort.inl b/thrust/thrust/system/detail/sequential/stable_primitive_sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..e5cea4ad3b7a7ce8f73bb0c8d2c9626f84c68271
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_primitive_sort.inl
@@ -0,0 +1,161 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/stable_primitive_sort.h>
+#include <thrust/system/detail/sequential/stable_radix_sort.h>
+#include <thrust/functional.h>
+#include <thrust/system/detail/sequential/partition.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace stable_primitive_sort_detail
+{
+
+
+template<typename Iterator>
+  struct enable_if_bool_sort
+    : thrust::detail::enable_if<
+        thrust::detail::is_same<
+          bool,
+          typename thrust::iterator_value<Iterator>::type
+        >::value
+      >
+{};
+
+
+template<typename Iterator>
+  struct disable_if_bool_sort
+    : thrust::detail::disable_if<
+        thrust::detail::is_same<
+          bool,
+          typename thrust::iterator_value<Iterator>::type
+        >::value
+      >
+{};
+
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+  typename enable_if_bool_sort<RandomAccessIterator>::type
+__host__ __device__
+    stable_primitive_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator first, RandomAccessIterator last)
+{
+  // use stable_partition if we're sorting bool
+  // stable_partition puts true values first, so we need to logical_not
+  sequential::stable_partition(exec, first, last, thrust::logical_not<bool>());
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+  typename disable_if_bool_sort<RandomAccessIterator>::type
+__host__ __device__
+    stable_primitive_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator first, RandomAccessIterator last)
+{
+  // call stable_radix_sort
+  sequential::stable_radix_sort(exec,first,last);
+}
+
+
+struct logical_not_first
+{
+  template<typename Tuple>
+  __host__ __device__
+  bool operator()(Tuple t)
+  {
+    return !thrust::get<0>(t);
+  }
+};
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  typename enable_if_bool_sort<RandomAccessIterator1>::type
+__host__ __device__
+    stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                 RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
+                                 RandomAccessIterator2 values_first)
+{
+  // use stable_partition if we're sorting bool
+  // stable_partition puts true values first, so we need to logical_not
+  sequential::stable_partition(exec,
+                               thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
+                               thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)),
+                               logical_not_first());
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+  typename disable_if_bool_sort<RandomAccessIterator1>::type
+__host__ __device__
+    stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                 RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
+                                 RandomAccessIterator2 values_first)
+{
+  // call stable_radix_sort_by_key
+  sequential::stable_radix_sort_by_key(exec, keys_first, keys_last, values_first);
+}
+
+
+} // end stable_primitive_sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+void stable_primitive_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                           RandomAccessIterator first,
+                           RandomAccessIterator last)
+{
+  stable_primitive_sort_detail::stable_primitive_sort(exec, first,last);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                                  RandomAccessIterator1 keys_first,
+                                  RandomAccessIterator1 keys_last,
+                                  RandomAccessIterator2 values_first)
+{
+  stable_primitive_sort_detail::stable_primitive_sort_by_key(exec, keys_first, keys_last, values_first);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/stable_radix_sort.h b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f7482ccf9d89a16c2123a78a8d1389a880b7632
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+void stable_radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                       RandomAccessIterator begin,
+                       RandomAccessIterator end);
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                              RandomAccessIterator1 keys_begin,
+                              RandomAccessIterator1 keys_end,
+                              RandomAccessIterator2 values_begin);
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/detail/sequential/stable_radix_sort.inl>
+
diff --git a/thrust/thrust/system/detail/sequential/stable_radix_sort.inl b/thrust/thrust/system/detail/sequential/stable_radix_sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..77202bda425962fc1bed43e7c4078c236b75ba5c
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/stable_radix_sort.inl
@@ -0,0 +1,595 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <limits>
+
+#include <thrust/copy.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/scatter.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+namespace radix_sort_detail
+{
+
+
+template <typename T>
+struct RadixEncoder : public thrust::identity<T>
+{};
+
+
+template <>
+struct RadixEncoder<char> : public thrust::unary_function<char, unsigned char>
+{
+  __host__ __device__
+  unsigned char operator()(char x) const
+  {
+    if(std::numeric_limits<char>::is_signed)
+    {
+      return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+    }
+    else
+    {
+      return x;
+    }
+  }
+};
+
+template <>
+struct RadixEncoder<signed char> : public thrust::unary_function<signed char, unsigned char>
+{
+  __host__ __device__
+  unsigned char operator()(signed char x) const
+  {
+    return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+  }
+};
+
+template <>
+struct RadixEncoder<short> : public thrust::unary_function<short, unsigned short>
+{
+  __host__ __device__
+  unsigned short operator()(short x) const
+  {
+    return x ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
+  }
+};
+
+template <>
+struct RadixEncoder<int> : public thrust::unary_function<int, unsigned int>
+{
+  __host__ __device__
+  unsigned long operator()(long x) const
+  {
+    return x ^ static_cast<unsigned int>(1) << (8 * sizeof(unsigned int) - 1);
+  }
+};
+
+template <>
+struct RadixEncoder<long> : public thrust::unary_function<long, unsigned long>
+{
+  __host__ __device__
+  unsigned long operator()(long x) const
+  {
+    return x ^ static_cast<unsigned long>(1) << (8 * sizeof(unsigned long) - 1);
+  }
+};
+
+template <>
+struct RadixEncoder<long long> : public thrust::unary_function<long long, unsigned long long>
+{
+  __host__ __device__
+  unsigned long long operator()(long long x) const
+  {
+    return x ^ static_cast<unsigned long long>(1) << (8 * sizeof(unsigned long long) - 1);
+  }
+};
+
+// ideally we'd use uint32 here and uint64 below
+template <>
+struct RadixEncoder<float> : public thrust::unary_function<float, thrust::detail::uint32_t>
+{
+  __host__ __device__
+  thrust::detail::uint32_t operator()(float x) const
+  {
+    union { float f; thrust::detail::uint32_t i; } u;
+    u.f = x;
+    thrust::detail::uint32_t mask = -static_cast<thrust::detail::int32_t>(u.i >> 31) | (static_cast<thrust::detail::uint32_t>(1) << 31);
+    return u.i ^ mask;
+  }
+};
+
+template <>
+struct RadixEncoder<double> : public thrust::unary_function<double, thrust::detail::uint64_t>
+{
+  __host__ __device__
+  thrust::detail::uint64_t operator()(double x) const
+  {
+    union { double f; thrust::detail::uint64_t i; } u;
+    u.f = x;
+    thrust::detail::uint64_t mask = -static_cast<thrust::detail::int64_t>(u.i >> 63) | (static_cast<thrust::detail::uint64_t>(1) << 63);
+    return u.i ^ mask;
+  }
+};
+
+
+// this functor returns a key's to its histogram bucket count and post-increments the bucket
+template<unsigned int RadixBits, typename KeyType>
+  struct bucket_functor
+{
+  typedef RadixEncoder<KeyType> Encoder;
+  typedef typename Encoder::result_type EncodedType;
+  typedef size_t result_type;
+  static const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
+
+  Encoder encode;
+  EncodedType bit_shift;
+  size_t *histogram;
+
+  __host__ __device__
+  bucket_functor(EncodedType bit_shift, size_t *histogram)
+    : encode(),
+      bit_shift(bit_shift),
+      histogram(histogram)
+  {}
+
+  inline __host__ __device__
+  size_t operator()(KeyType key)
+  {
+    const EncodedType x = encode(key);
+
+    // note that we mutate the histogram here
+    return histogram[(x >> bit_shift) & BitMask]++;
+  }
+};
+
+
+template<unsigned int RadixBits,
+         typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Integer>
+inline __host__ __device__
+void radix_shuffle_n(sequential::execution_policy<DerivedPolicy> &exec,
+                     RandomAccessIterator1 first,
+                     const size_t n,
+                     RandomAccessIterator2 result,
+                     Integer bit_shift,
+                     size_t *histogram)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+
+  // note that we are going to mutate the histogram during this sequential scatter
+  thrust::scatter(exec,
+                  first, first + n,
+                  thrust::make_transform_iterator(first, bucket_functor<RadixBits,KeyType>(bit_shift, histogram)),
+                  result);
+}
+
+
+template<unsigned int RadixBits,
+         typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename Integer>
+__host__ __device__
+void radix_shuffle_n(sequential::execution_policy<DerivedPolicy> &exec,
+                     RandomAccessIterator1 keys_first,
+                     RandomAccessIterator2 values_first,
+                     const size_t n,
+                     RandomAccessIterator3 keys_result,
+                     RandomAccessIterator4 values_result,
+                     Integer bit_shift,
+                     size_t *histogram)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+
+  // note that we are going to mutate the histogram during this sequential scatter
+  thrust::scatter(exec,
+                  thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
+                  thrust::make_zip_iterator(thrust::make_tuple(keys_first + n, values_first + n)),
+                  thrust::make_transform_iterator(keys_first, bucket_functor<RadixBits,KeyType>(bit_shift, histogram)),
+                  thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)));
+}
+
+
+template<unsigned int RadixBits,
+         bool HasValues,
+         typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4>
+__host__ __device__
+void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                RandomAccessIterator1 keys1,
+                RandomAccessIterator2 keys2,
+                RandomAccessIterator3 vals1,
+                RandomAccessIterator4 vals2,
+                const size_t N)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+
+  typedef RadixEncoder<KeyType> Encoder;
+  typedef typename Encoder::result_type EncodedType;
+
+  const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
+  const unsigned int HistogramSize =  1 << RadixBits;
+  
+  const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
+  
+  Encoder encode;
+
+  // storage for histograms
+  size_t histograms[NumHistograms][HistogramSize] = {{0}};
+
+  // see which passes can be eliminated
+  bool skip_shuffle[NumHistograms] = {false};
+  
+  // false if most recent data is stored in (keys1,vals1)
+  bool flip = false;
+    
+  // compute histograms
+  for(size_t i = 0; i < N; i++)
+  {
+    const EncodedType x = encode(keys1[i]);
+
+    for(unsigned int j = 0; j < NumHistograms; j++)
+    {
+      const EncodedType BitShift = RadixBits * j;
+      histograms[j][(x >> BitShift) & BitMask]++;
+    }
+  }
+
+  // scan histograms
+  for(unsigned int i = 0; i < NumHistograms; i++)
+  {
+    size_t sum = 0;
+
+    for(unsigned int j = 0; j < HistogramSize; j++)
+    {
+      size_t bin = histograms[i][j];
+
+      if(bin == N)
+        skip_shuffle[i] = true;
+
+      histograms[i][j] = sum;
+
+      sum = sum + bin;
+    }
+  }
+
+  // shuffle keys and (optionally) values 
+  for(unsigned int i = 0; i < NumHistograms; i++)
+  {
+    const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
+
+    if(!skip_shuffle[i])
+    {
+      if(flip)
+      {
+        if(HasValues)
+        {
+          radix_shuffle_n<RadixBits>(exec, keys2, vals2, N, keys1, vals1, BitShift, histograms[i]);
+        }
+        else
+        {
+          radix_shuffle_n<RadixBits>(exec, keys2, N, keys1, BitShift, histograms[i]);
+        }
+      }
+      else
+      {
+        if(HasValues)
+        {
+          radix_shuffle_n<RadixBits>(exec, keys1, vals1, N, keys2, vals2, BitShift, histograms[i]);
+        }
+        else
+        {
+          radix_shuffle_n<RadixBits>(exec, keys1, N, keys2, BitShift, histograms[i]);
+        }
+      }
+        
+      flip = (flip) ? false : true;
+    }
+  }
+ 
+  // ensure final values are in (keys1,vals1)
+  if(flip)
+  {
+    thrust::copy(exec, keys2, keys2 + N, keys1);
+
+    if(HasValues)
+    {
+      thrust::copy(exec, vals2, vals2 + N, vals1);
+    }
+  }
+}
+
+
+// Select best radix sort parameters based on sizeof(T) and input size
+// These particular values were determined through empirical testing on a Core i7 950 CPU
+template <size_t KeySize>
+struct radix_sort_dispatcher
+{
+};
+
+template <>
+struct radix_sort_dispatcher<1>
+{
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  const size_t N)
+  {
+    radix_sort_detail::radix_sort<8,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+  }
+
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2,
+           typename RandomAccessIterator3,
+           typename RandomAccessIterator4>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  RandomAccessIterator3 vals1, RandomAccessIterator4 vals2,
+                  const size_t N)
+  {
+    radix_sort_detail::radix_sort<8,true>(exec, keys1, keys2, vals1, vals2, N);
+  }
+};
+
+
+template <>
+struct radix_sort_dispatcher<2>
+{
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  const size_t N)
+  {
+#ifdef __QNX__
+    // XXX war for nvbug 200193674
+    const bool condition = true;
+#else
+    const bool condition = N < (1 << 16);
+#endif
+    if (condition)
+    {
+      radix_sort_detail::radix_sort<8,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<16,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+  }
+
+
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2,
+           typename RandomAccessIterator3,
+           typename RandomAccessIterator4>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  RandomAccessIterator3 vals1, RandomAccessIterator4 vals2,
+                  const size_t N)
+  {
+#ifdef __QNX__
+    // XXX war for nvbug 200193674
+    const bool condition = true;
+#else
+    const bool condition = N < (1 << 15);
+#endif
+    if (condition)
+    {
+      radix_sort_detail::radix_sort<8,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<16,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+  }
+};
+
+
+template <>
+struct radix_sort_dispatcher<4>
+{
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  const size_t N)
+  {
+    if(N < (1 << 22))
+    {
+      radix_sort_detail::radix_sort<8,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<4,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+  }
+
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2,
+           typename RandomAccessIterator3,
+           typename RandomAccessIterator4>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  RandomAccessIterator3 vals1, RandomAccessIterator4 vals2,
+                  const size_t N)
+  {
+    if(N < (1 << 22))
+    {
+      radix_sort_detail::radix_sort<8,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<3,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+  }
+};
+
+
+template <>
+struct radix_sort_dispatcher<8>
+{
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  const size_t N)
+  {
+    if(N < (1 << 21))
+    {
+      radix_sort_detail::radix_sort<8,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<4,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
+    }
+  }
+
+  template<typename DerivedPolicy,
+           typename RandomAccessIterator1,
+           typename RandomAccessIterator2,
+           typename RandomAccessIterator3,
+           typename RandomAccessIterator4>
+  __host__ __device__
+  void operator()(sequential::execution_policy<DerivedPolicy> &exec,
+                  RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
+                  RandomAccessIterator3 vals1, RandomAccessIterator4 vals2,
+                  const size_t N)
+  {
+    if(N < (1 << 21))
+    {
+      radix_sort_detail::radix_sort<8,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+    else
+    {
+      radix_sort_detail::radix_sort<3,true>(exec, keys1, keys2, vals1, vals2, N);
+    }
+  }
+};
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                RandomAccessIterator1 keys1,
+                RandomAccessIterator2 keys2,
+                const size_t N)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+  radix_sort_dispatcher<sizeof(KeyType)>()(exec, keys1, keys2, N);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4>
+__host__ __device__
+void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                RandomAccessIterator1 keys1,
+                RandomAccessIterator2 keys2,
+                RandomAccessIterator3 vals1,
+                RandomAccessIterator4 vals2,
+                const size_t N)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+  radix_sort_dispatcher<sizeof(KeyType)>()(exec, keys1, keys2, vals1, vals2, N);
+}
+
+
+} // namespace radix_sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator>
+__host__ __device__
+void stable_radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
+                       RandomAccessIterator first,
+                       RandomAccessIterator last)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type KeyType;
+
+  size_t N = last - first;
+  
+  thrust::detail::temporary_array<KeyType, DerivedPolicy> temp(exec, N);
+  
+  radix_sort_detail::radix_sort(exec, first, temp.begin(), N);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2>
+__host__ __device__
+void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                              RandomAccessIterator1 first1,
+                              RandomAccessIterator1 last1,
+                              RandomAccessIterator2 first2)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type ValueType;
+
+  size_t N = last1 - first1;
+  
+  thrust::detail::temporary_array<KeyType, DerivedPolicy>   temp1(exec, N);
+  thrust::detail::temporary_array<ValueType, DerivedPolicy> temp2(exec, N);
+
+  radix_sort_detail::radix_sort(exec, first1, temp1.begin(), first2, temp2.begin(), N);
+}
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/swap_ranges.h b/thrust/thrust/system/detail/sequential/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..497497d6a15286aa69371038ca204619e1a404e1
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/swap_ranges.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special swap_ranges functions
+
diff --git a/thrust/thrust/system/detail/sequential/tabulate.h b/thrust/thrust/system/detail/sequential/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..926cdec15a0e639b0d3523a3bd523a7d6547da9e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/tabulate.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special tabulate functions
+
diff --git a/thrust/thrust/system/detail/sequential/temporary_buffer.h b/thrust/thrust/system/detail/sequential/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2adfaf2810c67462e41f271e43ad0aff9cfbf75f
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/temporary_buffer.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special temporary buffer functions
+
diff --git a/thrust/thrust/system/detail/sequential/transform.h b/thrust/thrust/system/detail/sequential/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..30305c152879af82583c7bffd6b5bb4b4fe7ac2e
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/transform.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special transform functions
+
diff --git a/thrust/thrust/system/detail/sequential/transform_reduce.h b/thrust/thrust/system/detail/sequential/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d2a1b3850dea55c3c8440aa7e22fdb6d002d151
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/transform_reduce.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special transform_reduce functions
+
diff --git a/thrust/thrust/system/detail/sequential/transform_scan.h b/thrust/thrust/system/detail/sequential/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/transform_scan.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/detail/sequential/trivial_copy.h b/thrust/thrust/system/detail/sequential/trivial_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fbd0a987a294a7a33375b74a4c127922f0d2c0b
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/trivial_copy.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file trivial_copy.h
+ *  \brief Sequential copy algorithms for plain-old-data.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <cstring>
+#include <thrust/system/detail/sequential/general_copy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+template<typename T>
+__host__ __device__
+  T *trivial_copy_n(const T *first,
+                    std::ptrdiff_t n,
+                    T *result)
+{
+  T* return_value = NULL;
+  if (THRUST_IS_HOST_CODE) {
+    #if THRUST_INCLUDE_HOST_CODE
+      std::memmove(result, first, n * sizeof(T));
+      return_value = result + n;
+    #endif
+  } else {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      return_value = thrust::system::detail::sequential::general_copy_n(first, n, result);
+    #endif
+  }
+  return return_value;
+} // end trivial_copy_n()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/uninitialized_copy.h b/thrust/thrust/system/detail/sequential/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ae90664ad9538e73febfde86c334011de417c8
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/uninitialized_copy.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special version of this algorithm 
+
diff --git a/thrust/thrust/system/detail/sequential/uninitialized_fill.h b/thrust/thrust/system/detail/sequential/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..65e59fae5dce223c35403adc364a3e1748687923
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/uninitialized_fill.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special unintialized_fill functions
+
diff --git a/thrust/thrust/system/detail/sequential/unique.h b/thrust/thrust/system/detail/sequential/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..11168f0b42b1c0bb067f03e85ba7913bc6fc7160
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/unique.h
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file unique.h
+ *  \brief Sequential implementations of unique algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  OutputIterator unique_copy(sequential::execution_policy<DerivedPolicy> &,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator>::value_type T;
+
+  if(first != last)
+  {
+    T prev = *first;
+
+    for(++first; first != last; ++first)
+    {
+      T temp = *first;
+
+      if (!binary_pred(prev, temp))
+      {
+        *output = prev;
+
+        ++output;
+
+        prev = temp;
+      }
+    }
+
+    *output = prev;
+    ++output;
+  }
+
+  return output;
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  ForwardIterator unique(sequential::execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred)
+{
+  // sequential unique_copy permits in-situ operation
+  return sequential::unique_copy(exec, first, last, first, binary_pred);
+} // end unique()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/sequential/unique_by_key.h b/thrust/thrust/system/detail/sequential/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..899ce02db7e0721520940bfcffba4332c59c0f61
--- /dev/null
+++ b/thrust/thrust/system/detail/sequential/unique_by_key.h
@@ -0,0 +1,116 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file unique_by_key.h
+ *  \brief Sequential implementations of unique_by_key algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace sequential
+{
+
+
+__thrust_exec_check_disable__
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(sequential::execution_policy<DerivedPolicy> &,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<InputIterator1>::value_type  InputKeyType;
+  typedef typename thrust::iterator_traits<OutputIterator2>::value_type OutputValueType;
+
+  if(keys_first != keys_last)
+  {
+    InputKeyType    temp_key   = *keys_first;
+    OutputValueType temp_value = *values_first;
+
+    for(++keys_first, ++values_first;
+        keys_first != keys_last;
+        ++keys_first, ++values_first)
+    {
+      InputKeyType    key   = *keys_first;
+      OutputValueType value = *values_first;
+
+      if(!binary_pred(temp_key, key))
+      {
+        *keys_output   = temp_key;
+        *values_output = temp_value;
+
+        ++keys_output;
+        ++values_output;
+
+        temp_key   = key;
+        temp_value = value;
+      }
+    }
+
+    *keys_output   = temp_key;
+    *values_output = temp_value;
+
+    ++keys_output;
+    ++values_output;
+  }
+
+  return thrust::make_pair(keys_output, values_output);
+} // end unique_by_key_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred)
+{
+  // sequential unique_by_key_copy() permits in-situ operation
+  return sequential::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_first, values_first, binary_pred);
+} // end unique_by_key()
+
+
+} // end namespace sequential
+} // end namespace detail
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/detail/system_error.inl b/thrust/thrust/system/detail/system_error.inl
new file mode 100644
index 0000000000000000000000000000000000000000..3e59458aa0485aab3336be1014d477a8a12f2f21
--- /dev/null
+++ b/thrust/thrust/system/detail/system_error.inl
@@ -0,0 +1,111 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/system/system_error.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+
+system_error
+  ::system_error(error_code ec, const std::string &what_arg)
+    : std::runtime_error(what_arg), m_error_code(ec)
+{
+
+} // end system_error::system_error()
+
+
+system_error
+  ::system_error(error_code ec, const char *what_arg)
+    : std::runtime_error(what_arg), m_error_code(ec)
+{
+  ;
+} // end system_error::system_error()
+
+
+system_error
+  ::system_error(error_code ec)
+    : std::runtime_error(""), m_error_code(ec)
+{
+  ;
+} // end system_error::system_error()
+
+
+system_error
+  ::system_error(int ev, const error_category &ecat, const std::string &what_arg)
+    : std::runtime_error(what_arg), m_error_code(ev,ecat)
+{
+  ;
+} // end system_error::system_error()
+
+
+system_error
+  ::system_error(int ev, const error_category &ecat, const char *what_arg)
+    : std::runtime_error(what_arg), m_error_code(ev,ecat)
+{
+  ;
+} // end system_error::system_error()
+
+
+system_error
+  ::system_error(int ev, const error_category &ecat)
+    : std::runtime_error(""), m_error_code(ev,ecat)
+{
+  ;
+} // end system_error::system_error()
+
+
+const error_code &system_error
+  ::code(void) const throw()
+{
+  return m_error_code;
+} // end system_error::code()
+
+
+const char *system_error
+  ::what(void) const throw()
+{
+  if(m_what.empty())
+  {
+    try
+    {
+      m_what = this->std::runtime_error::what();
+      if(m_error_code)
+      {
+        if(!m_what.empty()) m_what += ": ";
+        m_what += m_error_code.message();
+      }
+    }
+    catch(...)
+    {
+      return std::runtime_error::what();
+    }
+  }
+
+  return m_what.c_str();
+} // end system_error::what()
+
+
+} // end system
+
+} // end thrust
+
diff --git a/thrust/thrust/system/error_code.h b/thrust/thrust/system/error_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..faa81bbca38c5fc8c6d0fa1dc8e803b66db02568
--- /dev/null
+++ b/thrust/thrust/system/error_code.h
@@ -0,0 +1,523 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file error_code.h
+ *  \brief An object used to hold error values, such as those originating from the
+ *         operating system or other low-level application program interfaces.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/detail/errno.h>
+#include <iostream>
+
+namespace thrust
+{
+
+namespace system
+{
+
+
+/*! \addtogroup system_diagnostics
+ *  \{
+ */
+
+class error_condition;
+class error_code;
+
+/*! A metafunction returning whether or not the parameter is an \p error_code enum.
+ */
+template<typename T> struct is_error_code_enum : public thrust::detail::false_type {};
+
+/*! A metafunction returning whether or not the parameter is an \p error_condition enum.
+ */
+template<typename T> struct is_error_condition_enum : public thrust::detail::false_type {};
+
+
+// XXX N3092 prefers enum class errc { ... }
+namespace errc
+{
+
+/*! An enum containing common error codes.
+ */
+enum errc_t
+{
+  address_family_not_supported       = detail::eafnosupport,
+  address_in_use                     = detail::eaddrinuse,
+  address_not_available              = detail::eaddrnotavail,
+  already_connected                  = detail::eisconn,
+  argument_list_too_long             = detail::e2big,
+  argument_out_of_domain             = detail::edom,
+  bad_address                        = detail::efault,
+  bad_file_descriptor                = detail::ebadf,
+  bad_message                        = detail::ebadmsg,
+  broken_pipe                        = detail::epipe,
+  connection_aborted                 = detail::econnaborted,
+  connection_already_in_progress     = detail::ealready,
+  connection_refused                 = detail::econnrefused,
+  connection_reset                   = detail::econnreset,
+  cross_device_link                  = detail::exdev,
+  destination_address_required       = detail::edestaddrreq,
+  device_or_resource_busy            = detail::ebusy,
+  directory_not_empty                = detail::enotempty,
+  executable_format_error            = detail::enoexec,
+  file_exists                        = detail::eexist,
+  file_too_large                     = detail::efbig,
+  filename_too_long                  = detail::enametoolong,
+  function_not_supported             = detail::enosys,
+  host_unreachable                   = detail::ehostunreach,
+  identifier_removed                 = detail::eidrm,
+  illegal_byte_sequence              = detail::eilseq,
+  inappropriate_io_control_operation = detail::enotty,
+  interrupted                        = detail::eintr,
+  invalid_argument                   = detail::einval,
+  invalid_seek                       = detail::espipe,
+  io_error                           = detail::eio,
+  is_a_directory                     = detail::eisdir,
+  message_size                       = detail::emsgsize,
+  network_down                       = detail::enetdown,
+  network_reset                      = detail::enetreset,
+  network_unreachable                = detail::enetunreach,
+  no_buffer_space                    = detail::enobufs,
+  no_child_process                   = detail::echild,
+  no_link                            = detail::enolink,
+  no_lock_available                  = detail::enolck,
+  no_message_available               = detail::enodata,
+  no_message                         = detail::enomsg,
+  no_protocol_option                 = detail::enoprotoopt,
+  no_space_on_device                 = detail::enospc,
+  no_stream_resources                = detail::enosr,
+  no_such_device_or_address          = detail::enxio,
+  no_such_device                     = detail::enodev,
+  no_such_file_or_directory          = detail::enoent,
+  no_such_process                    = detail::esrch,
+  not_a_directory                    = detail::enotdir,
+  not_a_socket                       = detail::enotsock,
+  not_a_stream                       = detail::enostr,
+  not_connected                      = detail::enotconn,
+  not_enough_memory                  = detail::enomem,
+  not_supported                      = detail::enotsup,
+  operation_canceled                 = detail::ecanceled,
+  operation_in_progress              = detail::einprogress,
+  operation_not_permitted            = detail::eperm,
+  operation_not_supported            = detail::eopnotsupp,
+  operation_would_block              = detail::ewouldblock,
+  owner_dead                         = detail::eownerdead,
+  permission_denied                  = detail::eacces,
+  protocol_error                     = detail::eproto,
+  protocol_not_supported             = detail::eprotonosupport,
+  read_only_file_system              = detail::erofs,
+  resource_deadlock_would_occur      = detail::edeadlk,
+  resource_unavailable_try_again     = detail::eagain,
+  result_out_of_range                = detail::erange,
+  state_not_recoverable              = detail::enotrecoverable,
+  stream_timeout                     = detail::etime,
+  text_file_busy                     = detail::etxtbsy,
+  timed_out                          = detail::etimedout,
+  too_many_files_open_in_system      = detail::enfile,
+  too_many_files_open                = detail::emfile,
+  too_many_links                     = detail::emlink,
+  too_many_symbolic_link_levels      = detail::eloop,
+  value_too_large                    = detail::eoverflow,
+  wrong_protocol_type                = detail::eprototype
+}; // end errc_t
+
+} // end namespace errc
+
+
+/*! Specialization of \p is_error_condition_enum for \p errc::errc_t
+ */
+template<> struct is_error_condition_enum<errc::errc_t> : public thrust::detail::true_type {};
+
+
+// [19.5.1.1] class error_category
+
+/*! \brief The class \p error_category serves as a base class for types used to identify the
+ *         source and encoding of a particular category of error code. Classes may be derived
+ *         from \p error_category to support categories of errors in addition to those defined
+ *         in the C++ International Standard.
+ */
+class error_category
+{
+  public:
+    /*! Destructor does nothing.
+     */
+    inline virtual ~error_category(void);
+
+    // XXX enable upon c++0x
+    // error_category(const error_category &) = delete;
+    // error_category &operator=(const error_category &) = delete;
+
+    /*! \return A string naming the error category.
+     */
+    inline virtual const char *name(void) const = 0;
+
+    /*! \return \p error_condition(ev, *this).
+     */
+    inline virtual error_condition default_error_condition(int ev) const;
+
+    /*! \return <tt>default_error_condition(code) == condition</tt>
+     */
+    inline virtual bool equivalent(int code, const error_condition &condition) const;
+
+    /*! \return <tt>*this == code.category() && code.value() == condition</tt>
+     */
+    inline virtual bool equivalent(const error_code &code, int condition) const;
+
+    /*! \return A string that describes the error condition denoted by \p ev.
+     */
+    virtual std::string message(int ev) const = 0;
+
+    /*! \return <tt>*this == &rhs</tt>
+     */
+    inline bool operator==(const error_category &rhs) const;
+
+    /*! \return <tt>!(*this == rhs)</tt>
+     */
+    inline bool operator!=(const error_category &rhs) const;
+
+    /*! \return <tt>less<const error_category*>()(this, &rhs)</tt>
+     *  \note \c less provides a total ordering for pointers.
+     */
+    inline bool operator<(const error_category &rhs) const;
+}; // end error_category
+
+
+// [19.5.1.5] error_category objects
+
+
+/*! \return A reference to an object of a type derived from class \p error_category.
+ *  \note The object's \p default_error_condition and \p equivalent virtual functions
+ *        shall behave as specified for the class \p error_category. The object's
+ *        \p name virtual function shall return a pointer to the string <tt>"generic"</tt>.
+ */
+inline const error_category &generic_category(void);
+
+
+/*! \return A reference to an object of a type derived from class \p error_category.
+ *  \note The object's \p equivalent virtual functions shall behave as specified for
+ *        class \p error_category. The object's \p name virtual function shall return
+ *        a pointer to the string <tt>"system"</tt>. The object's \p default_error_condition
+ *        virtual function shall behave as follows:
+ *
+ *        If the argument <tt>ev</tt> corresponds to a POSIX <tt>errno</tt> value
+ *        \c posv, the function shall return <tt>error_condition(ev,generic_category())</tt>.
+ *        Otherwise, the function shall return <tt>error_condition(ev,system_category())</tt>.
+ *        What constitutes correspondence for any given operating system is unspecified.
+ */
+inline const error_category &system_category(void);
+
+
+// [19.5.2] Class error_code
+
+
+/*! \brief The class \p error_code describes an object used to hold error code values, such as
+ *         those originating from the operating system or other low-level application program
+ *         interfaces.
+ */
+class error_code
+{
+  public:
+    // [19.5.2.2] constructors:
+
+    /*! Effects: Constructs an object of type \p error_code.
+     *  \post <tt>value() == 0</tt> and <tt>category() == &system_category()</tt>.
+     */
+    inline error_code(void);
+
+    /*! Effects: Constructs an object of type \p error_code.
+     *  \post <tt>value() == val</tt> and <tt>category() == &cat</tt>.
+     */
+    inline error_code(int val, const error_category &cat);
+
+    /*! Effects: Constructs an object of type \p error_code.
+     *  \post <tt>*this == make_error_code(e)</tt>.
+     */
+    template <typename ErrorCodeEnum>
+      error_code(ErrorCodeEnum e
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+        , typename thrust::detail::enable_if<is_error_code_enum<ErrorCodeEnum>::value>::type * = 0
+#endif // THRUST_HOST_COMPILER_MSVC
+        );
+
+    // [19.5.2.3] modifiers:
+
+    /*! \post <tt>value() == val</tt> and <tt>category() == &cat</tt>.
+     */
+    inline void assign(int val, const error_category &cat);
+
+    /*! \post <tt>*this == make_error_code(e)</tt>.
+     */
+    template <typename ErrorCodeEnum>
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+      typename thrust::detail::enable_if<is_error_code_enum<ErrorCodeEnum>::value, error_code>::type &
+#else
+      error_code &
+#endif // THRUST_HOST_COMPILER_MSVC
+        operator=(ErrorCodeEnum e);
+
+    /*! \post <tt>value() == 0</tt> and <tt>category() == system_category()</tt>.
+     */
+    inline void clear(void);
+
+    // [19.5.2.4] observers:
+
+    /*! \return An integral value of this \p error_code object.
+     */
+    inline int value(void) const;
+
+    /*! \return An \p error_category describing the category of this \p error_code object.
+     */
+    inline const error_category &category(void) const;
+
+    /*! \return <tt>category().default_error_condition()</tt>.
+     */
+    inline error_condition default_error_condition(void) const;
+
+    /*! \return <tt>category().message(value())</tt>.
+     */
+    inline std::string message(void) const;
+
+    // XXX replace the below upon c++0x
+    // inline explicit operator bool (void) const;
+
+    /*! \return <tt>value() != 0</tt>.
+     */
+    inline operator bool (void) const;
+
+    /*! \cond
+     */
+  private:
+    int m_val;
+    const error_category *m_cat;
+    /*! \endcond
+     */
+}; // end error_code
+
+
+// [19.5.2.5] Class error_code non-member functions
+
+
+// XXX replace errc::errc_t with errc upon c++0x
+/*! \return <tt>error_code(static_cast<int>(e), generic_category())</tt>
+ */
+inline error_code make_error_code(errc::errc_t e);
+
+
+/*! \return <tt>lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value()</tt>.
+ */
+inline bool operator<(const error_code &lhs, const error_code &rhs);
+
+
+/*! Effects: <tt>os << ec.category().name() << ':' << ec.value()</tt>.
+ */
+template <typename charT, typename traits>
+  std::basic_ostream<charT,traits>&
+    operator<<(std::basic_ostream<charT,traits>& os, const error_code &ec);
+
+
+// [19.5.3] class error_condition
+
+
+/*! \brief The class \p error_condition describes an object used to hold values identifying
+ *  error conditions.
+ *
+ *  \note \p error_condition values are portable abstractions, while \p error_code values
+ *        are implementation specific.
+ */
+class error_condition
+{
+  public:
+    // [19.5.3.2] constructors
+
+    /*! Constructs an object of type \p error_condition.
+     *  \post <tt>value() == 0</tt>.
+     *  \post <tt>category() == generic_category()</tt>.
+     */
+    inline error_condition(void);
+
+    /*! Constructs an object of type \p error_condition.
+     *  \post <tt>value() == val</tt>.
+     *  \post <tt>category() == cat</tt>.
+     */
+    inline error_condition(int val, const error_category &cat);
+
+    /*! Constructs an object of type \p error_condition.
+     *  \post <tt>*this == make_error_condition(e)</tt>.
+     *  \note This constructor shall not participate in overload resolution unless
+     *        <tt>is_error_condition_enum<ErrorConditionEnum>::value</tt> is <tt>true</tt>.
+     */
+    template<typename ErrorConditionEnum>
+      error_condition(ErrorConditionEnum e
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+        , typename thrust::detail::enable_if<is_error_condition_enum<ErrorConditionEnum>::value>::type * = 0
+#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+                     );
+
+    // [19.5.3.3] modifiers
+
+    /*! Assigns to this \p error_code object from an error value and an \p error_category.
+     *  \param val The new value to return from <tt>value()</tt>.
+     *  \param cat The new \p error_category to return from <tt>category()</tt>.
+     *  \post <tt>value() == val</tt>.
+     *  \post <tt>category() == cat</tt>.
+     */
+    inline void assign(int val, const error_category &cat);
+
+    /*! Assigns to this \p error_code object from an error condition enumeration.
+     *  \return *this
+     *  \post <tt>*this == make_error_condition(e)</tt>.
+     *  \note This operator shall not participate in overload resolution unless
+     *        <tt>is_error_condition_enum<ErrorConditionEnum>::value</tt> is <tt>true</tt>.
+     */
+    template<typename ErrorConditionEnum>
+// XXX WAR msvc's problem with enable_if
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+      typename thrust::detail::enable_if<is_error_condition_enum<ErrorConditionEnum>::value, error_condition>::type &
+#else
+      error_condition &
+#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+        operator=(ErrorConditionEnum e);
+
+    /*! Clears this \p error_code object.
+     *  \post <tt>value == 0</tt>
+     *  \post <tt>category() == generic_category()</tt>.
+     */
+    inline void clear(void);
+
+    // [19.5.3.4] observers
+
+    /*! \return The value encoded by this \p error_condition.
+     */
+    inline int value(void) const;
+
+    /*! \return A <tt>const</tt> reference to the \p error_category encoded by this \p error_condition.
+     */
+    inline const error_category &category(void) const;
+
+    /*! \return <tt>category().message(value())</tt>.
+     */
+    inline std::string message(void) const;
+
+    // XXX replace below with this upon c++0x
+    //explicit operator bool (void) const;
+    
+    /*! \return <tt>value() != 0</tt>.
+     */
+    inline operator bool (void) const;
+
+    /*! \cond
+     */
+
+  private:
+    int m_val;
+    const error_category *m_cat;
+
+    /*! \endcond
+     */
+}; // end error_condition
+
+
+
+// [19.5.3.5] Class error_condition non-member functions
+
+// XXX replace errc::errc_t with errc upon c++0x
+/*! \return <tt>error_condition(static_cast<int>(e), generic_category())</tt>.
+ */
+inline error_condition make_error_condition(errc::errc_t e);
+
+
+/*! \return <tt>lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value()</tt>.
+ */
+inline bool operator<(const error_condition &lhs, const error_condition &rhs);
+
+
+// [19.5.4] Comparison operators
+
+
+/*! \return <tt>lhs.category() == rhs.category() && lhs.value() == rhs.value()</tt>.
+ */
+inline bool operator==(const error_code &lhs, const error_code &rhs);
+
+
+/*! \return <tt>lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value())</tt>.
+ */
+inline bool operator==(const error_code &lhs, const error_condition &rhs);
+
+
+/*! \return <tt>rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value())</tt>.
+ */
+inline bool operator==(const error_condition &lhs, const error_code &rhs);
+
+
+/*! \return <tt>lhs.category() == rhs.category() && lhs.value() == rhs.value()</tt>
+ */
+inline bool operator==(const error_condition &lhs, const error_condition &rhs);
+
+
+/*! \return <tt>!(lhs == rhs)</tt>
+ */
+inline bool operator!=(const error_code &lhs, const error_code &rhs);
+
+
+/*! \return <tt>!(lhs == rhs)</tt>
+ */
+inline bool operator!=(const error_code &lhs, const error_condition &rhs);
+
+
+/*! \return <tt>!(lhs == rhs)</tt>
+ */
+inline bool operator!=(const error_condition &lhs, const error_code &rhs);
+
+
+/*! \return <tt>!(lhs == rhs)</tt>
+ */
+inline bool operator!=(const error_condition &lhs, const error_condition &rhs);
+
+/*! \} // end system_diagnostics
+ */
+
+
+} // end system
+
+
+// import names into thrust::
+using system::error_category;
+using system::error_code;
+using system::error_condition;
+using system::is_error_code_enum;
+using system::is_error_condition_enum;
+using system::make_error_code;
+using system::make_error_condition;
+
+// XXX replace with using system::errc upon c++0x
+namespace errc = system::errc;
+
+using system::generic_category;
+using system::system_category;
+
+} // end thrust
+
+#include <thrust/system/detail/error_category.inl>
+#include <thrust/system/detail/error_code.inl>
+#include <thrust/system/detail/error_condition.inl>
+
diff --git a/thrust/thrust/system/omp/detail/adjacent_difference.h b/thrust/thrust/system/omp/detail/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f314eaebbbdfee13791c347b99898369a12e0cd
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/adjacent_difference.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/system/detail/generic/adjacent_difference.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator adjacent_difference(execution_policy<DerivedPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     OutputIterator result,
+                                     BinaryFunction binary_op)
+{
+  // omp prefers generic::adjacent_difference to cpp::adjacent_difference
+  return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op);
+} // end adjacent_difference()
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/assign_value.h b/thrust/thrust/system/omp/detail/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf244a02193211b9b4e4f07a6bc9b975d50e5388
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/assign_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits assign_value
+#include <thrust/system/cpp/detail/assign_value.h>
+
diff --git a/thrust/thrust/system/omp/detail/binary_search.h b/thrust/thrust/system/omp/detail/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..37ff8fab5734f951252dacb9b811db1f9e2ae75d
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/binary_search.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/system/detail/generic/binary_search.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+ForwardIterator lower_bound(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+    // omp prefers generic::lower_bound to cpp::lower_bound
+    return thrust::system::detail::generic::lower_bound(exec, begin, end, value, comp);
+}
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering, typename Backend>
+ForwardIterator upper_bound(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator begin,
+                            ForwardIterator end,
+                            const T& value, 
+                            StrictWeakOrdering comp)
+{
+    // omp prefers generic::upper_bound to cpp::upper_bound
+    return thrust::system::detail::generic::upper_bound(exec, begin, end, value, comp);
+}
+
+
+template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
+bool binary_search(execution_policy<DerivedPolicy> &exec,
+                   ForwardIterator begin,
+                   ForwardIterator end,
+                   const T& value, 
+                   StrictWeakOrdering comp)
+{
+    // omp prefers generic::binary_search to cpp::binary_search
+    return thrust::system::detail::generic::binary_search(exec, begin, end, value, comp);
+}
+
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/copy.h b/thrust/thrust/system/omp/detail/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2b6661e87549ed737c8172fb553ad238e33a903
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/copy.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      Size n,
+                      OutputIterator result);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/copy.inl>
+
diff --git a/thrust/thrust/system/omp/detail/copy.inl b/thrust/thrust/system/omp/detail/copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4d104e5ec63fdcc8817e068f4d48ddb07e03e7fa
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/copy.inl
@@ -0,0 +1,137 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/copy.h>
+#include <thrust/system/detail/generic/copy.h>
+#include <thrust/system/detail/sequential/copy.h>
+#include <thrust/detail/type_traits/minimum_type.h>
+
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+namespace dispatch
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::incrementable_traversal_tag)
+{
+  return thrust::system::detail::sequential::copy(exec, first, last, result);
+} // end copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::random_access_traversal_tag)
+{
+  return thrust::system::detail::generic::copy(exec, first, last, result);
+} // end copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::incrementable_traversal_tag)
+{
+  return thrust::system::detail::sequential::copy_n(exec, first, n, result);
+} // end copy_n()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::random_access_traversal_tag)
+{
+  return thrust::system::detail::generic::copy_n(exec, first, n, result);
+} // end copy_n()
+
+
+} // end dispatch
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputIterator result)
+{
+  typedef typename thrust::iterator_traversal<InputIterator>::type  traversal1;
+  typedef typename thrust::iterator_traversal<OutputIterator>::type traversal2;
+  
+  typedef typename thrust::detail::minimum_type<traversal1,traversal2>::type traversal;
+
+  // dispatch on minimum traversal
+  return thrust::system::omp::detail::dispatch::copy(exec, first, last, result, traversal());
+} // end copy()
+
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      Size n,
+                      OutputIterator result)
+{
+  typedef typename thrust::iterator_traversal<InputIterator>::type  traversal1;
+  typedef typename thrust::iterator_traversal<OutputIterator>::type traversal2;
+  
+  typedef typename thrust::detail::minimum_type<traversal1,traversal2>::type traversal;
+
+  // dispatch on minimum traversal
+  return thrust::system::omp::detail::dispatch::copy_n(exec, first, n, result, traversal());
+} // end copy_n()
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/copy_if.h b/thrust/thrust/system/omp/detail/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5c28704d32304d7a377a0b290fc6c711366a090
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/copy_if.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(execution_policy<DerivedPolicy> &exec,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
+#include <thrust/system/omp/detail/copy_if.inl>
+
diff --git a/thrust/thrust/system/omp/detail/copy_if.inl b/thrust/thrust/system/omp/detail/copy_if.inl
new file mode 100644
index 0000000000000000000000000000000000000000..7f2516a74983ce00c353aed7db0e11940959c2cd
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/copy_if.inl
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/copy_if.h>
+#include <thrust/system/detail/generic/copy_if.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(execution_policy<DerivedPolicy> &exec,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  // omp prefers generic::copy_if to cpp::copy_if
+  return thrust::system::detail::generic::copy_if(exec, first, last, stencil, result, pred);
+} // end copy_if()
+
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/count.h b/thrust/thrust/system/omp/detail/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..fde1728b77261d75c561b9042ec365281d78cee9
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/count.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits count
+#include <thrust/system/cpp/detail/count.h>
+
diff --git a/thrust/thrust/system/omp/detail/default_decomposition.h b/thrust/thrust/system/omp/detail/default_decomposition.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb4b03c719b7c89e2b4561066394fc3874971638
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/default_decomposition.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file default_decomposition.h
+ *  \brief Return a decomposition that is appropriate for the OpenMP backend.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/internal/decompose.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename IndexType>
+thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n);
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/default_decomposition.inl>
+
diff --git a/thrust/thrust/system/omp/detail/default_decomposition.inl b/thrust/thrust/system/omp/detail/default_decomposition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..53f4b428f621466e1218ae0f163370d88fd353ec
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/default_decomposition.inl
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/default_decomposition.h>
+
+// don't attempt to #include this file without omp support
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+#include <omp.h>
+#endif // omp support
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename IndexType>
+thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to OpenMP support in your compiler.                         X
+  // ========================================================================
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      IndexType, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+  return thrust::system::detail::internal::uniform_decomposition<IndexType>(n, 1, omp_get_num_procs());
+#else
+  return thrust::system::detail::internal::uniform_decomposition<IndexType>(n, 1, 1);
+#endif
+}
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/equal.h b/thrust/thrust/system/omp/detail/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..13398fc9db5a02ba7cd7d2141f106fa59ba2a941
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/equal.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits equal
+#include <thrust/system/cpp/detail/equal.h>
+
diff --git a/thrust/thrust/system/omp/detail/execution_policy.h b/thrust/thrust/system/omp/detail/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..52c879a168551227c059416d4b80fde69491bfa4
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/execution_policy.h
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+// put the canonical tag in the same ns as the backend's entry points
+namespace omp
+{
+namespace detail
+{
+
+// this awkward sequence of definitions arise
+// from the desire both for tag to derive
+// from execution_policy and for execution_policy
+// to convert to tag (when execution_policy is not
+// an ancestor of tag)
+
+// forward declaration of tag
+struct tag;
+
+// forward declaration of execution_policy
+template<typename> struct execution_policy;
+
+// specialize execution_policy for tag
+template<>
+  struct execution_policy<tag>
+    : thrust::system::cpp::detail::execution_policy<tag>
+{};
+
+// tag's definition comes before the
+// generic definition of execution_policy
+struct tag : execution_policy<tag> {};
+
+// allow conversion to tag when it is not a successor
+template<typename Derived>
+  struct execution_policy
+    : thrust::system::cpp::detail::execution_policy<Derived>
+{
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
+};
+
+
+// overloads of select_system
+
+// XXX select_system(tbb, omp) & select_system(omp, tbb) are ambiguous
+//     because both convert to cpp without these overloads, which we
+//     arbitrarily define in the omp backend
+
+template<typename System1, typename System2>
+inline __host__ __device__
+  System1 select_system(execution_policy<System1> s, thrust::system::tbb::detail::execution_policy<System2>)
+{
+  return thrust::detail::derived_cast(s);
+} // end select_system()
+
+
+template<typename System1, typename System2>
+inline __host__ __device__
+  System2 select_system(thrust::system::tbb::detail::execution_policy<System1>, execution_policy<System2> s)
+{
+  return thrust::detail::derived_cast(s);
+} // end select_system()
+
+
+} // end detail
+
+// alias execution_policy and tag here
+using thrust::system::omp::detail::execution_policy;
+using thrust::system::omp::detail::tag;
+
+} // end omp
+} // end system
+
+// alias items at top-level
+namespace omp
+{
+
+using thrust::system::omp::execution_policy;
+using thrust::system::omp::tag;
+
+} // end omp
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/extrema.h b/thrust/thrust/system/omp/detail/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..96661180d3c93c33ea8632fc0fa72aa7e2910955
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/extrema.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/system/detail/generic/extrema.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator max_element(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // omp prefers generic::max_element to cpp::max_element
+  return thrust::system::detail::generic::max_element(exec, first, last, comp);
+} // end max_element()
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator min_element(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // omp prefers generic::min_element to cpp::min_element
+  return thrust::system::detail::generic::min_element(exec, first, last, comp);
+} // end min_element()
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp)
+{
+  // omp prefers generic::minmax_element to cpp::minmax_element
+  return thrust::system::detail::generic::minmax_element(exec, first, last, comp);
+} // end minmax_element()
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
+
diff --git a/thrust/thrust/system/omp/detail/fill.h b/thrust/thrust/system/omp/detail/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..6665a264873f6a0a775de0aa670ee7567d899ad9
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/fill.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits fill
+#include <thrust/system/cpp/detail/fill.h>
+
diff --git a/thrust/thrust/system/omp/detail/find.h b/thrust/thrust/system/omp/detail/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6445c06831c49e05f4a82cddde0f38081b82978
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/find.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file find.h
+ *  \brief OpenMP implementation of find_if. 
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/find.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename DerivedPolicy, typename InputIterator, typename Predicate>
+InputIterator find_if(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  // omp prefers generic::find_if to cpp::find_if
+  return thrust::system::detail::generic::find_if(exec, first, last, pred);
+}
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/for_each.h b/thrust/thrust/system/omp/detail/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e6955ea2b713e7babc6fe6facb9b2dc1278d5e4
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/for_each.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file for_each.h
+ *  \brief Defines the interface for a function that executes a 
+ *  function or functional for each value in a given range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename UnaryFunction>
+  RandomAccessIterator for_each(execution_policy<DerivedPolicy> &exec,
+                                RandomAccessIterator first,
+                                RandomAccessIterator last,
+                                UnaryFunction f);
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename Size,
+         typename UnaryFunction>
+  RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &exec,
+                                  RandomAccessIterator first,
+                                  Size n,
+                                  UnaryFunction f);
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/for_each.inl>
+
diff --git a/thrust/thrust/system/omp/detail/for_each.inl b/thrust/thrust/system/omp/detail/for_each.inl
new file mode 100644
index 0000000000000000000000000000000000000000..6be6435e6c36b841fb62bce90458f31268cfa8b1
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/for_each.inl
@@ -0,0 +1,100 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file for_each.inl
+ *  \brief Inline file for for_each.h.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/distance.h>
+#include <thrust/detail/function.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/for_each.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename Size,
+         typename UnaryFunction>
+RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
+                                RandomAccessIterator first,
+                                Size n,
+                                UnaryFunction f)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to enable OpenMP support in your compiler.                  X
+  // ========================================================================
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+  if (n <= 0) return first;  //empty range
+
+  // create a wrapped function for f
+  thrust::detail::wrapped_function<UnaryFunction,void> wrapped_f(f);
+
+// do not attempt to compile the body of this function, which depends on #pragma omp,
+// without support from the compiler
+// XXX implement the body of this function in another file to eliminate this ugliness
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+  // use a signed type for the iteration variable or suffer the consequences of warnings
+  typedef typename thrust::iterator_difference<RandomAccessIterator>::type DifferenceType;
+  DifferenceType signed_n = n;
+#pragma omp parallel for
+  for(DifferenceType i = 0;
+      i < signed_n;
+      ++i)
+  {
+    RandomAccessIterator temp = first + i;
+    wrapped_f(*temp);
+  }
+#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+
+  return first + n;
+} // end for_each_n() 
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename UnaryFunction>
+  RandomAccessIterator for_each(execution_policy<DerivedPolicy> &s,
+                                RandomAccessIterator first,
+                                RandomAccessIterator last,
+                                UnaryFunction f)
+{
+  return omp::detail::for_each_n(s, first, thrust::distance(first,last), f);
+} // end for_each()
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/gather.h b/thrust/thrust/system/omp/detail/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..098e0f4fbad4001632ed02ee9e9b39aa17e54ea0
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/gather.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits gather
+#include <thrust/system/cpp/detail/gather.h>
+
diff --git a/thrust/thrust/system/omp/detail/generate.h b/thrust/thrust/system/omp/detail/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..f907b6acc079577642c446d6f0736073defc44b8
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/generate.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits generate
+#include <thrust/system/cpp/detail/generate.h>
+
diff --git a/thrust/thrust/system/omp/detail/get_value.h b/thrust/thrust/system/omp/detail/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..23a11a8574f77f95bc6ca96d0cd8ff6de8c71c7e
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/get_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits get_value
+#include <thrust/system/cpp/detail/get_value.h>
+
diff --git a/thrust/thrust/system/omp/detail/inner_product.h b/thrust/thrust/system/omp/detail/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8cf941a1dc3df1a6a516eee54f92fa610fd35cc
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/inner_product.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits inner_product
+#include <thrust/system/cpp/detail/inner_product.h>
+
diff --git a/thrust/thrust/system/omp/detail/iter_swap.h b/thrust/thrust/system/omp/detail/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..a096739947c8854afe003e6bf498d89683854ff1
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/iter_swap.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits iter_swap
+#include <thrust/system/cpp/detail/iter_swap.h>
+
diff --git a/thrust/thrust/system/omp/detail/logical.h b/thrust/thrust/system/omp/detail/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..4199063183dbc38b79c7707bb8301e5ca8aa6ad5
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/logical.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits logical
+#include <thrust/system/cpp/detail/logical.h>
+
diff --git a/thrust/thrust/system/omp/detail/malloc_and_free.h b/thrust/thrust/system/omp/detail/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..01ab1e6dbe1732da1f8606b7a9121c1b404edb6f
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/malloc_and_free.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits malloc and free
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+
diff --git a/thrust/thrust/system/omp/detail/memory.inl b/thrust/thrust/system/omp/detail/memory.inl
new file mode 100644
index 0000000000000000000000000000000000000000..331ba5cabcc4c0f39dbb8a120a9757f99f0fbb16
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/memory.inl
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/system/omp/memory.h>
+#include <thrust/system/cpp/memory.h>
+#include <limits>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc
+//     is not defined
+//     WAR the problem by using adl to call cpp::malloc, which requires it to depend
+//     on a template parameter
+template<typename Tag>
+  pointer<void> malloc_workaround(Tag t, std::size_t n)
+{
+  return pointer<void>(malloc(t, n));
+} // end malloc_workaround()
+
+// XXX circular #inclusion problems cause the compiler to believe that cpp::free
+//     is not defined
+//     WAR the problem by using adl to call cpp::free, which requires it to depend
+//     on a template parameter
+template<typename Tag>
+  void free_workaround(Tag t, pointer<void> ptr)
+{
+  free(t, ptr.get());
+} // end free_workaround()
+
+} // end detail
+
+inline pointer<void> malloc(std::size_t n)
+{
+  // XXX this is how we'd like to implement this function,
+  //     if not for circular #inclusion problems:
+  //
+  // return pointer<void>(thrust::system::cpp::malloc(n))
+  //
+  return detail::malloc_workaround(cpp::tag(), n);
+} // end malloc()
+
+template<typename T>
+pointer<T> malloc(std::size_t n)
+{
+  pointer<void> raw_ptr = thrust::system::omp::malloc(sizeof(T) * n);
+  return pointer<T>(reinterpret_cast<T*>(raw_ptr.get()));
+} // end malloc()
+
+inline void free(pointer<void> ptr)
+{
+  // XXX this is how we'd like to implement this function,
+  //     if not for circular #inclusion problems:
+  //
+  // thrust::system::cpp::free(ptr)
+  //
+  detail::free_workaround(cpp::tag(), ptr);
+} // end free()
+
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/merge.h b/thrust/thrust/system/omp/detail/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a7bf6a7f80e0156a069a87793f99f324a9dd603
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/merge.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits merge
+#include <thrust/system/cpp/detail/merge.h>
+
diff --git a/thrust/thrust/system/omp/detail/mismatch.h b/thrust/thrust/system/omp/detail/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5c6b2c4bdd1cc242c87d7526a42e21bf4b1561c
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/mismatch.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits mismatch
+#include <thrust/system/cpp/detail/mismatch.h>
+
diff --git a/thrust/thrust/system/omp/detail/par.h b/thrust/thrust/system/omp/detail/par.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa88b2ccd80dbcc445d28786837dd616261dd413
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/par.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+struct par_t : thrust::system::omp::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::omp::detail::execution_policy>
+{
+  __host__ __device__
+  THRUST_CONSTEXPR par_t() : thrust::system::omp::detail::execution_policy<par_t>() {}
+};
+
+
+} // end detail
+
+
+static const detail::par_t par;
+
+
+} // end omp
+} // end system
+
+
+// alias par here
+namespace omp
+{
+
+
+using thrust::system::omp::par;
+
+
+} // end omp
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/partition.h b/thrust/thrust/system/omp/detail/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..64a76e2788725134cf456742f33d57714e0f071f
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/partition.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief OpenMP implementation of reduce algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred);
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/partition.inl>
+
diff --git a/thrust/thrust/system/omp/detail/partition.inl b/thrust/thrust/system/omp/detail/partition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..b81c17cbf4c12dadb8ffd3a05e84da6fd8af3e6d
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/partition.inl
@@ -0,0 +1,108 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief OpenMP implementation of reduce algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/partition.h>
+#include <thrust/system/detail/generic/partition.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  // omp prefers generic::stable_partition to cpp::stable_partition
+  return thrust::system::detail::generic::stable_partition(exec, first, last, pred);
+} // end stable_partition()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  // omp prefers generic::stable_partition to cpp::stable_partition
+  return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred);
+} // end stable_partition()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy
+  return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy
+  return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/per_device_resource.h b/thrust/thrust/system/omp/detail/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d61f92169e0e09c3821e59218f0dcbb70cbe5
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/thrust/system/omp/detail/pointer.inl b/thrust/thrust/system/omp/detail/pointer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2125302e4ce33c058b0e9cfa851c062f72e36437
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/pointer.inl
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+
+
+template<typename T>
+  template<typename OtherT>
+    reference<T> &
+      reference<T>
+        ::operator=(const reference<OtherT> &other)
+{
+  return super_t::operator=(other);
+} // end reference::operator=()
+
+template<typename T>
+  reference<T> &
+    reference<T>
+      ::operator=(const value_type &x)
+{
+  return super_t::operator=(x);
+} // end reference::operator=()
+
+template<typename T>
+__host__ __device__
+void swap(reference<T> a, reference<T> b)
+{
+  a.swap(b);
+} // end swap()
+
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/reduce.h b/thrust/thrust/system/omp/detail/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..c058e05db4c78c8601c0bd322a7edd8f9ba16d89
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief OpenMP implementation of reduce algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputType init,
+                    BinaryFunction binary_op);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/reduce.inl>
+
diff --git a/thrust/thrust/system/omp/detail/reduce.inl b/thrust/thrust/system/omp/detail/reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4609922a9456e9069164a636e7fa69ec59e07f5d
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce.inl
@@ -0,0 +1,72 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/omp/detail/reduce.h>
+#include <thrust/system/omp/detail/default_decomposition.h>
+#include <thrust/system/omp/detail/reduce_intervals.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputType init,
+                    BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_difference<InputIterator>::type difference_type;
+
+  const difference_type n = thrust::distance(first,last);
+
+  // determine first and second level decomposition
+  thrust::system::detail::internal::uniform_decomposition<difference_type> decomp1 = thrust::system::omp::detail::default_decomposition(n);
+  thrust::system::detail::internal::uniform_decomposition<difference_type> decomp2(decomp1.size() + 1, 1, 1);
+
+  // allocate storage for the initializer and partial sums
+  // XXX use select_system for Tag
+  thrust::detail::temporary_array<OutputType,DerivedPolicy> partial_sums(exec, decomp1.size() + 1);
+  
+  // set first element of temp array to init
+  partial_sums[0] = init;
+  
+  // accumulate partial sums (first level reduction)
+  thrust::system::omp::detail::reduce_intervals(exec, first, partial_sums.begin() + 1, binary_op, decomp1);
+
+  // reduce partial sums (second level reduction)
+  thrust::system::omp::detail::reduce_intervals(exec, partial_sums.begin(), partial_sums.begin(), binary_op, decomp2);
+
+  return partial_sums[0];
+} // end reduce()
+
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/reduce_by_key.h b/thrust/thrust/system/omp/detail/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..37e89ecbaa3c3608024170f395263b130b9c7e0c
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce_by_key.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief OpenMP implementation of reduce algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename BinaryPredicate,
+          typename BinaryFunction>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/reduce_by_key.inl>
+
diff --git a/thrust/thrust/system/omp/detail/reduce_by_key.inl b/thrust/thrust/system/omp/detail/reduce_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..afd4c8e51fdbe6eeacc631002225ade3349735b3
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce_by_key.inl
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/reduce_by_key.h>
+#include <thrust/system/detail/generic/reduce_by_key.h>
+#include <thrust/distance.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename BinaryPredicate,
+          typename BinaryFunction>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op)
+{
+  // omp prefers generic::reduce_by_key to cpp::reduce_by_key
+  return thrust::system::detail::generic::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op);
+} // end reduce_by_key()
+
+
+} // end detail
+} // end omp
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/omp/detail/reduce_intervals.h b/thrust/thrust/system/omp/detail/reduce_intervals.h
new file mode 100644
index 0000000000000000000000000000000000000000..44551e6452d7992c13412184687dbea906797aec
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce_intervals.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce_intervals.h
+ *  \brief OpenMP implementations of reduce_intervals algorithms.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename BinaryFunction,
+          typename Decomposition>
+void reduce_intervals(execution_policy<DerivedPolicy> &exec,
+                      InputIterator input,
+                      OutputIterator output,
+                      BinaryFunction binary_op,
+                      Decomposition decomp);
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/reduce_intervals.inl>
+
diff --git a/thrust/thrust/system/omp/detail/reduce_intervals.inl b/thrust/thrust/system/omp/detail/reduce_intervals.inl
new file mode 100644
index 0000000000000000000000000000000000000000..961f2757af1c61e1fcde26caf0909e7056234489
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reduce_intervals.inl
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/reduce_intervals.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+#include <thrust/detail/cstdint.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename BinaryFunction,
+          typename Decomposition>
+void reduce_intervals(execution_policy<DerivedPolicy> &,
+                      InputIterator input,
+                      OutputIterator output,
+                      BinaryFunction binary_op,
+                      Decomposition decomp)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to enable OpenMP support in your compiler.                  X
+  // ========================================================================
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      InputIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+  typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
+
+  // wrap binary_op
+  thrust::detail::wrapped_function<BinaryFunction,OutputType> wrapped_binary_op(binary_op);
+
+  typedef thrust::detail::intptr_t index_type;
+
+  index_type n = static_cast<index_type>(decomp.size());
+
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+# pragma omp parallel for
+#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+  for(index_type i = 0; i < n; i++)
+  {
+    InputIterator begin = input + decomp[i].begin();
+    InputIterator end   = input + decomp[i].end();
+
+    if (begin != end)
+    {
+      OutputType sum = thrust::raw_reference_cast(*begin);
+
+      ++begin;
+
+      while (begin != end)
+      {
+        sum = wrapped_binary_op(sum, *begin);
+        ++begin;
+      }
+
+      OutputIterator tmp = output + i;
+      *tmp = sum;
+    }
+  }
+#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+}
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/remove.h b/thrust/thrust/system/omp/detail/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca4eab84575814020f7658436ea2f78808678fc2
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/remove.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/remove.inl>
+
diff --git a/thrust/thrust/system/omp/detail/remove.inl b/thrust/thrust/system/omp/detail/remove.inl
new file mode 100644
index 0000000000000000000000000000000000000000..aa828947628930a5df0bd058a9521a5f6e525098
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/remove.inl
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/remove.h>
+#include <thrust/system/detail/generic/remove.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  // omp prefers generic::remove_if to cpp::remove_if
+  return thrust::system::detail::generic::remove_if(exec, first, last, pred);
+}
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  // omp prefers generic::remove_if to cpp::remove_if
+  return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred);
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // omp prefers generic::remove_copy_if to cpp::remove_copy_if
+  return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred);
+}
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // omp prefers generic::remove_copy_if to cpp::remove_copy_if
+  return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred);
+}
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/replace.h b/thrust/thrust/system/omp/detail/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..95c5a14ba3df120019c9a5b6ed638db3f2555a5b
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/replace.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits this algorithm
+#include <thrust/system/cpp/detail/scatter.h>
+
diff --git a/thrust/thrust/system/omp/detail/reverse.h b/thrust/thrust/system/omp/detail/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f3e0325e257c301215e62c690837433ae24c30c
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/reverse.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits reverse
+#include <thrust/system/cpp/detail/reverse.h>
+
diff --git a/thrust/thrust/system/omp/detail/scan.h b/thrust/thrust/system/omp/detail/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..f47dbbc3087c613f36de65f704505340bb8a85b0
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/scan.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits scan
+#include <thrust/system/cpp/detail/scan.h>
+
diff --git a/thrust/thrust/system/omp/detail/scan_by_key.h b/thrust/thrust/system/omp/detail/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f6b42d54410703e7ba96123e9ea0655bbc79ef9
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/scan_by_key.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits this algorithm
+#include <thrust/system/cpp/detail/scan_by_key.h>
+
diff --git a/thrust/thrust/system/omp/detail/scatter.h b/thrust/thrust/system/omp/detail/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..95c5a14ba3df120019c9a5b6ed638db3f2555a5b
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/scatter.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits this algorithm
+#include <thrust/system/cpp/detail/scatter.h>
+
diff --git a/thrust/thrust/system/omp/detail/sequence.h b/thrust/thrust/system/omp/detail/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..c33b2d4333ce2ded0ffe73c23c20a80c5a35b928
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/sequence.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits sequence
+#include <thrust/system/cpp/detail/sequence.h>
+
diff --git a/thrust/thrust/system/omp/detail/set_operations.h b/thrust/thrust/system/omp/detail/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..421fa8a4bd955706497d0c9b30614035ccbbc46f
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/set_operations.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits set_operations
+#include <thrust/system/cpp/detail/set_operations.h>
+
diff --git a/thrust/thrust/system/omp/detail/sort.h b/thrust/thrust/system/omp/detail/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..339ce5b6e86398a9b4ccba1c9ae5b8ed79f06958
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/sort.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+void stable_sort(execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp);
+    
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 keys_first,
+                        RandomAccessIterator1 keys_last,
+                        RandomAccessIterator2 values_first,
+                        StrictWeakOrdering comp);
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/sort.inl>
+
diff --git a/thrust/thrust/system/omp/detail/sort.inl b/thrust/thrust/system/omp/detail/sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..587017ca6c759ae7a5fbd99705bd07667e3712d7
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/sort.inl
@@ -0,0 +1,265 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#include <thrust/detail/config.h>
+
+// don't attempt to #include this file without omp support
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+#include <omp.h>
+#endif // omp support
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/omp/detail/default_decomposition.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/sort.h>
+#include <thrust/merge.h>
+#include <thrust/detail/seq.h>
+#include <thrust/detail/temporary_array.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+namespace sort_detail
+{
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+void inplace_merge(execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator middle,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
+
+  thrust::detail::temporary_array<value_type,DerivedPolicy> a(exec, first, middle);
+  thrust::detail::temporary_array<value_type,DerivedPolicy> b(exec, middle, last);
+
+  thrust::merge(thrust::seq, a.begin(), a.end(), b.begin(), b.end(), first, comp);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+void inplace_merge_by_key(execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 first1,
+                          RandomAccessIterator1 middle1,
+                          RandomAccessIterator1 last1,
+                          RandomAccessIterator2 first2,
+                          StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
+
+  RandomAccessIterator2 middle2 = first2 + (middle1 - first1);
+  RandomAccessIterator2 last2   = first2 + (last1   - first1);
+
+  thrust::detail::temporary_array<value_type1,DerivedPolicy> lhs1(exec, first1, middle1);
+  thrust::detail::temporary_array<value_type1,DerivedPolicy> rhs1(exec, middle1, last1);
+  thrust::detail::temporary_array<value_type2,DerivedPolicy> lhs2(exec, first2, middle2);
+  thrust::detail::temporary_array<value_type2,DerivedPolicy> rhs2(exec, middle2, last2);
+
+  thrust::merge_by_key(thrust::seq,
+                       lhs1.begin(), lhs1.end(),
+                       rhs1.begin(), rhs1.end(),
+                       lhs2.begin(), rhs2.begin(),
+                       first1, first2,
+                       comp);
+}
+
+
+} // end sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+void stable_sort(execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to enable OpenMP support in your compiler.                  X
+  // ========================================================================
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+  typedef typename thrust::iterator_difference<RandomAccessIterator>::type IndexType;
+  
+  if(first == last)
+    return;
+
+  #pragma omp parallel
+  {
+    thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(last - first, 1, omp_get_num_threads());
+
+    // process id
+    IndexType p_i = omp_get_thread_num();
+
+    // every thread sorts its own tile
+    if(p_i < decomp.size())
+    {
+      thrust::stable_sort(thrust::seq,
+                          first + decomp[p_i].begin(),
+                          first + decomp[p_i].end(),
+                          comp);
+    }
+
+    #pragma omp barrier
+
+    // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
+    ;
+
+    IndexType nseg = decomp.size();
+    IndexType h = 2;
+
+    // keep track of which sub-range we're processing
+    IndexType a=p_i, b=p_i, c=p_i+1;
+
+    while(nseg>1)
+    {
+      if(c >= decomp.size())
+        c = decomp.size() - 1;
+
+      if((p_i % h) == 0 && c > b)
+      {
+        sort_detail::inplace_merge(exec,
+                                   first + decomp[a].begin(),
+                                   first + decomp[b].end(),
+                                   first + decomp[c].end(),
+                                   comp);
+
+        b = c;
+        c += h;
+      }
+
+      nseg = (nseg + 1) / 2;
+      h *= 2;
+
+      #pragma omp barrier
+    }
+  }
+#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 keys_first,
+                        RandomAccessIterator1 keys_last,
+                        RandomAccessIterator2 values_first,
+                        StrictWeakOrdering comp)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to enable OpenMP support in your compiler.                  X
+  // ========================================================================
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator1, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type IndexType;
+  
+  if(keys_first == keys_last)
+    return;
+
+  #pragma omp parallel
+  {
+    thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(keys_last - keys_first, 1, omp_get_num_threads());
+
+    // process id
+    IndexType p_i = omp_get_thread_num();
+
+    // every thread sorts its own tile
+    if(p_i < decomp.size())
+    {
+      thrust::stable_sort_by_key(thrust::seq,
+                                 keys_first + decomp[p_i].begin(),
+                                 keys_first + decomp[p_i].end(),
+                                 values_first + decomp[p_i].begin(),
+                                 comp);
+    }
+
+    #pragma omp barrier
+
+    // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
+    ;
+
+    IndexType nseg = decomp.size();
+    IndexType h = 2;
+
+    // keep track of which sub-range we're processing
+    IndexType a=p_i, b=p_i, c=p_i+1;
+
+    while(nseg>1)
+    {
+      if(c >= decomp.size())
+        c = decomp.size() - 1;
+
+      if((p_i % h) == 0 && c > b)
+      {
+        sort_detail::inplace_merge_by_key(exec,
+                                          keys_first + decomp[a].begin(),
+                                          keys_first + decomp[b].end(),
+                                          keys_first + decomp[c].end(),
+                                          values_first + decomp[a].begin(),
+                                          comp);
+
+        b = c;
+        c += h;
+      }
+
+      nseg = (nseg + 1) / 2;
+      h *= 2;
+
+      #pragma omp barrier
+    }
+  }
+#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+}
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/swap_ranges.h b/thrust/thrust/system/omp/detail/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c3338b1baf58a3628245072f4f4700dcd3bc025
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/swap_ranges.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// omp inherits swap_ranges
+#include <thrust/system/cpp/detail/swap_ranges.h>
+
diff --git a/thrust/thrust/system/omp/detail/tabulate.h b/thrust/thrust/system/omp/detail/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea135c707064b7195e4a78efc15849ba431e9068
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/tabulate.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits tabulate
+#include <thrust/system/cpp/detail/tabulate.h>
+
diff --git a/thrust/thrust/system/omp/detail/temporary_buffer.h b/thrust/thrust/system/omp/detail/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2adfaf2810c67462e41f271e43ad0aff9cfbf75f
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/temporary_buffer.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special temporary buffer functions
+
diff --git a/thrust/thrust/system/omp/detail/transform.h b/thrust/thrust/system/omp/detail/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..20d606dfbeec6d376a138db500ec368d94efa748
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/transform.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// omp inherits transform
+#include <thrust/system/cpp/detail/transform.h>
+
diff --git a/thrust/thrust/system/omp/detail/transform_reduce.h b/thrust/thrust/system/omp/detail/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8736bd75d06e54d9158baeb2504162d75312885
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/transform_reduce.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits transform_reduce
+#include <thrust/system/cpp/detail/transform_reduce.h>
+
diff --git a/thrust/thrust/system/omp/detail/transform_scan.h b/thrust/thrust/system/omp/detail/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..75b075b6b16f063a1c5cda8893911d3f3c533f2d
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/transform_scan.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits transform_scan
+#include <thrust/system/cpp/detail/transform_scan.h>
+
diff --git a/thrust/thrust/system/omp/detail/uninitialized_copy.h b/thrust/thrust/system/omp/detail/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..bda06ac13e9ca1d14ee5e047986884f5207d3d2b
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/uninitialized_copy.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits uninitialized_copy
+#include <thrust/system/cpp/detail/uninitialized_copy.h>
+
diff --git a/thrust/thrust/system/omp/detail/uninitialized_fill.h b/thrust/thrust/system/omp/detail/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..764de876233a012e5a9de9113c5fb2dac7a22499
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/uninitialized_fill.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits uninitialized_fill
+#include <thrust/system/cpp/detail/uninitialized_fill.h>
+
diff --git a/thrust/thrust/system/omp/detail/unique.h b/thrust/thrust/system/omp/detail/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..433e7689b69b210b9de2996beaf2849a6130779d
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/unique.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  ForwardIterator unique(execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator unique_copy(execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred);
+
+
+} // end namespace detail
+} // end namespace omp 
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/unique.inl>
+
diff --git a/thrust/thrust/system/omp/detail/unique.inl b/thrust/thrust/system/omp/detail/unique.inl
new file mode 100644
index 0000000000000000000000000000000000000000..70f026dbb1024e76015812f417f25e50a0fcc127
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/unique.inl
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/unique.h>
+#include <thrust/system/detail/generic/unique.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  ForwardIterator unique(execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique to cpp::unique
+  return thrust::system::detail::generic::unique(exec,first,last,binary_pred);
+} // end unique()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator unique_copy(execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique_copy to cpp::unique_copy
+  return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred);
+} // end unique_copy()
+
+
+} // end namespace detail
+} // end namespace omp 
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/detail/unique_by_key.h b/thrust/thrust/system/omp/detail/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff3acb09428a95dc8835902c3f5c4c6d0704c01e
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/unique_by_key.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred);
+
+
+} // end namespace detail
+} // end namespace omp 
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/unique_by_key.inl>
+
diff --git a/thrust/thrust/system/omp/detail/unique_by_key.inl b/thrust/thrust/system/omp/detail/unique_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0a4367b7b2de10e82541c78cbb77483291e00285
--- /dev/null
+++ b/thrust/thrust/system/omp/detail/unique_by_key.inl
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/unique_by_key.h>
+#include <thrust/system/detail/generic/unique_by_key.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique_by_key to cpp::unique_by_key
+  return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred);
+} // end unique_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique_by_key_copy to cpp::unique_by_key_copy
+  return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred);
+} // end unique_by_key_copy()
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/omp/execution_policy.h b/thrust/thrust/system/omp/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a413f7f6fa5b050971737ab44f4fbe7ef60e0a4
--- /dev/null
+++ b/thrust/thrust/system/omp/execution_policy.h
@@ -0,0 +1,156 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+/*! \file thrust/system/omp/execution_policy.h
+ *  \brief Execution policies for Thrust's OpenMP system.
+ */
+
+#include <thrust/detail/config.h>
+
+// get the execution policies definitions first
+#include <thrust/system/omp/detail/execution_policy.h>
+
+// get the definition of par
+#include <thrust/system/omp/detail/par.h>
+
+// now get all the algorithm definitions
+
+#include <thrust/system/omp/detail/adjacent_difference.h>
+#include <thrust/system/omp/detail/assign_value.h>
+#include <thrust/system/omp/detail/binary_search.h>
+#include <thrust/system/omp/detail/copy.h>
+#include <thrust/system/omp/detail/copy_if.h>
+#include <thrust/system/omp/detail/count.h>
+#include <thrust/system/omp/detail/equal.h>
+#include <thrust/system/omp/detail/extrema.h>
+#include <thrust/system/omp/detail/fill.h>
+#include <thrust/system/omp/detail/find.h>
+#include <thrust/system/omp/detail/for_each.h>
+#include <thrust/system/omp/detail/gather.h>
+#include <thrust/system/omp/detail/generate.h>
+#include <thrust/system/omp/detail/get_value.h>
+#include <thrust/system/omp/detail/inner_product.h>
+#include <thrust/system/omp/detail/iter_swap.h>
+#include <thrust/system/omp/detail/logical.h>
+#include <thrust/system/omp/detail/malloc_and_free.h>
+#include <thrust/system/omp/detail/merge.h>
+#include <thrust/system/omp/detail/mismatch.h>
+#include <thrust/system/omp/detail/partition.h>
+#include <thrust/system/omp/detail/reduce.h>
+#include <thrust/system/omp/detail/reduce_by_key.h>
+#include <thrust/system/omp/detail/remove.h>
+#include <thrust/system/omp/detail/replace.h>
+#include <thrust/system/omp/detail/reverse.h>
+#include <thrust/system/omp/detail/scan.h>
+#include <thrust/system/omp/detail/scan_by_key.h>
+#include <thrust/system/omp/detail/scatter.h>
+#include <thrust/system/omp/detail/sequence.h>
+#include <thrust/system/omp/detail/set_operations.h>
+#include <thrust/system/omp/detail/sort.h>
+#include <thrust/system/omp/detail/swap_ranges.h>
+#include <thrust/system/omp/detail/tabulate.h>
+#include <thrust/system/omp/detail/transform.h>
+#include <thrust/system/omp/detail/transform_reduce.h>
+#include <thrust/system/omp/detail/transform_scan.h>
+#include <thrust/system/omp/detail/uninitialized_copy.h>
+#include <thrust/system/omp/detail/uninitialized_fill.h>
+#include <thrust/system/omp/detail/unique.h>
+#include <thrust/system/omp/detail/unique_by_key.h>
+
+
+// define these entities here for the purpose of Doxygenating them
+// they are actually defined elsewhere
+#if 0
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+
+
+/*! \addtogroup execution_policies
+ *  \{
+ */
+
+
+/*! \p thrust::omp::execution_policy is the base class for all Thrust parallel execution
+ *  policies which are derived from Thrust's OpenMP backend system.
+ */
+template<typename DerivedPolicy>
+struct execution_policy : thrust::execution_policy<DerivedPolicy>
+{};
+
+
+/*! \p omp::tag is a type representing Thrust's standard C++ backend system in C++'s type system.
+ *  Iterators "tagged" with a type which is convertible to \p omp::tag assert that they may be
+ *  "dispatched" to algorithm implementations in the \p omp system.
+ */
+struct tag : thrust::system::omp::execution_policy<tag> { unspecified };
+
+
+/*! \p thrust::omp::par is the parallel execution policy associated with Thrust's OpenMP
+ *  backend system.
+ *
+ *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may
+ *  directly target Thrust's OpenMP backend system by providing \p thrust::omp::par as an algorithm
+ *  parameter.
+ *
+ *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such
+ *  as \p thrust::omp::vector.
+ *
+ *  The type of \p thrust::omp::par is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::omp::par to explicitly dispatch an
+ *  invocation of \p thrust::for_each to the OpenMP backend system:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/system/omp/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  int vec[3];
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::omp::par, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ */
+static const unspecified par;
+
+
+/*! \}
+ */
+
+
+} // end cpp
+} // end system
+} // end thrust
+#endif
+
+
diff --git a/thrust/thrust/system/omp/memory.h b/thrust/thrust/system/omp/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b2f070ccd4139d5f535d47d1b685b7f397ba330
--- /dev/null
+++ b/thrust/thrust/system/omp/memory.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/omp/memory.h
+ *  \brief Managing memory associated with Thrust's OpenMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/memory_resource.h>
+#include <thrust/memory.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/mr/allocator.h>
+#include <ostream>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+
+/*! Allocates an area of memory available to Thrust's <tt>omp</tt> system.
+ *  \param n Number of bytes to allocate.
+ *  \return A <tt>omp::pointer<void></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>omp::pointer<void></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>omp::pointer<void></tt> returned by this function must be
+ *        deallocated with \p omp::free.
+ *  \see omp::free
+ *  \see std::malloc
+ */
+inline pointer<void> malloc(std::size_t n);
+
+/*! Allocates a typed area of memory available to Thrust's <tt>omp</tt> system.
+ *  \param n Number of elements to allocate.
+ *  \return A <tt>omp::pointer<T></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>omp::pointer<T></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>omp::pointer<T></tt> returned by this function must be
+ *        deallocated with \p omp::free.
+ *  \see omp::free
+ *  \see std::malloc
+ */
+template<typename T>
+inline pointer<T> malloc(std::size_t n);
+
+/*! Deallocates an area of memory previously allocated by <tt>omp::malloc</tt>.
+ *  \param ptr A <tt>omp::pointer<void></tt> pointing to the beginning of an area
+ *         of memory previously allocated with <tt>omp::malloc</tt>.
+ *  \see omp::malloc
+ *  \see std::free
+ */
+inline void free(pointer<void> ptr);
+
+/*! \p omp::allocator is the default allocator used by the \p omp system's containers such as
+ *  <tt>omp::vector</tt> if no user-specified allocator is provided. \p omp::allocator allocates
+ *  (deallocates) storage with \p omp::malloc (\p omp::free).
+ */
+template<typename T>
+using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
+
+} // end omp
+} // end system
+
+/*! \namespace thrust::omp
+ *  \brief \p thrust::omp is a top-level alias for thrust::system::omp.
+ */
+namespace omp
+{
+
+using thrust::system::omp::malloc;
+using thrust::system::omp::free;
+using thrust::system::omp::allocator;
+
+} // end omp
+
+} // end thrust
+
+#include <thrust/system/omp/detail/memory.inl>
+
diff --git a/thrust/thrust/system/omp/memory_resource.h b/thrust/thrust/system/omp/memory_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a540d834939b928a4b6049c6a97d2289ab43257
--- /dev/null
+++ b/thrust/thrust/system/omp/memory_resource.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file omp/memory_resource.h
+ *  \brief Memory resources for the OMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/omp/pointer.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::omp::pointer<void>
+    > native_resource;
+}
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! The memory resource for the OMP system. Uses \p mr::new_delete_resource and tags it with \p omp::pointer. */
+typedef detail::native_resource memory_resource;
+/*! An alias for \p omp::memory_resource. */
+typedef detail::native_resource universal_memory_resource;
+/*! An alias for \p omp::memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \}
+ */
+
+}
+}
+}
diff --git a/thrust/thrust/system/omp/pointer.h b/thrust/thrust/system/omp/pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..36b6bed12ac65b117242c291debb9e1ec9deae7d
--- /dev/null
+++ b/thrust/thrust/system/omp/pointer.h
@@ -0,0 +1,360 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/omp/memory.h
+ *  \brief Managing memory associated with Thrust's OpenMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+
+template<typename> class pointer;
+
+} // end omp
+} // end system
+} // end thrust
+
+
+/*! \cond
+ */
+
+// specialize thrust::iterator_traits to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type
+// do this before pointer is defined so the specialization is correctly
+// used inside the definition
+namespace thrust
+{
+
+template<typename Element>
+  struct iterator_traits<thrust::system::omp::pointer<Element> >
+{
+  private:
+    typedef thrust::system::omp::pointer<Element> ptr;
+
+  public:
+    typedef typename ptr::iterator_category       iterator_category;
+    typedef typename ptr::value_type              value_type;
+    typedef typename ptr::difference_type         difference_type;
+    typedef ptr                                   pointer;
+    typedef typename ptr::reference               reference;
+}; // end iterator_traits
+
+} // end thrust
+
+/*! \endcond
+ */
+
+
+namespace thrust
+{
+namespace system
+{
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system::omp
+ *  \brief \p thrust::system::omp is the namespace containing functionality for allocating, manipulating,
+ *         and deallocating memory available to Thrust's OpenMP backend system.
+ *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
+ *         for import convenience but are also aliased in the top-level <tt>thrust::omp</tt>
+ *         namespace for easy access.
+ *
+ */
+namespace omp
+{
+
+// forward declaration of reference for pointer
+template<typename Element> class reference;
+
+/*! \cond
+ */
+
+// XXX nvcc + msvc have trouble instantiating reference below
+//     this is a workaround
+namespace detail
+{
+
+template<typename Element>
+  struct reference_msvc_workaround
+{
+  typedef thrust::system::omp::reference<Element> type;
+}; // end reference_msvc_workaround
+
+} // end detail
+
+/*! \endcond
+ */
+
+
+/*! \p pointer stores a pointer to an object allocated in memory available to the omp system.
+ *  This type provides type safety when dispatching standard algorithms on ranges resident
+ *  in omp memory.
+ *
+ *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *
+ *  \p pointer can be created with the function \p omp::malloc, or by explicitly calling its constructor
+ *  with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
+ *  or the \p raw_pointer_cast function.
+ *
+ *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
+ *  pointed to by \p pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see omp::malloc
+ *  \see omp::free
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+  class pointer
+    : public thrust::pointer<
+               T,
+               thrust::system::omp::tag,
+               thrust::system::omp::reference<T>,
+               thrust::system::omp::pointer<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::pointer<
+      T,
+      thrust::system::omp::tag,
+      //thrust::system::omp::reference<T>,
+      typename detail::reference_msvc_workaround<T>::type,
+      thrust::system::omp::pointer<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    // note that omp::pointer's member functions need __host__ __device__
+    // to interoperate with nvcc + iterators' dereference member function
+
+    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
+     */
+    __host__ __device__
+    pointer() : super_t() {}
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer(decltype(nullptr)) : super_t(nullptr) {}
+    #endif
+
+    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
+     *
+     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
+     *         accessible by the \p omp system.
+     *  \tparam OtherT \p OtherT shall be convertible to \p T.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    explicit pointer(OtherT *ptr) : super_t(ptr) {}
+
+    /*! This constructor allows construction from another pointer-like object with related type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! This constructor allows construction from another pointer-like object with \p void type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::omp::tag and its element type shall be \p void.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    explicit
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! Assignment operator allows assigning from another pointer-like object with related type.
+     *
+     *  \param other The other pointer-like object to assign from.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer,
+      pointer &
+    >::type
+    operator=(const OtherPointer &other)
+    {
+      return super_t::operator=(other);
+    }
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer& operator=(decltype(nullptr))
+    {
+      super_t::operator=(nullptr);
+      return *this;
+    }
+    #endif
+}; // end pointer
+
+
+/*! \p reference is a wrapped reference to an object stored in memory available to the \p omp system.
+ *  \p reference is the type of the result of dereferencing a \p omp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template<typename T>
+  class reference
+    : public thrust::reference<
+               T,
+               thrust::system::omp::pointer<T>,
+               thrust::system::omp::reference<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::reference<
+      T,
+      thrust::system::omp::pointer<T>,
+      thrust::system::omp::reference<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    /*! \cond
+     */
+
+    typedef typename super_t::value_type value_type;
+    typedef typename super_t::pointer    pointer;
+
+    /*! \endcond
+     */
+
+    /*! This constructor initializes this \p reference to refer to an object
+     *  pointed to by the given \p pointer. After this \p reference is constructed,
+     *  it shall refer to the object pointed to by \p ptr.
+     *
+     *  \param ptr A \p pointer to copy from.
+     */
+    __host__ __device__
+    explicit reference(const pointer &ptr)
+      : super_t(ptr)
+    {}
+
+    /*! This constructor accepts a const reference to another \p reference of related type.
+     *  After this \p reference is constructed, it shall refer to the same object as \p other.
+     *
+     *  \param other A \p reference to copy from.
+     *  \tparam OtherT The element type of the other \p reference.
+     *
+     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
+     *        from <tt>reference<T></tt>.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    reference(const reference<OtherT> &other,
+              typename thrust::detail::enable_if_convertible<
+                typename reference<OtherT>::pointer,
+                pointer
+              >::type * = 0)
+      : super_t(other)
+    {}
+
+    /*! Copy assignment operator copy assigns from another \p reference of related type.
+     *
+     *  \param other The other \p reference to assign from.
+     *  \return <tt>*this</tt>
+     *  \tparam OtherT The element type of the other \p reference.
+     */
+    template<typename OtherT>
+    reference &operator=(const reference<OtherT> &other);
+
+    /*! Assignment operator assigns from a \p value_type.
+     *
+     *  \param x The \p value_type to assign from.
+     *  \return <tt>*this</tt>
+     */
+    reference &operator=(const value_type &x);
+}; // end reference
+
+/*! Exchanges the values of two objects referred to by \p reference.
+ *  \p x The first \p reference of interest.
+ *  \p y The second \p reference of interest.
+ */
+template<typename T>
+__host__ __device__
+void swap(reference<T> x, reference<T> y);
+
+} // end omp
+
+/*! \}
+ */
+
+} // end system
+
+/*! \namespace thrust::omp
+ *  \brief \p thrust::omp is a top-level alias for thrust::system::omp.
+ */
+namespace omp
+{
+
+using thrust::system::omp::pointer;
+using thrust::system::omp::reference;
+
+} // end omp
+
+} // end thrust
+
+#include <thrust/system/omp/detail/pointer.inl>
+
diff --git a/thrust/thrust/system/omp/vector.h b/thrust/thrust/system/omp/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..101a22c7b2059d69dce14809f8761cfba345315f
--- /dev/null
+++ b/thrust/thrust/system/omp/vector.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/omp/vector.h
+ *  \brief A dynamically-sizable array of elements which reside in memory available to
+ *         Thrust's OpenMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/memory.h>
+#include <thrust/detail/vector_base.h>
+#include <vector>
+
+namespace thrust
+{
+
+// forward declaration of host_vector
+// XXX why is this here? it doesn't seem necessary for anything below
+template<typename T, typename Allocator> class host_vector;
+
+namespace system
+{
+namespace omp
+{
+
+/*! \p omp::vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p omp::vector may vary dynamically; memory management is
+ *  automatic. The elements contained in an \p omp::vector reside in memory
+ *  available to the \p omp system.
+ *
+ *  \tparam T The element type of the \p omp::vector.
+ *  \tparam Allocator The allocator type of the \p omp::vector. Defaults to \p omp::allocator.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p omp::vector
+ *  \see device_vector
+ */
+template<typename T, typename Allocator = allocator<T> >
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+} // end omp
+} // end system
+
+// alias system::omp names at top-level
+namespace omp
+{
+
+using thrust::system::omp::vector;
+
+} // end omp
+
+} // end thrust
diff --git a/thrust/thrust/system/system_error.h b/thrust/thrust/system/system_error.h
new file mode 100644
index 0000000000000000000000000000000000000000..84e453dc662832acd28687126fc2a5f2a7db3d7a
--- /dev/null
+++ b/thrust/thrust/system/system_error.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file system/system_error.h
+ *  \brief An exception object used to report error conditions that have an
+ *         associated error code
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <stdexcept>
+#include <string>
+
+#include <thrust/system/error_code.h>
+
+namespace thrust
+{
+
+namespace system
+{
+
+// [19.5.5] Class system_error
+
+// [19.5.5.1] Class system_error overview
+
+/*! \addtogroup system_diagnostics System Diagnostics
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \brief The class \p system_error describes an exception object used to report error
+ *  conditions that have an associated \p error_code. Such error conditions typically
+ *  originate from the operating system or other low-level application program interfaces.
+ *
+ *  Thrust uses \p system_error to report the error codes returned from device backends
+ *  such as the CUDA runtime.
+ *
+ *  The following code listing demonstrates how to catch a \p system_error to recover
+ *  from an error.
+ *
+ *  \code
+ *
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/system.h>
+ *  #include <thrust/sort.h>
+ *
+ *  void terminate_gracefully(void)
+ *  {
+ *    // application-specific termination code here
+ *    ...
+ *  }
+ *
+ *  int main(void)
+ *  {
+ *    try
+ *    {
+ *      thrust::device_vector<float> vec;
+ *      thrust::sort(vec.begin(), vec.end());
+ *    }
+ *    catch(thrust::system_error e)
+ *    {
+ *      std::cerr << "Error inside sort: " << e.what() << std::endl;
+ *      terminate_gracefully();
+ *    }
+ *
+ *    return 0;
+ *  }
+ *
+ *  \endcode
+ *
+ *  \note If an error represents an out-of-memory condition, implementations are encouraged
+ *  to throw an exception object of type \p std::bad_alloc rather than \p system_error.
+ */
+class system_error
+  : public std::runtime_error
+{
+  public:
+    // [19.5.5.2] Class system_error members
+    
+    /*! Constructs an object of class \p system_error.
+     *  \param ec The value returned by \p code().
+     *  \param what_arg A string to include in the result returned by \p what().
+     *  \post <tt>code() == ec</tt>.
+     *  \post <tt>std::string(what()).find(what_arg) != string::npos</tt>.
+     */
+    inline system_error(error_code ec, const std::string &what_arg);
+
+    /*! Constructs an object of class \p system_error.
+     *  \param ec The value returned by \p code().
+     *  \param what_arg A string to include in the result returned by \p what().
+     *  \post <tt>code() == ec</tt>.
+     *  \post <tt>std::string(what()).find(what_arg) != string::npos</tt>.
+     */
+    inline system_error(error_code ec, const char *what_arg);
+
+    /*! Constructs an object of class \p system_error.
+     *  \param ec The value returned by \p code().
+     *  \post <tt>code() == ec</tt>.
+     */
+    inline system_error(error_code ec);
+
+    /*! Constructs an object of class \p system_error.
+     *  \param ev The error value used to create an \p error_code.
+     *  \param ecat The \p error_category used to create an \p error_code.
+     *  \param what_arg A string to include in the result returned by \p what().
+     *  \post <tt>code() == error_code(ev, ecat)</tt>.
+     *  \post <tt>std::string(what()).find(what_arg) != string::npos</tt>.
+     */
+    inline system_error(int ev, const error_category &ecat, const std::string &what_arg);
+
+    /*! Constructs an object of class \p system_error.
+     *  \param ev The error value used to create an \p error_code.
+     *  \param ecat The \p error_category used to create an \p error_code.
+     *  \param what_arg A string to include in the result returned by \p what().
+     *  \post <tt>code() == error_code(ev, ecat)</tt>.
+     *  \post <tt>std::string(what()).find(what_arg) != string::npos</tt>.
+     */
+    inline system_error(int ev, const error_category &ecat, const char *what_arg);
+
+    /*! Constructs an object of class \p system_error.
+     *  \param ev The error value used to create an \p error_code.
+     *  \param ecat The \p error_category used to create an \p error_code.
+     *  \post <tt>code() == error_code(ev, ecat)</tt>.
+     */
+    inline system_error(int ev, const error_category &ecat);
+
+    /*! Destructor does not throw.
+     */
+    inline virtual ~system_error(void) throw () {};
+    
+    /*! Returns an object encoding the error.
+     *  \return <tt>ec</tt> or <tt>error_code(ev, ecat)</tt>, from the
+     *          constructor, as appropriate.
+     */
+    inline const error_code &code(void) const throw();
+
+    /*! Returns a human-readable string indicating the nature of the error.
+     *  \return a string incorporating <tt>code().message()</tt> and the
+     *          arguments supplied in the constructor.
+     */
+    inline const char *what(void) const throw();
+
+    /*! \cond
+     */
+  private:
+    error_code          m_error_code;
+    mutable std::string m_what;
+
+    /*! \endcond
+     */
+}; // end system_error
+
+} // end system
+
+/*! \} // end system_diagnostics
+ */
+
+// import names into thrust::
+using system::system_error;
+
+} // end thrust
+
+#include <thrust/system/detail/system_error.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/adjacent_difference.h b/thrust/thrust/system/tbb/detail/adjacent_difference.h
new file mode 100644
index 0000000000000000000000000000000000000000..d22b4aac348c13fdafa9f03662c820d8fc3b377b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/adjacent_difference.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/system/detail/generic/adjacent_difference.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator adjacent_difference(execution_policy<DerivedPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     OutputIterator result,
+                                     BinaryFunction binary_op)
+{
+  // tbb prefers generic::adjacent_difference to cpp::adjacent_difference
+  return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op);
+} // end adjacent_difference()
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/assign_value.h b/thrust/thrust/system/tbb/detail/assign_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf244a02193211b9b4e4f07a6bc9b975d50e5388
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/assign_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits assign_value
+#include <thrust/system/cpp/detail/assign_value.h>
+
diff --git a/thrust/thrust/system/tbb/detail/binary_search.h b/thrust/thrust/system/tbb/detail/binary_search.h
new file mode 100644
index 0000000000000000000000000000000000000000..0847e5d1fdb3a446651897d62c959d56ad9dd1b9
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/binary_search.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits binary_search
+#include <thrust/system/cpp/detail/binary_search.h>
+
diff --git a/thrust/thrust/system/tbb/detail/copy.h b/thrust/thrust/system/tbb/detail/copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..7977768b02be1812799733462f1a162632a9c53f
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/copy.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputIterator result);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      Size n,
+                      OutputIterator result);
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/copy.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/copy.inl b/thrust/thrust/system/tbb/detail/copy.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0d96ad48b7e099087e89f5e14191b625143d84f6
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/copy.inl
@@ -0,0 +1,137 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/copy.h>
+#include <thrust/system/detail/generic/copy.h>
+#include <thrust/system/detail/sequential/copy.h>
+#include <thrust/detail/type_traits/minimum_type.h>
+#include <thrust/detail/copy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace dispatch
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::incrementable_traversal_tag)
+{
+  return thrust::system::detail::sequential::copy(exec, first, last, result);
+} // end copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+  OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      thrust::random_access_traversal_tag)
+{
+  return thrust::system::detail::generic::copy(exec, first, last, result);
+} // end copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::incrementable_traversal_tag)
+{
+  return thrust::system::detail::sequential::copy_n(exec, first, n, result);
+} // end copy_n()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+  OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                        InputIterator first,
+                        Size n,
+                        OutputIterator result,
+                        thrust::random_access_traversal_tag)
+{
+  return thrust::system::detail::generic::copy_n(exec, first, n, result);
+} // end copy_n()
+
+
+} // end dispatch
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+OutputIterator copy(execution_policy<DerivedPolicy> &exec,
+                    InputIterator first,
+                    InputIterator last,
+                    OutputIterator result)
+{
+  typedef typename thrust::iterator_traversal<InputIterator>::type  traversal1;
+  typedef typename thrust::iterator_traversal<OutputIterator>::type traversal2;
+  
+  typedef typename thrust::detail::minimum_type<traversal1,traversal2>::type traversal;
+
+  // dispatch on minimum traversal
+  return thrust::system::tbb::detail::dispatch::copy(exec,first,last,result,traversal());
+} // end copy()
+
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Size,
+         typename OutputIterator>
+OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      Size n,
+                      OutputIterator result)
+{
+  typedef typename thrust::iterator_traversal<InputIterator>::type  traversal1;
+  typedef typename thrust::iterator_traversal<OutputIterator>::type traversal2;
+  
+  typedef typename thrust::detail::minimum_type<traversal1,traversal2>::type traversal;
+
+  // dispatch on minimum traversal
+  return thrust::system::tbb::detail::dispatch::copy_n(exec,first,n,result,traversal());
+} // end copy_n()
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/copy_if.h b/thrust/thrust/system/tbb/detail/copy_if.h
new file mode 100644
index 0000000000000000000000000000000000000000..0420893ba642d3afa0f8370d0ac060290b636edd
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/copy_if.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(tag,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
+#include <thrust/system/tbb/detail/copy_if.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/copy_if.inl b/thrust/thrust/system/tbb/detail/copy_if.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9c074a9fcde491124757073f28c7ad638ca0cdae
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/copy_if.inl
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/tbb/detail/copy_if.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_scan.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace copy_if_detail
+{
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate,
+         typename Size>
+struct body
+{
+
+  InputIterator1 first;
+  InputIterator2 stencil;
+  OutputIterator result;
+  thrust::detail::wrapped_function<Predicate,bool> pred;
+  Size sum;
+
+  body(InputIterator1 first, InputIterator2 stencil, OutputIterator result, Predicate pred)
+    : first(first), stencil(stencil), result(result), pred(pred), sum(0)
+  {}
+
+  body(body& b, ::tbb::split)
+    : first(b.first), stencil(b.stencil), result(b.result), pred(b.pred), sum(0)
+  {}
+
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::pre_scan_tag)
+  {
+    InputIterator2 iter = stencil + r.begin();
+
+    for (Size i = r.begin(); i != r.end(); ++i, ++iter)
+    {
+      if (pred(*iter))
+        ++sum;
+    }
+  }
+  
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::final_scan_tag)
+  {
+    InputIterator1  iter1 = first   + r.begin();
+    InputIterator2  iter2 = stencil + r.begin();
+    OutputIterator  iter3 = result  + sum;
+      
+    for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2)
+    {
+      if (pred(*iter2))
+      {
+        *iter3 = *iter1;
+        ++sum;
+        ++iter3;
+      }
+    }
+  }
+
+  void reverse_join(body& b)
+  {
+    sum = b.sum + sum;
+  } 
+
+  void assign(body& b)
+  {
+    sum = b.sum;
+  } 
+}; // end body
+
+} // end copy_if_detail
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator copy_if(tag,
+                         InputIterator1 first,
+                         InputIterator1 last,
+                         InputIterator2 stencil,
+                         OutputIterator result,
+                         Predicate pred)
+{
+  typedef typename thrust::iterator_difference<InputIterator1>::type Size; 
+  typedef typename copy_if_detail::body<InputIterator1,InputIterator2,OutputIterator,Predicate,Size> Body;
+  
+  Size n = thrust::distance(first, last);
+
+  if (n != 0)
+  {
+    Body body(first, stencil, result, pred);
+    ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), body);
+    thrust::advance(result, body.sum);
+  }
+
+  return result;
+} // end copy_if()
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/count.h b/thrust/thrust/system/tbb/detail/count.h
new file mode 100644
index 0000000000000000000000000000000000000000..fde1728b77261d75c561b9042ec365281d78cee9
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/count.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits count
+#include <thrust/system/cpp/detail/count.h>
+
diff --git a/thrust/thrust/system/tbb/detail/equal.h b/thrust/thrust/system/tbb/detail/equal.h
new file mode 100644
index 0000000000000000000000000000000000000000..13398fc9db5a02ba7cd7d2141f106fa59ba2a941
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/equal.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits equal
+#include <thrust/system/cpp/detail/equal.h>
+
diff --git a/thrust/thrust/system/tbb/detail/execution_policy.h b/thrust/thrust/system/tbb/detail/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..1773f3c0611d8fd6cb311dbc9ad983c3ec10612a
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/execution_policy.h
@@ -0,0 +1,83 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+namespace system
+{
+// put the canonical tag in the same ns as the backend's entry points
+namespace tbb
+{
+namespace detail
+{
+
+// this awkward sequence of definitions arise
+// from the desire both for tag to derive
+// from execution_policy and for execution_policy
+// to convert to tag (when execution_policy is not
+// an ancestor of tag)
+
+// forward declaration of tag
+struct tag;
+
+// forward declaration of execution_policy
+template<typename> struct execution_policy;
+
+// specialize execution_policy for tag
+template<>
+  struct execution_policy<tag>
+    : thrust::system::cpp::detail::execution_policy<tag>
+{};
+
+// tag's definition comes before the
+// generic definition of execution_policy
+struct tag : execution_policy<tag> {};
+
+// allow conversion to tag when it is not a successor
+template<typename Derived>
+  struct execution_policy
+    : thrust::system::cpp::detail::execution_policy<Derived>
+{
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
+};
+
+} // end detail
+
+// alias execution_policy and tag here
+using thrust::system::tbb::detail::execution_policy;
+using thrust::system::tbb::detail::tag;
+
+} // end tbb
+} // end system
+
+// alias items at top-level
+namespace tbb
+{
+
+using thrust::system::tbb::execution_policy;
+using thrust::system::tbb::tag;
+
+} // end tbb
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/extrema.h b/thrust/thrust/system/tbb/detail/extrema.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0dd4c042b38bafb42d683e2f4f19bab3678a4b4
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/extrema.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/system/detail/generic/extrema.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator max_element(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // tbb prefers generic::max_element to cpp::max_element
+  return thrust::system::detail::generic::max_element(exec, first, last, comp);
+} // end max_element()
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+ForwardIterator min_element(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first, 
+                            ForwardIterator last,
+                            BinaryPredicate comp)
+{
+  // tbb prefers generic::min_element to cpp::min_element
+  return thrust::system::detail::generic::min_element(exec, first, last, comp);
+} // end min_element()
+
+template <typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
+thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<DerivedPolicy> &exec,
+                                                             ForwardIterator first, 
+                                                             ForwardIterator last,
+                                                             BinaryPredicate comp)
+{
+  // tbb prefers generic::minmax_element to cpp::minmax_element
+  return thrust::system::detail::generic::minmax_element(exec, first, last, comp);
+} // end minmax_element()
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
+
diff --git a/thrust/thrust/system/tbb/detail/fill.h b/thrust/thrust/system/tbb/detail/fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..6665a264873f6a0a775de0aa670ee7567d899ad9
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/fill.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits fill
+#include <thrust/system/cpp/detail/fill.h>
+
diff --git a/thrust/thrust/system/tbb/detail/find.h b/thrust/thrust/system/tbb/detail/find.h
new file mode 100644
index 0000000000000000000000000000000000000000..e07d322a87c2494a4eba62e92447b7b970112eb4
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/find.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/generic/find.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template <typename DerivedPolicy, typename InputIterator, typename Predicate>
+InputIterator find_if(execution_policy<DerivedPolicy> &exec,
+                      InputIterator first,
+                      InputIterator last,
+                      Predicate pred)
+{
+  // tbb prefers generic::find_if to cpp::find_if
+  return thrust::system::detail::generic::find_if(exec, first, last, pred);
+}
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/for_each.h b/thrust/thrust/system/tbb/detail/for_each.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5329b84ed273e60dacab576a559e351d26c42
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/for_each.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename UnaryFunction>
+  RandomAccessIterator for_each(execution_policy<DerivedPolicy> &exec,
+                                RandomAccessIterator first,
+                                RandomAccessIterator last,
+                                UnaryFunction f);
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename Size,
+         typename UnaryFunction>
+  RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &exec,
+                                  RandomAccessIterator first,
+                                  Size n,
+                                  UnaryFunction f);
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/for_each.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/for_each.inl b/thrust/thrust/system/tbb/detail/for_each.inl
new file mode 100644
index 0000000000000000000000000000000000000000..00e025ea0bea80217606857b5e837abced6b5b1c
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/for_each.inl
@@ -0,0 +1,100 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/system/detail/sequential/execution_policy.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace for_each_detail
+{
+
+template<typename RandomAccessIterator,
+         typename Size,
+         typename UnaryFunction>
+  struct body
+{
+  RandomAccessIterator m_first;
+  UnaryFunction m_f;
+
+  body(RandomAccessIterator first, UnaryFunction f)
+    : m_first(first), m_f(f)
+  {}
+
+  void operator()(const ::tbb::blocked_range<Size> &r) const
+  {
+    // we assume that blocked_range specifies a contiguous range of integers
+    thrust::for_each_n(thrust::system::detail::sequential::seq, m_first + r.begin(), r.size(), m_f);
+  } // end operator()()
+}; // end body
+
+
+template<typename Size, typename RandomAccessIterator, typename UnaryFunction>
+  body<RandomAccessIterator,Size,UnaryFunction>
+    make_body(RandomAccessIterator first, UnaryFunction f)
+{
+  return body<RandomAccessIterator,Size,UnaryFunction>(first, f);
+} // end make_body()
+
+
+} // end for_each_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename Size,
+         typename UnaryFunction>
+RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
+                                RandomAccessIterator first,
+                                Size n,
+                                UnaryFunction f)
+{
+  ::tbb::parallel_for(::tbb::blocked_range<Size>(0,n), for_each_detail::make_body<Size>(first,f));
+
+  // return the end of the range
+  return first + n;
+} // end for_each_n 
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename UnaryFunction>
+  RandomAccessIterator for_each(execution_policy<DerivedPolicy> &s,
+                                RandomAccessIterator first,
+                                RandomAccessIterator last,
+                                UnaryFunction f)
+{
+  return tbb::detail::for_each_n(s, first, thrust::distance(first,last), f);
+} // end for_each()
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/gather.h b/thrust/thrust/system/tbb/detail/gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..098e0f4fbad4001632ed02ee9e9b39aa17e54ea0
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/gather.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits gather
+#include <thrust/system/cpp/detail/gather.h>
+
diff --git a/thrust/thrust/system/tbb/detail/generate.h b/thrust/thrust/system/tbb/detail/generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..f907b6acc079577642c446d6f0736073defc44b8
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/generate.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits generate
+#include <thrust/system/cpp/detail/generate.h>
+
diff --git a/thrust/thrust/system/tbb/detail/get_value.h b/thrust/thrust/system/tbb/detail/get_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..23a11a8574f77f95bc6ca96d0cd8ff6de8c71c7e
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/get_value.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits get_value
+#include <thrust/system/cpp/detail/get_value.h>
+
diff --git a/thrust/thrust/system/tbb/detail/inner_product.h b/thrust/thrust/system/tbb/detail/inner_product.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8cf941a1dc3df1a6a516eee54f92fa610fd35cc
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/inner_product.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits inner_product
+#include <thrust/system/cpp/detail/inner_product.h>
+
diff --git a/thrust/thrust/system/tbb/detail/iter_swap.h b/thrust/thrust/system/tbb/detail/iter_swap.h
new file mode 100644
index 0000000000000000000000000000000000000000..a096739947c8854afe003e6bf498d89683854ff1
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/iter_swap.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits iter_swap
+#include <thrust/system/cpp/detail/iter_swap.h>
+
diff --git a/thrust/thrust/system/tbb/detail/logical.h b/thrust/thrust/system/tbb/detail/logical.h
new file mode 100644
index 0000000000000000000000000000000000000000..4199063183dbc38b79c7707bb8301e5ca8aa6ad5
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/logical.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits logical
+#include <thrust/system/cpp/detail/logical.h>
+
diff --git a/thrust/thrust/system/tbb/detail/malloc_and_free.h b/thrust/thrust/system/tbb/detail/malloc_and_free.h
new file mode 100644
index 0000000000000000000000000000000000000000..01ab1e6dbe1732da1f8606b7a9121c1b404edb6f
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/malloc_and_free.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits malloc and free
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+
diff --git a/thrust/thrust/system/tbb/detail/memory.inl b/thrust/thrust/system/tbb/detail/memory.inl
new file mode 100644
index 0000000000000000000000000000000000000000..216480d59b44da92745927421589385c8558c721
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/memory.inl
@@ -0,0 +1,86 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/system/tbb/memory.h>
+#include <thrust/system/cpp/memory.h>
+#include <limits>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+
+namespace detail
+{
+
+// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc
+//     is not defined
+//     WAR the problem by using adl to call cpp::malloc, which requires it to depend
+//     on a template parameter
+template<typename Tag>
+  pointer<void> malloc_workaround(Tag t, std::size_t n)
+{
+  return pointer<void>(malloc(t, n));
+} // end malloc_workaround()
+
+// XXX circular #inclusion problems cause the compiler to believe that cpp::free
+//     is not defined
+//     WAR the problem by using adl to call cpp::free, which requires it to depend
+//     on a template parameter
+template<typename Tag>
+  void free_workaround(Tag t, pointer<void> ptr)
+{
+  free(t, ptr.get());
+} // end free_workaround()
+
+} // end detail
+
+inline pointer<void> malloc(std::size_t n)
+{
+  // XXX this is how we'd like to implement this function,
+  //     if not for circular #inclusion problems:
+  //
+  // return pointer<void>(thrust::system::cpp::malloc(n))
+  //
+  return detail::malloc_workaround(cpp::tag(), n);
+} // end malloc()
+
+template<typename T>
+pointer<T> malloc(std::size_t n)
+{
+  pointer<void> raw_ptr = thrust::system::tbb::malloc(sizeof(T) * n);
+  return pointer<T>(reinterpret_cast<T*>(raw_ptr.get()));
+} // end malloc()
+
+inline void free(pointer<void> ptr)
+{
+  // XXX this is how we'd like to implement this function,
+  //     if not for circular #inclusion problems:
+  //
+  // thrust::system::cpp::free(ptr)
+  //
+  detail::free_workaround(cpp::tag(), ptr);
+} // end free()
+
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/merge.h b/thrust/thrust/system/tbb/detail/merge.h
new file mode 100644
index 0000000000000000000000000000000000000000..44608959ced1eff1a79da4dd8eef81979370ee29
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/merge.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+OutputIterator merge(execution_policy<ExecutionPolicy> &exec,
+                     InputIterator1 first1,
+                     InputIterator1 last1,
+                     InputIterator2 first2,
+                     InputIterator2 last2,
+                     OutputIterator result,
+                     StrictWeakOrdering comp);
+
+template <typename ExecutionPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename InputIterator3,
+          typename InputIterator4,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename StrictWeakOrdering>
+thrust::pair<OutputIterator1,OutputIterator2>
+  merge_by_key(execution_policy<ExecutionPolicy> &exec,
+               InputIterator1 keys_first1,
+               InputIterator1 keys_last1,
+               InputIterator2 keys_first2,
+               InputIterator2 keys_last2,
+               InputIterator3 values_first3,
+               InputIterator4 values_first4,
+               OutputIterator1 keys_result,
+               OutputIterator2 values_result,
+               StrictWeakOrdering comp);
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
+#include <thrust/system/tbb/detail/merge.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/merge.inl b/thrust/thrust/system/tbb/detail/merge.inl
new file mode 100644
index 0000000000000000000000000000000000000000..bcc72854617f6f3734216be57abaff7243d94f8f
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/merge.inl
@@ -0,0 +1,286 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/merge.h>
+#include <thrust/binary_search.h>
+#include <thrust/detail/seq.h>
+#include <tbb/parallel_for.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace merge_detail
+{
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+struct range
+{
+  InputIterator1 first1, last1;
+  InputIterator2 first2, last2;
+  OutputIterator result;
+  StrictWeakOrdering comp;
+  size_t grain_size;
+
+  range(InputIterator1 first1, InputIterator1 last1,
+        InputIterator2 first2, InputIterator2 last2,
+        OutputIterator result,
+        StrictWeakOrdering comp,
+        size_t grain_size = 1024)
+    : first1(first1), last1(last1),
+      first2(first2), last2(last2),
+      result(result), comp(comp), grain_size(grain_size)
+  {}
+  
+  range(range& r, ::tbb::split)
+    : first1(r.first1), last1(r.last1),
+      first2(r.first2), last2(r.last2),
+      result(r.result), comp(r.comp), grain_size(r.grain_size)
+  {
+    // we can assume n1 and n2 are not both 0
+    size_t n1 = thrust::distance(first1, last1);
+    size_t n2 = thrust::distance(first2, last2);
+
+    InputIterator1 mid1 = first1;
+    InputIterator2 mid2 = first2;
+
+    if (n1 > n2)
+    {
+      mid1 += n1 / 2;
+      mid2 = thrust::lower_bound(thrust::seq, first2, last2, raw_reference_cast(*mid1), comp);
+    }
+    else
+    {
+      mid2 += n2 / 2;
+      mid1 = thrust::upper_bound(thrust::seq, first1, last1, raw_reference_cast(*mid2), comp);
+    }
+    
+    // set first range to [first1, mid1), [first2, mid2), result
+    r.last1 = mid1;
+    r.last2 = mid2;
+
+    // set second range to [mid1, last1), [mid2, last2), result + (mid1 - first1) + (mid2 - first2)
+    first1 = mid1;
+    first2 = mid2;
+    result += thrust::distance(r.first1, mid1) + thrust::distance(r.first2, mid2);
+  }
+
+  bool empty(void) const
+  {
+    return (first1 == last1) && (first2 == last2);
+  }
+
+  bool is_divisible(void) const
+  {
+    return static_cast<size_t>(thrust::distance(first1, last1) + thrust::distance(first2, last2)) > grain_size;
+  }
+};
+
+struct body
+{
+  template <typename Range>
+  void operator()(Range& r) const
+  {
+    thrust::merge(thrust::seq,
+                  r.first1, r.last1,
+                  r.first2, r.last2,
+                  r.result,
+                  r.comp);
+  }
+};
+
+} // end namespace merge_detail
+
+namespace merge_by_key_detail
+{
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakOrdering>
+struct range
+{
+  InputIterator1 keys_first1, keys_last1;
+  InputIterator2 keys_first2, keys_last2;
+  InputIterator3 values_first1;
+  InputIterator4 values_first2;
+  OutputIterator1 keys_result;
+  OutputIterator2 values_result;
+  StrictWeakOrdering comp;
+  size_t grain_size;
+
+  range(InputIterator1 keys_first1, InputIterator1 keys_last1,
+        InputIterator2 keys_first2, InputIterator2 keys_last2,
+        InputIterator3 values_first1,
+        InputIterator4 values_first2,
+        OutputIterator1 keys_result,
+        OutputIterator2 values_result,
+        StrictWeakOrdering comp,
+        size_t grain_size = 1024)
+    : keys_first1(keys_first1), keys_last1(keys_last1),
+      keys_first2(keys_first2), keys_last2(keys_last2),
+      values_first1(values_first1),
+      values_first2(values_first2),
+      keys_result(keys_result), values_result(values_result),
+      comp(comp), grain_size(grain_size)
+  {}
+  
+  range(range& r, ::tbb::split)
+    : keys_first1(r.keys_first1), keys_last1(r.keys_last1),
+      keys_first2(r.keys_first2), keys_last2(r.keys_last2),
+      values_first1(r.values_first1),
+      values_first2(r.values_first2),
+      keys_result(r.keys_result), values_result(r.values_result),
+      comp(r.comp), grain_size(r.grain_size)
+  {
+    // we can assume n1 and n2 are not both 0
+    size_t n1 = thrust::distance(keys_first1, keys_last1);
+    size_t n2 = thrust::distance(keys_first2, keys_last2);
+
+    InputIterator1 mid1 = keys_first1;
+    InputIterator2 mid2 = keys_first2;
+
+    if (n1 > n2)
+    {
+      mid1 += n1 / 2;
+      mid2 = thrust::lower_bound(thrust::seq, keys_first2, keys_last2, raw_reference_cast(*mid1), comp);
+    }
+    else
+    {
+      mid2 += n2 / 2;
+      mid1 = thrust::upper_bound(thrust::seq, keys_first1, keys_last1, raw_reference_cast(*mid2), comp);
+    }
+    
+    // set first range to [keys_first1, mid1), [keys_first2, mid2), keys_result, values_result
+    r.keys_last1 = mid1;
+    r.keys_last2 = mid2;
+
+    // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2) 
+    keys_first1 = mid1;
+    keys_first2 = mid2;
+    values_first1 += thrust::distance(r.keys_first1, mid1);
+    values_first2 += thrust::distance(r.keys_first2, mid2);
+    keys_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2);
+    values_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2);
+  }
+
+  bool empty(void) const
+  {
+    return (keys_first1 == keys_last1) && (keys_first2 == keys_last2);
+  }
+
+  bool is_divisible(void) const
+  {
+    return static_cast<size_t>(thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)) > grain_size;
+  }
+};
+
+struct body
+{
+  template <typename Range>
+  void operator()(Range& r) const
+  {
+    thrust::merge_by_key(thrust::seq,
+                         r.keys_first1, r.keys_last1,
+                         r.keys_first2, r.keys_last2,
+                         r.values_first1,
+                         r.values_first2,
+                         r.keys_result,
+                         r.values_result,
+                         r.comp);
+  }
+};
+
+} // end namespace merge_by_key_detail
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename StrictWeakOrdering>
+OutputIterator merge(execution_policy<DerivedPolicy> &exec,
+                     InputIterator1 first1,
+                     InputIterator1 last1,
+                     InputIterator2 first2,
+                     InputIterator2 last2,
+                     OutputIterator result,
+                     StrictWeakOrdering comp)
+{
+  typedef typename merge_detail::range<InputIterator1,InputIterator2,OutputIterator,StrictWeakOrdering> Range;
+  typedef          merge_detail::body                                                                   Body;
+  Range range(first1, last1, first2, last2, result, comp);
+  Body  body;
+
+  ::tbb::parallel_for(range, body);
+
+  thrust::advance(result, thrust::distance(first1, last1) + thrust::distance(first2, last2));
+
+  return result;
+} // end merge()
+
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename InputIterator3,
+          typename InputIterator4,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename StrictWeakOrdering>
+thrust::pair<OutputIterator1,OutputIterator2>
+  merge_by_key(execution_policy<DerivedPolicy> &exec,
+               InputIterator1 keys_first1,
+               InputIterator1 keys_last1,
+               InputIterator2 keys_first2,
+               InputIterator2 keys_last2,
+               InputIterator3 values_first3,
+               InputIterator4 values_first4,
+               OutputIterator1 keys_result,
+               OutputIterator2 values_result,
+               StrictWeakOrdering comp)
+{
+  typedef typename merge_by_key_detail::range<InputIterator1,InputIterator2,InputIterator3,InputIterator4,OutputIterator1,OutputIterator2,StrictWeakOrdering> Range;
+  typedef          merge_by_key_detail::body                                                                                                                  Body;
+
+  Range range(keys_first1, keys_last1, keys_first2, keys_last2, values_first3, values_first4, keys_result, values_result, comp);
+  Body  body;
+
+  ::tbb::parallel_for(range, body);
+
+  thrust::advance(keys_result,   thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2));
+  thrust::advance(values_result, thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2));
+
+  return thrust::make_pair(keys_result,values_result);
+}
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/mismatch.h b/thrust/thrust/system/tbb/detail/mismatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5c6b2c4bdd1cc242c87d7526a42e21bf4b1561c
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/mismatch.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits mismatch
+#include <thrust/system/cpp/detail/mismatch.h>
+
diff --git a/thrust/thrust/system/tbb/detail/par.h b/thrust/thrust/system/tbb/detail/par.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5d9c14cd7a91df6bcd00dcd13419d7e67155b03
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/par.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+struct par_t : thrust::system::tbb::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::tbb::detail::execution_policy>
+{
+  __host__ __device__
+  THRUST_CONSTEXPR par_t() : thrust::system::tbb::detail::execution_policy<par_t>() {}
+};
+
+
+} // end detail
+
+
+static const detail::par_t par;
+
+
+} // end tbb
+} // end system
+
+
+// alias par here
+namespace tbb
+{
+
+
+using thrust::system::tbb::par;
+
+
+} // end tbb
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/partition.h b/thrust/thrust/system/tbb/detail/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..80323535c9b0492af8411ad5c23f5edee1a0c906
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/partition.h
@@ -0,0 +1,87 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred);
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred);
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred);
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/partition.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/partition.inl b/thrust/thrust/system/tbb/detail/partition.inl
new file mode 100644
index 0000000000000000000000000000000000000000..5085ed906be07517c69af18a530bb68f35005fb4
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/partition.inl
@@ -0,0 +1,102 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/partition.h>
+#include <thrust/system/detail/generic/partition.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   Predicate pred)
+{
+  // tbb prefers generic::stable_partition to cpp::stable_partition
+  return thrust::system::detail::generic::stable_partition(exec, first, last, pred);
+} // end stable_partition()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator stable_partition(execution_policy<DerivedPolicy> &exec,
+                                   ForwardIterator first,
+                                   ForwardIterator last,
+                                   InputIterator stencil,
+                                   Predicate pred)
+{
+  // tbb prefers generic::stable_partition to cpp::stable_partition
+  return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred);
+} // end stable_partition()
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator first,
+                          InputIterator last,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy
+  return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename Predicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    stable_partition_copy(execution_policy<DerivedPolicy> &exec,
+                          InputIterator1 first,
+                          InputIterator1 last,
+                          InputIterator2 stencil,
+                          OutputIterator1 out_true,
+                          OutputIterator2 out_false,
+                          Predicate pred)
+{
+  // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy
+  return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred);
+} // end stable_partition_copy()
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/per_device_resource.h b/thrust/thrust/system/tbb/detail/per_device_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d61f92169e0e09c3821e59218f0dcbb70cbe5
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/thrust/system/tbb/detail/pointer.inl b/thrust/thrust/system/tbb/detail/pointer.inl
new file mode 100644
index 0000000000000000000000000000000000000000..2b21422bc6d4d202bb7ac5b18941222006374408
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/pointer.inl
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+
+template<typename T>
+  template<typename OtherT>
+    reference<T> &
+      reference<T>
+        ::operator=(const reference<OtherT> &other)
+{
+  return super_t::operator=(other);
+} // end reference::operator=()
+
+template<typename T>
+  reference<T> &
+    reference<T>
+      ::operator=(const value_type &x)
+{
+  return super_t::operator=(x);
+} // end reference::operator=()
+
+template<typename T>
+__host__ __device__
+void swap(reference<T> a, reference<T> b)
+{
+  a.swap(b);
+} // end swap()
+
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/reduce.h b/thrust/thrust/system/tbb/detail/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..7381da3822f1980f0806e6322f8aefc7dfb313d6
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reduce.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file reduce.h
+ *  \brief TBB implementation of reduce.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+                    InputIterator begin,
+                    InputIterator end,
+                    OutputType init,
+                    BinaryFunction binary_op);
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/reduce.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/reduce.inl b/thrust/thrust/system/tbb/detail/reduce.inl
new file mode 100644
index 0000000000000000000000000000000000000000..22a13f63d06295b91ca9119b9f11379c89d8553b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reduce.inl
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/function.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/reduce.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace reduce_detail
+{
+
+template<typename RandomAccessIterator,
+         typename OutputType,
+         typename BinaryFunction>
+struct body
+{
+  RandomAccessIterator first;
+  OutputType sum;
+  bool first_call;  // TBB can invoke operator() multiple times on the same body
+  thrust::detail::wrapped_function<BinaryFunction,OutputType> binary_op;
+
+  // note: we only initalize sum with init to avoid calling OutputType's default constructor
+  body(RandomAccessIterator first, OutputType init, BinaryFunction binary_op)
+    : first(first), sum(init), first_call(true), binary_op(binary_op)
+  {}
+
+  // note: we only initalize sum with b.sum to avoid calling OutputType's default constructor
+  body(body& b, ::tbb::split)
+    : first(b.first), sum(b.sum), first_call(true), binary_op(b.binary_op)
+  {}
+
+  template <typename Size>
+  void operator()(const ::tbb::blocked_range<Size> &r)
+  {
+    // we assume that blocked_range specifies a contiguous range of integers
+    
+    if (r.empty()) return; // nothing to do
+
+    RandomAccessIterator iter = first + r.begin();
+
+    OutputType temp = thrust::raw_reference_cast(*iter);
+
+    ++iter;
+
+    for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter)
+      temp = binary_op(temp, *iter);
+
+
+    if (first_call)
+    {
+      // first time body has been invoked
+      first_call = false;
+      sum = temp;
+    }
+    else
+    {
+      // body has been previously invoked, accumulate temp into sum
+      sum = binary_op(sum, temp);
+    }
+  } // end operator()()
+  
+  void join(body& b)
+  {
+    sum = binary_op(sum, b.sum);
+  }
+}; // end body
+
+} // end reduce_detail
+
+
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+                    InputIterator begin,
+                    InputIterator end,
+                    OutputType init,
+                    BinaryFunction binary_op)
+{
+  typedef typename thrust::iterator_difference<InputIterator>::type Size; 
+
+  Size n = thrust::distance(begin, end);
+
+  if (n == 0)
+  {
+    return init;
+  }
+  else
+  {
+    typedef typename reduce_detail::body<InputIterator,OutputType,BinaryFunction> Body;
+    Body reduce_body(begin, init, binary_op);
+    ::tbb::parallel_reduce(::tbb::blocked_range<Size>(0,n), reduce_body);
+    return binary_op(init, reduce_body.sum);
+  }
+}
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/reduce_by_key.h b/thrust/thrust/system/tbb/detail/reduce_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8e3b38c59093f309942cda0577580ee4c1df251
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reduce_by_key.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    reduce_by_key(execution_policy<DerivedPolicy> &exec,
+                  InputIterator1 keys_first, 
+                  InputIterator1 keys_last,
+                  InputIterator2 values_first,
+                  OutputIterator1 keys_output,
+                  OutputIterator2 values_output,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op);
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/reduce_by_key.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/reduce_by_key.inl b/thrust/thrust/system/tbb/detail/reduce_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..70933f3070e8c3a67d1d457e011fbd152bf85bab
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reduce_by_key.inl
@@ -0,0 +1,342 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/reduce_by_key.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/detail/seq.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/system/tbb/detail/reduce_intervals.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/range/tail_flags.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+
+#include <cassert>
+#include <thread>
+
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace reduce_by_key_detail
+{
+
+
+template<typename L, typename R>
+  inline L divide_ri(const L x, const R y)
+{
+  return (x + (y - 1)) / y;
+}
+
+
+template<typename InputIterator, typename BinaryFunction, typename OutputIterator = void>
+  struct partial_sum_type
+    : thrust::detail::eval_if<
+        thrust::detail::has_result_type<BinaryFunction>::value,
+        thrust::detail::result_type<BinaryFunction>,
+        thrust::detail::eval_if<
+          thrust::detail::is_output_iterator<OutputIterator>::value,
+          thrust::iterator_value<InputIterator>,
+          thrust::iterator_value<OutputIterator>
+        >
+      >
+{};
+
+
+template<typename InputIterator, typename BinaryFunction>
+  struct partial_sum_type<InputIterator,BinaryFunction,void>
+    : thrust::detail::eval_if<
+        thrust::detail::has_result_type<BinaryFunction>::value,
+        thrust::detail::result_type<BinaryFunction>,
+        thrust::iterator_value<InputIterator>
+      >
+{};
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+  thrust::pair<
+    InputIterator1,
+    thrust::pair<
+      typename thrust::iterator_value<InputIterator1>::type,
+      typename partial_sum_type<InputIterator2,BinaryFunction>::type
+    >
+  >
+    reduce_last_segment_backward(InputIterator1 keys_first,
+                                 InputIterator1 keys_last,
+                                 InputIterator2 values_first,
+                                 BinaryPredicate binary_pred,
+                                 BinaryFunction binary_op)
+{
+  typename thrust::iterator_difference<InputIterator1>::type n = keys_last - keys_first;
+
+  // reverse the ranges and consume from the end
+  thrust::reverse_iterator<InputIterator1> keys_first_r(keys_last);
+  thrust::reverse_iterator<InputIterator1> keys_last_r(keys_first);
+  thrust::reverse_iterator<InputIterator2> values_first_r(values_first + n);
+
+  typename thrust::iterator_value<InputIterator1>::type result_key = *keys_first_r;
+  typename partial_sum_type<InputIterator2,BinaryFunction>::type result_value = *values_first_r;
+
+  // consume the entirety of the first key's sequence
+  for(++keys_first_r, ++values_first_r;
+      (keys_first_r != keys_last_r) && binary_pred(*keys_first_r, result_key);
+      ++keys_first_r, ++values_first_r)
+  {
+    result_value = binary_op(result_value, *values_first_r);
+  }
+
+  return thrust::make_pair(keys_first_r.base(), thrust::make_pair(result_key, result_value));
+}
+
+
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate,
+         typename BinaryFunction>
+  thrust::tuple<
+    OutputIterator1,
+    OutputIterator2,
+    typename thrust::iterator_value<InputIterator1>::type,
+    typename partial_sum_type<InputIterator2,BinaryFunction>::type
+  >
+    reduce_by_key_with_carry(InputIterator1 keys_first, 
+                             InputIterator1 keys_last,
+                             InputIterator2 values_first,
+                             OutputIterator1 keys_output,
+                             OutputIterator2 values_output,
+                             BinaryPredicate binary_pred,
+                             BinaryFunction binary_op)
+{
+  // first, consume the last sequence to produce the carry
+  // XXX is there an elegant way to pose this such that we don't need to default construct carry?
+  thrust::pair<
+    typename thrust::iterator_value<InputIterator1>::type,
+    typename partial_sum_type<InputIterator2,BinaryFunction>::type
+  > carry;
+
+  thrust::tie(keys_last, carry) = reduce_last_segment_backward(keys_first, keys_last, values_first, binary_pred, binary_op);
+
+  // finish with sequential reduce_by_key
+  thrust::tie(keys_output, values_output) =
+    thrust::reduce_by_key(thrust::seq, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op);
+  
+  return thrust::make_tuple(keys_output, values_output, carry.first, carry.second);
+}
+
+
+template<typename Iterator>
+  bool interval_has_carry(size_t interval_idx, size_t interval_size, size_t num_intervals, Iterator tail_flags)
+{
+  // to discover whether the interval has a carry, look at the tail_flag corresponding to its last element 
+  // the final interval never has a carry by definition
+  return (interval_idx + 1 < num_intervals) ? !tail_flags[(interval_idx + 1) * interval_size - 1] : false;
+}
+
+
+template<typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename BinaryPredicate, typename BinaryFunction>
+  struct serial_reduce_by_key_body
+{
+  typedef typename thrust::iterator_difference<Iterator1>::type size_type;
+
+  Iterator1 keys_first;
+  Iterator2 values_first;
+  Iterator3 result_offset;
+  Iterator4 keys_result;
+  Iterator5 values_result;
+  Iterator6 carry_result;
+
+  size_type n;
+  size_type interval_size;
+  size_type num_intervals;
+
+  BinaryPredicate binary_pred;
+  BinaryFunction binary_op;
+
+  serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, size_type n, size_type interval_size, size_type num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op)
+    : keys_first(keys_first), values_first(values_first),
+      result_offset(result_offset),
+      keys_result(keys_result),
+      values_result(values_result),
+      carry_result(carry_result),
+      n(n),
+      interval_size(interval_size),
+      num_intervals(num_intervals),
+      binary_pred(binary_pred),
+      binary_op(binary_op)
+  {}
+
+  void operator()(const ::tbb::blocked_range<size_type> &r) const
+  {
+    assert(r.size() == 1);
+
+    const size_type interval_idx = r.begin();
+
+    const size_type offset_to_first = interval_size * interval_idx;
+    const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size);
+
+    Iterator1 my_keys_first     = keys_first    + offset_to_first;
+    Iterator1 my_keys_last      = keys_first    + offset_to_last;
+    Iterator2 my_values_first   = values_first  + offset_to_first;
+    Iterator3 my_result_offset  = result_offset + interval_idx;
+    Iterator4 my_keys_result    = keys_result   + *my_result_offset;
+    Iterator5 my_values_result  = values_result + *my_result_offset;
+    Iterator6 my_carry_result   = carry_result  + interval_idx;
+
+    // consume the rest of the interval with reduce_by_key
+    typedef typename thrust::iterator_value<Iterator1>::type key_type;
+    typedef typename partial_sum_type<Iterator2,BinaryFunction>::type value_type;
+
+    // XXX is there a way to pose this so that we don't require default construction of carry?
+    thrust::pair<key_type, value_type> carry;
+
+    thrust::tie(my_keys_result, my_values_result, carry.first, carry.second) =
+      reduce_by_key_with_carry(my_keys_first,
+                               my_keys_last,
+                               my_values_first,
+                               my_keys_result,
+                               my_values_result,
+                               binary_pred,
+                               binary_op);
+
+    // store to carry only when we actually have a carry
+    // store to my_keys_result & my_values_result otherwise
+    
+    // create tail_flags so we can check for a carry
+    thrust::detail::tail_flags<Iterator1,BinaryPredicate> flags = thrust::detail::make_tail_flags(keys_first, keys_first + n, binary_pred);
+
+    if(interval_has_carry(interval_idx, interval_size, num_intervals, flags.begin()))
+    {
+      // we can ignore the carry's key
+      // XXX because the carry result is uninitialized, we should copy construct
+      *my_carry_result = carry.second;
+    }
+    else
+    {
+      *my_keys_result = carry.first;
+      *my_values_result = carry.second;
+    }
+  }
+};
+
+
+template<typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename BinaryPredicate, typename BinaryFunction>
+  serial_reduce_by_key_body<Iterator1,Iterator2,Iterator3,Iterator4,Iterator5,Iterator6,BinaryPredicate,BinaryFunction>
+    make_serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, typename thrust::iterator_difference<Iterator1>::type n, size_t interval_size, size_t num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op)
+{
+  return serial_reduce_by_key_body<Iterator1,Iterator2,Iterator3,Iterator4,Iterator5,Iterator6,BinaryPredicate,BinaryFunction>(keys_first, values_first, result_offset, keys_result, values_result, carry_result, n, interval_size, num_intervals, binary_pred, binary_op);
+}
+
+
+} // end reduce_by_key_detail
+
+
+template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename BinaryPredicate, typename BinaryFunction>
+  thrust::pair<Iterator3,Iterator4>
+    reduce_by_key(thrust::tbb::execution_policy<DerivedPolicy> &exec,
+                  Iterator1 keys_first, Iterator1 keys_last, 
+                  Iterator2 values_first,
+                  Iterator3 keys_result,
+                  Iterator4 values_result,
+                  BinaryPredicate binary_pred,
+                  BinaryFunction binary_op)
+{
+
+  typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
+  difference_type n = keys_last - keys_first;
+  if(n == 0) return thrust::make_pair(keys_result, values_result);
+
+  // XXX this value is a tuning opportunity
+  const difference_type parallelism_threshold = 10000;
+
+  if(n < parallelism_threshold)
+  {
+    // don't bother parallelizing for small n
+    return thrust::reduce_by_key(thrust::seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
+  }
+
+  // count the number of processors
+  const unsigned int p = thrust::max<unsigned int>(1u, std::thread::hardware_concurrency());
+
+  // generate O(P) intervals of sequential work
+  // XXX oversubscribing is a tuning opportunity
+  const unsigned int subscription_rate = 1;
+  difference_type interval_size = thrust::min<difference_type>(parallelism_threshold, thrust::max<difference_type>(n, n / (subscription_rate * p)));
+  difference_type num_intervals = reduce_by_key_detail::divide_ri(n, interval_size);
+
+  // decompose the input into intervals of size N / num_intervals
+  // add one extra element to this vector to store the size of the entire result
+  thrust::detail::temporary_array<difference_type, DerivedPolicy> interval_output_offsets(0, exec, num_intervals + 1);
+
+  // first count the number of tail flags in each interval
+  thrust::detail::tail_flags<Iterator1,BinaryPredicate> tail_flags = thrust::detail::make_tail_flags(keys_first, keys_last, binary_pred);
+  thrust::system::tbb::detail::reduce_intervals(exec, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus<size_t>());
+  interval_output_offsets[0] = 0;
+
+  // scan the counts to get each body's output offset
+  thrust::inclusive_scan(thrust::seq,
+                         interval_output_offsets.begin() + 1, interval_output_offsets.end(), 
+                         interval_output_offsets.begin() + 1);
+
+  // do a reduce_by_key serially in each thread
+  // the final interval never has a carry by definition, so don't reserve space for it
+  typedef typename reduce_by_key_detail::partial_sum_type<Iterator2,BinaryFunction>::type carry_type;
+  thrust::detail::temporary_array<carry_type, DerivedPolicy> carries(0, exec, num_intervals - 1);
+
+  // force grainsize == 1 with simple_partioner()
+  ::tbb::parallel_for(::tbb::blocked_range<difference_type>(0, num_intervals, 1),
+    reduce_by_key_detail::make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op),
+    ::tbb::simple_partitioner());
+
+  difference_type size_of_result = interval_output_offsets[num_intervals];
+
+  // sequentially accumulate the carries
+  // note that the last interval does not have a carry
+  // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key
+  for(typename thrust::detail::temporary_array<carry_type,DerivedPolicy>::size_type i = 0; i < carries.size(); ++i)
+  {
+    // if our interval has a carry, then we need to sum the carry to the next interval's output offset
+    // if it does not have a carry, then we need to ignore carry_value[i]
+    if(reduce_by_key_detail::interval_has_carry(i, interval_size, num_intervals, tail_flags.begin()))
+    {
+      difference_type output_idx = interval_output_offsets[i+1];
+
+      values_result[output_idx] = binary_op(values_result[output_idx], carries[i]);
+    }
+  }
+
+  return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result);
+}
+
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/reduce_intervals.h b/thrust/thrust/system/tbb/detail/reduce_intervals.h
new file mode 100644
index 0000000000000000000000000000000000000000..88fefe43deffde15e32fe92c45d3b3047b2ba6aa
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reduce_intervals.h
@@ -0,0 +1,125 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/detail/seq.h>
+
+#include <tbb/parallel_for.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/system/cpp/memory.h>
+#include <thrust/reduce.h>
+#include <cassert>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace reduce_intervals_detail
+{
+
+
+template<typename L, typename R>
+  inline L divide_ri(const L x, const R y)
+{
+  return (x + (y - 1)) / y;
+}
+
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Size, typename BinaryFunction>
+  struct body
+{
+  RandomAccessIterator1 first;
+  RandomAccessIterator2 result;
+  Size n, interval_size;
+  BinaryFunction binary_op;
+
+  body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op)
+    : first(first), result(result), n(n), interval_size(interval_size), binary_op(binary_op)
+  {}
+
+  void operator()(const ::tbb::blocked_range<Size> &r) const
+  {
+    assert(r.size() == 1);
+
+    Size interval_idx = r.begin();
+
+    Size offset_to_first = interval_size * interval_idx;
+    Size offset_to_last = thrust::min(n, offset_to_first + interval_size);
+
+    RandomAccessIterator1 my_first = first + offset_to_first;
+    RandomAccessIterator1 my_last  = first + offset_to_last;
+
+    // carefully pass the init value for the interval with raw_reference_cast
+    typedef typename BinaryFunction::result_type sum_type;
+    result[interval_idx] =
+      thrust::reduce(thrust::seq, my_first + 1, my_last, sum_type(thrust::raw_reference_cast(*my_first)), binary_op);
+  }
+};
+
+
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Size, typename BinaryFunction>
+  body<RandomAccessIterator1,RandomAccessIterator2,Size,BinaryFunction>
+    make_body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op)
+{
+  return body<RandomAccessIterator1,RandomAccessIterator2,Size,BinaryFunction>(first, result, n, interval_size, binary_op);
+}
+
+
+} // end reduce_intervals_detail
+
+
+template<typename DerivedPolicy, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename BinaryFunction>
+  void reduce_intervals(thrust::tbb::execution_policy<DerivedPolicy> &,
+                        RandomAccessIterator1 first,
+                        RandomAccessIterator1 last,
+                        Size interval_size,
+                        RandomAccessIterator2 result,
+                        BinaryFunction binary_op)
+{
+  typename thrust::iterator_difference<RandomAccessIterator1>::type n = last - first;
+
+  Size num_intervals = reduce_intervals_detail::divide_ri(n, interval_size);
+
+  ::tbb::parallel_for(::tbb::blocked_range<Size>(0, num_intervals, 1), reduce_intervals_detail::make_body(first, result, Size(n), interval_size, binary_op), ::tbb::simple_partitioner());
+}
+
+
+template<typename DerivedPolicy, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
+  void reduce_intervals(thrust::tbb::execution_policy<DerivedPolicy> &exec,
+                        RandomAccessIterator1 first,
+                        RandomAccessIterator1 last,
+                        Size interval_size,
+                        RandomAccessIterator2 result)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
+
+  return thrust::system::tbb::detail::reduce_intervals(exec, first, last, interval_size, result, thrust::plus<value_type>());
+}
+
+
+} // end detail
+} // end tbb
+} // end system
+} // end thrust
+
diff --git a/thrust/thrust/system/tbb/detail/remove.h b/thrust/thrust/system/tbb/detail/remove.h
new file mode 100644
index 0000000000000000000000000000000000000000..49f70588d683a0079dc561ff8a6b0f7e6fbc8468
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/remove.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<ExecutionPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<ExecutionPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<ExecutionPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<ExecutionPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/remove.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/remove.inl b/thrust/thrust/system/tbb/detail/remove.inl
new file mode 100644
index 0000000000000000000000000000000000000000..0a937799d29828a8724455bf8dd068ac5e756bac
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/remove.inl
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/remove.h>
+#include <thrust/system/detail/generic/remove.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            Predicate pred)
+{
+  // tbb prefers generic::remove_if to cpp::remove_if
+  return thrust::system::detail::generic::remove_if(exec, first, last, pred);
+}
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename InputIterator,
+         typename Predicate>
+  ForwardIterator remove_if(execution_policy<DerivedPolicy> &exec,
+                            ForwardIterator first,
+                            ForwardIterator last,
+                            InputIterator stencil,
+                            Predicate pred)
+{
+  // tbb prefers generic::remove_if to cpp::remove_if
+  return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred);
+}
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // tbb prefers generic::remove_copy_if to cpp::remove_copy_if
+  return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred);
+}
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename Predicate>
+  OutputIterator remove_copy_if(execution_policy<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred)
+{
+  // tbb prefers generic::remove_copy_if to cpp::remove_copy_if
+  return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred);
+}
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/replace.h b/thrust/thrust/system/tbb/detail/replace.h
new file mode 100644
index 0000000000000000000000000000000000000000..95c5a14ba3df120019c9a5b6ed638db3f2555a5b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/replace.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits this algorithm
+#include <thrust/system/cpp/detail/scatter.h>
+
diff --git a/thrust/thrust/system/tbb/detail/reverse.h b/thrust/thrust/system/tbb/detail/reverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f3e0325e257c301215e62c690837433ae24c30c
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/reverse.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits reverse
+#include <thrust/system/cpp/detail/reverse.h>
+
diff --git a/thrust/thrust/system/tbb/detail/scan.h b/thrust/thrust/system/tbb/detail/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..32a05a5a6bd3a5be92bbd84c1bf4edb9e929abeb
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/scan.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file scan.h
+ *  \brief TBB implementations of scan functions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator inclusive_scan(tag,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                BinaryFunction binary_op);
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename T,
+         typename BinaryFunction>
+  OutputIterator exclusive_scan(tag,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T init,
+                                BinaryFunction binary_op);
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/scan.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/scan.inl b/thrust/thrust/system/tbb/detail/scan.inl
new file mode 100644
index 0000000000000000000000000000000000000000..613b02872203ed67c3a79ddaee80e543cbf63651
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/scan.inl
@@ -0,0 +1,259 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/scan.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/function.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/function_traits.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_scan.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace scan_detail
+{
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction,
+         typename ValueType>
+struct inclusive_body
+{
+  InputIterator input;
+  OutputIterator output;
+  thrust::detail::wrapped_function<BinaryFunction,ValueType> binary_op;
+  ValueType sum;
+  bool first_call;
+
+  inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType dummy)
+    : input(input), output(output), binary_op(binary_op), sum(dummy), first_call(true)
+  {}
+    
+  inclusive_body(inclusive_body& b, ::tbb::split)
+    : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true)
+  {}
+
+  template<typename Size> 
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::pre_scan_tag)
+  {
+    InputIterator iter = input + r.begin();
+ 
+    ValueType temp = *iter;
+
+    ++iter;
+
+    for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter)
+      temp = binary_op(temp, *iter);
+
+    if (first_call)
+      sum = temp;
+    else
+      sum = binary_op(sum, temp);
+      
+    first_call = false;
+  }
+  
+  template<typename Size> 
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::final_scan_tag)
+  {
+    InputIterator  iter1 = input  + r.begin();
+    OutputIterator iter2 = output + r.begin();
+
+    if (first_call)
+    {
+      *iter2 = sum = *iter1;
+      ++iter1;
+      ++iter2;
+      for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2)
+        *iter2 = sum = binary_op(sum, *iter1);
+    }
+    else
+    {
+      for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2)
+        *iter2 = sum = binary_op(sum, *iter1);
+    }
+
+    first_call = false;
+  }
+
+  void reverse_join(inclusive_body& b)
+  {
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
+  } 
+
+  void assign(inclusive_body& b)
+  {
+    sum = b.sum;
+  } 
+};
+
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction,
+         typename ValueType>
+struct exclusive_body
+{
+  InputIterator input;
+  OutputIterator output;
+  thrust::detail::wrapped_function<BinaryFunction,ValueType> binary_op;
+  ValueType sum;
+  bool first_call;
+
+  exclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init)
+    : input(input), output(output), binary_op(binary_op), sum(init), first_call(true)
+  {}
+    
+  exclusive_body(exclusive_body& b, ::tbb::split)
+    : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true)
+  {}
+
+  template<typename Size> 
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::pre_scan_tag)
+  {
+    InputIterator iter = input + r.begin();
+ 
+    ValueType temp = *iter;
+
+    ++iter;
+
+    for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter)
+      temp = binary_op(temp, *iter);
+
+    if (first_call && r.begin() > 0)
+      sum = temp;
+    else
+      sum = binary_op(sum, temp);
+      
+    first_call = false;
+  }
+  
+  template<typename Size> 
+  void operator()(const ::tbb::blocked_range<Size>& r, ::tbb::final_scan_tag)
+  {
+    InputIterator  iter1 = input  + r.begin();
+    OutputIterator iter2 = output + r.begin();
+
+    for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2)
+    {
+      ValueType temp = binary_op(sum, *iter1);
+      *iter2 = sum;
+      sum = temp;
+    }
+    
+    first_call = false;
+  }
+
+  void reverse_join(exclusive_body& b)
+  {
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
+  }
+
+  void assign(exclusive_body& b)
+  {
+    sum = b.sum;
+  } 
+};
+
+} // end scan_detail
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator inclusive_scan(tag,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                BinaryFunction binary_op)
+{
+  using namespace thrust::detail;
+
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
+  Size n = thrust::distance(first, last);
+
+  if (n != 0)
+  {
+    typedef typename scan_detail::inclusive_body<InputIterator,OutputIterator,BinaryFunction,ValueType> Body;
+    Body scan_body(first, result, binary_op, *first);
+    ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
+  }
+
+  thrust::advance(result, n);
+
+  return result;
+}
+
+template<typename InputIterator,
+         typename OutputIterator,
+         typename InitialValueType,
+         typename BinaryFunction>
+  OutputIterator exclusive_scan(tag,
+                                InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                InitialValueType init,
+                                BinaryFunction binary_op)
+{
+  using namespace thrust::detail;
+
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
+  Size n = thrust::distance(first, last);
+
+  if (n != 0)
+  {
+    typedef typename scan_detail::exclusive_body<InputIterator,OutputIterator,BinaryFunction,ValueType> Body;
+    Body scan_body(first, result, binary_op, init);
+    ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
+  }
+
+  thrust::advance(result, n);
+
+  return result;
+} 
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
diff --git a/thrust/thrust/system/tbb/detail/scan_by_key.h b/thrust/thrust/system/tbb/detail/scan_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b5fa36483c451bac93827b239c17fb7850e2ed1
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/scan_by_key.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits scan_by_key
+#include <thrust/system/cpp/detail/scan_by_key.h>
+
diff --git a/thrust/thrust/system/tbb/detail/scatter.h b/thrust/thrust/system/tbb/detail/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..95c5a14ba3df120019c9a5b6ed638db3f2555a5b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/scatter.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits this algorithm
+#include <thrust/system/cpp/detail/scatter.h>
+
diff --git a/thrust/thrust/system/tbb/detail/sequence.h b/thrust/thrust/system/tbb/detail/sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..c33b2d4333ce2ded0ffe73c23c20a80c5a35b928
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/sequence.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits sequence
+#include <thrust/system/cpp/detail/sequence.h>
+
diff --git a/thrust/thrust/system/tbb/detail/set_operations.h b/thrust/thrust/system/tbb/detail/set_operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..421fa8a4bd955706497d0c9b30614035ccbbc46f
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/set_operations.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits set_operations
+#include <thrust/system/cpp/detail/set_operations.h>
+
diff --git a/thrust/thrust/system/tbb/detail/sort.h b/thrust/thrust/system/tbb/detail/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..863189a1ea6bc39e5ae9c15a088cffab8060a1b9
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/sort.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  void stable_sort(execution_policy<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 keys_first,
+                          RandomAccessIterator1 keys_last,
+                          RandomAccessIterator2 values_first,
+                          StrictWeakOrdering comp);
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/sort.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/sort.inl b/thrust/thrust/system/tbb/detail/sort.inl
new file mode 100644
index 0000000000000000000000000000000000000000..907fa20892795594b6f5cb51200764434bb80ea8
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/sort.inl
@@ -0,0 +1,265 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/copy.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/merge.h>
+#include <thrust/sort.h>
+#include <thrust/detail/seq.h>
+#include <tbb/parallel_invoke.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+namespace sort_detail
+{
+
+
+// TODO tune this based on data type and comp
+const static int threshold = 128 * 1024;
+
+  
+template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename StrictWeakOrdering>
+void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace);
+
+
+template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename StrictWeakOrdering>
+struct merge_sort_closure
+{
+  execution_policy<DerivedPolicy> &exec;
+  Iterator1 first1, last1;
+  Iterator2 first2;
+  StrictWeakOrdering comp;
+  bool inplace;
+
+  merge_sort_closure(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace)
+    : exec(exec), first1(first1), last1(last1), first2(first2), comp(comp), inplace(inplace)
+  {}
+
+  void operator()(void) const
+  {
+    merge_sort(exec, first1, last1, first2, comp, inplace);
+  }
+};
+
+
+template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename StrictWeakOrdering>
+void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace)
+{
+  typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
+
+  difference_type n = thrust::distance(first1, last1);
+
+  if (n < threshold)
+  {
+    thrust::stable_sort(thrust::seq, first1, last1, comp);
+    
+    if(!inplace)
+    {
+      thrust::copy(thrust::seq, first1, last1, first2);
+    }
+
+    return;
+  }
+
+  Iterator1 mid1  = first1 + (n / 2);
+  Iterator2 mid2  = first2 + (n / 2);
+  Iterator2 last2 = first2 + n;
+
+  typedef merge_sort_closure<DerivedPolicy,Iterator1,Iterator2,StrictWeakOrdering> Closure;
+  
+  Closure left (exec, first1, mid1,  first2, comp, !inplace);
+  Closure right(exec, mid1,   last1, mid2,   comp, !inplace);
+
+  ::tbb::parallel_invoke(left, right);
+
+  if(inplace) thrust::merge(exec, first2, mid2, mid2, last2, first1, comp);
+  else	      thrust::merge(exec, first1, mid1, mid1, last1, first2, comp);
+}
+
+
+} // end namespace sort_detail
+
+
+namespace sort_by_key_detail
+{
+
+
+// TODO tune this based on data type and comp
+const static int threshold = 128 * 1024;
+
+  
+template<typename DerivedPolicy,
+         typename Iterator1,
+         typename Iterator2,
+         typename Iterator3,
+         typename Iterator4,
+         typename StrictWeakOrdering>
+void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                       Iterator1 first1,
+                       Iterator1 last1,
+                       Iterator2 first2,
+                       Iterator3 first3,
+                       Iterator4 first4,
+                       StrictWeakOrdering comp,
+                       bool inplace);
+
+
+template<typename DerivedPolicy,
+         typename Iterator1,
+         typename Iterator2,
+         typename Iterator3,
+         typename Iterator4,
+         typename StrictWeakOrdering>
+struct merge_sort_by_key_closure
+{
+  execution_policy<DerivedPolicy> &exec;
+  Iterator1 first1, last1;
+  Iterator2 first2;
+  Iterator3 first3;
+  Iterator4 first4;
+  StrictWeakOrdering comp;
+  bool inplace;
+
+  merge_sort_by_key_closure(execution_policy<DerivedPolicy> &exec,
+                            Iterator1 first1,
+                            Iterator1 last1,
+                            Iterator2 first2,
+                            Iterator3 first3,
+                            Iterator4 first4,
+                            StrictWeakOrdering comp,
+                            bool inplace)
+    : exec(exec), first1(first1), last1(last1), first2(first2), first3(first3), first4(first4), comp(comp), inplace(inplace)
+  {}
+
+  void operator()(void) const
+  {
+    merge_sort_by_key(exec, first1, last1, first2, first3, first4, comp, inplace);
+  }
+};
+
+
+template<typename DerivedPolicy,
+         typename Iterator1,
+         typename Iterator2,
+         typename Iterator3,
+         typename Iterator4,
+         typename StrictWeakOrdering>
+void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                       Iterator1 first1,
+                       Iterator1 last1,
+                       Iterator2 first2,
+                       Iterator3 first3,
+                       Iterator4 first4,
+                       StrictWeakOrdering comp,
+                       bool inplace)
+{
+  typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
+
+  difference_type n = thrust::distance(first1, last1);
+  
+  Iterator1 mid1  = first1 + (n / 2);
+  Iterator2 mid2  = first2 + (n / 2);
+  Iterator3 mid3  = first3 + (n / 2);
+  Iterator4 mid4  = first4 + (n / 2);
+  Iterator2 last2 = first2 + n;
+  Iterator3 last3 = first3 + n;
+
+  if (n < threshold)
+  {
+    thrust::stable_sort_by_key(thrust::seq, first1, last1, first2, comp);
+    
+    if(!inplace)
+    {
+      thrust::copy(thrust::seq, first1, last1, first3);
+      thrust::copy(thrust::seq, first2, last2, first4);
+    }
+
+    return;
+  }
+
+  typedef merge_sort_by_key_closure<DerivedPolicy,Iterator1,Iterator2,Iterator3,Iterator4,StrictWeakOrdering> Closure;
+  
+  Closure left (exec, first1, mid1,  first2, first3, first4, comp, !inplace);
+  Closure right(exec, mid1,   last1, mid2,   mid3,   mid4,   comp, !inplace);
+
+  ::tbb::parallel_invoke(left, right);
+
+  if(inplace)
+  {
+    thrust::merge_by_key(exec, first3, mid3, mid3, last3, first4, mid4, first1, first2, comp);
+  }
+  else
+  {
+    thrust::merge_by_key(exec, first1, mid1, mid1, last1, first2, mid2, first3, first4, comp);
+  }
+}
+
+
+} // end namespace sort_detail
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+void stable_sort(execution_policy<DerivedPolicy> &exec,
+                 RandomAccessIterator first,
+                 RandomAccessIterator last,
+                 StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator>::type key_type;
+
+  thrust::detail::temporary_array<key_type, DerivedPolicy> temp(exec, first, last);
+
+  sort_detail::merge_sort(exec, first, last, temp.begin(), comp, true);
+}
+
+
+template<typename DerivedPolicy,
+         typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename StrictWeakOrdering>
+  void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
+                          RandomAccessIterator1 first1,
+                          RandomAccessIterator1 last1,
+                          RandomAccessIterator2 first2,
+                          StrictWeakOrdering comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type key_type;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type val_type;
+
+  RandomAccessIterator2 last2 = first2 + thrust::distance(first1, last1);
+
+  thrust::detail::temporary_array<key_type, DerivedPolicy> temp1(exec, first1, last1);
+  thrust::detail::temporary_array<val_type, DerivedPolicy> temp2(exec, first2, last2);
+
+  sort_by_key_detail::merge_sort_by_key(exec, first1, last1, first2, temp1.begin(), temp2.begin(), comp, true);
+}
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/swap_ranges.h b/thrust/thrust/system/tbb/detail/swap_ranges.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a0b06cbe9e770e0a5b6475dbb5402c1563f383b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/swap_ranges.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// tbb inherits swap_ranges
+#include <thrust/system/cpp/detail/swap_ranges.h>
+
diff --git a/thrust/thrust/system/tbb/detail/tabulate.h b/thrust/thrust/system/tbb/detail/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea135c707064b7195e4a78efc15849ba431e9068
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/tabulate.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits tabulate
+#include <thrust/system/cpp/detail/tabulate.h>
+
diff --git a/thrust/thrust/system/tbb/detail/temporary_buffer.h b/thrust/thrust/system/tbb/detail/temporary_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2adfaf2810c67462e41f271e43ad0aff9cfbf75f
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/temporary_buffer.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special temporary buffer functions
+
diff --git a/thrust/thrust/system/tbb/detail/transform.h b/thrust/thrust/system/tbb/detail/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..20d606dfbeec6d376a138db500ec368d94efa748
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/transform.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// omp inherits transform
+#include <thrust/system/cpp/detail/transform.h>
+
diff --git a/thrust/thrust/system/tbb/detail/transform_reduce.h b/thrust/thrust/system/tbb/detail/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8736bd75d06e54d9158baeb2504162d75312885
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/transform_reduce.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits transform_reduce
+#include <thrust/system/cpp/detail/transform_reduce.h>
+
diff --git a/thrust/thrust/system/tbb/detail/transform_scan.h b/thrust/thrust/system/tbb/detail/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..75b075b6b16f063a1c5cda8893911d3f3c533f2d
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/transform_scan.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits transform_scan
+#include <thrust/system/cpp/detail/transform_scan.h>
+
diff --git a/thrust/thrust/system/tbb/detail/uninitialized_copy.h b/thrust/thrust/system/tbb/detail/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..bda06ac13e9ca1d14ee5e047986884f5207d3d2b
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/uninitialized_copy.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits uninitialized_copy
+#include <thrust/system/cpp/detail/uninitialized_copy.h>
+
diff --git a/thrust/thrust/system/tbb/detail/uninitialized_fill.h b/thrust/thrust/system/tbb/detail/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..764de876233a012e5a9de9113c5fb2dac7a22499
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/uninitialized_fill.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits uninitialized_fill
+#include <thrust/system/cpp/detail/uninitialized_fill.h>
+
diff --git a/thrust/thrust/system/tbb/detail/unique.h b/thrust/thrust/system/tbb/detail/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e46d2bb4897a54313b7190173bc295d4aba4502
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/unique.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename ExecutionPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  ForwardIterator unique(execution_policy<ExecutionPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred);
+
+
+template<typename ExecutionPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator unique_copy(execution_policy<ExecutionPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred);
+
+
+} // end namespace detail
+} // end namespace tbb 
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/unique.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/unique.inl b/thrust/thrust/system/tbb/detail/unique.inl
new file mode 100644
index 0000000000000000000000000000000000000000..4ee3c0d9a2501cf857f1f172ba95ed84a8628935
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/unique.inl
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/unique.h>
+#include <thrust/system/detail/generic/unique.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  ForwardIterator unique(execution_policy<DerivedPolicy> &exec,
+                         ForwardIterator first,
+                         ForwardIterator last,
+                         BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique to cpp::unique
+  return thrust::system::detail::generic::unique(exec,first,last,binary_pred);
+} // end unique()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+  OutputIterator unique_copy(execution_policy<DerivedPolicy> &exec,
+                             InputIterator first,
+                             InputIterator last,
+                             OutputIterator output,
+                             BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique_copy to cpp::unique_copy
+  return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred);
+} // end unique_copy()
+
+
+} // end namespace detail
+} // end namespace tbb 
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/detail/unique_by_key.h b/thrust/thrust/system/tbb/detail/unique_by_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ab8578407e1cd90aeaba982780b966b4aee013e
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/unique_by_key.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred);
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred);
+
+
+} // end namespace detail
+} // end namespace tbb 
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/tbb/detail/unique_by_key.inl>
+
diff --git a/thrust/thrust/system/tbb/detail/unique_by_key.inl b/thrust/thrust/system/tbb/detail/unique_by_key.inl
new file mode 100644
index 0000000000000000000000000000000000000000..9c1a150e1b3da244b4621b12cd37a3d74bf7a187
--- /dev/null
+++ b/thrust/thrust/system/tbb/detail/unique_by_key.inl
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/unique_by_key.h>
+#include <thrust/system/detail/generic/unique_by_key.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+namespace detail
+{
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(execution_policy<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique_by_key to cpp::unique_by_key
+  return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred);
+} // end unique_by_key()
+
+
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(execution_policy<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_output,
+                       OutputIterator2 values_output,
+                       BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique_by_key_copy to cpp::unique_by_key_copy
+  return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred);
+} // end unique_by_key_copy()
+
+
+} // end namespace detail
+} // end namespace tbb
+} // end namespace system
+} // end namespace thrust
+
diff --git a/thrust/thrust/system/tbb/execution_policy.h b/thrust/thrust/system/tbb/execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..18f68bfdc6c544ecf0ab9ad8562632ec73c8c95c
--- /dev/null
+++ b/thrust/thrust/system/tbb/execution_policy.h
@@ -0,0 +1,156 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+/*! \file thrust/system/tbb/execution_policy.h
+ *  \brief Execution policies for Thrust's TBB system.
+ */
+
+#include <thrust/detail/config.h>
+
+// get the execution policies definitions first
+#include <thrust/system/tbb/detail/execution_policy.h>
+
+// get the definition of par
+#include <thrust/system/tbb/detail/par.h>
+
+// now get all the algorithm definitions
+
+#include <thrust/system/tbb/detail/adjacent_difference.h>
+#include <thrust/system/tbb/detail/assign_value.h>
+#include <thrust/system/tbb/detail/binary_search.h>
+#include <thrust/system/tbb/detail/copy.h>
+#include <thrust/system/tbb/detail/copy_if.h>
+#include <thrust/system/tbb/detail/count.h>
+#include <thrust/system/tbb/detail/equal.h>
+#include <thrust/system/tbb/detail/extrema.h>
+#include <thrust/system/tbb/detail/fill.h>
+#include <thrust/system/tbb/detail/find.h>
+#include <thrust/system/tbb/detail/for_each.h>
+#include <thrust/system/tbb/detail/gather.h>
+#include <thrust/system/tbb/detail/generate.h>
+#include <thrust/system/tbb/detail/get_value.h>
+#include <thrust/system/tbb/detail/inner_product.h>
+#include <thrust/system/tbb/detail/iter_swap.h>
+#include <thrust/system/tbb/detail/logical.h>
+#include <thrust/system/tbb/detail/malloc_and_free.h>
+#include <thrust/system/tbb/detail/merge.h>
+#include <thrust/system/tbb/detail/mismatch.h>
+#include <thrust/system/tbb/detail/partition.h>
+#include <thrust/system/tbb/detail/reduce.h>
+#include <thrust/system/tbb/detail/reduce_by_key.h>
+#include <thrust/system/tbb/detail/remove.h>
+#include <thrust/system/tbb/detail/replace.h>
+#include <thrust/system/tbb/detail/reverse.h>
+#include <thrust/system/tbb/detail/scan.h>
+#include <thrust/system/tbb/detail/scan_by_key.h>
+#include <thrust/system/tbb/detail/scatter.h>
+#include <thrust/system/tbb/detail/sequence.h>
+#include <thrust/system/tbb/detail/set_operations.h>
+#include <thrust/system/tbb/detail/sort.h>
+#include <thrust/system/tbb/detail/swap_ranges.h>
+#include <thrust/system/tbb/detail/tabulate.h>
+#include <thrust/system/tbb/detail/transform.h>
+#include <thrust/system/tbb/detail/transform_reduce.h>
+#include <thrust/system/tbb/detail/transform_scan.h>
+#include <thrust/system/tbb/detail/uninitialized_copy.h>
+#include <thrust/system/tbb/detail/uninitialized_fill.h>
+#include <thrust/system/tbb/detail/unique.h>
+#include <thrust/system/tbb/detail/unique_by_key.h>
+
+
+// define these entities here for the purpose of Doxygenating them
+// they are actually defined elsewhere
+#if 0
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+
+/*! \addtogroup execution_policies
+ *  \{
+ */
+
+
+/*! \p thrust::tbb::execution_policy is the base class for all Thrust parallel execution
+ *  policies which are derived from Thrust's TBB backend system.
+ */
+template<typename DerivedPolicy>
+struct execution_policy : thrust::execution_policy<DerivedPolicy>
+{};
+
+
+/*! \p tbb::tag is a type representing Thrust's TBB backend system in C++'s type system.
+ *  Iterators "tagged" with a type which is convertible to \p tbb::tag assert that they may be
+ *  "dispatched" to algorithm implementations in the \p tbb system.
+ */
+struct tag : thrust::system::tbb::execution_policy<tag> { unspecified };
+
+
+/*! \p thrust::tbb::par is the parallel execution policy associated with Thrust's TBB
+ *  backend system.
+ *
+ *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may
+ *  directly target Thrust's TBB backend system by providing \p thrust::tbb::par as an algorithm
+ *  parameter.
+ *
+ *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such
+ *  as \p thrust::tbb::vector.
+ *
+ *  The type of \p thrust::tbb::par is implementation-defined.
+ *
+ *  The following code snippet demonstrates how to use \p thrust::tbb::par to explicitly dispatch an
+ *  invocation of \p thrust::for_each to the TBB backend system:
+ *
+ *  \code
+ *  #include <thrust/for_each.h>
+ *  #include <thrust/system/tbb/execution_policy.h>
+ *  #include <cstdio>
+ *
+ *  struct printf_functor
+ *  {
+ *    __host__ __device__
+ *    void operator()(int x)
+ *    {
+ *      printf("%d\n", x);
+ *    }
+ *  };
+ *  ...
+ *  int vec[3];
+ *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *
+ *  thrust::for_each(thrust::tbb::par, vec.begin(), vec.end(), printf_functor());
+ *
+ *  // 0 1 2 is printed to standard output in some unspecified order
+ *  \endcode
+ */
+static const unspecified par;
+
+
+/*! \}
+ */
+
+
+} // end tbb
+} // end system
+} // end thrust
+#endif
+
+
diff --git a/thrust/thrust/system/tbb/memory.h b/thrust/thrust/system/tbb/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..a680157006ba126b5ce7b87829bc697a7b7dfcf6
--- /dev/null
+++ b/thrust/thrust/system/tbb/memory.h
@@ -0,0 +1,99 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/tbb/memory.h
+ *  \brief Managing memory associated with Thrust's TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/memory_resource.h>
+#include <thrust/memory.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/mr/allocator.h>
+#include <ostream>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+/*! Allocates an area of memory available to Thrust's <tt>tbb</tt> system.
+ *  \param n Number of bytes to allocate.
+ *  \return A <tt>tbb::pointer<void></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>tbb::pointer<void></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>tbb::pointer<void></tt> returned by this function must be
+ *        deallocated with \p tbb::free.
+ *  \see tbb::free
+ *  \see std::malloc
+ */
+inline pointer<void> malloc(std::size_t n);
+
+/*! Allocates a typed area of memory available to Thrust's <tt>tbb</tt> system.
+ *  \param n Number of elements to allocate.
+ *  \return A <tt>tbb::pointer<T></tt> pointing to the beginning of the newly
+ *          allocated memory. A null <tt>tbb::pointer<T></tt> is returned if
+ *          an error occurs.
+ *  \note The <tt>tbb::pointer<T></tt> returned by this function must be
+ *        deallocated with \p tbb::free.
+ *  \see tbb::free
+ *  \see std::malloc
+ */
+template<typename T>
+inline pointer<T> malloc(std::size_t n);
+
+/*! Deallocates an area of memory previously allocated by <tt>tbb::malloc</tt>.
+ *  \param ptr A <tt>tbb::pointer<void></tt> pointing to the beginning of an area
+ *         of memory previously allocated with <tt>tbb::malloc</tt>.
+ *  \see tbb::malloc
+ *  \see std::free
+ */
+inline void free(pointer<void> ptr);
+
+/*! \p tbb::allocator is the default allocator used by the \p tbb system's containers such as
+ *  <tt>tbb::vector</tt> if no user-specified allocator is provided. \p tbb::allocator allocates
+ *  (deallocates) storage with \p tbb::malloc (\p tbb::free).
+ */
+template<typename T>
+using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
+
+} // end tbb
+
+/*! \}
+ */
+
+} // end system
+
+/*! \namespace thrust::tbb
+ *  \brief \p thrust::tbb is a top-level alias for thrust::system::tbb.
+ */
+namespace tbb
+{
+
+using thrust::system::tbb::malloc;
+using thrust::system::tbb::free;
+using thrust::system::tbb::allocator;
+
+} // end tbb
+
+} // end thrust
+
+#include <thrust/system/tbb/detail/memory.inl>
+
diff --git a/thrust/thrust/system/tbb/memory_resource.h b/thrust/thrust/system/tbb/memory_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..de664eb9374904d92984bb7626dbeb6d3d8e8df8
--- /dev/null
+++ b/thrust/thrust/system/tbb/memory_resource.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file tbb/memory_resource.h
+ *  \brief Memory resources for the TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/tbb/pointer.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::tbb::pointer<void>
+    > native_resource;
+}
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! The memory resource for the TBB system. Uses \p mr::new_delete_resource and tags it with \p tbb::pointer. */
+typedef detail::native_resource memory_resource;
+/*! An alias for \p tbb::memory_resource. */
+typedef detail::native_resource universal_memory_resource;
+/*! An alias for \p tbb::memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \}
+ */
+
+}
+}
+}
diff --git a/thrust/thrust/system/tbb/pointer.h b/thrust/thrust/system/tbb/pointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2912508a5191f8242c486be1c0c7c9038d9d9dc
--- /dev/null
+++ b/thrust/thrust/system/tbb/pointer.h
@@ -0,0 +1,354 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+template<typename> class pointer;
+
+} // end tbb
+} // end system
+} // end thrust
+
+
+/*! \cond
+ */
+
+// specialize thrust::iterator_traits to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type
+// do this before pointer is defined so the specialization is correctly
+// used inside the definition
+namespace thrust
+{
+
+template<typename Element>
+  struct iterator_traits<thrust::system::tbb::pointer<Element> >
+{
+  private:
+    typedef thrust::system::tbb::pointer<Element> ptr;
+
+  public:
+    typedef typename ptr::iterator_category       iterator_category;
+    typedef typename ptr::value_type              value_type;
+    typedef typename ptr::difference_type         difference_type;
+    typedef ptr                                   pointer;
+    typedef typename ptr::reference               reference;
+}; // end iterator_traits
+
+} // end thrust
+
+/*! \endcond
+ */
+
+
+namespace thrust
+{
+namespace system
+{
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system::tbb
+ *  \brief \p thrust::system::tbb is the namespace containing functionality for allocating, manipulating,
+ *         and deallocating memory available to Thrust's TBB backend system.
+ *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
+ *         for import convenience but are also aliased in the top-level <tt>thrust::tbb</tt>
+ *         namespace for easy access.
+ *
+ */
+namespace tbb
+{
+
+// forward declaration of reference for pointer
+template<typename Element> class reference;
+
+/*! \cond
+ */
+
+// XXX nvcc + msvc have trouble instantiating reference below
+//     this is a workaround
+namespace detail
+{
+
+template<typename Element>
+  struct reference_msvc_workaround
+{
+  typedef thrust::system::tbb::reference<Element> type;
+}; // end reference_msvc_workaround
+
+} // end detail
+
+/*! \endcond
+ */
+
+
+/*! \p pointer stores a pointer to an object allocated in memory available to the tbb system.
+ *  This type provides type safety when dispatching standard algorithms on ranges resident
+ *  in tbb memory.
+ *
+ *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *
+ *  \p pointer can be created with the function \p tbb::malloc, or by explicitly calling its constructor
+ *  with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
+ *  or the \p raw_pointer_cast function.
+ *
+ *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
+ *  pointed to by \p pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see tbb::malloc
+ *  \see tbb::free
+ *  \see raw_pointer_cast
+ */
+template<typename T>
+  class pointer
+    : public thrust::pointer<
+               T,
+               thrust::system::tbb::tag,
+               thrust::system::tbb::reference<T>,
+               thrust::system::tbb::pointer<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::pointer<
+      T,
+      thrust::system::tbb::tag,
+      //thrust::system::tbb::reference<T>,
+      typename detail::reference_msvc_workaround<T>::type,
+      thrust::system::tbb::pointer<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    // note that tbb::pointer's member functions need __host__ __device__
+    // to interoperate with nvcc + iterators' dereference member function
+
+    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
+     */
+    __host__ __device__
+    pointer() : super_t() {}
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer(decltype(nullptr)) : super_t(nullptr) {}
+    #endif
+
+    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
+     *
+     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
+     *         accessible by the \p tbb system.
+     *  \tparam OtherT \p OtherT shall be convertible to \p T.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    explicit pointer(OtherT *ptr) : super_t(ptr) {}
+
+    /*! This constructor allows construction from another pointer-like object with related type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_pointer_is_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! This constructor allows construction from another pointer-like object with \p void type.
+     *
+     *  \param other The \p OtherPointer to copy.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::tbb::tag and its element type shall be \p void.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    explicit
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer
+            >::type * = 0) : super_t(other) {}
+
+    /*! Assignment operator allows assigning from another pointer-like object with related type.
+     *
+     *  \param other The other pointer-like object to assign from.
+     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
+     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
+     */
+    template<typename OtherPointer>
+    __host__ __device__
+    typename thrust::detail::enable_if_pointer_is_convertible<
+      OtherPointer,
+      pointer,
+      pointer &
+    >::type
+    operator=(const OtherPointer &other)
+    {
+      return super_t::operator=(other);
+    }
+
+    #if THRUST_CPP_DIALECT >= 2011
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer& operator=(decltype(nullptr))
+    {
+      super_t::operator=(nullptr);
+      return *this;
+    }
+    #endif
+}; // end pointer
+
+
+/*! \p reference is a wrapped reference to an object stored in memory available to the \p tbb system.
+ *  \p reference is the type of the result of dereferencing a \p tbb::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template<typename T>
+  class reference
+    : public thrust::reference<
+               T,
+               thrust::system::tbb::pointer<T>,
+               thrust::system::tbb::reference<T>
+             >
+{
+  /*! \cond
+   */
+
+  private:
+    typedef thrust::reference<
+      T,
+      thrust::system::tbb::pointer<T>,
+      thrust::system::tbb::reference<T>
+    > super_t;
+
+  /*! \endcond
+   */
+
+  public:
+    /*! \cond
+     */
+
+    typedef typename super_t::value_type value_type;
+    typedef typename super_t::pointer    pointer;
+
+    /*! \endcond
+     */
+
+    /*! This constructor initializes this \p reference to refer to an object
+     *  pointed to by the given \p pointer. After this \p reference is constructed,
+     *  it shall refer to the object pointed to by \p ptr.
+     *
+     *  \param ptr A \p pointer to copy from.
+     */
+    __host__ __device__
+    explicit reference(const pointer &ptr)
+      : super_t(ptr)
+    {}
+
+    /*! This constructor accepts a const reference to another \p reference of related type.
+     *  After this \p reference is constructed, it shall refer to the same object as \p other.
+     *
+     *  \param other A \p reference to copy from.
+     *  \tparam OtherT The element type of the other \p reference.
+     *
+     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
+     *        from <tt>reference<T></tt>.
+     */
+    template<typename OtherT>
+    __host__ __device__
+    reference(const reference<OtherT> &other,
+              typename thrust::detail::enable_if_convertible<
+                typename reference<OtherT>::pointer,
+                pointer
+              >::type * = 0)
+      : super_t(other)
+    {}
+
+    /*! Copy assignment operator copy assigns from another \p reference of related type.
+     *
+     *  \param other The other \p reference to assign from.
+     *  \return <tt>*this</tt>
+     *  \tparam OtherT The element type of the other \p reference.
+     */
+    template<typename OtherT>
+    reference &operator=(const reference<OtherT> &other);
+
+    /*! Assignment operator assigns from a \p value_type.
+     *
+     *  \param x The \p value_type to assign from.
+     *  \return <tt>*this</tt>
+     */
+    reference &operator=(const value_type &x);
+}; // end reference
+
+/*! Exchanges the values of two objects referred to by \p reference.
+ *  \p x The first \p reference of interest.
+ *  \p y The second \p reference ot interest.
+ */
+template<typename T>
+__host__ __device__
+void swap(reference<T> x, reference<T> y);
+
+} // end tbb
+
+/*! \}
+ */
+
+} // end system
+
+/*! \namespace thrust::tbb
+ *  \brief \p thrust::tbb is a top-level alias for thrust::system::tbb.
+ */
+namespace tbb
+{
+
+using thrust::system::tbb::pointer;
+using thrust::system::tbb::reference;
+
+} // end tbb
+
+} // end thrust
+
+#include <thrust/system/tbb/detail/pointer.inl>
+
diff --git a/thrust/thrust/system/tbb/vector.h b/thrust/thrust/system/tbb/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e08c8cf0d905e80b7c2441dbdba515cb5d218fd
--- /dev/null
+++ b/thrust/thrust/system/tbb/vector.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/tbb/vector.h
+ *  \brief A dynamically-sizable array of elements which reside in memory available to
+ *         Thrust's TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/tbb/memory.h>
+#include <thrust/detail/vector_base.h>
+#include <vector>
+
+namespace thrust
+{
+namespace system
+{
+namespace tbb
+{
+
+/*! \p tbb::vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p tbb::vector may vary dynamically; memory management is
+ *  automatic. The elements contained in a \p tbb::vector reside in memory
+ *  available to the \p tbb system.
+ *
+ *  \tparam T The element type of the \p tbb::vector.
+ *  \tparam Allocator The allocator type of the \p tbb::vector. Defaults to \p tbb::allocator.
+ *
+ *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p tbb::vector
+ *  \see device_vector
+ */
+template<typename T, typename Allocator = allocator<T> >
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+} // end tbb
+} // end system
+
+// alias system::tbb names at top-level
+namespace tbb
+{
+
+using thrust::system::tbb::vector;
+
+} // end tbb
+
+} // end thrust
diff --git a/thrust/thrust/system_error.h b/thrust/thrust/system_error.h
new file mode 100644
index 0000000000000000000000000000000000000000..7119ac4b63c1c05687b064eb17d07be92ca1b074
--- /dev/null
+++ b/thrust/thrust/system_error.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system_error.h
+ *  \brief System diagnostics
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+/*! \addtogroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system
+ *  \brief \p thrust::system is the namespace which contains functionality for manipulating
+ *         memory specific to one of Thrust's backend systems. It also contains functionality
+ *         for reporting error conditions originating from the operating system or other
+ *         low-level application program interfaces such as the CUDA runtime.
+ *         They are provided in a separate namespace for import convenience but are
+ *         also aliased in the top-level \p thrust namespace for easy access.
+ */
+namespace system
+{
+} // end system
+
+/*! \} // end system
+ */
+
+} // end thrust
+
+#include <thrust/system/error_code.h>
+#include <thrust/system/system_error.h>
+
diff --git a/thrust/thrust/tabulate.h b/thrust/thrust/tabulate.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dcd2c9ee388056d338cfe689deb8ebbb70a96d3
--- /dev/null
+++ b/thrust/thrust/tabulate.h
@@ -0,0 +1,129 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file tabulate.h
+ *  \brief Fills a range with the tabulation of a function
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ *  \{
+ */
+
+
+/*! \p tabulate fills the range <tt>[first, last)</tt> with the value of a function applied to each
+ *     element's index.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, \p tabulate performs the assignment
+ *  <tt>*i = unary_op(i - first)</tt>.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the range.
+ *  \param last The end of the range.
+ *  \param unary_op The unary operation to apply.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers
+ *  using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/tabulate.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::tabulate(thrust::host, A, A + 10, thrust::negate<int>());
+ *  // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9}
+ *  \endcode
+ *
+ *  \see thrust::fill
+ *  \see thrust::generate
+ *  \see thrust::sequence
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename UnaryOperation>
+__host__ __device__
+  void tabulate(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op);
+
+
+/*! \p tabulate fills the range <tt>[first, last)</tt> with the value of a function applied to each
+ *     element's index.
+ *
+ *  For each iterator \c i in the range <tt>[first, last)</tt>, \p tabulate performs the assignment
+ *  <tt>*i = unary_op(i - first)</tt>.
+ *
+ *  \param first The beginning of the range.
+ *  \param last The end of the range.
+ *  \param unary_op The unary operation to apply.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
+ *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
+ *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers:
+ *
+ *  \code
+ *  #include <thrust/tabulate.h>
+ *  #include <thrust/functional.h>
+ *  ...
+ *  const int N = 10;
+ *  int A[N];
+ *  thrust::tabulate(A, A + 10, thrust::negate<int>());
+ *  // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9}
+ *  \endcode
+ *
+ *  \see thrust::fill
+ *  \see thrust::generate
+ *  \see thrust::sequence
+ */
+template<typename ForwardIterator, typename UnaryOperation>
+  void tabulate(ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op);
+
+
+/*! \} // end transformations
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/tabulate.inl>
+
diff --git a/thrust/thrust/transform.h b/thrust/thrust/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..86cda93e31d5ceda68769bf993b6f1df81df00d4
--- /dev/null
+++ b/thrust/thrust/transform.h
@@ -0,0 +1,725 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file thrust/transform.h
+ *  \brief Transforms input ranges using a function object
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup transformations
+ *  \ingroup algorithms
+ *  \{
+ */
+
+
+/*! This version of \p transform applies a unary function to each element
+ *  of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence.  Specifically, for each iterator 
+ *  <tt>i</tt> in the range [\p first, \p last) the operation 
+ *  <tt>op(*i)</tt> is performed and the result is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  [\p result, \p result + (\p last - \p first) ).  The input and
+ *  output sequences may coincide, resulting in an in-place transformation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The transformation operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform to negate a range in-place
+ *  using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::negate<int> op;
+ *
+ *  thrust::transform(thrust::host, data, data + 10, data, op); // in-place transformation
+ *
+ *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/transform.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+__host__ __device__
+  OutputIterator transform(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first, InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op);
+
+	
+/*! This version of \p transform applies a unary function to each element
+ *  of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence.  Specifically, for each iterator 
+ *  <tt>i</tt> in the range [\p first, \p last) the operation 
+ *  <tt>op(*i)</tt> is performed and the result is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  [\p result, \p result + (\p last - \p first) ).  The input and
+ *  output sequences may coincide, resulting in an in-place transformation.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ * 
+ *  thrust::negate<int> op;
+ *
+ *  thrust::transform(data, data + 10, data, op); // in-place transformation
+ *
+ *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/transform.html
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+  OutputIterator transform(InputIterator first, InputIterator last,
+                           OutputIterator result,
+                           UnaryFunction op);
+
+
+/*! This version of \p transform applies a binary function to each pair
+ *  of elements from two input sequences and stores the result in the
+ *  corresponding position in an output sequence.  Specifically, for
+ *  each iterator <tt>i</tt> in the range [\p first1, \p last1) and 
+ *  <tt>j = first + (i - first1)</tt> in the range [\p first2, \p last2)
+ *  the operation <tt>op(*i,*j)</tt> is performed and the result is 
+ *  assigned to <tt>*o</tt>,  where <tt>o</tt> is the corresponding
+ *  output iterator in the range [\p result, \p result + (\p last - \p first) ).
+ *  The input and output sequences may coincide, resulting in an 
+ *  in-place transformation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input sequence.
+ *  \param last1 The end of the first input sequence.
+ *  \param first2 The beginning of the second input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform to compute the sum of two
+ *  ranges using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int input1[6] = {-5,  0,  2,  3,  2,  4};
+ *  int input2[6] = { 3,  6, -2,  1,  2,  3};
+ *  int output[6];
+ * 
+ *  thrust::plus<int> op;
+ *
+ *  thrust::transform(thrust::host, input1, input1 + 6, input2, output, op);
+ *
+ *  // output is now {-2,  6,  0,  4,  4,  7};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/transform.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+__host__ __device__
+  OutputIterator transform(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1 first1, InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op);
+
+
+/*! This version of \p transform applies a binary function to each pair
+ *  of elements from two input sequences and stores the result in the
+ *  corresponding position in an output sequence.  Specifically, for
+ *  each iterator <tt>i</tt> in the range [\p first1, \p last1) and 
+ *  <tt>j = first + (i - first1)</tt> in the range [\p first2, \p last2)
+ *  the operation <tt>op(*i,*j)</tt> is performed and the result is 
+ *  assigned to <tt>*o</tt>,  where <tt>o</tt> is the corresponding
+ *  output iterator in the range [\p result, \p result + (\p last - \p first) ).
+ *  The input and output sequences may coincide, resulting in an 
+ *  in-place transformation.
+ *    
+ *  \param first1 The beginning of the first input sequence.
+ *  \param last1 The end of the first input sequence.
+ *  \param first2 The beginning of the second input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int input1[6] = {-5,  0,  2,  3,  2,  4};
+ *  int input2[6] = { 3,  6, -2,  1,  2,  3};
+ *  int output[6];
+ * 
+ *  thrust::plus<int> op;
+ *
+ *  thrust::transform(input1, input1 + 6, input2, output, op);
+ *
+ *  // output is now {-2,  6,  0,  4,  4,  7};
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/transform.html
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator,
+         typename BinaryFunction>
+  OutputIterator transform(InputIterator1 first1, InputIterator1 last1,
+                           InputIterator2 first2,
+                           OutputIterator result,
+                           BinaryFunction op);
+
+
+/*! This version of \p transform_if conditionally applies a unary function
+ *  to each element of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in the input sequence
+ *  satifies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first, last)</tt> the
+ *  predicate <tt>pred(*i)</tt> is evaluated. If this predicate
+ *  evaluates to \c true, the result of <tt>op(*i)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last - first) )</tt>. Otherwise, <tt>op(*i)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
+ *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if to negate the odd-valued
+ *  elements of a range using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10]    = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ *
+ *  struct is_odd
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x % 2;
+ *    }
+ *  };
+ * 
+ *  thrust::negate<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  // negate odd elements
+ *  thrust::transform_if(thrust::host, data, data + 10, data, op, is_odd()); // in-place transformation
+ *
+ *  // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator first, InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred);
+
+
+/*! This version of \p transform_if conditionally applies a unary function
+ *  to each element of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in the input sequence
+ *  satifies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first, last)</tt> the
+ *  predicate <tt>pred(*i)</tt> is evaluated. If this predicate
+ *  evaluates to \c true, the result of <tt>op(*i)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last - first) )</tt>. Otherwise, <tt>op(*i)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
+ *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10]    = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ *
+ *  struct is_odd
+ *  {
+ *    __host__ __device__
+ *    bool operator()(int x)
+ *    {
+ *      return x % 2;
+ *    }
+ *  };
+ * 
+ *  thrust::negate<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  // negate odd elements
+ *  thrust::transform_if(data, data + 10, data, op, is_odd()); // in-place transformation
+ *
+ *  // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename InputIterator,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator first, InputIterator last,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred);
+
+
+/*! This version of \p transform_if conditionally applies a unary function
+ *  to each element of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in a stencil sequence
+ *  satisfies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first, last)</tt> the
+ *  predicate <tt>pred(*s)</tt> is evaluated, where <tt>s</tt> is the corresponding input
+ *  iterator in the range <tt>[stencil, stencil + (last - first) )</tt>. If this predicate
+ *  evaluates to \c true, the result of <tt>op(*i)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last - first) )</tt>. Otherwise, <tt>op(*i)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[10]    = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ *  int stencil[10] = { 1, 0, 1,  0, 1, 0, 1,  0, 1, 0};
+ * 
+ *  thrust::negate<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  thrust::transform_if(thrust::host, data, data + 10, stencil, data, op, identity); // in-place transformation
+ *
+ *  // data is now {5, 0, -2, -3, -2,  4, 0, -1, -2,  8};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator1 first, InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred);
+
+
+/*! This version of \p transform_if conditionally applies a unary function
+ *  to each element of an input sequence and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in a stencil sequence
+ *  satisfies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first, last)</tt> the
+ *  predicate <tt>pred(*s)</tt> is evaluated, where <tt>s</tt> is the corresponding input
+ *  iterator in the range <tt>[stencil, stencil + (last - first) )</tt>. If this predicate
+ *  evaluates to \c true, the result of <tt>op(*i)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last - first) )</tt>. Otherwise, <tt>op(*i)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *    
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param op The tranformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int data[10]    = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
+ *  int stencil[10] = { 1, 0, 1,  0, 1, 0, 1,  0, 1, 0};
+ * 
+ *  thrust::negate<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  thrust::transform_if(data, data + 10, stencil, data, op, identity); // in-place transformation
+ *
+ *  // data is now {5, 0, -2, -3, -2,  4, 0, -1, -2,  8};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename ForwardIterator,
+         typename UnaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator1 first, InputIterator1 last,
+                               InputIterator2 stencil,
+                               ForwardIterator result,
+                               UnaryFunction op,
+                               Predicate pred);
+
+
+/*! This version of \p transform_if conditionally applies a binary function
+ *  to each pair of elements from two input sequences and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in a stencil sequence
+ *  satifies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first1, last1)</tt> and 
+ *  <tt>j = first2 + (i - first1)</tt> in the range <tt>[first2, first2 + (last1 - first1) )</tt>,
+ *  the predicate <tt>pred(*s)</tt> is evaluated, where <tt>s</tt> is the corresponding input
+ *  iterator in the range <tt>[stencil, stencil + (last1 - first1) )</tt>. If this predicate
+ *  evaluates to \c true, the result of <tt>binary_op(*i,*j)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last1 - first1) )</tt>. Otherwise, <tt>binary_op(*i,*j)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *    
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first1 The beginning of the first input sequence.
+ *  \param last1 The end of the first input sequence.
+ *  \param first2 The beginning of the second input sequence.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param binary_op The transformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int input1[6]  = {-5,  0,  2,  3,  2,  4};
+ *  int input2[6]  = { 3,  6, -2,  1,  2,  3};
+ *  int stencil[8] = { 1,  0,  1,  0,  1,  0};
+ *  int output[6];
+ * 
+ *  thrust::plus<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  thrust::transform_if(thrust::host, input1, input1 + 6, input2, stencil, output, op, identity);
+ *
+ *  // output is now {-2,  0,  0,  3,  4,  4};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+__host__ __device__
+  ForwardIterator transform_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                               InputIterator1 first1, InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred);
+
+
+/*! This version of \p transform_if conditionally applies a binary function
+ *  to each pair of elements from two input sequences and stores the result in the corresponding 
+ *  position in an output sequence if the corresponding position in a stencil sequence
+ *  satifies a predicate. Otherwise, the corresponding position in the
+ *  output sequence is not modified.
+ *
+ *  Specifically, for each iterator <tt>i</tt> in the range <tt>[first1, last1)</tt> and 
+ *  <tt>j = first2 + (i - first1)</tt> in the range <tt>[first2, first2 + (last1 - first1) )</tt>,
+ *  the predicate <tt>pred(*s)</tt> is evaluated, where <tt>s</tt> is the corresponding input
+ *  iterator in the range <tt>[stencil, stencil + (last1 - first1) )</tt>. If this predicate
+ *  evaluates to \c true, the result of <tt>binary_op(*i,*j)</tt> is assigned to <tt>*o</tt>,
+ *  where <tt>o</tt> is the corresponding output iterator in the range
+ *  <tt>[result, result + (last1 - first1) )</tt>. Otherwise, <tt>binary_op(*i,*j)</tt> is
+ *  not evaluated and no assignment occurs. The input and output sequences may coincide,
+ *  resulting in an in-place transformation.
+ *    
+ *  \param first1 The beginning of the first input sequence.
+ *  \param last1 The end of the first input sequence.
+ *  \param first2 The beginning of the second input sequence.
+ *  \param stencil The beginning of the stencil sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param binary_op The transformation operation.
+ *  \param pred The predicate operation.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *
+ *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_if:
+ *
+ *  \code
+ *  #include <thrust/transform.h>
+ *  #include <thrust/functional.h>
+ *  
+ *  int input1[6]  = {-5,  0,  2,  3,  2,  4};
+ *  int input2[6]  = { 3,  6, -2,  1,  2,  3};
+ *  int stencil[8] = { 1,  0,  1,  0,  1,  0};
+ *  int output[6];
+ * 
+ *  thrust::plus<int> op;
+ *  thrust::identity<int> identity;
+ *
+ *  thrust::transform_if(input1, input1 + 6, input2, stencil, output, op, identity);
+ *
+ *  // output is now {-2,  0,  0,  3,  4,  4};
+ *  \endcode
+ *
+ *  \see thrust::transform
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename ForwardIterator,
+         typename BinaryFunction,
+         typename Predicate>
+  ForwardIterator transform_if(InputIterator1 first1, InputIterator1 last1,
+                               InputIterator2 first2,
+                               InputIterator3 stencil,
+                               ForwardIterator result,
+                               BinaryFunction binary_op,
+                               Predicate pred);
+
+
+/*! \} // end transformations
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/transform.inl>
+
diff --git a/thrust/thrust/transform_reduce.h b/thrust/thrust/transform_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..32e172d1e1a818b791ec1e567b35dc4aba358d18
--- /dev/null
+++ b/thrust/thrust/transform_reduce.h
@@ -0,0 +1,198 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file transform_reduce.h
+ *  \brief Fused transform / reduction
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reductions
+ *  \{
+ *  \addtogroup transformed_reductions Transformed Reductions
+ *  \ingroup reductions
+ *  \{
+ */
+
+
+/*! \p transform_reduce fuses the \p transform and \p reduce operations.
+ *  \p transform_reduce is equivalent to performing a transformation defined by
+ *  \p unary_op into a temporary sequence and then performing \p reduce on the
+ *  transformed sequence. In most cases, fusing these two operations together is
+ *  more efficient, since fewer memory reads and writes are required.
+ *
+ *  \p transform_reduce performs a reduction on the transformation of the
+ *  sequence <tt>[first, last)</tt> according to \p unary_op. Specifically,
+ *  \p unary_op is applied to each element of the sequence and then the result
+ *  is reduced to a single value with \p binary_op using the initial value 
+ *  \p init.  Note that the transformation \p unary_op is not applied to 
+ *  the initial value \p init.  The order of reduction is not specified, 
+ *  so \p binary_op must be both commutative and associative. 
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param unary_op The function to apply to each element of the input sequence.
+ *  \param init The result is initialized to this value.
+ *  \param binary_op The reduction operation.
+ *  \return The result of the transformed reduction.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
+ *
+ *  The following code snippet demonstrates how to use \p transform_reduce
+ *  to compute the maximum value of the absolute value of the elements
+ *  of a range using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform_reduce.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/execution_policy.h>
+ *
+ *  template<typename T>
+ *  struct absolute_value : public unary_function<T,T>
+ *  {
+ *    __host__ __device__ T operator()(const T &x) const
+ *    {
+ *      return x < T(0) ? -x : x;
+ *    }
+ *  };
+ *
+ *  ...
+ *
+ *  int data[6] = {-1, 0, -2, -2, 1, -3};
+ *  int result = thrust::transform_reduce(thrust::host,
+ *                                        data, data + 6,
+ *                                        absolute_value<int>(),
+ *                                        0,
+ *                                        thrust::maximum<int>());
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see \c transform
+ *  \see \c reduce
+ */
+template<typename DerivedPolicy,
+         typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+__host__ __device__
+  OutputType transform_reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op);
+
+
+/*! \p transform_reduce fuses the \p transform and \p reduce operations.
+ *  \p transform_reduce is equivalent to performing a transformation defined by
+ *  \p unary_op into a temporary sequence and then performing \p reduce on the
+ *  transformed sequence. In most cases, fusing these two operations together is
+ *  more efficient, since fewer memory reads and writes are required.
+ *
+ *  \p transform_reduce performs a reduction on the transformation of the
+ *  sequence <tt>[first, last)</tt> according to \p unary_op. Specifically,
+ *  \p unary_op is applied to each element of the sequence and then the result
+ *  is reduced to a single value with \p binary_op using the initial value 
+ *  \p init.  Note that the transformation \p unary_op is not applied to 
+ *  the initial value \p init.  The order of reduction is not specified, 
+ *  so \p binary_op must be both commutative and associative. 
+ *
+ *  \param first The beginning of the sequence.
+ *  \param last The end of the sequence.
+ *  \param unary_op The function to apply to each element of the input sequence.
+ *  \param init The result is initialized to this value.
+ *  \param binary_op The reduction operation.
+ *  \return The result of the transformed reduction.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
+ *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
+ *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
+ *
+ *  The following code snippet demonstrates how to use \p transform_reduce
+ *  to compute the maximum value of the absolute value of the elements
+ *  of a range.
+ *
+ *  \code
+ *  #include <thrust/transform_reduce.h>
+ *  #include <thrust/functional.h>
+ *
+ *  template<typename T>
+ *  struct absolute_value : public unary_function<T,T>
+ *  {
+ *    __host__ __device__ T operator()(const T &x) const
+ *    {
+ *      return x < T(0) ? -x : x;
+ *    }
+ *  };
+ *
+ *  ...
+ *
+ *  int data[6] = {-1, 0, -2, -2, 1, -3};
+ *  int result = thrust::transform_reduce(data, data + 6,
+ *                                        absolute_value<int>(),
+ *                                        0,
+ *                                        thrust::maximum<int>());
+ *  // result == 3
+ *  \endcode
+ *
+ *  \see \c transform
+ *  \see \c reduce
+ */
+template<typename InputIterator, 
+         typename UnaryFunction, 
+         typename OutputType,
+         typename BinaryFunction>
+  OutputType transform_reduce(InputIterator first,
+                              InputIterator last,
+                              UnaryFunction unary_op,
+                              OutputType init,
+                              BinaryFunction binary_op);
+
+
+/*! \} // end transformed_reductions
+ *  \} // end reductions
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/transform_reduce.inl>
+
diff --git a/thrust/thrust/transform_scan.h b/thrust/thrust/transform_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bb883d541e3ebba6f5d577ec063e025bb786d15
--- /dev/null
+++ b/thrust/thrust/transform_scan.h
@@ -0,0 +1,324 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file transform_scan.h
+ *  \brief Fused transform / prefix-sum
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+/*! \addtogroup prefixsums Prefix Sums
+ *  \ingroup algorithms
+ *  \{
+ */
+	
+/*! \addtogroup transformed_prefixsums Transformed Prefix Sums
+ *  \ingroup prefixsums
+ *  \{
+ */
+
+
+/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan
+ *  operations.  \p transform_inclusive_scan is equivalent to performing a
+ *  tranformation defined by \p unary_op into a temporary sequence and then
+ *  performing an \p inclusive_scan on the tranformed sequence.  In most
+ *  cases, fusing these two operations together is more efficient, since
+ *  fewer memory reads and writes are required. In \p transform_inclusive_scan,
+ *  <tt>unary_op(\*first)</tt> is assigned to <tt>\*result</tt> and the result
+ *  of <tt>binary_op(unary_op(\*first), unary_op(\*(first + 1)))</tt> is
+ *  assigned to <tt>\*(result + 1)</tt>, and so on.  The transform scan
+ *  operation is permitted to be in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param unary_op The function used to tranform the input sequence.
+ *  \param binary_op The associatve operator used to 'sum' transformed values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
+ *                               is convertable to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_inclusive_scan using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform_scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::negate<int> unary_op;
+ *  thrust::plus<int> binary_op;
+ *
+ *  thrust::transform_inclusive_scan(thrust::host, data, data + 6, data, unary_op, binary_op); // in-place scan
+ *
+ *  // data is now {-1, -1, -3, -5, -6, -9}
+ *  \endcode
+ *
+ *  \see \p transform
+ *  \see \p inclusive_scan
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          AssociativeOperator binary_op);
+
+
+/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan
+ *  operations.  \p transform_inclusive_scan is equivalent to performing a
+ *  tranformation defined by \p unary_op into a temporary sequence and then
+ *  performing an \p inclusive_scan on the tranformed sequence.  In most
+ *  cases, fusing these two operations together is more efficient, since
+ *  fewer memory reads and writes are required. In \p transform_inclusive_scan,
+ *  <tt>unary_op(\*first)</tt> is assigned to <tt>\*result</tt> and the result
+ *  of <tt>binary_op(unary_op(\*first), unary_op(\*(first + 1)))</tt> is
+ *  assigned to <tt>\*(result + 1)</tt>, and so on.  The transform scan
+ *  operation is permitted to be in-place.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param unary_op The function used to tranform the input sequence.
+ *  \param binary_op The associatve operator used to 'sum' transformed values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
+ *                               is convertable to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_inclusive_scan
+ *
+ *  \code
+ *  #include <thrust/transform_scan.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::negate<int> unary_op;
+ *  thrust::plus<int> binary_op;
+ *
+ *  thrust::transform_inclusive_scan(data, data + 6, data, unary_op, binary_op); // in-place scan
+ *
+ *  // data is now {-1, -1, -3, -5, -6, -9}
+ *  \endcode
+ *
+ *  \see \p transform
+ *  \see \p inclusive_scan
+ *
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename AssociativeOperator>
+  OutputIterator transform_inclusive_scan(InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          AssociativeOperator binary_op);
+
+
+/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan
+ *  operations.  \p transform_exclusive_scan is equivalent to performing a
+ *  tranformation defined by \p unary_op into a temporary sequence and then
+ *  performing an \p exclusive_scan on the tranformed sequence.  In most
+ *  cases, fusing these two operations together is more efficient, since
+ *  fewer memory reads and writes are required. In 
+ *  \p transform_exclusive_scan, \p init is assigned to <tt>\*result</tt> 
+ *  and the result of <tt>binary_op(init, unary_op(\*first))</tt> is assigned
+ *  to <tt>\*(result + 1)</tt>, and so on.  The transform scan operation is 
+ *  permitted to be in-place.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param unary_op The function used to tranform the input sequence.
+ *  \param init The initial value of the \p exclusive_scan
+ *  \param binary_op The associatve operator used to 'sum' transformed values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
+ *                               is convertable to \c OutputIterator's \c value_type.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_exclusive_scan using the
+ *  \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/transform_scan.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::negate<int> unary_op;
+ *  thrust::plus<int> binary_op;
+ *
+ *  thrust::transform_exclusive_scan(thrust::host, data, data + 6, data, unary_op, 4, binary_op); // in-place scan
+ *
+ *  // data is now {4, 3, 3, 1, -1, -2}
+ *  \endcode
+ *
+ *  \see \p transform
+ *  \see \p exclusive_scan
+ *
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+__host__ __device__
+  OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          T init,
+                                          AssociativeOperator binary_op);
+
+
+/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan
+ *  operations.  \p transform_exclusive_scan is equivalent to performing a
+ *  tranformation defined by \p unary_op into a temporary sequence and then
+ *  performing an \p exclusive_scan on the tranformed sequence.  In most
+ *  cases, fusing these two operations together is more efficient, since
+ *  fewer memory reads and writes are required. In 
+ *  \p transform_exclusive_scan, \p init is assigned to <tt>\*result</tt> 
+ *  and the result of <tt>binary_op(init, unary_op(\*first))</tt> is assigned
+ *  to <tt>\*(result + 1)</tt>, and so on.  The transform scan operation is 
+ *  permitted to be in-place.
+ *
+ *  \param first The beginning of the input sequence.
+ *  \param last The end of the input sequence.
+ *  \param result The beginning of the output sequence.
+ *  \param unary_op The function used to tranform the input sequence.
+ *  \param init The initial value of the \p exclusive_scan
+ *  \param binary_op The associatve operator used to 'sum' transformed values.
+ *  \return The end of the output sequence.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
+ *                               is convertable to \c OutputIterator's \c value_type.
+ *  \tparam T is convertible to \c OutputIterator's \c value_type.
+ *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *                              and \c AssociativeOperator's \c result_type is
+ *                              convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p transform_exclusive_scan
+ *
+ *  \code
+ *  #include <thrust/transform_scan.h>
+ *  
+ *  int data[6] = {1, 0, 2, 2, 1, 3};
+ *
+ *  thrust::negate<int> unary_op;
+ *  thrust::plus<int> binary_op;
+ *
+ *  thrust::transform_exclusive_scan(data, data + 6, data, unary_op, 4, binary_op); // in-place scan
+ *
+ *  // data is now {4, 3, 3, 1, -1, -2}
+ *  \endcode
+ *
+ *  \see \p transform
+ *  \see \p exclusive_scan
+ *
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction,
+         typename T,
+         typename AssociativeOperator>
+  OutputIterator transform_exclusive_scan(InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator result,
+                                          UnaryFunction unary_op,
+                                          T init,
+                                          AssociativeOperator binary_op);
+
+
+/*! \} // end transformed_prefixsums
+ */
+
+
+/*! \} // end prefixsums
+ */
+
+	
+} // end namespace thrust
+
+#include <thrust/detail/transform_scan.inl>
+
diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h
new file mode 100644
index 0000000000000000000000000000000000000000..930f9032611d9f86caf9a50adb576f047eafd14d
--- /dev/null
+++ b/thrust/thrust/tuple.h
@@ -0,0 +1,585 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file tuple.h
+ *  \brief A type encapsulating a heterogeneous collection of elements
+ */
+
+/*
+ * Copyright (C) 1999, 2000 Jaakko Järvi (jaakko.jarvi@cs.utu.fi)
+ * 
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying NOTICE file for the complete license)
+ *
+ * For more information, see http://www.boost.org
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/tuple.inl>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup tuple
+ *  \{
+ */
+
+/*! \cond
+ */
+
+struct null_type;
+
+/*! \endcond
+ */
+
+/*! This metafunction returns the type of a
+ *  \p tuple's <tt>N</tt>th element.
+ *
+ *  \tparam N This parameter selects the element of interest.
+ *  \tparam T A \c tuple type of interest.
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<int N, class T>
+  struct tuple_element
+{
+  private:
+    typedef typename T::tail_type Next;
+
+  public:
+    /*! The result of this metafunction is returned in \c type.
+     */
+    typedef typename tuple_element<N-1, Next>::type type;
+}; // end tuple_element
+
+/*! This metafunction returns the number of elements
+ *  of a \p tuple type of interest.
+ *
+ *  \tparam T A \c tuple type of interest.
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<class T>
+  struct tuple_size
+{
+  /*! The result of this metafunction is returned in \c value.
+   */
+  static const int value = 1 + tuple_size<typename T::tail_type>::value;
+}; // end tuple_size
+
+// get function for non-const cons-lists, returns a reference to the element
+
+/*! The \p get function returns a reference to a \p tuple element of
+ *  interest.
+ *
+ *  \param t A reference to a \p tuple of interest.
+ *  \return A reference to \p t's <tt>N</tt>th element.
+ *
+ *  \tparam N The index of the element of interest.
+ *
+ *  The following code snippet demonstrates how to use \p get to print
+ *  the value of a \p tuple element.
+ *
+ *  \code
+ *  #include <thrust/tuple.h>
+ *  #include <iostream>
+ *  ...
+ *  thrust::tuple<int, const char *> t(13, "thrust");
+ *
+ *  std::cout << "The 1st value of t is " << thrust::get<0>(t) << std::endl;
+ *  \endcode
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::non_const_type
+get(detail::cons<HT, TT>& t);
+
+
+/*! The \p get function returns a \c const reference to a \p tuple element of
+ *  interest.
+ *
+ *  \param t A reference to a \p tuple of interest.
+ *  \return A \c const reference to \p t's <tt>N</tt>th element.
+ *
+ *  \tparam N The index of the element of interest.
+ *
+ *  The following code snippet demonstrates how to use \p get to print
+ *  the value of a \p tuple element.
+ *
+ *  \code
+ *  #include <thrust/tuple.h>
+ *  #include <iostream>
+ *  ...
+ *  thrust::tuple<int, const char *> t(13, "thrust");
+ *
+ *  std::cout << "The 1st value of t is " << thrust::get<0>(t) << std::endl;
+ *  \endcode
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<int N, class HT, class TT>
+__host__ __device__
+inline typename access_traits<
+                  typename tuple_element<N, detail::cons<HT, TT> >::type
+                >::const_type
+get(const detail::cons<HT, TT>& t);
+
+
+
+/*! \p tuple is a class template that can be instantiated with up to ten arguments.
+ *  Each template argument specifies the type of element in the \p tuple.
+ *  Consequently, tuples are heterogeneous, fixed-size collections of values. An
+ *  instantiation of \p tuple with two arguments is similar to an instantiation
+ *  of \p pair with the same two arguments. Individual elements of a \p tuple may
+ *  be accessed with the \p get function.
+ *
+ *  \tparam TN The type of the <tt>N</tt> \c tuple element. Thrust's \p tuple
+ *          type currently supports up to ten elements.
+ *
+ *  The following code snippet demonstrates how to create a new \p tuple object
+ *  and inspect and modify the value of its elements.
+ *
+ *  \code
+ *  #include <thrust/tuple.h>
+ *  #include <iostream>
+ *  ...
+ *  // create a tuple containing an int, a float, and a string
+ *  thrust::tuple<int, float, const char*> t(13, 0.1f, "thrust");
+ *
+ *  // individual members are accessed with the free function get
+ *  std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl; 
+ *
+ *  // or the member function get
+ *  std::cout << "The second element's value is " << t.get<1>() << std::endl;
+ *
+ *  // we can also modify elements with the same function
+ *  thrust::get<0>(t) += 10;
+ *  \endcode
+ *
+ *  \see pair
+ *  \see get
+ *  \see make_tuple
+ *  \see tuple_element
+ *  \see tuple_size
+ *  \see tie
+ */
+template <class T0, class T1, class T2, class T3, class T4,
+          class T5, class T6, class T7, class T8, class T9>
+  class tuple :
+    public detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+{
+  /*! \cond
+   */
+
+  private:
+  typedef typename detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type inherited;
+
+  /*! \endcond
+   */
+
+  public:
+  /*! \p tuple's no-argument constructor initializes each element.
+   */
+  inline __host__ __device__
+  tuple(void) {}
+
+  /*! \p tuple's one-argument constructor copy constructs the first element from the given parameter
+   *     and intializes all other elements.
+   *  \param t0 The value to assign to this \p tuple's first element.
+   */
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0)
+    : inherited(t0,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  /*! \p tuple's one-argument constructor copy constructs the first two elements from the given parameters
+   *     and intializes all other elements.
+   *  \param t0 The value to assign to this \p tuple's first element.
+   *  \param t1 The value to assign to this \p tuple's second element.
+   *  \note \p tuple's constructor has ten variants of this form, the rest of which are ommitted here for brevity.
+   */
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1)
+    : inherited(t0, t1,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  /*! \cond
+   */
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2)
+    : inherited(t0, t1, t2,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3)
+    : inherited(t0, t1, t2, t3,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4)
+    : inherited(t0, t1, t2, t3, t4,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4,
+        typename access_traits<T5>::parameter_type t5)
+    : inherited(t0, t1, t2, t3, t4, t5,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4,
+        typename access_traits<T5>::parameter_type t5,
+        typename access_traits<T6>::parameter_type t6)
+    : inherited(t0, t1, t2, t3, t4, t5, t6,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4,
+        typename access_traits<T5>::parameter_type t5,
+        typename access_traits<T6>::parameter_type t6,
+        typename access_traits<T7>::parameter_type t7)
+    : inherited(t0, t1, t2, t3, t4, t5, t6, t7,
+                static_cast<const null_type&>(null_type()),
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4,
+        typename access_traits<T5>::parameter_type t5,
+        typename access_traits<T6>::parameter_type t6,
+        typename access_traits<T7>::parameter_type t7,
+        typename access_traits<T8>::parameter_type t8)
+    : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8,
+                static_cast<const null_type&>(null_type())) {}
+
+  inline __host__ __device__ 
+  tuple(typename access_traits<T0>::parameter_type t0,
+        typename access_traits<T1>::parameter_type t1,
+        typename access_traits<T2>::parameter_type t2,
+        typename access_traits<T3>::parameter_type t3,
+        typename access_traits<T4>::parameter_type t4,
+        typename access_traits<T5>::parameter_type t5,
+        typename access_traits<T6>::parameter_type t6,
+        typename access_traits<T7>::parameter_type t7,
+        typename access_traits<T8>::parameter_type t8,
+        typename access_traits<T9>::parameter_type t9)
+    : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) {}
+
+
+  template<class U1, class U2>
+  inline __host__ __device__ 
+  tuple(const detail::cons<U1, U2>& p) : inherited(p) {}
+
+  __thrust_exec_check_disable__
+  template <class U1, class U2>
+  inline __host__ __device__ 
+  tuple& operator=(const detail::cons<U1, U2>& k)
+  {
+    inherited::operator=(k);
+    return *this;
+  }
+
+  /*! \endcond
+   */
+
+  /*! This assignment operator allows assigning the first two elements of this \p tuple from a \p pair.
+   *  \param k A \p pair to assign from.
+   */
+  __thrust_exec_check_disable__
+  template <class U1, class U2>
+  __host__ __device__ inline
+  tuple& operator=(const thrust::pair<U1, U2>& k) {
+    //BOOST_STATIC_ASSERT(length<tuple>::value == 2);// check_length = 2
+    this->head = k.first;
+    this->tail.head = k.second;
+    return *this;
+  }
+
+  /*! \p swap swaps the elements of two <tt>tuple</tt>s.
+   *
+   *  \param t The other <tt>tuple</tt> with which to swap.
+   */
+  inline __host__ __device__
+  void swap(tuple &t)
+  {
+    inherited::swap(t);
+  }
+};
+
+/*! \cond
+ */
+
+template <>
+class tuple<null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type, null_type>  :
+  public null_type
+{
+public:
+  typedef null_type inherited;
+};
+
+/*! \endcond
+ */
+
+
+/*! This version of \p make_tuple creates a new \c tuple object from a
+ *  single object.
+ *
+ *  \param t0 The object to copy from.
+ *  \return A \p tuple object with a single member which is a copy of \p t0.
+ */
+template<class T0>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0>::type
+    make_tuple(const T0& t0);
+
+/*! This version of \p make_tuple creates a new \c tuple object from two
+ *  objects.
+ *
+ *  \param t0 The first object to copy from.
+ *  \param t1 The second object to copy from.
+ *  \return A \p tuple object with two members which are copies of \p t0
+ *          and \p t1.
+ *
+ *  \note \p make_tuple has ten variants, the rest of which are omitted here
+ *        for brevity.
+ */
+template<class T0, class T1>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1>::type
+    make_tuple(const T0& t0, const T1& t1);
+
+/*! This version of \p tie creates a new \c tuple whose single element is
+ *  a reference which refers to this function's argument.
+ *
+ *  \param t0 The object to reference.
+ *  \return A \p tuple object with one member which is a reference to \p t0.
+ */
+template<typename T0>
+__host__ __device__ inline
+tuple<T0&> tie(T0& t0);
+
+/*! This version of \p tie creates a new \c tuple of references object which
+ *  refers to this function's arguments.
+ *
+ *  \param t0 The first object to reference.
+ *  \param t1 The second object to reference.
+ *  \return A \p tuple object with two members which are references to \p t0
+ *          and \p t1.
+ *
+ *  \note \p tie has ten variants, the rest of which are omitted here for
+ *           brevity.
+ */
+template<typename T0, typename T1>
+__host__ __device__ inline
+tuple<T0&,T1&> tie(T0& t0, T1& t1);
+
+/*! \p swap swaps the contents of two <tt>tuple</tt>s.
+ *
+ *  \param x The first \p tuple to swap.
+ *  \param y The second \p tuple to swap.
+ */
+template<
+  typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9,
+  typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9
+>
+inline __host__ __device__
+void swap(tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> &x,
+          tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> &y);
+
+
+
+/*! \cond
+ */
+
+template<class T0, class T1, class T2>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2);
+
+template<class T0, class T1, class T2, class T3>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3);
+
+template<class T0, class T1, class T2, class T3, class T4>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4);
+
+template<class T0, class T1, class T2, class T3, class T4, class T5>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5);
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6);
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7);
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7, T8>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8);
+
+template<class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9>
+__host__ __device__ inline
+  typename detail::make_tuple_mapper<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+    make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9);
+
+template<typename T0, typename T1, typename T2>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&> tie(T0 &t0, T1 &t1, T2 &t2);
+
+template<typename T0, typename T1, typename T2, typename T3>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8);
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+__host__ __device__ inline
+tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&> tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9);
+
+
+__host__ __device__ inline
+bool operator==(const null_type&, const null_type&);
+
+__host__ __device__ inline
+bool operator>=(const null_type&, const null_type&);
+
+__host__ __device__ inline
+bool operator<=(const null_type&, const null_type&);
+
+__host__ __device__ inline
+bool operator!=(const null_type&, const null_type&);
+
+__host__ __device__ inline
+bool operator<(const null_type&, const null_type&);
+
+__host__ __device__ inline
+bool operator>(const null_type&, const null_type&);
+
+/*! \endcond
+ */
+
+/*! \} // tuple
+ */
+
+/*! \} // utility
+ */
+
+} // end thrust
+
diff --git a/thrust/thrust/type_traits/integer_sequence.h b/thrust/thrust/type_traits/integer_sequence.h
new file mode 100644
index 0000000000000000000000000000000000000000..e28e4f95c03c38ab5b0f34edccce42625f7e7b8e
--- /dev/null
+++ b/thrust/thrust/type_traits/integer_sequence.h
@@ -0,0 +1,262 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright (c)      2018 NVIDIA Corporation
+//  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
+//
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \file integer_sequence.h
+ *  \brief C++14's \c integer_sequence and associated helper aliases plus some
+ *         extensions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <type_traits>
+#include <utility>
+#include <cstdint>
+#include <utility>
+
+namespace thrust
+{
+
+#if THRUST_CPP_DIALECT >= 2014
+
+// A compile-time sequence of integral constants of type T.
+template <typename T, T... Is>
+using integer_sequence = std::integer_sequence<T, Is...>;
+
+// A compile-time sequence of std::size_t constants.
+template <std::size_t... Is>
+using index_sequence = std::index_sequence<Is...>;
+
+// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
+template <typename T, std::size_t N>
+using make_integer_sequence = std::make_integer_sequence<T, N>;
+
+// Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
+template <std::size_t N>
+using make_index_sequence = std::make_index_sequence<N>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+#else // Older than C++14.
+
+// A compile-time sequence of integral constants of type T.
+template <typename T, T... Is>
+struct integer_sequence;
+
+// A compile-time sequence of std::size_t constants.
+template <std::size_t... Is>
+using index_sequence = integer_sequence<std::size_t, Is...>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+// Create a new integer_sequence containing the elements of Sequence0 followed
+// by the elements of Sequence1. Sequence0::size() is added to each element from
+// Sequence1 in the new sequence.
+template <typename Sequence0, typename Sequence1>
+  struct merge_and_renumber_integer_sequences_impl;
+template <typename Sequence0, typename Sequence1>
+  using merge_and_renumber_integer_sequences =
+      typename merge_and_renumber_integer_sequences_impl<
+          Sequence0, Sequence1
+      >::type;
+
+// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
+template <typename T, std::size_t N>
+  struct make_integer_sequence_impl;
+
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
+template <typename T, std::size_t N>
+using make_integer_sequence =
+  typename detail::make_integer_sequence_impl<T, N>::type;
+
+// Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
+template <std::size_t N>
+using make_index_sequence =
+  make_integer_sequence<std::size_t, N>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, T... Is>
+struct integer_sequence
+{
+  using type = integer_sequence;
+  using value_type = T;
+  using size_type = std::size_t;
+
+  __host__ __device__
+  static constexpr size_type size() noexcept
+  {
+    return sizeof...(Is);
+  }
+};
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename T, T... Is0, T... Is1>
+struct merge_and_renumber_integer_sequences_impl<
+  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
+>
+{
+  using type = integer_sequence<T, Is0..., (sizeof...(Is0) + Is1)...>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, std::size_t N>
+struct make_integer_sequence_impl
+{
+  using type = merge_and_renumber_integer_sequences<
+    make_integer_sequence<T, N / 2>
+  , make_integer_sequence<T, N - N / 2>
+  >;
+};
+
+template <typename T>
+struct make_integer_sequence_impl<T, 0>
+{
+  using type = integer_sequence<T>;
+};
+
+template <typename T>
+struct make_integer_sequence_impl<T, 1>
+{
+  using type = integer_sequence<T, 0>;
+};
+
+} // namespace detail
+
+#endif // THRUST_CPP_DIALECT >= 2014
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+// Create a new integer_sequence containing the elements of Sequence0 followed
+// by the elements of Sequence1. Sequence1::size() is added to each element from
+// Sequence0 in the new sequence.
+template <typename Sequence0, typename Sequence1>
+  struct merge_and_renumber_reversed_integer_sequences_impl;
+template <typename Sequence0, typename Sequence1>
+  using merge_and_renumber_reversed_integer_sequences =
+      typename merge_and_renumber_reversed_integer_sequences_impl<
+          Sequence0, Sequence1
+      >::type;
+
+// Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
+template <typename T, std::size_t N>
+struct make_reversed_integer_sequence_impl;
+
+// Add a new element to the front of an integer_sequence<>.
+template <typename T, T I, typename Sequence> 
+struct integer_sequence_push_front_impl;
+
+// Add a new element to the back of an integer_sequence<>.
+template <typename T, T I, typename Sequence> 
+struct integer_sequence_push_back_impl;
+
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
+template <typename T, std::size_t N>
+using make_reversed_integer_sequence =
+  typename detail::make_reversed_integer_sequence_impl<T, N>::type;
+
+// Create a new index_sequence with elements N - 1, N - 2, N - 3, ..., 0.
+template <std::size_t N>
+using make_reversed_index_sequence =
+  make_reversed_integer_sequence<std::size_t, N>;
+
+// Add a new element to the front of an integer_sequence<>.
+template <typename T, T I, typename Sequence> 
+using integer_sequence_push_front =
+  typename detail::integer_sequence_push_front_impl<T, I, Sequence>::type;
+
+// Add a new element to the back of an integer_sequence<>.
+template <typename T, T I, typename Sequence> 
+using integer_sequence_push_back =
+  typename detail::integer_sequence_push_back_impl<T, I, Sequence>::type;
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename T, T... Is0, T... Is1>
+struct merge_and_renumber_reversed_integer_sequences_impl<
+  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
+>
+{
+  using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, std::size_t N>
+struct make_reversed_integer_sequence_impl
+{
+  using type = merge_and_renumber_reversed_integer_sequences<
+      make_reversed_integer_sequence<T, N / 2>
+    , make_reversed_integer_sequence<T, N - N / 2>
+  >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct make_reversed_integer_sequence_impl<T, 0>
+{
+  using type = integer_sequence<T>;
+};
+
+template <typename T>
+struct make_reversed_integer_sequence_impl<T, 1>
+{
+  using type = integer_sequence<T, 0>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, T I0, T... Is> 
+struct integer_sequence_push_front_impl<T, I0, integer_sequence<T, Is...> >
+{
+  using type = integer_sequence<T, I0, Is...>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, T I0, T... Is> 
+struct integer_sequence_push_back_impl<T, I0, integer_sequence<T, Is...> >
+{
+  using type = integer_sequence<T, Is..., I0>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/type_traits/is_contiguous_iterator.h b/thrust/thrust/type_traits/is_contiguous_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e075bd28bd7141b162e97c226a07ebe582659ef
--- /dev/null
+++ b/thrust/thrust/type_traits/is_contiguous_iterator.h
@@ -0,0 +1,185 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file is_contiguous_iterator.h
+ *  \brief An extensible type trait for determining if an iterator satisifies
+ *         the <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+ *         requirements (e.g. is pointer-like).
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <iterator>
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER < 1916 // MSVC 2017 version 15.9
+  #include <vector>
+  #include <string>
+  #include <array>
+
+  #if THRUST_CPP_DIALECT >= 2017
+    #include <string_view>
+  #endif
+#endif
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename Iterator>
+struct is_contiguous_iterator_impl;
+
+} // namespace detail
+
+/// Unary metafunction returns \c true_type if \c Iterator satisfies
+/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+/// e.g. it points to elements that are contiguous in memory, and \c false_type
+/// otherwise.
+template <typename Iterator>
+#if THRUST_CPP_DIALECT >= 2011
+using is_contiguous_iterator =
+#else
+struct is_contiguous_iterator :
+#endif
+  detail::is_contiguous_iterator_impl<Iterator>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c Iterator satisfies
+/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+/// e.g. it points to elements that are contiguous in memory, and \c false
+/// otherwise.
+template <typename Iterator>
+constexpr bool is_contiguous_iterator_v = is_contiguous_iterator<Iterator>::value;
+#endif
+
+/// Customization point that can be customized to indicate that an iterator
+/// type \c Iterator satisfies
+/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+/// e.g. it points to elements that are contiguous in memory.
+template <typename Iterator>
+struct proclaim_contiguous_iterator : false_type {};
+
+/// Declares that the iterator \c Iterator is
+/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+/// by specializing `thrust::proclaim_contiguous_iterator`.
+#define THRUST_PROCLAIM_CONTIGUOUS_ITERATOR(Iterator)                         \
+  namespace thrust {                                                          \
+  template <>                                                                 \
+  struct proclaim_contiguous_iterator<Iterator> : ::thrust::true_type {};     \
+  } /* end namespace thrust */                                                \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename Iterator>
+struct is_libcxx_wrap_iter : false_type {};
+
+#if defined(_LIBCPP_VERSION)
+template <typename Iterator>
+struct is_libcxx_wrap_iter<
+  _VSTD::__wrap_iter<Iterator>
+> : true_type {};
+#endif
+
+template <typename Iterator>
+struct is_libstdcxx_normal_iterator : false_type {};
+
+#if defined(__GLIBCXX__)
+template <typename Iterator, typename Container>
+struct is_libstdcxx_normal_iterator<
+  ::__gnu_cxx::__normal_iterator<Iterator, Container>
+> : true_type {};
+#endif
+
+#if   _MSC_VER >= 1916 // MSVC 2017 version 15.9.
+template <typename Iterator>
+struct is_msvc_contiguous_iterator
+  : is_pointer<::std::_Unwrapped_t<Iterator> > {};
+#elif _MSC_VER >= 1700 // MSVC 2012.
+template <typename Iterator>
+struct is_msvc_contiguous_iterator : false_type {};
+
+template <typename Vector>
+struct is_msvc_contiguous_iterator<
+  ::std::_Vector_const_iterator<Vector>
+> : true_type {};
+
+template <typename Vector>
+struct is_msvc_contiguous_iterator<
+  ::std::_Vector_iterator<Vector>
+> : true_type {};
+
+template <typename String>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_const_iterator<String>
+> : true_type {};
+
+template <typename String>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_iterator<String>
+> : true_type {};
+
+template <typename T, std::size_t N>
+struct is_msvc_contiguous_iterator<
+  ::std::_Array_const_iterator<T, N>
+> : true_type {};
+
+template <typename T, std::size_t N>
+struct is_msvc_contiguous_iterator<
+  ::std::_Array_iterator<T, N>
+> : true_type {};
+
+#if THRUST_CPP_DIALECT >= 2017
+template <typename Traits>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_view_iterator<Traits>
+> : true_type {};
+#endif
+#else
+template <typename Iterator>
+struct is_msvc_contiguous_iterator : false_type {};
+#endif
+
+
+template <typename Iterator>
+struct is_contiguous_iterator_impl
+  : integral_constant<
+      bool
+    ,    is_pointer<Iterator>::value
+      || is_thrust_pointer<Iterator>::value
+      || is_libcxx_wrap_iter<Iterator>::value
+      || is_libstdcxx_normal_iterator<Iterator>::value
+      || is_msvc_contiguous_iterator<Iterator>::value
+      || proclaim_contiguous_iterator<Iterator>::value
+    >
+{};
+
+} // namespace detail
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/type_traits/is_execution_policy.h b/thrust/thrust/type_traits/is_execution_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f2f7ef80a702cacce5d544f4e7d58691a3b3d92
--- /dev/null
+++ b/thrust/thrust/type_traits/is_execution_policy.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+/// Unary metafunction that is \c true if \c T is an \a ExecutionPolicy and
+/// \c false otherwise.
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_execution_policy =
+#else
+struct is_execution_policy :
+#endif
+  detail::is_base_of<detail::execution_policy_marker, T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+/// <CODE>constexpr bool</CODE> that is \c true if \c T is an \a ExecutionPolicy
+/// and \c false otherwise.
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T>
+constexpr bool is_execution_policy_v = is_execution_policy<T>::value;
+#endif
+
+} // end namespace thrust
+
+
diff --git a/thrust/thrust/type_traits/is_operator_less_or_greater_function_object.h b/thrust/thrust/type_traits/is_operator_less_or_greater_function_object.h
new file mode 100644
index 0000000000000000000000000000000000000000..6efc002238127a2cd34a0c9189663f0245e5ef17
--- /dev/null
+++ b/thrust/thrust/type_traits/is_operator_less_or_greater_function_object.h
@@ -0,0 +1,136 @@
+
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file is_operator_less_or_greater_function_object.h
+ *  \brief Type traits for determining if a \c BinaryFunction is equivalent to
+///        either \c operator< or \c operator>.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename FunctionObject>
+struct is_operator_less_function_object_impl;
+
+template <typename FunctionObject>
+struct is_operator_greater_function_object_impl;
+
+} // namespace detail
+
+/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
+/// to \c operator<, and \c false_type otherwise.
+template <typename FunctionObject>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_less_function_object =
+#else
+struct is_operator_less_function_object :
+#endif
+  detail::is_operator_less_function_object_impl<FunctionObject>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
+/// equivalent to \c operator<, and \c false otherwise.
+template <typename FunctionObject>
+constexpr bool is_operator_less_function_object_v
+  = is_operator_less_function_object<FunctionObject>::value;
+#endif
+
+/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
+/// to \c operator>, and \c false_type otherwise.
+template <typename FunctionObject>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_greater_function_object =
+#else
+struct is_operator_greater_function_object :
+#endif
+  detail::is_operator_greater_function_object_impl<FunctionObject>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
+/// equivalent to \c operator>, and \c false otherwise.
+template <typename FunctionObject>
+constexpr bool is_operator_greater_function_object_v
+  = is_operator_greater_function_object<FunctionObject>::value;
+#endif
+
+/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
+/// to either \c operator<, and \c false_type otherwise.
+template <typename FunctionObject>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_less_or_greater_function_object =
+#else
+struct is_operator_less_or_greater_function_object :
+#endif
+  integral_constant<
+    bool 
+  ,    detail::is_operator_less_function_object_impl<FunctionObject>::value
+    || detail::is_operator_greater_function_object_impl<FunctionObject>::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
+/// equivalent to either \c operator< or \c operator>, and \c false otherwise.
+template <typename FunctionObject>
+constexpr bool is_operator_less_or_greater_function_object_v
+  = is_operator_less_or_greater_function_object<FunctionObject>::value;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename FunctionObject>
+struct is_operator_less_function_object_impl                   : false_type {};
+template <typename T>
+struct is_operator_less_function_object_impl<thrust::less<T> > : true_type {};
+template <typename T>
+struct is_operator_less_function_object_impl<std::less<T>    > : true_type {};
+
+template <typename FunctionObject>
+struct is_operator_greater_function_object_impl                      : false_type {};
+template <typename T>
+struct is_operator_greater_function_object_impl<thrust::greater<T> > : true_type {};
+template <typename T>
+struct is_operator_greater_function_object_impl<std::greater<T>    > : true_type {};
+
+} // namespace detail
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/type_traits/is_operator_plus_function_object.h b/thrust/thrust/type_traits/is_operator_plus_function_object.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b2ebb107434f4a28f1b1901d6566ed92cb57dd1
--- /dev/null
+++ b/thrust/thrust/type_traits/is_operator_plus_function_object.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file is_operator_plus_function_object.h
+ *  \brief Type traits for determining if a \c BinaryFunction is equivalent to
+///        \c operator+.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename FunctionObject>
+struct is_operator_plus_function_object_impl;
+
+} // namespace detail
+
+/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
+/// to \c operator<, and \c false_type otherwise.
+template <typename FunctionObject>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_plus_function_object =
+#else
+struct is_operator_plus_function_object :
+#endif
+  detail::is_operator_plus_function_object_impl<FunctionObject>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
+/// equivalent to \c operator<, and \c false otherwise.
+template <typename FunctionObject>
+constexpr bool is_operator_plus_function_object_v
+  = is_operator_plus_function_object<FunctionObject>::value;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename FunctionObject>
+struct is_operator_plus_function_object_impl                   : false_type {};
+template <typename T>
+struct is_operator_plus_function_object_impl<thrust::plus<T> > : true_type {};
+template <typename T>
+struct is_operator_plus_function_object_impl<std::plus<T>    > : true_type {};
+
+} // namespace detail
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h
new file mode 100644
index 0000000000000000000000000000000000000000..de38735d290bf0c436f31af9ec4e74be18364e29
--- /dev/null
+++ b/thrust/thrust/type_traits/is_trivially_relocatable.h
@@ -0,0 +1,251 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright (c)      2018 NVIDIA Corporation
+//
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \file is_trivially_relocatable.h
+ *  \brief <a href="https://wg21.link/P1144R0">P1144R0</a>'s
+ *         \c is_trivially_relocatable, an extensible type trait indicating
+ *         whether a type can be bitwise copied (e.g. via \c memcpy).
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <type_traits>
+#endif
+
+namespace thrust
+{
+
+namespace detail
+{
+
+template <typename T>
+struct is_trivially_relocatable_impl;
+
+} // namespace detail
+
+/// Unary metafunction returns \c true_type if \c T is \a TriviallyRelocatable, 
+/// e.g. can be bitwise copied (with a facility like \c memcpy), and
+/// \c false_type otherwise.
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_trivially_relocatable =
+#else
+struct is_trivially_relocatable :
+#endif
+  detail::is_trivially_relocatable_impl<T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c T is
+/// \a TriviallyRelocatable e.g. can be copied bitwise (with a facility like
+/// \c memcpy), and \c false otherwise.
+template <typename T>
+constexpr bool is_trivially_relocatable_v = is_trivially_relocatable<T>::value;
+#endif
+
+/// Unary metafunction returns \c true_type if \c From is \a TriviallyRelocatable
+/// to \c To, e.g. can be bitwise copied (with a facility like \c memcpy), and
+/// \c false_type otherwise.
+template <typename From, typename To>
+#if THRUST_CPP_DIALECT >= 2011
+using is_trivially_relocatable_to =
+#else
+struct is_trivially_relocatable_to :
+#endif
+  integral_constant<
+    bool
+  , detail::is_same<From, To>::value && is_trivially_relocatable<To>::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if \c From is 
+/// \a TriviallyRelocatable to \c To, e.g. can be copied bitwise (with a
+/// facility like \c memcpy), and \c false otherwise.
+template <typename From, typename To>
+constexpr bool is_trivially_relocatable_to_v
+  = is_trivially_relocatable_to<From, To>::value;
+#endif
+
+/// Unary metafunction that returns \c true_type if the element type of
+/// \c FromIterator is \a TriviallyRelocatable to the element type of
+/// \c ToIterator, and \c false_type otherwise.
+template <typename FromIterator, typename ToIterator>
+#if THRUST_CPP_DIALECT >= 2011
+using is_indirectly_trivially_relocatable_to =
+#else
+struct is_indirectly_trivially_relocatable_to :
+#endif
+  integral_constant<
+    bool
+  ,    is_contiguous_iterator<FromIterator>::value
+    && is_contiguous_iterator<ToIterator>::value
+    && is_trivially_relocatable_to<
+         typename thrust::iterator_traits<FromIterator>::value_type,
+         typename thrust::iterator_traits<ToIterator>::value_type
+       >::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// <code>constexpr bool</code> that is \c true if the element type of
+/// \c FromIterator is \a TriviallyRelocatable to the element type of
+/// \c ToIterator, and \c false otherwise.
+template <typename FromIterator, typename ToIterator>
+constexpr bool is_trivial_relocatable_sequence_copy_v
+  = is_indirectly_trivially_relocatable_to<FromIterator, ToIterator>::value;
+#endif
+
+/// Customization point that can be customized to indicate that a type \c T is
+/// \a TriviallyRelocatable, e.g. can be copied bitwise (with a facility like
+/// \c memcpy).
+template <typename T>
+struct proclaim_trivially_relocatable : false_type {};
+
+/// Declares that the type \c T is \a TriviallyRelocatable by specializing
+/// `thrust::proclaim_trivially_relocatable`.
+#define THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(T)                              \
+  namespace thrust {                                                          \
+  template <>                                                                 \
+  struct proclaim_trivially_relocatable<T> : ::thrust::true_type {};          \
+  } /* end namespace thrust */                                                \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+// There is no way to actually detect the libstdc++ version; __GLIBCXX__
+// is always set to the date of libstdc++ being packaged, not the release
+// day or version. This means that we can't detect the libstdc++ version,
+// except when compiling with GCC.
+//
+// Therefore, for the best approximation of is_trivially_copyable, we need to
+// handle three distinct cases:
+// 1) GCC above 5, or another C++11 compiler not using libstdc++: use the
+//      standard trait directly.
+// 2) A C++11 compiler using libstdc++ that provides the intrinsic: use the
+//      intrinsic.
+// 3) Any other case (essentially: compiling without C++11): has_trivial_assign.
+
+#ifndef __has_feature
+    #define __has_feature(x) 0
+#endif
+
+template <typename T>
+struct is_trivially_copyable_impl
+    : integral_constant<
+        bool,
+        #if THRUST_CPP_DIALECT >= 2011
+            #if defined(__GLIBCXX__) && __has_feature(is_trivially_copyable)
+                __is_trivially_copyable(T)
+            #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && THRUST_GCC_VERSION >= 50000
+                std::is_trivially_copyable<T>::value
+            #else
+                has_trivial_assign<T>::value
+            #endif
+        #else
+            has_trivial_assign<T>::value
+        #endif
+    >
+{
+};
+
+// https://wg21.link/P1144R0#wording-inheritance
+template <typename T>
+struct is_trivially_relocatable_impl
+    : integral_constant<
+        bool,
+        is_trivially_copyable_impl<T>::value
+            || proclaim_trivially_relocatable<T>::value
+    >
+{};
+
+template <typename T, std::size_t N>
+struct is_trivially_relocatable_impl<T[N]> : is_trivially_relocatable_impl<T> {};
+
+} // namespace detail
+
+} // end namespace thrust
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong4)
+
+struct __half;
+struct __half2;
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(__half)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(__half2)
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double4)
+#endif
+
diff --git a/thrust/thrust/type_traits/logical_metafunctions.h b/thrust/thrust/type_traits/logical_metafunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f86ee6a820d5dd4e5c98d0f9ba21ffd3b287b45
--- /dev/null
+++ b/thrust/thrust/type_traits/logical_metafunctions.h
@@ -0,0 +1,179 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright (c)      2018 NVIDIA Corporation
+//  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
+//
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \file logical_metafunctions.h
+ *  \brief C++17's \c conjunction, \c disjunction, and \c negation metafunctions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <type_traits>
+
+namespace thrust
+{
+
+#if THRUST_CPP_DIALECT >= 2017
+
+/// An \c integral_constant whose value is <code>(... && Ts::value)</code>. 
+template <typename... Ts>
+using conjunction = std::conjunction<Ts...>;
+
+/// A <code>constexpr bool</code> whose value is <code>(... && Ts::value)</code>.
+template <typename... Ts>
+constexpr bool conjunction_v = conjunction<Ts...>::value;
+
+/// An \c integral_constant whose value is <code>(... || Ts::value)</code>. 
+template <typename... Ts>
+using disjunction = std::disjunction<Ts...>;
+
+/// A <code>constexpr bool</code> whose value is <code>(... || Ts::value)</code>.
+template <typename... Ts>
+constexpr bool disjunction_v = disjunction<Ts...>::value;
+
+/// An \c integral_constant whose value is <code>!Ts::value</code>. 
+template <typename T>
+using negation = std::negation<T>;
+
+/// A <code>constexpr bool</code> whose value is <code>!Ts::value</code>.
+template <typename T>
+constexpr bool negation_v = negation<T>::value;
+
+///////////////////////////////////////////////////////////////////////////////
+
+#else // Older than C++17.
+
+/// An \c integral_constant whose value is <code>(... && Ts::value)</code>. 
+template <typename... Ts>
+struct conjunction;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>(... && Ts::value)</code>.
+template <typename... Ts>
+constexpr bool conjunction_v = conjunction<Ts...>::value;
+#endif
+
+template <>
+struct conjunction<> : std::true_type {};
+
+template <typename T>
+struct conjunction<T> : T {};
+
+template <typename T0, typename T1>
+struct conjunction<T0, T1> : std::conditional<T0::value, T1, T0>::type {};
+
+template<typename T0, typename T1, typename T2, typename... TN>
+struct conjunction<T0, T1, T2, TN...>
+  : std::conditional<T0::value, conjunction<T1, T2, TN...>, T0>::type {};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// An \c integral_constant whose value is <code>(... || Ts::value)</code>. 
+template <typename... Ts>
+struct disjunction;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>(... || Ts::value)</code>.
+template <typename... Ts>
+constexpr bool disjunction_v = disjunction<Ts...>::value;
+#endif
+
+template <>
+struct disjunction<> : std::false_type {};
+
+template <typename T>
+struct disjunction<T> : T {};
+
+template <typename T0, typename... TN>
+struct disjunction<T0, TN...>
+  : std::conditional<T0::value != false, T0, disjunction<TN...> >::type {};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// An \c integral_constant whose value is <code>!T::value</code>. 
+template <typename T>
+struct negation;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>!T::value</code>.
+template <typename T>
+constexpr bool negation_v = negation<T>::value;
+#endif
+
+template <typename T>
+struct negation : std::integral_constant<bool, !T::value> {};
+
+#endif // THRUST_CPP_DIALECT >= 2017
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// An \c integral_constant whose value is <code>(... && Bs)</code>. 
+template <bool... Bs>
+struct conjunction_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>(... && Bs)</code>.
+template <bool... Bs>
+constexpr bool conjunction_value_v = conjunction_value<Bs...>::value;
+#endif
+
+template <>
+struct conjunction_value<> : std::true_type {};
+
+template <bool B>
+struct conjunction_value<B> : std::integral_constant<bool, B> {};
+
+template <bool B0, bool... BN>
+struct conjunction_value<B0, BN...>
+  : std::integral_constant<bool, B0 && conjunction_value<BN...>::value> {};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// An \c integral_constant whose value is <code>(... || Bs)</code>. 
+template <bool... Bs>
+struct disjunction_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>(... || Bs)</code>.
+template <bool... Bs>
+constexpr bool disjunction_value_v = disjunction_value<Bs...>::value;
+#endif
+
+template <>
+struct disjunction_value<> : std::false_type {};
+
+template <bool B>
+struct disjunction_value<B> : std::integral_constant<bool, B> {};
+
+template <bool B0, bool... BN>
+struct disjunction_value<B0, BN...>
+  : std::integral_constant<bool, B0 || disjunction_value<BN...>::value> {};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// An \c integral_constant whose value is <code>!B</code>. 
+template <bool B>
+struct negation_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/// A <code>constexpr bool</code> whose value is <code>!B</code>.
+template <bool B>
+constexpr bool negation_value_v = negation_value<B>::value;
+#endif
+
+template <bool B>
+struct negation_value : std::integral_constant<bool, !B> {};
+
+} // end namespace thrust
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/thrust/type_traits/remove_cvref.h b/thrust/thrust/type_traits/remove_cvref.h
new file mode 100644
index 0000000000000000000000000000000000000000..4079bfe8e81df29bae686f1d4b8d30b598648aaa
--- /dev/null
+++ b/thrust/thrust/type_traits/remove_cvref.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+
+namespace thrust
+{
+
+#if THRUST_CPP_DIALECT >= 2020
+
+using std::remove_cvref;
+using std::remove_cvref_t;
+
+#else // Older than C++20.
+
+template <typename T>
+struct remove_cvref
+{
+  typedef typename detail::remove_cv<
+    typename detail::remove_reference<T>::type
+  >::type type;
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+#endif
+
+#endif // THRUST_CPP_DIALECT >= 2020
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/type_traits/void_t.h b/thrust/thrust/type_traits/void_t.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ab56a3e874fea077fea1c7e9bdd4812a6217fb6
--- /dev/null
+++ b/thrust/thrust/type_traits/void_t.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file void_t.h
+ *  \brief C++17's `void_t`. 
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2017
+#  include <type_traits>
+#endif
+
+namespace thrust
+{
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template <typename...> struct voider { using type = void; };
+
+#if THRUST_CPP_DIALECT >= 2017
+using std::void_t;
+#else
+template <typename... Ts> using void_t = typename voider<Ts...>::type;
+#endif
+
+#else // Older than C++11.
+
+template <
+  typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+>
+struct voider
+{
+  typedef void type;
+};
+
+#endif
+
+} // end namespace thrust
+
diff --git a/thrust/thrust/uninitialized_copy.h b/thrust/thrust/uninitialized_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..af0f641a7f3a5323793005434e7297b50ca051e6
--- /dev/null
+++ b/thrust/thrust/uninitialized_copy.h
@@ -0,0 +1,303 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uninitialized_copy.h
+ *  \brief Copy construction into a range of uninitialized elements from a source range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup copying
+ *  \{
+ */
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a constructor.
+ *  Occasionally, however, it is useful to separate those two operations.
+ *  If each iterator in the range <tt>[result, result + (last - first))</tt> points
+ *  to uninitialized memory, then \p uninitialized_copy creates a copy of
+ *  <tt>[first, last)</tt> in that range. That is, for each iterator \c i in
+ *  the input, \p uninitialized_copy creates a copy of \c *i in the location pointed
+ *  to by the corresponding iterator in the output range by \p ForwardIterator's
+ *  \c value_type's copy constructor with *i as its argument.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the input range to copy from.
+ *  \param last The last element of the input range to copy from.
+ *  \param result The first element of the output range to copy to.
+ *  \return An iterator pointing to the last element of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
+ *          a single argument whose type is \p InputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_copy to initialize
+ *  a range of uninitialized memory using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/uninitialized_copy.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_vector<Int> input(N, val);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_copy(thrust::device, input.begin(), input.end(), array);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see \c copy
+ *  \see \c uninitialized_fill
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename DerivedPolicy, typename InputIterator, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                     InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a constructor.
+ *  Occasionally, however, it is useful to separate those two operations.
+ *  If each iterator in the range <tt>[result, result + (last - first))</tt> points
+ *  to uninitialized memory, then \p uninitialized_copy creates a copy of
+ *  <tt>[first, last)</tt> in that range. That is, for each iterator \c i in
+ *  the input, \p uninitialized_copy creates a copy of \c *i in the location pointed
+ *  to by the corresponding iterator in the output range by \p ForwardIterator's
+ *  \c value_type's copy constructor with *i as its argument.
+ *
+ *  \param first The first element of the input range to copy from.
+ *  \param last The last element of the input range to copy from.
+ *  \param result The first element of the output range to copy to.
+ *  \return An iterator pointing to the last element of the output range.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
+ *          a single argument whose type is \p InputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_copy to initialize
+ *  a range of uninitialized memory.
+ *
+ *  \code
+ *  #include <thrust/uninitialized_copy.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_vector.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_vector<Int> input(N, val);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_copy(input.begin(), input.end(), array);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see \c copy
+ *  \see \c uninitialized_fill
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename InputIterator, typename ForwardIterator>
+  ForwardIterator uninitialized_copy(InputIterator first,
+                                     InputIterator last,
+                                     ForwardIterator result);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a constructor.
+ *  Occasionally, however, it is useful to separate those two operations.
+ *  If each iterator in the range <tt>[result, result + n)</tt> points
+ *  to uninitialized memory, then \p uninitialized_copy_n creates a copy of
+ *  <tt>[first, first + n)</tt> in that range. That is, for each iterator \c i in
+ *  the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed
+ *  to by the corresponding iterator in the output range by \p InputIterator's
+ *  \c value_type's copy constructor with *i as its argument.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the input range to copy from.
+ *  \param n The number of elements to copy.
+ *  \param result The first element of the output range to copy to.
+ *  \return An iterator pointing to the last element of the output range.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Size is an integral type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
+ *          a single argument whose type is \p InputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, first + n)</tt> and the range <tt>[result, result + n)</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_copy to initialize
+ *  a range of uninitialized memory using the \p thrust::device execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/uninitialized_copy.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_vector<Int> input(N, val);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_copy_n(thrust::device, input.begin(), N, array);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see \c uninitialized_copy
+ *  \see \c copy
+ *  \see \c uninitialized_fill
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename DerivedPolicy, typename InputIterator, typename Size, typename ForwardIterator>
+__host__ __device__
+  ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       InputIterator first,
+                                       Size n,
+                                       ForwardIterator result);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a constructor.
+ *  Occasionally, however, it is useful to separate those two operations.
+ *  If each iterator in the range <tt>[result, result + n)</tt> points
+ *  to uninitialized memory, then \p uninitialized_copy_n creates a copy of
+ *  <tt>[first, first + n)</tt> in that range. That is, for each iterator \c i in
+ *  the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed
+ *  to by the corresponding iterator in the output range by \p InputIterator's
+ *  \c value_type's copy constructor with *i as its argument.
+ *
+ *  \param first The first element of the input range to copy from.
+ *  \param n The number of elements to copy.
+ *  \param result The first element of the output range to copy to.
+ *  \return An iterator pointing to the last element of the output range.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam Size is an integral type.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
+ *          a single argument whose type is \p InputIterator's \c value_type.
+ *
+ *  \pre \p first may equal \p result, but the range <tt>[first, first + n)</tt> and the range <tt>[result, result + n)</tt> shall not overlap otherwise.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_copy to initialize
+ *  a range of uninitialized memory.
+ *
+ *  \code
+ *  #include <thrust/uninitialized_copy.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/device_vector.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_vector<Int> input(N, val);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_copy_n(input.begin(), N, array);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see \c uninitialized_copy
+ *  \see \c copy
+ *  \see \c uninitialized_fill
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename InputIterator, typename Size, typename ForwardIterator>
+  ForwardIterator uninitialized_copy_n(InputIterator first,
+                                       Size n,
+                                       ForwardIterator result);
+
+
+/*! \} // copying
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/uninitialized_copy.inl>
+
diff --git a/thrust/thrust/uninitialized_fill.h b/thrust/thrust/uninitialized_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..33dc24886c30586a1302ec31f51a9f2dfca9b051
--- /dev/null
+++ b/thrust/thrust/uninitialized_fill.h
@@ -0,0 +1,275 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file uninitialized_fill.h
+ *  \brief Copy construction into a range of uninitialized elements from a source value
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup filling
+ *  \ingroup transformations
+ *  \{
+ */
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a
+ *  constructor. Occasionally, however, it is useful to separate those two
+ *  operations. If each iterator in the range <tt>[first, last)</tt> points
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
+ *  in that range. That is, for each iterator \c i in the range <tt>[first, last)</tt>,
+ *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
+ *  calling \p ForwardIterator's \c value_type's copy constructor.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *  
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the range of interest.
+ *  \param last The last element of the range of interest.
+ *  \param x The value to use as the exemplar of the copy constructor.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
+ *          takes a single argument of type \p T.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of
+ *  uninitialized memory using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/uninitialized_fill.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_fill(thrust::device, array, array + N, val);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see \c uninitialized_fill_n
+ *  \see \c fill
+ *  \see \c uninitialized_copy
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename T>
+__host__ __device__
+  void uninitialized_fill(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a
+ *  constructor. Occasionally, however, it is useful to separate those two
+ *  operations. If each iterator in the range <tt>[first, last)</tt> points
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
+ *  in that range. That is, for each iterator \c i in the range <tt>[first, last)</tt>,
+ *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
+ *  calling \p ForwardIterator's \c value_type's copy constructor.
+ *  
+ *  \param first The first element of the range of interest.
+ *  \param last The last element of the range of interest.
+ *  \param x The value to use as the exemplar of the copy constructor.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
+ *          takes a single argument of type \p T.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of
+ *  uninitialized memory.
+ *
+ *  \code
+ *  #include <thrust/uninitialized_fill.h>
+ *  #include <thrust/device_malloc.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_fill(array, array + N, val);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see \c uninitialized_fill_n
+ *  \see \c fill
+ *  \see \c uninitialized_copy
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename ForwardIterator, typename T>
+  void uninitialized_fill(ForwardIterator first,
+                          ForwardIterator last,
+                          const T &x);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a
+ *  constructor. Occasionally, however, it is useful to separate those two
+ *  operations. If each iterator in the range <tt>[first, first+n)</tt> points
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
+ *  in that range. That is, for each iterator \c i in the range <tt>[first, first+n)</tt>,
+ *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
+ *  calling \p ForwardIterator's \c value_type's copy constructor.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *  
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The first element of the range of interest.
+ *  \param n The size of the range of interest.
+ *  \param x The value to use as the exemplar of the copy constructor.
+ *  \return <tt>first+n</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
+ *          takes a single argument of type \p T.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of
+ *  uninitialized memory using the \p thrust::device execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/uninitialized_fill.h>
+ *  #include <thrust/device_malloc.h>
+ *  #include <thrust/execution_policy.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_fill_n(thrust::device, array, N, val);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see \c uninitialized_fill
+ *  \see \c fill
+ *  \see \c uninitialized_copy_n
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename Size, typename T>
+__host__ __device__
+  ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                       ForwardIterator first,
+                                       Size n,
+                                       const T &x);
+
+
+/*! In \c thrust, the function \c thrust::device_new allocates memory for
+ *  an object and then creates an object at that location by calling a
+ *  constructor. Occasionally, however, it is useful to separate those two
+ *  operations. If each iterator in the range <tt>[first, first+n)</tt> points
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
+ *  in that range. That is, for each iterator \c i in the range <tt>[first, first+n)</tt>,
+ *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
+ *  calling \p ForwardIterator's \c value_type's copy constructor.
+ *  
+ *  \param first The first element of the range of interest.
+ *  \param n The size of the range of interest.
+ *  \param x The value to use as the exemplar of the copy constructor.
+ *  \return <tt>first+n</tt>
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
+ *          takes a single argument of type \p T.
+ *
+ *  The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of
+ *  uninitialized memory.
+ *
+ *  \code
+ *  #include <thrust/uninitialized_fill.h>
+ *  #include <thrust/device_malloc.h>
+ *  
+ *  struct Int
+ *  {
+ *    __host__ __device__
+ *    Int(int x) : val(x) {}
+ *    int val;
+ *  };  
+ *  ...
+ *  const int N = 137;
+ *
+ *  Int val(46);
+ *  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
+ *  thrust::uninitialized_fill_n(array, N, val);
+ *
+ *  // Int x = array[i];
+ *  // x.val == 46 for all 0 <= i < N
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see \c uninitialized_fill
+ *  \see \c fill
+ *  \see \c uninitialized_copy_n
+ *  \see \c device_new
+ *  \see \c device_malloc
+ */
+template<typename ForwardIterator, typename Size, typename T>
+  ForwardIterator uninitialized_fill_n(ForwardIterator first,
+                                       Size n,
+                                       const T &x);
+
+/*! \} // end filling
+ *  \} // transformations
+ */
+
+} // end thrust
+
+#include <thrust/detail/uninitialized_fill.inl>
+
diff --git a/thrust/thrust/unique.h b/thrust/thrust/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4b2118d321374e2dac04592914d33b2003fad8a
--- /dev/null
+++ b/thrust/thrust/unique.h
@@ -0,0 +1,968 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file unique.h
+ *  \brief Move unique elements to the front of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup stream_compaction
+ *  \{
+ */
+
+
+/*! For each group of consecutive elements in the range <tt>[first, last)</tt>
+ *  with the same value, \p unique removes all but the first element of 
+ *  the group. The return value is an iterator \c new_last such that 
+ *  no two consecutive elements in the range <tt>[first, new_last)</tt> are
+ *  equal. The iterators in the range <tt>[new_last, last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified.
+ *  \p unique is stable, meaning that the relative order of elements that are
+ *  not removed is unchanged.
+ *
+ *  This version of \p unique uses \c operator== to test for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique to
+ *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int *new_end = thrust::unique(thrust::host, A, A + N);
+ *  // The first four values of A are now {1, 3, 2, 1}
+ *  // Values beyond new_end are unspecified.
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see unique_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last);
+
+
+/*! For each group of consecutive elements in the range <tt>[first, last)</tt>
+ *  with the same value, \p unique removes all but the first element of 
+ *  the group. The return value is an iterator \c new_last such that 
+ *  no two consecutive elements in the range <tt>[first, new_last)</tt> are
+ *  equal. The iterators in the range <tt>[new_last, last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified.
+ *  \p unique is stable, meaning that the relative order of elements that are
+ *  not removed is unchanged.
+ *
+ *  This version of \p unique uses \c operator== to test for equality.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>.
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique to
+ *  compact a sequence of numbers to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int *new_end = thrust::unique(A, A + N);
+ *  // The first four values of A are now {1, 3, 2, 1}
+ *  // Values beyond new_end are unspecified.
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see unique_copy
+ */
+template<typename ForwardIterator>
+ForwardIterator unique(ForwardIterator first,
+                       ForwardIterator last);
+
+
+/*! For each group of consecutive elements in the range <tt>[first, last)</tt>
+ *  with the same value, \p unique removes all but the first element of 
+ *  the group. The return value is an iterator \c new_last such that 
+ *  no two consecutive elements in the range <tt>[first, new_last)</tt> are
+ *  equal. The iterators in the range <tt>[new_last, last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified.
+ *  \p unique is stable, meaning that the relative order of elements that are
+ *  not removed is unchanged.
+ *
+ *  This version of \p unique uses the function object \p binary_pred to test
+ *  for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique to
+ *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int *new_end = thrust::unique(thrust::host, A, A + N, thrust::equal_to<int>());
+ *  // The first four values of A are now {1, 3, 2, 1}
+ *  // Values beyond new_end are unspecified.
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see unique_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       ForwardIterator first,
+                       ForwardIterator last,
+                       BinaryPredicate binary_pred);
+
+
+/*! For each group of consecutive elements in the range <tt>[first, last)</tt>
+ *  with the same value, \p unique removes all but the first element of 
+ *  the group. The return value is an iterator \c new_last such that 
+ *  no two consecutive elements in the range <tt>[first, new_last)</tt> are
+ *  equal. The iterators in the range <tt>[new_last, last)</tt> are all still
+ *  dereferenceable, but the elements that they point to are unspecified.
+ *  \p unique is stable, meaning that the relative order of elements that are
+ *  not removed is unchanged.
+ *
+ *  This version of \p unique uses the function object \p binary_pred to test
+ *  for equality.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>
+ *
+ *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator is mutable,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique to
+ *  compact a sequence of numbers to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int *new_end = thrust::unique(A, A + N, thrust::equal_to<int>());
+ *  // The first four values of A are now {1, 3, 2, 1}
+ *  // Values beyond new_end are unspecified.
+ *  \endcode
+ *
+ *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see unique_copy
+ */
+template<typename ForwardIterator,
+         typename BinaryPredicate>
+ForwardIterator unique(ForwardIterator first,
+                       ForwardIterator last,
+                       BinaryPredicate binary_pred);
+
+
+/*! \p unique_copy copies elements from the range <tt>[first, last)</tt>
+ * to a range beginning with \p result, except that in a consecutive group
+ * of duplicate elements only the first one is copied. The return value
+ * is the end of the range to which the elements are copied. 
+ *
+ * The reason there are two different versions of unique_copy is that there
+ * are two different definitions of what it means for a consecutive group of
+ * elements to be duplicates. In the first version, the test is simple
+ * equality: the elements in a range <tt>[f, l)</tt> are duplicates if,
+ * for every iterator \p i in the range, either <tt>i == f</tt> or else 
+ * <tt>*i == *(i-1)</tt>. In the second, the test is an arbitrary 
+ * \p BinaryPredicate \p binary_pred: the elements in <tt>[f, l)</tt> are
+ * duplicates if, for every iterator \p i in the range, either <tt>i == f</tt>
+ * or else <tt>binary_pred(*i, *(i-1))</tt> is \p true.
+ *
+ * This version of \p unique_copy uses \c operator== to test for equality.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the unique range <tt>[result, result_end)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_copy to
+ *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int B[N];
+ *  int *result_end = thrust::unique_copy(thrust::host, A, A + N, B);
+ *  // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4
+ *  // Values beyond result_end are unspecified
+ *  \endcode
+ *
+ *  \see unique
+ *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator>
+__host__ __device__
+OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator result);
+
+
+/*! \p unique_copy copies elements from the range <tt>[first, last)</tt>
+ * to a range beginning with \p result, except that in a consecutive group
+ * of duplicate elements only the first one is copied. The return value
+ * is the end of the range to which the elements are copied. 
+ *
+ * The reason there are two different versions of unique_copy is that there
+ * are two different definitions of what it means for a consecutive group of
+ * elements to be duplicates. In the first version, the test is simple
+ * equality: the elements in a range <tt>[f, l)</tt> are duplicates if,
+ * for every iterator \p i in the range, either <tt>i == f</tt> or else 
+ * <tt>*i == *(i-1)</tt>. In the second, the test is an arbitrary 
+ * \p BinaryPredicate \p binary_pred: the elements in <tt>[f, l)</tt> are
+ * duplicates if, for every iterator \p i in the range, either <tt>i == f</tt>
+ * or else <tt>binary_pred(*i, *(i-1))</tt> is \p true.
+ *
+ * This version of \p unique_copy uses \c operator== to test for equality.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \return The end of the unique range <tt>[result, result_end)</tt>.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *
+ *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_copy to
+ *  compact a sequence of numbers to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int B[N];
+ *  int *result_end = thrust::unique_copy(A, A + N, B);
+ *  // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4
+ *  // Values beyond result_end are unspecified
+ *  \endcode
+ *
+ *  \see unique
+ *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ */
+template<typename InputIterator,
+         typename OutputIterator>
+OutputIterator unique_copy(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result);
+
+
+/*! \p unique_copy copies elements from the range <tt>[first, last)</tt>
+ * to a range beginning with \p result, except that in a consecutive group
+ * of duplicate elements only the first one is copied. The return value
+ * is the end of the range to which the elements are copied. 
+ *
+ * This version of \p unique_copy uses the function object \c binary_pred 
+ * to test for equality.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[result, result_end)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_copy to
+ *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution
+ *  policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int B[N];
+ *  int *result_end = thrust::unique_copy(thrust::host, A, A + N, B, thrust::equal_to<int>());
+ *  // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4
+ *  // Values beyond result_end are unspecified.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ */
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+__host__ __device__
+OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           BinaryPredicate binary_pred);
+                       
+
+/*! \p unique_copy copies elements from the range <tt>[first, last)</tt>
+ * to a range beginning with \p result, except that in a consecutive group
+ * of duplicate elements only the first one is copied. The return value
+ * is the end of the range to which the elements are copied. 
+ *
+ * This version of \p unique_copy uses the function object \c binary_pred 
+ * to test for equality.
+ *
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param result The beginning of the output range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[result, result_end)</tt>.
+ *
+ *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_copy to
+ *  compact a sequence of numbers to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int B[N];
+ *  int *result_end = thrust::unique_copy(A, A + N, B, thrust::equal_to<int>());
+ *  // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4
+ *  // Values beyond result_end are unspecified.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ */
+template<typename InputIterator,
+         typename OutputIterator,
+         typename BinaryPredicate>
+OutputIterator unique_copy(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           BinaryPredicate binary_pred);
+
+
+/*! \p unique_by_key is a generalization of \p unique to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key removes all but the first element of 
+ *  the group.  Similarly, the corresponding values in the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> 
+ *  are also removed.
+ *
+ *  The return value is a \p pair of iterators <tt>(new_keys_last,new_values_last)</tt>
+ *  such that no two consecutive elements in the range <tt>[keys_first, new_keys_last)</tt>
+ *  are equal.
+ *
+ *  This version of \p unique_by_key uses \c operator== to test for equality and 
+ *  \c project1st to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key range.
+ *  \param keys_last  The end of the key range.
+ *  \param values_first The beginning of the value range.
+ *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1 is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key to
+ *  compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::unique_by_key(thrust::host, A, A + N, B);
+ *
+ *  // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4.
+ *  // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+  unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_last,
+                ForwardIterator2 values_first);
+
+
+/*! \p unique_by_key is a generalization of \p unique to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key removes all but the first element of 
+ *  the group.  Similarly, the corresponding values in the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> 
+ *  are also removed.
+ *
+ *  The return value is a \p pair of iterators <tt>(new_keys_last,new_values_last)</tt>
+ *  such that no two consecutive elements in the range <tt>[keys_first, new_keys_last)</tt>
+ *  are equal.
+ *
+ *  This version of \p unique_by_key uses \c operator== to test for equality and 
+ *  \c project1st to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the key range.
+ *  \param keys_last  The end of the key range.
+ *  \param values_first The beginning of the value range.
+ *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
+ *
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1 is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2 is mutable.
+ *
+ *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key to
+ *  compact a sequence of key/value pairs to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::unique_by_key(A, A + N, B);
+ *
+ *  // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4.
+ *  // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key
+ */
+template<typename ForwardIterator1,
+         typename ForwardIterator2>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+  unique_by_key(ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_last,
+                ForwardIterator2 values_first);
+
+
+/*! \p unique_by_key is a generalization of \p unique to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key removes all but the first element of 
+ *  the group.  Similarly, the corresponding values in the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> 
+ *  are also removed.
+ *
+ *  This version of \p unique_by_key uses the function object \c binary_pred
+ *  to test for equality and \c project1st to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the key range.
+ *  \param keys_last  The end of the key range.
+ *  \param values_first The beginning of the value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1 is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2 is mutable.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key to
+ *  compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host
+ *  execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::unique_by_key(thrust::host, keys, keys + N, values, binary_pred);
+ *
+ *  // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4.
+ *  // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+    unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                  ForwardIterator1 keys_first, 
+                  ForwardIterator1 keys_last,
+                  ForwardIterator2 values_first,
+                  BinaryPredicate binary_pred);
+
+
+/*! \p unique_by_key is a generalization of \p unique to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key removes all but the first element of 
+ *  the group.  Similarly, the corresponding values in the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> 
+ *  are also removed.
+ *
+ *  This version of \p unique_by_key uses the function object \c binary_pred
+ *  to test for equality and \c project1st to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the key range.
+ *  \param keys_last  The end of the key range.
+ *  \param values_first The beginning of the value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The end of the unique range <tt>[first, new_last)</tt>.
+ *
+ *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator1 is mutable,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator2 is mutable.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key to
+ *  compact a sequence of key/value pairs to remove consecutive duplicates.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::unique_by_key(keys, keys + N, values, binary_pred);
+ *
+ *  // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4.
+ *  // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4.
+ *  \endcode
+ *
+ *  \see unique
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key
+ */
+template<typename ForwardIterator1,
+         typename ForwardIterator2,
+         typename BinaryPredicate>
+  thrust::pair<ForwardIterator1,ForwardIterator2>
+  unique_by_key(ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_last,
+                ForwardIterator2 values_first,
+                BinaryPredicate binary_pred);
+
+
+/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key_copy copies the first element of the group to
+ *  a range beginning with \c keys_result and the corresponding values from the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> are copied to a range
+ *  beginning with \c values_result.
+ *
+ *  This version of \p unique_by_key_copy uses \c operator== to test for equality and
+ *  \c project1st to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_result The beginning of the output key range.
+ *  \param values_result The beginning of the output value range.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key_copy to
+ *  compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4.
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see reduce_by_key
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_result,
+                       OutputIterator2 values_result);
+
+
+/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key_copy copies the first element of the group to
+ *  a range beginning with \c keys_result and the corresponding values from the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> are copied to a range
+ *  beginning with \c values_result.
+ *
+ *  This version of \p unique_by_key_copy uses \c operator== to test for equality and
+ *  \c project1st to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_result The beginning of the output key range.
+ *  \param values_result The beginning of the output value range.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key_copy to
+ *  compact a sequence of key/value pairs and with equal keys.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  new_end = thrust::unique_by_key_copy(A, A + N, B, C, D);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4.
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see reduce_by_key
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  unique_by_key_copy(InputIterator1 keys_first, 
+                     InputIterator1 keys_last,
+                     InputIterator2 values_first,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result);
+
+
+/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key_copy copies the first element of the group to
+ *  a range beginning with \c keys_result and the corresponding values from the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> are copied to a range
+ *  beginning with \c values_result.
+ *
+ *  This version of \p unique_by_key_copy uses the function object \c binary_pred
+ *  to test for equality and \c project1st to reduce values with equal keys.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_result The beginning of the output key range.
+ *  \param values_result The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key_copy to
+ *  compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy for
+ *  parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D, binary_pred);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4.
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see reduce_by_key
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+__host__ __device__
+  thrust::pair<OutputIterator1,OutputIterator2>
+    unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                       InputIterator1 keys_first, 
+                       InputIterator1 keys_last,
+                       InputIterator2 values_first,
+                       OutputIterator1 keys_result,
+                       OutputIterator2 values_result,
+                       BinaryPredicate binary_pred);
+
+
+/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs.
+ *  For each group of consecutive keys in the range <tt>[keys_first, keys_last)</tt>
+ *  that are equal, \p unique_by_key_copy copies the first element of the group to
+ *  a range beginning with \c keys_result and the corresponding values from the range
+ *  <tt>[values_first, values_first + (keys_last - keys_first))</tt> are copied to a range
+ *  beginning with \c values_result.
+ *
+ *  This version of \p unique_by_key_copy uses the function object \c binary_pred
+ *  to test for equality and \c project1st to reduce values with equal keys.
+ *
+ *  \param keys_first The beginning of the input key range.
+ *  \param keys_last  The end of the input key range.
+ *  \param values_first The beginning of the input value range.
+ *  \param keys_result The beginning of the output key range.
+ *  \param values_result The beginning of the output value range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
+ *
+ *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
+ *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
+ *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *
+ *  \pre The input ranges shall not overlap either output range.
+ *
+ *  The following code snippet demonstrates how to use \p unique_by_key_copy to
+ *  compact a sequence of key/value pairs and with equal keys.
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys
+ *  int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values
+ *  int C[N];                         // output keys
+ *  int D[N];                         // output values
+ *
+ *  thrust::pair<int*,int*> new_end;
+ *  thrust::equal_to<int> binary_pred;
+ *  new_end = thrust::unique_by_key_copy(A, A + N, B, C, D, binary_pred);
+ *
+ *  // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4.
+ *  // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4.
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key
+ *  \see reduce_by_key
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename BinaryPredicate>
+  thrust::pair<OutputIterator1,OutputIterator2>
+  unique_by_key_copy(InputIterator1 keys_first, 
+                     InputIterator1 keys_last,
+                     InputIterator2 values_first,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result,
+                     BinaryPredicate binary_pred);
+
+
+/*! \} // end stream_compaction
+ */
+
+
+} // end namespace thrust
+
+#include <thrust/detail/unique.inl>
+
diff --git a/thrust/thrust/version.h b/thrust/thrust/version.h
new file mode 100644
index 0000000000000000000000000000000000000000..84f9af141b47806e90005c9192f3ce27ae477960
--- /dev/null
+++ b/thrust/thrust/version.h
@@ -0,0 +1,83 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file version.h
+ *  \brief Compile-time macros encoding Thrust release version
+ *
+ *         <thrust/version.h> is the only Thrust header that is guaranteed to
+ *         change with every thrust release.
+ *
+ *         It is also the only header that does not cause THRUST_HOST_SYSTEM
+ *         and THRUST_DEVICE_SYSTEM to be defined. This way, a user may include
+ *         this header and inspect THRUST_VERSION before programatically defining
+ *         either of these macros herself.
+ */
+
+#pragma once
+
+//  This is the only Thrust header that is guaranteed to
+//  change with every Thrust release.
+//
+//  THRUST_VERSION % 100 is the sub-minor version
+//  THRUST_VERSION / 100 % 1000 is the minor version
+//  THRUST_VERSION / 100000 is the major version
+//
+//  Because this header does not #include <thrust/detail/config.h>,
+//  it is the only Thrust header that does not cause
+//  THRUST_HOST_SYSTEM and THRUST_DEVICE_SYSTEM to be defined.
+
+/*! \def THRUST_VERSION
+ *  \brief The preprocessor macro \p THRUST_VERSION encodes the version
+ *         number of the Thrust library.
+ *
+ *         <tt>THRUST_VERSION % 100</tt> is the sub-minor version.
+ *         <tt>THRUST_VERSION / 100 % 1000</tt> is the minor version.
+ *         <tt>THRUST_VERSION / 100000</tt> is the major version.
+ */
+#define THRUST_VERSION 101000
+
+/*! \def THRUST_MAJOR_VERSION
+ *  \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the
+ *         major version number of the Thrust library.
+ */
+#define THRUST_MAJOR_VERSION     (THRUST_VERSION / 100000)
+
+/*! \def THRUST_MINOR_VERSION
+ *  \brief The preprocessor macro \p THRUST_MINOR_VERSION encodes the
+ *         minor version number of the Thrust library.
+ */
+#define THRUST_MINOR_VERSION     (THRUST_VERSION / 100 % 1000)
+
+/*! \def THRUST_SUBMINOR_VERSION
+ *  \brief The preprocessor macro \p THRUST_SUBMINOR_VERSION encodes the
+ *         sub-minor version number of the Thrust library.
+ */
+#define THRUST_SUBMINOR_VERSION  (THRUST_VERSION % 100)
+
+/*! \def THRUST_PATCH_NUMBER
+ *  \brief The preprocessor macro \p THRUST_PATCH_NUMBER encodes the
+ *         patch number of the Thrust library.
+ */
+#define THRUST_PATCH_NUMBER 0
+
+/*! \namespace thrust
+ *  \brief \p thrust is the top-level namespace which contains all Thrust
+ *         functions and types.
+ */
+namespace thrust
+{
+
+}
diff --git a/thrust/thrust/zip_function.h b/thrust/thrust/zip_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..faea59d4c5b3204924ab63d155f546c2ec4d9e6c
--- /dev/null
+++ b/thrust/thrust/zip_function.h
@@ -0,0 +1,211 @@
+
+/*! \file thrust/zip_function.h
+ *  \brief Adaptor type that turns an N-ary function object into one that takes
+ *         a tuple of size N so it can easily be used with algorithms taking zip
+ *         iterators
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/tuple.h>
+#include <thrust/type_traits/integer_sequence.h>
+#include <thrust/detail/type_deduction.h>
+
+namespace thrust
+{
+
+/*! \addtogroup function_objects Function Objects
+ *  \{
+ */
+
+/*! \addtogroup function_object_adaptors Function Object Adaptors
+ *  \ingroup function_objects
+ *  \{
+ */
+
+namespace detail {
+namespace zip_detail {
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+decltype(auto) apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+{
+  return func(thrust::get<Is>(THRUST_FWD(args))...);
+}
+
+template <typename Function, typename Tuple>
+__host__ __device__
+decltype(auto) apply(Function&& func, Tuple&& args)
+{
+  constexpr auto tuple_size = thrust::tuple_size<typename std::decay<Tuple>::type>::value;
+  return apply_impl(THRUST_FWD(func), THRUST_FWD(args), make_index_sequence<tuple_size>{});
+}
+
+#else // THRUST_CPP_DIALECT
+
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+auto apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(func(thrust::get<Is>(THRUST_FWD(args))...))
+
+template <typename Function, typename Tuple>
+__host__ __device__
+auto apply(Function&& func, Tuple&& args)
+THRUST_DECLTYPE_RETURNS(
+    apply_impl(
+      THRUST_FWD(func),
+      THRUST_FWD(args),
+      make_index_sequence<
+        thrust::tuple_size<typename std::decay<Tuple>::type>::value>{})
+)
+
+#endif // THRUST_CPP_DIALECT
+
+} // namespace zip_detail
+} // namespace detail
+
+/*! \p zip_function is a function object that allows the easy use of N-ary 
+ *  function objects with \p zip_iterators without redefining them to take a
+ *  \p tuple instead of N arguments.
+ *
+ *  This means that if a functor that takes 2 arguments which could be used with
+ *  the \p transform function and \p device_iterators can be extended to take 3
+ *  arguments and \p zip_iterators without rewriting the functor in terms of
+ *  \p tuple.
+ * 
+ *  The \p make_zip_function convenience function is provided to avoid having
+ *  to explicitely define the type of the functor when creating a \p zip_function, 
+ *  whic is especially helpful when using lambdas as the functor.
+ *  
+ *  \code
+ *  #include <thrust/iterator/zip_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/zip_function.h>
+ * 
+ *  struct SumTuple {
+ *    float operator()(Tuple tup) {
+ *      return std::get<0>(tup) + std::get<1>(tup) + std::get<2>(tup);
+ *    }
+ *  };
+ *  struct SumArgs {
+ *    float operator()(float a, float b, float c) {
+ *      return a + b + c;
+ *    }
+ *  };
+ *  
+ *  int main() {
+ *    thrust::device_vector<float> A(3);
+ *    thrust::device_vector<float> B(3);
+ *    thrust::device_vector<float> C(3);
+ *    thrust::device_vector<float> D(3);
+ *    A[0] = 0.f; A[1] = 1.f; A[2] = 2.f;
+ *    B[0] = 1.f; B[1] = 2.f; B[2] = 3.f;
+ *    C[0] = 2.f; C[1] = 3.f; C[2] = 4.f;
+ * 
+ *    // The following four invocations of transform are equivalent
+ *    // Transform with 3-tuple
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      SumTuple{});
+ * 
+ *    // Transform with 3 parameters
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      adapted);
+ * 
+ *    // Transform with 3 parameters with convenience function
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function(SumArgs{}));
+ * 
+ *    // Transform with 3 parameters with convenience function and lambda
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function([] (float a, float b, float c) {
+ *                                                  return a + b + c;
+ *                                                }));
+ *    return 0;
+ *  }
+ *  \endcode
+ * 
+ *  \see make_zip_function
+ *  \see zip_iterator
+ */
+template <typename Function>
+class zip_function
+{
+  public:
+     __host__ __device__
+    zip_function(Function func) : func(std::move(func)) {}
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+    template <typename Tuple>
+    __host__ __device__
+    decltype(auto) operator()(Tuple&& args) const
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#else // THRUST_CPP_DIALECT
+
+    // Can't just use THRUST_DECLTYPE_RETURNS here since we need to use
+    // std::declval for the signature components:
+    template <typename Tuple>
+    __host__ __device__
+    auto operator()(Tuple&& args) const
+    noexcept(noexcept(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args))))
+    -> decltype(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args)))
+
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#endif // THRUST_CPP_DIALECT
+
+  private:
+    mutable Function func;
+}; 
+
+/*! \p make_zip_function creates a \p zip_function from a function object.
+ *
+ *  \param fun The N-ary function object.
+ *  \return A \p zip_function that takes a N-tuple.
+ *
+ *  \see zip_function
+ */
+template <typename Function>
+__host__ __device__
+auto make_zip_function(Function&& fun) -> zip_function<typename std::decay<Function>::type>
+{
+    using func_t = typename std::decay<Function>::type;
+    return zip_function<func_t>(THRUST_FWD(fun));
+}
+
+/*! \} // end function_object_adaptors
+ */
+
+/*! \} // end function_objects
+ */
+
+} // end namespace thrust
+
+#endif
diff --git a/thrust/thrust_perf_tests.trs b/thrust/thrust_perf_tests.trs
new file mode 100644
index 0000000000000000000000000000000000000000..c657014d88ab6e5330c13f35d89f686301c254df
--- /dev/null
+++ b/thrust/thrust_perf_tests.trs
@@ -0,0 +1,37 @@
+{
+  # Descriptive name for the testsuite (required).
+  "name"        : "Thrust Performance Testsuite",
+  "version"     : "2",
+  "extrapath"   : [ "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}" ],
+  # Testsuite owner's email (required).
+  "owner"       : "blelbach@nvidia.com",
+  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
+  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
+  # Linux, etc.)
+  "librarypath" : [ "{TR_INSTALL_DIR}\/cuda\/_internal\/driver" ],
+  # Default working directory for test runs (optional).
+  "cwd"        : "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}",
+  # Timeout for entire testsuite, in seconds (optional).
+  "timeout"     : "3600",
+  # Default timeout for individual tests, in seconds (optional).
+  "testtimeout" : "3600",
+  # The tests in the testsuite (required).
+  "tests" : [
+      {
+        "init" : "{PYTHON} {TR_INSTALL_DIR}/GPUConfMgr/GPUConfMgr.py -caseName=set -clock=P0 -VULCAN_INSTALL={TR_INSTALL_DIR}",
+        "attributes" : [ ]
+      },
+      {
+        "exe": "{PYTHON} {TR_TESTSUITE_DIR}/internal/scripts/eris_perf.py -b {TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}/bench -p {TR_INSTALL_DIR}/thrust/internal/benchmark/combine_benchmark_results.py",
+        "attributes": [ "result=multi" ]
+      },
+      {
+        "fini" : "{PYTHON} {TR_INSTALL_DIR}/GPUConfMgr/GPUConfMgr.py -caseName=reset -clock=P0 -VULCAN_INSTALL={TR_INSTALL_DIR}",
+        "attributes" : [ ]
+      }
+ ]
+}
+
+# File /home/rjray/Perforce/general/gpgpu/thrust/thrust_perf_tests.trs
+# Converted from /home/rjray/Perforce/general/gpgpu/thrust/thrust_perf_tests.vlct
+# Converted by tr_configtool.pl/0.4, on Fri Oct  6 13:07:44 2017
diff --git a/thrust/thrust_perf_tests.vlcc b/thrust/thrust_perf_tests.vlcc
new file mode 100644
index 0000000000000000000000000000000000000000..d02bf9e682bc9daa0c951d69577e9a0c992f74b6
--- /dev/null
+++ b/thrust/thrust_perf_tests.vlcc
@@ -0,0 +1,38 @@
+# Thrust performance tests component configuration. 
+{ 
+  # Descriptive name for the component
+  "name"      : "Thrust Performance Test Suite",
+  "type"      : "performance",
+  # Component owner (email address)
+  "owner"     : "blelbach@nvidia.com",
+  "module"    : "CUDA - Thrust",
+
+  # Build timeout (in seconds).
+  "buildtimeout" : "600",
+  # Define variables usable in this component
+  "env"       : [ "THRUST_TESTS_BIN_DIR=${VULCAN_BUILD_DIR}/bin/${VULCAN_ARCH}_${VULCAN_OS}${VULCAN_ABI}_${VULCAN_BUILD}" ],
+  # Files included in this component specified with one or more paths. 
+  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
+  "files"     : [
+                  "...",
+                  { "include" : "CUDA_TOOLKIT_BUILD_FILES" }
+                ],
+  # Output produced by this component and the installation location
+  # for each output. The install location is relative to
+  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
+  # artifact kinds.
+  "artifacts" : [
+                  { "${THRUST_TESTS_BIN_DIR}/bench" : "cuda/_tests/thrust_perf_tests/.", "kind": "EXE" },
+                  { "internal/benchmark/combine_benchmark_results.py" : "cuda/_tests/thrust_perf_tests/." },
+                  { "internal/scripts/eris_perf.py" : "cuda/_tests/thrust_perf_tests/." },
+                  { "thrust_perf_tests.vlct"        : "cuda/_tests/thrust_perf_tests/.", "kind": "TESTSUITE" }
+                ],
+  # Dependencies for this component.
+  "depends"   : [ "driver_headers", "driver", "compiler", "cuda", "thrust", "GPUConfMgr" ],
+  # The agent for this component, relative to this file location. The
+  # agent is invoked to perform component actions.
+  "agent"     : {
+                  "CUDA_MAKE" : "Makefile", "build_target" : "all", "clean_target" : "clean",
+                  "args" : [ "TEST_BENCH=1" ]
+                }
+}
diff --git a/thrust/thrust_perf_tests.vlct b/thrust/thrust_perf_tests.vlct
new file mode 100644
index 0000000000000000000000000000000000000000..1edbb7247c2bed305abf675e54bedde066faa4a9
--- /dev/null
+++ b/thrust/thrust_perf_tests.vlct
@@ -0,0 +1,33 @@
+# Thrust performance tests component configuration. 
+{
+  # Descriptive name for the testsuite (required).
+  "name"        : "Thrust Performance Testsuite",
+  # Testsuite owner's email (required).
+  "owner"       : "blelbach@nvidia.com",
+
+  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
+  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
+  # Linux, etc.)
+  "dllpath"     : [ "${VULCAN_INSTALL_DIR}/cuda/_internal/driver" ],
+  # Default working directory for test runs (optional).
+  "cwd"         : "${VULCAN_TESTSUITE_DIR}",
+  # Timeout for entire testsuite, in seconds (optional).
+  "timeout"     : "3600",
+  # Default timeout for individual tests, in seconds (optional).
+  "testtimeout" : "3600",
+  # The tests in the testsuite (required).
+  "tests" : [
+      {
+        "exe" : "${PYTHON} ${VULCAN_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=set -clock=P0 -VULCAN_INSTALL=${VULCAN_INSTALL_DIR}",
+        "attributes" : [ ]
+      },
+      {
+        "exe": "${PYTHON} eris_perf.py",
+        "attributes": [ "result=multi" ]
+      },
+      {
+        "exe" : "${PYTHON} ${VULCAN_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=reset -clock=P0 -VULCAN_INSTALL=${VULCAN_INSTALL_DIR}",
+        "attributes" : [ ]
+      }
+ ]
+}
diff --git a/thrust/thrust_tests.trs b/thrust/thrust_tests.trs
new file mode 100644
index 0000000000000000000000000000000000000000..f38f742014f9384e3fbbd8702f88504af19a246c
--- /dev/null
+++ b/thrust/thrust_tests.trs
@@ -0,0 +1,36 @@
+{ 
+  # Descriptive name for the component
+  "name"        : "Thrust Test Suite",
+  "version"     : "2",
+  # Component owner (email address)
+  "owner"       : "blelbach@nvidia.com",
+
+  "extrapath"   : [ "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}" ],
+  # Define paths containing shared libraries required by the tests. Use envvar TR_SHAREDLIB_DIR to refer 
+  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
+  # Linux, etc.)
+  "librarypath" : [ 
+                    "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}", 
+                    "{TR_INSTALL_DIR}\/cuda\/_internal\/driver",
+                    { "filter" : { "gpu": "gv100sxm2", "os": "Ubuntu18_04", "arch": "ppc64le" } },
+                    "{TR_INSTALL_DIR}/XLC_16_1_1/lib"
+                  ],
+  # Default working directory for test runs (optional). The directory can be a an absolute
+  # or relative path. A relative path is relative to this file's location. Variables can
+  # be used in the path using the {var} syntax.
+  "cwd"         : "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}",
+  # Timeout for entire testsuite, in seconds (optional). If not timeout is specified the
+  # default timeout value of 900 seconds will be used.
+  "timeout"     : "12000",
+  # Default timeout for individual tests, in seconds (optional).
+  "testtimeout" : "2700",
+  # The tests in the testsuite (required).
+  "tests"       : [
+    
+    {
+      "exe" : "{PERL} {TR_TESTSUITE_DIR}/internal/test/thrust_nightly.pl -bin-path={TR_INSTALL_DIR}/bin/{TR_ARCH_PATH} -filecheck-data-path={TR_TESTSUITE_DIR}/internal/test -filecheck-path={TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}/nvvm/tools", 
+      "attributes": [ "result=multi" ]
+    }
+
+  ]
+}
diff --git a/thrust/thrust_tests.vlcc b/thrust/thrust_tests.vlcc
new file mode 100644
index 0000000000000000000000000000000000000000..32ca412fad562074576e5c44de77d87815e8e73d
--- /dev/null
+++ b/thrust/thrust_tests.vlcc
@@ -0,0 +1,36 @@
+{ 
+  # Descriptive name for the component
+  "name"      : "Thrust Test Suite",
+  # Component owner (email address)
+  "owner"     : "blelbach@nvidia.com",
+  "module"    : "CUDA - Thrust",
+
+  # Build timeout (in seconds).
+  "buildtimeout" : "28800",
+  # Define variables usable in this component
+  "env"       : [ "THRUST_TESTS_BIN_DIR=${VULCAN_BUILD_DIR}/bin/${VULCAN_ARCH}_${VULCAN_OS}${VULCAN_ABI}_${VULCAN_BUILD}" ],
+  # Files included in this component specified with one or more paths. 
+  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
+  "files"     : [
+                  "...",
+                  { "include" : "CUDA_TOOLKIT_BUILD_FILES" }
+                ],
+  # Output produced by this component and the installation location
+  # for each output. The install location is relative to
+  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
+  # artifact kinds.
+  "artifacts" : [
+                  { "${THRUST_TESTS_BIN_DIR}/*"                    : "cuda/_tests/thrust_tests/." },
+                  { "internal/test/thrust_nightly.pl"              : "cuda/_tests/thrust_tests/." },
+                  { "internal/test/*.filecheck"                    : "cuda/_tests/thrust_tests/filecheck_data/." },
+                  { "thrust_tests.vlct"                            : "cuda/_tests/thrust_tests/.", "kind" : "TESTSUITE" }
+                ],
+  # Dependencies for this component.
+  "depends"   : [ "driver_headers", "driver", "compiler", "cuda", "thrust" ],
+  # The agent for this component, relative to this file location. The
+  # agent is invoked to perform component actions.
+  "agent"     : {
+                  "CUDA_MAKE" : "Makefile", "build_target" : "all", "clean_target" : "clean",
+                  "args" : [ "TEST_ALL=1" ]
+                }
+}
diff --git a/thrust/thrust_tests.vlct b/thrust/thrust_tests.vlct
new file mode 100644
index 0000000000000000000000000000000000000000..9ecd7d5219c006e3b040271c2c840e1922c53c12
--- /dev/null
+++ b/thrust/thrust_tests.vlct
@@ -0,0 +1,31 @@
+{
+  # Descriptive name for the testsuite (required).
+  "name"        : "Thrust Test Suite",
+  # Testsuite owner's email (required).
+  "owner"       : "blelbach@nvidia.com",
+
+  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
+  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
+  # Linux, etc.)
+  "dllpath"     : [ "${VULCAN_INSTALL_DIR}/cuda/${INSTALL_TARGET_DIR}/${SHAREDLIB_DIR}",
+                    "${VULCAN_INSTALL_DIR}/cuda/_internal/driver"
+                  ],
+  # Default working directory for test runs (optional). The directory can be a an absolute
+  # or relative path. A relative path is relative to this file's location. Variables can
+  # be used in the path using the ${var} syntax.
+  "cwd"         : "${VULCAN_TESTSUITE_DIR}",
+  # Timeout for entire testsuite, in seconds (optional). If not timeout is specified the
+  # default timeout value of 900 seconds will be used.
+  "timeout"     : "12000",
+  # Default timeout for individual tests, in seconds (optional).
+  "testtimeout" : "5400",
+  # The tests in the testsuite (required).
+  "tests"       : [
+    
+    {
+      "exe" : "${PERL} thrust_nightly.pl -bin-path=${VULCAN_TESTSUITE_DIR} -filecheck-data-path=${VULCAN_TESTSUITE_DIR}/filecheck_data -filecheck-path=${VULCAN_INSTALL_DIR}/cuda/_internal/compiler/nvvm/tools",
+      "attributes" : [ "result=multi" ]
+    }
+    
+  ]
+}
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..13922c13cbd4c0bfd25353ca306b0faeca13f080
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,56 @@
+import os
+import os.path as osp
+
+def get_experiment_id(debug=False):
+    if debug:
+        return 999999999999
+    import time
+    time.sleep(0.5)
+    return int(time.time()*100)
+
+def get_path_schedule(type, **kwargs):
+    if type == 'repeat':
+        max_path = kwargs['max_path']
+        schedule_each = kwargs['schedule_each']
+        return [schedule_each] * max_path
+    elif type == 'list':
+        schedule = kwargs['schedule']
+        return schedule
+    elif type == 'exp':
+        import math
+        base = kwargs['base']
+        max_path = kwargs['max_path']
+        max_path_per_iter = kwargs['max_path_per_iter']
+        schedule = []
+        cnt = 0
+        while sum(schedule) < max_path:
+            proposed_step = min(
+                max_path - sum(schedule), 
+                base**cnt, 
+                max_path_per_iter)
+            cnt += 1
+            schedule += [proposed_step]
+        return schedule
+    else:
+        raise ValueError
+
+def edict_2_dict(x):
+    if isinstance(x, dict):
+        xnew = {}
+        for k in x:
+            xnew[k] = edict_2_dict(x[k])
+        return xnew
+    elif isinstance(x, list):
+        xnew = []
+        for i in range(len(x)):
+            xnew.append( edict_2_dict(x[i]) )
+        return xnew
+    else:
+        return x
+
+def check_and_create_dir(path):
+    pathdir = osp.split(path)[0]
+    if osp.isdir(pathdir):
+        pass
+    else:
+        os.makedirs(pathdir)
diff --git a/vector.h b/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..3575b269b8f47cf26580bfa8cafbbf9af8ee1d7e
--- /dev/null
+++ b/vector.h
@@ -0,0 +1,817 @@
+#pragma once
+
+#include "diffvg.h"
+#include <cmath>
+#include <iostream>
+
+template <typename T>
+struct TVector2 {
+    DEVICE TVector2() {}
+
+    template <typename T2>
+    DEVICE
+    TVector2(T2 x, T2 y) : x(T(x)), y(T(y)) {}
+
+    template <typename T2>
+    DEVICE
+    TVector2(const TVector2<T2> &v) : x(T(v.x)), y(T(v.y)) {}
+
+    DEVICE T& operator[](int i) {
+        return *(&x + i);
+    }
+
+    DEVICE T operator[](int i) const {
+        return *(&x + i);
+    }
+
+    T x, y;
+};
+
+template <typename T>
+struct TVector3 {
+    DEVICE TVector3() {}
+
+    template <typename T2>
+    DEVICE
+    TVector3(T2 x, T2 y, T2 z) : x(T(x)), y(T(y)), z(T(z)) {}
+
+    template <typename T2>
+    DEVICE
+    TVector3(const TVector3<T2> &v) : x(T(v.x)), y(T(v.y)), z(T(v.z)) {}
+
+    DEVICE T& operator[](int i) {
+        return *(&x + i);
+    }
+
+    DEVICE T operator[](int i) const {
+        return *(&x + i);
+    }
+
+    T x, y, z;
+};
+
+template <typename T>
+struct TVector4 {
+    DEVICE TVector4() {}
+
+    template <typename T2>
+    DEVICE
+    TVector4(T2 x, T2 y, T2 z, T2 w) : x(T(x)), y(T(y)), z(T(z)), w(T(w)) {}
+
+    template <typename T2>
+    DEVICE
+    TVector4(const TVector4<T2> &v) : x(T(v.x)), y(T(v.y)), z(T(v.z)), w(T(v.w)) {}
+
+
+    DEVICE T& operator[](int i) {
+        return *(&x + i);
+    }
+
+    DEVICE T operator[](int i) const {
+        return *(&x + i);
+    }
+
+    T x, y, z, w;
+};
+
+using Vector2f = TVector2<float>;
+using Vector2d = TVector2<double>;
+using Vector2i = TVector2<int>;
+using Vector2 = TVector2<Real>;
+using Vector3i = TVector3<int>;
+using Vector3f = TVector3<float>;
+using Vector3d = TVector3<double>;
+using Vector3 = TVector3<Real>;
+using Vector4f = TVector4<float>;
+using Vector4d = TVector4<double>;
+using Vector4 = TVector4<Real>;
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TVector2<T0> &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0[0] + v1[0])> {
+    return TVector2<decltype(v0[0] + v1[0])>{
+        v0[0] + v1[0], v0[1] + v1[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const T0 &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0 + v1[0])> {
+    return TVector2<decltype(v0 + v1[0])>{v0 + v1[0], v0 + v1[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const T0 &v0,
+                      const TVector3<T1> &v1) -> TVector3<decltype(v0[0] + v1)> {
+    return TVector3<decltype(v0 + v1[0])>{
+        v0 + v1[0], v0 + v1[1], v0 + v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TVector2<T0> &v0,
+                      const T1 &v1) -> TVector2<decltype(v0[0] + v1)> {
+    return TVector2<decltype(v0[0] + v1)>{
+        v0[0] + v1, v0[1] + v1};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TVector3<T0> &v0,
+                      const T1 &v1) -> TVector3<decltype(v0[0] + v1[0])> {
+    return TVector3<decltype(v0[0] + v1)>{
+        v0[0] + v1, v0[1] + v1, v0[2] + v1};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TVector3<T0> &v0,
+                      const TVector3<T1> &v1) -> TVector3<decltype(v0[0] + v1[0])> {
+    return TVector3<decltype(v0[0] + v1[0])>{
+        v0[0] + v1[0], v0[1] + v1[1], v0[2] + v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TVector4<T0> &v0,
+                      const TVector4<T1> &v1) -> TVector4<decltype(v0[0] + v1[0])> {
+    return TVector4<decltype(v0[0] + v1[0])>{
+        v0[0] + v1[0], v0[1] + v1[1], v0[2] + v1[2], v0[3] + v1[3]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+=(TVector2<T0> &v0,
+                       const TVector2<T1> &v1) -> TVector2<T0>& {
+    v0[0] += v1[0];
+    v0[1] += v1[1];
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+=(TVector3<T0> &v0,
+                       const TVector3<T1> &v1) -> TVector3<T0>& {
+    v0[0] += v1[0];
+    v0[1] += v1[1];
+    v0[2] += v1[2];
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+=(TVector3<T0> &v0,
+                       const T1 &v1) -> TVector3<T0>& {
+    v0[0] += v1;
+    v0[1] += v1;
+    v0[2] += v1;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+=(TVector4<T0> &v0,
+                       const TVector4<T1> &v1) -> TVector4<T0>& {
+    v0[0] += v1[0];
+    v0[1] += v1[1];
+    v0[2] += v1[2];
+    v0[3] += v1[3];
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+=(TVector4<T0> &v0,
+                       const T1 &v1) -> TVector4<T0>& {
+    v0[0] += v1;
+    v0[1] += v1;
+    v0[2] += v1;
+    v0[3] += v1;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const T0 &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0 - v1[0])> {
+    return TVector2<decltype(v0 - v1[0])>{v0 - v1[0], v0 - v1[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const T0 &v0,
+                      const TVector3<T1> &v1) -> TVector2<decltype(v0 - v1[0])> {
+    return TVector3<decltype(v0 - v1[0])>{v0 - v1[0], v0 - v1[1], v0 - v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TVector2<T0> &v0,
+                      const T1 &v1) -> TVector2<decltype(v0[0] - v1)> {
+    return TVector2<decltype(v0[0] - v1)>{v0[0] - v1, v0[1] - v1};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TVector3<T0> &v0,
+                      const T1 &v1) -> TVector3<decltype(v0[0] - v1)> {
+    return TVector3<decltype(v0[0] - v1)>{v0[0] - v1, v0[1] - v1, v0[2] - v1};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TVector2<T0> &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0[0] - v1[0])> {
+    return TVector2<decltype(v0[0] - v1[0])>{
+        v0[0] - v1[0], v0[1] - v1[1]};
+}
+
+template <typename T>
+DEVICE
+inline auto operator-(const TVector2<T> &v) -> TVector2<T> {
+    return TVector2<T>{-v[0], -v[1]};
+}
+
+template <typename T>
+DEVICE
+inline auto operator-(const TVector3<T> &v) -> TVector3<T> {
+    return TVector3<T>{-v[0], -v[1], -v[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TVector3<T0> &v0,
+                      const TVector3<T1> &v1) -> TVector3<decltype(v0[0] - v1[0])> {
+    return TVector3<decltype(v0[0] - v1[0])>{
+        v0[0] - v1[0], v0[1] - v1[1], v0[2] - v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TVector4<T0> &v0,
+                      const TVector4<T1> &v1) -> TVector4<decltype(v0[0] - v1[0])>  {
+    return TVector4<decltype(v0[0] - v1[0])>{
+        v0[0] - v1[0], v0[1] - v1[1], v0[2] - v1[2], v0[3] - v1[3]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-=(TVector2<T0> &v0,
+                       const TVector2<T1> &v1) -> TVector2<T0>&  {
+    v0[0] -= v1[0];
+    v0[1] -= v1[1];
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-=(TVector3<T0> &v0,
+                       const TVector3<T1> &v1) -> TVector3<T0>& {
+    v0[0] -= v1[0];
+    v0[1] -= v1[1];
+    v0[2] -= v1[2];
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector2<T0> &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0[0] * v1[0])> {
+    return TVector2<decltype(v0[0] * v1[0])>{
+        v0[0] * v1[0], v0[1] * v1[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector2<T0> &v0,
+                      const T1 &s) -> TVector2<decltype(v0[0] * s)> {
+    return TVector2<decltype(v0[0] * s)>{
+        v0[0] * s, v0[1] * s};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const T0 &s,
+                      const TVector2<T1> &v0) -> TVector2<decltype(s * v0[0])> {
+    return TVector2<decltype(s * v0[0])>{s * v0[0], s * v0[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*=(TVector2<T0> &v0,
+                       const T1 &s) -> TVector2<T0>& {
+    v0[0] *= s;
+    v0[1] *= s;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector3<T0> &v0,
+                      const T1 &s) -> TVector3<decltype(v0[0] * s)> {
+    return TVector3<decltype(v0[0] * s)>{
+        v0[0] * s, v0[1] * s, v0[2] * s};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const T0 &s,
+                      const TVector3<T1> &v0) -> TVector3<decltype(s * v0[0])> {
+    return TVector3<decltype(s * v0[0])>{
+        s * v0[0], s * v0[1], s * v0[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*=(TVector3<T0> &v0,
+                        const T1 &s) -> TVector3<T0>& {
+    v0[0] *= s;
+    v0[1] *= s;
+    v0[2] *= s;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*=(TVector4<T0> &v0,
+                        const T1 &s) -> TVector4<T0>& {
+    v0[0] *= s;
+    v0[1] *= s;
+    v0[2] *= s;
+    v0[3] *= s;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector3<T0> &v0,
+                      const TVector3<T1> &v1) -> TVector3<decltype(v0[0] * v1[0])> {
+    return TVector3<decltype(v0[0] * v1[0])>{
+        v0[0] * v1[0], v0[1] * v1[1], v0[2] * v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector4<T0> &v0,
+                      const T1 &s) -> TVector4<decltype(v0[0] * s)> {
+    return TVector4<decltype(v0[0] * s)>{
+        v0[0] * s, v0[1] * s, v0[2] * s, v0[3] * s};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const T0 &s,
+                      const TVector4<T1> &v0) -> TVector4<decltype(s * v0[0])> {
+    return TVector4<decltype(s * v0[0])>{
+        s * v0[0], s * v0[1], s * v0[2], s * v0[3]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator*(const TVector4<T0> &v0,
+                      const TVector4<T1> &v1) -> TVector4<decltype(v0[0] * v1[0])> {
+    return TVector4<decltype(v0[0] * v1[0])>{
+        v0[0] * v1[0], v0[1] * v1[1], v0[2] * v1[2], v0[3] * v1[3]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const TVector2<T0> &v0,
+                      const T1 &s) -> TVector2<decltype(v0[0] / s)> {
+    auto inv_s = 1.f / s;
+    return v0 * inv_s;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const TVector3<T0> &v0,
+                      const T1 &s) -> TVector3<decltype(v0[0] / s)> {
+    auto inv_s = 1.f / s;
+    return v0 * inv_s;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const TVector4<T0> &v0,
+                      const T1 &s) -> TVector4<decltype(v0[0] / s)> {
+    auto inv_s = 1.f / s;
+    return v0 * inv_s;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const T0 &s,
+                      const TVector3<T1> &v1) -> TVector3<decltype(s / v1[0])> {
+    return TVector3<decltype(s / v1[0])>{
+        s / v1[0], s / v1[2], s / v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const TVector3<T0> &v0,
+                      const TVector3<T1> &v1) -> TVector3<decltype(v0[0] / v1[0])> {
+    return TVector3<decltype(v0[0] / v1[0])>{
+        v0[0] / v1[0], v0[1] / v1[2], v0[2] / v1[2]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/(const TVector2<T0> &v0,
+                      const TVector2<T1> &v1) -> TVector2<decltype(v0[0] / v1[0])> {
+    return TVector2<decltype(v0[0] / v1[0])>{
+        v0[0] / v1[0], v0[1] / v1[1]};
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/=(TVector3<T0> &v0,
+                        const T1 &s) -> TVector3<T0>& {
+    auto inv_s = 1.f / s;
+    v0[0] *= inv_s;
+    v0[1] *= inv_s;
+    v0[2] *= inv_s;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto operator/=(TVector4<T0> &v0,
+                        const T1 &s) -> TVector4<T0>& {
+    auto inv_s = 1.f / s;
+    v0[0] *= inv_s;
+    v0[1] *= inv_s;
+    v0[2] *= inv_s;
+    v0[3] *= inv_s;
+    return v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline bool operator==(const TVector2<T0> &v0,
+                       const TVector2<T1> &v1) {
+    return v0.x == v1.x && v0.y == v1.y;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline bool operator==(const TVector3<T0> &v0,
+                       const TVector3<T1> &v1) {
+    return v0.x == v1.x && v0.y == v1.y && v0.z == v1.z;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline bool operator!=(const TVector3<T0> &v0,
+                       const TVector3<T1> &v1) {
+    return v0.x != v1.x || v0.y != v1.y || v0.z != v1.z;
+}
+
+template <typename T>
+DEVICE
+inline TVector2<T> get_normal(const TVector2<T> &v) {
+    return TVector2<T>{v.y, -v.x};
+}
+
+template <typename T>
+DEVICE
+inline T length_squared(const TVector2<T> &v0) {
+    return square(v0[0]) + square(v0[1]);
+}
+
+template <typename T>
+DEVICE
+inline TVector2<T> d_length_squared(const TVector2<T> &v0, const T &d_l_sq) {
+    //l_sq = square(v0[0]) + square(v0[1])
+    return 2 * d_l_sq * v0;
+}
+
+template <typename T>
+DEVICE
+inline T length(const TVector2<T> &v0) {
+    return sqrt(length_squared(v0));
+}
+
+template <typename T>
+DEVICE
+inline TVector2<T> d_length(const TVector2<T> &v0, const T &d_l) {
+    auto l_sq = length_squared(v0);
+    auto l = sqrt(l_sq);
+    auto d_l_sq = 0.5f * d_l / l;
+    return d_length_squared(v0, T(d_l_sq));
+}
+
+template <typename T>
+DEVICE
+inline T length_squared(const TVector3<T> &v0) {
+    return square(v0[0]) + square(v0[1]) + square(v0[2]);
+}
+
+template <typename T>
+DEVICE
+inline TVector3<T> d_length_squared(const TVector3<T> &v0, const T &d_l_sq) {
+    //l_sq = square(v0[0]) + square(v0[1]) + square(v0[2])
+    return 2 * d_l_sq * v0;
+}
+
+template <typename T>
+DEVICE
+inline T length(const TVector3<T> &v0) {
+    return sqrt(length_squared(v0));
+}
+
+template <typename T>
+DEVICE
+inline TVector3<T> d_length(const TVector3<T> &v0, const T &d_l) {
+    auto l_sq = length_squared(v0);
+    auto l = sqrt(l_sq);
+    auto d_l_sq = 0.5f * d_l / l;
+    return d_length_squared(v0, d_l_sq);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto distance_squared(const TVector2<T0> &v0,
+                             const TVector2<T1> &v1) -> decltype(length_squared(v1 - v0)) {
+    return length_squared(v1 - v0);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto distance_squared(const TVector3<T0> &v0,
+                             const TVector3<T1> &v1) -> decltype(length_squared(v1 - v0)) {
+    return length_squared(v1 - v0);
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto distance(const TVector2<T0> &v0,
+                     const TVector2<T1> &v1) -> decltype(length(v1 - v0)) {
+    return length(v1 - v0);
+}
+
+template <typename T>
+DEVICE
+inline void d_distance(const TVector2<T> &v0,
+                       const TVector2<T> &v1,
+                       const T &d_output,
+                       TVector2<T> &d_v0,
+                       TVector2<T> &d_v1) {
+    auto d_v1_v0 = d_length(v1 - v0, d_output);
+    d_v0 -= d_v1_v0;
+    d_v1 += d_v1_v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto distance(const TVector3<T0> &v0,
+                     const TVector3<T1> &v1) -> decltype(length(v1 - v0)) {
+    return length(v1 - v0);
+}
+
+template <typename T>
+DEVICE
+inline void d_distance(const TVector3<T> &v0,
+                       const TVector3<T> &v1,
+                       const T &d_output,
+                       TVector3<T> &d_v0,
+                       TVector3<T> &d_v1) {
+    auto d_v1_v0 = d_length(v1 - v0, d_output);
+    d_v0 -= d_v1_v0;
+    d_v1 += d_v1_v0;
+}
+
+template <typename T>
+DEVICE
+inline TVector2<T> normalize(const TVector2<T> &v0) {
+    return v0 / length(v0);
+}
+
+template <typename T>
+DEVICE
+inline TVector2<T> d_normalize(const TVector2<T> &v0, const TVector2<T> &d_n) {
+    auto l = length(v0);
+    auto n = v0 / l;
+    auto d_v0 = d_n / l;
+    auto d_l = -dot(d_n, n) / l;
+    // l = length(v0)
+    d_v0 += d_length(v0, d_l);
+    return d_v0;
+}
+
+template <typename T>
+DEVICE
+inline TVector3<T> normalize(const TVector3<T> &v0) {
+    return v0 / length(v0);
+}
+
+template <typename T>
+DEVICE
+inline TVector3<T> d_normalize(const TVector3<T> &v0, const TVector3<T> &d_n) {
+    auto l = length(v0);
+    auto n = v0 / l;
+    auto d_v0 = d_n / l;
+    auto d_l = -dot(d_n, n) / l;
+    // l = length(v0)
+    d_v0 += d_length(v0, d_l);
+    return d_v0;
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto dot(const TVector2<T0> &v0, const TVector2<T1> &v1) -> decltype(v0[0] * v1[0]) {
+    return v0[0] * v1[0] +
+           v0[1] * v1[1];
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto dot(const TVector3<T0> &v0, const TVector3<T1> &v1) -> decltype(v0[0] * v1[0]) {
+    return v0[0] * v1[0] +
+           v0[1] * v1[1] +
+           v0[2] * v1[2];
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto dot(const TVector4<T0> &v0, const TVector4<T1> &v1) -> decltype(v0[0] * v1[0]) {
+    return v0[0] * v1[0] +
+           v0[1] * v1[1] +
+           v0[2] * v1[2] +
+           v0[3] * v1[3];
+}
+
+template <typename T0, typename T1>
+DEVICE
+inline auto cross(const TVector3<T0> &v0, const TVector3<T1> &v1) -> TVector3<decltype(v0[1] * v1[2] - v0[2] * v1[1])> {
+    return TVector3<decltype(v0[1] * v1[2] - v0[2] * v1[1])>{
+        v0[1] * v1[2] - v0[2] * v1[1],
+        v0[2] * v1[0] - v0[0] * v1[2],
+        v0[0] * v1[1] - v0[1] * v1[0]};
+}
+
+template <typename T>
+DEVICE
+inline void d_cross(const TVector3<T> &v0, const TVector3<T> &v1, const TVector3<T> &d_output,
+                    TVector3<T> &d_v0, TVector3<T> &d_v1) {
+    d_v0 += cross(v1, d_output);
+    d_v1 += cross(d_output, v0);
+}
+
+template <typename T>
+DEVICE
+inline T luminance(const TVector3<T> &v) {
+    return 0.212671f * v[0] +
+           0.715160f * v[1] +
+           0.072169f * v[2];
+}
+
+template <typename T>
+DEVICE
+inline T sum(const T &v) {
+    return v;
+}
+
+template <typename T>
+DEVICE
+inline T sum(const TVector2<T> &v) {
+    return v[0] + v[1];
+}
+
+template <typename T>
+DEVICE
+inline T sum(const TVector3<T> &v) {
+    return v[0] + v[1] + v[2];
+}
+
+template <typename T>
+DEVICE
+inline T sum(const TVector4<T> &v) {
+    return v[0] + v[1] + v[2] + v[3];
+}
+
+template <typename T>
+DEVICE
+void coordinate_system(const TVector3<T> &n, TVector3<T> &x, TVector3<T> &y) {
+    if (n[2] < -1.f + 1e-6f) {
+        x = TVector3<T>{T(0), T(-1), T(0)};
+        y = TVector3<T>{T(-1), T(0), T(0)};
+    } else {
+        auto a = 1.f / (1.f + n[2]);
+        auto b = -n[0] * n[1] * a;
+        x = TVector3<T>{1.f - square(n[0]) * a, b, -n[0]};
+        y = TVector3<T>{b, 1.f - square(n[1]) * a, -n[1]};
+    }
+}
+
+template <typename T>
+DEVICE
+void d_coordinate_system(const TVector3<T> &n, const TVector3<T> &d_x, const TVector3<T> &d_y,
+                         TVector3<T> &d_n) {
+    if (n[2] < -1.f + 1e-6f) {
+        //x = TVector3<T>{T(0), T(-1), T(0)};
+        //y = TVector3<T>{T(-1), T(0), T(0)};
+        // don't need to do anything
+    } else {
+        auto a = 1.f / (1.f + n[2]);
+        // auto b = -n[0] * n[1] * a;
+        // x = TVector3<T>{1.f - square(n[0]) * a, b, -n[0]}
+        d_n[0] -= 2.f * n[0] * d_x[0] * a;
+        auto d_a = -square(n[0]) * d_x[0];
+        auto d_b = d_x[1];
+        d_n[0] -= d_x[2];
+        // y = TVector3<T>{b, 1.f - square(n[1]) * a, -n[1]}
+        d_b += d_y[0];
+        d_n[1] -= 2.f * d_y[1] * n[1] * a;
+        d_a -= d_y[1] * square(n[1]);
+        d_n[1] -= d_y[2];
+        // b = -n[0] * n[1] * a
+        d_n[0] -= d_b * n[1] * a;
+        d_n[1] -= d_b * n[0] * a;
+        d_a -= d_b * n[0] * n[1];
+        // a = 1 / (1 + n[2])
+        d_n[2] -= d_a * a / (1 + n[2]);
+    }
+}
+
+DEVICE
+inline bool isfinite(const Vector2 &v) {
+    return isfinite(v.x) &&
+           isfinite(v.y);
+}
+
+DEVICE
+inline bool isfinite(const Vector3 &v) {
+    return isfinite(v.x) &&
+           isfinite(v.y) &&
+           isfinite(v.z);
+}
+
+DEVICE
+inline bool isfinite(const Vector4 &v) {
+    return isfinite(v.x) &&
+           isfinite(v.y) &&
+           isfinite(v.z) &&
+           isfinite(v.w);
+}
+
+DEVICE
+inline bool is_zero(const Vector3 &v) {
+    return v.x == 0 && v.y == 0 && v.z == 0;
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TVector2<T> &v) {
+    return os << "(" << v[0] << ", " << v[1] << ")";
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TVector3<T> &v) {
+    return os << "(" << v[0] << ", " << v[1] << ", " << v[2] << ")";
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TVector4<T> &v) {
+    return os << "(" << v[0] << ", " << v[1] << ", " << v[2] << ", " << v[3] << ")";
+}
+
+DEVICE
+inline
+float det(const Vector2f &a, const Vector2f &b) {
+    return a.x*b.y-b.x*a.y;
+}
+
+DEVICE
+inline
+Vector2f quadratic_closest_pt_approx(const Vector2f &b0,
+                                     const Vector2f &b1,
+                                     const Vector2f &b2,
+                                     float *t_ = nullptr) {
+    // From http://w3.impa.br/~diego/publications/NehHop08.pdf
+    float a=det(b0,b2), b=2*det(b1,b0), d=2*det(b2,b1);
+    float f=b*d-a*a;
+    Vector2f d21=b2-b1, d10=b1-b0, d20=b2-b0;
+    Vector2f gf=2*(b*d21+d*d10+a*d20);
+    gf=Vector2f(gf.y,-gf.x);
+    Vector2f pp=-f*gf/dot(gf,gf);
+    Vector2f d0p=b0-pp;
+    float ap=det(d0p,d20), bp=2*det(d10,d0p);
+    float t=clamp((ap+bp)/(2*a+b+d),0.f,1.f);
+    float tt = 1 - t;
+    if (t_ != nullptr) {
+        *t_ = t;
+    }
+    return (tt*tt)*b0 + (2*tt*t)*b1 + (t*t)*b2;
+}
+
+DEVICE
+inline
+Vector2f quadratic_closest_pt_approx(const Vector2f &b0,
+                                     const Vector2f &b1,
+                                     const Vector2f &b2,
+                                     const Vector2f &pt,
+                                     float *t = nullptr) {
+    // Approximate closest point to a quadratic curve
+    return quadratic_closest_pt_approx(b0 - pt, b1 - pt, b2 - pt, t) + pt;
+}
diff --git a/winding_number.h b/winding_number.h
new file mode 100644
index 0000000000000000000000000000000000000000..8791a4cdeeeb6136e12182782ea66946053cb554
--- /dev/null
+++ b/winding_number.h
@@ -0,0 +1,202 @@
+#pragma once
+
+#include "diffvg.h"
+#include "scene.h"
+#include "shape.h"
+#include "solve.h"
+#include "vector.h"
+
+DEVICE
+int compute_winding_number(const Circle &circle, const Vector2f &pt) {
+    const auto &c = circle.center;
+    auto r = circle.radius;
+    // inside the circle: return 1, outside the circle: return 0
+    if (distance_squared(c, pt) < r * r) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+DEVICE
+int compute_winding_number(const Ellipse &ellipse, const Vector2f &pt) {
+    const auto &c = ellipse.center;
+    const auto &r = ellipse.radius;
+    // inside the ellipse: return 1, outside the ellipse: return 0
+    if (square(c.x - pt.x) / square(r.x) + square(c.y - pt.y) / square(r.y) < 1) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+DEVICE
+bool intersect(const AABB &box, const Vector2f &pt) {
+    if (pt.y < box.p_min.y || pt.y > box.p_max.y) {
+        return false;
+    }
+    if (pt.x > box.p_max.x) {
+        return false;
+    }
+    return true;
+}
+
+DEVICE
+int compute_winding_number(const Path &path, const BVHNode *bvh_nodes, const Vector2f &pt) {
+    // Shoot a horizontal ray from pt to right, intersect with all curves of the path,
+    // count intersection
+    auto num_segments = path.num_base_points;
+    constexpr auto max_bvh_size = 128;
+    int bvh_stack[max_bvh_size];
+    auto stack_size = 0;
+    auto winding_number = 0;
+    bvh_stack[stack_size++] = 2 * num_segments - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto base_point_id = node.child0;
+            auto point_id = - node.child1 - 1;
+            assert(base_point_id < num_segments);
+            assert(point_id < path.num_points);
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                // intersect p0 + t * (p1 - p0) with pt + t' * (1, 0)
+                // solve:
+                // pt.x + t' = v0.x + t * (v1.x - v0.x)
+                // pt.y      = v0.y + t * (v1.y - v0.y)
+                if (p1.y != p0.y) {
+                    auto t = (pt.y - p0.y) / (p1.y - p0.y);
+                    if (t >= 0 && t <= 1) {
+                        auto tp = p0.x - pt.x + t * (p1.x - p0.x);
+                        if (tp >= 0) {
+                            if (p1.y - p0.y > 0) {
+                                winding_number += 1;
+                            } else {
+                                winding_number -= 1;
+                            }
+                        }
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+                // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0
+                // intersect with pt + t' * (1 0)
+                // solve
+                // pt.y = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0
+                float t[2];
+                if (solve_quadratic(p0.y-2*p1.y+p2.y,
+                                    -2*p0.y+2*p1.y,
+                                    p0.y-pt.y,
+                                    &t[0], &t[1])) {
+                    for (int j = 0; j < 2; j++) {
+                        if (t[j] >= 0 && t[j] <= 1) {
+                            auto tp = (p0.x-2*p1.x+p2.x)*t[j]*t[j] +
+                                      (-2*p0.x+2*p1.x)*t[j] +
+                                      p0.x-pt.x;
+                            if (tp >= 0) {
+                                if (2*(p0.y-2*p1.y+p2.y)*t[j]+(-2*p0.y+2*p1.y) > 0) {
+                                    winding_number += 1;
+                                } else {
+                                    winding_number -= 1;
+                                }
+                            }
+                        }
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+                // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+                // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+                // intersect with pt + t' * (1 0)
+                // solve:
+                // pt.y = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+                double t[3];
+                int num_sol = solve_cubic(double(-p0.y+3*p1.y-3*p2.y+p3.y),
+                                          double(3*p0.y-6*p1.y+3*p2.y),
+                                          double(-3*p0.y+3*p1.y),
+                                          double(p0.y-pt.y),
+                                          t);
+                for (int j = 0; j < num_sol; j++) {
+                    if (t[j] >= 0 && t[j] <= 1) {
+                        // t' = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0 - pt.x
+                        auto tp = (-p0.x+3*p1.x-3*p2.x+p3.x)*t[j]*t[j]*t[j]+
+                                  (3*p0.x-6*p1.x+3*p2.x)*t[j]*t[j]+
+                                  (-3*p0.x+3*p1.x)*t[j]+
+                                  p0.x-pt.x;
+                        if (tp > 0) {
+                            if (3*(-p0.y+3*p1.y-3*p2.y+p3.y)*t[j]*t[j]+
+                                2*(3*p0.y-6*p1.y+3*p2.y)*t[j]+
+                                (-3*p0.y+3*p1.y) > 0) {
+                                winding_number += 1;
+                            } else {
+                                winding_number -= 1;
+                            }
+                        }
+                    }
+                }
+            } else {
+                assert(false);
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (intersect(b0, pt)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (intersect(b1, pt)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_size);
+        }
+    }
+    return winding_number;
+}
+
+DEVICE
+int compute_winding_number(const Rect &rect, const Vector2f &pt) {
+    const auto &p_min = rect.p_min;
+    const auto &p_max = rect.p_max;
+    // inside the rectangle: return 1, outside the rectangle: return 0
+    if (pt.x > p_min.x && pt.x < p_max.x && pt.y > p_min.y && pt.y < p_max.y) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+DEVICE
+int compute_winding_number(const Shape &shape, const BVHNode *bvh_nodes, const Vector2f &pt) {
+    switch (shape.type) {
+        case ShapeType::Circle:
+            return compute_winding_number(*(const Circle *)shape.ptr, pt);
+        case ShapeType::Ellipse:
+            return compute_winding_number(*(const Ellipse *)shape.ptr, pt);
+        case ShapeType::Path:
+            return compute_winding_number(*(const Path *)shape.ptr, bvh_nodes, pt);
+        case ShapeType::Rect:
+            return compute_winding_number(*(const Rect *)shape.ptr, pt);
+    }
+    assert(false);
+    return 0;
+}
diff --git a/within_distance.h b/within_distance.h
new file mode 100644
index 0000000000000000000000000000000000000000..e81537786189b9ded312cdb9b0472b2eef7bd512
--- /dev/null
+++ b/within_distance.h
@@ -0,0 +1,446 @@
+#pragma once
+
+#include "diffvg.h"
+#include "edge_query.h"
+#include "shape.h"
+#include "vector.h"
+
+DEVICE
+inline
+bool within_distance(const Circle &circle, const Vector2f &pt, float r) {
+    auto dist_to_center = distance(circle.center, pt);
+    if (fabs(dist_to_center - circle.radius) < r) {
+        return true;
+    }
+    return false;
+}
+
+DEVICE
+inline
+bool within_distance(const Path &path, const BVHNode *bvh_nodes, const Vector2f &pt, float r) {
+    auto num_segments = path.num_base_points;
+    constexpr auto max_bvh_size = 128;
+    int bvh_stack[max_bvh_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * num_segments - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto base_point_id = node.child0;
+            auto point_id = - node.child1 - 1;
+            assert(base_point_id < num_segments);
+            assert(point_id < path.num_points);
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                // project pt to line
+                auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+                auto r0 = r;
+                auto r1 = r;
+                // override radius if path has thickness
+                if (path.thickness != nullptr) {
+                    r0 = path.thickness[i0];
+                    r1 = path.thickness[i1];
+                }
+                if (t < 0) {
+                    if (distance_squared(p0, pt) < r0 * r0) {
+                        return true;
+                    }
+                } else if (t > 1) {
+                    if (distance_squared(p1, pt) < r1 * r1) {
+                        return true;
+                    }
+                } else {
+                    auto r = r0 + t * (r1 - r0);
+                    if (distance_squared(p0 + t * (p1 - p0), pt) < r * r) {
+                        return true;
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                if (path.use_distance_approx) {
+                    auto cp = quadratic_closest_pt_approx(p0, p1, p2, pt);
+                    return distance_squared(cp, pt) < r * r;
+                }
+                auto eval = [&](float t) -> Vector2f {
+                    auto tt = 1 - t;
+                    return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+                };
+                auto r0 = r;
+                auto r1 = r;
+                auto r2 = r;
+                // override radius if path has thickness
+                if (path.thickness != nullptr) {
+                    r0 = path.thickness[i0];
+                    r1 = path.thickness[i1];
+                    r2 = path.thickness[i2];
+                }
+                if (distance_squared(eval(0), pt) < r0 * r0) {
+                    return true;
+                }
+                if (distance_squared(eval(1), pt) < r2 * r2) {
+                    return true;
+                }
+
+                // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+                // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0 = q
+                // Want to solve (q - pt) dot q' = 0
+                // q' = (p0-2p1+p2)t + (-p0+p1)
+                // Expanding (p0-2p1+p2)^2 t^3 +
+                //           3(p0-2p1+p2)(-p0+p1) t^2 +
+                //           (2(-p0+p1)^2+(p0-2p1+p2)(p0-pt))t +
+                //           (-p0+p1)(p0-pt) = 0
+                auto A = sum((p0-2*p1+p2)*(p0-2*p1+p2));
+                auto B = sum(3*(p0-2*p1+p2)*(-p0+p1));
+                auto C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt));
+                auto D = sum((-p0+p1)*(p0-pt));
+                float t[3];
+                int num_sol = solve_cubic(A, B, C, D, t);
+                for (int j = 0; j < num_sol; j++) {
+                    if (t[j] >= 0 && t[j] <= 1) {
+                        auto tt = 1 - t[j];
+                        auto r = (tt*tt)*r0 + (2*tt*t[j])*r1 + (t[j]*t[j])*r2;
+                        auto p = eval(t[j]);
+                        if (distance_squared(p, pt) < r*r) {
+                            return true;
+                        }
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+                auto eval = [&](float t) -> Vector2f {
+                    auto tt = 1 - t;
+                    return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+                };
+                auto r0 = r;
+                auto r1 = r;
+                auto r2 = r;
+                auto r3 = r;
+                // override radius if path has thickness
+                if (path.thickness != nullptr) {
+                    r0 = path.thickness[i0];
+                    r1 = path.thickness[i1];
+                    r2 = path.thickness[i2];
+                    r3 = path.thickness[i3];
+                }
+                if (distance_squared(eval(0), pt) < r0*r0) {
+                    return true;
+                }
+                if (distance_squared(eval(1), pt) < r3*r3) {
+                    return true;
+                }
+                // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+                // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+                // Want to solve (q - pt) dot q' = 0
+                // q' = 3*(-p0+3p1-3p2+p3)t^2 + 2*(3p0-6p1+3p2)t + (-3p0+3p1)
+                // Expanding 
+                // 3*(-p0+3p1-3p2+p3)^2 t^5
+                // 5*(-p0+3p1-3p2+p3)(3p0-6p1+3p2) t^4
+                // 4*(-p0+3p1-3p2+p3)(-3p0+3p1) + 2*(3p0-6p1+3p2)^2 t^3
+                // 3*(3p0-6p1+3p2)(-3p0+3p1) + 3*(-p0+3p1-3p2+p3)(p0-pt) t^2
+                // (-3p0+3p1)^2+2(p0-pt)(3p0-6p1+3p2) t
+                // (p0-pt)(-3p0+3p1)
+                double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3));
+                double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+                double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+                double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+                double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+                double F = sum((p0-pt)*(-3*p0+3*p1));
+                // normalize the polynomial
+                B /= A;
+                C /= A;
+                D /= A;
+                E /= A;
+                F /= A;
+                // Isolator Polynomials:
+                // https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.133.2233&rep=rep1&type=pdf
+                //                                       x/5 + B/25
+                //                                    /-----------------------------------------------------
+                // 5x^4 + 4B x^3 + 3C x^2 + 2D x + E /   x^5 +    B x^4 +       C x^3 +      D x^2 +      E x + F
+                //                                       x^5 + 4B/5 x^4 +    3C/5 x^3 +   2D/5 x^2 +    E/5 x
+                //                                      ----------------------------------------------------
+                //                                              B/5 x^4 +    2C/5 x^3 +   3D/5 x^2 +   4E/5 x + F
+                //                                              B/5 x^4 + 4B^2/25 x^3 + 3BC/25 x^2 + 2BD/25 x + BE/25
+                //                                      ----------------------------------------------------
+                //                                     (2C/5 - 4B^2/25)x^3 + (3D/5-3BC/25)x^2 + (4E/5-2BD/25) + (F-BE/25)
+                auto p1A = ((2 / 5.f) * C - (4 / 25.f) * B * B);
+                auto p1B = ((3 / 5.f) * D - (3 / 25.f) * B * C);
+                auto p1C = ((4 / 5.f) * E - (2 / 25.f) * B * D);
+                auto p1D = F - B * E / 25.f;
+                // auto q1A = 1 / 5.f;
+                // auto q1B = B / 25.f;
+                // x/5 + B/25 = 0
+                // x = -B/5
+                auto q_root = -B/5.f;
+                double p_roots[3];
+                int num_sol = solve_cubic(p1A, p1B, p1C, p1D, p_roots);
+                float intervals[4];
+                if (q_root >= 0 && q_root <= 1) {
+                    intervals[0] = q_root;
+                }
+                for (int j = 0; j < num_sol; j++) {
+                    intervals[j + 1] = p_roots[j];
+                }
+                auto num_intervals = 1 + num_sol;
+                // sort intervals
+                for (int j = 1; j < num_intervals; j++) {
+                    for (int k = j; k > 0 && intervals[k - 1] > intervals[k]; k--) {
+                        auto tmp = intervals[k];
+                        intervals[k] = intervals[k - 1];
+                        intervals[k - 1] = tmp;
+                    }
+                }
+                auto eval_polynomial = [&] (double t) {
+                    return t*t*t*t*t+
+                           B*t*t*t*t+
+                           C*t*t*t+
+                           D*t*t+
+                           E*t+
+                           F;
+                };
+                auto eval_polynomial_deriv = [&] (double t) {
+                    return 5*t*t*t*t+
+                           4*B*t*t*t+
+                           3*C*t*t+
+                           2*D*t+
+                           E;
+                };
+                auto lower_bound = 0.f;
+                for (int j = 0; j < num_intervals + 1; j++) {
+                    if (j < num_intervals && intervals[j] < 0.f) {
+                        continue;
+                    }
+                    auto upper_bound = j < num_intervals ?
+                        min(intervals[j], 1.f) : 1.f;
+                    auto lb = lower_bound;
+                    auto ub = upper_bound;
+                    auto lb_eval = eval_polynomial(lb);
+                    auto ub_eval = eval_polynomial(ub);
+                    if (lb_eval * ub_eval > 0) {
+                        // Doesn't have root
+                        continue;
+                    }
+                    if (lb_eval > ub_eval) {
+                        swap_(lb, ub);
+                    }
+                    auto t = 0.5f * (lb + ub);
+                    for (int it = 0; it < 20; it++) {
+                        if (!(t >= lb && t <= ub)) {
+                            t = 0.5f * (lb + ub);
+                        }
+                        auto value = eval_polynomial(t);
+                        if (fabs(value) < 1e-5f || it == 19) {
+                            break;
+                        }
+                        // The derivative may not be entirely accurate,
+                        // but the bisection is going to handle this
+                        if (value > 0.f) {
+                            ub = t;
+                        } else {
+                            lb = t;
+                        }
+                        auto derivative = eval_polynomial_deriv(t);
+                        t -= value / derivative;
+                    }
+                    auto tt = 1 - t;
+                    auto r = (tt*tt*tt)*r0 + (3*tt*tt*t)*r1 + (3*tt*t*t)*r2 + (t*t*t)*r3;
+                    if (distance_squared(eval(t), pt) < r * r) {
+                        return true;
+                    }
+                    if (upper_bound >= 1.f) {
+                        break;
+                    }
+                    lower_bound = upper_bound;
+                }
+            } else {
+                assert(false);
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (within_distance(b0, pt, bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (within_distance(b1, pt, bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_size);
+        }
+    }
+    return false;
+}
+
+DEVICE
+inline
+int within_distance(const Rect &rect, const Vector2f &pt, float r) {
+    auto test = [&](const Vector2f &p0, const Vector2f &p1) {
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        if (t < 0) {
+            if (distance_squared(p0, pt) < r * r) {
+                return true;
+            }
+        } else if (t > 1) {
+            if (distance_squared(p1, pt) < r * r) {
+                return true;
+            }
+        } else {
+            if (distance_squared(p0 + t * (p1 - p0), pt) < r * r) {
+                return true;
+            }
+        }
+        return false;
+    };
+    auto left_top = rect.p_min;
+    auto right_top = Vector2f{rect.p_max.x, rect.p_min.y};
+    auto left_bottom = Vector2f{rect.p_min.x, rect.p_max.y};
+    auto right_bottom = rect.p_max;
+    // left
+    if (test(left_top, left_bottom)) {
+        return true;
+    }
+    // top
+    if (test(left_top, right_top)) {
+        return true;
+    }
+    // right
+    if (test(right_top, right_bottom)) {
+        return true;
+    }
+    // bottom
+    if (test(left_bottom, right_bottom)) {
+        return true;
+    }
+    return false;
+}
+
+DEVICE
+inline
+bool within_distance(const Shape &shape, const BVHNode *bvh_nodes, const Vector2f &pt, float r) {
+    switch (shape.type) {
+        case ShapeType::Circle:
+            return within_distance(*(const Circle *)shape.ptr, pt, r);
+        case ShapeType::Ellipse:
+            // https://www.geometrictools.com/Documentation/DistancePointEllipseEllipsoid.pdf
+            assert(false);
+            return false;
+        case ShapeType::Path:
+            return within_distance(*(const Path *)shape.ptr, bvh_nodes, pt, r);
+        case ShapeType::Rect:
+            return within_distance(*(const Rect *)shape.ptr, pt, r);
+    }
+    assert(false);
+    return false;
+}
+
+DEVICE
+inline
+bool within_distance(const SceneData &scene,
+                     int shape_group_id,
+                     const Vector2f &pt) {
+    const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(shape_group.canvas_to_shape, pt);
+
+    constexpr auto max_bvh_stack_size = 64;
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * shape_group.num_shapes - 2;
+    const auto &bvh_nodes = scene.shape_groups_bvh_nodes[shape_group_id];
+
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto shape_id = node.child0;
+            const auto &shape = scene.shapes[shape_id];
+            if (within_distance(shape, scene.path_bvhs[shape_id],
+                                local_pt, shape.stroke_width)) {
+                return true;
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (inside(b0, local_pt, bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (inside(b1, local_pt, bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+
+    return false;
+}
+
+DEVICE
+inline
+bool within_distance(const SceneData &scene,
+                     int shape_group_id,
+                     const Vector2f &pt,
+                     EdgeQuery *edge_query) {
+    if (edge_query == nullptr || shape_group_id != edge_query->shape_group_id) {
+        // Specialized version
+        return within_distance(scene, shape_group_id, pt);
+    }
+    const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(shape_group.canvas_to_shape, pt);
+
+    constexpr auto max_bvh_stack_size = 64;
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * shape_group.num_shapes - 2;
+    const auto &bvh_nodes = scene.shape_groups_bvh_nodes[shape_group_id];
+
+    auto ret = false;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto shape_id = node.child0;
+            const auto &shape = scene.shapes[shape_id];
+            if (within_distance(shape, scene.path_bvhs[shape_id],
+                                local_pt, shape.stroke_width)) {
+                ret = true;
+                if (shape_id == edge_query->shape_id) {
+                    edge_query->hit = true;
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (inside(b0, local_pt, bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (inside(b1, local_pt, bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+
+    return ret;
+}
diff --git a/xing_loss.py b/xing_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..472ed17749dfe041eb262aff80b10506bdaadf01
--- /dev/null
+++ b/xing_loss.py
@@ -0,0 +1,66 @@
+import torch
+import numpy as np
+
+
+def area(a, b, c):
+    return (c[1] - a[1]) * (b[0] - a[0]) - (b[1] - a[1]) * (c[0] - a[0])
+
+
+def triangle_area(A, B, C):
+    out = (C - A).flip([-1]) * (B - A)
+    out = out[..., 1] - out[..., 0]
+    return out
+
+def compute_sine_theta(s1, s2):  #s1 and s2 aret two segments to be uswed
+    #s1, s2 (2, 2)
+    v1 = s1[1,:] - s1[0, :]
+    v2 = s2[1,:] - s2[0, :]
+    #print(v1, v2)
+    sine_theta = ( v1[0] * v2[1] - v1[1] * v2[0] ) / (torch.norm(v1) * torch.norm(v2))
+    return sine_theta
+
+def xing_loss(x_list, scale=1e-3):  # x[ npoints,2]
+    loss = 0.
+    #print(len(x_list))
+    for x in x_list:
+        #print(x)
+        seg_loss = 0.
+        N = x.size()[0]
+        x = torch.cat([x,x[0,:].unsqueeze(0)], dim=0)  #(N+1,2)
+        segments =  torch.cat([x[:-1,:].unsqueeze(1), x[1:,:].unsqueeze(1)], dim=1)  #(N, start/end, 2)
+        assert N % 3 == 0, 'The segment number is not correct!'
+        segment_num = int(N / 3)
+        for i in range(segment_num):
+            cs1 = segments[i*3, :, :]  #start control segs
+            cs2 = segments[i*3 + 1, :, :] #middle control segs
+            cs3 = segments[i*3 + 2, :, :]   #end control segs
+            #print('the direction of the vectors:')
+            #print(compute_sine_theta(cs1, cs2))
+            direct = (compute_sine_theta(cs1, cs2) >= 0).float()
+            opst = 1 - direct  #another direction
+            sina = compute_sine_theta(cs1, cs3)  #the angle between cs1 and cs3
+            seg_loss += direct * torch.relu( - sina) + opst * torch.relu(sina)
+            # print(direct, opst, sina)
+        seg_loss /= segment_num
+
+
+        templ = seg_loss
+        loss += templ * scale #area_loss * scale
+
+    return loss / (len(x_list))
+
+
+if __name__ == "__main__":
+    #x = torch.rand([6, 2])
+    #x = torch.tensor([[0,0], [1,1], [2,1], [1.5,0]])
+    x = torch.tensor([[0,0], [1,1], [2,1], [0.5,0]])
+    #x = torch.tensor([[1,0], [2,1], [0,1], [2,0]])
+    scale = 1 #0.5
+    y = xing_loss([x], scale)
+    print(y)
+
+    x = torch.tensor([[0,0], [1,1], [2,1], [2.,0]])
+    #x = torch.tensor([[1,0], [2,1], [0,1], [2,0]])
+    scale = 1 #0.5
+    y = xing_loss([x], scale)
+    print(y)